Doclea MCP

Official

Overview Schema Related Servers Score Discussions

markdown.test.ts•12.2 KiB

/**
 * Tests for markdown semantic chunking.
 *
 * Covers the three exports of `../../chunking/markdown`:
 *   - `chunkMarkdown`      – splits a markdown document into chunks carrying
 *                            `content`, `tokenCount` and `metadata`
 *                            (`headers`, `level`, `startLine`,
 *                            `hasFrontmatter`, `hasCodeBlock`).
 *   - `extractFrontmatter` – separates a leading `---` block from the body.
 *   - `getHeadersAtLine`   – returns the header trail in effect at a 1-based
 *                            line number.
 *
 * Fixtures are written flush-left inside template literals so that every
 * markdown line starts in column 0.
 *
 * NOTE(review): blank-line placement inside the fixtures matters wherever a
 * test asserts a line number or `frontmatterLines`; confirm those fixtures
 * against the chunker's line-counting rules before changing them.
 */

import { describe, expect, test } from "bun:test";
import {
  chunkMarkdown,
  extractFrontmatter,
  getHeadersAtLine,
} from "../../chunking/markdown";

describe("Markdown Chunking", () => {
  describe("chunkMarkdown", () => {
    test("should handle empty input", async () => {
      expect(await chunkMarkdown("")).toEqual([]);
      expect(await chunkMarkdown(" ")).toEqual([]);
    });

    test("should chunk simple markdown by headers", async () => {
      const markdown = `# Title

Some intro content.

## Section 1

Content for section 1.

## Section 2

Content for section 2.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      expect(chunks.length).toBeGreaterThanOrEqual(3);
      expect(chunks[0].metadata.headers).toContain("Title");
      expect(chunks[0].metadata.level).toBe(1);
    });

    test("should preserve header hierarchy", async () => {
      const markdown = `# Main

## Sub 1

Content 1.

### Sub Sub

Nested content.

## Sub 2

Content 2.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      // Find the "Sub Sub" section
      const nestedChunk = chunks.find((c) =>
        c.metadata.headers.includes("Sub Sub"),
      );
      expect(nestedChunk).toBeDefined();
      expect(nestedChunk?.metadata.headers).toContain("Main");
      expect(nestedChunk?.metadata.headers).toContain("Sub 1");
      expect(nestedChunk?.metadata.headers).toContain("Sub Sub");
    });

    test("should track line numbers correctly", async () => {
      // No blank lines here: the header must sit on physical line 4.
      const markdown = `Line 1
Line 2
Line 3
# Header on line 4
Line 5
Line 6`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      // First chunk (before header)
      expect(chunks[0].metadata.startLine).toBe(1);

      // Header section starts on line 4
      const headerChunk = chunks.find((c) =>
        c.content.includes("# Header on line 4"),
      );
      expect(headerChunk).toBeDefined();
      expect(headerChunk?.metadata.startLine).toBe(4);
    });

    test("should preserve frontmatter in first chunk", async () => {
      const markdown = `---
title: Test Document
date: 2024-01-01
---

# Introduction

Content here.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      expect(chunks[0].metadata.hasFrontmatter).toBe(true);
      expect(chunks[0].content).toContain("---");
      expect(chunks[0].content).toContain("title: Test Document");
    });

    test("should handle code blocks atomically", async () => {
      const markdown = `# Code Example

Here is some code:

\`\`\`typescript
function hello() {
  console.log("Hello, World!");
  return 42;
}
\`\`\`

More text after code.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      // Find chunk with code block
      const codeChunk = chunks.find((c) => c.metadata.hasCodeBlock);
      expect(codeChunk).toBeDefined();

      // Code block should not be split
      expect(codeChunk?.content).toContain('console.log("Hello, World!")');
      expect(codeChunk?.content).toContain("return 42");
    });

    test("should not split code blocks even if over token limit", async () => {
      const longCode = `const x = ${'"a".repeat(100);\n'.repeat(20)}`;
      const markdown = `# Code

\`\`\`javascript
${longCode}
\`\`\``;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 50 });

      // Code block gets its own chunk even if large
      const codeChunk = chunks.find((c) => c.metadata.hasCodeBlock);
      expect(codeChunk).toBeDefined();
      expect(codeChunk?.content).toContain("```javascript");

      // The opening fence already contains "```", so a plain toContain("```")
      // proves nothing about the closing fence. Count the fences instead:
      // both must live in the same chunk for the block to be unsplit.
      const fenceCount = codeChunk?.content.match(/```/g)?.length ?? 0;
      expect(fenceCount).toBeGreaterThanOrEqual(2);
    });

    test("should split large sections while respecting token limits", async () => {
      // Create content that will exceed token limit - need substantial content
      const longContent = "This is a sentence with multiple words. ".repeat(50);
      const markdown = `# Large Section

${longContent}`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 100 });

      expect(chunks.length).toBeGreaterThan(1);

      // Most chunks should be approximately within limit
      // (first chunk includes header which adds tokens)
      const regularChunks = chunks.slice(1);
      for (const chunk of regularChunks) {
        // Allow larger variance since we split by lines, not tokens
        expect(chunk.tokenCount).toBeLessThanOrEqual(150);
      }
    });

    test("should handle multiple code blocks", async () => {
      const markdown = `# Examples

First example:

\`\`\`python
print("Hello")
\`\`\`

Second example:

\`\`\`javascript
console.log("World");
\`\`\``;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      const codeChunks = chunks.filter((c) => c.metadata.hasCodeBlock);
      expect(codeChunks.length).toBeGreaterThanOrEqual(1);
    });

    test("should handle headers inside code blocks correctly", async () => {
      const markdown = `# Real Header

\`\`\`markdown
# This is not a header
## Neither is this
\`\`\`

## Another Real Header

Content.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      // Should have 2 real headers, not 4
      const h1Chunks = chunks.filter((c) => c.metadata.level === 1);
      const h2Chunks = chunks.filter((c) => c.metadata.level === 2);

      expect(h1Chunks.length).toBe(1);
      expect(h2Chunks.length).toBe(1);
    });

    test("should handle nested headers correctly", async () => {
      const markdown = `# H1

## H2a

### H3

## H2b

Content.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      // H3 should have H1 and H2a in its headers
      const h3Chunk = chunks.find((c) => c.metadata.headers.includes("H3"));
      expect(h3Chunk).toBeDefined();
      expect(h3Chunk?.metadata.headers).toContain("H1");
      expect(h3Chunk?.metadata.headers).toContain("H2a");

      // H2b should NOT have H2a in its headers (sibling, not parent)
      const h2bChunk = chunks.find((c) => c.metadata.headers.includes("H2b"));
      expect(h2bChunk).toBeDefined();
      expect(h2bChunk?.metadata.headers).not.toContain("H2a");
    });

    test("should include token count in each chunk", async () => {
      const markdown = `# Test

Some content here.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      for (const chunk of chunks) {
        expect(chunk.tokenCount).toBeGreaterThan(0);
        expect(typeof chunk.tokenCount).toBe("number");
      }
    });

    test("should handle content without headers", async () => {
      const markdown = `Just some plain text content without any headers at all.

Multiple paragraphs even.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      expect(chunks.length).toBe(1);
      expect(chunks[0].metadata.level).toBe(0);
      expect(chunks[0].metadata.headers).toEqual([]);
    });

    test("should handle only frontmatter", async () => {
      const markdown = `---
title: Only Frontmatter
---`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      expect(chunks.length).toBe(1);
      expect(chunks[0].metadata.hasFrontmatter).toBe(true);
    });

    test("should respect maxTokens option", async () => {
      const markdown = "word ".repeat(500);

      const chunks256 = await chunkMarkdown(markdown, { maxTokens: 256 });
      const chunks100 = await chunkMarkdown(markdown, { maxTokens: 100 });

      // A tighter budget must produce strictly more chunks.
      expect(chunks100.length).toBeGreaterThan(chunks256.length);
    });
  });

  describe("extractFrontmatter", () => {
    test("should extract frontmatter", () => {
      const markdown = `---
title: Test
author: Me
---

# Content`;

      const result = extractFrontmatter(markdown);

      expect(result.frontmatter).toContain("title: Test");
      expect(result.frontmatter).toContain("author: Me");
      expect(result.content.trim()).toBe("# Content");
      expect(result.frontmatterLines).toBe(5);
    });

    test("should handle missing frontmatter", () => {
      const markdown = `# No Frontmatter

Just content.`;

      const result = extractFrontmatter(markdown);

      expect(result.frontmatter).toBeNull();
      expect(result.content).toBe(markdown);
      expect(result.frontmatterLines).toBe(0);
    });

    test("should handle empty frontmatter", () => {
      const markdown = `---
---

Content`;

      const result = extractFrontmatter(markdown);

      expect(result.frontmatter).toBe("");
      expect(result.content.trim()).toBe("Content");
    });

    test("should not treat --- in content as frontmatter", () => {
      // The rule sits mid-document, so it is a thematic break, not frontmatter.
      const markdown = `# Title

Some content

---

More content`;

      const result = extractFrontmatter(markdown);

      expect(result.frontmatter).toBeNull();
      expect(result.content).toBe(markdown);
    });
  });

  describe("getHeadersAtLine", () => {
    test("should return empty array for line before any headers", () => {
      const markdown = `Some content before headers.

# First Header`;

      const headers = getHeadersAtLine(markdown, 1);

      expect(headers).toEqual([]);
    });

    test("should return headers at specific line", () => {
      // Blank lines are significant: the content must land on line 5.
      const markdown = `# H1

## H2

Content here on line 5`;

      const headers = getHeadersAtLine(markdown, 5);

      expect(headers).toEqual(["H1", "H2"]);
    });

    test("should handle header hierarchy correctly", () => {
      // One header per line, no blank lines: "Deep" is line 3, "Sub 2" line 4.
      const markdown = `# Main
## Sub 1
### Deep
## Sub 2
Content`;

      // At "Deep" (line 3), should have Main > Sub 1 > Deep
      const headersAtDeep = getHeadersAtLine(markdown, 3);
      expect(headersAtDeep).toEqual(["Main", "Sub 1", "Deep"]);

      // At "Sub 2" (line 4), should have Main > Sub 2: "Sub 1" is a sibling
      // and "Deep" is a child of that sibling, so neither is an ancestor.
      const headersAtSub2 = getHeadersAtLine(markdown, 4);
      expect(headersAtSub2).toEqual(["Main", "Sub 2"]);
    });

    test("should ignore headers inside code blocks", () => {
      const markdown = `# Real Header
\`\`\`
# Fake Header
\`\`\`
Content`;

      const headers = getHeadersAtLine(markdown, 5);

      expect(headers).toEqual(["Real Header"]);
      expect(headers).not.toContain("Fake Header");
    });

    test("should handle out-of-range line numbers", () => {
      const markdown = `# Header

Content`;

      // Line 100 doesn't exist, should return headers up to end
      const headers = getHeadersAtLine(markdown, 100);

      expect(headers).toEqual(["Header"]);
    });
  });

  describe("Edge cases", () => {
    test("should handle markdown with only code block", async () => {
      const markdown = `\`\`\`javascript
console.log("Hello");
\`\`\``;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      expect(chunks.length).toBe(1);
      expect(chunks[0].metadata.hasCodeBlock).toBe(true);
    });

    test("should handle unclosed code block", async () => {
      const markdown = `# Header

\`\`\`javascript
// Code never closes
const x = 1;`;

      // Should not throw
      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      expect(chunks.length).toBeGreaterThan(0);
    });

    test("should handle special characters in headers", async () => {
      const markdown = `# Hello <World> & "Quotes"

Content.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      expect(chunks[0].metadata.headers[0]).toBe('Hello <World> & "Quotes"');
    });

    test("should handle unicode in content", async () => {
      const markdown = `# 你好世界

这是中文内容。

## Émojis 🎉

Content with émojis 🚀 and spëcial çharacters.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      expect(chunks.length).toBeGreaterThan(0);
      expect(chunks[0].metadata.headers).toContain("你好世界");
    });

    test("should handle windows line endings", async () => {
      const markdown = "# Header\r\n\r\nContent\r\n\r\n## Sub\r\n\r\nMore";

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      expect(chunks.length).toBeGreaterThan(0);
    });

    test("should handle consecutive headers", async () => {
      const markdown = `# H1
## H2
### H3
#### H4

Finally some content.`;

      const chunks = await chunkMarkdown(markdown, { maxTokens: 500 });

      // Headers should be tracked correctly
      const lastChunk = chunks[chunks.length - 1];
      expect(lastChunk.metadata.headers.length).toBeLessThanOrEqual(4);
    });
  });
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/docleaai/doclea-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

markdown.test.ts•12.2 KiB