Skip to main content
Glama
MarkdownChunker.test.ts • 11.6 kB
/**
 * Unit tests for MarkdownChunker
 *
 * Multi-line markdown fixtures are built from explicit arrays of lines joined
 * with "\n" so that every heading sits on the line named by its `line_number`
 * field, independent of how this file's own whitespace is formatted.
 */

import { MarkdownChunker } from "../../parsers/MarkdownChunker.js";
import type { MarkdownHeading } from "../../markdown/types.js";

/** File id shared by every fixture in this suite. */
const FILE_ID = "file1";

/** Fields a test must supply for a heading; `parent_id` defaults to null. */
type HeadingFields = Pick<
  MarkdownHeading,
  "id" | "level" | "text" | "slug" | "line_number"
> & { parent_id?: MarkdownHeading["parent_id"] };

/**
 * Builds a MarkdownHeading fixture that belongs to FILE_ID.
 *
 * @param fields - Heading fields; omit `parent_id` for a top-level heading.
 * @returns A heading object with `file_id` and `parent_id` filled in.
 */
function makeHeading({ parent_id = null, ...rest }: HeadingFields): MarkdownHeading {
  return { file_id: FILE_ID, parent_id, ...rest };
}

/** Joins fixture lines into a markdown document (1-indexed line numbers). */
function doc(...lines: string[]): string {
  return lines.join("\n");
}

describe("MarkdownChunker", () => {
  describe("Basic Chunking", () => {
    it("should create chunks from content", () => {
      const chunker = new MarkdownChunker();
      const content = "# Introduction\n\nThis is the introduction section.";
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "Introduction",
          slug: "introduction",
          line_number: 1,
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      expect(chunks.length).toBeGreaterThan(0);
      expect(chunks[0].file_id).toBe("file1");
      expect(chunks[0].chunk_index).toBe(0);
    });

    it("should set chunk indices correctly", () => {
      const chunker = new MarkdownChunker();
      // Headings land on lines 1, 4 and 7.
      const content = doc(
        "# Section 1",
        "Content 1",
        "",
        "# Section 2",
        "Content 2",
        "",
        "# Section 3",
        "Content 3",
      );
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "Section 1",
          slug: "section-1",
          line_number: 1,
        }),
        makeHeading({
          id: "h2",
          level: 1,
          text: "Section 2",
          slug: "section-2",
          line_number: 4,
        }),
        makeHeading({
          id: "h3",
          level: 1,
          text: "Section 3",
          slug: "section-3",
          line_number: 7,
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      expect(chunks[0].chunk_index).toBe(0);
      expect(chunks[1].chunk_index).toBe(1);
      expect(chunks[2].chunk_index).toBe(2);
    });
  });

  describe("Heading-Based Chunking", () => {
    it("should split content by headings", () => {
      const chunker = new MarkdownChunker();
      // Headings land on lines 1 and 4.
      const content = doc(
        "# Chapter 1",
        "This is chapter 1 content.",
        "",
        "# Chapter 2",
        "This is chapter 2 content.",
      );
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "Chapter 1",
          slug: "chapter-1",
          line_number: 1,
        }),
        makeHeading({
          id: "h2",
          level: 1,
          text: "Chapter 2",
          slug: "chapter-2",
          line_number: 4,
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      expect(chunks).toHaveLength(2);
      expect(chunks[0].content).toContain("Chapter 1");
      expect(chunks[0].content).toContain("chapter 1 content");
      expect(chunks[1].content).toContain("Chapter 2");
      expect(chunks[1].content).toContain("chapter 2 content");
    });

    it("should preserve heading hierarchy in context", () => {
      const chunker = new MarkdownChunker({ preserveContext: true });
      // Three nested headings on consecutive lines 1-3.
      const content = doc(
        "# Chapter 1",
        "## Section 1.1",
        "### Subsection 1.1.1",
        "Content here",
      );
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "Chapter 1",
          slug: "chapter-1",
          line_number: 1,
        }),
        makeHeading({
          id: "h2",
          level: 2,
          text: "Section 1.1",
          slug: "section-1-1",
          line_number: 2,
          parent_id: "h1",
        }),
        makeHeading({
          id: "h3",
          level: 3,
          text: "Subsection 1.1.1",
          slug: "subsection-1-1-1",
          line_number: 3,
          parent_id: "h2",
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      expect(chunks[2].heading).toBe("Chapter 1 > Section 1.1 > Subsection 1.1.1");
    });

    it("should handle content before first heading", () => {
      const chunker = new MarkdownChunker();
      // Preamble on line 1, first heading on line 3.
      const content = doc(
        "This is preamble text before any heading.",
        "",
        "# First Heading",
        "Content after heading.",
      );
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "First Heading",
          slug: "first-heading",
          line_number: 3,
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      expect(chunks).toHaveLength(2);
      expect(chunks[0].heading).toBeNull();
      expect(chunks[0].content).toContain("preamble text");
      expect(chunks[1].heading).toBe("First Heading");
    });
  });

  describe("Size-Based Chunking", () => {
    it("should split large sections into multiple chunks", () => {
      const chunker = new MarkdownChunker({ maxChunkSize: 20, overlapSize: 5 });
      // Heading on line 1 followed by a 100-word body.
      const content = doc("# Long Section", "word ".repeat(100));
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "Long Section",
          slug: "long-section",
          line_number: 1,
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      expect(chunks.length).toBeGreaterThan(1);
      // Each chunk should be around maxChunkSize (20), with some tolerance.
      for (const chunk of chunks) {
        expect(chunk.word_count).toBeLessThanOrEqual(25);
      }
    });

    it("should create overlapping chunks", () => {
      const chunker = new MarkdownChunker({
        maxChunkSize: 10,
        overlapSize: 3,
        splitOnHeadings: false,
      });
      const content = "one two three four five six seven eight nine ten eleven twelve";

      const chunks = chunker.chunk(FILE_ID, content, []);

      expect(chunks.length).toBeGreaterThan(1);

      // Overlap check: the first of the last three words of chunk 0 should
      // appear again in chunk 1.
      const chunk0Words = chunks[0].content.split(/\s+/);
      const lastWordsChunk0 = chunk0Words.slice(-3).join(" ");
      expect(chunks[1].content).toContain(lastWordsChunk0.split(" ")[0]);
    });

    it("should handle size-based chunking without headings", () => {
      const chunker = new MarkdownChunker({
        maxChunkSize: 20,
        overlapSize: 5,
        splitOnHeadings: false,
      });
      const content = "word ".repeat(50);

      const chunks = chunker.chunk(FILE_ID, content, []);

      expect(chunks.length).toBeGreaterThan(1);
      expect(chunks[0].heading).toBeNull();
    });
  });

  describe("Word and Token Counting", () => {
    it("should count words correctly", () => {
      const chunker = new MarkdownChunker();
      const content = doc("# Test", "This is five words total.");
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "Test",
          slug: "test",
          line_number: 1,
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      expect(chunks[0].word_count).toBeGreaterThan(0);
    });

    it("should estimate token count", () => {
      const chunker = new MarkdownChunker();
      const content = doc("# Test", "Some content here.");
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "Test",
          slug: "test",
          line_number: 1,
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      // Token count should be roughly 1.3x word count
      expect(chunks[0].token_count).toBeGreaterThan(chunks[0].word_count);
      expect(chunks[0].token_count).toBeLessThan(chunks[0].word_count * 2);
    });
  });

  describe("Configuration", () => {
    it("should use default configuration", () => {
      const chunker = new MarkdownChunker();
      const content = "Test content";

      const chunks = chunker.chunk(FILE_ID, content, []);

      expect(chunks).toBeDefined();
    });

    it("should accept custom configuration", () => {
      const customConfig = {
        maxChunkSize: 100,
        overlapSize: 20,
        splitOnHeadings: false,
        preserveContext: false,
      };
      const chunker = new MarkdownChunker(customConfig);
      const content = "word ".repeat(200);

      const chunks = chunker.chunk(FILE_ID, content, []);

      expect(chunks.length).toBeGreaterThan(0);
    });

    it("should disable context preservation when configured", () => {
      const chunker = new MarkdownChunker({ preserveContext: false });
      // Parent heading on line 1, child heading on line 2.
      const content = doc("# Chapter 1", "## Section 1.1", "Content");
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "Chapter 1",
          slug: "chapter-1",
          line_number: 1,
        }),
        makeHeading({
          id: "h2",
          level: 2,
          text: "Section 1.1",
          slug: "section-1-1",
          line_number: 2,
          parent_id: "h1",
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      // Should only have section heading, not full context
      expect(chunks[1].heading).toBe("Section 1.1");
      expect(chunks[1].heading).not.toContain("Chapter 1");
    });
  });

  describe("Edge Cases", () => {
    it("should handle empty content", () => {
      const chunker = new MarkdownChunker();

      const chunks = chunker.chunk(FILE_ID, "", []);

      expect(chunks).toHaveLength(0);
    });

    it("should handle content with no headings", () => {
      const chunker = new MarkdownChunker({ splitOnHeadings: true });
      const content = "Just plain text without any headings.";

      const chunks = chunker.chunk(FILE_ID, content, []);

      // Should fallback to size-based chunking
      expect(chunks).toHaveLength(1);
      expect(chunks[0].heading).toBeNull();
    });

    it("should handle very short content", () => {
      const chunker = new MarkdownChunker({ maxChunkSize: 1000 });
      const content = "# Test\nShort.";
      const headings: MarkdownHeading[] = [
        makeHeading({
          id: "h1",
          level: 1,
          text: "Test",
          slug: "test",
          line_number: 1,
        }),
      ];

      const chunks = chunker.chunk(FILE_ID, content, headings);

      expect(chunks).toHaveLength(1);
    });

    it("should handle single word content", () => {
      const chunker = new MarkdownChunker();
      const content = "Test";

      const chunks = chunker.chunk(FILE_ID, content, []);

      expect(chunks).toHaveLength(1);
      expect(chunks[0].word_count).toBe(1);
    });

    it("should prevent infinite loop with invalid overlap config", () => {
      // Overlap (15) deliberately exceeds the max chunk size (10).
      const chunker = new MarkdownChunker({
        maxChunkSize: 10,
        overlapSize: 15,
        splitOnHeadings: false,
      });
      const content = "word ".repeat(50);

      const chunks = chunker.chunk(FILE_ID, content, []);

      // Should still produce chunks without infinite loop
      expect(chunks.length).toBeGreaterThan(0);
      expect(chunks.length).toBeLessThan(100); // Sanity check
    });
  });
});

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/xiaolai/claude-writers-aid-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.