Skip to main content
Glama

docs-mcp-server

SemanticMarkdownSplitter.test.ts (13 kB)
/**
 * Unit tests for SemanticMarkdownSplitter.
 *
 * Every test constructs the splitter with two numeric size arguments, calls
 * `splitText()` on a markdown string, and checks the returned chunks. Each
 * chunk is asserted to have the shape
 * `{ types: string[], content: string, section: { level, path } }`.
 *
 * NOTE(review): the multi-line markdown fixtures below were re-flowed from a
 * whitespace-collapsed copy of this file. Line breaks were placed only where
 * the expectations in this suite require them (headings, code fences and table
 * rows on their own lines) — confirm against the upstream fixtures.
 */
import { describe, expect, it, vi } from "vitest";
import { SemanticMarkdownSplitter } from "./SemanticMarkdownSplitter";

// Replace the logger module with an automatic mock for the whole suite.
vi.mock("../utils/logger");

describe("SemanticMarkdownSplitter", () => {
  it("should handle empty markdown", async () => {
    const splitter = new SemanticMarkdownSplitter(100, 5000);
    const result = await splitter.splitText("");
    expect(result).toEqual([]);
  });

  it("should handle markdown with no headings", async () => {
    const splitter = new SemanticMarkdownSplitter(100, 5000);
    const markdown = "This is some text without any headings.";
    const result = await splitter.splitText(markdown);

    // Content outside any heading is reported at level 0 with an empty path.
    expect(result).toEqual([
      {
        types: ["text"],
        content: "This is some text without any headings.",
        section: {
          level: 0,
          path: [],
        },
      },
    ]);
  });

  it("should correctly split on H1-H6 headings", async () => {
    const splitter = new SemanticMarkdownSplitter(100, 5000);
    const markdown = `
# Chapter 1
Some text in chapter 1.

## Section 1.1
More text in section 1.1.

### Subsection 1.1.1
Text in subsection. This should stay with previous section.

#### H4 Heading
Some text after h4

##### H5 Heading
Some text after h5

###### H6 Heading
Some text after h6

## Section 1.2
Final text.

# Chapter 2
Text in chapter 2.
`;
    const result = await splitter.splitText(markdown);

    // Each heading is its own chunk; the text below it inherits the heading's
    // level and full ancestor path. A shallower heading resets the path.
    expect(result).toEqual([
      {
        types: ["heading"],
        content: "# Chapter 1",
        section: {
          level: 1,
          path: ["Chapter 1"],
        },
      },
      {
        types: ["text"],
        content: "Some text in chapter 1.",
        section: {
          level: 1,
          path: ["Chapter 1"],
        },
      },
      {
        types: ["heading"],
        content: "## Section 1.1",
        section: {
          level: 2,
          path: ["Chapter 1", "Section 1.1"],
        },
      },
      {
        types: ["text"],
        content: "More text in section 1.1.",
        section: {
          level: 2,
          path: ["Chapter 1", "Section 1.1"],
        },
      },
      {
        types: ["heading"],
        content: "### Subsection 1.1.1",
        section: {
          level: 3,
          path: ["Chapter 1", "Section 1.1", "Subsection 1.1.1"],
        },
      },
      {
        types: ["text"],
        content: "Text in subsection. This should stay with previous section.",
        section: {
          level: 3,
          path: ["Chapter 1", "Section 1.1", "Subsection 1.1.1"],
        },
      },
      {
        types: ["heading"],
        content: "#### H4 Heading",
        section: {
          level: 4,
          path: ["Chapter 1", "Section 1.1", "Subsection 1.1.1", "H4 Heading"],
        },
      },
      {
        types: ["text"],
        content: "Some text after h4",
        section: {
          level: 4,
          path: ["Chapter 1", "Section 1.1", "Subsection 1.1.1", "H4 Heading"],
        },
      },
      {
        types: ["heading"],
        content: "##### H5 Heading",
        section: {
          level: 5,
          path: [
            "Chapter 1",
            "Section 1.1",
            "Subsection 1.1.1",
            "H4 Heading",
            "H5 Heading",
          ],
        },
      },
      {
        types: ["text"],
        content: "Some text after h5",
        section: {
          level: 5,
          path: [
            "Chapter 1",
            "Section 1.1",
            "Subsection 1.1.1",
            "H4 Heading",
            "H5 Heading",
          ],
        },
      },
      {
        types: ["heading"],
        content: "###### H6 Heading",
        section: {
          level: 6,
          path: [
            "Chapter 1",
            "Section 1.1",
            "Subsection 1.1.1",
            "H4 Heading",
            "H5 Heading",
            "H6 Heading",
          ],
        },
      },
      {
        types: ["text"],
        content: "Some text after h6",
        section: {
          level: 6,
          path: [
            "Chapter 1",
            "Section 1.1",
            "Subsection 1.1.1",
            "H4 Heading",
            "H5 Heading",
            "H6 Heading",
          ],
        },
      },
      {
        types: ["heading"],
        content: "## Section 1.2",
        section: {
          level: 2,
          path: ["Chapter 1", "Section 1.2"],
        },
      },
      {
        types: ["text"],
        content: "Final text.",
        section: {
          level: 2,
          path: ["Chapter 1", "Section 1.2"],
        },
      },
      {
        types: ["heading"],
        content: "# Chapter 2",
        section: {
          level: 1,
          path: ["Chapter 2"],
        },
      },
      {
        types: ["text"],
        content: "Text in chapter 2.",
        section: {
          level: 1,
          path: ["Chapter 2"],
        },
      },
    ]);
  });

  it("should separate headings, text, code, and tables", async () => {
    const splitter = new SemanticMarkdownSplitter(100, 5000);
    const markdown = `
# Mixed Content Section

This is some text. More text here.

\`\`\`javascript
// Some code in JavaScript
console.log('Hello');
\`\`\`

| Header 1 | Header 2 |
| -------- | -------- |
| Cell 1 | Cell 2 |
`;
    const result = await splitter.splitText(markdown);

    // The table is expected back in normalized form: the separator row is
    // compacted to `|---|---|` regardless of how it was written in the input.
    expect(result).toEqual([
      {
        types: ["heading"],
        content: "# Mixed Content Section",
        section: {
          level: 1,
          path: ["Mixed Content Section"],
        },
      },
      {
        types: ["text"],
        content: "This is some text. More text here.",
        section: {
          level: 1,
          path: ["Mixed Content Section"],
        },
      },
      {
        types: ["code"],
        content: "```javascript\n// Some code in JavaScript\nconsole.log('Hello');\n```",
        section: {
          level: 1,
          path: ["Mixed Content Section"],
        },
      },
      {
        types: ["table"],
        content: "| Header 1 | Header 2 |\n|---|---|\n| Cell 1 | Cell 2 |",
        section: {
          level: 1,
          path: ["Mixed Content Section"],
        },
      },
    ]);
  });

  it("should correctly split long tables while preserving headers", async () => {
    const splitter = new SemanticMarkdownSplitter(10, 100);

    // Create a table with many rows that will exceed chunkSize
    const tableRows = Array.from(
      { length: 20 },
      (_, i) => `| ${i + 1} | This is row ${i + 1} | ${(i + 1) * 100} |`,
    ).join("\n");
    const markdown = `
| ID | Description | Value |
|----|------------|-------|
${tableRows}
`;
    const result = await splitter.splitText(markdown);

    // Verify that we got multiple chunks
    expect(result.length).toBeGreaterThan(1);

    // Verify each chunk
    for (const chunk of result) {
      expect(chunk.types).toEqual(["table"]);

      // Each chunk should start with the header
      expect(chunk.content).toMatch(/^\| ID \| Description \| Value \|/);

      // Each chunk should have the header separator. Every pipe must be
      // escaped: an unescaped `|` is regex alternation, which would reduce
      // this check to "contains ---".
      expect(chunk.content).toMatch(/\|---\|---\|---\|/);

      // Each chunk should have at least one data row
      expect(chunk.content.split("\n").length).toBeGreaterThan(2);

      // Each chunk should be valid markdown table format (no `g` flag: it
      // adds nothing to a single match test and makes the regex stateful).
      expect(chunk.content).toMatch(/^\|.*\|$/m);

      // Each chunk should be within size limit
      expect(chunk.content.length).toBeLessThanOrEqual(100);
    }
  });

  it("should correctly split long code blocks while preserving language", async () => {
    const splitter = new SemanticMarkdownSplitter(10, 100);

    // Create a long code block that will exceed chunkSize
    const codeLines = Array.from(
      { length: 20 },
      (_, i) =>
        `console.log("This is line ${i + 1} with some extra text to make it longer");`,
    ).join("\n");
    const markdown = `
\`\`\`javascript
${codeLines}
\`\`\`
`;
    const result = await splitter.splitText(markdown);

    // Verify that we got multiple chunks
    expect(result.length).toBeGreaterThan(1);

    // Verify each chunk
    for (const chunk of result) {
      expect(chunk.types).toEqual(["code"]);

      // Each chunk should start with the language identifier
      expect(chunk.content).toMatch(/^```javascript\n/);

      // Each chunk should end with closing backticks
      expect(chunk.content).toMatch(/\n```$/);

      // Each chunk should contain actual code
      expect(chunk.content).toMatch(/console\.log/);

      // Each chunk should be within size limit
      expect(chunk.content.length).toBeLessThanOrEqual(100);
    }
  });

  it("should handle tables that cannot be split semantically by using character-based splitting", async () => {
    const splitter = new SemanticMarkdownSplitter(20, 20);
    const markdown = `
| Header1 | Header2 |
|---------|---------|
| Cell1 | Cell2 |`;

    // Should not throw an error anymore
    const result = await splitter.splitText(markdown);

    // Verify we got chunks back
    expect(result.length).toBeGreaterThan(0);

    // Each chunk should be under the max size
    expect(result.every((chunk) => chunk.content.length <= 20)).toBe(true);
  });

  it("should handle code blocks that cannot be split semantically by using character-based splitting", async () => {
    const splitter = new SemanticMarkdownSplitter(20, 20);
    const markdown = "```javascript\nconst x = 1;\n```";

    // Should not throw an error anymore
    const result = await splitter.splitText(markdown);

    // Verify we got chunks back
    expect(result.length).toBeGreaterThan(0);

    // Each chunk should be under the max size
    expect(result.every((chunk) => chunk.content.length <= 20)).toBe(true);
  });

  it("should handle JSON code blocks in markdown properly", async () => {
    const markdown = `
# API Documentation

Here's an example API response:

\`\`\`json
{
  "name": "test-library",
  "version": "1.0.0",
  "dependencies": {
    "react": "^18.0.0",
    "lodash": "^4.17.21"
  }
}
\`\`\`

This JSON shows the package structure.
`;
    const splitter = new SemanticMarkdownSplitter(1000, 2000);
    const chunks = await splitter.splitText(markdown);

    expect(chunks).toHaveLength(4);

    // Should have heading chunk
    expect(chunks[0]).toEqual({
      types: ["heading"],
      content: "# API Documentation",
      section: {
        level: 1,
        path: ["API Documentation"],
      },
    });

    // Should have text chunk
    expect(chunks[1]).toEqual({
      types: ["text"],
      content: "Here's an example API response:",
      section: {
        level: 1,
        path: ["API Documentation"],
      },
    });

    // Should have JSON code block chunk with preserved formatting
    expect(chunks[2].types).toEqual(["code"]);
    expect(chunks[2].content).toMatch(/^```json\n/);
    expect(chunks[2].content).toMatch(/\n```$/);
    expect(chunks[2].content).toContain("test-library");
    expect(chunks[2].content).toContain("react");
    expect(chunks[2].section).toEqual({
      level: 1,
      path: ["API Documentation"],
    });

    // Should have final text chunk
    expect(chunks[3]).toEqual({
      types: ["text"],
      content: "This JSON shows the package structure.",
      section: {
        level: 1,
        path: ["API Documentation"],
      },
    });
  });

  it("should handle raw JSON as plain text in edge cases", async () => {
    // This simulates the edge case where JSON content somehow gets processed as markdown
    // In practice, this should be rare because JSON should be routed to JsonPipeline
    const rawJson = `{"name": "test", "version": "1.0.0"}`;
    const splitter = new SemanticMarkdownSplitter(1000, 2000);
    const chunks = await splitter.splitText(rawJson);

    // Should treat as plain text content
    expect(chunks).toHaveLength(1);
    expect(chunks[0].content).toContain("test");
    expect(chunks[0].content).toContain("1.0.0");
    expect(chunks[0].section.path).toEqual([]);
    expect(chunks[0].types).toEqual(["text"]);
  });

  it("should handle invalid JSON as plain text", async () => {
    const invalidJson = `{
  "name": "test-library",
  "version": "1.0.0"
  // This comment makes it invalid JSON
  "invalid": true
}`;
    const splitter = new SemanticMarkdownSplitter(1000, 2000);
    const chunks = await splitter.splitText(invalidJson);

    // Should treat as plain text content without structural splitting
    expect(chunks).toHaveLength(1);
    expect(chunks[0].section.path).toEqual([]);
  });

  it("should preserve content for non-JSON text", async () => {
    const textContent = `This is not JSON at all, just plain text content that should be preserved.`;
    const splitter = new SemanticMarkdownSplitter(1000, 2000);
    const chunks = await splitter.splitText(textContent);

    // Should preserve the content as-is in a single chunk
    expect(chunks).toHaveLength(1);
    expect(chunks[0].content).toContain(textContent);
    expect(chunks[0].section.path).toEqual([]);
  });
});

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.