docs-mcp-server

PipelineFactory.integration.test.ts (18.5 kB)
import { describe, expect, it } from "vitest";
import { ScrapeMode } from "../types";
import { type PipelineConfiguration, PipelineFactory } from "./PipelineFactory";

describe("PipelineFactory Integration", () => {
  describe("configuration propagation", () => {
    it("should propagate custom chunk sizes through process method", async () => {
      // Create pipelines with custom configuration
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 100, // Very small for testing
          max: 200,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      // Create content that would definitely exceed the custom chunk size
      const longContent =
        "This is a test sentence that is long enough to be split.\n".repeat(10); // ~570 characters with newlines

      // Test with TextPipeline (last pipeline, universal fallback)
      const textPipeline = pipelines[4]; // TextPipeline

      // Create mock RawContent for the process method
      const rawContent = {
        source: "test.txt",
        content: longContent,
        mimeType: "text/plain",
      };
      const scraperOptions = {
        url: "test.txt",
        library: "test",
        version: "1.0.0",
        scrapeMode: ScrapeMode.Fetch,
        ignoreErrors: false,
        maxConcurrency: 1,
      };

      const processed = await textPipeline.process(rawContent, scraperOptions);

      // Verify that chunks are smaller due to custom configuration
      // With 570 characters and 100 char preferred size, should be multiple chunks
      expect(processed.chunks.length).toBeGreaterThan(1); // Should be split into multiple chunks
      processed.chunks.forEach((chunk) => {
        expect(chunk.content.length).toBeGreaterThan(0);
        // Should be much smaller than default 1500
        expect(chunk.content.length).toBeLessThan(300);
      });
    });

    it("should use default chunk sizes when no configuration provided", async () => {
      const pipelines = PipelineFactory.createStandardPipelines();

      // Create moderate content that would fit in default chunks
      const moderateContent = "This is a test sentence. ".repeat(10); // ~250 characters

      // Test with TextPipeline
      const textPipeline = pipelines[4];
      const rawContent = {
        source: "test.txt",
        content: moderateContent,
        mimeType: "text/plain",
      };
      const scraperOptions = {
        url: "test.txt",
        library: "test",
        version: "1.0.0",
        scrapeMode: ScrapeMode.Fetch,
        ignoreErrors: false,
        maxConcurrency: 1,
      };

      const processed = await textPipeline.process(rawContent, scraperOptions);

      // With default chunk size (1500), this should fit in one chunk
      expect(processed.chunks.length).toBe(1);
      expect(processed.chunks[0].content.length).toBeLessThan(300);
    });

    it("should handle different pipeline types with custom configuration", async () => {
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 300,
          max: 600,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      // Test each pipeline
      const testContent = "This is a test content that might be split. ".repeat(10); // ~450 characters

      for (const pipeline of pipelines) {
        const rawContent = {
          source: "test.txt",
          content: testContent,
          mimeType: "text/plain",
        };
        const scraperOptions = {
          url: "test.txt",
          library: "test",
          version: "1.0.0",
          scrapeMode: ScrapeMode.Fetch,
          ignoreErrors: false,
          maxConcurrency: 1,
        };

        const processed = await pipeline.process(rawContent, scraperOptions);
        expect(processed.chunks.length).toBeGreaterThanOrEqual(1);

        // Verify each chunk respects the configuration
        processed.chunks.forEach((chunk) => {
          expect(chunk.content.length).toBeGreaterThan(0);
          // Allow some flexibility for splitting logic, but ensure it's not wildly large
          expect(chunk.content.length).toBeLessThan(800);
        });
      }
    });
  });

  describe("content type processing behavior", () => {
    const baseOptions = {
      url: "test",
      library: "test",
      version: "1.0.0",
      scrapeMode: ScrapeMode.Fetch,
      ignoreErrors: false,
      maxConcurrency: 1,
    };

    // Helper function to find and process content with the first matching pipeline
    async function processContent(content: string, mimeType: string) {
      // Use small chunk sizes to force splitting for test content
      const pipelines = PipelineFactory.createStandardPipelines({
        chunkSizes: { preferred: 80, max: 150 },
      });
      const rawContent = {
        source: "test",
        content,
        mimeType,
      };

      // Find the first pipeline that can process this content
      for (const pipeline of pipelines) {
        if (pipeline.canProcess(rawContent)) {
          return await pipeline.process(rawContent, baseOptions);
        }
      }
      throw new Error(`No pipeline found for content type: ${mimeType}`);
    }

    it("should process HTML content with heading hierarchy and markdown conversion", async () => {
      const htmlContent = `
        <h1>Main Title</h1>
        <p>Some paragraph content here.</p>
        <h2>Subsection</h2>
        <p>More content in subsection.</p>
        <table>
          <tr><th>Header</th></tr>
          <tr><td>Data</td></tr>
        </table>
      `;

      const result = await processContent(htmlContent, "text/html");

      // HTML should be converted to markdown and create hierarchical structure
      expect(result.chunks.length).toBeGreaterThan(1);

      // Should have chunks with heading-based hierarchy
      const headingChunks = result.chunks.filter(
        (chunk) => chunk.types.includes("heading") || chunk.section.path.length > 0,
      );
      expect(headingChunks.length).toBeGreaterThan(0);

      // Should convert table to markdown format
      const tableChunks = result.chunks.filter((chunk) => chunk.types.includes("table"));
      if (tableChunks.length > 0) {
        expect(tableChunks[0].content).toMatch(/\|.*\|/); // Markdown table format
      }
    });

    it("should process JavaScript/TypeScript with semantic code boundaries", async () => {
      const jsContent = `
function greet(name) {
  return "Hello, " + name;
}

class Calculator {
  add(a, b) {
    return a + b;
  }

  multiply(a, b) {
    return a * b;
  }
}

const result = greet("World");
console.log(result);
`;

      const result = await processContent(jsContent, "application/javascript");

      // Should split along semantic boundaries (functions, classes)
      expect(result.chunks.length).toBeGreaterThan(1);

      // Should preserve code structure and formatting
      result.chunks.forEach((chunk) => {
        expect(chunk.types).toContain("code");
        // All chunks should have content (including whitespace for perfect reconstruction)
        expect(chunk.content.length).toBeGreaterThan(0);
      });

      // Should maintain perfect reconstruction
      const reconstructed = result.chunks.map((chunk) => chunk.content).join("");
      expect(reconstructed.trim()).toBe(jsContent.trim());
      expect(reconstructed).toContain("add(a, b)");
      expect(reconstructed).toContain("multiply(a, b)");
      expect(reconstructed).toContain('greet("World")');
      expect(reconstructed).toContain("console.log(result)");
    });

    it("should process JSON with structure-aware organization", async () => {
      const jsonContent = JSON.stringify(
        {
          name: "Test Library",
          version: "1.0.0",
          dependencies: {
            lodash: "^4.17.21",
            express: "^4.18.0",
          },
          scripts: {
            build: "webpack --mode production",
            test: "jest",
            start: "node index.js",
          },
          config: {
            database: {
              host: "localhost",
              port: 5432,
              name: "testdb",
            },
          },
        },
        null,
        2,
      );

      const result = await processContent(jsonContent, "application/json");

      // Should handle JSON structure appropriately
      expect(result.chunks.length).toBeGreaterThanOrEqual(1);

      // Should preserve JSON formatting and structure
      result.chunks.forEach((chunk) => {
        expect(chunk.content.trim()).not.toBe("");
        // JSON chunks should be valid when reconstructed
        const reconstructed = result.chunks.map((c) => c.content).join("");
        expect(() => JSON.parse(reconstructed)).not.toThrow();
      });
    });

    it("should process Markdown with content type distinction and hierarchy", async () => {
      const markdownContent = `
# Main Document

This is the introduction paragraph.

## Code Section

Here's some code:

\`\`\`javascript
function example() {
  return "Hello World";
}
\`\`\`

## Data Section

| Name | Value |
|------|-------|
| Item1 | 100 |
| Item2 | 200 |

### Subsection

More detailed content here.
`;

      const result = await processContent(markdownContent, "text/markdown");

      // Should create multiple chunks with different content types
      expect(result.chunks.length).toBeGreaterThan(3);

      // Should distinguish between content types
      const contentTypes = new Set(result.chunks.flatMap((chunk) => chunk.types));
      expect(contentTypes.size).toBeGreaterThan(1); // Should have multiple content types

      // Should create hierarchical paths based on headings
      const hierarchicalChunks = result.chunks.filter(
        (chunk) => chunk.section.path.length > 0,
      );
      expect(hierarchicalChunks.length).toBeGreaterThan(0);

      // Should preserve markdown structure
      const codeChunks = result.chunks.filter((chunk) => chunk.types.includes("code"));
      const tableChunks = result.chunks.filter((chunk) => chunk.types.includes("table"));
      expect(codeChunks.length).toBeGreaterThan(0);
      expect(tableChunks.length).toBeGreaterThan(0);
    });

    it("should process plain text with simple structure and no hierarchy", async () => {
      const textContent = `
This is a plain text document. It has multiple lines and paragraphs.

This is another paragraph with some content. The text should be split appropriately but without any complex structure.

Final paragraph here.
`;

      const result = await processContent(textContent, "text/plain");

      // Should split into chunks but maintain simplicity
      expect(result.chunks.length).toBeGreaterThanOrEqual(1);

      // All chunks should be text type with no hierarchy
      result.chunks.forEach((chunk) => {
        expect(chunk.types).toEqual(["text"]);
        expect(chunk.section.path).toEqual([]); // No hierarchical structure
        expect(chunk.section.level).toBe(0);
      });

      // Should preserve content exactly
      const reconstructed = result.chunks.map((chunk) => chunk.content).join("");
      expect(reconstructed.trim()).toBe(textContent.trim());
    });
  });

  describe("configuration behavior validation", () => {
    const baseOptions = {
      url: "test",
      library: "test",
      version: "1.0.0",
      scrapeMode: ScrapeMode.Fetch,
      ignoreErrors: false,
      maxConcurrency: 1,
    };

    it("should respect semantic boundaries even with small chunk sizes", async () => {
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 50, // Very small
          max: 100,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      const markdownContent = `
# Main Title

## Section One

Content for section one that is longer than the chunk size limit.

## Section Two

More content for section two that also exceeds the small limit.
`;

      const rawContent = {
        source: "test.md",
        content: markdownContent,
        mimeType: "text/markdown",
      };

      // Find markdown pipeline
      const markdownPipeline = pipelines.find((p) => p.canProcess(rawContent));
      expect(markdownPipeline).toBeDefined();

      const result = await markdownPipeline!.process(rawContent, baseOptions);

      // Even with small chunk size, should maintain semantic structure
      const headingChunks = result.chunks.filter((chunk) =>
        chunk.types.includes("heading"),
      );
      expect(headingChunks.length).toBeGreaterThan(0);

      // Should still create proper hierarchy despite size constraints
      const hierarchicalChunks = result.chunks.filter(
        (chunk) => chunk.section.path.length > 0,
      );
      expect(hierarchicalChunks.length).toBeGreaterThan(0);
    });

    it("should preserve logical units in code even with large chunk sizes", async () => {
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 2000, // Large
          max: 4000,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      const codeContent = `
function small() {
  return 1;
}

function another() {
  return 2;
}

class MyClass {
  method1() {
    return "a";
  }

  method2() {
    return "b";
  }
}
`;

      const rawContent = {
        source: "test.js",
        content: codeContent,
        mimeType: "application/javascript",
      };

      const codePipeline = pipelines.find((p) => p.canProcess(rawContent));
      expect(codePipeline).toBeDefined();

      const result = await codePipeline!.process(rawContent, baseOptions);

      // Even with large chunk size allowing everything in one chunk,
      // should still respect logical code boundaries
      expect(result.chunks.length).toBeGreaterThanOrEqual(1);

      // Should maintain code structure
      result.chunks.forEach((chunk) => {
        expect(chunk.types).toContain("code");
        expect(chunk.content.trim()).not.toBe("");
      });
    });

    it("should handle size constraints appropriately across content types", async () => {
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 100,
          max: 200,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      const testCases = [
        { content: "Short text content.", mimeType: "text/plain" },
        {
          content: `{"key": "value", "nested": {"data": "content"}}`,
          mimeType: "application/json",
        },
        {
          content: "function test() { return true; }",
          mimeType: "application/javascript",
        },
      ];

      for (const testCase of testCases) {
        const rawContent = {
          source: "test",
          content: testCase.content,
          mimeType: testCase.mimeType,
        };

        const pipeline = pipelines.find((p) => p.canProcess(rawContent));
        expect(pipeline).toBeDefined();

        const result = await pipeline!.process(rawContent, baseOptions);

        // All should respect the size constraints
        result.chunks.forEach((chunk) => {
          expect(chunk.content.length).toBeLessThanOrEqual(250); // Small buffer for edge cases
        });
      }
    });
  });

  describe("fallback and edge case behavior", () => {
    const baseOptions = {
      url: "test",
      library: "test",
      version: "1.0.0",
      scrapeMode: ScrapeMode.Fetch,
      ignoreErrors: false,
      maxConcurrency: 1,
    };

    it("should reject unknown MIME types - no pipeline should process them", async () => {
      const pipelines = PipelineFactory.createStandardPipelines();

      const unknownContent = {
        source: "test.unknown",
        content: "Some content in an unknown format.",
        mimeType: "application/unknown-format",
      };

      // No pipeline should accept unknown MIME types
      const acceptingPipeline = pipelines.find((p) => p.canProcess(unknownContent));
      expect(acceptingPipeline).toBeUndefined();

      // Verify that each pipeline explicitly rejects it
      pipelines.forEach((pipeline) => {
        expect(pipeline.canProcess(unknownContent)).toBe(false);
      });
    });

    it("should handle invalid JSON as text content", async () => {
      const pipelines = PipelineFactory.createStandardPipelines();

      const invalidJsonContent = {
        source: "test.json",
        content: '{"invalid": json, missing quotes}',
        mimeType: "application/json",
      };

      const jsonPipeline = pipelines.find((p) => p.canProcess(invalidJsonContent));
      expect(jsonPipeline).toBeDefined();

      const result = await jsonPipeline!.process(invalidJsonContent, baseOptions);

      // Should handle gracefully and process as text-like content
      expect(result.chunks.length).toBeGreaterThanOrEqual(1);
      expect(result.metadata.isValidJson).toBe(false);
    });

    it("should maintain content integrity across different processing paths", async () => {
      const pipelines = PipelineFactory.createStandardPipelines();

      const testCases = [
        { content: "<p>HTML content</p>", mimeType: "text/html" },
        { content: "# Markdown content", mimeType: "text/markdown" },
        { content: "function test() {}", mimeType: "application/javascript" },
        { content: "Plain text content", mimeType: "text/plain" },
      ];

      for (const testCase of testCases) {
        const rawContent = {
          source: "test",
          content: testCase.content,
          mimeType: testCase.mimeType,
        };

        const pipeline = pipelines.find((p) => p.canProcess(rawContent));
        expect(pipeline).toBeDefined();

        const result = await pipeline!.process(rawContent, baseOptions);

        // Content should be preserved (allowing for format conversion)
        expect(result.textContent.trim()).not.toBe("");
        expect(result.chunks.length).toBeGreaterThan(0);

        // Should be able to reconstruct meaningful content
        const reconstructed = result.chunks
          .map((chunk) => chunk.content)
          .join("")
          .trim();
        expect(reconstructed).not.toBe("");
      }
    });
  });
});
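To run only this suite locally, Vitest's CLI accepts a filename filter; a minimal invocation, assuming Vitest is installed as a dev dependency in the repository (as the import at the top implies):

npx vitest run PipelineFactory.integration.test.ts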

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'
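The same lookup can also be done programmatically; a minimal sketch in TypeScript using the global fetch available in Node 18+, run inside an ES module (the response shape is not documented here, so the parsed JSON is simply logged as-is):

// Sketch only: assumes the endpoint returns a JSON description of the server.
const res = await fetch("https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server");
if (!res.ok) throw new Error(`Request failed with status ${res.status}`);
const server = await res.json();
console.log(server);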

If you have feedback or need assistance with the MCP directory API, please join our Discord server.