PipelineFactory.integration.test.ts
import { describe, expect, it } from "vitest";
import { FetchStatus, type RawContent } from "../fetcher";
import { ScrapeMode } from "../types";
import { type PipelineConfiguration, PipelineFactory } from "./PipelineFactory";

describe("PipelineFactory Integration", () => {
  describe("configuration propagation", () => {
    it("should propagate custom chunk sizes through process method", async () => {
      // Create pipelines with custom configuration
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 100, // Very small for testing
          max: 200,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      // Create content that would definitely exceed the custom chunk size
      const longContent =
        "This is a test sentence that is long enough to be split.\n".repeat(10); // ~570 characters with newlines

      // Test with TextPipeline (last pipeline, universal fallback)
      const textPipeline = pipelines[4]; // TextPipeline

      // Create mock RawContent for the process method
      const rawContent: RawContent = {
        source: "test.txt",
        content: longContent,
        mimeType: "text/plain",
        status: FetchStatus.SUCCESS,
      };

      const scraperOptions = {
        url: "test.txt",
        library: "test",
        version: "1.0.0",
        scrapeMode: ScrapeMode.Fetch,
        ignoreErrors: false,
        maxConcurrency: 1,
      };

      const processed = await textPipeline.process(rawContent, scraperOptions);

      // Verify that chunks are smaller due to custom configuration
      // With 570 characters and 100 char preferred size, should be multiple chunks
      expect(processed.chunks?.length).toBeGreaterThan(1); // Should be split into multiple chunks
      processed.chunks?.forEach((chunk) => {
        expect(chunk.content.length).toBeGreaterThan(0);
        // Should be much smaller than default 1500
        expect(chunk.content.length).toBeLessThan(300);
      });
    });

    it("should use default chunk sizes when no configuration provided", async () => {
      const pipelines = PipelineFactory.createStandardPipelines();

      // Create moderate content that would fit in default chunks
      const moderateContent = "This is a test sentence. ".repeat(10); // ~250 characters

      // Test with TextPipeline
      const textPipeline = pipelines[4];

      const rawContent: RawContent = {
        source: "test.txt",
        content: moderateContent,
        mimeType: "text/plain",
        status: FetchStatus.SUCCESS,
      };

      const scraperOptions = {
        url: "test.txt",
        library: "test",
        version: "1.0.0",
        scrapeMode: ScrapeMode.Fetch,
        ignoreErrors: false,
        maxConcurrency: 1,
      };

      const processed = await textPipeline.process(rawContent, scraperOptions);

      // With default chunk size (1500), this should fit in one chunk
      expect(processed.chunks?.length).toBe(1);
      expect(processed.chunks?.[0].content?.length).toBeLessThan(300);
    });

    it("should handle different pipeline types with custom configuration", async () => {
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 300,
          max: 600,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      // Test each pipeline
      const testContent = "This is a test content that might be split.\n".repeat(10); // ~450 characters

      for (const pipeline of pipelines) {
        const rawContent: RawContent = {
          source: "test.txt",
          content: testContent,
          mimeType: "text/plain",
          status: FetchStatus.SUCCESS,
        };

        const scraperOptions = {
          url: "test.txt",
          library: "test",
          version: "1.0.0",
          scrapeMode: ScrapeMode.Fetch,
          ignoreErrors: false,
          maxConcurrency: 1,
        };

        const processed = await pipeline.process(rawContent, scraperOptions);

        expect(processed.chunks?.length).toBeGreaterThanOrEqual(1);
        // Verify each chunk respects the configuration
        processed.chunks?.forEach((chunk) => {
          expect(chunk.content.length).toBeGreaterThan(0);
          // Allow some flexibility for splitting logic, but ensure it's not wildly large
          expect(chunk.content.length).toBeLessThan(800);
        });
      }
    });
  });

  describe("content type processing behavior", () => {
    const baseOptions = {
      url: "test",
      library: "test",
      version: "1.0.0",
      scrapeMode: ScrapeMode.Fetch,
      ignoreErrors: false,
      maxConcurrency: 1,
    };

    // Helper function to find and process content with the first matching pipeline
    async function processContent(content: string, mimeType: string) {
      // Use small chunk sizes to force splitting for test content
      const pipelines = PipelineFactory.createStandardPipelines({
        chunkSizes: { preferred: 80, max: 150 },
      });
      const rawContent: RawContent = {
        source: "test",
        content,
        mimeType,
        status: FetchStatus.SUCCESS,
      };

      // Find the first pipeline that can process this content
      const contentBuffer = Buffer.from(content);
      for (const pipeline of pipelines) {
        if (pipeline.canProcess(mimeType, contentBuffer)) {
          return await pipeline.process(rawContent, baseOptions);
        }
      }
      throw new Error(`No pipeline found for content type: ${mimeType}`);
    }

    it("should process HTML content with heading hierarchy and markdown conversion", async () => {
      const htmlContent = `
        <h1>Main Title</h1>
        <p>Some paragraph content here.</p>
        <h2>Subsection</h2>
        <p>More content in subsection.</p>
        <table>
          <tr><th>Header</th></tr>
          <tr><td>Data</td></tr>
        </table>
      `;

      const result = await processContent(htmlContent, "text/html");

      // HTML should be converted to markdown and create chunks
      expect(result.chunks?.length).toBeGreaterThanOrEqual(1);

      // Should have chunks with heading-based hierarchy
      const headingChunks = result.chunks?.filter(
        (chunk) => chunk.types.includes("heading") || chunk.section.path.length > 0,
      );
      expect(headingChunks?.length).toBeGreaterThan(0);

      // Should convert table to markdown format
      const tableChunks = result.chunks?.filter((chunk) => chunk.types.includes("table"));
      expect(tableChunks?.[0].content).toMatch(/\|.*\|/); // Markdown table format

      // Verify no single chunk exceeds max size (150 with buffer)
      result.chunks?.forEach((chunk) => {
        expect(chunk.content.length).toBeLessThanOrEqual(200);
      });
    });

    it("should process JavaScript/TypeScript with semantic code boundaries", async () => {
      const jsContent = `
function greet(name) {
  return "Hello, " + name;
}

class Calculator {
  add(a, b) {
    return a + b;
  }

  multiply(a, b) {
    return a * b;
  }
}

const result = greet("World");
console.log(result);
      `;

      const result = await processContent(jsContent, "application/javascript");

      // Should split along semantic boundaries (functions, classes)
      expect(result.chunks?.length).toBeGreaterThan(1);

      // Should preserve code structure and formatting
      result.chunks?.forEach((chunk) => {
        expect(chunk.types).toContain("code");
        // All chunks should have content (including whitespace for perfect reconstruction)
        expect(chunk.content.length).toBeGreaterThan(0);
      });

      // Should maintain perfect reconstruction
      const reconstructed = result.chunks?.map((chunk) => chunk.content).join("");
      expect(reconstructed?.trim()).toBe(jsContent.trim());
      expect(reconstructed).toContain("add(a, b)");
      expect(reconstructed).toContain("multiply(a, b)");
      expect(reconstructed).toContain('greet("World")');
      expect(reconstructed).toContain("console.log(result)");
    });

    it("should process JSON with structure-aware organization", async () => {
      const jsonContent = JSON.stringify(
        {
          name: "Test Library",
          version: "1.0.0",
          dependencies: {
            lodash: "^4.17.21",
            express: "^4.18.0",
          },
          scripts: {
            build: "webpack --mode production",
            test: "jest",
            start: "node index.js",
          },
          config: {
            database: {
              host: "localhost",
              port: 5432,
              name: "testdb",
            },
          },
        },
        null,
        2,
      );

      const result = await processContent(jsonContent, "application/json");

      // Should handle JSON structure appropriately
      expect(result.chunks?.length).toBeGreaterThanOrEqual(1);

      // Should preserve JSON formatting and structure
      result.chunks?.forEach((chunk) => {
        expect(chunk.content.trim()).not.toBe("");
        // JSON chunks should be valid when reconstructed
        const reconstructed = result.chunks?.map((c) => c.content).join("");
        expect(() => JSON.parse(reconstructed || "")).not.toThrow();
      });
    });

    it("should process Markdown with content type distinction and hierarchy", async () => {
      const markdownContent = `
# Main Document

This is the introduction paragraph.

## Code Section

Here's some code:

\`\`\`javascript
function example() {
  return "Hello World";
}
\`\`\`

## Data Section

| Name | Value |
|------|-------|
| Item1 | 100 |
| Item2 | 200 |

### Subsection

More detailed content here.
      `;

      const result = await processContent(markdownContent, "text/markdown");

      // Should create chunks (quantity depends on greedy merging logic)
      expect(result.chunks?.length).toBeGreaterThanOrEqual(1);

      // Should distinguish between content types
      const contentTypes = new Set(result.chunks?.flatMap((chunk) => chunk.types));
      expect(contentTypes.size).toBeGreaterThan(1); // Should have multiple content types

      // Should create hierarchical paths based on headings
      const hierarchicalChunks = result.chunks?.filter(
        (chunk) => chunk.section.path.length > 0,
      );
      expect(hierarchicalChunks?.length).toBeGreaterThan(0);

      // Should preserve markdown structure
      const codeChunks = result.chunks?.filter((chunk) => chunk.types.includes("code"));
      const tableChunks = result.chunks?.filter((chunk) => chunk.types.includes("table"));
      expect(codeChunks?.length).toBeGreaterThan(0);
      expect(tableChunks?.length).toBeGreaterThan(0);

      // Verify no single chunk exceeds max size (150 with buffer)
      result.chunks?.forEach((chunk) => {
        expect(chunk.content.length).toBeLessThanOrEqual(200);
      });
    });

    it("should process plain text with simple structure and no hierarchy", async () => {
      const textContent = `
This is a plain text document.
It has multiple lines and paragraphs.

This is another paragraph with some content.
The text should be split appropriately but without any complex structure.

Final paragraph here.
      `;

      const result = await processContent(textContent, "text/plain");

      // Should split into chunks but maintain simplicity
      expect(result.chunks?.length).toBeGreaterThanOrEqual(1);

      // All chunks should be text type with no hierarchy
      result.chunks?.forEach((chunk) => {
        expect(chunk.types).toEqual(["text"]);
        expect(chunk.section.path).toEqual([]); // No hierarchical structure
        expect(chunk.section.level).toBe(0);
      });

      // Should preserve content exactly
      const reconstructed = result.chunks?.map((chunk) => chunk.content).join("");
      expect(reconstructed?.trim()).toBe(textContent.trim());
    });
  });

  describe("configuration behavior validation", () => {
    const baseOptions = {
      url: "test",
      library: "test",
      version: "1.0.0",
      scrapeMode: ScrapeMode.Fetch,
      ignoreErrors: false,
      maxConcurrency: 1,
    };

    it("should respect semantic boundaries even with small chunk sizes", async () => {
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 50, // Very small
          max: 100,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      const markdownContent = `
# Main Title

## Section One

Content for section one that is longer than the chunk size limit.

## Section Two

More content for section two that also exceeds the small limit.
      `;

      const rawContent: RawContent = {
        source: "test.md",
        content: markdownContent,
        mimeType: "text/markdown",
        status: FetchStatus.SUCCESS,
      };

      // Find markdown pipeline
      const contentBuffer = Buffer.from(markdownContent);
      const markdownPipeline = pipelines.find((p) =>
        p.canProcess(rawContent.mimeType, contentBuffer),
      );
      expect(markdownPipeline).toBeDefined();

      const result = await markdownPipeline!.process(rawContent, baseOptions);

      // Even with small chunk size, should maintain semantic structure
      const headingChunks = result.chunks?.filter((chunk) =>
        chunk.types.includes("heading"),
      );
      expect(headingChunks?.length).toBeGreaterThan(0);

      // Should still create proper hierarchy despite size constraints
      const hierarchicalChunks = result.chunks?.filter(
        (chunk) => chunk.section.path.length > 0,
      );
      expect(hierarchicalChunks?.length).toBeGreaterThan(0);
    });

    it("should preserve logical units in code even with large chunk sizes", async () => {
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 2000, // Large
          max: 4000,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      const codeContent = `
function small() {
  return 1;
}

function another() {
  return 2;
}

class MyClass {
  method1() {
    return "a";
  }

  method2() {
    return "b";
  }
}
      `;

      const rawContent: RawContent = {
        source: "test.js",
        content: codeContent,
        mimeType: "application/javascript",
        status: FetchStatus.SUCCESS,
      };

      const contentBuffer = Buffer.from(codeContent);
      const codePipeline = pipelines.find((p) =>
        p.canProcess(rawContent.mimeType, contentBuffer),
      );
      expect(codePipeline).toBeDefined();

      const result = await codePipeline!.process(rawContent, baseOptions);

      // Even with large chunk size allowing everything in one chunk,
      // should still respect logical code boundaries
      expect(result.chunks?.length).toBeGreaterThanOrEqual(1);

      // Should maintain code structure
      result.chunks?.forEach((chunk) => {
        expect(chunk.types).toContain("code");
        expect(chunk.content.trim()).not.toBe("");
      });
    });

    it("should handle size constraints appropriately across content types", async () => {
      const config: PipelineConfiguration = {
        chunkSizes: {
          preferred: 100,
          max: 200,
        },
      };
      const pipelines = PipelineFactory.createStandardPipelines(config);

      const testCases = [
        { content: "Short text content.", mimeType: "text/plain" },
        {
          content: `{"key": "value", "nested": {"data": "content"}}`,
          mimeType: "application/json",
        },
        {
          content: "function test() { return true; }",
          mimeType: "application/javascript",
        },
      ];

      for (const testCase of testCases) {
        const rawContent: RawContent = {
          source: "test",
          content: testCase.content,
          mimeType: testCase.mimeType,
          status: FetchStatus.SUCCESS,
        };

        const contentBuffer = Buffer.from(testCase.content);
        const pipeline = pipelines.find((p) =>
          p.canProcess(rawContent.mimeType, contentBuffer),
        );
        expect(pipeline).toBeDefined();

        const result = await pipeline!.process(rawContent, baseOptions);

        // All should respect the size constraints
        result.chunks?.forEach((chunk) => {
          expect(chunk.content.length).toBeLessThanOrEqual(250); // Small buffer for edge cases
        });
      }
    });
  });

  describe("fallback and edge case behavior", () => {
    const baseOptions = {
      url: "test",
      library: "test",
      version: "1.0.0",
      scrapeMode: ScrapeMode.Fetch,
      ignoreErrors: false,
      maxConcurrency: 1,
    };

    it("should reject unknown MIME types - no pipeline should process them", async () => {
      const pipelines = PipelineFactory.createStandardPipelines();

      const unknownContent = {
        source: "test.unknown",
        content: "Some content in an unknown format.",
        mimeType: "application/unknown-format",
      };

      // No pipeline should accept unknown MIME types
      const contentBuffer = Buffer.from(unknownContent.content);
      const acceptingPipeline = pipelines.find((p) =>
        p.canProcess(unknownContent.mimeType, contentBuffer),
      );
      expect(acceptingPipeline).toBeUndefined();

      // Verify that each pipeline explicitly rejects it
      pipelines.forEach((pipeline) => {
        expect(pipeline.canProcess(unknownContent.mimeType, contentBuffer)).toBe(false);
      });
    });

    it("should handle invalid JSON as text content", async () => {
      const pipelines = PipelineFactory.createStandardPipelines();

      const invalidJsonContent: RawContent = {
        source: "test.json",
        content: '{"invalid": json, missing quotes}',
        mimeType: "application/json",
        status: FetchStatus.SUCCESS,
      };

      const contentBuffer = Buffer.from(invalidJsonContent.content);
      const jsonPipeline = pipelines.find((p) =>
        p.canProcess(invalidJsonContent.mimeType, contentBuffer),
      );
      expect(jsonPipeline).toBeDefined();

      const result = await jsonPipeline!.process(invalidJsonContent, baseOptions);

      // Should handle gracefully and process as text-like content
      expect(result.chunks?.length).toBeGreaterThanOrEqual(1);
      // expect(result.metadata.isValidJson).toBe(false);
    });

    it("should maintain content integrity across different processing paths", async () => {
      const pipelines = PipelineFactory.createStandardPipelines();

      const testCases = [
        { content: "<p>HTML content</p>", mimeType: "text/html" },
        { content: "# Markdown content", mimeType: "text/markdown" },
        { content: "function test() {}", mimeType: "application/javascript" },
        { content: "Plain text content", mimeType: "text/plain" },
      ];

      for (const testCase of testCases) {
        const rawContent: RawContent = {
          source: "test",
          content: testCase.content,
          mimeType: testCase.mimeType,
          status: FetchStatus.SUCCESS,
        };

        const contentBuffer = Buffer.from(testCase.content);
        const pipeline = pipelines.find((p) =>
          p.canProcess(rawContent.mimeType, contentBuffer),
        );
        expect(pipeline).toBeDefined();

        const result = await pipeline!.process(rawContent, baseOptions);

        // Content should be preserved (allowing for format conversion)
        expect(result.textContent?.trim()).not.toBe("");
        expect(result.chunks?.length).toBeGreaterThan(0);

        // Should be able to reconstruct meaningful content
        const reconstructed = result.chunks
          ?.map((chunk) => chunk.content)
          .join("")
          .trim();
        expect(reconstructed).not.toBe("");
      }
    });
  });
});
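For reference, a minimal sketch of how the pipelines exercised above might be driven outside of a test, using only the APIs the tests already demonstrate (createStandardPipelines, canProcess, process). The function name chunkDocument, the chunk sizes, and the source/url values are illustrative assumptions, not part of the test suite:

import { FetchStatus, type RawContent } from "../fetcher";
import { ScrapeMode } from "../types";
import { PipelineFactory } from "./PipelineFactory";

// Hypothetical driver mirroring the processContent() helper in the tests:
// route one document through the first pipeline that accepts its MIME type.
async function chunkDocument(content: string, mimeType: string) {
  const pipelines = PipelineFactory.createStandardPipelines({
    chunkSizes: { preferred: 1500, max: 3000 }, // illustrative; 1500 is the default preferred size noted in the tests
  });
  const rawContent: RawContent = {
    source: "example.md", // illustrative source name
    content,
    mimeType,
    status: FetchStatus.SUCCESS,
  };
  const buffer = Buffer.from(content);
  const pipeline = pipelines.find((p) => p.canProcess(mimeType, buffer));
  if (!pipeline) {
    throw new Error(`No pipeline found for content type: ${mimeType}`);
  }
  return pipeline.process(rawContent, {
    url: "example.md",
    library: "example",
    version: "1.0.0",
    scrapeMode: ScrapeMode.Fetch,
    ignoreErrors: false,
    maxConcurrency: 1,
  });
}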
