import { describe, expect, it } from "vitest";
import { SPLITTER_MAX_CHUNK_SIZE } from "../utils/config";
import { JsonDocumentSplitter } from "./JsonDocumentSplitter";
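// Shape of the chunks relied on throughout these tests (a sketch inferred from
// the assertions below, not imported from the implementation):
//
//   interface ContentChunk {
//     content: string;
//     section: { level: number; path: string[] };
//   }
//
// Many tests also assert the invariant `section.level === section.path.length`.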
describe("JsonDocumentSplitter", () => {
const splitter = new JsonDocumentSplitter();
describe("concatenation-friendly chunking", () => {
it("should create building-block chunks that concatenate to valid JSON", async () => {
const content = '{"name": "test", "version": "1.0.0"}';
const chunks = await splitter.splitText(content);
// Concatenate all chunks to verify they form valid JSON
const concatenated = chunks.map((c) => c.content).join("\n");
expect(() => JSON.parse(concatenated)).not.toThrow();
// Should have opening brace, two properties, closing brace
expect(chunks.some((c) => c.content.trim() === "{")).toBe(true);
expect(chunks.some((c) => c.content.includes('"name": "test"'))).toBe(true);
expect(chunks.some((c) => c.content.includes('"version": "1.0.0"'))).toBe(true);
expect(
chunks.some((c) => c.content.trim() === "}" || c.content.trim() === "},"),
).toBe(true);
});
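// For intuition, the building blocks for the document above look roughly like
// this (a sketch; the exact whitespace is the splitter's choice):
//   '{'
//   '  "name": "test",'
//   '  "version": "1.0.0"'
//   '}'
// Joining them with "\n" reproduces a parseable JSON document.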
it("should handle comma placement correctly", async () => {
const content = '{"first": "value1", "second": "value2", "third": "value3"}';
const chunks = await splitter.splitText(content);
// Find property chunks
const properties = chunks.filter(
(c) =>
c.content.includes('"first"') ||
c.content.includes('"second"') ||
c.content.includes('"third"'),
);
// First two properties should have trailing commas; the last should not
const firstProp = properties.find((c) => c.content.includes('"first"'));
const secondProp = properties.find((c) => c.content.includes('"second"'));
const thirdProp = properties.find((c) => c.content.includes('"third"'));
expect(firstProp?.content).toContain(",");
expect(secondProp?.content).toContain(",");
expect(thirdProp?.content).not.toContain(",");
});
});
describe("nested structure handling", () => {
it("should create concatenable chunks for nested objects", async () => {
const content = '{"config": {"debug": true, "port": 8080}}';
const chunks = await splitter.splitText(content);
// Should be able to concatenate to valid JSON
const concatenated = chunks.map((c) => c.content).join("\n");
expect(() => JSON.parse(concatenated)).not.toThrow();
// Should have hierarchical structure with proper indentation
expect(chunks.some((c) => c.content.includes('"config": '))).toBe(true);
expect(chunks.some((c) => c.content.includes(' "debug": true'))).toBe(true);
expect(chunks.some((c) => c.content.includes(' "port": 8080'))).toBe(true);
// Verify level/path relationship for nested chunks
const configChunk = chunks.find((c) => c.content.includes('"config":'));
expect(configChunk).toBeDefined();
expect(configChunk!.section.level).toBe(configChunk!.section.path.length);
const debugChunk = chunks.find((c) => c.content.includes('"debug": true'));
expect(debugChunk).toBeDefined();
expect(debugChunk!.section.level).toBe(debugChunk!.section.path.length);
expect(debugChunk!.section.level).toBeGreaterThan(configChunk!.section.level);
});
it("should handle nested arrays correctly", async () => {
const content = '{"items": [1, 2, 3]}';
const chunks = await splitter.splitText(content);
// Should concatenate to valid JSON
const concatenated = chunks.map((c) => c.content).join("\n");
expect(() => JSON.parse(concatenated)).not.toThrow();
// Should have array structure
expect(chunks.some((c) => c.content.includes('"items": '))).toBe(true);
expect(chunks.some((c) => c.content.trim() === "[")).toBe(true);
expect(chunks.some((c) => c.content.includes("1,"))).toBe(true);
expect(
chunks.some((c) => c.content.includes("3") && !c.content.includes("3,")),
).toBe(true); // Last item no comma
expect(
chunks.some((c) => c.content.trim() === "]" || c.content.trim() === "],"),
).toBe(true);
// Verify level/path relationships
chunks.forEach((chunk) => {
expect(chunk.section.level).toBe(chunk.section.path.length);
});
// Test specific path structures for array items
const itemsChunk = chunks.find((c) => c.content.includes('"items":'));
expect(itemsChunk).toBeDefined();
expect(itemsChunk!.section.path).toEqual(["root", "items"]);
expect(itemsChunk!.section.level).toBe(2);
// Find array item chunks by their content and verify exact paths
const firstItemChunk = chunks.find((c) => c.content.includes("1,"));
expect(firstItemChunk).toBeDefined();
expect(firstItemChunk!.section.path).toEqual(["root", "items", "[0]"]);
expect(firstItemChunk!.section.level).toBe(3);
const secondItemChunk = chunks.find((c) => c.content.includes("2,"));
expect(secondItemChunk).toBeDefined();
expect(secondItemChunk!.section.path).toEqual(["root", "items", "[1]"]);
expect(secondItemChunk!.section.level).toBe(3);
const thirdItemChunk = chunks.find(
(c) => c.content.includes("3") && !c.content.includes("3,"),
);
expect(thirdItemChunk).toBeDefined();
expect(thirdItemChunk!.section.path).toEqual(["root", "items", "[2]"]);
expect(thirdItemChunk!.section.level).toBe(3);
});
it("should handle complex arrays with nested objects correctly", async () => {
const content = '{"users": [{"name": "Alice", "age": 30}, {"name": "Bob"}]}';
const chunks = await splitter.splitText(content);
// Should concatenate to valid JSON
const concatenated = chunks.map((c) => c.content).join("\n");
expect(() => JSON.parse(concatenated)).not.toThrow();
// Verify all chunks follow level === path.length rule
chunks.forEach((chunk) => {
expect(chunk.section.level).toBe(chunk.section.path.length);
});
// Test specific array index paths
const aliceNameChunk = chunks.find((c) => c.content.includes('"name": "Alice"'));
expect(aliceNameChunk).toBeDefined();
expect(aliceNameChunk!.section.path).toEqual(["root", "users", "[0]", "name"]);
expect(aliceNameChunk!.section.level).toBe(4);
const aliceAgeChunk = chunks.find((c) => c.content.includes('"age": 30'));
expect(aliceAgeChunk).toBeDefined();
expect(aliceAgeChunk!.section.path).toEqual(["root", "users", "[0]", "age"]);
expect(aliceAgeChunk!.section.level).toBe(4);
const bobNameChunk = chunks.find((c) => c.content.includes('"name": "Bob"'));
expect(bobNameChunk).toBeDefined();
expect(bobNameChunk!.section.path).toEqual(["root", "users", "[1]", "name"]);
expect(bobNameChunk!.section.level).toBe(4);
});
});
describe("path and structure information", () => {
it("should maintain hierarchical path information", async () => {
const content = '{"a": {"b": {"c": "value"}}}';
const chunks = await splitter.splitText(content);
// Check for proper path hierarchy
expect(chunks.some((chunk) => chunk.section.path.includes("a"))).toBe(true);
expect(chunks.some((chunk) => chunk.section.path.includes("b"))).toBe(true);
expect(chunks.some((chunk) => chunk.section.path.includes("c"))).toBe(true);
// Verify level corresponds to path length
chunks.forEach((chunk) => {
expect(chunk.section.level).toBe(chunk.section.path.length);
});
// Find specific chunks and verify their levels
const aChunk = chunks.find(
(chunk) => chunk.section.path.includes("a") && chunk.content.includes('"a":'),
);
expect(aChunk).toBeDefined();
expect(aChunk!.section.path).toEqual(["root", "a"]);
expect(aChunk!.section.level).toBe(2);
const cChunk = chunks.find(
(chunk) =>
chunk.section.path.includes("c") && chunk.content.includes('"c": "value"'),
);
expect(cChunk).toBeDefined();
expect(cChunk!.section.path).toEqual(["root", "a", "b", "c"]);
expect(cChunk!.section.level).toBe(4);
});
it("should provide appropriate level numbers", async () => {
const content = '{"level1": {"level2": "value"}}';
const chunks = await splitter.splitText(content);
const level1Chunks = chunks.filter((chunk) =>
chunk.section.path.includes("level1"),
);
const level2Chunks = chunks.filter((chunk) =>
chunk.section.path.includes("level2"),
);
expect(level1Chunks.some((chunk) => chunk.section.level >= 2)).toBe(true);
expect(level2Chunks.some((chunk) => chunk.section.level >= 3)).toBe(true);
// Verify that level equals path length for all chunks
[...level1Chunks, ...level2Chunks].forEach((chunk) => {
expect(chunk.section.level).toBe(chunk.section.path.length);
});
});
});
describe("edge cases", () => {
it("should handle invalid JSON gracefully", async () => {
const content = '{"invalid": json}';
const chunks = await splitter.splitText(content);
expect(chunks).toHaveLength(1);
expect(chunks[0].section.path).toEqual(["invalid-json"]);
expect(chunks[0].content).toBe(content);
});
it("should handle empty objects", async () => {
const content = "{}";
const chunks = await splitter.splitText(content);
expect(chunks.length).toBeGreaterThan(0);
const concatenated = chunks.map((c) => c.content).join("\n");
expect(() => JSON.parse(concatenated)).not.toThrow();
});
it("should handle empty arrays", async () => {
const content = "[]";
const chunks = await splitter.splitText(content);
expect(chunks.length).toBeGreaterThan(0);
const concatenated = chunks.map((c) => c.content).join("\n");
expect(() => JSON.parse(concatenated)).not.toThrow();
});
it("should handle null values correctly", async () => {
const content = '{"nullable": null}';
const chunks = await splitter.splitText(content);
const concatenated = chunks.map((c) => c.content).join("\n");
expect(() => JSON.parse(concatenated)).not.toThrow();
expect(chunks.some((chunk) => chunk.content.includes("null"))).toBe(true);
});
});
describe("indentation preservation", () => {
it("should maintain proper indentation in nested structures", async () => {
const content = '{"outer": {"inner": "value"}}';
const chunks = await splitter.splitText(content);
// Check for proper indentation levels
expect(chunks.some((c) => c.content.includes('  "inner": "value"'))).toBe(true); // 2-space indent
});
it("should respect preserveFormatting option", async () => {
const splitterNoFormat = new JsonDocumentSplitter({ preserveFormatting: false });
const content = '{"test": "value"}';
const chunks = await splitterNoFormat.splitText(content);
// With formatting disabled, should have minimal whitespace
const hasIndentation = chunks.some((c) => c.content.startsWith(" "));
expect(hasIndentation).toBe(false);
});
});
describe("integration with GreedySplitter", () => {
it("should create chunks that work well with GreedySplitter optimization", async () => {
const { GreedySplitter } = await import("./GreedySplitter");
const jsonSplitter = new JsonDocumentSplitter();
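// The numeric arguments are size thresholds steering how aggressively chunks
// are merged (presumably minimum/preferred/maximum sizes; see GreedySplitter
// for the authoritative signature).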
const greedySplitter = new GreedySplitter(jsonSplitter, 500, 1500, 5000);
const complexJson = {
application: {
name: "Complex Application Configuration",
version: "2.1.0",
services: {
database: {
primary: {
host: "primary-db.example.com",
port: 5432,
ssl: true,
poolSize: 20,
},
replica: {
host: "replica-db.example.com",
port: 5432,
ssl: true,
poolSize: 10,
},
},
cache: {
redis: {
host: "cache.example.com",
port: 6379,
database: 0,
},
},
},
features: {
authentication: true,
authorization: true,
monitoring: true,
logging: {
level: "info",
format: "json",
},
},
},
};
const content = JSON.stringify(complexJson, null, 2);
// Test JsonDocumentSplitter alone
const jsonChunks = await jsonSplitter.splitText(content);
expect(jsonChunks.length).toBeGreaterThan(5); // Should create many small chunks
// Test GreedySplitter optimization
const optimizedChunks = await greedySplitter.splitText(content);
expect(optimizedChunks.length).toBeLessThanOrEqual(jsonChunks.length); // Should consolidate
// Verify concatenation still produces valid JSON
const concatenated = optimizedChunks.map((c) => c.content).join("\n");
const parsed = JSON.parse(concatenated);
expect(parsed).toEqual(complexJson);
// Verify no chunk ended up empty
optimizedChunks.forEach((chunk) => {
expect(chunk.content.length).toBeGreaterThan(0);
});
});
});
describe("chunk size limits", () => {
it("should respect max chunk size when processing deep nested JSON", async () => {
const largeValue = "x".repeat(6000);
const deepJson = {
level1: {
level2: {
level3: {
level4: {
level5: {
level6: {
largeData: largeValue,
},
},
},
},
},
},
};
const chunks = await splitter.splitText(JSON.stringify(deepJson, null, 2));
chunks.forEach((chunk) => {
expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
});
});
it("should respect max chunk size when exceeding maxChunks limit", async () => {
const largeJson: Record<string, unknown> = {};
for (let i = 0; i < 100; i++) {
largeJson[`property${i}`] = "x".repeat(6000);
}
const limitedSplitter = new JsonDocumentSplitter({ maxChunks: 50 });
const chunks = await limitedSplitter.splitText(JSON.stringify(largeJson, null, 2));
chunks.forEach((chunk) => {
expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
});
const hasTextSplitterChunks = chunks.some(
(c) => c.section.level === 0 && c.section.path.length === 0,
);
expect(hasTextSplitterChunks).toBe(true);
});
it("should handle very large single JSON values at max depth", async () => {
const veryLargeValue = "y".repeat(15000);
const json = {
level1: {
level2: {
level3: {
level4: {
level5: {
level6: {
hugeData: veryLargeValue,
moreData: "additional data",
},
},
},
},
},
},
};
const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 3 });
const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));
chunks.forEach((chunk) => {
expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
});
expect(chunks.length).toBeGreaterThan(1);
});
it("should handle array with large values at max depth", async () => {
const largeValue = "z".repeat(6000);
const json = {
level1: {
level2: {
level3: {
level4: [largeValue, largeValue, largeValue],
},
},
},
};
const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 3 });
const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));
chunks.forEach((chunk) => {
expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
});
});
it("should split oversized primitive properties before hitting max depth", async () => {
const largeValue = "x".repeat(6000);
const json = {
level1: {
largeProp: largeValue,
smallProp: "ok",
},
};
const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 3 });
const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));
chunks.forEach((chunk) => {
expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
});
expect(chunks.length).toBeGreaterThan(4);
});
it("should split oversized primitive array items before hitting max depth", async () => {
const largeValue = "y".repeat(6000);
const json = {
level1: {
items: [largeValue, "small"],
},
};
const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 3 });
const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));
chunks.forEach((chunk) => {
expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
});
expect(chunks.length).toBeGreaterThan(5);
});
it("should keep small chunks well below the max when not at depth limit", async () => {
const json = {
config: {
database: {
host: "localhost",
port: 5432,
name: "mydb",
},
cache: {
enabled: true,
ttl: 3600,
},
},
};
const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 5 });
const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));
chunks.forEach((chunk) => {
expect(chunk.content.length).toBeLessThan(200);
});
const hasJsonStructure = chunks.some((c) => c.section.level > 0);
expect(hasJsonStructure).toBe(true);
});
});
describe("depth limiting", () => {
it("should stop chunking at maxDepth and serialize remaining content as text", async () => {
const deepJson = {
level1: {
level2: {
level3: {
level4: {
level5: {
level6: {
deepValue: "this should be serialized as text",
},
},
},
},
},
},
};
const splitter = new JsonDocumentSplitter({ maxDepth: 3 });
const chunks = await splitter.splitText(JSON.stringify(deepJson, null, 2));
// Should have chunks for levels 1-3, then serialize the rest as text
const pathDepths = chunks.map((c) => c.section.path.length);
const maxPathDepth = Math.max(...pathDepths);
// Max path depth should not exceed maxDepth + some buffer for structure chunks
expect(maxPathDepth).toBeLessThanOrEqual(5);
// Should find a chunk that contains the deeply nested content serialized as text
const textSerializedChunk = chunks.find(
(c) =>
c.content.includes("level4") &&
c.content.includes("level5") &&
c.content.includes("level6") &&
c.content.includes("deepValue"),
);
expect(textSerializedChunk).toBeDefined();
// Verify concatenation still produces valid JSON
const concatenated = chunks.map((c) => c.content).join("\n");
const parsed = JSON.parse(concatenated);
expect(parsed).toEqual(deepJson);
});
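// Illustrative shape of depth-limited output (a sketch; exact formatting is up
// to the splitter): level1 through level3 get structural chunks of their own,
// while the whole subtree from level4 down is emitted as a single
// serialized-text chunk, so concatenation still round-trips through JSON.parse.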
it("should handle depth limit with arrays", async () => {
const deepArrayJson = {
level1: [
{
level2: [
{
level3: [
{
level4: [
{
level5: "deep value",
},
],
},
],
},
],
},
],
};
const splitter = new JsonDocumentSplitter({ maxDepth: 3 });
const chunks = await splitter.splitText(JSON.stringify(deepArrayJson, null, 2));
// Verify that deep content is serialized
const hasSerializedDeepContent = chunks.some(
(c) => c.content.includes("level4") && c.content.includes("level5"),
);
expect(hasSerializedDeepContent).toBe(true);
// Verify concatenation still produces valid JSON
const concatenated = chunks.map((c) => c.content).join("\n");
const parsed = JSON.parse(concatenated);
expect(parsed).toEqual(deepArrayJson);
});
it("should use default maxDepth when not specified", async () => {
const { JSON_MAX_NESTING_DEPTH } = await import("../utils/config");
// Create JSON with depth exceeding the default
let deepJson: Record<string, unknown> = { value: "leaf" };
for (let i = 0; i < JSON_MAX_NESTING_DEPTH + 3; i++) {
deepJson = { [`level${i}`]: deepJson };
}
const splitter = new JsonDocumentSplitter();
const chunks = await splitter.splitText(JSON.stringify(deepJson, null, 2));
// Should have chunks but not excessive amounts
expect(chunks.length).toBeGreaterThan(0);
expect(chunks.length).toBeLessThan(100); // Reasonable upper bound
// Verify concatenation produces valid JSON
const concatenated = chunks.map((c) => c.content).join("\n");
const parsed = JSON.parse(concatenated);
expect(parsed).toEqual(deepJson);
});
it("should not serialize primitives and shallow structures as text", async () => {
const shallowJson = {
level1: {
level2: {
value: "normal value",
number: 42,
bool: true,
},
},
};
const splitter = new JsonDocumentSplitter({ maxDepth: 5 });
const chunks = await splitter.splitText(JSON.stringify(shallowJson, null, 2));
// All value chunks should be individual, not serialized together
const valueChunk = chunks.find((c) =>
c.content.includes('"value": "normal value"'),
);
expect(valueChunk).toBeDefined();
expect(valueChunk?.content).not.toContain("number");
expect(valueChunk?.content).not.toContain("bool");
});
});
describe("chunk count limiting", () => {
it("should fall back to text splitting when maxChunks is exceeded", async () => {
// Create a JSON with many properties that will exceed the limit
const largeJson: Record<string, unknown> = {};
for (let i = 0; i < 100; i++) {
largeJson[`property${i}`] = {
subProperty1: `value${i}a`,
subProperty2: `value${i}b`,
subProperty3: `value${i}c`,
};
}
const splitter = new JsonDocumentSplitter({ maxChunks: 50 });
const chunks = await splitter.splitText(JSON.stringify(largeJson, null, 2));
// Should fall back to text splitting, resulting in fewer chunks
expect(chunks.length).toBeLessThanOrEqual(100); // Far fewer than the ~600+ chunks full JSON splitting would create
// Verify the chunks don't have the fine-grained JSON structure
// Text splitter uses level 0 and empty path
const hasTextSplitterChunks = chunks.some(
(c) => c.section.level === 0 && c.section.path.length === 0,
);
expect(hasTextSplitterChunks).toBe(true);
});
it("should not fall back when under maxChunks limit", async () => {
const moderateJson: Record<string, unknown> = {};
for (let i = 0; i < 10; i++) {
moderateJson[`property${i}`] = `value${i}`;
}
const splitter = new JsonDocumentSplitter({ maxChunks: 100 });
const chunks = await splitter.splitText(JSON.stringify(moderateJson, null, 2));
// Should use JSON splitting (level > 0 and non-empty paths)
const hasJsonSplitterChunks = chunks.every(
(c) => c.section.level > 0 || c.section.path.length > 0,
);
expect(hasJsonSplitterChunks).toBe(true);
// Verify concatenation produces valid JSON
const concatenated = chunks.map((c) => c.content).join("\n");
const parsed = JSON.parse(concatenated);
expect(parsed).toEqual(moderateJson);
});
it("should use default maxChunks when not specified", async () => {
const { JSON_MAX_CHUNKS } = await import("../utils/config");
// Create a moderately sized JSON that won't exceed the default limit
const json: Record<string, string> = {};
for (let i = 0; i < 50; i++) {
json[`prop${i}`] = `value${i}`;
}
const splitter = new JsonDocumentSplitter();
const chunks = await splitter.splitText(JSON.stringify(json, null, 2));
// Should be well under the default limit
expect(chunks.length).toBeLessThan(JSON_MAX_CHUNKS);
// Should still use JSON splitting
const hasJsonSplitterChunks = chunks.some((c) => c.section.path.includes("root"));
expect(hasJsonSplitterChunks).toBe(true);
});
});
describe("combined depth and chunk limiting", () => {
it("should handle both depth and chunk limits together", async () => {
// Create JSON that is both deep and wide
const complexJson: Record<string, unknown> = {};
for (let i = 0; i < 20; i++) {
complexJson[`branch${i}`] = {
level1: {
level2: {
level3: {
level4: {
level5: {
deepValue: `value${i}`,
},
},
},
},
},
};
}
const splitter = new JsonDocumentSplitter({ maxDepth: 3, maxChunks: 200 });
const chunks = await splitter.splitText(JSON.stringify(complexJson, null, 2));
// Should limit depth to prevent excessive nesting
const pathDepths = chunks.map((c) => c.section.path.length);
const maxPathDepth = Math.max(...pathDepths);
expect(maxPathDepth).toBeLessThanOrEqual(6); // Some buffer for structure
// Should have reasonable chunk count (may fall back to text splitting)
expect(chunks.length).toBeLessThanOrEqual(250);
// Verify concatenation produces valid JSON
const concatenated = chunks.map((c) => c.content).join("\n");
const parsed = JSON.parse(concatenated);
expect(parsed).toEqual(complexJson);
});
});
});