JsonDocumentSplitter.test.ts (26.5 kB)
import { describe, expect, it } from "vitest";
import { SPLITTER_MAX_CHUNK_SIZE } from "../utils/config";
import { JsonDocumentSplitter } from "./JsonDocumentSplitter";

describe("JsonDocumentSplitter", () => {
  const splitter = new JsonDocumentSplitter();

  describe("concatenation-friendly chunking", () => {
    it("should create building-block chunks that concatenate to valid JSON", async () => {
      const content = '{"name": "test", "version": "1.0.0"}';
      const chunks = await splitter.splitText(content);

      // Concatenate all chunks to verify they form valid JSON
      const concatenated = chunks.map((c) => c.content).join("\n");
      expect(() => JSON.parse(concatenated)).not.toThrow();

      // Should have opening brace, two properties, closing brace
      expect(chunks.some((c) => c.content.trim() === "{")).toBe(true);
      expect(chunks.some((c) => c.content.includes('"name": "test"'))).toBe(true);
      expect(chunks.some((c) => c.content.includes('"version": "1.0.0"'))).toBe(true);
      expect(
        chunks.some((c) => c.content.trim() === "}" || c.content.trim() === "},"),
      ).toBe(true);
    });

    it("should handle comma placement correctly", async () => {
      const content = '{"first": "value1", "second": "value2", "third": "value3"}';
      const chunks = await splitter.splitText(content);

      // Find property chunks
      const properties = chunks.filter(
        (c) =>
          c.content.includes('"first"') ||
          c.content.includes('"second"') ||
          c.content.includes('"third"'),
      );

      // First two properties should have commas, last should not
      const firstProp = properties.find((c) => c.content.includes('"first"'));
      const thirdProp = properties.find((c) => c.content.includes('"third"'));
      expect(firstProp?.content).toContain(",");
      expect(thirdProp?.content).not.toContain(",");
    });
  });

  describe("nested structure handling", () => {
    it("should create concatenable chunks for nested objects", async () => {
      const content = '{"config": {"debug": true, "port": 8080}}';
      const chunks = await splitter.splitText(content);

      // Should be able to concatenate to valid JSON
      const concatenated = chunks.map((c) => c.content).join("\n");
      expect(() => JSON.parse(concatenated)).not.toThrow();

      // Should have hierarchical structure with proper indentation
      expect(chunks.some((c) => c.content.includes('"config": '))).toBe(true);
      expect(chunks.some((c) => c.content.includes('  "debug": true'))).toBe(true);
      expect(chunks.some((c) => c.content.includes('  "port": 8080'))).toBe(true);

      // Verify level/path relationship for nested chunks
      const configChunk = chunks.find((c) => c.content.includes('"config":'));
      expect(configChunk).toBeDefined();
      expect(configChunk!.section.level).toBe(configChunk!.section.path.length);

      const debugChunk = chunks.find((c) => c.content.includes('"debug": true'));
      expect(debugChunk).toBeDefined();
      expect(debugChunk!.section.level).toBe(debugChunk!.section.path.length);
      expect(debugChunk!.section.level).toBeGreaterThan(configChunk!.section.level);
    });

    it("should handle nested arrays correctly", async () => {
      const content = '{"items": [1, 2, 3]}';
      const chunks = await splitter.splitText(content);

      // Should concatenate to valid JSON
      const concatenated = chunks.map((c) => c.content).join("\n");
      expect(() => JSON.parse(concatenated)).not.toThrow();

      // Should have array structure
      expect(chunks.some((c) => c.content.includes('"items": '))).toBe(true);
      expect(chunks.some((c) => c.content.trim() === "[")).toBe(true);
      expect(chunks.some((c) => c.content.includes("1,"))).toBe(true);
      expect(
        chunks.some((c) => c.content.includes("3") && !c.content.includes("3,")),
      ).toBe(true); // Last item no comma
      expect(
        chunks.some((c) => c.content.trim() === "]" || c.content.trim() === "],"),
      ).toBe(true);

      // Verify level/path relationships
      chunks.forEach((chunk) => {
        expect(chunk.section.level).toBe(chunk.section.path.length);
      });

      // Test specific path structures for array items
      const itemsChunk = chunks.find((c) => c.content.includes('"items":'));
      expect(itemsChunk).toBeDefined();
      expect(itemsChunk!.section.path).toEqual(["root", "items"]);
      expect(itemsChunk!.section.level).toBe(2);

      // Find array item chunks by their content and verify exact paths
      const firstItemChunk = chunks.find((c) => c.content.includes("1,"));
      expect(firstItemChunk).toBeDefined();
      expect(firstItemChunk!.section.path).toEqual(["root", "items", "[0]"]);
      expect(firstItemChunk!.section.level).toBe(3);

      const secondItemChunk = chunks.find((c) => c.content.includes("2,"));
      expect(secondItemChunk).toBeDefined();
      expect(secondItemChunk!.section.path).toEqual(["root", "items", "[1]"]);
      expect(secondItemChunk!.section.level).toBe(3);

      const thirdItemChunk = chunks.find(
        (c) => c.content.includes("3") && !c.content.includes("3,"),
      );
      expect(thirdItemChunk).toBeDefined();
      expect(thirdItemChunk!.section.path).toEqual(["root", "items", "[2]"]);
      expect(thirdItemChunk!.section.level).toBe(3);
    });

    it("should handle complex arrays with nested objects correctly", async () => {
      const content = '{"users": [{"name": "Alice", "age": 30}, {"name": "Bob"}]}';
      const chunks = await splitter.splitText(content);

      // Should concatenate to valid JSON
      const concatenated = chunks.map((c) => c.content).join("\n");
      expect(() => JSON.parse(concatenated)).not.toThrow();

      // Verify all chunks follow level === path.length rule
      chunks.forEach((chunk) => {
        expect(chunk.section.level).toBe(chunk.section.path.length);
      });

      // Test specific array index paths
      const aliceNameChunk = chunks.find((c) => c.content.includes('"name": "Alice"'));
      expect(aliceNameChunk).toBeDefined();
      expect(aliceNameChunk!.section.path).toEqual(["root", "users", "[0]", "name"]);
      expect(aliceNameChunk!.section.level).toBe(4);

      const aliceAgeChunk = chunks.find((c) => c.content.includes('"age": 30'));
      expect(aliceAgeChunk).toBeDefined();
      expect(aliceAgeChunk!.section.path).toEqual(["root", "users", "[0]", "age"]);
      expect(aliceAgeChunk!.section.level).toBe(4);

      const bobNameChunk = chunks.find((c) => c.content.includes('"name": "Bob"'));
      expect(bobNameChunk).toBeDefined();
      expect(bobNameChunk!.section.path).toEqual(["root", "users", "[1]", "name"]);
      expect(bobNameChunk!.section.level).toBe(4);
    });
  });

  describe("path and structure information", () => {
    it("should maintain hierarchical path information", async () => {
      const content = '{"a": {"b": {"c": "value"}}}';
      const chunks = await splitter.splitText(content);

      // Check for proper path hierarchy
      expect(chunks.some((chunk) => chunk.section.path.includes("a"))).toBe(true);
      expect(chunks.some((chunk) => chunk.section.path.includes("b"))).toBe(true);
      expect(chunks.some((chunk) => chunk.section.path.includes("c"))).toBe(true);

      // Verify level corresponds to path length
      chunks.forEach((chunk) => {
        expect(chunk.section.level).toBe(chunk.section.path.length);
      });

      // Find specific chunks and verify their levels
      const aChunk = chunks.find(
        (chunk) => chunk.section.path.includes("a") && chunk.content.includes('"a":'),
      );
      expect(aChunk).toBeDefined();
      expect(aChunk!.section.path).toEqual(["root", "a"]);
      expect(aChunk!.section.level).toBe(2);

      const cChunk = chunks.find(
        (chunk) =>
          chunk.section.path.includes("c") && chunk.content.includes('"c": "value"'),
      );
      expect(cChunk).toBeDefined();
      expect(cChunk!.section.path).toEqual(["root", "a", "b", "c"]);
      expect(cChunk!.section.level).toBe(4);
    });

    it("should provide appropriate level numbers", async () => {
      const content = '{"level1": {"level2": "value"}}';
      const chunks = await splitter.splitText(content);

      const level1Chunks = chunks.filter((chunk) =>
        chunk.section.path.includes("level1"),
      );
      const level2Chunks = chunks.filter((chunk) =>
        chunk.section.path.includes("level2"),
      );

      expect(level1Chunks.some((chunk) => chunk.section.level >= 2)).toBe(true);
      expect(level2Chunks.some((chunk) => chunk.section.level >= 3)).toBe(true);

      // Verify that level equals path length for all chunks
      [...level1Chunks, ...level2Chunks].forEach((chunk) => {
        expect(chunk.section.level).toBe(chunk.section.path.length);
      });
    });
  });

  describe("edge cases", () => {
    it("should handle invalid JSON gracefully", async () => {
      const content = '{"invalid": json}';
      const chunks = await splitter.splitText(content);

      expect(chunks).toHaveLength(1);
      expect(chunks[0].section.path).toEqual(["invalid-json"]);
      expect(chunks[0].content).toBe(content);
    });

    it("should handle empty objects", async () => {
      const content = "{}";
      const chunks = await splitter.splitText(content);

      expect(chunks.length).toBeGreaterThan(0);
      const concatenated = chunks.map((c) => c.content).join("\n");
      expect(() => JSON.parse(concatenated)).not.toThrow();
    });

    it("should handle empty arrays", async () => {
      const content = "[]";
      const chunks = await splitter.splitText(content);

      expect(chunks.length).toBeGreaterThan(0);
      const concatenated = chunks.map((c) => c.content).join("\n");
      expect(() => JSON.parse(concatenated)).not.toThrow();
    });

    it("should handle null values correctly", async () => {
      const content = '{"nullable": null}';
      const chunks = await splitter.splitText(content);

      const concatenated = chunks.map((c) => c.content).join("\n");
      expect(() => JSON.parse(concatenated)).not.toThrow();
      expect(chunks.some((chunk) => chunk.content.includes("null"))).toBe(true);
    });
  });

  describe("indentation preservation", () => {
    it("should maintain proper indentation in nested structures", async () => {
      const content = '{"outer": {"inner": "value"}}';
      const chunks = await splitter.splitText(content);

      // Check for proper indentation levels
      expect(chunks.some((c) => c.content.includes('  "inner": "value"'))).toBe(true); // 2-space indent
    });

    it("should respect preserveFormatting option", async () => {
      const splitterNoFormat = new JsonDocumentSplitter({ preserveFormatting: false });
      const content = '{"test": "value"}';
      const chunks = await splitterNoFormat.splitText(content);

      // With formatting disabled, should have minimal whitespace
      const hasIndentation = chunks.some((c) => c.content.startsWith(" "));
      expect(hasIndentation).toBe(false);
    });
  });

  describe("integration with GreedySplitter", () => {
    it("should create chunks that work well with GreedySplitter optimization", async () => {
      const { GreedySplitter } = await import("./GreedySplitter");
      const jsonSplitter = new JsonDocumentSplitter();
      const greedySplitter = new GreedySplitter(jsonSplitter, 500, 1500, 5000);

      const complexJson = {
        application: {
          name: "Complex Application Configuration",
          version: "2.1.0",
          services: {
            database: {
              primary: {
                host: "primary-db.example.com",
                port: 5432,
                ssl: true,
                poolSize: 20,
              },
              replica: {
                host: "replica-db.example.com",
                port: 5432,
                ssl: true,
                poolSize: 10,
              },
            },
            cache: {
              redis: {
                host: "cache.example.com",
                port: 6379,
                database: 0,
              },
            },
          },
          features: {
            authentication: true,
            authorization: true,
            monitoring: true,
            logging: {
              level: "info",
              format: "json",
            },
          },
        },
      };
      const content = JSON.stringify(complexJson, null, 2);

      // Test JsonDocumentSplitter alone
      const jsonChunks = await jsonSplitter.splitText(content);
      expect(jsonChunks.length).toBeGreaterThan(5); // Should create many small chunks

      // Test GreedySplitter optimization
      const optimizedChunks = await greedySplitter.splitText(content);
      expect(optimizedChunks.length).toBeLessThanOrEqual(jsonChunks.length); // Should consolidate

      // Verify concatenation still produces valid JSON
      const concatenated = optimizedChunks.map((c) => c.content).join("\n");
      const parsed = JSON.parse(concatenated);
      expect(parsed).toEqual(complexJson);

      // Verify chunks are reasonably sized
      optimizedChunks.forEach((chunk) => {
        expect(chunk.content.length).toBeGreaterThan(0);
      });
    });
  });

  describe("chunk size limits", () => {
    it("should respect max chunk size when processing deep nested JSON", async () => {
      const largeValue = "x".repeat(6000);
      const deepJson = {
        level1: {
          level2: {
            level3: {
              level4: {
                level5: {
                  level6: {
                    largeData: largeValue,
                  },
                },
              },
            },
          },
        },
      };

      const chunks = await splitter.splitText(JSON.stringify(deepJson, null, 2));

      chunks.forEach((chunk) => {
        expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
      });
    });

    it("should respect max chunk size when exceeding maxChunks limit", async () => {
      const largeJson: Record<string, unknown> = {};
      for (let i = 0; i < 100; i++) {
        largeJson[`property${i}`] = "x".repeat(6000);
      }

      const limitedSplitter = new JsonDocumentSplitter({ maxChunks: 50 });
      const chunks = await limitedSplitter.splitText(JSON.stringify(largeJson, null, 2));

      chunks.forEach((chunk) => {
        expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
      });

      const hasTextSplitterChunks = chunks.some(
        (c) => c.section.level === 0 && c.section.path.length === 0,
      );
      expect(hasTextSplitterChunks).toBe(true);
    });

    it("should handle very large single JSON values at max depth", async () => {
      const veryLargeValue = "y".repeat(15000);
      const json = {
        level1: {
          level2: {
            level3: {
              level4: {
                level5: {
                  level6: {
                    hugeData: veryLargeValue,
                    moreData: "additional data",
                  },
                },
              },
            },
          },
        },
      };

      const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 3 });
      const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));

      chunks.forEach((chunk) => {
        expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
      });
      expect(chunks.length).toBeGreaterThan(1);
    });

    it("should handle array with large values at max depth", async () => {
      const largeValue = "z".repeat(6000);
      const json = {
        level1: {
          level2: {
            level3: {
              level4: [largeValue, largeValue, largeValue],
            },
          },
        },
      };

      const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 3 });
      const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));

      chunks.forEach((chunk) => {
        expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
      });
    });

    it("should split oversized primitive properties before hitting max depth", async () => {
      const largeValue = "x".repeat(6000);
      const json = {
        level1: {
          largeProp: largeValue,
          smallProp: "ok",
        },
      };

      const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 3 });
      const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));

      chunks.forEach((chunk) => {
        expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
      });
      expect(chunks.length).toBeGreaterThan(4);
    });

    it("should split oversized primitive array items before hitting max depth", async () => {
      const largeValue = "y".repeat(6000);
      const json = {
        level1: {
          items: [largeValue, "small"],
        },
      };

      const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 3 });
      const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));

      chunks.forEach((chunk) => {
        expect(chunk.content.length).toBeLessThanOrEqual(SPLITTER_MAX_CHUNK_SIZE);
      });
      expect(chunks.length).toBeGreaterThan(5);
    });

    it("should keep small chunks well below the max when not at depth limit", async () => {
      const json = {
        config: {
          database: {
            host: "localhost",
            port: 5432,
            name: "mydb",
          },
          cache: {
            enabled: true,
            ttl: 3600,
          },
        },
      };

      const limitedSplitter = new JsonDocumentSplitter({ maxDepth: 5 });
      const chunks = await limitedSplitter.splitText(JSON.stringify(json, null, 2));

      chunks.forEach((chunk) => {
        expect(chunk.content.length).toBeLessThan(200);
      });

      const hasJsonStructure = chunks.some((c) => c.section.level > 0);
      expect(hasJsonStructure).toBe(true);
    });
  });

  describe("depth limiting", () => {
    it("should stop chunking at maxDepth and serialize remaining content as text", async () => {
      const deepJson = {
        level1: {
          level2: {
            level3: {
              level4: {
                level5: {
                  level6: {
                    deepValue: "this should be serialized as text",
                  },
                },
              },
            },
          },
        },
      };

      const splitter = new JsonDocumentSplitter({ maxDepth: 3 });
      const chunks = await splitter.splitText(JSON.stringify(deepJson, null, 2));

      // Should have chunks for levels 1-3, then serialize the rest as text
      const pathDepths = chunks.map((c) => c.section.path.length);
      const maxPathDepth = Math.max(...pathDepths);

      // Max path depth should not exceed maxDepth + some buffer for structure chunks
      expect(maxPathDepth).toBeLessThanOrEqual(5);

      // Should find a chunk that contains the deeply nested content serialized as text
      const textSerializedChunk = chunks.find(
        (c) =>
          c.content.includes("level4") &&
          c.content.includes("level5") &&
          c.content.includes("level6") &&
          c.content.includes("deepValue"),
      );
      expect(textSerializedChunk).toBeDefined();

      // Verify concatenation still produces valid JSON
      const concatenated = chunks.map((c) => c.content).join("\n");
      const parsed = JSON.parse(concatenated);
      expect(parsed).toEqual(deepJson);
    });

    it("should handle depth limit with arrays", async () => {
      const deepArrayJson = {
        level1: [
          {
            level2: [
              {
                level3: [
                  {
                    level4: [
                      {
                        level5: "deep value",
                      },
                    ],
                  },
                ],
              },
            ],
          },
        ],
      };

      const splitter = new JsonDocumentSplitter({ maxDepth: 3 });
      const chunks = await splitter.splitText(JSON.stringify(deepArrayJson, null, 2));

      // Verify that deep content is serialized
      const hasSerializedDeepContent = chunks.some(
        (c) => c.content.includes("level4") && c.content.includes("level5"),
      );
      expect(hasSerializedDeepContent).toBe(true);

      // Verify concatenation still produces valid JSON
      const concatenated = chunks.map((c) => c.content).join("\n");
      const parsed = JSON.parse(concatenated);
      expect(parsed).toEqual(deepArrayJson);
    });

    it("should use default maxDepth when not specified", async () => {
      const { JSON_MAX_NESTING_DEPTH } = await import("../utils/config");

      // Create JSON with depth exceeding the default
      let deepJson: any = { value: "leaf" };
      for (let i = 0; i < JSON_MAX_NESTING_DEPTH + 3; i++) {
        deepJson = { [`level${i}`]: deepJson };
      }

      const splitter = new JsonDocumentSplitter();
      const chunks = await splitter.splitText(JSON.stringify(deepJson, null, 2));

      // Should have chunks but not excessive amounts
      expect(chunks.length).toBeGreaterThan(0);
      expect(chunks.length).toBeLessThan(100); // Reasonable upper bound

      // Verify concatenation produces valid JSON
      const concatenated = chunks.map((c) => c.content).join("\n");
      const parsed = JSON.parse(concatenated);
      expect(parsed).toEqual(deepJson);
    });

    it("should not serialize primitives and shallow structures as text", async () => {
      const shallowJson = {
        level1: {
          level2: {
            value: "normal value",
            number: 42,
            bool: true,
          },
        },
      };

      const splitter = new JsonDocumentSplitter({ maxDepth: 5 });
      const chunks = await splitter.splitText(JSON.stringify(shallowJson, null, 2));

      // All value chunks should be individual, not serialized together
      const valueChunk = chunks.find((c) =>
        c.content.includes('"value": "normal value"'),
      );
      expect(valueChunk).toBeDefined();
      expect(valueChunk?.content).not.toContain("number");
      expect(valueChunk?.content).not.toContain("bool");
    });
  });

  describe("chunk count limiting", () => {
    it("should fall back to text splitting when maxChunks is exceeded", async () => {
      // Create a JSON with many properties that will exceed the limit
      const largeJson: Record<string, any> = {};
      for (let i = 0; i < 100; i++) {
        largeJson[`property${i}`] = {
          subProperty1: `value${i}a`,
          subProperty2: `value${i}b`,
          subProperty3: `value${i}c`,
        };
      }

      const splitter = new JsonDocumentSplitter({ maxChunks: 50 });
      const chunks = await splitter.splitText(JSON.stringify(largeJson, null, 2));

      // Should fall back to text splitting, resulting in fewer chunks
      expect(chunks.length).toBeLessThanOrEqual(100); // Much less than the ~600+ it would create

      // Verify the chunks don't have the fine-grained JSON structure
      // Text splitter uses level 0 and empty path
      const hasTextSplitterChunks = chunks.some(
        (c) => c.section.level === 0 && c.section.path.length === 0,
      );
      expect(hasTextSplitterChunks).toBe(true);
    });

    it("should not fall back when under maxChunks limit", async () => {
      const moderateJson: Record<string, any> = {};
      for (let i = 0; i < 10; i++) {
        moderateJson[`property${i}`] = `value${i}`;
      }

      const splitter = new JsonDocumentSplitter({ maxChunks: 100 });
      const chunks = await splitter.splitText(JSON.stringify(moderateJson, null, 2));

      // Should use JSON splitting (level > 0 and non-empty paths)
      const hasJsonSplitterChunks = chunks.every(
        (c) => c.section.level > 0 || c.section.path.length > 0,
      );
      expect(hasJsonSplitterChunks).toBe(true);

      // Verify concatenation produces valid JSON
      const concatenated = chunks.map((c) => c.content).join("\n");
      const parsed = JSON.parse(concatenated);
      expect(parsed).toEqual(moderateJson);
    });

    it("should use default maxChunks when not specified", async () => {
      const { JSON_MAX_CHUNKS } = await import("../utils/config");

      // Create a moderately sized JSON that won't exceed default limit
      const json: Record<string, string> = {};
      for (let i = 0; i < 50; i++) {
        json[`prop${i}`] = `value${i}`;
      }

      const splitter = new JsonDocumentSplitter();
      const chunks = await splitter.splitText(JSON.stringify(json, null, 2));

      // Should be well under the default limit
      expect(chunks.length).toBeLessThan(JSON_MAX_CHUNKS);

      // Should still use JSON splitting
      const hasJsonSplitterChunks = chunks.some((c) => c.section.path.includes("root"));
      expect(hasJsonSplitterChunks).toBe(true);
    });
  });

  describe("combined depth and chunk limiting", () => {
    it("should handle both depth and chunk limits together", async () => {
      // Create JSON that is both deep and wide
      const complexJson: Record<string, any> = {};
      for (let i = 0; i < 20; i++) {
        complexJson[`branch${i}`] = {
          level1: {
            level2: {
              level3: {
                level4: {
                  level5: {
                    deepValue: `value${i}`,
                  },
                },
              },
            },
          },
        };
      }

      const splitter = new JsonDocumentSplitter({ maxDepth: 3, maxChunks: 200 });
      const chunks = await splitter.splitText(JSON.stringify(complexJson, null, 2));

      // Should limit depth to prevent excessive nesting
      const pathDepths = chunks.map((c) => c.section.path.length);
      const maxPathDepth = Math.max(...pathDepths);
      expect(maxPathDepth).toBeLessThanOrEqual(6); // Some buffer for structure

      // Should have reasonable chunk count (may fall back to text splitting)
      expect(chunks.length).toBeLessThanOrEqual(250);

      // Verify concatenation produces valid JSON
      const concatenated = chunks.map((c) => c.content).join("\n");
      const parsed = JSON.parse(concatenated);
      expect(parsed).toEqual(complexJson);
    });
  });
});
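For context, the contract these tests exercise can be read end to end in a short usage sketch. The ContentChunk shape below is an assumption inferred from the assertions above (a content string plus a section carrying path and level, with level === path.length); only JsonDocumentSplitter and its splitText method appear in the file itself.

// Minimal usage sketch. The ContentChunk interface is inferred from the
// assertions in this test file, not copied from the library's own types.
import { JsonDocumentSplitter } from "./JsonDocumentSplitter";

interface ContentChunk {
  content: string; // a concatenation-friendly building block
  section: {
    path: string[]; // e.g. ["root", "items", "[0]"]
    level: number; // invariant asserted above: level === path.length
  };
}

async function demo(): Promise<void> {
  const splitter = new JsonDocumentSplitter();
  const chunks = await splitter.splitText('{"config": {"debug": true, "port": 8080}}');

  // Joining the chunks in order restores valid JSON.
  const reassembled = chunks.map((c: ContentChunk) => c.content).join("\n");
  console.log(JSON.parse(reassembled)); // { config: { debug: true, port: 8080 } }
}

demo().catch(console.error);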
