de en es ja ko ru zh

docs-mcp-server

by arabold

TypeScript

MIT License

542

676

Overview InspectNew Endpoints Schema Related Servers Reviews Score

Need Help? View Source Code Report Issue

GreedySplitter.ts•6.27 kB

import type { ContentChunk, DocumentSplitter, SectionContentType } from "./types";

/**
 * Takes small document chunks and greedily concatenates them into larger, more
 * meaningful units while preserving document structure and semantic boundaries.
 *
 * This approach improves embedding quality by:
 * - Maintaining context by keeping related content together
 * - Respecting natural document breaks at major section boundaries (H1/H2)
 * - Ensuring chunks are large enough to capture meaningful relationships
 * - Preventing chunks from becoming too large for effective embedding
 */
export class GreedySplitter implements DocumentSplitter {
  private readonly baseSplitter: DocumentSplitter;
  private readonly minChunkSize: number;
  private readonly preferredChunkSize: number;

  /**
   * Combines a base document splitter with size constraints to produce
   * optimally-sized chunks. The base splitter handles the initial semantic
   * splitting, while this class handles the concatenation strategy.
   *
   * @param baseSplitter Splitter that produces the initial fine-grained chunks.
   * @param minChunkSize Content length (in characters) a chunk must reach before
   *   a major section boundary is allowed to end it.
   * @param preferredChunkSize Upper bound (in characters) for the content of a
   *   merged chunk, including the newline separators inserted while merging.
   */
  constructor(
    baseSplitter: DocumentSplitter,
    minChunkSize: number,
    preferredChunkSize: number,
  ) {
    this.baseSplitter = baseSplitter;
    this.minChunkSize = minChunkSize;
    this.preferredChunkSize = preferredChunkSize;
  }

  /**
   * Uses a greedy concatenation strategy to build optimally-sized chunks. Small
   * chunks are combined until they reach the minimum size, but splits are
   * preserved at major section boundaries to maintain document structure.
   *
   * A chunk emitted by the base splitter is never split further here, so a base
   * chunk that is already longer than `preferredChunkSize` is passed through
   * as-is. Chunks returned by the base splitter are not mutated.
   *
   * @param markdown Text forwarded unchanged to the base splitter.
   * @param contentType Optional content type forwarded to the base splitter.
   * @returns The concatenated chunks, in document order.
   */
  async splitText(markdown: string, contentType?: string): Promise<ContentChunk[]> {
    const initialChunks = await this.baseSplitter.splitText(markdown, contentType);
    const concatenatedChunks: ContentChunk[] = [];
    let currentChunk: ContentChunk | null = null;

    for (const nextChunk of initialChunks) {
      if (!currentChunk) {
        // Work on a copy so the base splitter's chunk is never modified.
        currentChunk = this.cloneChunk(nextChunk);
        continue;
      }

      const reachedMajorBoundary =
        currentChunk.content.length >= this.minChunkSize &&
        this.startsNewMajorSection(nextChunk);

      if (this.wouldExceedMaxSize(currentChunk, nextChunk) || reachedMajorBoundary) {
        concatenatedChunks.push(currentChunk);
        currentChunk = this.cloneChunk(nextChunk);
        continue;
      }

      currentChunk.content += `${this.separatorAfter(currentChunk.content)}${nextChunk.content}`;
      currentChunk.section = this.mergeSectionInfo(currentChunk, nextChunk);
      currentChunk.types = this.mergeTypes(currentChunk.types, nextChunk.types);
    }

    if (currentChunk) {
      concatenatedChunks.push(currentChunk);
    }

    return concatenatedChunks;
  }

  /**
   * Creates an independent copy of a chunk (fresh `types` and `section.path`
   * arrays) so later merging cannot alter the original.
   */
  private cloneChunk(chunk: ContentChunk): ContentChunk {
    return {
      types: [...chunk.types],
      content: chunk.content,
      section: {
        level: chunk.section.level,
        path: [...chunk.section.path],
      },
    };
  }

  /**
   * Returns the separator inserted between `content` and the chunk appended to
   * it: a single newline unless `content` already ends with one. Shared by the
   * merge step and the size check so both always agree on the merged length.
   */
  private separatorAfter(content: string): string {
    return content.endsWith("\n") ? "" : "\n";
  }

  /**
   * H1 and H2 headings represent major conceptual breaks in the document.
   * Preserving these splits helps maintain the document's logical structure.
   */
  private startsNewMajorSection(chunk: ContentChunk): boolean {
    return chunk.section.level === 1 || chunk.section.level === 2;
  }

  /**
   * Size limit check: reports whether appending `nextChunk` to `currentChunk`
   * would produce content longer than `preferredChunkSize`. The newline that
   * the merge step may insert between the two chunks is counted as well, so a
   * merged chunk never ends up one character over the limit.
   *
   * @returns `false` when there is no current chunk to append to.
   */
  private wouldExceedMaxSize(
    currentChunk: ContentChunk | null,
    nextChunk: ContentChunk,
  ): boolean {
    if (!currentChunk) {
      return false;
    }
    const mergedLength =
      currentChunk.content.length +
      this.separatorAfter(currentChunk.content).length +
      nextChunk.content.length;
    return mergedLength > this.preferredChunkSize;
  }

  /**
   * Checks if `parentPath` is a strict prefix of `childPath`, indicating a
   * parent-child relationship. Equal paths are not considered included.
   */
  private isPathIncluded(parentPath: string[], childPath: string[]): boolean {
    if (parentPath.length >= childPath.length) return false;
    return parentPath.every((part, i) => part === childPath[i]);
  }

  /**
   * Merges section metadata when concatenating chunks, following these rules:
   * 1. Level: always uses the lowest (most general) level between chunks.
   * 2. Path selection:
   *    - For identical sections, keeps the current section unchanged.
   *    - For parent-child relationships (one path is a strict prefix of the
   *      other), uses the child's path.
   *    - For siblings/unrelated sections, uses the common parent path.
   *    - If no common path exists, uses the root path (`[]`).
   *
   * Returned paths are copies, so the result never shares an array with
   * `nextChunk`.
   */
  private mergeSectionInfo(
    currentChunk: ContentChunk,
    nextChunk: ContentChunk,
  ): ContentChunk["section"] {
    const currentSection = currentChunk.section;
    const nextSection = nextChunk.section;

    // Always use the lowest level.
    const level = Math.min(currentSection.level, nextSection.level);

    // If sections are exactly equal, preserve all metadata.
    if (
      currentSection.level === nextSection.level &&
      currentSection.path.length === nextSection.path.length &&
      currentSection.path.every((part, i) => part === nextSection.path[i])
    ) {
      return currentSection;
    }

    // One path contains the other: keep the more specific (child) path.
    if (this.isPathIncluded(currentSection.path, nextSection.path)) {
      return { path: [...nextSection.path], level };
    }
    if (this.isPathIncluded(nextSection.path, currentSection.path)) {
      return { path: [...currentSection.path], level };
    }

    // Siblings or unrelated sections: fall back to the shared ancestor path.
    return {
      path: this.findCommonPrefix(currentSection.path, nextSection.path),
      level,
    };
  }

  /**
   * Returns the union of both type lists without duplicates, keeping
   * first-seen order.
   */
  private mergeTypes(
    currentTypes: SectionContentType[],
    nextTypes: SectionContentType[],
  ): SectionContentType[] {
    return [...new Set([...currentTypes, ...nextTypes])];
  }

  /**
   * Returns the longest common prefix between two paths as a new array.
   */
  private findCommonPrefix(path1: string[], path2: string[]): string[] {
    const common: string[] = [];
    for (const [i, part] of path1.entries()) {
      // Also stops once `path2` is exhausted, since `path2[i]` is then undefined.
      if (part !== path2[i]) {
        break;
      }
      common.push(part);
    }
    return common;
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.