Skip to main content
Glama

docs-mcp-server

TreesitterSourceCodeSplitter.ts15.2 kB
/** * Tree-sitter based source code splitter. * * Provides semantic parsing and chunking of source code using tree-sitter ASTs. * Maintains compatibility with existing chunk patterns and hierarchical structure, * while providing robust parsing for JavaScript, TypeScript, JSX, and TSX files. */ import { SPLITTER_MAX_CHUNK_SIZE } from "../../utils"; import { TextContentSplitter } from "../splitters/TextContentSplitter"; import type { ContentChunk, DocumentSplitter } from "../types"; import { LanguageParserRegistry } from "./LanguageParserRegistry"; import type { CodeBoundary, LanguageParser } from "./parsers/types"; /** * Configuration options for tree-sitter source code splitting */ export interface TreesitterSourceCodeSplitterOptions { /** Maximum size for individual chunks before delegating to TextSplitter */ maxChunkSize?: number; } /** * Tree-sitter based source code splitter that provides semantic parsing * while maintaining compatibility with existing chunk patterns */ export class TreesitterSourceCodeSplitter implements DocumentSplitter { private readonly textContentSplitter: TextContentSplitter; private readonly registry: LanguageParserRegistry; private readonly options: Required<TreesitterSourceCodeSplitterOptions>; constructor(options: TreesitterSourceCodeSplitterOptions = {}) { this.options = { maxChunkSize: options.maxChunkSize ?? SPLITTER_MAX_CHUNK_SIZE, }; // Initialize registry and text content splitter this.registry = new LanguageParserRegistry(); this.textContentSplitter = new TextContentSplitter({ chunkSize: this.options.maxChunkSize, }); } async splitText(content: string, contentType?: string): Promise<ContentChunk[]> { if (!content.trim()) { return []; } // Try to get a parser for this content type const parser = this.getParserForContent(contentType); if (!parser) { // Fall back to TextContentSplitter for unsupported languages return this.fallbackToTextSplitter(content); } try { // Parse the source code const parseResult = parser.parse(content); if (parseResult.hasErrors) { console.warn( `Tree-sitter parsing had errors for ${contentType}, but continuing with partial results`, ); } // Extract simplified boundaries for chunking const boundaries = parser.extractBoundaries(parseResult.tree, content); if (boundaries.length === 0) { // No semantic boundaries found, fall back to text splitter return this.fallbackToTextSplitter(content); } // Build hierarchical relationships between boundaries const hierarchicalBoundaries = this.buildBoundaryHierarchy(boundaries); // Direct conversion to chunks (parser is responsible for suppressing unwanted nested boundaries) return await this.boundariesToChunks(hierarchicalBoundaries, content, contentType); } catch (error) { // Graceful fallback to TextContentSplitter on any parsing error console.warn( "TreesitterSourceCodeSplitter failed, falling back to TextContentSplitter:", error, ); return this.fallbackToTextSplitter(content); } } /** * Helper method to fall back to TextContentSplitter and convert results to ContentChunk[] */ private async fallbackToTextSplitter(content: string): Promise<ContentChunk[]> { const textChunks = await this.textContentSplitter.split(content); return textChunks.map((chunk) => ({ types: ["code"], content: chunk, section: { level: 0, path: [], }, })); } /** * Get the appropriate parser for the given content type */ private getParserForContent(contentType?: string): LanguageParser | undefined { if (!contentType) { return undefined; } // Try to find parser by MIME type first let parser = this.registry.getParserByMimeType(contentType); if (parser) { return parser; } // Try to extract file extension from content type const extensionMatch = contentType.match(/\\.([a-zA-Z]+)$/); if (extensionMatch) { const extension = `.${extensionMatch[1]}`; parser = this.registry.getParserByExtension(extension); if (parser) { return parser; } } // Check for common patterns in content type if (contentType.includes("javascript") || contentType.includes("typescript")) { return this.registry.getParser("typescript"); } if (contentType.includes("jsx") || contentType.includes("tsx")) { // Unified TypeScript parser also handles JSX/TSX return this.registry.getParser("typescript"); } return undefined; } /** * Check if the content type is supported */ isSupportedContentType(contentType?: string): boolean { return this.getParserForContent(contentType) !== undefined; } /** * Get the list of supported languages */ getSupportedLanguages(): string[] { return this.registry.getSupportedLanguages(); } /** * Get the list of supported file extensions */ getSupportedExtensions(): string[] { return this.registry.getSupportedExtensions(); } /** * Get the list of supported MIME types */ getSupportedMimeTypes(): string[] { return this.registry.getSupportedMimeTypes(); } /** * Helper method to split content using TextContentSplitter only if needed * and create ContentChunks with the specified hierarchical path and level */ private async splitContentIntoChunks( content: string, path: string[], level: number, ): Promise<ContentChunk[]> { // Preserve whitespace-only content if it fits within chunk size (for perfect reconstruction) // Only skip if content is completely empty if (content.length === 0) { return []; } // Only apply TextContentSplitter if content exceeds max chunk size if (content.length <= this.options.maxChunkSize) { // Content is small enough, return as single chunk preserving original formatting return [ { types: ["code"] as const, content, section: { level, path, }, }, ]; } // Content is too large, use TextContentSplitter to break it down const textChunks = await this.textContentSplitter.split(content); // Convert text chunks to ContentChunks with semantic context return textChunks.map((textChunk) => ({ types: ["code"] as const, content: textChunk, section: { level, path, }, })); } /** * Convert boundaries to chunks. * Algorithm: * - Collect line breakpoints: file start, each boundary start, each boundary end+1, file end+1 * - Create linear segments between breakpoints (each line appears exactly once) * - Determine containing (innermost) boundary for each segment for path/level * - First segment belonging to a structural boundary => structural chunk; subsequent segments demoted to content * - Universal max size enforcement: any segment > maxChunkSize is further split via TextContentSplitter * - No heuristic de-noising or whitespace merging; reconstruction is guaranteed by preserving order + exact text */ private async boundariesToChunks( boundaries: CodeBoundary[], content: string, _contentType?: string, ): Promise<ContentChunk[]> { const lines = content.split("\n"); const totalLines = lines.length; if (boundaries.length === 0) { // No boundaries found, use TextContentSplitter on entire content const subChunks = await this.splitContentIntoChunks(content, [], 0); return subChunks; } // NOTE: Removed previous adjustment that forcibly shifted the first boundary // to line 1 when only whitespace preceded it. That logic rewrote startLine // and broke documentation merging tests (expected doc start line). // We preserve original boundary start lines now. // Step 1: Collect all boundary points (start and end+1 for exclusive ranges) const boundaryPoints = new Set<number>(); boundaryPoints.add(1); // Always start from line 1 boundaryPoints.add(totalLines + 1); // Always end after last line for (const boundary of boundaries) { boundaryPoints.add(boundary.startLine); boundaryPoints.add(boundary.endLine + 1); // +1 for exclusive end } // Step 2: Sort points to create segments const sortedPoints = Array.from(boundaryPoints).sort((a, b) => a - b); // Step 3: Create segments between consecutive points (collect first, don't process yet) interface TextSegment { startLine: number; endLine: number; content: string; containingBoundary?: CodeBoundary; } const segments: TextSegment[] = []; for (let i = 0; i < sortedPoints.length - 1; i++) { const startLine = sortedPoints[i]; const endLine = sortedPoints[i + 1] - 1; // Convert back to inclusive end // Skip empty segments if (startLine > endLine || startLine > totalLines) { continue; } // Extract content for this segment const segmentLines = lines.slice(startLine - 1, Math.min(endLine, totalLines)); // Convert to 0-indexed and clamp let segmentContent = segmentLines.join("\n"); // Add trailing newline for all segments except the last one (for perfect reconstruction) if (endLine < totalLines) { segmentContent += "\n"; } if (segmentContent.length === 0) { continue; // Skip empty segments } // Determine which boundary this segment belongs to (innermost containing boundary) const containingBoundary = this.findContainingBoundary( startLine, endLine, boundaries, ); segments.push({ startLine, endLine, content: segmentContent, containingBoundary, }); } // Step 4: Convert segments directly to chunks (whitespace retained verbatim) const chunks: ContentChunk[] = []; // Ensure only ONE structural chunk is emitted per structural boundary. const structuralBoundaryFirstChunk = new Set<CodeBoundary>(); // Accumulate whitespace-only segments and prepend to next non-whitespace segment // to avoid emitting standalone whitespace chunks (tests assert no empty-trim chunks) let pendingWhitespace = ""; for (const segment of segments) { if (segment.content.trim() === "") { pendingWhitespace += segment.content; continue; } // Assign path and level based on containing boundary let path: string[]; let level: number; const boundary = segment.containingBoundary; if (boundary) { // Use the boundary's hierarchical path and level path = boundary.path || [boundary.name || "unnamed"]; level = boundary.level || path.length; } else { // No containing boundary, this is global code path = []; level = 0; } // Determine initial structural classification let isStructural = boundary?.boundaryType === "structural"; if (isStructural && boundary) { if (structuralBoundaryFirstChunk.has(boundary)) { // Demote subsequent segments of the same structural boundary isStructural = false; } else { structuralBoundaryFirstChunk.add(boundary); } } // If segment is too large, delegate to TextContentSplitter const segmentChunks = await this.splitContentIntoChunks( segment.content, path, level, ); // Overwrite types based on structural vs content classification (always include "code" for backward compatibility) for (const c of segmentChunks) { // Prepend any accumulated whitespace to the FIRST chunk emitted for this segment if (pendingWhitespace) { c.content = pendingWhitespace + c.content; pendingWhitespace = ""; } c.types = isStructural ? ["code", "structural"] : ["code"]; } chunks.push(...segmentChunks); } // If file ended with whitespace-only content, append it to last chunk (preserve reconstructability) if (pendingWhitespace && chunks.length > 0) { chunks[chunks.length - 1].content += pendingWhitespace; } return chunks; } /** * Build hierarchical relationships between boundaries based on containment */ private buildBoundaryHierarchy(boundaries: CodeBoundary[]): CodeBoundary[] { // Create a copy of boundaries to avoid mutating the original const hierarchicalBoundaries = boundaries.map((b) => ({ ...b })); // Build parent-child relationships for (let i = 0; i < hierarchicalBoundaries.length; i++) { const boundary = hierarchicalBoundaries[i]; let parent: CodeBoundary | undefined; let smallestRange = Infinity; // Find the smallest containing parent for (let j = 0; j < hierarchicalBoundaries.length; j++) { if (i === j) continue; const candidate = hierarchicalBoundaries[j]; // Check if candidate contains boundary if ( candidate.startLine <= boundary.startLine && candidate.endLine >= boundary.endLine && candidate.startByte <= boundary.startByte && candidate.endByte >= boundary.endByte ) { const range = candidate.endLine - candidate.startLine; // Keep the smallest containing boundary (innermost parent) if (range < smallestRange) { smallestRange = range; parent = candidate; } } } if (parent) { boundary.parent = parent; } // Build hierarchical path boundary.path = this.buildBoundaryPath(boundary); boundary.level = boundary.path.length; } return hierarchicalBoundaries; } /** * Build hierarchical path for a boundary by walking up the parent chain */ private buildBoundaryPath(boundary: CodeBoundary): string[] { const path: string[] = []; let current: CodeBoundary | undefined = boundary; // Walk up the parent chain while (current) { if (current.name) { path.unshift(current.name); // Add to beginning to build path from root } current = current.parent; } return path; } /** * Find the innermost boundary that contains the given line range */ private findContainingBoundary( startLine: number, endLine: number, boundaries: CodeBoundary[], ): CodeBoundary | undefined { let innermost: CodeBoundary | undefined; let smallestRange = Infinity; for (const boundary of boundaries) { // Check if boundary contains the segment if (boundary.startLine <= startLine && boundary.endLine >= endLine) { const range = boundary.endLine - boundary.startLine; // Keep the smallest containing boundary (innermost) if (range < smallestRange) { smallestRange = range; innermost = boundary; } } } return innermost; } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arabold/docs-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server