Tea Rags MCP

tree-sitter-chunker.ts•22.8 KiB

/**
 * TreeSitterChunker - AST-aware code chunking using tree-sitter
 * Primary chunking strategy for supported languages
 *
 * OPTIMIZATION: Lazy-loads parsers on first use to reduce startup time.
 * Before: All 9 parsers loaded at construction (~3-5 seconds)
 * After: Parsers loaded on demand (~0ms startup, ~100-200ms first use per language)
 */
import type { Content } from "mdast";
import { remark } from "remark";
import remarkGfm from "remark-gfm";
import Parser from "tree-sitter";

import type { ChunkerConfig, CodeChunk } from "../types.js";
import type { CodeChunker } from "./base.js";
import { CharacterChunker } from "./character-chunker.js";
import { LANGUAGE_DEFINITIONS, type LanguageConfig, type LanguageDefinition } from "./chunker-config.js";
import { createHookContext } from "./hooks/types.js";

/** AST nodes, child nodes, markdown sections and container bodies shorter than this are not emitted. */
const MIN_CHUNK_CHARS = 50;

/** Fenced markdown code blocks shorter than this are not emitted as their own chunk. */
const MIN_MARKDOWN_CODE_BLOCK_CHARS = 30;

/** A piece of text is "too large" once it exceeds `maxChunkSize` times this factor. */
const OVERSIZE_FACTOR = 2;

/** Files longer than this that produced no AST chunks are handed to the character fallback. */
const MIN_FALLBACK_FILE_CHARS = 100;

export class TreeSitterChunker implements CodeChunker {
  /** Cache of initialized parsers (lazy-loaded) */
  private readonly parserCache: Map<string, LanguageConfig> = new Map();

  /** Character-based chunker used whenever AST chunking is unavailable or a piece is oversized. */
  private readonly fallbackChunker: CharacterChunker;

  /** Track loading promises to avoid duplicate loads */
  private readonly loadingPromises: Map<string, Promise<LanguageConfig | null>> = new Map();

  /**
   * Build symbolId from name and optional parentName
   * Format: "ParentName.childName" or just "name"
   *
   * @returns the symbol id, or `undefined` when `name` is missing/empty
   */
  private buildSymbolId(name?: string, parentName?: string): string | undefined {
    if (!name) {
      return undefined;
    }
    return parentName ? `${parentName}.${name}` : name;
  }

  constructor(private readonly config: ChunkerConfig) {
    this.fallbackChunker = new CharacterChunker(config);
    // NO parser initialization here - lazy load on demand!
  }

  /**
   * Get or lazily initialize parser for a language.
   * Returns null if language is not supported (or its parser failed to load).
   *
   * Concurrent callers for the same language share one in-flight load.
   * Failed loads are not cached, so a later call will try again.
   */
  private async getLanguageConfig(language: string): Promise<LanguageConfig | null> {
    // Check cache first
    const cached = this.parserCache.get(language);
    if (cached) {
      return cached;
    }

    // Check if already loading (avoid duplicate loads)
    const inFlight = this.loadingPromises.get(language);
    if (inFlight) {
      return inFlight;
    }

    // Check if language is defined
    const definition = LANGUAGE_DEFINITIONS[language];
    if (!definition) {
      return null;
    }

    // Start loading
    const loadPromise = this.initializeParser(language, definition);
    this.loadingPromises.set(language, loadPromise);

    try {
      const loaded = await loadPromise;
      if (loaded) {
        this.parserCache.set(language, loaded);
      }
      return loaded;
    } finally {
      // Always clear the in-flight marker, whether the load succeeded or not.
      this.loadingPromises.delete(language);
    }
  }

  /**
   * Initialize a parser for a specific language.
   *
   * Dynamically imports the grammar module, binds it to a fresh `Parser`, and
   * bundles the parser with the chunking settings from the language definition.
   *
   * @returns the ready-to-use config, or `null` if loading threw (the error is logged)
   */
  private async initializeParser(language: string, definition: LanguageDefinition): Promise<LanguageConfig | null> {
    try {
      const startTime = Date.now();

      // Dynamic import of language module
      const mod = (await definition.loadModule()) as Record<string, unknown>;
      const langModule = (
        definition.extractLanguage ? definition.extractLanguage(mod) : mod.default || mod
      ) as Parser.Language;

      // Create and configure parser
      const parser = new Parser();
      parser.setLanguage(langModule);

      if (process.env.DEBUG) {
        console.error(`[TreeSitter] Lazy-loaded ${language} parser in ${Date.now() - startTime}ms`);
      }

      return {
        parser,
        chunkableTypes: definition.chunkableTypes,
        childChunkTypes: definition.childChunkTypes,
        alwaysExtractChildren: definition.alwaysExtractChildren,
        isDocumentation: definition.isDocumentation,
        hooks: definition.hooks,
      };
    } catch (error) {
      console.error(`[TreeSitter] Failed to load parser for ${language}:`, error);
      return null;
    }
  }

  /**
   * Split `code` into chunks along AST boundaries.
   *
   * Flow:
   *  1. Languages flagged `skipTreeSitter` + `isDocumentation` go to the remark-based markdown chunker.
   *  2. If no parser is available, the character fallback handles the whole file.
   *  3. Otherwise every top-level chunkable node becomes a chunk; containers may be
   *     split into their children (plus an optional "body" chunk), and oversized
   *     pieces without usable children are split by the character fallback.
   *  4. Any error thrown while parsing/chunking falls back to character chunking.
   *
   * `chunkIndex` of every emitted chunk is its position in the returned array.
   */
  async chunk(code: string, filePath: string, language: string): Promise<CodeChunk[]> {
    // Check if this language should skip tree-sitter (e.g., markdown uses remark)
    const definition = LANGUAGE_DEFINITIONS[language];
    if (definition && (definition as LanguageDefinition & { skipTreeSitter?: boolean }).skipTreeSitter) {
      // Use specialized chunker for this language (e.g., remark for markdown)
      if (definition.isDocumentation) {
        return this.chunkMarkdownSimple(code, filePath, language);
      }
    }

    // Lazy-load parser for this language
    const langConfig = await this.getLanguageConfig(language);
    if (!langConfig) {
      // Fallback to character-based chunking
      return this.fallbackChunker.chunk(code, filePath, language);
    }

    const oversizeThreshold = this.config.maxChunkSize * OVERSIZE_FACTOR;

    try {
      const tree = langConfig.parser.parse(code);
      const chunks: CodeChunk[] = [];

      // Find all chunkable nodes
      const nodes = this.findChunkableNodes(tree.rootNode, langConfig.chunkableTypes);

      for (const node of nodes) {
        const content = code.substring(node.startIndex, node.endIndex);

        // Skip chunks that are too small
        if (content.length < MIN_CHUNK_CHARS) {
          continue;
        }

        // 1-based line on which this node starts (tree-sitter rows are 0-based).
        const nodeStartLine = node.startPosition.row + 1;

        // Determine if we should extract children from this node.
        // Two cases: (1) node is too large for a single chunk, or
        // (2) language always extracts children (e.g., Ruby methods from classes)
        const isTooLarge = content.length > oversizeThreshold;
        const hasChildTypes = langConfig.childChunkTypes && langConfig.childChunkTypes.length > 0;
        const shouldExtractChildren = hasChildTypes && (isTooLarge || langConfig.alwaysExtractChildren);

        if (shouldExtractChildren) {
          const parentName = this.extractName(node, code);
          const parentType = node.type;
          const childNodes = this.findChildChunkableNodes(node, langConfig.childChunkTypes ?? []);

          // Filter to children that meet minimum size
          const validChildren = childNodes.filter(
            (c) => code.substring(c.startIndex, c.endIndex).length >= MIN_CHUNK_CHARS,
          );

          if (validChildren.length > 0) {
            // Run hook chain
            const ctx = createHookContext(node, validChildren, code, {
              maxChunkSize: this.config.maxChunkSize,
            });
            for (const hook of langConfig.hooks ?? []) {
              hook.process(ctx);
            }

            // Extract each child (method) as individual chunk
            for (let ci = 0; ci < validChildren.length; ci++) {
              const childNode = validChildren[ci];
              const childContent = code.substring(childNode.startIndex, childNode.endIndex);
              const childStartLine = childNode.startPosition.row + 1;

              // If child is also too large, use character fallback
              if (childContent.length > oversizeThreshold) {
                const subChunks = await this.fallbackChunker.chunk(childContent, filePath, language);
                for (const subChunk of subChunks) {
                  chunks.push({
                    ...subChunk,
                    // Sub-chunk lines are treated as 1-based offsets into `childContent`
                    // (same convention as the oversized-node branch below), so both
                    // ends are shifted by the child's START line.
                    // FIX: endLine was previously offset from the child's END row,
                    // which pushed it past the end of the child.
                    startLine: childStartLine + subChunk.startLine - 1,
                    endLine: childStartLine + subChunk.endLine - 1,
                    metadata: {
                      ...subChunk.metadata,
                      chunkIndex: chunks.length,
                      parentName,
                      parentType,
                    },
                  });
                }
                continue;
              }

              let finalContent = childContent.trim();
              let startLine = childStartLine;

              // Apply hook-provided prefix (e.g., preceding comments)
              const prefix = ctx.methodPrefixes.get(ci);
              if (prefix) {
                finalContent = `${prefix}\n${finalContent}`;
              }

              // A hook may move the start line (e.g. to include the prefixed lines).
              const overrideStart = ctx.methodStartLines.get(ci);
              if (overrideStart !== undefined) {
                startLine = overrideStart;
              }

              const childName = this.extractName(childNode, code);
              chunks.push({
                content: finalContent,
                startLine,
                endLine: childNode.endPosition.row + 1,
                metadata: {
                  filePath,
                  language,
                  chunkIndex: chunks.length,
                  chunkType: this.getChunkType(childNode.type),
                  name: childName,
                  parentName,
                  parentType,
                  symbolId: this.buildSymbolId(childName, parentName),
                },
              });
            }

            // Extract class-level code (everything outside methods) as body chunk(s)
            if (langConfig.alwaysExtractChildren) {
              const hasHookChain = langConfig.hooks && langConfig.hooks.length > 0;

              if (hasHookChain) {
                // Hook chain ran — use hook-provided body chunks (may be empty)
                for (const result of ctx.bodyChunks) {
                  chunks.push({
                    content: result.content,
                    startLine: result.startLine,
                    endLine: result.endLine,
                    metadata: {
                      filePath,
                      language,
                      chunkIndex: chunks.length,
                      chunkType: "block",
                      name: parentName,
                      parentName,
                      parentType,
                      symbolId: this.buildSymbolId(parentName),
                      lineRanges: result.lineRanges,
                    },
                  });
                }
              } else {
                // No hooks — generic fallback: single body chunk
                const bodyContent = this.extractContainerBody(node, validChildren, code);
                if (bodyContent && bodyContent.trim().length >= MIN_CHUNK_CHARS) {
                  chunks.push({
                    content: bodyContent.trim(),
                    startLine: nodeStartLine,
                    endLine: node.endPosition.row + 1,
                    metadata: {
                      filePath,
                      language,
                      chunkIndex: chunks.length,
                      chunkType: "block",
                      name: parentName,
                      parentName,
                      parentType,
                      symbolId: this.buildSymbolId(parentName),
                    },
                  });
                }
              }
            }

            continue;
          }

          // No valid children found
          if (isTooLarge) {
            // Fall back to character chunking for oversized nodes
            const subChunks = await this.fallbackChunker.chunk(content, filePath, language);
            for (const subChunk of subChunks) {
              chunks.push({
                ...subChunk,
                startLine: nodeStartLine + subChunk.startLine - 1,
                endLine: nodeStartLine + subChunk.endLine - 1,
                metadata: {
                  ...subChunk.metadata,
                  chunkIndex: chunks.length,
                  parentName,
                  parentType,
                },
              });
            }
            continue;
          }

          // alwaysExtractChildren but no valid children — fall through to single chunk
        }

        const nodeName = this.extractName(node, code);
        chunks.push({
          content: content.trim(),
          startLine: nodeStartLine,
          endLine: node.endPosition.row + 1,
          metadata: {
            filePath,
            language,
            // FIX: was the node's position in `nodes`, which collides with the
            // `chunks.length`-based indices used everywhere else as soon as an
            // earlier node was skipped or expanded into several chunks.
            chunkIndex: chunks.length,
            chunkType: this.getChunkType(node.type),
            name: nodeName,
            symbolId: this.buildSymbolId(nodeName),
          },
        });
      }

      // If no chunks were found and the file is not tiny, use fallback
      if (chunks.length === 0 && code.length > MIN_FALLBACK_FILE_CHARS) {
        return this.fallbackChunker.chunk(code, filePath, language);
      }

      return chunks;
    } catch (error) {
      // On parsing error, fallback to character-based chunking
      console.error(`Tree-sitter parsing failed for ${filePath}:`, error);
      return this.fallbackChunker.chunk(code, filePath, language);
    }
  }

  /** Whether `language` has an entry in `LANGUAGE_DEFINITIONS`. */
  supportsLanguage(language: string): boolean {
    return language in LANGUAGE_DEFINITIONS;
  }

  /** Identifier of this chunking strategy. */
  getStrategyName(): string {
    return "tree-sitter";
  }

  /**
   * Get list of supported languages
   */
  getSupportedLanguages(): string[] {
    return Object.keys(LANGUAGE_DEFINITIONS);
  }

  /**
   * Preload specific language parsers (optional optimization)
   * Call this if you know which languages will be used
   */
  async preloadLanguages(languages: string[]): Promise<void> {
    await Promise.all(languages.map(async (lang) => this.getLanguageConfig(lang)));
  }

  /**
   * Get stats about loaded parsers
   *
   * @returns `loaded` — languages whose parser is currently cached;
   *          `available` — every language present in `LANGUAGE_DEFINITIONS`
   */
  getLoadedParsers(): { loaded: string[]; available: string[] } {
    return {
      loaded: Array.from(this.parserCache.keys()),
      available: Object.keys(LANGUAGE_DEFINITIONS),
    };
  }

  /**
   * Remark-based markdown chunker using unified/mdast AST parser.
   * Uses remark (CommonMark/GFM parser) instead of tree-sitter due to
   * compatibility issues with tree-sitter-markdown grammar (requires tree-sitter 0.26+).
   *
   * Creates chunks for:
   * 1. Sections (a top-level heading plus everything up to the next top-level heading of ANY depth)
   * 2. Fenced code blocks with language detection (for searching code examples)
   * 3. A "Preamble" chunk for content before the first heading
   * 4. The whole document, when nothing else produced a chunk
   */
  private async chunkMarkdownSimple(code: string, filePath: string, language: string): Promise<CodeChunk[]> {
    const chunks: CodeChunk[] = [];
    const lines = code.split("\n");
    const oversizeThreshold = this.config.maxChunkSize * OVERSIZE_FACTOR;

    // Parse markdown with remark (GFM for GitHub flavored markdown)
    const tree = remark().use(remarkGfm).parse(code);

    // Collect headings with positions (only direct children of the root are considered)
    interface HeadingInfo {
      depth: number;
      text: string;
      startLine: number;
      endLine: number;
    }
    const headings: HeadingInfo[] = [];
    for (const node of tree.children) {
      if (node.type === "heading" && node.position) {
        headings.push({
          depth: node.depth,
          // Extract text from heading children
          text: this.extractTextFromMdastNode(node),
          startLine: node.position.start.line,
          endLine: node.position.end.line,
        });
      }
    }

    // Collect code blocks (searched recursively, so blocks nested in lists/quotes are found too)
    interface CodeBlockInfo {
      lang: string | undefined;
      value: string;
      startLine: number;
      endLine: number;
    }
    const codeBlocks: CodeBlockInfo[] = [];
    const collectCodeBlocks = (node: Content) => {
      if (node.type === "code" && node.position) {
        codeBlocks.push({
          // `||` on purpose: null and "" both collapse to undefined
          lang: node.lang || undefined,
          value: node.value,
          startLine: node.position.start.line,
          endLine: node.position.end.line,
        });
      }
      if ("children" in node && Array.isArray(node.children)) {
        for (const child of node.children) {
          collectCodeBlocks(child as Content);
        }
      }
    };
    for (const child of tree.children) {
      collectCodeBlocks(child);
    }

    // Create section chunks
    for (let i = 0; i < headings.length; i++) {
      const heading = headings[i];

      // Find end of section (next heading of ANY level, or end of document)
      // This creates smaller, more focused chunks for semantic search
      const sectionEndLine = i + 1 < headings.length ? headings[i + 1].startLine - 1 : lines.length;

      // Extract section content from original code
      const sectionContent = lines
        .slice(heading.startLine - 1, sectionEndLine)
        .join("\n")
        .trim();

      // Skip very small sections
      if (sectionContent.length < MIN_CHUNK_CHARS) {
        continue;
      }

      // If section is too large, split it
      if (sectionContent.length > oversizeThreshold) {
        const subChunks = await this.fallbackChunker.chunk(sectionContent, filePath, language);
        for (const subChunk of subChunks) {
          chunks.push({
            ...subChunk,
            startLine: heading.startLine + subChunk.startLine - 1,
            endLine: heading.startLine + subChunk.endLine - 1,
            metadata: {
              ...subChunk.metadata,
              chunkIndex: chunks.length,
              name: heading.text,
              parentName: heading.text,
              parentType: `h${heading.depth}`,
              isDocumentation: true,
            },
          });
        }
        continue;
      }

      chunks.push({
        content: sectionContent,
        startLine: heading.startLine,
        endLine: sectionEndLine,
        metadata: {
          filePath,
          language,
          chunkIndex: chunks.length,
          chunkType: "block",
          name: heading.text,
          symbolId: heading.text,
          isDocumentation: true,
        },
      });
    }

    // Create code block chunks (for searching code examples in docs)
    for (const block of codeBlocks) {
      // Skip very small code blocks
      if (block.value.length < MIN_MARKDOWN_CODE_BLOCK_CHARS) {
        continue;
      }

      const codeBlockName = block.lang ? `Code: ${block.lang}` : "Code block";
      chunks.push({
        content: block.value,
        startLine: block.startLine + 1, // +1 to skip ``` line
        endLine: block.endLine - 1, // -1 to skip closing ```
        metadata: {
          filePath,
          // Use the code block's language, not "markdown"
          language: block.lang || "code",
          chunkIndex: chunks.length,
          chunkType: "block",
          name: codeBlockName,
          symbolId: codeBlockName,
          isDocumentation: true,
        },
      });
    }

    // Handle preamble (content before first heading)
    if (headings.length > 0 && headings[0].startLine > 1) {
      const preambleEndLine = headings[0].startLine - 1;
      const preamble = lines.slice(0, preambleEndLine).join("\n").trim();

      if (preamble.length >= MIN_CHUNK_CHARS) {
        chunks.unshift({
          content: preamble,
          startLine: 1,
          endLine: preambleEndLine,
          metadata: {
            filePath,
            language,
            chunkIndex: 0,
            chunkType: "block",
            name: "Preamble",
            symbolId: "Preamble",
            isDocumentation: true,
          },
        });

        // Re-index all chunks (everything after the preamble shifted by one)
        for (let i = 1; i < chunks.length; i++) {
          chunks[i].metadata.chunkIndex = i;
        }
      }
    }

    // If nothing produced a chunk, treat whole document as one chunk
    if (chunks.length === 0 && code.length >= MIN_CHUNK_CHARS) {
      chunks.push({
        content: code.trim(),
        startLine: 1,
        endLine: lines.length,
        metadata: {
          filePath,
          language,
          chunkIndex: 0,
          chunkType: "block",
          isDocumentation: true,
        },
      });
    }

    return chunks;
  }

  /**
   * Extract text content from mdast node (handles nested inlines like emphasis, links, etc.)
   *
   * Concatenates the `value` of every `text` descendant; any other leaf contributes "".
   */
  private extractTextFromMdastNode(node: Content): string {
    if (node.type === "text") {
      return (node as { type: "text"; value: string }).value;
    }
    if ("children" in node && Array.isArray(node.children)) {
      return node.children.map((child: Content) => this.extractTextFromMdastNode(child)).join("");
    }
    return "";
  }

  /**
   * Find all chunkable nodes in the AST
   *
   * Depth-first, in document order. Traversal stops at the first chunkable node
   * on each path, so the result never contains nested chunkable nodes.
   */
  private findChunkableNodes(node: Parser.SyntaxNode, chunkableTypes: string[]): Parser.SyntaxNode[] {
    const found: Parser.SyntaxNode[] = [];

    const visit = (current: Parser.SyntaxNode): void => {
      if (chunkableTypes.includes(current.type)) {
        found.push(current);
        // Don't traverse children of chunkable nodes to avoid nested chunks
        return;
      }
      for (const child of current.children) {
        visit(child);
      }
    };

    visit(node);
    return found;
  }

  /**
   * Find chunkable child nodes inside a parent node (e.g., methods inside a class).
   * Unlike findChunkableNodes, this DOES traverse into the parent's children
   * even if the parent is a chunkable type.
   */
  private findChildChunkableNodes(parentNode: Parser.SyntaxNode, childChunkTypes: string[]): Parser.SyntaxNode[] {
    const found: Parser.SyntaxNode[] = [];

    const visit = (current: Parser.SyntaxNode): void => {
      // The parent itself is never collected; matching descendants are collected
      // and not descended into.
      if (current !== parentNode && childChunkTypes.includes(current.type)) {
        found.push(current);
        return;
      }
      for (const child of current.children) {
        visit(child);
      }
    };

    visit(parentNode);
    return found;
  }

  /**
   * Extract the "body" of a container node (class/module), excluding child chunks (methods).
   * Collects class-level code: includes, associations, scopes, validations, constants, etc.
   * Returns the collected lines as a string, or undefined if nothing remains.
   *
   * Works on whole lines: any line touched by a child node is dropped entirely.
   */
  private extractContainerBody(
    containerNode: Parser.SyntaxNode,
    childNodes: Parser.SyntaxNode[],
    code: string,
  ): string | undefined {
    const lines = code.split("\n");

    // Build a set of (0-based) rows occupied by child nodes (methods)
    const methodRows = new Set<number>();
    for (const child of childNodes) {
      for (let row = child.startPosition.row; row <= child.endPosition.row; row++) {
        methodRows.add(row);
      }
    }

    // Collect lines from the container that are NOT inside any method
    const bodyLines: string[] = [];
    for (let row = containerNode.startPosition.row; row <= containerNode.endPosition.row; row++) {
      if (!methodRows.has(row)) {
        bodyLines.push(lines[row]);
      }
    }

    const body = bodyLines.join("\n").trim();
    return body.length > 0 ? body : undefined;
  }

  /**
   * Extract function/class name from AST node
   *
   * Prefers the grammar's `name` field; otherwise uses the first direct child of
   * type `identifier` or `type_identifier`. Returns `undefined` if neither exists.
   */
  private extractName(node: Parser.SyntaxNode, code: string): string | undefined {
    // Try to find name node
    const nameNode = node.childForFieldName("name");
    if (nameNode) {
      return code.substring(nameNode.startIndex, nameNode.endIndex);
    }

    // For some node types, name might be in a different location
    for (const child of node.children) {
      if (child.type === "identifier" || child.type === "type_identifier") {
        return code.substring(child.startIndex, child.endIndex);
      }
    }

    return undefined;
  }

  /**
   * Map AST node type to chunk type
   *
   * Substring match on the node type, checked in this order:
   * function/method → "function"; class/struct/module → "class";
   * interface/trait → "interface"; anything else → "block".
   */
  private getChunkType(nodeType: string): "function" | "class" | "interface" | "block" {
    if (nodeType.includes("function") || nodeType.includes("method")) {
      return "function";
    }
    if (nodeType.includes("class") || nodeType.includes("struct") || nodeType.includes("module")) {
      return "class";
    }
    if (nodeType.includes("interface") || nodeType.includes("trait")) {
      return "interface";
    }
    return "block";
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/artk0de/TeaRAGs-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

tree-sitter-chunker.ts•22.8 KiB