Skip to main content
Glama
tree-sitter-chunker.ts (7.22 kB)
/**
 * TreeSitterChunker - AST-aware code chunking using tree-sitter
 * Primary chunking strategy for supported languages
 */

import Parser from "tree-sitter";
// tree-sitter language modules don't have proper types
import Bash from "tree-sitter-bash";
import Go from "tree-sitter-go";
import Java from "tree-sitter-java";
import JavaScript from "tree-sitter-javascript";
import Python from "tree-sitter-python";
import Rust from "tree-sitter-rust";
import TypeScript from "tree-sitter-typescript";

import type { ChunkerConfig, CodeChunk } from "../types.js";
import type { CodeChunker } from "./base.js";
import { CharacterChunker } from "./character-chunker.js";

/** Nodes whose source text is shorter than this many characters are not emitted as chunks. */
const MIN_CHUNK_LENGTH = 50;

/**
 * When AST chunking yields nothing, inputs longer than this many characters are
 * handed to the character chunker; shorter inputs produce an empty result.
 */
const MIN_FALLBACK_CODE_LENGTH = 100;

/** A configured parser paired with the AST node types that are emitted as chunks. */
interface LanguageConfig {
  parser: Parser;
  chunkableTypes: string[];
}

/**
 * Splits source code into chunks along AST node boundaries (functions, classes,
 * interfaces, ...) for the languages registered in {@link initializeParsers}:
 * typescript, javascript, python, go, rust, java and bash.
 *
 * Anything it cannot handle - an unregistered language, a parse failure, an
 * oversized node, or a file with no chunkable nodes - is delegated to a
 * {@link CharacterChunker} built from the same config.
 */
export class TreeSitterChunker implements CodeChunker {
  private readonly languages: Map<string, LanguageConfig> = new Map();
  private readonly fallbackChunker: CharacterChunker;

  /**
   * @param config - Chunking settings; `maxChunkSize` is read here, and the whole
   *   object is also passed to the fallback character chunker.
   */
  constructor(private readonly config: ChunkerConfig) {
    this.fallbackChunker = new CharacterChunker(config);
    this.initializeParsers();
  }

  /** Registers one parser per supported language together with its chunkable node types. */
  private initializeParsers(): void {
    // The TypeScript package exposes several grammars; only `.typescript` is used.
    this.registerLanguage("typescript", TypeScript.typescript, [
      "function_declaration",
      "method_definition",
      "class_declaration",
      "interface_declaration",
      "type_alias_declaration",
      "enum_declaration",
    ]);

    this.registerLanguage("javascript", JavaScript, [
      "function_declaration",
      "method_definition",
      "class_declaration",
      "export_statement",
    ]);

    this.registerLanguage("python", Python, [
      "function_definition",
      "class_definition",
      "decorated_definition",
    ]);

    this.registerLanguage("go", Go, [
      "function_declaration",
      "method_declaration",
      "type_declaration",
      "interface_declaration",
    ]);

    this.registerLanguage("rust", Rust, [
      "function_item",
      "impl_item",
      "trait_item",
      "struct_item",
      "enum_item",
    ]);

    this.registerLanguage("java", Java, [
      "method_declaration",
      "class_declaration",
      "interface_declaration",
      "enum_declaration",
    ]);

    this.registerLanguage("bash", Bash, ["function_definition", "command"]);
  }

  /**
   * Creates a parser for `grammar` and stores it under `name`.
   *
   * @param name - Key later matched against the `language` argument of {@link chunk}.
   * @param grammar - Grammar object exported by a `tree-sitter-*` package.
   * @param chunkableTypes - AST node types that should become chunks.
   */
  private registerLanguage(name: string, grammar: unknown, chunkableTypes: string[]): void {
    const parser = new Parser();
    // The grammar packages ship without usable typings, so a single cast is confined here.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    parser.setLanguage(grammar as any);
    this.languages.set(name, { parser, chunkableTypes });
  }

  /**
   * Chunks `code` along AST boundaries.
   *
   * @param code - Full source text to split.
   * @param filePath - Recorded in chunk metadata and used in the error log message.
   * @param language - Language key; unregistered keys go straight to the fallback chunker.
   * @returns Chunks in source order. `metadata.chunkIndex` is the chunk's position in
   *   the returned array for every chunk produced by this method.
   */
  async chunk(code: string, filePath: string, language: string): Promise<CodeChunk[]> {
    const langConfig = this.languages.get(language);
    if (!langConfig) {
      // Fallback to character-based chunking
      return this.fallbackChunker.chunk(code, filePath, language);
    }

    try {
      const tree = langConfig.parser.parse(code);
      const chunks: CodeChunk[] = [];

      // Find all chunkable nodes
      const nodes = this.findChunkableNodes(tree.rootNode, langConfig.chunkableTypes);

      for (const node of nodes) {
        const content = code.substring(node.startIndex, node.endIndex);

        // Skip chunks that are too small
        if (content.length < MIN_CHUNK_LENGTH) {
          continue;
        }

        // If chunk is too large, fall back to character chunking for this node
        if (content.length > this.config.maxChunkSize * 2) {
          const subChunks = await this.fallbackChunker.chunk(content, filePath, language);

          // Sub-chunk line numbers are relative to the node's own text (treated as
          // 1-based, as the original arithmetic did), so shifting by the node's
          // 0-based starting row yields 1-based file line numbers.
          const rowOffset = node.startPosition.row;
          for (const subChunk of subChunks) {
            chunks.push({
              ...subChunk,
              startLine: rowOffset + subChunk.startLine,
              endLine: rowOffset + subChunk.endLine,
              metadata: {
                ...subChunk.metadata,
                chunkIndex: chunks.length,
              },
            });
          }
          continue;
        }

        chunks.push({
          content: content.trim(),
          startLine: node.startPosition.row + 1,
          endLine: node.endPosition.row + 1,
          metadata: {
            filePath,
            language,
            // Use the output position rather than the node's position in `nodes`:
            // skipped and split nodes would otherwise cause gaps and duplicate
            // indices relative to the sub-chunks above.
            chunkIndex: chunks.length,
            chunkType: this.getChunkType(node.type),
            name: this.extractName(node, code),
          },
        });
      }

      // If the AST pass produced nothing and the input is not trivially small, use fallback
      if (chunks.length === 0 && code.length > MIN_FALLBACK_CODE_LENGTH) {
        return this.fallbackChunker.chunk(code, filePath, language);
      }

      return chunks;
    } catch (error) {
      // On parsing error, fallback to character-based chunking
      console.error(`Tree-sitter parsing failed for ${filePath}:`, error);
      return this.fallbackChunker.chunk(code, filePath, language);
    }
  }

  /** @returns `true` when a parser is registered for `language`. */
  supportsLanguage(language: string): boolean {
    return this.languages.has(language);
  }

  /** @returns The fixed strategy identifier `"tree-sitter"`. */
  getStrategyName(): string {
    return "tree-sitter";
  }

  /**
   * Find all chunkable nodes in the AST
   *
   * Walks the tree depth-first in child order and collects the outermost nodes
   * whose type is listed in `chunkableTypes`.
   *
   * @param node - Root of the subtree to search.
   * @param chunkableTypes - Node types to collect.
   * @returns Matching nodes in traversal order.
   */
  private findChunkableNodes(
    node: Parser.SyntaxNode,
    chunkableTypes: string[]
  ): Parser.SyntaxNode[] {
    const nodes: Parser.SyntaxNode[] = [];

    const traverse = (n: Parser.SyntaxNode): void => {
      if (chunkableTypes.includes(n.type)) {
        nodes.push(n);
        // Don't traverse children of chunkable nodes to avoid nested chunks
        return;
      }

      for (const child of n.children) {
        traverse(child);
      }
    };

    traverse(node);
    return nodes;
  }

  /**
   * Extract function/class name from AST node
   *
   * @param node - Node to name.
   * @param code - Source text the node's indices refer to.
   * @returns Text of the node's `name` field if present, otherwise the text of its
   *   first direct child of type `identifier` or `type_identifier`, otherwise `undefined`.
   */
  private extractName(node: Parser.SyntaxNode, code: string): string | undefined {
    // Try to find name node
    const nameNode = node.childForFieldName("name");
    if (nameNode) {
      return code.substring(nameNode.startIndex, nameNode.endIndex);
    }

    // For some node types, name might be in a different location
    for (const child of node.children) {
      if (child.type === "identifier" || child.type === "type_identifier") {
        return code.substring(child.startIndex, child.endIndex);
      }
    }

    return undefined;
  }

  /**
   * Map AST node type to chunk type
   *
   * Matching is by substring and the first match wins, in the order
   * function/method, class/struct, interface/trait; anything else is `"block"`.
   */
  private getChunkType(nodeType: string): "function" | "class" | "interface" | "block" {
    if (nodeType.includes("function") || nodeType.includes("method")) {
      return "function";
    }
    if (nodeType.includes("class") || nodeType.includes("struct")) {
      return "class";
    }
    if (nodeType.includes("interface") || nodeType.includes("trait")) {
      return "interface";
    }
    return "block";
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mhalder/qdrant-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.