Skip to main content
Glama
treeSitterProcessor.ts (19.1 kB)
/**
 * @fileOverview: Tree-sitter based AST parsing and code analysis with fallback parsing support
 * @module: TreeSitterProcessor
 * @keyFunctions:
 *   - parseAndChunk(): Parse code and create semantic chunks with symbol extraction
 *   - extractSymbols(): Extract code symbols and cross-references
 *   - initializeParsers(): Set up language-specific parsers with error handling
 *   - fallbackChunking(): Basic line-based chunking when tree-sitter is unavailable
 * @dependencies:
 *   - tree-sitter: Core parsing engine (optional)
 *   - tree-sitter-typescript: TypeScript/TSX parsing
 *   - tree-sitter-javascript: JavaScript parsing
 *   - tree-sitter-python: Python parsing
 * @context: Provides robust AST parsing with graceful fallback to basic parsing when tree-sitter
 *   dependencies are unavailable, supporting multiple programming languages
 */

import { logger } from '../utils/logger';

// Optional tree-sitter import to avoid hard native dependency at runtime.
// These stay `any` because the packages (and their typings) may be absent.
let Parser: any = null;
let TypeScript: any = null;
let JavaScript: any = null;
let Python: any = null;

/** Number of lines per chunk produced by the line-based fallback chunker. */
const FALLBACK_CHUNK_LINES = 50;

/** Content longer than this (in UTF-16 code units) is never handed to tree-sitter. */
const MAX_PARSE_LENGTH = 1024 * 1024;

/** Brace-count difference above which content is treated as invalid syntax. */
const MAX_BRACE_IMBALANCE = 100;

/** Control characters (everything below 0x20 except \t, \n, \r, plus DEL). */
const CONTROL_CHARS = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/;

/** Node types accepted as the `name` field of a declaration. */
const NAME_NODE_TYPES = new Set([
  'identifier',
  'property_identifier',
  'type_identifier',
  'private_property_identifier',
]);

/** Convert any thrown value into a loggable message. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Call `node.descendantsOfType(types)` when the binding provides it.
 *
 * @returns the matching descendants, or an empty array when the method is missing.
 */
function descendantsOf(node: any, types: string | string[]): any[] {
  if (!node || typeof node.descendantsOfType !== 'function') return [];
  const found = node.descendantsOfType(types);
  return Array.isArray(found) ? found : [];
}

/**
 * Call `node.childForFieldName(field)` when the binding provides it.
 *
 * @returns the field's node, or `null` when the method or the field is missing.
 */
function fieldOf(node: any, field: string): any {
  if (!node || typeof node.childForFieldName !== 'function') return null;
  return node.childForFieldName(field) ?? null;
}

/**
 * Dynamically import the tree-sitter core and the language grammars.
 *
 * Each package is loaded independently, so one missing grammar does not prevent
 * the others from loading. Failures are collected and reported in a single
 * warning; this function never throws.
 */
async function initializeTreeSitterParsers(): Promise<void> {
  const failures: string[] = [];

  const attempt = async (label: string, load: () => Promise<void>): Promise<void> => {
    try {
      await load();
    } catch (error) {
      failures.push(`${label}: ${describeError(error)}`);
    }
  };

  await attempt('tree-sitter', async () => {
    if (!Parser) {
      const core: any = await import('tree-sitter');
      Parser = core.default || core;
    }
  });

  // Grammars are useless without the core parser, so skip them if it failed.
  if (Parser) {
    await attempt('tree-sitter-typescript', async () => {
      if (!TypeScript) {
        const tsModule: any = await import('tree-sitter-typescript');
        // Tolerate both ESM-default and plain CommonJS module shapes.
        TypeScript = (tsModule.default ?? tsModule).typescript ?? null;
      }
    });
    await attempt('tree-sitter-javascript', async () => {
      if (!JavaScript) {
        const jsModule: any = await import('tree-sitter-javascript');
        JavaScript = jsModule.default ?? jsModule;
      }
    });
    await attempt('tree-sitter-python', async () => {
      if (!Python) {
        const pyModule: any = await import('tree-sitter-python');
        Python = pyModule.default ?? pyModule;
      }
    });
  }

  if (failures.length === 0) {
    logger.info('✅ Tree-sitter parsers initialized successfully');
  } else {
    logger.warn('⚠️ Some tree-sitter parsers not available:', {
      error: failures.join('; '),
    });
  }
}

/** A contiguous slice of source text, optionally tied to a symbol. */
export interface CodeChunk {
  content: string;
  /** 1-based, inclusive. */
  startLine: number;
  /** 1-based, inclusive. */
  endLine: number;
  /** Rough token count (content length / 4, rounded up). */
  tokenEstimate: number;
  symbolId?: string;
  symbolName?: string;
  symbolType?: string;
}

/** A named declaration found in the syntax tree. */
export interface CodeSymbol {
  name: string;
  /** The tree-sitter node type of the declaration. */
  kind: string;
  /** 1-based, inclusive. */
  startLine: number;
  /** 1-based, inclusive. */
  endLine: number;
  lang: string;
  /** Full source lines covered by the declaration. */
  source: string;
}

/** An imported or exported name. */
export interface CodeXRef {
  name: string;
  kind: 'import' | 'export';
  /** 1-based, inclusive. */
  startLine: number;
  /** 1-based, inclusive. */
  endLine: number;
  /** Module specifier / dotted module path, when one is present. */
  targetPath?: string;
}

/** Result shape shared by every `parseAndChunk` outcome. */
type ParseResult = {
  chunks: CodeChunk[];
  symbols: CodeSymbol[];
  xrefs: CodeXRef[];
};

/**
 * Parses source files with tree-sitter and derives chunks, symbols and
 * import/export cross-references, falling back to fixed-size line chunks
 * whenever a parser is unavailable or the content cannot be parsed.
 */
export class TreeSitterProcessor {
  /** Configured parser instances keyed by language name. */
  private parsers: Map<string, any>;

  constructor() {
    this.parsers = new Map();
  }

  /** Load the optional tree-sitter packages and configure one parser per language. */
  async initialize(): Promise<void> {
    await initializeTreeSitterParsers();
    await this.initializeParsers();
  }

  /**
   * Create a parser per loaded grammar. A grammar that fails `setLanguage`
   * is logged and skipped; the remaining grammars are still registered.
   */
  private async initializeParsers(): Promise<void> {
    if (!Parser) {
      logger.warn('Tree-sitter parser not available, will use fallback parsing');
      return;
    }

    const grammars: Array<{ key: string; label: string; grammar: any }> = [
      { key: 'typescript', label: 'TypeScript', grammar: TypeScript },
      { key: 'javascript', label: 'JavaScript', grammar: JavaScript },
      { key: 'python', label: 'Python', grammar: Python },
    ];

    try {
      for (const { key, label, grammar } of grammars) {
        if (!grammar) continue;

        const parser = new Parser();
        try {
          parser.setLanguage(grammar);
          this.parsers.set(key, parser);
        } catch (error) {
          logger.warn(`Failed to initialize ${label} parser:`, {
            error: describeError(error),
          });
        }
      }
    } catch (error) {
      logger.warn('Failed to initialize some tree-sitter parsers:', {
        error: describeError(error),
      });
    }
  }

  /**
   * Parse `content` and return semantic chunks, symbols and cross-references.
   *
   * Never rejects for bad input: invalid, empty, oversized, binary-looking or
   * unparseable content yields line-based fallback chunks with empty `symbols`
   * and `xrefs`.
   *
   * @param content  Source text to analyse.
   * @param language Parser key: 'typescript', 'javascript' or 'python'.
   * @param filePath Used for log context only.
   */
  async parseAndChunk(
    content: string,
    language: string,
    filePath: string
  ): Promise<ParseResult> {
    // Validate input parameters
    if (!content || typeof content !== 'string') {
      logger.warn('Invalid content for parsing', { filePath, contentType: typeof content });
      // A truthy non-string must not reach fallbackChunking, which calls .trim().
      return this.fallbackResult(typeof content === 'string' ? content : '', filePath);
    }

    if (!language || typeof language !== 'string') {
      logger.warn('Invalid language for parsing', { filePath, languageType: typeof language });
      return this.fallbackResult(content, filePath);
    }

    const parser = this.parsers.get(language);
    if (!parser) {
      logger.warn('No parser available for language', { language, filePath });
      return this.fallbackResult(content, filePath);
    }

    try {
      // Check for problematic content that can cause Tree-sitter to fail
      const hasNullBytes = content.includes('\0');
      const hasInvalidChars = CONTROL_CHARS.test(content);
      const isTooLarge = content.length > MAX_PARSE_LENGTH;

      if (hasNullBytes || hasInvalidChars || isTooLarge) {
        logger.warn('Problematic content detected, using fallback', {
          filePath,
          hasNullBytes,
          hasInvalidChars,
          isTooLarge,
          contentLength: content.length,
        });
        return this.fallbackResult(content, filePath);
      }

      // Log Python-specific parsing issues for debugging
      if (language === 'python') {
        const pyLines = content.split('\n');
        logger.debug('🐍 Parsing Python file', {
          filePath,
          contentLength: content.length,
          lineCount: pyLines.length,
          // Multiline match: true when any line starts with indentation.
          hasIndents: /^[ \t]+\S/m.test(content),
          firstLine: pyLines[0]?.substring(0, 100),
        });
      }

      // Additional content validation - check for incomplete/truncated content
      if (content.trim().length === 0) {
        logger.warn('Content is only whitespace', { filePath });
        return this.fallbackResult(content, filePath);
      }

      // Check for obviously invalid syntax patterns that can crash Tree-sitter
      const openBraces = (content.match(/\{/g) || []).length;
      const closeBraces = (content.match(/\}/g) || []).length;
      const difference = Math.abs(openBraces - closeBraces);
      if (difference > MAX_BRACE_IMBALANCE) {
        logger.warn('Highly unbalanced braces, likely invalid syntax', {
          filePath,
          openBraces,
          closeBraces,
          difference,
        });
        return this.fallbackResult(content, filePath);
      }

      let tree;
      try {
        tree = parser.parse(content);
      } catch (parseError) {
        const errorMessage = describeError(parseError);
        if (
          errorMessage.includes('Invalid argument') ||
          errorMessage.includes('parse') ||
          errorMessage.includes('invalid')
        ) {
          logger.warn('Tree-sitter parse error, using fallback', {
            filePath,
            errorMessage,
          });
          return this.fallbackResult(content, filePath);
        }
        // Anything else is reported at error level by the outer catch.
        throw parseError;
      }

      // Validate the parsed tree
      if (!tree || !tree.rootNode) {
        logger.warn('Invalid parse tree, using fallback', { filePath });
        return this.fallbackResult(content, filePath);
      }

      // Validate root node with cross-language support
      const rootType = tree.rootNode.type;
      const acceptedRootTypes = new Set(['program', 'module']); // JS/TS: program, Python: module
      if (!acceptedRootTypes.has(rootType)) {
        logger.warn('Unexpected root node type; continuing with cautious extraction', {
          filePath,
          rootNodeType: rootType,
          acceptedRootTypes: Array.from(acceptedRootTypes).join(', '),
        });
      }

      const chunks = this.extractChunks(tree, content, language);
      const symbols = this.extractSymbols(tree, content, language);
      const xrefs = this.extractXRefs(tree, language);

      return { chunks, symbols, xrefs };
    } catch (error) {
      logger.error('Tree-sitter processing failed, using fallback', {
        filePath,
        error: describeError(error),
      });
      return this.fallbackResult(content, filePath);
    }
  }

  /** Build the result returned whenever tree-sitter cannot be used. */
  private fallbackResult(content: string, filePath: string): ParseResult {
    return { chunks: this.fallbackChunking(content, filePath), symbols: [], xrefs: [] };
  }

  /**
   * Walk the tree and emit one chunk per chunkable node that is either longer
   * than 10 lines or longer than 100 characters. Falls back to line-based
   * chunking when no node qualifies.
   */
  private extractChunks(tree: any, content: string, language: string): CodeChunk[] {
    const chunks: CodeChunk[] = [];
    const lines = content.split('\n');

    const traverse = (node: any): void => {
      try {
        if (this.isChunkableNode(node, language)) {
          // Rows are 0-based; a missing position yields NaN, which fails the check below.
          const startLine = node.startPosition?.row + 1;
          const endLine = node.endPosition?.row + 1;

          if (startLine && endLine && startLine <= endLine) {
            const nodeContent = this.getNodeContent(node, lines);

            if (nodeContent && (endLine - startLine > 10 || nodeContent.length > 100)) {
              chunks.push({
                content: nodeContent,
                startLine,
                endLine,
                tokenEstimate: this.estimateTokens(nodeContent),
                symbolName: this.getSymbolName(node),
                symbolType: node.type,
              });
            }
          }
        }

        // Safely traverse children
        if (node.children && Array.isArray(node.children)) {
          for (const child of node.children) {
            if (child) {
              traverse(child);
            }
          }
        }
      } catch (error) {
        logger.warn('Error traversing node', {
          language,
          error: describeError(error),
        });
        // Continue with other nodes
      }
    };

    try {
      if (tree?.rootNode) {
        traverse(tree.rootNode);
      }
    } catch (error) {
      logger.warn('Error extracting chunks from tree', {
        error: describeError(error),
      });
    }

    if (chunks.length === 0) {
      return this.fallbackChunking(content, '');
    }

    return chunks;
  }

  /** Whether `node` is a declaration worth emitting as its own chunk. */
  private isChunkableNode(node: any, language: string): boolean {
    try {
      if (!node || !node.type) return false;

      if (language === 'typescript' || language === 'javascript') {
        return [
          'function_declaration',
          'method_definition',
          'arrow_function',
          'class_declaration',
          'interface_declaration',
          'type_alias_declaration',
          'export_statement',
        ].includes(node.type);
      }

      if (language === 'python') {
        return ['function_definition', 'class_definition', 'decorated_definition'].includes(
          node.type
        );
      }

      return false;
    } catch (error) {
      logger.warn('Error checking if node is chunkable', {
        language,
        error: describeError(error),
      });
      return false;
    }
  }

  /**
   * Return the full source lines spanned by `node` (whole lines, not columns).
   *
   * @returns '' when positions are missing or fall outside `lines`.
   */
  private getNodeContent(node: any, lines: string[]): string {
    try {
      if (!node || !node.startPosition || !node.endPosition || !Array.isArray(lines)) {
        return '';
      }

      const startLine = node.startPosition.row;
      const endLine = node.endPosition.row;

      if (
        typeof startLine !== 'number' ||
        typeof endLine !== 'number' ||
        startLine < 0 ||
        endLine < startLine ||
        endLine >= lines.length
      ) {
        return '';
      }

      return lines.slice(startLine, endLine + 1).join('\n');
    } catch (error) {
      logger.warn('Error getting node content', {
        error: describeError(error),
      });
      return '';
    }
  }

  /**
   * Best-effort name of the declaration represented by `node`.
   *
   * Resolution order:
   *  1. The grammar's `name` field. Wrapper nodes are unwrapped first
   *     (Python `decorated_definition` -> `definition`, JS/TS
   *     `export_statement` -> `declaration`), so a decorated function yields
   *     its own name rather than the decorator's, and TS classes, interfaces
   *     and type aliases (whose names are `type_identifier`) are found.
   *  2. The first direct `identifier` / `property_identifier` child.
   *  3. The first such identifier anywhere below the node.
   */
  private getSymbolName(node: any): string | undefined {
    try {
      if (!node) return undefined;

      let target = node;
      if (node.type === 'decorated_definition') {
        target = fieldOf(node, 'definition') ?? node;
      } else if (node.type === 'export_statement') {
        target = fieldOf(node, 'declaration') ?? node;
      }

      const named = fieldOf(target, 'name');
      if (named?.text && NAME_NODE_TYPES.has(named.type)) return named.text;

      if (!Array.isArray(node.children)) return undefined;

      const nameNode = node.children.find(
        (child: any) =>
          child && (child.type === 'identifier' || child.type === 'property_identifier')
      );
      if (nameNode?.text) return nameNode.text;

      // Last resort: first identifier anywhere in the subtree.
      const ids = descendantsOf(node, ['identifier', 'property_identifier']);
      if (ids[0]?.text) return ids[0].text;

      return undefined;
    } catch (error) {
      logger.warn('Error getting symbol name', {
        error: describeError(error),
      });
      return undefined;
    }
  }

  /**
   * Split `content` into consecutive chunks of at most 50 lines.
   * Blank content yields a single empty placeholder chunk.
   *
   * @param filePath Currently unused; kept for call-site symmetry.
   */
  private fallbackChunking(content: string, filePath: string): CodeChunk[] {
    // If content is completely empty or only whitespace, create minimal chunk
    if (content.trim().length === 0) {
      return [
        {
          content: '',
          startLine: 1,
          endLine: 1,
          tokenEstimate: 0,
          symbolName: 'empty_file',
          symbolType: 'fallback',
        },
      ];
    }

    const lines = content.split('\n');
    const chunks: CodeChunk[] = [];

    for (let i = 0; i < lines.length; i += FALLBACK_CHUNK_LINES) {
      const endIndex = Math.min(i + FALLBACK_CHUNK_LINES, lines.length);
      const chunkContent = lines.slice(i, endIndex).join('\n');

      chunks.push({
        content: chunkContent,
        startLine: i + 1,
        endLine: endIndex, // exclusive 0-based end == inclusive 1-based end
        tokenEstimate: this.estimateTokens(chunkContent),
      });
    }

    return chunks;
  }

  /** Rough token estimate: one token per four characters, rounded up. */
  private estimateTokens(content: string): number {
    return Math.ceil(content.length / 4);
  }

  /**
   * Collect every named symbol node in the tree.
   *
   * A node that throws while being inspected is logged and skipped, so the
   * symbols gathered from the rest of the tree are kept.
   */
  private extractSymbols(tree: any, content: string, language: string): CodeSymbol[] {
    const symbols: CodeSymbol[] = [];
    const lines = content.split('\n');

    const traverse = (node: any): void => {
      if (!node) return;

      try {
        if (this.isSymbolNode(node, language) && node.startPosition && node.endPosition) {
          const symbolName = this.getSymbolName(node);
          if (symbolName) {
            symbols.push({
              name: symbolName,
              kind: node.type,
              startLine: node.startPosition.row + 1,
              endLine: node.endPosition.row + 1,
              lang: language,
              source: this.getNodeContent(node, lines),
            });
          }
        }
      } catch (error) {
        logger.warn('Error extracting symbol from node', {
          language,
          error: describeError(error),
        });
      }

      if (Array.isArray(node.children)) {
        for (const child of node.children) {
          traverse(child);
        }
      }
    };

    if (tree?.rootNode) {
      traverse(tree.rootNode);
    }
    return symbols;
  }

  /** Whether `node` declares something that should be recorded as a symbol. */
  private isSymbolNode(node: any, language: string): boolean {
    if (!node) return false;

    if (language === 'typescript' || language === 'javascript') {
      return [
        'function_declaration',
        'function_expression',
        'arrow_function',
        'class_declaration',
        'interface_declaration',
        'type_alias_declaration',
        'variable_declarator',
        'method_definition',
        'property_definition',
      ].includes(node.type);
    }

    if (language === 'python') {
      return [
        'function_definition',
        'class_definition',
        'decorated_definition',
        'assignment',
      ].includes(node.type);
    }

    return false;
  }

  /**
   * Collect import/export cross-references.
   *
   * JS/TS: one xref per named import specifier and per export specifier, with
   * `targetPath` taken from the statement's `source` string (quotes stripped).
   * Python: one xref per `aliased_import` / `identifier` under an import
   * statement, with `targetPath` set to the first dotted module name.
   *
   * A node that throws while being inspected is logged and skipped.
   */
  private extractXRefs(tree: any, language: string): CodeXRef[] {
    const xrefs: CodeXRef[] = [];
    const isJsLike = language === 'typescript' || language === 'javascript';

    const collectJsLike = (node: any): void => {
      const isImport = node.type === 'import_statement';
      if (!isImport && node.type !== 'export_statement') return;

      const kind: CodeXRef['kind'] = isImport ? 'import' : 'export';

      // The module specifier is the `source` field, a `string` node in the JS/TS
      // grammars. The descendant search is only a fallback for imports: inside an
      // export it could pick up an unrelated literal (`export const x = 'y'`).
      const sourceNode =
        fieldOf(node, 'source') ??
        (isImport ? descendantsOf(node, ['string', 'string_literal'])[0] : undefined);
      const targetPath = sourceNode?.text ? sourceNode.text.slice(1, -1) : undefined;

      let specifiers: any[] = [];
      if (isImport) {
        const importClause = descendantsOf(node, 'import_clause')[0];
        const namedImports = descendantsOf(importClause, 'named_imports')[0];
        specifiers = descendantsOf(namedImports, 'import_specifier');
      } else if (Array.isArray(node.children)) {
        // `export { a, b }` / `export { a } from './m'`
        const exportClause = node.children.find(
          (child: any) => child && child.type === 'export_clause'
        );
        specifiers = descendantsOf(exportClause, 'export_specifier');
      }

      for (const specifier of specifiers) {
        if (!specifier?.text || !specifier.startPosition || !specifier.endPosition) continue;
        xrefs.push({
          name: specifier.text,
          kind,
          startLine: specifier.startPosition.row + 1,
          endLine: specifier.endPosition.row + 1,
          targetPath,
        });
      }
    };

    const collectPython = (node: any): void => {
      // Python import extraction: import_statement, import_from_statement
      if (node.type !== 'import_statement' && node.type !== 'import_from_statement') return;
      if (!node.startPosition || !node.endPosition) return;

      const dotted = descendantsOf(node, 'dotted_name');
      const modulePath: string | undefined = dotted[0]?.text ? dotted[0].text : undefined;

      // NOTE(review): this also emits the identifiers that make up the module
      // path and both halves of an alias; kept as-is for compatibility.
      for (const nameNode of descendantsOf(node, ['aliased_import', 'identifier'])) {
        if (!nameNode || !nameNode.text) continue;
        xrefs.push({
          name: nameNode.text,
          kind: 'import',
          startLine: node.startPosition.row + 1,
          endLine: node.endPosition.row + 1,
          targetPath: modulePath,
        });
      }
    };

    const traverse = (node: any): void => {
      if (!node) return;

      try {
        if (isJsLike) {
          collectJsLike(node);
        } else if (language === 'python') {
          collectPython(node);
        }
      } catch (error) {
        logger.warn('Error extracting cross-references from node', {
          language,
          error: describeError(error),
        });
      }

      if (node.children && Array.isArray(node.children)) {
        for (const child of node.children) {
          traverse(child);
        }
      }
    };

    if (tree?.rootNode) {
      traverse(tree.rootNode);
    }
    return xrefs;
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sbarron/AmbianceMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.