CodeRAG

ast-chunking.ts•14 KiB

/**
 * AST-Based Code Chunking using Synth
 *
 * Splits code at semantic boundaries (functions, classes, etc.)
 * Supports multiple languages via language-config registry.
 */

import { chunkText } from './embeddings.js'
import {
  type EmbeddedLanguageConfig,
  getLanguageConfig,
  getLanguageFromPath,
  LANGUAGE_REGISTRY,
  type LanguageConfig,
} from './language-config.js'

// ============================================
// Synth AST Types
// ============================================

interface Position {
  line: number
  column: number
  offset: number
}

interface Span {
  start: Position
  end: Position
}

type NodeId = number

interface BaseNode {
  id: NodeId
  type: string
  span?: Span
  parent: NodeId | null
  children: NodeId[]
  data?: Record<string, unknown>
}

interface Tree {
  meta: {
    language: string
    source: string
    created: number
    modified: number
    data?: Record<string, unknown>
  }
  root: NodeId
  nodes: BaseNode[]
}

/**
 * Synth parser interface.
 * Note: WASM-based parsers (v0.3.x) require parseAsync().
 * Sync parse() throws for tree-sitter based languages (c, go, java, php, python, ruby, rust).
 */
type SynthParser = {
  parse: (source: string, options?: Record<string, unknown>) => Tree
  parseAsync: (source: string, options?: Record<string, unknown>) => Promise<Tree>
}

// ============================================
// Chunking Types
// ============================================

/**
 * AST-based chunking options
 */
export interface ASTChunkOptions {
  readonly maxChunkSize?: number
  readonly minChunkSize?: number
  readonly preserveContext?: boolean
  readonly nodeTypes?: string[]
  /** Enable recursive parsing of embedded code (e.g., code blocks in markdown) */
  readonly parseEmbedded?: boolean
}

/**
 * Chunk result with metadata
 */
export interface ChunkResult {
  readonly content: string
  readonly type: string
  readonly startLine: number
  readonly endLine: number
  readonly metadata: Record<string, unknown>
}

// ============================================
// Parser Cache
// ============================================

/** Parsers keyed by lower-cased language name; `null` records a failed lookup. */
const parserCache = new Map<string, SynthParser | null>()

/**
 * Load Synth parser for a language
 * Uses registry first, then tries auto-discovery
 *
 * @param language - Language name; matched case-insensitively.
 * @returns The parser, or `null` when neither the registry entry nor the
 *   `@sylphx/synth-{language}` package could be imported.
 */
async function loadSynthParser(language: string): Promise<SynthParser | null> {
  const langLower = language.toLowerCase()

  // Check cache (a cached null means an earlier lookup already failed)
  if (parserCache.has(langLower)) {
    return parserCache.get(langLower) ?? null
  }

  // Try registry first
  const config = getLanguageConfig(langLower)
  if (config) {
    try {
      const parserModule = (await import(config.parser)) as {
        parse: SynthParser['parse']
        parseAsync: SynthParser['parseAsync']
      }
      const parser: SynthParser = {
        parse: parserModule.parse,
        parseAsync: parserModule.parseAsync,
      }
      parserCache.set(langLower, parser)
      return parser
    } catch (error) {
      // Not fatal: fall through to auto-discovery below
      console.error(`[WARN] Failed to load parser ${config.parser} for ${language}:`, error)
    }
  }

  // Auto-discovery: try @sylphx/synth-{language}
  try {
    const parserModule = (await import(`@sylphx/synth-${langLower}`)) as {
      parse: SynthParser['parse']
      parseAsync: SynthParser['parseAsync']
    }
    const parser: SynthParser = {
      parse: parserModule.parse,
      parseAsync: parserModule.parseAsync,
    }
    parserCache.set(langLower, parser)
    console.error(`[INFO] Auto-discovered parser for ${language}`)
    return parser
  } catch {
    // Not available
  }

  // Cache null to avoid repeated lookups
  parserCache.set(langLower, null)
  return null
}

// ============================================
// AST Utilities
// ============================================

/**
 * Return the slice of the tree's source covered by a node's span,
 * or an empty string when the node has no span.
 */
function getSourceText(tree: Tree, node: BaseNode): string {
  if (!node.span) return ''
  return tree.meta.source.slice(node.span.start.offset, node.span.end.offset)
}

/**
 * Look up a node by id; ids are used directly as indexes into `tree.nodes`.
 */
function getNode(tree: Tree, nodeId: NodeId): BaseNode | undefined {
  return tree.nodes[nodeId]
}

/**
 * Check if node is a semantic boundary based on language config
 */
function isSemanticBoundary(node: BaseNode, config: LanguageConfig | undefined): boolean {
  if (!config) return false
  return config.boundaries.includes(node.type)
}

/**
 * Extract context nodes (imports, types) based on language config
 */
function extractContextNodes(tree: Tree, config: LanguageConfig | undefined): BaseNode[] {
  if (!config?.contextTypes) return []
  const contextTypes = config.contextTypes
  return tree.nodes.filter((node) => contextTypes.includes(node.type))
}

/**
 * Check if node contains embedded code that should be recursively parsed
 */
function _getEmbeddedConfig(
  node: BaseNode,
  config: LanguageConfig | undefined
): EmbeddedLanguageConfig | undefined {
  if (!config?.embedded) return undefined
  return config.embedded.find((e) => e.nodeType === node.type)
}

/**
 * Detect embedded language from node
 *
 * @returns The lower-cased language read from `node.data[langAttr]` when it is
 *   a non-empty string, otherwise the configured default language (which may
 *   be undefined).
 */
function detectEmbeddedLanguage(
  node: BaseNode,
  embeddedConfig: EmbeddedLanguageConfig
): string | undefined {
  // Try to get language from node data
  if (embeddedConfig.langAttr && node.data) {
    const lang = node.data[embeddedConfig.langAttr]
    if (typeof lang === 'string' && lang.length > 0) {
      return lang.toLowerCase()
    }
  }

  // Use default language if specified
  return embeddedConfig.defaultLanguage
}

// ============================================
// Chunk Extraction
// ============================================

/**
 * Merge small non-semantic chunks
 *
 * Only chunks flagged with `metadata.split` are merge candidates, and two
 * neighbours are joined only while both are shorter than `minChunkSize`.
 * Semantic chunks pass through untouched, whatever their size.
 */
function mergeSmallChunks(chunks: ChunkResult[], minChunkSize: number): ChunkResult[] {
  if (chunks.length === 0) return []

  const merged: ChunkResult[] = []
  // Pending small split chunk that may still absorb the next one
  let buffer: ChunkResult | null = null

  for (const chunk of chunks) {
    // Don't merge semantic chunks (those without 'split' metadata)
    const isSemanticChunk = !chunk.metadata.split

    if (!buffer) {
      if (isSemanticChunk || chunk.content.length >= minChunkSize) {
        merged.push(chunk)
      } else {
        buffer = chunk
      }
      continue
    }

    const isBufferSemantic = !buffer.metadata.split
    if (
      !isSemanticChunk &&
      !isBufferSemantic &&
      buffer.content.length < minChunkSize &&
      chunk.content.length < minChunkSize
    ) {
      buffer = {
        content: `${buffer.content}\n\n${chunk.content}`,
        type: `${buffer.type}+${chunk.type}`,
        startLine: buffer.startLine,
        endLine: chunk.endLine,
        metadata: { ...buffer.metadata, merged: true },
      }
    } else {
      merged.push(buffer)
      if (isSemanticChunk || chunk.content.length >= minChunkSize) {
        merged.push(chunk)
        buffer = null
      } else {
        buffer = chunk
      }
    }
  }

  if (buffer) {
    merged.push(buffer)
  }

  return merged
}

/**
 * Extract sub-chunks from large nodes
 *
 * Nodes with children are split per child (recursing into children that are
 * still too large); leaf nodes are split by characters via `chunkText`.
 * Reported line numbers are the span's line + 1 in both branches.
 */
function extractSubChunks(
  tree: Tree,
  node: BaseNode,
  options: { maxChunkSize: number }
): ChunkResult[] {
  const chunks: ChunkResult[] = []

  if (node.children.length > 0) {
    for (const childId of node.children) {
      const child = getNode(tree, childId)
      if (!child || !child.span) continue

      const content = getSourceText(tree, child)
      if (content.length > options.maxChunkSize) {
        chunks.push(...extractSubChunks(tree, child, options))
      } else {
        chunks.push({
          content,
          type: child.type,
          startLine: child.span.start.line + 1,
          endLine: child.span.end.line + 1,
          metadata: { ...child.data },
        })
      }
    }
  } else {
    // No children, split by characters
    const content = getSourceText(tree, node)
    const charChunks = chunkText(content, { maxChunkSize: options.maxChunkSize })

    // Parenthesised on purpose: `??` binds looser than `+`, so the unparenthesised
    // `line ?? 0 + 1` would return the raw span line without the +1 adjustment.
    const startLine = (node.span?.start.line ?? 0) + 1
    const endLine = (node.span?.end.line ?? 0) + 1

    charChunks.forEach((chunk, i) => {
      chunks.push({
        content: chunk,
        type: `${node.type}[${i}]`,
        startLine,
        endLine,
        metadata: { split: true, index: i },
      })
    })
  }

  return chunks
}

/**
 * Extract semantic chunks from AST
 *
 * Walks the top-level nodes and emits one chunk per boundary node. Boundaries
 * come from `options.nodeTypes` when given, otherwise from the language
 * config. Chunks that exceed `maxChunkSize` (context prefix included) are
 * broken up with `extractSubChunks`, which does not carry the context prefix.
 */
function extractSemanticChunks(
  tree: Tree,
  config: LanguageConfig | undefined,
  options: {
    maxChunkSize: number
    minChunkSize: number
    preserveContext: boolean
    nodeTypes?: string[]
  }
): ChunkResult[] {
  const chunks: ChunkResult[] = []

  // Extract context prefix
  let contextPrefix = ''
  if (options.preserveContext && config) {
    const contextNodes = extractContextNodes(tree, config)
    contextPrefix = contextNodes.map((node) => getSourceText(tree, node)).join('\n')
    if (contextPrefix) contextPrefix += '\n\n'
  }

  // Get root node
  const root = tree.nodes[0]
  if (!root) return []

  // For JS/TS, navigate through Program node
  let topLevelNodes = root.children
  const lang = tree.meta.language.toLowerCase()
  if (topLevelNodes.length === 1 && ['javascript', 'typescript', 'jsx', 'tsx'].includes(lang)) {
    const firstChild = tree.nodes[topLevelNodes[0]]
    if (firstChild?.type === 'Program' && firstChild.children.length > 0) {
      topLevelNodes = firstChild.children
    }
  }

  // Process top-level children
  for (const childId of topLevelNodes) {
    const node = tree.nodes[childId]
    if (!node || !node.span) continue

    // Check if this is a semantic boundary
    const isBoundary = options.nodeTypes
      ? options.nodeTypes.includes(node.type)
      : isSemanticBoundary(node, config)

    if (isBoundary) {
      const content = getSourceText(tree, node)
      const finalContent = options.preserveContext ? contextPrefix + content : content

      if (finalContent.length > options.maxChunkSize) {
        const subChunks = extractSubChunks(tree, node, options)
        chunks.push(...subChunks)
      } else {
        chunks.push({
          content: finalContent,
          type: node.type,
          startLine: node.span.start.line + 1,
          endLine: node.span.end.line + 1,
          metadata: { ...node.data },
        })
      }
    }
  }

  return chunks
}

/**
 * Parse code with Synth
 *
 * @returns The parsed tree, or `null` when no parser could be loaded or the
 *   parser threw (the failure is logged, not rethrown).
 */
async function parseWithSynth(
  code: string,
  language: string,
  config: LanguageConfig | undefined
): Promise<Tree | null> {
  const parser = await loadSynthParser(language)
  if (!parser) return null

  try {
    const options = config?.parserOptions ?? {}
    // WASM parsers require async parsing
    return await parser.parseAsync(code, options)
  } catch (error) {
    console.error(`[WARN] Synth parsing failed for ${language}:`, error)
    return null
  }
}

/**
 * Recursively parse embedded code blocks
 *
 * Chunks whose type matches a `recursive` embedded config are re-chunked in
 * their own language. Their sub-chunks replace the parent chunk, with line
 * numbers rebased onto the parent. Chunks that cannot be re-parsed are kept
 * unchanged.
 */
async function parseEmbeddedChunks(
  chunks: ChunkResult[],
  config: LanguageConfig | undefined,
  options: ASTChunkOptions
): Promise<ChunkResult[]> {
  if (!config?.embedded) return chunks

  const result: ChunkResult[] = []

  for (const chunk of chunks) {
    const embeddedConfig = config.embedded.find((e) => e.nodeType === chunk.type)

    if (embeddedConfig?.recursive) {
      // Detect the embedded language
      const embeddedLang = detectEmbeddedLanguage(
        { type: chunk.type, data: chunk.metadata } as BaseNode,
        embeddedConfig
      )

      if (embeddedLang && embeddedLang !== 'text' && embeddedLang !== 'plain') {
        // Get the actual code content (strip markdown fences if present)
        let codeContent = chunk.content
        // Lines between the parent chunk's first line and the embedded code's first line
        let lineOffset = 0
        const fenceMatch = codeContent.match(/^```\w*\n([\s\S]*?)\n```$/m)
        if (fenceMatch) {
          codeContent = fenceMatch[1]
          // The code begins on the line after the opening fence
          lineOffset = 1
        }

        // Recursively parse
        const subChunks = await chunkCodeByAST(codeContent, `file.${embeddedLang}`, {
          ...options,
          parseEmbedded: false, // Prevent infinite recursion
        })

        if (subChunks.length > 0 && !subChunks[0].metadata.fallback) {
          // Add parent context to sub-chunks
          for (const subChunk of subChunks) {
            result.push({
              ...subChunk,
              startLine: chunk.startLine + lineOffset + (subChunk.startLine - 1),
              endLine: chunk.startLine + lineOffset + (subChunk.endLine - 1),
              metadata: {
                ...subChunk.metadata,
                embeddedIn: chunk.type,
                embeddedLanguage: embeddedLang,
              },
            })
          }
          continue
        }
      }
    }

    // Keep original chunk if no embedded parsing
    result.push(chunk)
  }

  return result
}

// ============================================
// Main API
// ============================================

/**
 * Chunk code using AST analysis
 *
 * Falls back to character-based chunking when the language cannot be detected
 * from `filePath` or the source cannot be parsed, and to a single whole-file
 * chunk when parsing succeeds but yields no semantic boundaries.
 *
 * @param code - Source text to chunk.
 * @param filePath - Path used only to detect the language.
 * @param options - Chunking options; defaults are maxChunkSize 1000,
 *   minChunkSize 100, preserveContext true, parseEmbedded true.
 * @returns The extracted chunks.
 *
 * @example
 * ```typescript
 * const chunks = await chunkCodeByAST(
 *   code,
 *   'example.ts',
 *   { maxChunkSize: 1000, preserveContext: true }
 * );
 * ```
 */
export async function chunkCodeByAST(
  code: string,
  filePath: string,
  options: ASTChunkOptions = {}
): Promise<readonly ChunkResult[]> {
  const {
    maxChunkSize = 1000,
    minChunkSize = 100,
    preserveContext = true,
    nodeTypes,
    parseEmbedded = true,
  } = options

  // 1. Detect language from file path
  const language = getLanguageFromPath(filePath)
  if (!language) {
    console.error('[WARN] Unknown language, falling back to character chunking')
    return createFallbackChunks(code, maxChunkSize)
  }

  // 2. Get language config
  const config = getLanguageConfig(language)

  // 3. Parse AST
  const tree = await parseWithSynth(code, language, config)
  if (!tree) {
    console.error('[WARN] AST parsing failed, falling back to character chunking')
    return createFallbackChunks(code, maxChunkSize)
  }

  // 4. Extract semantic chunks
  let chunks = extractSemanticChunks(tree, config, {
    maxChunkSize,
    minChunkSize,
    preserveContext,
    nodeTypes,
  })

  // 5. Parse embedded code (e.g., code blocks in markdown)
  if (parseEmbedded && config?.embedded) {
    chunks = await parseEmbeddedChunks(chunks, config, options)
  }

  // 6. Merge small chunks
  const merged = mergeSmallChunks(chunks, minChunkSize)

  // 7. Fallback if no chunks extracted
  if (merged.length === 0 && code.trim().length > 0) {
    return [
      {
        content: code,
        type: 'unknown',
        startLine: 1,
        endLine: code.split('\n').length,
        metadata: { fallback: true, reason: 'no-semantic-boundaries' },
      },
    ]
  }

  return merged
}

/**
 * Create fallback chunks using character-based splitting
 *
 * Line numbers are reported as 0 because no line information is tracked for
 * these chunks.
 */
function createFallbackChunks(code: string, maxChunkSize: number): ChunkResult[] {
  const chunks = chunkText(code, { maxChunkSize })

  return chunks.map((content, i) => ({
    content,
    type: 'text',
    startLine: 0,
    endLine: 0,
    metadata: { fallback: true, index: i },
  }))
}

/**
 * Simple wrapper for backward compatibility
 *
 * @returns Only the `content` of each chunk produced by `chunkCodeByAST`.
 */
export async function chunkCodeByASTSimple(
  code: string,
  filePath: string,
  options: ASTChunkOptions = {}
): Promise<readonly string[]> {
  const chunks = await chunkCodeByAST(code, filePath, options)
  return chunks.map((chunk) => chunk.content)
}

/**
 * Get list of supported languages
 *
 * @returns The keys of `LANGUAGE_REGISTRY`.
 */
export function getSupportedLanguages(): string[] {
  return Object.keys(LANGUAGE_REGISTRY)
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/SylphxAI/coderag'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

ast-chunking.ts•14 KiB