/**
* TreeSitterChunker - AST-aware code chunking using tree-sitter
* Primary chunking strategy for supported languages
*
* OPTIMIZATION: Lazy-loads parsers on first use to reduce startup time.
* Before: All 9 parsers loaded at construction (~3-5 seconds)
* After: Parsers loaded on demand (~0ms startup, ~100-200ms first use per language)
*/
import Parser from "tree-sitter";
import { remark } from "remark";
import remarkGfm from "remark-gfm";
import type { Root, Heading, Code, Content } from "mdast";
import type { ChunkerConfig, CodeChunk } from "../types.js";
import type { CodeChunker } from "./base.js";
import { CharacterChunker } from "./character-chunker.js";
/**
 * Static description of one supported language.
 *
 * Nothing here triggers a grammar load by itself: `loadModule` is only invoked
 * when a parser for the language is first requested.
 */
interface LanguageDefinition {
  /** Lazily loads the grammar module; the result is awaited by the caller. */
  loadModule: () => any;
  /**
   * Picks the grammar object out of the loaded module (some packages nest it).
   * When omitted, the caller uses `mod.default || mod`.
   */
  extractLanguage?: (mod: any) => any;
  /** AST node types that become chunks. */
  chunkableTypes: string[];
  /**
   * Node types searched for inside a chunkable node that is too large
   * (more than twice `maxChunkSize`), e.g. methods inside a class or module.
   */
  childChunkTypes?: string[];
  /**
   * Marks documentation languages (markdown, etc.).
   * Used for filtering search results by content type.
   */
  isDocumentation?: boolean;
  /**
   * When true the language is not parsed with tree-sitter. Combined with
   * `isDocumentation`, `chunk()` routes the file to the remark-based markdown
   * chunker instead.
   */
  skipTreeSitter?: boolean;
}
/**
 * Runtime counterpart of a language definition: a parser that already has its
 * grammar set, plus the chunking settings copied from the definition.
 */
interface LanguageConfig {
  /** Copied from the definition; true for documentation languages. */
  isDocumentation?: boolean;
  /** Node types looked for inside an oversized chunkable node. */
  childChunkTypes?: Array<string>;
  /** Node types that become chunks. */
  chunkableTypes: Array<string>;
  /** Parser instance with the language grammar already applied. */
  parser: Parser;
}
/**
 * Grammar packages that expose the language either as the default export or
 * as the module object itself share this extractor.
 */
const unwrapDefaultExport = (mod: any): any => mod.default || mod;

/**
 * Registry of supported languages, keyed by language id.
 * Grammar modules are NOT imported here; each `loadModule` runs on first use.
 */
const LANGUAGE_DEFINITIONS: Record<string, LanguageDefinition> = {
  typescript: {
    loadModule: () => import("tree-sitter-typescript"),
    // The package bundles several grammars; pick the `typescript` one.
    extractLanguage: (mod) => mod.default?.typescript || mod.typescript,
    chunkableTypes: [
      "function_declaration",
      "method_definition",
      "class_declaration",
      "interface_declaration",
      "type_alias_declaration",
      "enum_declaration",
    ],
  },

  javascript: {
    loadModule: () => import("tree-sitter-javascript"),
    extractLanguage: unwrapDefaultExport,
    chunkableTypes: [
      "function_declaration",
      "method_definition",
      "class_declaration",
      "export_statement",
    ],
  },

  python: {
    loadModule: () => import("tree-sitter-python"),
    extractLanguage: unwrapDefaultExport,
    chunkableTypes: ["function_definition", "class_definition", "decorated_definition"],
  },

  go: {
    loadModule: () => import("tree-sitter-go"),
    extractLanguage: unwrapDefaultExport,
    chunkableTypes: [
      "function_declaration",
      "method_declaration",
      "type_declaration",
      "interface_declaration",
    ],
  },

  rust: {
    loadModule: () => import("tree-sitter-rust"),
    extractLanguage: unwrapDefaultExport,
    chunkableTypes: ["function_item", "impl_item", "trait_item", "struct_item", "enum_item"],
  },

  java: {
    loadModule: () => import("tree-sitter-java"),
    extractLanguage: unwrapDefaultExport,
    chunkableTypes: [
      "method_declaration",
      "class_declaration",
      "interface_declaration",
      "enum_declaration",
    ],
  },

  bash: {
    loadModule: () => import("tree-sitter-bash"),
    extractLanguage: unwrapDefaultExport,
    chunkableTypes: ["function_definition", "command"],
  },

  ruby: {
    loadModule: () => import("tree-sitter-ruby"),
    extractLanguage: unwrapDefaultExport,
    chunkableTypes: [
      "method", // def method_name ... end
      "singleton_method", // def self.method_name ... end
      "class", // class Foo ... end (small classes are kept whole)
      "module", // module Bar ... end (small modules are kept whole)
      "singleton_class", // class << self ... end
    ],
    // Units searched for when a class/module is too large to keep whole.
    // "singleton_class" is deliberately absent: the child search walks THROUGH
    // `class << self ... end` and picks up the methods defined inside it.
    childChunkTypes: ["method", "singleton_method"],
    // Types intentionally left out of both lists:
    // - "lambda", "block" -> too small (1 line), fragments context
    // - "do_block"        -> creates too many tiny chunks from iterators
    // - "rescue"          -> loses the protected code's context
  },

  markdown: {
    // Markdown is parsed with remark (unified/mdast) rather than tree-sitter
    // because of compatibility issues with the tree-sitter-markdown grammar
    // (requires tree-sitter 0.26+). There is therefore no grammar to load.
    loadModule: () => Promise.resolve(null),
    chunkableTypes: [],
    // Documentation flag - enables filtering in the search API.
    isDocumentation: true,
    // Bypass tree-sitter and use the remark-based chunker.
    skipTreeSitter: true,
  } as LanguageDefinition & { skipTreeSitter?: boolean },
};
/**
 * AST-aware chunker.
 *
 * Source files are parsed with tree-sitter and split along the node types
 * listed in `LANGUAGE_DEFINITIONS`; markdown is parsed with remark and split
 * along headings and code blocks. Anything that cannot be handled this way
 * (unknown language, grammar load failure, parse error, no usable nodes) is
 * delegated to the character-based fallback chunker.
 *
 * Parsers are created lazily, one per language, on first use.
 */
export class TreeSitterChunker implements CodeChunker {
  /** AST nodes and markdown sections shorter than this (characters) are dropped. */
  private static readonly MIN_CHUNK_CHARS = 50;
  /** Markdown code blocks shorter than this (characters) are dropped. */
  private static readonly MIN_CODE_BLOCK_CHARS = 30;
  /** Sources at or below this length are not re-chunked when the AST yields nothing. */
  private static readonly MIN_FALLBACK_CHARS = 100;

  /** Cache of initialized parsers (lazy-loaded) */
  private parserCache: Map<string, LanguageConfig> = new Map();
  private fallbackChunker: CharacterChunker;
  /** In-flight parser loads, so concurrent requests share one import. */
  private loadingPromises: Map<string, Promise<LanguageConfig | null>> = new Map();

  constructor(private config: ChunkerConfig) {
    this.fallbackChunker = new CharacterChunker(config);
    // NO parser initialization here - parsers are loaded on demand.
  }

  /** Content longer than this many characters is split further. */
  private get oversizeLimit(): number {
    return this.config.maxChunkSize * 2;
  }

  /**
   * Look up a language definition by its own key only.
   * A plain `obj[key]` / `key in obj` would also match inherited names such as
   * "toString" or "constructor".
   */
  private getDefinition(language: string): LanguageDefinition | undefined {
    return Object.prototype.hasOwnProperty.call(LANGUAGE_DEFINITIONS, language)
      ? LANGUAGE_DEFINITIONS[language]
      : undefined;
  }

  /** True when the definition opts out of tree-sitter parsing. */
  private skipsTreeSitter(definition: LanguageDefinition): boolean {
    return (
      (definition as LanguageDefinition & { skipTreeSitter?: boolean }).skipTreeSitter === true
    );
  }

  /**
   * Get or lazily initialize the parser for a language.
   *
   * @returns the cached/loaded config, or null when the language is unknown,
   *          does not use tree-sitter, or its grammar failed to load. Failed
   *          loads are not cached, so a later call retries the import.
   */
  private async getLanguageConfig(language: string): Promise<LanguageConfig | null> {
    const cached = this.parserCache.get(language);
    if (cached) {
      return cached;
    }

    // Share an in-flight load instead of importing the grammar twice.
    const inFlight = this.loadingPromises.get(language);
    if (inFlight) {
      return inFlight;
    }

    const definition = this.getDefinition(language);
    // Languages that skip tree-sitter have no grammar module to load; trying
    // would only produce a spurious "failed to load" error.
    if (!definition || this.skipsTreeSitter(definition)) {
      return null;
    }

    const loadPromise = this.initializeParser(language, definition);
    this.loadingPromises.set(language, loadPromise);
    try {
      const loaded = await loadPromise;
      if (loaded) {
        this.parserCache.set(language, loaded);
      }
      return loaded;
    } finally {
      this.loadingPromises.delete(language);
    }
  }

  /**
   * Import the grammar for `language` and build a parser around it.
   *
   * @returns the parser config, or null (after logging) if the import or
   *          `setLanguage` throws.
   */
  private async initializeParser(
    language: string,
    definition: LanguageDefinition,
  ): Promise<LanguageConfig | null> {
    try {
      const startTime = Date.now();

      // Dynamic import of the language module
      const mod = await definition.loadModule();
      const langModule = definition.extractLanguage
        ? definition.extractLanguage(mod)
        : mod.default || mod;

      const parser = new Parser();
      parser.setLanguage(langModule as any);

      if (process.env.DEBUG) {
        console.error(
          `[TreeSitter] Lazy-loaded ${language} parser in ${Date.now() - startTime}ms`,
        );
      }

      return {
        parser,
        chunkableTypes: definition.chunkableTypes,
        childChunkTypes: definition.childChunkTypes,
        isDocumentation: definition.isDocumentation,
      };
    } catch (error) {
      console.error(`[TreeSitter] Failed to load parser for ${language}:`, error);
      return null;
    }
  }

  /**
   * Split `code` into chunks.
   *
   * - Documentation languages that skip tree-sitter go to the markdown chunker.
   * - Otherwise every chunkable AST node becomes one chunk; nodes shorter than
   *   50 characters are dropped.
   * - A node longer than `2 * maxChunkSize` is replaced by its child chunk
   *   nodes (when the language defines any and some are found) or split by the
   *   fallback chunker. Children that are themselves too long are split by the
   *   fallback chunker as well.
   * - `metadata.chunkIndex` is the position of the chunk in the returned array.
   *
   * @param code     full source text
   * @param filePath path recorded in each chunk's metadata
   * @param language language id (key of the language registry)
   * @returns chunks in document order; never throws for parse errors, which
   *          are logged and answered with the fallback chunker's result
   */
  async chunk(code: string, filePath: string, language: string): Promise<CodeChunk[]> {
    const definition = this.getDefinition(language);
    if (definition && this.skipsTreeSitter(definition) && definition.isDocumentation) {
      return this.chunkMarkdownSimple(code, filePath, language);
    }

    const langConfig = await this.getLanguageConfig(language);
    if (!langConfig) {
      return this.fallbackChunker.chunk(code, filePath, language);
    }

    try {
      // NOTE(review): some node-tree-sitter releases reject string inputs larger
      // than their default read buffer; confirm whether a `bufferSize` option is
      // needed here so big files do not always end up in the catch below.
      const tree = langConfig.parser.parse(code);
      const chunks: CodeChunk[] = [];
      const childTypes = langConfig.childChunkTypes ?? [];
      const nodes = this.findChunkableNodes(tree.rootNode, langConfig.chunkableTypes);

      for (const node of nodes) {
        const content = code.substring(node.startIndex, node.endIndex);

        if (content.length < TreeSitterChunker.MIN_CHUNK_CHARS) {
          continue;
        }

        if (content.length <= this.oversizeLimit) {
          chunks.push({
            content: content.trim(),
            startLine: node.startPosition.row + 1,
            endLine: node.endPosition.row + 1,
            metadata: {
              filePath,
              language,
              // Position in the output, not in `nodes`: skipped and split nodes
              // would otherwise leave gaps or produce duplicate indices.
              chunkIndex: chunks.length,
              chunkType: this.getChunkType(node.type),
              name: this.extractName(node, code),
            },
          });
          continue;
        }

        // Oversized node: prefer AST-aware splitting (e.g. methods of a class).
        const parent = {
          parentName: this.extractName(node, code),
          parentType: node.type,
        };
        const childNodes =
          childTypes.length > 0 ? this.findChildChunkableNodes(node, childTypes) : [];

        if (childNodes.length === 0) {
          // Nothing smaller to chunk by - split the text itself.
          await this.appendSplitChunks(
            chunks,
            content,
            node.startPosition.row,
            filePath,
            language,
            parent,
          );
          continue;
        }

        for (const childNode of childNodes) {
          const childContent = code.substring(childNode.startIndex, childNode.endIndex);

          if (childContent.length > this.oversizeLimit) {
            // The child is too large as well - split its text.
            await this.appendSplitChunks(
              chunks,
              childContent,
              childNode.startPosition.row,
              filePath,
              language,
              parent,
            );
            continue;
          }

          if (childContent.length < TreeSitterChunker.MIN_CHUNK_CHARS) {
            continue;
          }

          chunks.push({
            content: childContent.trim(),
            startLine: childNode.startPosition.row + 1,
            endLine: childNode.endPosition.row + 1,
            metadata: {
              filePath,
              language,
              chunkIndex: chunks.length,
              chunkType: this.getChunkType(childNode.type),
              name: this.extractName(childNode, code),
              // Keep the enclosing class/module as context.
              parentName: parent.parentName,
              parentType: parent.parentType,
            },
          });
        }
      }

      // The AST produced nothing usable for a non-trivial file: fall back.
      if (chunks.length === 0 && code.length > TreeSitterChunker.MIN_FALLBACK_CHARS) {
        return this.fallbackChunker.chunk(code, filePath, language);
      }

      return chunks;
    } catch (error) {
      // On parsing error, fall back to character-based chunking
      console.error(`Tree-sitter parsing failed for ${filePath}:`, error);
      return this.fallbackChunker.chunk(code, filePath, language);
    }
  }

  /** True when `language` is a key of the language registry (own keys only). */
  supportsLanguage(language: string): boolean {
    return this.getDefinition(language) !== undefined;
  }

  getStrategyName(): string {
    return "tree-sitter";
  }

  /**
   * Get list of supported languages
   */
  getSupportedLanguages(): string[] {
    return Object.keys(LANGUAGE_DEFINITIONS);
  }

  /**
   * Preload specific language parsers (optional optimization).
   * Unknown languages and languages that do not use tree-sitter are ignored.
   */
  async preloadLanguages(languages: string[]): Promise<void> {
    await Promise.all(languages.map((lang) => this.getLanguageConfig(lang)));
  }

  /**
   * Get stats about loaded parsers
   */
  getLoadedParsers(): { loaded: string[]; available: string[] } {
    return {
      loaded: Array.from(this.parserCache.keys()),
      available: Object.keys(LANGUAGE_DEFINITIONS),
    };
  }

  /**
   * Split `content` with the fallback chunker and append the pieces to `chunks`.
   *
   * @param chunks        output array, mutated in place
   * @param content       text to split
   * @param startRow      0-based file row on which `content` starts; the
   *                      sub-chunk line numbers (assumed 1-based relative to
   *                      `content`, as the callers have always treated them)
   *                      are shifted by it
   * @param extraMetadata metadata merged over what the fallback chunker set
   */
  private async appendSplitChunks(
    chunks: CodeChunk[],
    content: string,
    startRow: number,
    filePath: string,
    language: string,
    extraMetadata: Partial<CodeChunk["metadata"]>,
  ): Promise<void> {
    const subChunks = await this.fallbackChunker.chunk(content, filePath, language);
    for (const subChunk of subChunks) {
      chunks.push({
        ...subChunk,
        startLine: startRow + subChunk.startLine,
        endLine: startRow + subChunk.endLine,
        metadata: {
          ...subChunk.metadata,
          chunkIndex: chunks.length,
          ...extraMetadata,
        },
      });
    }
  }

  /**
   * Remark-based markdown chunker using the unified/mdast AST parser.
   * Uses remark (CommonMark/GFM parser) instead of tree-sitter due to
   * compatibility issues with the tree-sitter-markdown grammar.
   *
   * Produces, in this order:
   * 1. The text before the first top-level heading ("Preamble"), or the whole
   *    document when it has no headings at all.
   * 2. One chunk per section: a top-level heading plus everything up to the
   *    next top-level heading of ANY depth (or the end of the document).
   * 3. One chunk per code block of at least 30 characters, tagged with the
   *    block's own language, so code examples are searchable separately.
   *
   * Prose shorter than 50 characters is dropped; prose longer than
   * `2 * maxChunkSize` is split by the fallback chunker. If nothing qualifies,
   * the whole document is emitted as a single prose chunk.
   */
  private async chunkMarkdownSimple(
    code: string,
    filePath: string,
    language: string,
  ): Promise<CodeChunk[]> {
    const chunks: CodeChunk[] = [];
    const lines = code.split("\n");

    // Parse markdown with remark (GFM for GitHub flavored markdown)
    const tree = remark().use(remarkGfm).parse(code) as Root;

    // Top-level headings delimit the sections.
    interface HeadingInfo {
      depth: number;
      text: string;
      startLine: number;
    }
    const headings: HeadingInfo[] = [];
    for (const node of tree.children) {
      if (node.type === "heading" && node.position) {
        headings.push({
          depth: node.depth,
          text: this.extractTextFromMdastNode(node),
          startLine: node.position.start.line,
        });
      }
    }

    // Code blocks are collected at any nesting depth (lists, blockquotes, ...).
    interface CodeBlockInfo {
      lang: string | undefined;
      value: string;
      startLine: number;
      endLine: number;
    }
    const codeBlocks: CodeBlockInfo[] = [];
    const collectCodeBlocks = (node: Content): void => {
      if (node.type === "code" && node.position) {
        const codeNode = node as Code;
        const contentLines = this.codeBlockContentLines(
          lines,
          node.position.start,
          node.position.end,
        );
        codeBlocks.push({
          lang: codeNode.lang || undefined,
          value: codeNode.value,
          startLine: contentLines.startLine,
          endLine: contentLines.endLine,
        });
      }
      if ("children" in node && Array.isArray(node.children)) {
        for (const child of node.children) {
          collectCodeBlocks(child as Content);
        }
      }
    };
    for (const child of tree.children) {
      collectCodeBlocks(child);
    }

    // Appends one prose chunk, or its fallback-split pieces when it is too large.
    const addProse = async (
      content: string,
      startLine: number,
      endLine: number,
      name: string | undefined,
      splitMetadata: Partial<CodeChunk["metadata"]>,
    ): Promise<void> => {
      if (content.length < TreeSitterChunker.MIN_CHUNK_CHARS) {
        return;
      }
      const nameField = name !== undefined ? { name } : {};
      if (content.length > this.oversizeLimit) {
        await this.appendSplitChunks(chunks, content, startLine - 1, filePath, language, {
          ...nameField,
          ...splitMetadata,
          isDocumentation: true,
        });
        return;
      }
      chunks.push({
        content,
        startLine,
        endLine,
        metadata: {
          filePath,
          language,
          chunkIndex: chunks.length,
          chunkType: "block",
          ...nameField,
          isDocumentation: true,
        },
      });
    };

    // 1. Text before the first heading. Without headings this is the whole
    //    document, so its prose is kept even when code blocks are present.
    const firstHeading = headings[0];
    const leadEndLine = firstHeading ? firstHeading.startLine - 1 : lines.length;
    await addProse(
      lines.slice(0, leadEndLine).join("\n").trim(),
      1,
      leadEndLine,
      firstHeading ? "Preamble" : undefined,
      {},
    );

    // 2. Sections. Ending at the next heading of any depth keeps chunks small
    //    and focused for semantic search.
    for (const [i, heading] of headings.entries()) {
      const nextHeading = headings[i + 1];
      const sectionEndLine = nextHeading ? nextHeading.startLine - 1 : lines.length;
      const sectionContent = lines
        .slice(heading.startLine - 1, sectionEndLine)
        .join("\n")
        .trim();
      await addProse(sectionContent, heading.startLine, sectionEndLine, heading.text, {
        parentName: heading.text,
        parentType: `h${heading.depth}`,
      });
    }

    // 3. Code blocks (for searching code examples in docs)
    for (const block of codeBlocks) {
      if (block.value.length < TreeSitterChunker.MIN_CODE_BLOCK_CHARS) {
        continue;
      }
      chunks.push({
        content: block.value,
        startLine: block.startLine,
        endLine: block.endLine,
        metadata: {
          filePath,
          // Use the code block's language, not "markdown"
          language: block.lang || "code",
          chunkIndex: chunks.length,
          chunkType: "block",
          name: block.lang ? `Code: ${block.lang}` : "Code block",
          isDocumentation: true,
        },
      });
    }

    // Nothing qualified (e.g. only tiny sections): keep the document as a whole.
    if (chunks.length === 0) {
      await addProse(code.trim(), 1, lines.length, undefined, {});
    }

    return chunks;
  }

  /**
   * Work out which source lines hold the content of a markdown code block.
   *
   * A fenced block's position includes the opening fence line and, when the
   * block is closed, the closing fence line; an indented block has neither.
   *
   * @param lines source split on "\n"
   * @param start 1-based start point of the code node
   * @param end   1-based end point of the code node
   */
  private codeBlockContentLines(
    lines: string[],
    start: { line: number; column: number },
    end: { line: number },
  ): { startLine: number; endLine: number } {
    const opener = (lines[start.line - 1] ?? "")
      .slice(Math.max(start.column - 1, 0))
      .trimStart();
    const isFenced = opener.startsWith("```") || opener.startsWith("~~~");
    if (!isFenced) {
      return { startLine: start.line, endLine: end.line };
    }

    // An unterminated fence runs to the end of its container and has no
    // closing line to skip. Container prefixes ("> ", indentation) may precede
    // the closing fence.
    const lastLine = lines[end.line - 1] ?? "";
    const isClosed = end.line > start.line && /^[\s>]*(?:`{3,}|~{3,})\s*$/.test(lastLine);
    return {
      startLine: start.line + 1,
      endLine: isClosed ? end.line - 1 : end.line,
    };
  }

  /**
   * Concatenate the text of an mdast node and its descendants.
   * Both `text` and `inlineCode` leaves contribute their value, so a heading
   * containing inline code keeps that code in its name; other leaves
   * contribute nothing.
   */
  private extractTextFromMdastNode(node: Content): string {
    if (node.type === "text" || node.type === "inlineCode") {
      return node.value;
    }
    if ("children" in node && Array.isArray(node.children)) {
      return node.children.map((child: Content) => this.extractTextFromMdastNode(child)).join("");
    }
    return "";
  }

  /**
   * Collect, in document order, the outermost nodes under `node` (inclusive)
   * whose type is in `chunkableTypes`. Matches are not searched further, so
   * chunks never nest.
   */
  private findChunkableNodes(
    node: Parser.SyntaxNode,
    chunkableTypes: string[],
  ): Parser.SyntaxNode[] {
    const found: Parser.SyntaxNode[] = [];
    const visit = (current: Parser.SyntaxNode): void => {
      if (chunkableTypes.includes(current.type)) {
        found.push(current);
        return;
      }
      for (const child of current.children) {
        visit(child);
      }
    };
    visit(node);
    return found;
  }

  /**
   * Collect the outermost descendants of `parentNode` whose type is in
   * `childChunkTypes` (e.g. methods inside a class). The parent itself is
   * never returned, even if its own type is listed; non-matching nodes in
   * between are walked through.
   */
  private findChildChunkableNodes(
    parentNode: Parser.SyntaxNode,
    childChunkTypes: string[],
  ): Parser.SyntaxNode[] {
    const found: Parser.SyntaxNode[] = [];
    for (const child of parentNode.children) {
      found.push(...this.findChunkableNodes(child, childChunkTypes));
    }
    return found;
  }

  /**
   * Extract the declared name of a function/class-like node.
   * Uses the grammar's `name` field when present, otherwise the first direct
   * child that is an identifier or type identifier.
   *
   * @returns the name, or undefined when neither lookup finds one
   */
  private extractName(node: Parser.SyntaxNode, code: string): string | undefined {
    const nameNode = node.childForFieldName("name");
    if (nameNode) {
      return code.substring(nameNode.startIndex, nameNode.endIndex);
    }

    // For some node types the name is not exposed as a field.
    for (const child of node.children) {
      if (child.type === "identifier" || child.type === "type_identifier") {
        return code.substring(child.startIndex, child.endIndex);
      }
    }

    return undefined;
  }

  /**
   * Map an AST node type to a chunk type by substring. The checks run in
   * order, so e.g. "singleton_method" is a function and "singleton_class" a
   * class; anything unmatched is a "block".
   */
  private getChunkType(nodeType: string): "function" | "class" | "interface" | "block" {
    if (nodeType.includes("function") || nodeType.includes("method")) {
      return "function";
    }
    if (nodeType.includes("class") || nodeType.includes("struct") || nodeType.includes("module")) {
      return "class";
    }
    if (nodeType.includes("interface") || nodeType.includes("trait")) {
      return "interface";
    }
    return "block";
  }
}