Codebase Context

Overview Schema Related Servers Score Discussions

indexer.ts•21.8 KiB

/**
 * Core Indexer - Orchestrates codebase indexing
 * Scans files, delegates to analyzers, creates embeddings, stores in vector DB
 */

import { promises as fs } from "fs";
import path from "path";
import { glob } from "glob";
import ignore from "ignore";
import {
  CodebaseMetadata,
  CodeChunk,
  IndexingProgress,
  IndexingStats,
  IndexingPhase,
  CodebaseConfig,
  AnalysisResult,
} from "../types/index.js";
import { analyzerRegistry } from "./analyzer-registry.js";
import { isCodeFile, isBinaryFile } from "../utils/language-detection.js";
import {
  getEmbeddingProvider,
  EmbeddingProvider,
} from "../embeddings/index.js";
import {
  getStorageProvider,
  VectorStorageProvider,
  CodeChunkWithEmbedding,
} from "../storage/index.js";
import {
  LibraryUsageTracker,
  PatternDetector,
  ImportGraph,
  InternalFileGraph,
  FileExport,
} from "../utils/usage-tracker.js";
import { getFileCommitDates } from "../utils/git-dates.js";

/**
 * Extensions that count as "the import already names a concrete file".
 * Anything else (no extension, or a dotted basename such as `./user.service`)
 * is assumed to refer to a TypeScript source file.
 */
const KNOWN_IMPORT_EXTENSIONS: ReadonlySet<string> = new Set([
  ".ts",
  ".tsx",
  ".mts",
  ".cts",
  ".js",
  ".jsx",
  ".mjs",
  ".cjs",
  ".json",
  ".html",
  ".css",
  ".scss",
  ".sass",
  ".less",
  ".vue",
  ".svg",
]);

/** Options accepted by {@link CodebaseIndexer}. */
export interface IndexerOptions {
  /** Directory to index; resolved to an absolute path by the constructor. */
  rootPath: string;
  /** Partial configuration merged over the built-in defaults. */
  config?: Partial<CodebaseConfig>;
  /** Invoked whenever the indexer's progress record changes. */
  onProgress?: (progress: IndexingProgress) => void;
}

/**
 * Scans a codebase, runs the registered analyzers over every matching file,
 * optionally embeds the resulting chunks, and persists the index plus an
 * "intelligence" summary (library usage, patterns, import graph) under the
 * root path.
 */
export class CodebaseIndexer {
  private rootPath: string;
  private config: CodebaseConfig;
  private progress: IndexingProgress;
  private onProgressCallback?: (progress: IndexingProgress) => void;

  constructor(options: IndexerOptions) {
    this.rootPath = path.resolve(options.rootPath);
    this.config = this.mergeConfig(options.config);
    this.onProgressCallback = options.onProgress;

    this.progress = {
      phase: "initializing",
      percentage: 0,
      filesProcessed: 0,
      totalFiles: 0,
      chunksCreated: 0,
      errors: [],
      startedAt: new Date(),
    };
  }

  /**
   * Merge a user-supplied partial config over the defaults.
   * Top-level keys are overridden wholesale; the nested sections
   * (analyzers, parsing, styleGuides, documentation, embedding, storage)
   * are merged one level deep so a user can override a single field.
   */
  private mergeConfig(userConfig?: Partial<CodebaseConfig>): CodebaseConfig {
    const defaultConfig: CodebaseConfig = {
      analyzers: {
        angular: { enabled: true, priority: 100 },
        react: { enabled: false, priority: 90 },
        vue: { enabled: false, priority: 90 },
        generic: { enabled: true, priority: 10 },
      },
      include: ["**/*.{ts,tsx,js,jsx,html,css,scss,sass,less}"],
      exclude: [
        "node_modules/**",
        "dist/**",
        "build/**",
        ".git/**",
        "coverage/**",
      ],
      respectGitignore: true,
      parsing: {
        maxFileSize: 1048576, // 1MB
        chunkSize: 100,
        chunkOverlap: 10,
        parseTests: true,
        parseNodeModules: false,
      },
      styleGuides: {
        autoDetect: true,
        paths: ["STYLE_GUIDE.md", "docs/style-guide.md", "ARCHITECTURE.md"],
        parseMarkdown: true,
      },
      documentation: {
        autoDetect: true,
        includeReadmes: true,
        includeChangelogs: false,
      },
      embedding: {
        provider: "transformers",
        model: "Xenova/bge-base-en-v1.5",
        batchSize: 100,
      },
      skipEmbedding: false,
      storage: {
        provider: "lancedb",
        path: "./codebase-index",
      },
    };

    return {
      ...defaultConfig,
      ...userConfig,
      analyzers: { ...defaultConfig.analyzers, ...userConfig?.analyzers },
      parsing: { ...defaultConfig.parsing, ...userConfig?.parsing },
      styleGuides: { ...defaultConfig.styleGuides, ...userConfig?.styleGuides },
      documentation: {
        ...defaultConfig.documentation,
        ...userConfig?.documentation,
      },
      embedding: { ...defaultConfig.embedding, ...userConfig?.embedding },
      storage: { ...defaultConfig.storage, ...userConfig?.storage },
    };
  }

  /**
   * Run the full indexing pipeline: scan, analyze, embed (unless
   * `skipEmbedding` is set), store, and write the intelligence summary.
   *
   * Side effects under `rootPath`: writes `.codebase-index.json` and
   * `.codebase-intelligence.json`, and (when embedding is enabled) clears and
   * repopulates the vector store at `.codebase-index`.
   *
   * @returns Statistics describing the run. Per-file analysis failures are
   *   collected in `stats.errors` and do not abort the run.
   * @throws Re-throws any error raised outside the per-file analysis step,
   *   after switching the progress phase to `"error"`.
   */
  async index(): Promise<IndexingStats> {
    const startTime = Date.now();
    const stats: IndexingStats = {
      totalFiles: 0,
      indexedFiles: 0,
      skippedFiles: 0,
      totalChunks: 0,
      totalLines: 0,
      duration: 0,
      avgChunkSize: 0,
      componentsByType: {},
      componentsByLayer: {
        presentation: 0,
        business: 0,
        data: 0,
        state: 0,
        core: 0,
        shared: 0,
        feature: 0,
        infrastructure: 0,
        unknown: 0,
      },
      errors: [],
      startedAt: new Date(),
    };

    try {
      // Phase 1: Scanning
      this.updateProgress("scanning", 0);
      let files = await this.scanFiles();

      // Memory safety: limit total files to prevent heap exhaustion
      const MAX_FILES = 10000;
      if (files.length > MAX_FILES) {
        console.warn(
          `WARNING: Found ${files.length} files, limiting to ${MAX_FILES} to prevent memory issues.`
        );
        console.warn(
          `Consider using more specific include patterns or excluding large directories.`
        );
        files = files.slice(0, MAX_FILES);
      }

      stats.totalFiles = files.length;
      this.progress.totalFiles = files.length;
      console.error(`Found ${files.length} files to index`);

      // Phase 2: Analyzing & Parsing
      this.updateProgress("analyzing", 0);
      const allChunks: CodeChunk[] = [];
      const libraryTracker = new LibraryUsageTracker();
      const patternDetector = new PatternDetector();
      const importGraph = new ImportGraph();
      const internalFileGraph = new InternalFileGraph(this.rootPath);

      // Fetch git commit dates for pattern momentum analysis
      const fileDates = await getFileCommitDates(this.rootPath);

      for (let i = 0; i < files.length; i++) {
        const file = files[i];
        this.progress.currentFile = file;
        this.progress.filesProcessed = i + 1;
        this.progress.percentage = Math.round(((i + 1) / files.length) * 100);

        try {
          // Normalize line endings to \n for consistent cross-platform output
          const rawContent = await fs.readFile(file, "utf-8");
          const content = rawContent.replace(/\r\n/g, "\n");
          const result = await analyzerRegistry.analyzeFile(file, content);

          if (result) {
            allChunks.push(...result.chunks);
            stats.indexedFiles++;
            stats.totalLines += content.split("\n").length;

            // Track library usage AND import graph from imports
            for (const imp of result.imports) {
              libraryTracker.track(imp.source, file);
              importGraph.trackImport(imp.source, file, imp.line || 1);

              // Track internal file-to-file imports (relative paths)
              if (imp.source.startsWith(".")) {
                internalFileGraph.trackImport(
                  file,
                  this.resolveRelativeImport(file, imp.source),
                  imp.imports
                );
              }
            }

            // Track exports for unused export detection
            if (result.exports && result.exports.length > 0) {
              const fileExports: FileExport[] = result.exports.map((exp) => ({
                name: exp.name,
                type: exp.isDefault
                  ? "default"
                  : (exp.type as FileExport["type"]) || "other",
              }));
              internalFileGraph.trackExports(file, fileExports);
            }

            // Detect generic patterns from code
            patternDetector.detectFromCode(content, file);

            // Short display path: the last three path segments of the file
            const relPath = file.split(/[\\/]/).slice(-3).join("/");

            // Get file date for pattern momentum tracking.
            // Git reports forward-slash paths, so normalize before lookup.
            const normalizedRelPath = path
              .relative(this.rootPath, file)
              .replace(/\\/g, "/");
            const fileDate = fileDates.get(normalizedRelPath);

            // GENERIC PATTERN FORWARDING
            // Framework analyzers return detectedPatterns in metadata - we just forward them
            // This keeps the indexer framework-agnostic
            if (result.metadata?.detectedPatterns) {
              for (const pattern of result.metadata.detectedPatterns) {
                // Try to extract a relevant snippet for the pattern
                const snippetPattern = this.getSnippetPatternFor(
                  pattern.category,
                  pattern.name
                );
                const snippet = snippetPattern
                  ? this.extractSnippet(content, snippetPattern)
                  : undefined;
                patternDetector.track(
                  pattern.category,
                  pattern.name,
                  snippet ? { file: relPath, snippet } : undefined,
                  fileDate
                );
              }
            }

            // Track file for Golden File scoring (framework-agnostic based on patterns)
            const detectedPatterns = result.metadata?.detectedPatterns || [];
            const hasPattern = (category: string, name: string) =>
              detectedPatterns.some(
                (p: { category: string; name: string }) =>
                  p.category === category && p.name === name
              );

            const goldenFlags = {
              inject: hasPattern("dependencyInjection", "inject() function"),
              signals: hasPattern("stateManagement", "Signals"),
              computed: hasPattern("reactivity", "Computed"),
              effect: hasPattern("reactivity", "Effect"),
              standalone: hasPattern("componentStyle", "Standalone"),
              signalInputs: hasPattern("componentInputs", "Signal-based inputs"),
            };
            // One point per modern pattern present in the file
            const patternScore =
              Object.values(goldenFlags).filter(Boolean).length;

            if (patternScore >= 3) {
              patternDetector.trackGoldenFile(relPath, patternScore, goldenFlags);
            }

            // Update component statistics
            for (const component of result.components) {
              if (component.componentType) {
                stats.componentsByType[component.componentType] =
                  (stats.componentsByType[component.componentType] || 0) + 1;
              }
              if (component.layer) {
                // Guard against a layer that is missing from the initial map,
                // which would otherwise turn the counter into NaN.
                stats.componentsByLayer[component.layer] =
                  (stats.componentsByLayer[component.layer] ?? 0) + 1;
              }
            }
          } else {
            stats.skippedFiles++;
          }
        } catch (error) {
          // A single unreadable/unparseable file must not abort the whole run
          stats.skippedFiles++;
          stats.errors.push({
            filePath: file,
            error: error instanceof Error ? error.message : String(error),
            phase: "analyzing",
            timestamp: new Date(),
          });
        }

        if (this.onProgressCallback) {
          this.onProgressCallback(this.progress);
        }
      }

      stats.totalChunks = allChunks.length;
      stats.avgChunkSize =
        allChunks.length > 0
          ? Math.round(
              allChunks.reduce((sum, c) => sum + c.content.length, 0) /
                allChunks.length
            )
          : 0;

      // Memory safety: limit chunks to prevent embedding memory issues
      const MAX_CHUNKS = 5000;
      let chunksToEmbed = allChunks;
      if (allChunks.length > MAX_CHUNKS) {
        console.warn(
          `WARNING: ${allChunks.length} chunks exceed limit. Indexing first ${MAX_CHUNKS} chunks.`
        );
        chunksToEmbed = allChunks.slice(0, MAX_CHUNKS);
      }

      // Phase 3: Embedding
      const chunksWithEmbeddings: CodeChunkWithEmbedding[] = [];

      if (!this.config.skipEmbedding) {
        this.updateProgress("embedding", 50);
        console.error(`Creating embeddings for ${chunksToEmbed.length} chunks...`);

        // Initialize embedding provider
        const embeddingProvider = await getEmbeddingProvider(this.config.embedding);

        // Generate embeddings for all chunks.
        // `||` (not `??`) is deliberate: a batch size of 0 would never advance the loop.
        const batchSize = this.config.embedding?.batchSize || 32;
        for (let i = 0; i < chunksToEmbed.length; i += batchSize) {
          const batch = chunksToEmbed.slice(i, i + batchSize);
          const texts = batch.map((chunk) => {
            // Create a searchable text representation
            const parts = [chunk.content];
            if (chunk.metadata?.componentName) {
              parts.unshift(`Component: ${chunk.metadata.componentName}`);
            }
            if (chunk.componentType) {
              parts.unshift(`Type: ${chunk.componentType}`);
            }
            return parts.join("\n");
          });

          const embeddings = await embeddingProvider.embedBatch(texts);

          for (let j = 0; j < batch.length; j++) {
            chunksWithEmbeddings.push({
              ...batch[j],
              embedding: embeddings[j],
            });
          }

          // Update progress (embedding occupies the 50%..75% band)
          const embeddingProgress =
            50 + Math.round((i / chunksToEmbed.length) * 25);
          this.updateProgress("embedding", embeddingProgress);

          if (
            (i + batchSize) % 100 === 0 ||
            i + batchSize >= chunksToEmbed.length
          ) {
            console.error(
              `Embedded ${Math.min(i + batchSize, chunksToEmbed.length)}/${chunksToEmbed.length} chunks`
            );
          }
        }
      } else {
        console.error("Skipping embedding generation (skipEmbedding=true)");
      }

      // Phase 4: Storing
      this.updateProgress("storing", 75);

      if (!this.config.skipEmbedding) {
        console.error(`Storing ${chunksToEmbed.length} chunks...`);

        // Store in LanceDB for vector search
        const storagePath = path.join(this.rootPath, ".codebase-index");
        const storageProvider = await getStorageProvider({ path: storagePath });
        await storageProvider.clear(); // Clear existing index
        await storageProvider.store(chunksWithEmbeddings);
      }

      // Also save JSON for keyword search (Fuse.js) - use chunksToEmbed for consistency
      const indexPath = path.join(this.rootPath, ".codebase-index.json");
      // Write without pretty-printing to save memory
      await fs.writeFile(indexPath, JSON.stringify(chunksToEmbed));

      // Save library usage and pattern stats
      const intelligencePath = path.join(
        this.rootPath,
        ".codebase-intelligence.json"
      );
      const libraryStats = libraryTracker.getStats();

      // Extract tsconfig paths for AI to understand import aliases
      let tsconfigPaths: Record<string, string[]> | undefined;
      try {
        const tsconfigPath = path.join(this.rootPath, "tsconfig.json");
        const tsconfigContent = await fs.readFile(tsconfigPath, "utf-8");
        const tsconfig = JSON.parse(tsconfigContent);
        const aliasMap: Record<string, string[]> | undefined =
          tsconfig.compilerOptions?.paths;
        if (aliasMap) {
          tsconfigPaths = aliasMap;
          console.error(
            `Found ${Object.keys(aliasMap).length} path aliases in tsconfig.json`
          );
        }
      } catch (error) {
        // No tsconfig.json or no paths defined
      }

      const intelligence = {
        libraryUsage: libraryStats,
        patterns: patternDetector.getAllPatterns(),
        goldenFiles: patternDetector.getGoldenFiles(5),
        // tsconfig paths help AI understand import aliases (e.g., @mycompany/* -> libs/*)
        // This reveals which @scoped packages are internal vs external
        tsconfigPaths,
        importGraph: {
          usages: importGraph.getAllUsages(),
          topUsed: importGraph.getTopUsed(30),
        },
        // Internal file graph for circular dependency and unused export detection
        internalFileGraph: internalFileGraph.toJSON(),
        generatedAt: new Date().toISOString(),
      };
      await fs.writeFile(intelligencePath, JSON.stringify(intelligence, null, 2));

      // Phase 5: Complete
      this.updateProgress("complete", 100);

      stats.duration = Date.now() - startTime;
      stats.completedAt = new Date();

      console.error(`Indexing complete in ${stats.duration}ms`);
      console.error(
        `Indexed ${stats.indexedFiles} files, ${stats.totalChunks} chunks`
      );

      return stats;
    } catch (error) {
      // Remember which phase failed BEFORE flipping the progress to "error",
      // otherwise the recorded phase would always read "error".
      const failedPhase = this.progress.phase;
      this.progress.phase = "error";
      stats.errors.push({
        filePath: this.rootPath,
        error: error instanceof Error ? error.message : String(error),
        phase: failedPhase,
        timestamp: new Date(),
      });
      throw error;
    }
  }

  /**
   * Resolve a relative import specifier to the absolute path it most likely
   * refers to. The file system is not consulted (for performance); when the
   * specifier does not end in a recognised file extension, `.ts` is assumed.
   * This covers both bare specifiers (`./util`) and dotted basenames
   * (`./user.service`), whose trailing segment is not a real extension.
   */
  private resolveRelativeImport(importingFile: string, source: string): string {
    const resolved = path.resolve(path.dirname(importingFile), source);
    const ext = path.extname(resolved).toLowerCase();
    return KNOWN_IMPORT_EXTENSIONS.has(ext) ? resolved : `${resolved}.ts`;
  }

  /**
   * Return the lines surrounding the first match of `pattern` in `content`,
   * trimmed, or `undefined` when the pattern does not match.
   *
   * @param content File content with `\n` line endings.
   * @param pattern Pattern whose first match anchors the snippet.
   * @param linesBefore Number of lines of context before the matching line.
   * @param linesAfter Number of lines of context after the matching line.
   */
  private extractSnippet(
    content: string,
    pattern: RegExp,
    linesBefore = 1,
    linesAfter = 3
  ): string | undefined {
    const match = content.match(pattern);
    if (!match) return undefined;

    const lines = content.split("\n");
    // Zero-based index of the line on which the match starts
    const matchLine = content.substring(0, match.index).split("\n").length - 1;
    const start = Math.max(0, matchLine - linesBefore);
    const end = Math.min(lines.length, matchLine + linesAfter + 1);
    return lines.slice(start, end).join("\n").trim();
  }

  /**
   * Collect the absolute paths of all files to index.
   * Applies, in order: the include/exclude globs, `.gitignore` (when
   * `respectGitignore` is set), the code/binary file filter, and the
   * maximum file size. A file matched by several include patterns is
   * returned only once.
   */
  private async scanFiles(): Promise<string[]> {
    const files: string[] = [];
    const seen = new Set<string>();

    // Read .gitignore if respecting it
    let ig: ReturnType<typeof ignore.default> | null = null;
    if (this.config.respectGitignore) {
      try {
        const gitignorePath = path.join(this.rootPath, ".gitignore");
        const gitignoreContent = await fs.readFile(gitignorePath, "utf-8");
        ig = ignore.default().add(gitignoreContent);
      } catch (error) {
        // No .gitignore or couldn't read it
      }
    }

    // Scan with glob
    const includePatterns = this.config.include || ["**/*"];
    const excludePatterns = this.config.exclude || [];
    const maxFileSize = this.config.parsing?.maxFileSize || 1048576;

    for (const pattern of includePatterns) {
      const matches = await glob(pattern, {
        cwd: this.rootPath,
        absolute: true,
        ignore: excludePatterns,
        nodir: true,
      });

      for (const file of matches) {
        // Overlapping include patterns can yield the same file more than once
        if (seen.has(file)) {
          continue;
        }
        seen.add(file);

        const relativePath = path.relative(this.rootPath, file);

        // Check gitignore
        if (ig && ig.ignores(relativePath)) {
          continue;
        }

        // Check if it's a code file
        if (!isCodeFile(file) || isBinaryFile(file)) {
          continue;
        }

        // Check file size
        try {
          const fileStat = await fs.stat(file);
          if (fileStat.size > maxFileSize) {
            console.warn(`Skipping large file: ${file} (${fileStat.size} bytes)`);
            continue;
          }
        } catch (error) {
          // File vanished or is unreadable - skip it
          continue;
        }

        files.push(file);
      }
    }

    return files;
  }

  /** Record the current phase and percentage and notify the progress listener. */
  private updateProgress(phase: IndexingPhase, percentage: number): void {
    this.progress.phase = phase;
    this.progress.percentage = percentage;
    if (this.onProgressCallback) {
      this.onProgressCallback(this.progress);
    }
  }

  /**
   * Build codebase metadata using the first analyzer returned by the registry
   * (or an empty fallback record when none is registered), then enrich it with
   * data from `.codebase-intelligence.json` when that file exists.
   */
  async detectMetadata(): Promise<CodebaseMetadata> {
    // Try to use the most specific analyzer for metadata detection
    const primaryAnalyzer = analyzerRegistry.getAll()[0]; // Highest priority

    let metadata: CodebaseMetadata;
    if (primaryAnalyzer) {
      metadata = await primaryAnalyzer.detectCodebaseMetadata(this.rootPath);
    } else {
      // Fallback metadata
      metadata = {
        name: path.basename(this.rootPath),
        rootPath: this.rootPath,
        languages: [],
        dependencies: [],
        architecture: {
          type: "mixed",
          layers: {
            presentation: 0,
            business: 0,
            data: 0,
            state: 0,
            core: 0,
            shared: 0,
            feature: 0,
            infrastructure: 0,
            unknown: 0,
          },
          patterns: [],
        },
        styleGuides: [],
        documentation: [],
        projectStructure: {
          type: "single-app",
        },
        statistics: {
          totalFiles: 0,
          totalLines: 0,
          totalComponents: 0,
          componentsByType: {},
          componentsByLayer: {
            presentation: 0,
            business: 0,
            data: 0,
            state: 0,
            core: 0,
            shared: 0,
            feature: 0,
            infrastructure: 0,
            unknown: 0,
          },
        },
        customMetadata: {},
      };
    }

    // Load intelligence data if available
    try {
      const intelligencePath = path.join(
        this.rootPath,
        ".codebase-intelligence.json"
      );
      const intelligenceContent = await fs.readFile(intelligencePath, "utf-8");
      const intelligence = JSON.parse(intelligenceContent);

      metadata.customMetadata = {
        ...metadata.customMetadata,
        libraryUsage: intelligence.libraryUsage,
        patterns: intelligence.patterns,
        intelligenceGeneratedAt: intelligence.generatedAt,
      };
    } catch (error) {
      // Intelligence file doesn't exist yet (indexing not run)
    }

    return metadata;
  }

  /**
   * Get regex pattern for extracting code snippets based on pattern category and name
   * This maps abstract pattern names to actual code patterns
   *
   * @returns The pattern for the given category/name, or `null` when unmapped.
   */
  private getSnippetPatternFor(category: string, name: string): RegExp | null {
    const patterns: Record<string, Record<string, RegExp>> = {
      dependencyInjection: {
        "inject() function": /\binject\s*[<(]/,
        // Match the opening parenthesis of the constructor signature.
        // (An end-of-input anchor here would almost never match.)
        "Constructor injection": /constructor\s*\(/,
      },
      stateManagement: {
        RxJS: /BehaviorSubject|ReplaySubject|Subject|Observable/,
        Signals: /\bsignal\s*[<(]/,
      },
      reactivity: {
        Effect: /\beffect\s*\(/,
        Computed: /\bcomputed\s*[<(]/,
      },
      componentStyle: {
        Standalone: /standalone\s*:\s*true/,
        "NgModule-based": /@(?:Component|Directive|Pipe)\s*\(/,
      },
      componentInputs: {
        "Signal-based inputs": /\binput\s*[<(]/,
        // Match the decorator call wherever it appears in the file
        "Decorator-based @Input": /@Input\s*\(/,
      },
    };
    return patterns[category]?.[name] ?? null;
  }

  /** Return a shallow copy of the current progress record. */
  getProgress(): IndexingProgress {
    return { ...this.progress };
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PatrickSys/codebase-context'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

indexer.ts•21.8 KiB