/**
* Core Indexer - Orchestrates codebase indexing
* Scans files, delegates to analyzers, creates embeddings, stores in vector DB
*/
import { randomUUID } from 'crypto';
import { promises as fs } from 'fs';
import path from 'path';
import { glob } from 'glob';
import ignore from 'ignore';
import {
CodebaseMetadata,
CodeChunk,
IndexingProgress,
IndexingStats,
IndexingPhase,
CodebaseConfig,
Dependency,
ArchitecturalLayer,
IntelligenceData
} from '../types/index.js';
import { analyzerRegistry } from './analyzer-registry.js';
import { isCodeFile, isBinaryFile } from '../utils/language-detection.js';
import { getEmbeddingProvider, DEFAULT_MODEL } from '../embeddings/index.js';
import { getStorageProvider, CodeChunkWithEmbedding } from '../storage/index.js';
import {
LibraryUsageTracker,
PatternDetector,
ImportGraph,
InternalFileGraph,
FileExport
} from '../utils/usage-tracker.js';
import { mergeSmallChunks } from '../utils/chunking.js';
import { getFileCommitDates } from '../utils/git-dates.js';
import {
CODEBASE_CONTEXT_DIRNAME,
INDEX_FORMAT_VERSION,
INDEXING_STATS_FILENAME,
INDEX_META_FILENAME,
INDEX_META_VERSION,
INTELLIGENCE_FILENAME,
KEYWORD_INDEX_FILENAME,
MANIFEST_FILENAME,
RELATIONSHIPS_FILENAME,
VECTOR_DB_DIRNAME
} from '../constants/codebase-context.js';
const STAGING_DIRNAME = '.staging';
const PREVIOUS_DIRNAME = '.previous';
import {
computeFileHashes,
readManifest,
writeManifest,
diffManifest,
type FileManifest,
type ManifestDiff
} from './manifest.js';
// Memoized tool version, resolved lazily from this package's own manifest.
let cachedToolVersion: string | null = null;

/**
 * Resolve the tool's version from its package.json (memoized).
 * Returns 'unknown' when the manifest cannot be read or parsed.
 */
async function getToolVersion(): Promise<string> {
  if (cachedToolVersion !== null) {
    return cachedToolVersion;
  }
  let resolved = 'unknown';
  try {
    const manifestUrl = new URL('../../package.json', import.meta.url);
    const parsed = JSON.parse(await fs.readFile(manifestUrl, 'utf-8')) as { version?: unknown };
    const candidate = parsed.version;
    if (typeof candidate === 'string' && candidate.trim()) {
      resolved = candidate;
    }
  } catch {
    // Best-effort — keep the 'unknown' fallback.
  }
  cachedToolVersion = resolved;
  return resolved;
}
/**
* Perform a Windows-safe atomic swap of staging artifacts into active location.
* Strategy: move current active to .previous, then rename staging to active.
* If staging rename fails, restore from .previous.
*/
async function atomicSwapStagingToActive(
contextDir: string,
stagingDir: string,
buildId: string
): Promise<void> {
const previousDir = path.join(contextDir, PREVIOUS_DIRNAME);
const activeMetaPath = path.join(contextDir, INDEX_META_FILENAME);
const activeIndexPath = path.join(contextDir, KEYWORD_INDEX_FILENAME);
const activeIntelligencePath = path.join(contextDir, INTELLIGENCE_FILENAME);
const activeVectorDir = path.join(contextDir, VECTOR_DB_DIRNAME);
const activeManifestPath = path.join(contextDir, MANIFEST_FILENAME);
const activeStatsPath = path.join(contextDir, INDEXING_STATS_FILENAME);
const activeRelationshipsPath = path.join(contextDir, RELATIONSHIPS_FILENAME);
const stagingMetaPath = path.join(stagingDir, INDEX_META_FILENAME);
const stagingIndexPath = path.join(stagingDir, KEYWORD_INDEX_FILENAME);
const stagingIntelligencePath = path.join(stagingDir, INTELLIGENCE_FILENAME);
const stagingVectorDir = path.join(stagingDir, VECTOR_DB_DIRNAME);
const stagingManifestPath = path.join(stagingDir, MANIFEST_FILENAME);
const stagingStatsPath = path.join(stagingDir, INDEXING_STATS_FILENAME);
const stagingRelationshipsPath = path.join(stagingDir, RELATIONSHIPS_FILENAME);
// Step 1: Create .previous directory and move current active there
await fs.mkdir(previousDir, { recursive: true });
const moveIfExists = async (src: string, dest: string): Promise<void> => {
try {
await fs.rename(src, dest);
} catch (error) {
const code = (error as NodeJS.ErrnoException).code;
if (code !== 'ENOENT') {
// File doesn't exist is OK, other errors are problems
throw error;
}
}
};
const moveDirIfExists = async (src: string, dest: string): Promise<void> => {
try {
const stat = await fs.stat(src);
if (stat.isDirectory()) {
await fs.rename(src, dest);
}
} catch (error) {
const code = (error as NodeJS.ErrnoException).code;
if (code !== 'ENOENT') {
throw error;
}
}
};
// Move active artifacts to .previous
await moveIfExists(activeMetaPath, path.join(previousDir, INDEX_META_FILENAME));
await moveIfExists(activeIndexPath, path.join(previousDir, KEYWORD_INDEX_FILENAME));
await moveIfExists(activeIntelligencePath, path.join(previousDir, INTELLIGENCE_FILENAME));
await moveIfExists(activeManifestPath, path.join(previousDir, MANIFEST_FILENAME));
await moveIfExists(activeStatsPath, path.join(previousDir, INDEXING_STATS_FILENAME));
await moveIfExists(activeRelationshipsPath, path.join(previousDir, RELATIONSHIPS_FILENAME));
await moveDirIfExists(activeVectorDir, path.join(previousDir, VECTOR_DB_DIRNAME));
// Step 2: Move staging artifacts to active location
try {
await moveIfExists(stagingMetaPath, activeMetaPath);
await moveIfExists(stagingIndexPath, activeIndexPath);
await moveIfExists(stagingIntelligencePath, activeIntelligencePath);
await moveIfExists(stagingManifestPath, activeManifestPath);
await moveIfExists(stagingStatsPath, activeStatsPath);
await moveIfExists(stagingRelationshipsPath, activeRelationshipsPath);
await moveDirIfExists(stagingVectorDir, activeVectorDir);
// Step 3: Clean up .previous and staging directories
await cleanupDirectory(previousDir);
await cleanupDirectory(stagingDir);
// Also clean up the parent .staging/ directory if empty
const stagingBase = path.join(contextDir, STAGING_DIRNAME);
try {
const remaining = await fs.readdir(stagingBase);
if (remaining.length === 0) {
await fs.rmdir(stagingBase);
}
} catch {
// Directory doesn't exist or not empty - ignore
}
console.error(`Atomic swap complete: build ${buildId} now active`);
} catch (swapError) {
console.error('Atomic swap failed, attempting rollback:', swapError);
// Attempt rollback: restore from .previous
try {
await moveIfExists(path.join(previousDir, INDEX_META_FILENAME), activeMetaPath);
await moveIfExists(path.join(previousDir, KEYWORD_INDEX_FILENAME), activeIndexPath);
await moveIfExists(path.join(previousDir, INTELLIGENCE_FILENAME), activeIntelligencePath);
await moveIfExists(path.join(previousDir, MANIFEST_FILENAME), activeManifestPath);
await moveIfExists(path.join(previousDir, INDEXING_STATS_FILENAME), activeStatsPath);
await moveIfExists(path.join(previousDir, RELATIONSHIPS_FILENAME), activeRelationshipsPath);
await moveDirIfExists(path.join(previousDir, VECTOR_DB_DIRNAME), activeVectorDir);
console.error('Rollback successful');
} catch (rollbackError) {
console.error('Rollback also failed:', rollbackError);
}
throw swapError;
}
}
/**
 * Best-effort cleanup of a directory and its contents.
 * Removal failures are intentionally swallowed — callers treat cleanup as
 * advisory and must never fail because of it.
 */
async function cleanupDirectory(dirPath: string): Promise<void> {
  await fs.rm(dirPath, { recursive: true, force: true }).catch(() => undefined);
}
/** Options accepted by the CodebaseIndexer constructor. */
export interface IndexerOptions {
  /** Root directory of the codebase to index (resolved to an absolute path). */
  rootPath: string;
  /** Partial configuration; merged over built-in defaults. */
  config?: Partial<CodebaseConfig>;
  /** Optional callback invoked on every progress update. */
  onProgress?: (progress: IndexingProgress) => void;
  /** When true, only files whose content hash changed are re-indexed. */
  incrementalOnly?: boolean;
}
/**
 * Subset of indexing stats persisted to disk so a later no-op incremental
 * run can report accurate counts without re-indexing anything.
 */
interface PersistedIndexingStats {
  /** Number of files successfully analyzed in the last run. */
  indexedFiles: number;
  /** Total chunks produced in the last run. */
  totalChunks: number;
  /** Total files scanned in the last run. */
  totalFiles: number;
  /** ISO timestamp of when these stats were generated. */
  generatedAt: string;
}
/**
 * Orchestrates a codebase indexing run: scans files, delegates analysis to
 * registered analyzers, generates embeddings, and persists index artifacts
 * under the codebase context directory.
 */
export class CodebaseIndexer {
  // Absolute root path of the codebase being indexed.
  private rootPath: string;
  // Effective configuration (defaults merged with user overrides).
  private config: CodebaseConfig;
  // Mutable progress state; the same object is passed to the callback.
  private progress: IndexingProgress;
  // Optional subscriber notified on every progress change.
  private onProgressCallback?: (progress: IndexingProgress) => void;
  // When true, only changed/added files are re-analyzed for embedding.
  private incrementalOnly: boolean;
  constructor(options: IndexerOptions) {
    this.rootPath = path.resolve(options.rootPath);
    this.config = this.mergeConfig(options.config);
    this.onProgressCallback = options.onProgress;
    this.incrementalOnly = options.incrementalOnly ?? false;
    // Fresh progress state; stays 'initializing' until index() begins scanning.
    this.progress = {
      phase: 'initializing',
      percentage: 0,
      filesProcessed: 0,
      totalFiles: 0,
      chunksCreated: 0,
      errors: [],
      startedAt: new Date()
    };
  }
/**
 * Build the effective configuration by layering user overrides over
 * built-in defaults. Each nested section (analyzers, parsing, styleGuides,
 * documentation, embedding, storage) is shallow-merged individually so a
 * partial override does not wipe out sibling defaults.
 */
private mergeConfig(userConfig?: Partial<CodebaseConfig>): CodebaseConfig {
  const defaults: CodebaseConfig = {
    analyzers: {
      angular: { enabled: true, priority: 100 },
      react: { enabled: false, priority: 90 },
      vue: { enabled: false, priority: 90 },
      generic: { enabled: true, priority: 10 }
    },
    include: ['**/*.{ts,tsx,js,jsx,html,css,scss,sass,less}'],
    exclude: ['node_modules/**', 'dist/**', 'build/**', '.git/**', 'coverage/**'],
    respectGitignore: true,
    parsing: {
      maxFileSize: 1048576,
      chunkSize: 50,
      chunkOverlap: 0,
      parseTests: true,
      parseNodeModules: false
    },
    styleGuides: {
      autoDetect: true,
      paths: ['STYLE_GUIDE.md', 'docs/style-guide.md', 'ARCHITECTURE.md'],
      parseMarkdown: true
    },
    documentation: {
      autoDetect: true,
      includeReadmes: true,
      includeChangelogs: false
    },
    embedding: {
      provider: 'transformers',
      model: DEFAULT_MODEL,
      batchSize: 32
    },
    skipEmbedding: false,
    storage: {
      provider: 'lancedb',
      path: './codebase-index'
    }
  };

  const overrides = userConfig ?? {};
  // Top-level spread handles scalar fields; nested sections re-merged below.
  return {
    ...defaults,
    ...overrides,
    analyzers: { ...defaults.analyzers, ...overrides.analyzers },
    parsing: { ...defaults.parsing, ...overrides.parsing },
    styleGuides: { ...defaults.styleGuides, ...overrides.styleGuides },
    documentation: { ...defaults.documentation, ...overrides.documentation },
    embedding: { ...defaults.embedding, ...overrides.embedding },
    storage: { ...defaults.storage, ...overrides.storage }
  };
}
/**
 * Execute a full or incremental indexing pass over the codebase.
 *
 * Phases: scan files → (incremental mode only) manifest diff with a
 * short-circuit when nothing changed → analyze & chunk every file (pattern
 * and import-graph intelligence always runs over ALL files) → embed only
 * changed/added chunks → persist artifacts (vector DB, keyword index,
 * intelligence, relationships, manifest, stats, index meta) → for full
 * rebuilds, atomically swap the staging directory into the active location.
 *
 * @returns Aggregate statistics for this indexing run.
 * @throws Re-throws any fatal error after recording it in stats.errors and
 *         cleaning up the staging directory (best-effort).
 */
async index(): Promise<IndexingStats> {
  const startTime = Date.now();
  // Running counters for this pass; returned to the caller at the end.
  const stats: IndexingStats = {
    totalFiles: 0,
    indexedFiles: 0,
    skippedFiles: 0,
    totalChunks: 0,
    totalLines: 0,
    duration: 0,
    avgChunkSize: 0,
    componentsByType: {},
    componentsByLayer: {
      presentation: 0,
      business: 0,
      data: 0,
      state: 0,
      core: 0,
      shared: 0,
      feature: 0,
      infrastructure: 0,
      unknown: 0
    },
    errors: [],
    startedAt: new Date()
  };
  // Set only for full rebuilds; used for cleanup if the run fails.
  let stagingDir: string | null = null;
  try {
    const buildId = randomUUID();
    const generatedAt = new Date().toISOString();
    const toolVersion = await getToolVersion();
    // Phase 1: Scanning
    this.updateProgress('scanning', 0);
    let files = await this.scanFiles();
    // Memory safety: limit total files to prevent heap exhaustion
    const MAX_FILES = 10000;
    if (files.length > MAX_FILES) {
      console.warn(
        `WARNING: Found ${files.length} files, limiting to ${MAX_FILES} to prevent memory issues.`
      );
      console.warn(
        `Consider using more specific include patterns or excluding large directories.`
      );
      files = files.slice(0, MAX_FILES);
    }
    stats.totalFiles = files.length;
    this.progress.totalFiles = files.length;
    console.error(`Found ${files.length} files to index`);
    // Phase 1b: Incremental diff (if incremental mode)
    const contextDir = path.join(this.rootPath, CODEBASE_CONTEXT_DIRNAME);
    const manifestPath = path.join(contextDir, MANIFEST_FILENAME);
    const indexingStatsPath = path.join(contextDir, INDEXING_STATS_FILENAME);
    let diff: ManifestDiff | null = null;
    let currentHashes: Record<string, string> | null = null;
    let previousManifest: FileManifest | null = null;
    if (this.incrementalOnly) {
      this.updateProgress('scanning', 10);
      console.error('Computing file hashes for incremental diff...');
      currentHashes = await computeFileHashes(files, this.rootPath);
      previousManifest = await readManifest(manifestPath);
      diff = diffManifest(previousManifest, currentHashes);
      console.error(
        `Incremental diff: ${diff.added.length} added, ${diff.changed.length} changed, ` +
          `${diff.deleted.length} deleted, ${diff.unchanged.length} unchanged`
      );
      stats.incremental = {
        added: diff.added.length,
        changed: diff.changed.length,
        deleted: diff.deleted.length,
        unchanged: diff.unchanged.length
      };
      // Short-circuit: nothing changed
      if (diff.added.length === 0 && diff.changed.length === 0 && diff.deleted.length === 0) {
        console.error('No files changed - skipping re-index.');
        this.updateProgress('complete', 100);
        stats.duration = Date.now() - startTime;
        stats.completedAt = new Date();
        // Prefer exact counts persisted by the previous run; fall back to
        // deriving them from the manifest / keyword index below.
        let restoredFromPersistedStats = false;
        try {
          const persisted = JSON.parse(
            await fs.readFile(indexingStatsPath, 'utf-8')
          ) as Partial<PersistedIndexingStats>;
          if (
            typeof persisted.indexedFiles === 'number' &&
            typeof persisted.totalChunks === 'number' &&
            typeof persisted.totalFiles === 'number'
          ) {
            stats.indexedFiles = persisted.indexedFiles;
            stats.totalChunks = persisted.totalChunks;
            stats.totalFiles = persisted.totalFiles;
            restoredFromPersistedStats = true;
          }
        } catch {
          // No persisted stats yet — fall back below
        }
        if (!restoredFromPersistedStats) {
          if (previousManifest) {
            stats.indexedFiles = Object.keys(previousManifest.files).length;
          }
          try {
            const existingIndexPath = path.join(contextDir, KEYWORD_INDEX_FILENAME);
            const existing = JSON.parse(await fs.readFile(existingIndexPath, 'utf-8')) as unknown;
            const existingObj = existing as { chunks?: unknown };
            // Accept both the legacy bare-array format and the newer
            // { header, chunks } envelope.
            const existingChunks: unknown = Array.isArray(existing)
              ? existing
              : existingObj && Array.isArray(existingObj.chunks)
                ? existingObj.chunks
                : null;
            if (Array.isArray(existingChunks)) {
              stats.totalChunks = existingChunks.length;
              if (stats.indexedFiles === 0) {
                const uniqueFiles = new Set(
                  existingChunks.map((c: { filePath?: string }) => c.filePath)
                );
                stats.indexedFiles = uniqueFiles.size;
              }
            }
          } catch {
            // Keyword index doesn't exist yet — keep best-known counts
          }
        }
        stats.totalFiles = files.length;
        return stats;
      }
    }
    // Build the set of files that need analysis + embedding (incremental: only added/changed)
    // diff! is safe here: the filter callback only runs when diff is non-null.
    const filesToProcess = diff
      ? files.filter((f) => {
          const rel = path.relative(this.rootPath, f).replace(/\\/g, '/');
          return diff!.added.includes(rel) || diff!.changed.includes(rel);
        })
      : files;
    // Phase 2: Analyzing & Parsing
    // Intelligence tracking (patterns, libraries, import graph) runs on ALL files
    // but embedding only runs on filesToProcess
    this.updateProgress('analyzing', 0);
    const allChunks: CodeChunk[] = [];
    const changedChunks: CodeChunk[] = []; // Only chunks from added/changed files
    const libraryTracker = new LibraryUsageTracker();
    const patternDetector = new PatternDetector();
    const importGraph = new ImportGraph();
    const internalFileGraph = new InternalFileGraph(this.rootPath);
    // Fetch git commit dates for pattern momentum analysis
    const fileDates = await getFileCommitDates(this.rootPath);
    // When incremental, track which files need embedding
    const filesToProcessSet = diff ? new Set(filesToProcess.map((f) => f)) : null;
    for (let i = 0; i < files.length; i++) {
      const file = files[i];
      this.progress.currentFile = file;
      this.progress.filesProcessed = i + 1;
      this.progress.percentage = Math.round(((i + 1) / files.length) * 100);
      try {
        // Normalize line endings to \n for consistent cross-platform output
        const rawContent = await fs.readFile(file, 'utf-8');
        const content = rawContent.replace(/\r\n/g, '\n');
        const result = await analyzerRegistry.analyzeFile(file, content);
        if (result) {
          const isFileChanged = !filesToProcessSet || filesToProcessSet.has(file);
          // Merge tiny adjacent chunks (< 15 lines) into their neighbors.
          const mergedChunks = mergeSmallChunks(result.chunks, 15);
          allChunks.push(...mergedChunks);
          if (isFileChanged) {
            changedChunks.push(...mergedChunks);
          }
          stats.indexedFiles++;
          stats.totalLines += content.split('\n').length;
          // Track library usage AND import graph from imports
          for (const imp of result.imports) {
            libraryTracker.track(imp.source, file);
            importGraph.trackImport(imp.source, file, imp.line || 1);
            // Track internal file-to-file imports (relative paths)
            if (imp.source.startsWith('.')) {
              // Resolve the relative import to an absolute path
              const fileDir = path.dirname(file);
              let resolvedPath = path.resolve(fileDir, imp.source);
              // Try common extensions if not already specified
              const ext = path.extname(resolvedPath);
              if (!ext) {
                // NOTE(review): this loop always appends '.ts' and breaks on
                // the first iteration — extensionless imports of .tsx/.js/.jsx
                // files are recorded with a '.ts' suffix. Confirm whether
                // downstream graph consumers tolerate this.
                for (const tryExt of ['.ts', '.tsx', '.js', '.jsx']) {
                  const withExt = resolvedPath + tryExt;
                  // We don't check if file exists for performance - just track what's referenced
                  resolvedPath = withExt;
                  break;
                }
              }
              internalFileGraph.trackImport(file, resolvedPath, imp.imports);
            }
          }
          // Track exports for unused export detection
          if (result.exports && result.exports.length > 0) {
            const fileExports: FileExport[] = result.exports.map((exp) => ({
              name: exp.name,
              type: exp.isDefault ? 'default' : (exp.type as FileExport['type']) || 'other'
            }));
            internalFileGraph.trackExports(file, fileExports);
          }
          // Detect generic patterns from code
          patternDetector.detectFromCode(content, file);
          // Helper to extract code snippet around a pattern
          const extractSnippet = (
            pattern: RegExp,
            linesBefore = 1,
            linesAfter = 3
          ): string | undefined => {
            const match = content.match(pattern);
            if (!match) return undefined;
            const lines = content.split('\n');
            // Convert the match's character offset to a 0-based line index.
            const matchIndex = content.substring(0, match.index).split('\n').length - 1;
            const start = Math.max(0, matchIndex - linesBefore);
            const end = Math.min(lines.length, matchIndex + linesAfter + 1);
            return lines.slice(start, end).join('\n').trim();
          };
          // Short display path: the last 3 path segments.
          const relPath = file.split(/[\\/]/).slice(-3).join('/');
          // Get file date for pattern momentum tracking
          // Try multiple path formats since git uses forward slashes
          const normalizedRelPath = path.relative(this.rootPath, file).replace(/\\/g, '/');
          const fileDate = fileDates.get(normalizedRelPath);
          // GENERIC PATTERN FORWARDING
          // Framework analyzers return detectedPatterns in metadata - we just forward them
          // This keeps the indexer framework-agnostic
          if (
            result.metadata?.detectedPatterns &&
            Array.isArray(result.metadata.detectedPatterns)
          ) {
            for (const pattern of result.metadata.detectedPatterns as Array<{
              category: string;
              name: string;
            }>) {
              // Try to extract a relevant snippet for the pattern
              // Ask analyzer registry for snippet pattern (framework-agnostic delegation)
              const analyzer = analyzerRegistry.findAnalyzer(file);
              const snippetPattern =
                analyzer?.getSnippetPattern?.(pattern.category, pattern.name) ?? null;
              const snippet = snippetPattern ? extractSnippet(snippetPattern) : undefined;
              patternDetector.track(
                pattern.category,
                pattern.name,
                snippet ? { file: relPath, snippet } : undefined,
                fileDate
              );
            }
          }
          // Track file for Golden File scoring (framework-agnostic)
          // A golden file = file with patterns in ≥3 distinct categories
          const rawPatterns = result.metadata?.detectedPatterns;
          const detectedPatterns: Array<{ category: string; name: string }> = Array.isArray(
            rawPatterns
          )
            ? (rawPatterns as Array<{ category: string; name: string }>)
            : [];
          const uniqueCategories = new Set(detectedPatterns.map((p) => p.category));
          const patternScore = uniqueCategories.size;
          if (patternScore >= 3) {
            const patternFlags: Record<string, boolean> = {};
            for (const p of detectedPatterns) {
              patternFlags[`${p.category}:${p.name}`] = true;
            }
            patternDetector.trackGoldenFile(relPath, patternScore, patternFlags);
          }
          // Update component statistics
          for (const component of result.components) {
            if (component.componentType) {
              stats.componentsByType[component.componentType] =
                (stats.componentsByType[component.componentType] || 0) + 1;
            }
            if (component.layer) {
              stats.componentsByLayer[component.layer]++;
            }
          }
        } else {
          stats.skippedFiles++;
        }
      } catch (error) {
        // Per-file failures are recorded but never abort the whole run.
        stats.skippedFiles++;
        stats.errors.push({
          filePath: file,
          error: error instanceof Error ? error.message : String(error),
          phase: 'analyzing',
          timestamp: new Date()
        });
      }
      if (this.onProgressCallback) {
        this.onProgressCallback(this.progress);
      }
    }
    stats.totalChunks = allChunks.length;
    stats.avgChunkSize =
      allChunks.length > 0
        ? Math.round(allChunks.reduce((sum, c) => sum + c.content.length, 0) / allChunks.length)
        : 0;
    // Determine which chunks to embed: in incremental mode, only changed/added file chunks
    const chunksForEmbedding = diff ? changedChunks : allChunks;
    // Memory safety: limit chunks to prevent embedding memory issues
    const MAX_CHUNKS = 5000;
    let chunksToEmbed = chunksForEmbedding;
    if (chunksForEmbedding.length > MAX_CHUNKS) {
      console.warn(
        `WARNING: ${chunksForEmbedding.length} chunks exceed limit. Indexing first ${MAX_CHUNKS} chunks.`
      );
      chunksToEmbed = chunksForEmbedding.slice(0, MAX_CHUNKS);
    }
    // Phase 3: Embedding (only changed/added chunks in incremental mode)
    const chunksWithEmbeddings: CodeChunkWithEmbedding[] = [];
    if (!this.config.skipEmbedding && chunksToEmbed.length > 0) {
      this.updateProgress('embedding', 50);
      console.error(
        `Creating embeddings for ${chunksToEmbed.length} chunks` +
          (diff ? ` (${allChunks.length} total, ${chunksToEmbed.length} changed)` : '') +
          '...'
      );
      // Initialize embedding provider
      const embeddingProvider = await getEmbeddingProvider(this.config.embedding);
      // Generate embeddings for all chunks
      // Outer batch size controls how many chunks we collect before calling embedBatch.
      // embedBatch internally sub-batches further based on model context size.
      const batchSize = Math.min(this.config.embedding?.batchSize || 32, 32);
      for (let i = 0; i < chunksToEmbed.length; i += batchSize) {
        const batch = chunksToEmbed.slice(i, i + batchSize);
        // Prefix each chunk with searchable metadata (path/type/component/layer)
        // so the embedding captures structural context, not just raw code.
        const texts = batch.map((chunk) => {
          const meta: string[] = [];
          if (chunk.relativePath) {
            meta.push(`path:${chunk.relativePath}`);
          }
          if (chunk.componentType && chunk.componentType !== 'unknown') {
            meta.push(`type:${chunk.componentType}`);
          }
          if (chunk.metadata?.componentName) {
            meta.push(`component:${chunk.metadata.componentName}`);
          }
          if (chunk.layer && chunk.layer !== 'unknown') {
            meta.push(`layer:${chunk.layer}`);
          }
          const prefix = meta.length > 0 ? meta.join(' ') + '\n' : '';
          return prefix + chunk.content;
        });
        const embeddings = await embeddingProvider.embedBatch(texts);
        for (let j = 0; j < batch.length; j++) {
          chunksWithEmbeddings.push({
            ...batch[j],
            embedding: embeddings[j]
          });
        }
        // Update progress
        const embeddingProgress = 50 + Math.round((i / chunksToEmbed.length) * 25);
        this.updateProgress('embedding', embeddingProgress);
        if ((i + batchSize) % 100 === 0 || i + batchSize >= chunksToEmbed.length) {
          console.error(
            `Embedded ${Math.min(i + batchSize, chunksToEmbed.length)}/${
              chunksToEmbed.length
            } chunks`
          );
        }
      }
    } else if (this.config.skipEmbedding) {
      console.error('Skipping embedding generation (skipEmbedding=true)');
    } else if (chunksToEmbed.length === 0 && diff) {
      console.error('No chunks to embed (all unchanged)');
    }
    // Phase 4: Storing
    this.updateProgress('storing', 75);
    // For full rebuilds, use staging directory for atomic swap
    // For incremental, write directly to active location
    const isFullRebuild = !diff;
    let activeContextDir = contextDir;
    if (isFullRebuild) {
      // Create staging directory for atomic swap
      const stagingBase = path.join(contextDir, STAGING_DIRNAME);
      stagingDir = path.join(stagingBase, buildId);
      await fs.mkdir(stagingDir, { recursive: true });
      activeContextDir = stagingDir;
      console.error(`Full rebuild: writing to staging ${stagingDir}`);
    }
    await fs.mkdir(activeContextDir, { recursive: true });
    if (!this.config.skipEmbedding) {
      const storagePath = path.join(activeContextDir, VECTOR_DB_DIRNAME);
      const storageProvider = await getStorageProvider({ path: storagePath });
      if (diff) {
        // Incremental: delete old chunks for changed + deleted files, then add new
        const filesToDelete = [...diff.changed, ...diff.deleted].map((rel) =>
          path.join(this.rootPath, rel).replace(/\\/g, '/')
        );
        // Also try with OS-native separators for matching
        const filePathsForDelete = [...diff.changed, ...diff.deleted].map((rel) =>
          path.resolve(this.rootPath, rel)
        );
        const allDeletePaths = [...new Set([...filesToDelete, ...filePathsForDelete])];
        if (allDeletePaths.length > 0) {
          await storageProvider.deleteByFilePaths(allDeletePaths);
        }
        if (chunksWithEmbeddings.length > 0) {
          await storageProvider.store(chunksWithEmbeddings);
        }
        console.error(
          `Incremental store: deleted chunks for ${diff.changed.length + diff.deleted.length} files, ` +
            `added ${chunksWithEmbeddings.length} new chunks`
        );
      } else {
        // Full rebuild: store to staging (no clear - fresh directory)
        console.error(`Storing ${chunksToEmbed.length} chunks to staging...`);
        await storageProvider.store(chunksWithEmbeddings);
      }
    }
    // Vector DB build marker (required for version gating)
    // Write after semantic store step so marker reflects the latest DB state.
    const vectorDir = path.join(activeContextDir, VECTOR_DB_DIRNAME);
    await fs.mkdir(vectorDir, { recursive: true });
    await fs.writeFile(
      path.join(vectorDir, 'index-build.json'),
      JSON.stringify({ buildId, formatVersion: INDEX_FORMAT_VERSION })
    );
    // Keyword index always uses ALL chunks (full regen)
    const indexPath = path.join(activeContextDir, KEYWORD_INDEX_FILENAME);
    // Memory safety: cap keyword index too
    const keywordChunks =
      allChunks.length > MAX_CHUNKS ? allChunks.slice(0, MAX_CHUNKS) : allChunks;
    await fs.writeFile(
      indexPath,
      JSON.stringify({
        header: { buildId, formatVersion: INDEX_FORMAT_VERSION },
        chunks: keywordChunks
      })
    );
    // Save library usage and pattern stats (always full regen)
    const intelligencePath = path.join(activeContextDir, INTELLIGENCE_FILENAME);
    const libraryStats = libraryTracker.getStats();
    // Extract tsconfig paths for AI to understand import aliases
    let tsconfigPaths: Record<string, string[]> | undefined;
    try {
      const tsconfigPath = path.join(this.rootPath, 'tsconfig.json');
      const tsconfigContent = await fs.readFile(tsconfigPath, 'utf-8');
      const tsconfig = JSON.parse(tsconfigContent);
      if (tsconfig.compilerOptions?.paths) {
        tsconfigPaths = tsconfig.compilerOptions.paths;
        console.error(
          `Found ${Object.keys(tsconfigPaths!).length} path aliases in tsconfig.json`
        );
      }
    } catch (_error) {
      // No tsconfig.json or no paths defined
    }
    const intelligence = {
      header: { buildId, formatVersion: INDEX_FORMAT_VERSION },
      libraryUsage: libraryStats,
      patterns: patternDetector.getAllPatterns(),
      goldenFiles: patternDetector.getGoldenFiles(5),
      // tsconfig paths help AI understand import aliases (e.g., @mycompany/* -> libs/*)
      // This reveals which @scoped packages are internal vs external
      tsconfigPaths,
      importGraph: {
        usages: importGraph.getAllUsages(),
        topUsed: importGraph.getTopUsed(30)
      },
      // Internal file graph for circular dependency and unused export detection
      internalFileGraph: internalFileGraph.toJSON(),
      generatedAt
    };
    await fs.writeFile(intelligencePath, JSON.stringify(intelligence, null, 2));
    // Write relationships sidecar (versioned, for fast lookup)
    const relationshipsPath = path.join(activeContextDir, RELATIONSHIPS_FILENAME);
    const graphData = internalFileGraph.toJSON();
    // Build reverse import map (importedBy)
    const importedBy: Record<string, string[]> = {};
    if (graphData.imports) {
      for (const [file, deps] of Object.entries(graphData.imports)) {
        for (const dep of deps as string[]) {
          if (!importedBy[dep]) importedBy[dep] = [];
          importedBy[dep].push(file);
        }
      }
    }
    // Build symbol export map (exportedBy)
    const exportedBy: Record<string, string[]> = {};
    if (graphData.exports) {
      for (const [file, exps] of Object.entries(graphData.exports)) {
        for (const exp of exps as Array<{ name: string; type: string }>) {
          if (exp.name && exp.name !== 'default') {
            if (!exportedBy[exp.name]) exportedBy[exp.name] = [];
            if (!exportedBy[exp.name].includes(file)) {
              exportedBy[exp.name].push(file);
            }
          }
        }
      }
    }
    const relationships = {
      header: { buildId, formatVersion: INDEX_FORMAT_VERSION },
      generatedAt,
      graph: {
        imports: graphData.imports || {},
        importedBy,
        exports: graphData.exports || {}
      },
      symbols: {
        exportedBy
      },
      stats: graphData.stats || internalFileGraph.getStats()
    };
    await fs.writeFile(relationshipsPath, JSON.stringify(relationships, null, 2));
    // Write manifest (both full and incremental)
    // For full rebuild, write to staging; for incremental, write to active
    const activeManifestPath = path.join(activeContextDir, MANIFEST_FILENAME);
    const manifest: FileManifest = {
      version: 1,
      generatedAt: new Date().toISOString(),
      // Reuse hashes computed during the diff; full rebuilds hash here.
      files: currentHashes ?? (await computeFileHashes(files, this.rootPath))
    };
    await writeManifest(activeManifestPath, manifest);
    // Persist headline counts so a future no-change incremental run can
    // report them without re-indexing (see the short-circuit above).
    const persistedStats: PersistedIndexingStats = {
      indexedFiles: stats.indexedFiles,
      totalChunks: stats.totalChunks,
      totalFiles: stats.totalFiles,
      generatedAt
    };
    const activeIndexingStatsPath = path.join(activeContextDir, INDEXING_STATS_FILENAME);
    await fs.writeFile(activeIndexingStatsPath, JSON.stringify(persistedStats, null, 2));
    // Index meta (authoritative) — write last so readers never observe meta pointing to missing artifacts.
    const metaPath = path.join(activeContextDir, INDEX_META_FILENAME);
    await fs.writeFile(
      metaPath,
      JSON.stringify(
        {
          metaVersion: INDEX_META_VERSION,
          formatVersion: INDEX_FORMAT_VERSION,
          buildId,
          generatedAt,
          toolVersion,
          artifacts: {
            keywordIndex: { path: KEYWORD_INDEX_FILENAME },
            vectorDb: { path: VECTOR_DB_DIRNAME, provider: 'lancedb' },
            intelligence: { path: INTELLIGENCE_FILENAME },
            manifest: { path: MANIFEST_FILENAME },
            indexingStats: { path: INDEXING_STATS_FILENAME },
            relationships: { path: RELATIONSHIPS_FILENAME }
          }
        },
        null,
        2
      )
    );
    // Atomic swap for full rebuilds: move staging into active location
    if (isFullRebuild && stagingDir) {
      console.error('Performing atomic swap of staging to active...');
      await atomicSwapStagingToActive(contextDir, stagingDir, buildId);
    }
    // Phase 5: Complete
    this.updateProgress('complete', 100);
    stats.duration = Date.now() - startTime;
    stats.completedAt = new Date();
    if (diff) {
      console.error(
        `Incremental indexing complete in ${stats.duration}ms ` +
          `(${diff.added.length} added, ${diff.changed.length} changed, ` +
          `${diff.deleted.length} deleted, ${diff.unchanged.length} unchanged)`
      );
    } else {
      console.error(`Indexing complete in ${stats.duration}ms`);
      console.error(`Indexed ${stats.indexedFiles} files, ${stats.totalChunks} chunks`);
    }
    return stats;
  } catch (error) {
    this.progress.phase = 'error';
    stats.errors.push({
      filePath: this.rootPath,
      error: error instanceof Error ? error.message : String(error),
      // phase was just set to 'error' above, so that is what gets recorded.
      phase: this.progress.phase,
      timestamp: new Date()
    });
    // Clean up staging directory on failure (best-effort)
    if (stagingDir) {
      console.error('Cleaning up staging directory after failure...');
      await cleanupDirectory(stagingDir);
    }
    throw error;
  }
}
/**
 * Scan the codebase for candidate files using include/exclude glob
 * patterns, optional .gitignore rules, file-type checks, and a
 * maximum-file-size cap.
 *
 * @returns Absolute paths of files eligible for analysis (deduplicated).
 */
private async scanFiles(): Promise<string[]> {
  const files: string[] = [];
  const seen = new Set<string>();
  // Read .gitignore if respecting it
  let ig: ReturnType<typeof ignore.default> | null = null;
  if (this.config.respectGitignore) {
    try {
      const gitignorePath = path.join(this.rootPath, '.gitignore');
      const gitignoreContent = await fs.readFile(gitignorePath, 'utf-8');
      ig = ignore.default().add(gitignoreContent);
    } catch (_error) {
      // No .gitignore or couldn't read it
    }
  }
  // Scan with glob
  const includePatterns = this.config.include || ['**/*'];
  const excludePatterns = this.config.exclude || [];
  for (const pattern of includePatterns) {
    const matches = await glob(pattern, {
      cwd: this.rootPath,
      absolute: true,
      ignore: excludePatterns,
      nodir: true
    });
    for (const file of matches) {
      // Dedupe across include patterns regardless of path separator style.
      const normalizedFile = file.replace(/\\/g, '/');
      if (seen.has(normalizedFile)) {
        continue;
      }
      seen.add(normalizedFile);
      // BUG FIX: the `ignore` package requires POSIX-style relative paths.
      // On Windows, path.relative() yields backslash-separated paths, which
      // made gitignore rules silently fail to match. Normalize separators
      // first (consistent with the rest of this file).
      const relativePath = path.relative(this.rootPath, file).replace(/\\/g, '/');
      // Check gitignore (guard against an empty relative path, which
      // the `ignore` package rejects)
      if (ig && relativePath && ig.ignores(relativePath)) {
        continue;
      }
      // Check if it's a code file
      if (!isCodeFile(file) || isBinaryFile(file)) {
        continue;
      }
      // Check file size
      try {
        const stats = await fs.stat(file);
        if (stats.size > (this.config.parsing?.maxFileSize || 1048576)) {
          console.warn(`Skipping large file: ${file} (${stats.size} bytes)`);
          continue;
        }
      } catch (_error) {
        // File vanished or is unreadable — skip it.
        continue;
      }
      files.push(file);
    }
  }
  return files;
}
/**
 * Record a new phase/percentage on the shared progress object (mutated in
 * place so callback holders observe the same instance) and notify the
 * subscriber, if one was registered.
 */
private updateProgress(phase: IndexingPhase, percentage: number): void {
  Object.assign(this.progress, { phase, percentage });
  this.onProgressCallback?.(this.progress);
}
/**
 * Detect codebase-level metadata by polling every registered analyzer
 * (highest priority first) and merging their results over a base template,
 * then enriching with previously generated intelligence data if available.
 * Higher-priority analyzers' values win on conflicts.
 */
async detectMetadata(): Promise<CodebaseMetadata> {
  // Get all registered analyzers (sorted by priority, highest first)
  const registeredAnalyzers = analyzerRegistry.getAll();

  // Zeroed per-layer counters, shared by architecture and statistics.
  const emptyLayerCounts = () => ({
    presentation: 0,
    business: 0,
    data: 0,
    state: 0,
    core: 0,
    shared: 0,
    feature: 0,
    infrastructure: 0,
    unknown: 0
  });

  // Base metadata template; analyzers layer their findings on top.
  let merged: CodebaseMetadata = {
    name: path.basename(this.rootPath),
    rootPath: this.rootPath,
    languages: [],
    dependencies: [],
    architecture: {
      type: 'mixed',
      layers: emptyLayerCounts(),
      patterns: []
    },
    styleGuides: [],
    documentation: [],
    projectStructure: {
      type: 'single-app'
    },
    statistics: {
      totalFiles: 0,
      totalLines: 0,
      totalComponents: 0,
      componentsByType: {},
      componentsByLayer: emptyLayerCounts()
    },
    customMetadata: {}
  };

  // Fold in each analyzer's metadata; failures skip to the next analyzer.
  for (const analyzer of registeredAnalyzers) {
    try {
      merged = this.mergeMetadata(merged, await analyzer.detectCodebaseMetadata(this.rootPath));
    } catch (error) {
      console.warn(`Analyzer ${analyzer.name} failed to detect metadata:`, error);
    }
  }

  // Load intelligence data if available
  try {
    const intelligencePath = path.join(
      this.rootPath,
      CODEBASE_CONTEXT_DIRNAME,
      INTELLIGENCE_FILENAME
    );
    const raw = await fs.readFile(intelligencePath, 'utf-8');
    const intelligence = JSON.parse(raw) as IntelligenceData;
    // Phase 06: ignore legacy intelligence files that lack a versioned header.
    if (!intelligence || typeof intelligence !== 'object' || !intelligence.header) {
      return merged;
    }
    merged.customMetadata = {
      ...merged.customMetadata,
      libraryUsage: intelligence.libraryUsage,
      patterns: intelligence.patterns,
      intelligenceGeneratedAt: intelligence.generatedAt
    };
  } catch (_error) {
    // Intelligence file doesn't exist yet (indexing not run)
  }
  return merged;
}
/**
 * Merge two CodebaseMetadata objects.
 * The 'incoming' metadata takes precedence for non-empty values; list-like
 * fields (languages, patterns, styleGuides, documentation) are unioned.
 */
private mergeMetadata(base: CodebaseMetadata, incoming: CodebaseMetadata): CodebaseMetadata {
  return {
    name: incoming.name || base.name,
    rootPath: incoming.rootPath || base.rootPath,
    languages: [...new Set([...base.languages, ...incoming.languages])], // Merge and deduplicate
    dependencies: this.mergeDependencies(base.dependencies, incoming.dependencies),
    framework: incoming.framework || base.framework, // Framework from higher priority analyzer wins
    architecture: {
      type: incoming.architecture?.type || base.architecture.type,
      layers: this.mergeLayers(base.architecture.layers, incoming.architecture?.layers),
      patterns: [
        ...new Set([
          ...(base.architecture.patterns || []),
          ...(incoming.architecture?.patterns || [])
        ])
      ] // Merge and deduplicate
    },
    styleGuides: [...new Set([...base.styleGuides, ...incoming.styleGuides])], // Merge and deduplicate
    documentation: [...new Set([...base.documentation, ...incoming.documentation])], // Merge and deduplicate
    // Prefer the incoming structure only when it is PRESENT and more
    // specific than the 'single-app' default. (Previously a missing
    // incoming.projectStructure passed the `?. !== 'single-app'` check
    // and clobbered base.projectStructure with undefined.)
    projectStructure:
      incoming.projectStructure && incoming.projectStructure.type !== 'single-app'
        ? incoming.projectStructure
        : base.projectStructure,
    statistics: this.mergeStatistics(base.statistics, incoming.statistics),
    customMetadata: { ...base.customMetadata, ...incoming.customMetadata }
  };
}
/**
 * Union of two dependency lists, keyed by package name.
 * Base entries win on name collisions; order is base first, then any
 * incoming dependencies whose names were not already present.
 */
private mergeDependencies(base: Dependency[], incoming: Dependency[]): Dependency[] {
  const byName = new Map<string, Dependency>();
  // Map.set keeps the FIRST entry per name thanks to the has() guard,
  // and Map iteration preserves insertion order.
  for (const dep of [...base, ...incoming]) {
    if (!byName.has(dep.name)) {
      byName.set(dep.name, dep);
    }
  }
  return [...byName.values()];
}
private mergeLayers(
base: Record<ArchitecturalLayer, number>,
incoming?: Partial<Record<ArchitecturalLayer, number>>
): Record<ArchitecturalLayer, number> {
if (!incoming) return base;
return {
presentation: Math.max(base.presentation || 0, incoming.presentation || 0),
business: Math.max(base.business || 0, incoming.business || 0),
data: Math.max(base.data || 0, incoming.data || 0),
state: Math.max(base.state || 0, incoming.state || 0),
core: Math.max(base.core || 0, incoming.core || 0),
shared: Math.max(base.shared || 0, incoming.shared || 0),
feature: Math.max(base.feature || 0, incoming.feature || 0),
infrastructure: Math.max(base.infrastructure || 0, incoming.infrastructure || 0),
unknown: Math.max(base.unknown || 0, incoming.unknown || 0)
};
}
/**
 * Combine two statistics blocks: scalar totals take the larger value,
 * per-type counts are shallow-merged (incoming wins per key), and
 * per-layer counts take the per-layer maximum via mergeLayers.
 */
private mergeStatistics(
  base: CodebaseMetadata['statistics'],
  incoming: CodebaseMetadata['statistics']
): CodebaseMetadata['statistics'] {
  // Max of two possibly-missing counts, treating undefined/0 as 0.
  const larger = (a?: number, b?: number): number => Math.max(a || 0, b || 0);
  return {
    totalFiles: larger(base.totalFiles, incoming.totalFiles),
    totalLines: larger(base.totalLines, incoming.totalLines),
    totalComponents: larger(base.totalComponents, incoming.totalComponents),
    componentsByType: { ...base.componentsByType, ...incoming.componentsByType },
    componentsByLayer: this.mergeLayers(base.componentsByLayer, incoming.componentsByLayer)
  };
}
/**
 * Snapshot of the current indexing progress. Returns a shallow copy so
 * callers cannot mutate the indexer's internal state.
 */
getProgress(): IndexingProgress {
  const snapshot: IndexingProgress = Object.assign({}, this.progress);
  return snapshot;
}
}