/**
* Codebase indexer service
* Uses chunk-level indexing for better search granularity
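*
* @example
* // Minimal usage sketch (paths and query are illustrative):
* const indexer = new CodebaseIndexer({ codebaseRoot: process.cwd() })
* await indexer.index({ watch: false })
* const results = await indexer.search('parse config', { limit: 5 })
* await indexer.close()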
*/
import fs from 'node:fs/promises'
import path from 'node:path'
import { chunkCodeByAST } from './ast-chunking.js'
import { getCoderagDataDir } from './db/client.js'
import type { EmbeddingProvider } from './embeddings.js'
import { IncrementalTFIDF, type IncrementalUpdate } from './incremental-tfidf.js'
import { createCacheKey, LRUCache } from './search-cache.js'
import { type CodebaseFile, MemoryStorage, type Storage } from './storage.js'
import { type ChunkData, PersistentStorage } from './storage-persistent.js'
import { buildSearchIndex, getQueryTokens, type SearchIndex, searchDocuments, tokenize } from './tfidf.js'
import {
detectLanguage,
type FileMetadata,
type Ignore,
isTextFile,
loadGitignore,
readFileContent,
scanFileMetadata,
simpleHash,
} from './utils.js'
import { type VectorDocument, VectorStorage } from './vector-storage.js'
export interface IndexerOptions {
codebaseRoot?: string
maxFileSize?: number
storage?: Storage
onProgress?: (current: number, total: number, file: string) => void
watch?: boolean
onFileChange?: (event: FileChangeEvent) => void
embeddingProvider?: EmbeddingProvider
vectorBatchSize?: number // Default: 10
indexingBatchSize?: number // Default: 50 - files processed per batch (Memory optimization)
lowMemoryMode?: boolean // Defaults to true with PersistentStorage (SQL-based search), false otherwise (in-memory index)
}
export interface FileChangeEvent {
type: 'add' | 'change' | 'unlink'
path: string
timestamp: number
}
export interface IndexingStatus {
isIndexing: boolean
progress: number
totalFiles: number
processedFiles: number
totalChunks: number
indexedChunks: number
currentFile?: string
}
/**
* Result of comparing filesystem with database
*/
export interface FileDiff {
added: FileMetadata[]
changed: FileMetadata[]
deleted: string[] // paths only
unchanged: number
}
export class CodebaseIndexer {
private codebaseRoot: string
private maxFileSize: number
private storage: Storage
private searchIndex: SearchIndex | null = null
private incrementalEngine: IncrementalTFIDF | null = null
private pendingFileChanges: IncrementalUpdate[] = []
private searchCache: LRUCache<SearchResult[]>
private watcher: import('@parcel/watcher').AsyncSubscription | null = null
private isWatching = false
private onFileChangeCallback?: (event: FileChangeEvent) => void
private pendingUpdates = new Map<string, NodeJS.Timeout>()
private ignoreFilter: Ignore | null = null
private status: IndexingStatus = {
isIndexing: false,
progress: 0,
totalFiles: 0,
processedFiles: 0,
totalChunks: 0,
indexedChunks: 0,
}
private vectorStorage?: VectorStorage
private embeddingProvider?: EmbeddingProvider
private vectorBatchSize: number
private indexingBatchSize: number
private lowMemoryMode: boolean
constructor(options: IndexerOptions = {}) {
this.codebaseRoot = options.codebaseRoot || process.cwd()
this.maxFileSize = options.maxFileSize || 1048576 // 1MB
this.storage = options.storage || new MemoryStorage()
this.onFileChangeCallback = options.onFileChange
this.searchCache = new LRUCache<SearchResult[]>(100, 5) // 100 entries, 5 min TTL
this.embeddingProvider = options.embeddingProvider
this.vectorBatchSize = options.vectorBatchSize || 10
this.indexingBatchSize = options.indexingBatchSize || 50 // Memory optimization
// Default to low memory mode when using persistent storage
this.lowMemoryMode = options.lowMemoryMode ?? options.storage instanceof PersistentStorage
// Initialize vector storage if embedding provider is available
if (this.embeddingProvider) {
// Use global ~/.coderag/projects/<hash>/ directory for vector storage
const dataDir = getCoderagDataDir(this.codebaseRoot)
const vectorDbPath = path.join(dataDir, 'vectors.lance')
this.vectorStorage = new VectorStorage({
dimensions: this.embeddingProvider.dimensions,
dbPath: vectorDbPath,
})
console.error(
`[INFO] Vector storage initialized: ${this.embeddingProvider.dimensions} dimensions`
)
}
}
/**
* Get current indexing status
*/
getStatus(): IndexingStatus {
return { ...this.status }
}
/**
* Compare filesystem with database to find changes
* Used for incremental updates after long periods of inactivity
*/
private async diffFilesystem(
dbMetadata: Map<string, { mtime: number; hash: string }>
): Promise<FileDiff> {
if (!this.ignoreFilter) {
this.ignoreFilter = loadGitignore(this.codebaseRoot)
}
const added: FileMetadata[] = []
const changed: FileMetadata[] = []
const deleted: string[] = []
let unchanged = 0
// Track which db files we've seen in filesystem
const seenPaths = new Set<string>()
// Scan filesystem
for (const metadata of scanFileMetadata(this.codebaseRoot, {
ignoreFilter: this.ignoreFilter,
codebaseRoot: this.codebaseRoot,
maxFileSize: this.maxFileSize,
})) {
seenPaths.add(metadata.path)
const dbEntry = dbMetadata.get(metadata.path)
if (!dbEntry) {
// New file
added.push(metadata)
} else if (Math.abs(metadata.mtime - dbEntry.mtime) > 1000) {
// mtime changed (1 second tolerance for filesystem precision)
// File might have changed, need to verify with hash
changed.push(metadata)
} else {
unchanged++
}
}
// Find deleted files (in db but not in filesystem)
for (const dbPath of dbMetadata.keys()) {
if (!seenPaths.has(dbPath)) {
deleted.push(dbPath)
}
}
return { added, changed, deleted, unchanged }
}
/**
* Process incremental changes (add, update, delete files)
* Uses chunk-level indexing with SQL-based updates
*/
private async processIncrementalChanges(
diff: FileDiff,
dbMetadata: Map<string, { mtime: number; hash: string }>,
options: IndexerOptions
): Promise<void> {
const persistentStorage = this.storage as PersistentStorage
// Step 1: Delete removed files. Terms for the deleted files are fetched here, but the
// IDF rebuild in Step 3 recomputes scores from the remaining vectors, so the result is
// currently unused (hence the underscore prefix).
let _deletedTerms = new Set<string>()
if (diff.deleted.length > 0) {
console.error(`[INFO] Getting terms for ${diff.deleted.length} deleted files...`)
_deletedTerms = await persistentStorage.getTermsForFiles(diff.deleted)
console.error(`[INFO] Deleting ${diff.deleted.length} removed files...`)
await persistentStorage.deleteFiles(diff.deleted)
}
// Step 2: Process added and changed files - chunk and index
const filesToProcess = [...diff.added, ...diff.changed]
let totalChunks = 0
if (filesToProcess.length > 0) {
console.error(`[INFO] Processing ${filesToProcess.length} files...`)
const batchSize = this.indexingBatchSize
let processedCount = 0
for (let i = 0; i < filesToProcess.length; i += batchSize) {
const batchMetadata = filesToProcess.slice(i, i + batchSize)
const batchFiles: CodebaseFile[] = []
const fileChunks: Array<{ filePath: string; chunks: ChunkData[] }> = []
for (const metadata of batchMetadata) {
const content = readFileContent(metadata.absolutePath)
if (content === null) continue
const newHash = simpleHash(content)
// For changed files, verify content actually changed using hash
const dbEntry = dbMetadata.get(metadata.path)
if (dbEntry && dbEntry.hash === newHash) {
// File content unchanged, just mtime difference - skip
processedCount++
continue
}
const codebaseFile: CodebaseFile = {
path: metadata.path,
content,
size: metadata.size,
mtime: new Date(metadata.mtime),
language: metadata.language,
hash: newHash,
}
batchFiles.push(codebaseFile)
// Chunk the file using AST
const chunks = await chunkCodeByAST(content, metadata.path)
const chunkData: ChunkData[] = chunks.map((chunk) => ({
content: chunk.content,
type: chunk.type,
startLine: chunk.startLine,
endLine: chunk.endLine,
metadata: chunk.metadata,
}))
fileChunks.push({ filePath: metadata.path, chunks: chunkData })
totalChunks += chunkData.length
processedCount++
this.status.currentFile = metadata.path
this.status.progress = Math.round((processedCount / filesToProcess.length) * 30)
options.onProgress?.(processedCount, filesToProcess.length, metadata.path)
}
// Store batch to database (file content)
if (batchFiles.length > 0) {
await persistentStorage.storeFiles(batchFiles)
}
// Store chunks for this batch
if (fileChunks.length > 0) {
const chunkIdMap = await persistentStorage.storeManyChunks(fileChunks)
// Build TF-IDF vectors for chunks
const chunkVectors: Array<{
chunkId: number
terms: Map<string, { tf: number; tfidf: number; rawFreq: number }>
tokenCount: number
}> = []
for (const fc of fileChunks) {
const chunkIds = chunkIdMap.get(fc.filePath)
if (!chunkIds) continue
for (let j = 0; j < fc.chunks.length; j++) {
const chunk = fc.chunks[j]
const chunkId = chunkIds[j]
if (!chunkId) continue
// Tokenize chunk content
const tokens = await tokenize(chunk.content)
const termFreq = new Map<string, number>()
for (const token of tokens) {
termFreq.set(token, (termFreq.get(token) || 0) + 1)
}
// Calculate TF
const totalTerms = tokens.length
if (totalTerms === 0) continue
const terms = new Map<string, { tf: number; tfidf: number; rawFreq: number }>()
for (const [term, freq] of termFreq) {
terms.set(term, {
tf: freq / totalTerms,
tfidf: 0, // Will be calculated after IDF rebuild
rawFreq: freq,
})
}
chunkVectors.push({ chunkId, terms, tokenCount: totalTerms })
}
}
// Store chunk vectors
if (chunkVectors.length > 0) {
await persistentStorage.storeManyChunkVectors(chunkVectors)
}
}
}
}
this.status.progress = 50
// Step 3: Rebuild IDF scores from vectors (SQL-based)
console.error('[INFO] Recalculating IDF scores...')
await persistentStorage.rebuildIdfScoresFromVectors()
this.status.progress = 70
// Step 4: Recalculate TF-IDF scores (SQL-based batch update)
console.error('[INFO] Updating TF-IDF scores...')
await persistentStorage.recalculateTfidfScores()
this.status.progress = 80
// Step 5: Update pre-computed magnitudes (for cosine similarity search)
console.error('[INFO] Updating chunk magnitudes...')
await persistentStorage.updateChunkMagnitudes()
this.status.progress = 90
// Step 6: Update average document length (for BM25)
console.error('[INFO] Updating average document length...')
await persistentStorage.updateAverageDocLength()
this.status.progress = 95
// Step 7: Invalidate search cache
this.searchCache.invalidate()
console.error('[INFO] Search cache invalidated')
// Log summary
console.error(
`[SUCCESS] Incremental update complete: ${diff.added.length} files added, ${diff.changed.length} changed, ${diff.deleted.length} deleted, ${totalChunks} chunks indexed`
)
}
/**
* Get search index
*/
getSearchIndex(): SearchIndex | null {
return this.searchIndex
}
/**
* Index the codebase
* Reuses an existing persistent index via incremental diff when possible; otherwise
* scans file metadata, chunks files in batches, and finalizes TF-IDF/BM25 statistics.
* Starts the file watcher afterwards when options.watch is true.
*/
async index(options: IndexerOptions = {}): Promise<void> {
this.status.isIndexing = true
this.status.progress = 0
this.status.processedFiles = 0
this.status.indexedChunks = 0
try {
// Try to load existing index from persistent storage
if (this.storage instanceof PersistentStorage) {
const existingFileCount = await this.storage.count()
const existingChunkCount = (await this.storage.getChunkCount?.()) ?? 0
if (existingFileCount > 0) {
console.error(
`[INFO] Found existing index: ${existingFileCount} files, ${existingChunkCount} chunks`
)
// Verify IDF scores exist (index is valid)
const idf = await this.storage.getIdfScores()
if (idf.size > 0) {
// Incremental diff: compare filesystem vs database
console.error('[INFO] Checking for file changes since last index...')
const dbMetadata = await this.storage.getAllFileMetadata()
const diff = await this.diffFilesystem(dbMetadata)
const totalChanges = diff.added.length + diff.changed.length + diff.deleted.length
if (totalChanges === 0) {
// No changes - use existing index
console.error(
`[SUCCESS] No changes detected (${diff.unchanged} files, ${existingChunkCount} chunks)`
)
this.status.progress = 100
this.status.totalFiles = existingFileCount
this.status.processedFiles = existingFileCount
this.status.totalChunks = existingChunkCount
this.status.indexedChunks = existingChunkCount
// Start watching if requested
if (options.watch) {
await this.startWatch()
}
this.status.isIndexing = false
return
}
// Process incremental changes
console.error(
`[INFO] Incremental update: ${diff.added.length} added, ${diff.changed.length} changed, ${diff.deleted.length} deleted`
)
await this.processIncrementalChanges(diff, dbMetadata, options)
// Get updated chunk count after incremental changes
const updatedChunkCount = (await this.storage.getChunkCount?.()) ?? 0
const updatedFileCount = existingFileCount + diff.added.length - diff.deleted.length
this.status.progress = 100
this.status.totalFiles = updatedFileCount
this.status.processedFiles = updatedFileCount
this.status.totalChunks = updatedChunkCount
this.status.indexedChunks = updatedChunkCount
// Start watching if requested
if (options.watch) {
await this.startWatch()
}
this.status.isIndexing = false
return
}
console.error('[WARN] Index verification failed, rebuilding...')
}
}
// Load .gitignore
this.ignoreFilter = loadGitignore(this.codebaseRoot)
const ignoreFilter = this.ignoreFilter
// Phase 1: Scan file metadata only (no content loaded - Memory optimization)
console.error('[INFO] Scanning codebase (metadata only)...')
const fileMetadataList: FileMetadata[] = []
for (const metadata of scanFileMetadata(this.codebaseRoot, {
ignoreFilter,
codebaseRoot: this.codebaseRoot,
maxFileSize: this.maxFileSize,
})) {
fileMetadataList.push(metadata)
}
this.status.totalFiles = fileMetadataList.length
console.error(`[INFO] Found ${fileMetadataList.length} files`)
// Phase 2: Process files in batches with chunk-level indexing
// Only batch content is in memory at any time
console.error(`[INFO] Processing files in batches of ${this.indexingBatchSize}...`)
const batchSize = this.indexingBatchSize
let processedCount = 0
let totalChunks = 0
// Check if we're using persistent storage for chunk-based indexing
const isPersistent = this.storage instanceof PersistentStorage
const persistentStorage = isPersistent ? (this.storage as PersistentStorage) : null
// For non-persistent storage, still use incremental engine
if (!isPersistent) {
this.incrementalEngine = new IncrementalTFIDF()
}
for (let i = 0; i < fileMetadataList.length; i += batchSize) {
const batchMetadata = fileMetadataList.slice(i, i + batchSize)
const batchFiles: CodebaseFile[] = []
const fileChunks: Array<{ filePath: string; chunks: ChunkData[]; content: string }> = []
const batchUpdates: IncrementalUpdate[] = []
// Read content for this batch only
for (const metadata of batchMetadata) {
const content = readFileContent(metadata.absolutePath)
if (content === null) continue
const codebaseFile: CodebaseFile = {
path: metadata.path,
content,
size: metadata.size,
mtime: new Date(metadata.mtime),
language: metadata.language,
hash: simpleHash(content),
}
batchFiles.push(codebaseFile)
// Chunk the file using AST
const chunks = await chunkCodeByAST(content, metadata.path)
const chunkData: ChunkData[] = chunks.map((chunk) => ({
content: chunk.content,
type: chunk.type,
startLine: chunk.startLine,
endLine: chunk.endLine,
metadata: chunk.metadata,
}))
fileChunks.push({ filePath: metadata.path, chunks: chunkData, content })
totalChunks += chunkData.length
// For non-persistent storage, use incremental engine
if (!isPersistent) {
batchUpdates.push({
type: 'add',
uri: `file://${metadata.path}`,
newContent: content,
})
}
processedCount++
this.status.currentFile = metadata.path
this.status.processedFiles = processedCount
this.status.indexedChunks = totalChunks
this.status.progress = Math.round((processedCount / fileMetadataList.length) * 40)
options.onProgress?.(processedCount, fileMetadataList.length, metadata.path)
}
// Store batch to database
if (batchFiles.length > 0) {
if (this.storage.storeFiles) {
await this.storage.storeFiles(batchFiles)
} else {
for (const file of batchFiles) {
await this.storage.storeFile(file)
}
}
// For persistent storage: store chunks and build TF-IDF vectors per chunk
if (persistentStorage && fileChunks.length > 0) {
const chunkIdMap = await persistentStorage.storeManyChunks(
fileChunks.map((fc) => ({ filePath: fc.filePath, chunks: fc.chunks }))
)
// Build TF-IDF vectors for chunks
const chunkVectors: Array<{
chunkId: number
terms: Map<string, { tf: number; tfidf: number; rawFreq: number }>
tokenCount: number
}> = []
for (const fc of fileChunks) {
const chunkIds = chunkIdMap.get(fc.filePath)
if (!chunkIds) continue
for (let j = 0; j < fc.chunks.length; j++) {
const chunk = fc.chunks[j]
const chunkId = chunkIds[j]
if (!chunkId) continue
// Tokenize chunk content
const tokens = await tokenize(chunk.content)
const termFreq = new Map<string, number>()
for (const token of tokens) {
termFreq.set(token, (termFreq.get(token) || 0) + 1)
}
// Calculate TF
const totalTerms = tokens.length
if (totalTerms === 0) continue
const terms = new Map<string, { tf: number; tfidf: number; rawFreq: number }>()
for (const [term, freq] of termFreq) {
terms.set(term, {
tf: freq / totalTerms,
tfidf: 0, // Will be calculated after IDF rebuild
rawFreq: freq,
})
}
chunkVectors.push({ chunkId, terms, tokenCount: totalTerms })
}
}
// Store chunk vectors
if (chunkVectors.length > 0) {
await persistentStorage.storeManyChunkVectors(chunkVectors)
}
}
// For non-persistent storage: use incremental engine
if (this.incrementalEngine && batchUpdates.length > 0) {
await this.incrementalEngine.applyUpdates(batchUpdates)
}
}
// Clear batch references for GC
batchFiles.length = 0
fileChunks.length = 0
batchUpdates.length = 0
}
console.error(`[INFO] Total chunks created: ${totalChunks}`)
this.status.totalChunks = totalChunks
this.status.progress = 50
// Finalize index based on storage type
if (persistentStorage) {
// Persistent storage: rebuild IDF and TF-IDF scores
console.error('[INFO] Rebuilding IDF scores...')
await persistentStorage.rebuildIdfScoresFromVectors()
this.status.progress = 60
console.error('[INFO] Recalculating TF-IDF scores...')
await persistentStorage.recalculateTfidfScores()
this.status.progress = 70
console.error('[INFO] Computing chunk magnitudes...')
await persistentStorage.updateChunkMagnitudes()
this.status.progress = 80
console.error('[INFO] Computing average document length...')
await persistentStorage.updateAverageDocLength()
this.status.progress = 85
// Release in-memory structures in low memory mode
if (this.lowMemoryMode) {
this.searchIndex = null
this.incrementalEngine = null
console.error('[INFO] Low memory mode: released in-memory index')
}
console.error('[SUCCESS] Chunk-level TF-IDF index persisted')
} else if (this.incrementalEngine) {
// Non-persistent storage: build in-memory index
console.error('[INFO] Finalizing in-memory search index...')
const indexData = this.incrementalEngine.getIndex()
this.searchIndex = {
documents: indexData.documents,
idf: indexData.idf,
totalDocuments: indexData.totalDocuments,
metadata: {
generatedAt: new Date().toISOString(),
version: '1.0.0',
},
}
console.error('[INFO] Incremental index engine initialized')
}
// Build vector index if embedding provider available
if (this.embeddingProvider && this.vectorStorage) {
await this.buildVectorIndexFromMetadata(fileMetadataList)
}
this.status.progress = 100
this.status.indexedChunks = totalChunks
console.error(`[SUCCESS] Indexed ${totalChunks} chunks from ${fileMetadataList.length} files`)
// Start watching if requested
if (options.watch) {
await this.startWatch()
}
} catch (error) {
console.error('[ERROR] Failed to index codebase:', error)
throw error
} finally {
this.status.isIndexing = false
this.status.currentFile = undefined
}
}
/**
* Start watching for file changes
* Uses @parcel/watcher which provides native FSEvents on macOS
*/
async startWatch(): Promise<void> {
if (this.isWatching) {
console.error('[WARN] Already watching for changes')
return
}
if (!this.ignoreFilter) {
this.ignoreFilter = loadGitignore(this.codebaseRoot)
}
console.error('[INFO] Starting file watcher (native backend)...')
const watcher = await import('@parcel/watcher')
// Subscribe to file changes
this.watcher = await watcher.subscribe(
this.codebaseRoot,
(err, events) => {
if (err) {
console.error('[WARN] File watcher error:', err.message)
return
}
for (const event of events) {
const absolutePath = event.path
const relativePath = path.relative(this.codebaseRoot, absolutePath)
// Skip ignored files
if (this.shouldIgnore(relativePath)) {
continue
}
// Map @parcel/watcher event types to our types
const eventType =
event.type === 'create' ? 'add' : event.type === 'delete' ? 'unlink' : 'change'
this.handleFileChange(eventType, absolutePath)
}
},
{
// Use native backend (FSEvents on macOS, inotify on Linux)
backend: undefined, // auto-detect best backend
ignore: [
'**/node_modules/**',
'**/.git/**',
'**/dist/**',
'**/build/**',
'**/.next/**',
'**/.turbo/**',
'**/.cache/**',
'**/coverage/**',
'**/*.log',
],
}
)
this.isWatching = true
console.error('[SUCCESS] File watcher started (native backend)')
}
/**
* Check if a file should be ignored
*/
private shouldIgnore(relativePath: string): boolean {
// Skip empty paths
if (!relativePath) return true
// Check gitignore
if (this.ignoreFilter?.ignores(relativePath)) {
return true
}
return false
}
/**
* Stop watching for file changes
*/
async stopWatch(): Promise<void> {
if (!this.isWatching || !this.watcher) {
return
}
console.error('[INFO] Stopping file watcher...')
await this.watcher.unsubscribe()
this.watcher = null
this.isWatching = false
// Clear pending updates
for (const timeout of this.pendingUpdates.values()) {
clearTimeout(timeout)
}
this.pendingUpdates.clear()
// Clear pending file changes to prevent memory leak
this.pendingFileChanges = []
console.error('[SUCCESS] File watcher stopped')
}
/**
* Close indexer and release all resources
* Should be called when the indexer is no longer needed
*/
async close(): Promise<void> {
// Stop file watcher first
await this.stopWatch()
// Close vector storage (LanceDB connection)
if (this.vectorStorage) {
await this.vectorStorage.close()
this.vectorStorage = undefined
}
// Close persistent storage (SQLite connection)
if (this.storage instanceof PersistentStorage) {
this.storage.close()
}
// Clear all in-memory state
this.searchIndex = null
this.incrementalEngine = null
this.pendingFileChanges = []
this.searchCache.clear()
this.ignoreFilter = null
console.error('[SUCCESS] Indexer closed and resources released')
}
/**
* Handle file change events with debouncing
*/
private handleFileChange(type: 'add' | 'change' | 'unlink', absolutePath: string): void {
const relativePath = path.relative(this.codebaseRoot, absolutePath)
// Check if file should be ignored
if (this.ignoreFilter?.ignores(relativePath)) {
return
}
// Debounce updates (wait 500ms after last change)
const existing = this.pendingUpdates.get(relativePath)
if (existing) {
clearTimeout(existing)
}
const timeout = setTimeout(async () => {
this.pendingUpdates.delete(relativePath)
await this.processFileChange(type, relativePath, absolutePath)
}, 500)
this.pendingUpdates.set(relativePath, timeout)
}
/**
* Process file change and update index
*/
private async processFileChange(
type: 'add' | 'change' | 'unlink',
relativePath: string,
absolutePath: string
): Promise<void> {
const event: FileChangeEvent = {
type,
path: relativePath,
timestamp: Date.now(),
}
try {
if (type === 'unlink') {
// Track deletion for incremental update
const existingFile = await this.storage.getFile(relativePath)
if (existingFile && this.searchIndex) {
const oldDoc = this.searchIndex.documents.find((d) => d.uri === `file://${relativePath}`)
if (oldDoc) {
this.pendingFileChanges.push({
type: 'delete',
uri: `file://${relativePath}`,
oldDocument: oldDoc,
})
}
}
// Remove from storage
await this.storage.deleteFile(relativePath)
// Remove from vector storage
await this.deleteFileVector(relativePath)
console.error(`[FILE] Removed: ${relativePath}`)
} else {
// Check if file is text and within size limit
const stats = await fs.stat(absolutePath)
if (stats.size > this.maxFileSize) {
console.error(`[FILE] Skipped (too large): ${relativePath}`)
return
}
if (!isTextFile(absolutePath)) {
console.error(`[FILE] Skipped (binary): ${relativePath}`)
return
}
// Read file content
const content = await fs.readFile(absolutePath, 'utf-8')
const hash = simpleHash(content)
// OPTIMIZATION: Check if file actually changed using hash comparison
const existingFile = await this.storage.getFile(relativePath)
if (existingFile && existingFile.hash === hash) {
console.error(`[FILE] Skipped (unchanged): ${relativePath}`)
// File hasn't changed, skip indexing
this.onFileChangeCallback?.(event)
return
}
// Track change for incremental update
if (this.searchIndex) {
const uri = `file://${relativePath}`
const oldDoc = this.searchIndex.documents.find((d) => d.uri === uri)
if (oldDoc) {
// Update existing document
this.pendingFileChanges.push({
type: 'update',
uri,
oldDocument: oldDoc,
newContent: content,
})
} else {
// Add new document
this.pendingFileChanges.push({
type: 'add',
uri,
newContent: content,
})
}
}
// File changed or new, process it
const codebaseFile: CodebaseFile = {
path: relativePath,
content,
size: stats.size,
mtime: stats.mtime,
language: detectLanguage(relativePath),
hash,
}
await this.storage.storeFile(codebaseFile)
// Update vector storage
await this.updateFileVector(relativePath, content)
console.error(`[FILE] ${type === 'add' ? 'Added' : 'Updated'}: ${relativePath}`)
}
// Rebuild search index
await this.rebuildSearchIndex()
// Notify callback
this.onFileChangeCallback?.(event)
} catch (error) {
console.error(`[ERROR] Failed to process file change (${relativePath}):`, error)
}
}
/**
* Rebuild search index from current storage
* Uses incremental update when possible for performance
*/
private async rebuildSearchIndex(): Promise<void> {
// If no incremental engine or no pending changes, do full rebuild
if (!this.incrementalEngine || this.pendingFileChanges.length === 0) {
// CRITICAL: Clear pending changes to prevent memory leak
// In lowMemoryMode, incrementalEngine is null, so we must clear here
this.pendingFileChanges = []
return this.fullRebuildSearchIndex()
}
// Use try/finally to ensure pendingFileChanges is always cleared
// This prevents memory leak if an exception occurs during rebuild
try {
// Check if incremental update is recommended
if (await this.incrementalEngine.shouldFullRebuild(this.pendingFileChanges)) {
console.error(
'[INFO] Changes too extensive, performing full rebuild instead of incremental'
)
this.pendingFileChanges = []
return this.fullRebuildSearchIndex()
}
// Perform incremental update
const stats = await this.incrementalEngine.applyUpdates(this.pendingFileChanges)
// Update search index from incremental engine
const indexData = this.incrementalEngine.getIndex()
this.searchIndex = {
documents: indexData.documents,
idf: indexData.idf,
totalDocuments: indexData.totalDocuments,
metadata: {
generatedAt: new Date().toISOString(),
version: '1.0.0',
},
}
console.error(
`[SUCCESS] Incremental update: ${stats.affectedDocuments} docs, ${stats.affectedTerms} terms, ${stats.updateTime}ms`
)
// Invalidate search cache (index changed)
this.searchCache.invalidate()
console.error('[INFO] Search cache invalidated')
// Persist if using persistent storage
if (this.storage instanceof PersistentStorage) {
await this.persistSearchIndex()
}
} finally {
// Always clear pending changes to prevent memory leak
this.pendingFileChanges = []
}
}
/**
* Full rebuild of search index (fallback when incremental not possible)
* For persistent storage, this rebuilds the chunk-level index
*/
private async fullRebuildSearchIndex(): Promise<void> {
// For persistent storage, rebuild chunk index
if (this.storage instanceof PersistentStorage) {
const persistentStorage = this.storage
const allFiles = await this.storage.getAllFiles()
console.error(`[INFO] Full rebuild: re-chunking ${allFiles.length} files...`)
// Re-chunk all files
const fileChunks: Array<{ filePath: string; chunks: ChunkData[] }> = []
for (const file of allFiles) {
const chunks = await chunkCodeByAST(file.content, file.path)
const chunkData: ChunkData[] = chunks.map((chunk) => ({
content: chunk.content,
type: chunk.type,
startLine: chunk.startLine,
endLine: chunk.endLine,
metadata: chunk.metadata,
}))
fileChunks.push({ filePath: file.path, chunks: chunkData })
}
// Store all chunks (this also deletes old chunks)
const chunkIdMap = await persistentStorage.storeManyChunks(fileChunks)
// Build TF-IDF vectors for all chunks
const chunkVectors: Array<{
chunkId: number
terms: Map<string, { tf: number; tfidf: number; rawFreq: number }>
tokenCount: number
}> = []
for (const fc of fileChunks) {
const chunkIds = chunkIdMap.get(fc.filePath)
if (!chunkIds) continue
for (let j = 0; j < fc.chunks.length; j++) {
const chunk = fc.chunks[j]
const chunkId = chunkIds[j]
if (!chunkId) continue
const tokens = await tokenize(chunk.content)
const termFreq = new Map<string, number>()
for (const token of tokens) {
termFreq.set(token, (termFreq.get(token) || 0) + 1)
}
const totalTerms = tokens.length
if (totalTerms === 0) continue
const terms = new Map<string, { tf: number; tfidf: number; rawFreq: number }>()
for (const [term, freq] of termFreq) {
terms.set(term, {
tf: freq / totalTerms,
tfidf: 0,
rawFreq: freq,
})
}
chunkVectors.push({ chunkId, terms, tokenCount: totalTerms })
}
}
if (chunkVectors.length > 0) {
await persistentStorage.storeManyChunkVectors(chunkVectors)
}
// Rebuild IDF and TF-IDF scores
await persistentStorage.rebuildIdfScoresFromVectors()
await persistentStorage.recalculateTfidfScores()
await persistentStorage.updateChunkMagnitudes()
await persistentStorage.updateAverageDocLength()
console.error('[SUCCESS] Full chunk index rebuild complete')
} else {
// For non-persistent storage, use in-memory index
const allFiles = await this.storage.getAllFiles()
const documents = allFiles.map((file) => ({
uri: `file://${file.path}`,
content: file.content,
}))
this.searchIndex = await buildSearchIndex(documents)
this.incrementalEngine = new IncrementalTFIDF(
this.searchIndex.documents,
this.searchIndex.idf
)
}
// Invalidate search cache (index changed)
this.searchCache.invalidate()
console.error('[INFO] Search cache invalidated')
}
/**
* Persist search index to storage
* NOTE: For PersistentStorage, chunk-based indexing happens inline during index()
* This method is only used for in-memory storage fallback
*/
private async persistSearchIndex(): Promise<void> {
// For persistent storage, indexing is done inline with chunks
// This method is kept for compatibility with in-memory storage mode
if (this.storage instanceof PersistentStorage) {
// Chunk-based indexing already persisted during index()
console.error('[INFO] Chunk-based index already persisted')
return
}
// For non-persistent storage the index lives entirely in memory (this.searchIndex),
// so there is nothing to persist here.
if (this.searchIndex) {
console.error('[INFO] In-memory index built (non-persistent storage)')
}
}
/**
* Check if currently watching for changes
*/
isWatchEnabled(): boolean {
return this.isWatching
}
/**
* Search the codebase
* Returns chunk-level results when using persistent storage (SQL-based search)
*/
async search(
query: string,
options: {
limit?: number
includeContent?: boolean
fileExtensions?: string[]
pathFilter?: string
excludePaths?: string[]
// Snippet options (only used for in-memory search)
contextLines?: number // Lines of context around each match (default: 3)
maxSnippetChars?: number // Max chars per file snippet (default: 2000)
maxSnippetBlocks?: number // Max code blocks per file (default: 4)
} = {}
): Promise<SearchResult[]> {
const { limit = 10, includeContent = true } = options
const snippetOptions = {
contextLines: options.contextLines,
maxChars: options.maxSnippetChars,
maxBlocks: options.maxSnippetBlocks,
}
// Create cache key from query and options
const cacheKey = createCacheKey(query, {
limit,
fileExtensions: options.fileExtensions,
pathFilter: options.pathFilter,
excludePaths: options.excludePaths,
...snippetOptions,
})
// Check cache first
const cachedResults = this.searchCache.get(cacheKey)
if (cachedResults) {
console.error(`[CACHE HIT] Query: "${query}"`)
return cachedResults
}
console.error(`[CACHE MISS] Query: "${query}"`)
// Use chunk-based SQL search in low memory mode (Memory optimization)
if (this.lowMemoryMode && this.storage instanceof PersistentStorage) {
const searchResults = await this.searchChunks(query, options)
this.searchCache.set(cacheKey, searchResults)
return searchResults
}
// In-memory search (faster but uses more memory) - file-level
if (!this.searchIndex) {
throw new Error('Codebase not indexed. Please run index() first.')
}
const results = await searchDocuments(query, this.searchIndex, { limit })
// Get file content and apply filters (in-memory mode)
const searchResults: SearchResult[] = []
for (const result of results) {
const filePath = result.uri.replace('file://', '')
// Apply filters
if (options.fileExtensions && options.fileExtensions.length > 0) {
if (!options.fileExtensions.some((ext) => filePath.endsWith(ext))) {
continue
}
}
if (options.pathFilter && !filePath.includes(options.pathFilter)) {
continue
}
if (options.excludePaths && options.excludePaths.length > 0) {
if (options.excludePaths.some((exclude) => filePath.includes(exclude))) {
continue
}
}
const file = await this.storage.getFile(filePath)
if (!file) continue
const searchResult: SearchResult = {
path: file.path,
score: result.score,
matchedTerms: result.matchedTerms,
language: file.language,
size: file.size,
}
if (includeContent) {
searchResult.snippet = this.extractSnippet(
file.content,
result.matchedTerms,
snippetOptions
)
}
searchResults.push(searchResult)
}
const finalResults = searchResults.slice(0, limit)
// Store in cache
this.searchCache.set(cacheKey, finalResults)
return finalResults
}
/**
* Chunk-based search with BM25 scoring
* Returns chunk content directly (no separate snippet extraction needed)
*/
private async searchChunks(
query: string,
options: {
limit?: number
includeContent?: boolean
fileExtensions?: string[]
pathFilter?: string
excludePaths?: string[]
}
): Promise<SearchResult[]> {
const { limit = 10, includeContent = true } = options
const persistentStorage = this.storage as PersistentStorage
// Tokenize query
const queryTokens = await getQueryTokens(query)
if (queryTokens.length === 0) {
return []
}
// Get matching chunks from storage (already includes content)
const candidates = await persistentStorage.searchByTerms(queryTokens, { limit: limit * 3 })
// Get IDF scores for query terms
const idf = await persistentStorage.getIdfScoresForTerms(queryTokens)
// Get average document length for BM25
const avgDocLength = await persistentStorage.getAverageDocLength()
// BM25 parameters
const k1 = 1.2
const b = 0.75
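// Per-term BM25 contribution (mirrors the loop below; docLen is the chunk's token
// count and avgDocLength the corpus-wide average):
//   score += idf(term) * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * docLen / avgDocLength))
// k1 controls term-frequency saturation; b controls document-length normalization.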
// Calculate BM25 scores for each chunk
const scoredResults: Array<{
chunk: (typeof candidates)[0]
score: number
matchedTerms: string[]
}> = []
for (const chunk of candidates) {
// Apply filters
if (options.fileExtensions && options.fileExtensions.length > 0) {
if (!options.fileExtensions.some((ext) => chunk.filePath.endsWith(ext))) {
continue
}
}
if (options.pathFilter && !chunk.filePath.includes(options.pathFilter)) {
continue
}
if (options.excludePaths && options.excludePaths.length > 0) {
if (options.excludePaths.some((exclude) => chunk.filePath.includes(exclude))) {
continue
}
}
// Calculate BM25 score
let score = 0
const matchedTerms: string[] = []
for (const term of queryTokens) {
const termData = chunk.matchedTerms.get(term)
if (!termData) continue
matchedTerms.push(term)
const termIdf = idf.get(term) || 1
// BM25 formula
const tf = termData.rawFreq
const docLen = chunk.tokenCount || 1
const numerator = tf * (k1 + 1)
const denominator = tf + k1 * (1 - b + b * (docLen / (avgDocLength || 1)))
score += termIdf * (numerator / denominator)
}
if (matchedTerms.length > 0) {
scoredResults.push({ chunk, score, matchedTerms })
}
}
// Sort by score descending
scoredResults.sort((a, b) => b.score - a.score)
// Convert to SearchResult format
const results: SearchResult[] = []
for (const { chunk, score, matchedTerms } of scoredResults.slice(0, limit)) {
const result: SearchResult = {
path: chunk.filePath,
score,
matchedTerms,
language: detectLanguage(chunk.filePath),
size: chunk.content.length,
// Include chunk metadata
chunkType: chunk.type,
startLine: chunk.startLine,
endLine: chunk.endLine,
}
if (includeContent) {
// Chunk content is the snippet - add line numbers
const lines = chunk.content.split('\n')
result.snippet = lines.map((line, i) => `${chunk.startLine + i}: ${line}`).join('\n')
}
results.push(result)
}
console.error(`[BM25 CHUNK SEARCH] Found ${results.length} chunks`)
return results
}
/**
* Extract code block snippets from content around matched terms
*
* Returns the most relevant code blocks (not just lines) with context.
* Blocks are ranked by the number of distinct matched terms, with term density as a tie-break.
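*
* Example: for matched terms ["cache", "invalidate"], a 7-line block containing both
* terms outranks a 15-line block containing only "cache"; among equal term counts,
* the more compact block wins.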
*/
private extractSnippet(
content: string,
matchedTerms: string[],
options: { contextLines?: number; maxChars?: number; maxBlocks?: number } = {}
): string {
const { contextLines = 3, maxChars = 2000, maxBlocks = 4 } = options
const lines = content.split('\n')
// Step 1: Find all lines with matches and score them
const matchedLineInfos: Array<{
lineNum: number
score: number
matchedTerms: string[]
}> = []
for (let i = 0; i < lines.length; i++) {
const lineLower = lines[i].toLowerCase()
const termsInLine = matchedTerms.filter((term) => lineLower.includes(term.toLowerCase()))
if (termsInLine.length > 0) {
matchedLineInfos.push({
lineNum: i,
score: termsInLine.length,
matchedTerms: termsInLine,
})
}
}
if (matchedLineInfos.length === 0) {
// Return first few lines if no matches found
return lines.slice(0, 5).join('\n')
}
// Step 2: Expand each matched line to a block with context, then merge overlapping blocks
interface Block {
start: number
end: number
score: number
matchedTerms: Set<string>
}
const blocks: Block[] = []
for (const info of matchedLineInfos) {
const start = Math.max(0, info.lineNum - contextLines)
const end = Math.min(lines.length - 1, info.lineNum + contextLines)
// Try to merge with existing block if overlapping
let merged = false
for (const block of blocks) {
if (start <= block.end + 1 && end >= block.start - 1) {
// Overlapping or adjacent - merge
block.start = Math.min(block.start, start)
block.end = Math.max(block.end, end)
block.score += info.score
for (const term of info.matchedTerms) {
block.matchedTerms.add(term)
}
merged = true
break
}
}
if (!merged) {
blocks.push({
start,
end,
score: info.score,
matchedTerms: new Set(info.matchedTerms),
})
}
}
// Step 3: Sort blocks by unique terms (primary) and density (secondary)
// Unique terms = how many different query terms appear in block
// Density = unique terms / block size (prefer compact blocks)
blocks.sort((a, b) => {
const uniqueA = a.matchedTerms.size
const uniqueB = b.matchedTerms.size
if (uniqueA !== uniqueB) {
return uniqueB - uniqueA // More unique terms = better
}
// Tie-break: prefer denser blocks (more terms per line)
const densityA = uniqueA / (a.end - a.start + 1)
const densityB = uniqueB / (b.end - b.start + 1)
return densityB - densityA
})
const topBlocks = blocks.slice(0, maxBlocks)
// Sort by position for output (top to bottom in file)
topBlocks.sort((a, b) => a.start - b.start)
// Step 4: Build output with character limit
const snippetParts: string[] = []
let totalChars = 0
for (const block of topBlocks) {
const blockLines = lines.slice(block.start, block.end + 1)
const blockContent = blockLines.map((line, i) => `${block.start + i + 1}: ${line}`).join('\n')
// Check if adding this block would exceed limit
if (totalChars + blockContent.length > maxChars && snippetParts.length > 0) {
break
}
snippetParts.push(blockContent)
totalChars += blockContent.length
}
return snippetParts.join('\n...\n')
}
/**
* Get file content
*/
async getFileContent(filePath: string): Promise<string | null> {
const file = await this.storage.getFile(filePath)
return file?.content || null
}
/**
* Get total indexed files count
*/
async getIndexedCount(): Promise<number> {
return this.storage.count()
}
/**
* Get vector storage (for hybrid search)
*/
getVectorStorage(): VectorStorage | undefined {
return this.vectorStorage
}
/**
* Get embedding provider (for hybrid search)
*/
getEmbeddingProvider(): EmbeddingProvider | undefined {
return this.embeddingProvider
}
/**
* Build vector index from file metadata (Memory optimization)
* Generates embeddings per CHUNK, not per file
*/
private async buildVectorIndexFromMetadata(files: FileMetadata[]): Promise<void> {
if (!this.embeddingProvider || !this.vectorStorage) {
return
}
console.error('[INFO] Generating embeddings for vector search (chunk-level)...')
const startTime = Date.now()
let totalChunks = 0
let processedChunks = 0
// First pass: chunk every file and collect all chunk records (this also gives the total count)
const allChunks: Array<{
id: string
content: string
metadata: FileMetadata
chunkType: string
startLine: number
endLine: number
}> = []
for (const metadata of files) {
const content = readFileContent(metadata.absolutePath)
if (content === null) continue
// Chunk the file using AST
const chunks = await chunkCodeByAST(content, metadata.path)
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i]
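// Chunk IDs follow the format chunk://<relative path>:<startLine>-<endLine>,
// e.g. chunk://src/utils.ts:10-42 (path shown is illustrative)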
allChunks.push({
id: `chunk://${metadata.path}:${chunk.startLine}-${chunk.endLine}`,
content: chunk.content,
metadata,
chunkType: chunk.type,
startLine: chunk.startLine,
endLine: chunk.endLine,
})
}
}
totalChunks = allChunks.length
console.error(`[INFO] Total chunks to embed: ${totalChunks}`)
// Process chunks in batches
const batchSize = this.vectorBatchSize
for (let i = 0; i < allChunks.length; i += batchSize) {
const batch = allChunks.slice(i, i + batchSize)
try {
// Generate embeddings for batch
const embeddings = await this.embeddingProvider.generateEmbeddings(
batch.map((c) => c.content)
)
// Add to vector storage
for (let j = 0; j < batch.length; j++) {
const chunk = batch[j]
const embedding = embeddings[j]
const doc: VectorDocument = {
id: chunk.id,
embedding,
metadata: {
type: 'code',
chunkType: chunk.chunkType,
language: chunk.metadata.language,
content: chunk.content.substring(0, 500), // Preview
path: chunk.metadata.path,
startLine: chunk.startLine,
endLine: chunk.endLine,
},
}
await this.vectorStorage.addDocument(doc)
}
processedChunks += batch.length
console.error(`[INFO] Generated embeddings: ${processedChunks}/${totalChunks} chunks`)
} catch (error) {
console.error(`[ERROR] Failed to generate embeddings for batch ${i}:`, error)
// Continue with next batch
}
}
// LanceDB auto-persists, no need to save
const elapsedTime = Date.now() - startTime
console.error(
`[SUCCESS] Vector index built (${processedChunks} chunks from ${files.length} files, ${elapsedTime}ms)`
)
}
/**
* Update vectors for a single file (chunk-level)
* Deletes old chunks and adds new ones
*/
private async updateFileVector(filePath: string, content: string): Promise<void> {
if (!this.embeddingProvider || !this.vectorStorage) {
return
}
try {
// Delete existing chunks for this file
await this.deleteFileVector(filePath)
// Chunk the file using AST
const chunks = await chunkCodeByAST(content, filePath)
const language = detectLanguage(filePath)
// Generate embeddings for all chunks
const embeddings = await this.embeddingProvider.generateEmbeddings(
chunks.map((c) => c.content)
)
// Add each chunk to vector storage
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i]
const embedding = embeddings[i]
const doc: VectorDocument = {
id: `chunk://${filePath}:${chunk.startLine}-${chunk.endLine}`,
embedding,
metadata: {
type: 'code',
chunkType: chunk.type,
language,
content: chunk.content.substring(0, 500),
path: filePath,
startLine: chunk.startLine,
endLine: chunk.endLine,
},
}
await this.vectorStorage.addDocument(doc)
}
console.error(`[VECTOR] Updated: ${filePath} (${chunks.length} chunks)`)
} catch (error) {
console.error(`[ERROR] Failed to update vector for ${filePath}:`, error)
}
}
/**
* Delete vectors for a file (all chunks)
*/
private async deleteFileVector(filePath: string): Promise<void> {
if (!this.vectorStorage) {
return
}
// All chunks belonging to this file should be removed. Vector IDs use the format
// chunk://path:startLine-endLine, so deletion requires matching on the path prefix.
try {
// Prefix deletion is not implemented yet: for now this only logs, and updates rely on
// addDocument overwriting chunks that share the same ID.
// TODO: Implement proper chunk deletion in VectorStorage
console.error(`[VECTOR] Deleting chunks for: ${filePath}`)
} catch (error) {
console.error(`[ERROR] Failed to delete vectors for ${filePath}:`, error)
}
}
}
export interface SearchResult {
path: string
score: number
matchedTerms: string[]
language?: string
size: number
snippet?: string
// Chunk metadata (when using chunk-level search)
chunkType?: string
startLine?: number
endLine?: number
}