// ============================================================================
// FileIndexer - Index files into the graph database (Neo4j or NornicDB)
// Phase 1: Basic file indexing with content
// Phase 2: Vector embeddings for semantic search
// Phase 3: Parse and extract functions/classes (future)
// ============================================================================
import { Driver } from 'neo4j-driver';
import { promises as fs } from 'fs';
import path from 'path';
import { createHash } from 'crypto';
import { EmbeddingsService, ChunkEmbeddingResult, formatMetadataForEmbedding, FileMetadata } from './EmbeddingsService.js';
import { DocumentParser } from './DocumentParser.js';
import { getHostWorkspaceRoot } from '../utils/path-utils.js';
import { ImageProcessor } from './ImageProcessor.js';
import { VLService } from './VLService.js';
import { LLMConfigLoader } from '../config/LLMConfigLoader.js';
/**
* Generate a deterministic hash-based ID for content
* This ensures idempotent re-indexing without duplicate creation issues
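*
* @example
* // Identical input always produces the same ID (first 16 hex chars of sha256)
* const a = generateContentHash('hello', 'note'); // 'note-2cf24dba5fb0a30e'
* const b = generateContentHash('hello', 'note');
* // a === b, so re-indexing unchanged content maps to the same node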
*/
export function generateContentHash(content: string, prefix: string = ''): string {
const hash = createHash('sha256').update(content).digest('hex').slice(0, 16);
return prefix ? `${prefix}-${hash}` : hash;
}
/**
* Generate a deterministic chunk ID based on file path and chunk content
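*
* @example
* // ID format: chunk-<pathHash>-<index>-<contentHash> (hash values illustrative)
* const id = generateChunkId('src/auth.ts', 0, 'export function login() {}');
* // e.g. 'chunk-a1b2c3d4e5f67890-0-0f1e2d3c4b5a6978'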
*/
export function generateChunkId(filePath: string, chunkIndex: number, chunkText: string): string {
const contentHash = generateContentHash(chunkText);
return `chunk-${generateContentHash(filePath)}-${chunkIndex}-${contentHash}`;
}
/**
* Generate a deterministic file ID based on path
*/
export function generateFileId(filePath: string): string {
return `file-${generateContentHash(filePath)}`;
}
export interface IndexResult {
file_node_id: string;
path: string;
size_bytes: number;
chunks_created?: number;
}
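/**
* Indexes files into the graph database, storing content and optional
* vector embeddings for semantic search.
*
* @example
* // Typical wiring (connection URL and credentials are illustrative):
* import neo4j from 'neo4j-driver';
* const driver = neo4j.driver('bolt://localhost:7687', neo4j.auth.basic('neo4j', 'password'));
* const indexer = new FileIndexer(driver);
* const result = await indexer.indexFile('/project/src/auth.ts', '/project', true);
* console.log(result.path, result.chunks_created);
*/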
export class FileIndexer {
private embeddingsService: EmbeddingsService;
private embeddingsInitialized: boolean = false;
private embeddingsInitPromise: Promise<void> | null = null; // Mutex for initialization
private documentParser: DocumentParser;
private imageProcessor: ImageProcessor | null = null;
private vlService: VLService | null = null;
private configLoader: LLMConfigLoader;
private isNornicDB: boolean = false;
private providerDetected: boolean = false;
constructor(private driver: Driver) {
this.embeddingsService = new EmbeddingsService();
this.documentParser = new DocumentParser();
this.configLoader = LLMConfigLoader.getInstance();
}
/**
* Detect database provider (NornicDB vs Neo4j)
* Same detection logic as GraphManager for consistency
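*
* @example
* // The manual override checked below skips the server round-trip entirely:
* // MIMIR_DATABASE_PROVIDER=nornicdb -> isNornicDB = true
* // MIMIR_DATABASE_PROVIDER=neo4j   -> isNornicDB = false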
*/
private async detectDatabaseProvider(): Promise<void> {
// Check for manual override
const manualProvider = process.env.MIMIR_DATABASE_PROVIDER?.toLowerCase();
if (manualProvider === 'nornicdb') {
this.isNornicDB = true;
return;
} else if (manualProvider === 'neo4j') {
this.isNornicDB = false;
return;
}
// Auto-detect via server metadata
const session = this.driver.session();
try {
const result = await session.run('RETURN 1 as test');
const serverAgent = result.summary.server?.agent || '';
this.isNornicDB = serverAgent.toLowerCase().includes('nornicdb');
} catch (error) {
// Default to Neo4j on error
this.isNornicDB = false;
} finally {
await session.close();
}
}
/**
* Initialize embeddings service (lazy loading)
* Skips initialization if connected to NornicDB
* Uses mutex pattern to prevent race conditions in concurrent calls
*/
private async initEmbeddings(): Promise<void> {
// If already initialized, return immediately
if (this.embeddingsInitialized) {
return;
}
// If initialization is in progress, wait for it
if (this.embeddingsInitPromise) {
return this.embeddingsInitPromise;
}
// Start initialization and store the promise for concurrent callers
this.embeddingsInitPromise = this.doInitEmbeddings();
try {
await this.embeddingsInitPromise;
} finally {
// Clear the promise after completion (success or failure)
this.embeddingsInitPromise = null;
}
}
/**
* Internal initialization logic (called once via mutex)
*/
private async doInitEmbeddings(): Promise<void> {
// Detect provider on first call
if (!this.providerDetected) {
await this.detectDatabaseProvider();
this.providerDetected = true;
if (this.isNornicDB) {
console.log('🗄️ FileIndexer: NornicDB detected - skipping embeddings service initialization');
}
}
// Only initialize embeddings service for Neo4j
if (!this.isNornicDB) {
await this.embeddingsService.initialize();
}
this.embeddingsInitialized = true;
}
/**
* Retry wrapper for Neo4j transactions with exponential backoff
* Handles deadlocks and other transient database errors
*
* @param fn - Async function to execute with retry logic
* @param operation - Description of operation for logging
* @param maxRetries - Maximum number of retry attempts (default: 3)
* @returns Result of the function execution
*
* @example
* await this.retryNeo4jTransaction(async () => {
* return await session.run('MERGE (n:Node {id: $id})', {id: '123'});
* }, 'Create node', 3);
*/
private async retryNeo4jTransaction<T>(
fn: () => Promise<T>,
operation: string,
maxRetries: number = 3
): Promise<T> {
let lastError: any;
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
return await fn();
} catch (error: any) {
lastError = error;
// Check for retryable Neo4j errors
const isDeadlock = error.message?.includes('DeadlockDetected') ||
error.message?.includes('can\'t acquire') ||
error.message?.includes('ForsetiClient') ||
error.code === 'Neo.TransientError.Transaction.DeadlockDetected';
const isLockTimeout = error.message?.includes('LockClient') ||
error.code === 'Neo.TransientError.Transaction.LockClientStopped';
const isTransient = error.code?.startsWith('Neo.TransientError');
const isRetryable = isDeadlock || isLockTimeout || isTransient;
// Don't retry on final attempt or non-retryable errors
if (!isRetryable || attempt === maxRetries) {
throw error;
}
// Exponential backoff with jitter: 100ms, 200ms, 400ms, 800ms...
const baseDelay = 100 * Math.pow(2, attempt);
const jitter = Math.random() * 50; // Add 0-50ms random jitter
const delayMs = Math.min(baseDelay + jitter, 2000);
const errorType = isDeadlock ? 'deadlock' : isLockTimeout ? 'lock timeout' : 'transient';
console.warn(
`⚠️ ${operation} failed with Neo4j ${errorType} error ` +
`(attempt ${attempt + 1}/${maxRetries + 1}). Retrying in ${Math.round(delayMs)}ms...`
);
await new Promise(resolve => setTimeout(resolve, delayMs));
}
}
throw lastError;
}
/**
* Initialize image processing services (lazy loading)
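*
* Reads the embeddings config loaded by LLMConfigLoader; a rough sketch of
* the fields consumed below (values illustrative, not the full schema):
*
* @example
* // {
* //   "images": { "enabled": true, "describeMode": true, "maxPixels": 1048576 },
* //   "vl": { "provider": "openai", "api": "http://localhost:8080", "model": "qwen2-vl" }
* // }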
*/
private async initImageServices(): Promise<void> {
const config = await this.configLoader.getEmbeddingsConfig();
if (!config?.images?.enabled) {
return;
}
// Initialize ImageProcessor
if (!this.imageProcessor) {
this.imageProcessor = new ImageProcessor({
maxPixels: config.images.maxPixels,
targetSize: config.images.targetSize,
resizeQuality: config.images.resizeQuality
});
}
// Initialize VLService if describe mode is enabled
if (config.images.describeMode && config.vl && !this.vlService) {
this.vlService = new VLService({
provider: config.vl.provider,
api: config.vl.api,
apiPath: config.vl.apiPath,
apiKey: config.vl.apiKey,
model: config.vl.model,
contextSize: config.vl.contextSize,
maxTokens: config.vl.maxTokens,
temperature: config.vl.temperature
});
console.log('🖼️ Image embedding services initialized (VL describe mode)');
}
}
/**
* Translate container path to host path
* e.g., /workspace/my-project/file.ts -> /Users/user/src/my-project/file.ts
* If not in Docker, both paths are the same
*/
private translateToHostPath(containerPath: string): string {
const workspaceRoot = process.env.WORKSPACE_ROOT;
const hostWorkspaceRoot = process.env.HOST_WORKSPACE_ROOT;
// If WORKSPACE_ROOT not set, we're running locally - no translation needed
if (!workspaceRoot) {
return containerPath;
}
// If HOST_WORKSPACE_ROOT not set but WORKSPACE_ROOT is, something is misconfigured
if (!hostWorkspaceRoot) {
console.warn('⚠️ WORKSPACE_ROOT is set but HOST_WORKSPACE_ROOT is not - path translation may fail');
return containerPath;
}
// Expand tilde in HOST_WORKSPACE_ROOT for consistent storage
// Use getHostWorkspaceRoot() which properly expands tilde
const expandedHostRoot = getHostWorkspaceRoot();
// Replace container workspace root with expanded host workspace root
if (containerPath.startsWith(workspaceRoot)) {
return containerPath.replace(workspaceRoot, expandedHostRoot);
}
return containerPath;
}
/**
* Index a single file into Neo4j with optional vector embeddings
*
* Creates a File node in the graph database with metadata and content.
* For large files with embeddings enabled, splits content into chunks
* with individual embeddings for precise semantic search (industry standard).
*
* Indexing Strategy:
* - **Small files** (≤1000 chars): Single embedding on File node
* - **Large files** (>1000 chars): Multiple FileChunk nodes with embeddings
* - **No embeddings**: Full content stored on File node for full-text search
*
* Supported Formats:
* - Text files (.ts, .js, .py, .md, .json, etc.)
* - PDF documents (text extraction)
* - DOCX documents (text extraction)
* - Images (.png, .jpg, etc.) with VL description or multimodal embedding
*
* @param filePath - Absolute path to file
* @param rootPath - Root directory path for calculating relative paths
* @param generateEmbeddings - Whether to generate vector embeddings
* @param watchConfigId - Optional WatchConfig node ID; when provided, the file is linked via WATCHES/WATCHED_BY relationships
* @returns Index result with file node ID, path, size, and chunk count
* @throws {Error} If file is binary, non-indexable, or processing fails
*
* @example
* // Index a TypeScript file without embeddings
* const result = await fileIndexer.indexFile(
* '/Users/user/project/src/auth.ts',
* '/Users/user/project',
* false
* );
* console.log('Indexed:', result.path);
* console.log('Size:', result.size_bytes, 'bytes');
* // File content stored on File node for full-text search
*
* @example
* // Index a large file with embeddings (chunked)
* const result = await fileIndexer.indexFile(
* '/Users/user/project/docs/guide.md',
* '/Users/user/project',
* true
* );
* console.log('Created', result.chunks_created, 'chunks');
* // Each chunk has its own embedding for precise semantic search
*
* @example
* // Index a PDF document with embeddings
* const result = await fileIndexer.indexFile(
* '/Users/user/project/docs/manual.pdf',
* '/Users/user/project',
* true
* );
* console.log('Extracted and indexed PDF:', result.path);
* console.log('Chunks created:', result.chunks_created);
*
* @example
* // Index an image with VL description
* const result = await fileIndexer.indexFile(
* '/Users/user/project/images/diagram.png',
* '/Users/user/project',
* true
* );
* console.log('Image indexed with description:', result.path);
* // VL model generates text description, then embeds it
*
* @example
* // Handle indexing errors
* try {
* await fileIndexer.indexFile(filePath, rootPath, true);
* } catch (error) {
* if (error.message === 'Binary or non-indexable file') {
* console.log('Skipped binary file');
* } else {
* console.error('Indexing failed:', error.message);
* }
* }
*/
async indexFile(filePath: string, rootPath: string, generateEmbeddings: boolean = false, watchConfigId?: string): Promise<IndexResult> {
const session = this.driver.session();
let content: string = '';
let isImage = false;
// CRITICAL: Detect provider BEFORE making content storage decisions
// This ensures NornicDB detection happens before shouldStoreFullContent is evaluated
if (!this.providerDetected) {
await this.detectDatabaseProvider();
this.providerDetected = true;
if (this.isNornicDB) {
console.log('🗄️ FileIndexer: NornicDB detected - full content will be stored for native embedding');
}
}
try {
const relativePath = path.relative(rootPath, filePath);
const extension = path.extname(filePath).toLowerCase();
const binaryDoc = this.documentParser.isSupportedFormat(extension);
// Check if this is an image file BEFORE the binary skip
if (ImageProcessor.isImageFile(filePath) && generateEmbeddings) {
await this.initImageServices();
const config = await this.configLoader.getEmbeddingsConfig();
if (config?.images?.enabled) {
isImage = true;
// For NornicDB: ALWAYS use VL description mode (NornicDB can only embed text)
// For Neo4j: Use configured mode (describeMode or direct multimodal)
const useVLDescription = this.isNornicDB || config.images.describeMode;
if (useVLDescription && this.vlService && this.imageProcessor) {
// Path 1: VL Description Method (DEFAULT, REQUIRED for NornicDB)
// Uses VL model to generate text description, then embeds the description
console.log(`🖼️ Processing image with VL description: ${relativePath}${this.isNornicDB ? ' (NornicDB requires text)' : ''}`);
// 1. Prepare image (resize if needed)
const processedImage = await this.imageProcessor.prepareImageForVL(filePath);
if (processedImage.wasResized) {
console.log(` Resized from ${processedImage.originalSize.width}×${processedImage.originalSize.height} to ${processedImage.processedSize.width}×${processedImage.processedSize.height}`);
}
// 2. Create Data URL
const dataURL = this.imageProcessor.createDataURL(processedImage.base64, processedImage.format);
// 3. Get description from VL model
const result = await this.vlService.describeImage(dataURL);
content = result.description;
console.log(` Generated description (${content.length} chars) in ${result.processingTimeMs}ms`);
} else if (!config.images.describeMode && !this.isNornicDB && this.imageProcessor) {
// Path 2: Direct Multimodal Embedding (Neo4j only)
// Sends image directly to multimodal embeddings endpoint
// NOTE: This path is NOT available for NornicDB (requires text content)
console.log(`🖼️ Processing image with direct multimodal embedding: ${relativePath}`);
// 1. Prepare image (resize if needed)
const processedImage = await this.imageProcessor.prepareImageForVL(filePath);
if (processedImage.wasResized) {
console.log(` Resized from ${processedImage.originalSize.width}×${processedImage.originalSize.height} to ${processedImage.processedSize.width}×${processedImage.processedSize.height}`);
}
// 2. Create Data URL for embedding
const dataURL = this.imageProcessor.createDataURL(processedImage.base64, processedImage.format);
// 3. Store the data URL as content - will be sent to embeddings service
// The embeddings service will handle multimodal input
content = dataURL;
console.log(` Prepared image for direct embedding (${processedImage.sizeBytes} bytes)`);
} else if (this.isNornicDB && !this.vlService) {
// NornicDB requires VL service for image embedding (can't do multimodal)
console.warn(`⚠️ Skipping image ${relativePath}: NornicDB requires VL service for image descriptions`);
throw new Error('Image indexing requires VL service for NornicDB (text-only embedding)');
} else {
// Missing required services
const missingServices: string[] = [];
if (!this.imageProcessor) missingServices.push('ImageProcessor');
if (config.images.describeMode && !this.vlService) missingServices.push('VLService');
throw new Error(`Image processing requires: ${missingServices.join(', ')}`);
}
} else {
// Images disabled, skip
throw new Error('Image indexing disabled');
}
} else if (binaryDoc) {
// Extract text from PDF or DOCX
const buffer = await fs.readFile(filePath);
content = await this.documentParser.extractText(buffer, extension);
console.log(`📄 Extracted ${content.length} chars from ${extension} document: ${relativePath}`);
} else if (!this.shouldSkipFile(filePath, extension)) {
// Read as plain text file
content = await fs.readFile(filePath, 'utf-8');
// Check if content is actually text (not binary masquerading as text)
if (!this.isTextContent(content)) {
throw new Error('Binary content detected');
}
} else {
throw new Error('Binary or non-indexable file');
}
const stats = await fs.stat(filePath);
const language = this.detectLanguage(filePath);
// Check if file already has chunks
let hasExistingChunks = false;
if (generateEmbeddings) {
const checkResult = await session.run(
`MATCH (f:File {path: $path})-[:HAS_CHUNK]->(c:FileChunk)
RETURN count(c) AS chunk_count, f.last_modified AS last_modified`,
{ path: filePath }
);
if (checkResult.records.length > 0) {
const chunkCount = checkResult.records[0].get('chunk_count').toNumber();
hasExistingChunks = chunkCount > 0;
const existingModified = checkResult.records[0].get('last_modified');
// Re-generate if file was modified
if (hasExistingChunks && existingModified) {
const existingModifiedDate = new Date(existingModified);
if (stats.mtime > existingModifiedDate) {
console.log(`📝 File modified, regenerating chunks: ${relativePath}`);
hasExistingChunks = false;
// Delete old chunks (use filePath which is the absolute container path)
await session.run(
`MATCH (f:File {path: $path})-[:HAS_CHUNK]->(c:FileChunk)
DETACH DELETE c`,
{ path: filePath }
);
}
}
}
}
// Determine if file needs chunking (heuristic size threshold; actual chunk
// boundaries are computed by EmbeddingsService)
const needsChunking = generateEmbeddings && content.length > 1000;
// Storage strategy:
// - If NornicDB → ALWAYS store full content (NornicDB handles chunking/embedding natively)
// - If embeddings ENABLED and file is LARGE → Store in chunks (chunk nodes) + no content on File node
// - If embeddings DISABLED → ALWAYS store full content on File node (enables full-text search)
// - If embeddings ENABLED and file is SMALL → Store content on File node + embedding
const shouldStoreFullContent = this.isNornicDB || !generateEmbeddings || !needsChunking;
// For NornicDB: Enrich content with metadata BEFORE storing
// This gives NornicDB's embedding worker richer context for better embeddings
// (Same enrichment that happens for Neo4j in the chunking/embedding path below)
let contentToStore = content;
if (this.isNornicDB && shouldStoreFullContent && generateEmbeddings) {
const fileMetadata: FileMetadata = {
name: path.basename(filePath),
relativePath: relativePath,
language: language,
extension: extension,
directory: path.dirname(relativePath),
sizeBytes: stats.size
};
const metadataPrefix = formatMetadataForEmbedding(fileMetadata);
contentToStore = metadataPrefix + content;
console.log(`📝 Enriched content for NornicDB embedding: ${relativePath} (+${metadataPrefix.length} chars metadata)`);
}
// Create File node with BOTH container and host paths
// f.path = absolute container path (e.g., /app/docs/README.md)
// f.host_path = absolute host path (e.g., /Users/user/src/Mimir/docs/README.md)
// When not in Docker, both paths are the same
// Convert container path to host path using environment variables
const hostPath = this.translateToHostPath(filePath);
// Use host path for logging (fall back to container path if not available)
const displayPath = hostPath || filePath;
// Wrap File node creation with retry logic to handle deadlocks
const fileResult = await this.retryNeo4jTransaction(async () => {
// Create File node and optionally link to WatchConfig
const query = watchConfigId ? `
MERGE (f:File:Node {path: $path})
ON CREATE SET f.id = 'file-' + toString(timestamp()) + '-' + substring(randomUUID(), 0, 8)
SET
f.host_path = $host_path,
f.name = $name,
f.extension = $extension,
f.language = $language,
f.size_bytes = $size_bytes,
f.line_count = $line_count,
f.last_modified = $last_modified,
f.indexed_date = datetime(),
f.type = 'file',
f.has_chunks = $has_chunks,
f.content = $content
WITH f
MATCH (wc:WatchConfig {id: $watchConfigId})
MERGE (wc)-[:WATCHES]->(f)
MERGE (f)-[:WATCHED_BY]->(wc)
RETURN f.path AS path, f.size_bytes AS size_bytes, id(f) AS node_id
` : `
MERGE (f:File:Node {path: $path})
ON CREATE SET f.id = 'file-' + toString(timestamp()) + '-' + substring(randomUUID(), 0, 8)
SET
f.host_path = $host_path,
f.name = $name,
f.extension = $extension,
f.language = $language,
f.size_bytes = $size_bytes,
f.line_count = $line_count,
f.last_modified = $last_modified,
f.indexed_date = datetime(),
f.type = 'file',
f.has_chunks = $has_chunks,
f.content = $content
RETURN f.path AS path, f.size_bytes AS size_bytes, id(f) AS node_id
`;
return await session.run(query, {
path: filePath, // Now stores absolute container path
host_path: hostPath,
name: path.basename(filePath),
extension: extension,
language: language,
size_bytes: stats.size,
line_count: content.split('\n').length,
last_modified: stats.mtime.toISOString(),
has_chunks: needsChunking,
content: shouldStoreFullContent ? contentToStore : null, // Store enriched content for NornicDB, raw content for Neo4j
watchConfigId: watchConfigId || null
});
}, `Create/update File node for ${relativePath}`);
const fileNodeId = fileResult.records[0].get('node_id');
let chunksCreated = 0;
// Generate and store embeddings if enabled and not already present
// Skip embedding generation for NornicDB (database handles it natively)
if (generateEmbeddings && !hasExistingChunks && !this.isNornicDB) {
await this.initEmbeddings();
if (this.embeddingsService.isEnabled()) {
try {
// Prepare metadata for enrichment (ALL files get metadata enrichment)
const fileMetadata: FileMetadata = {
name: path.basename(filePath),
relativePath: relativePath,
language: language,
extension: extension,
directory: path.dirname(relativePath),
sizeBytes: stats.size
};
// Format metadata as natural language prefix
const metadataPrefix = formatMetadataForEmbedding(fileMetadata);
if (needsChunking) {
// Large file: Generate separate chunk embeddings with metadata
// Prepend metadata to the FULL content before chunking
const enrichedContent = metadataPrefix + content;
const chunkEmbeddings = await this.embeddingsService.generateChunkEmbeddings(enrichedContent);
// Create FileChunk nodes with embeddings
// Using content-based hashing for deterministic IDs (idempotent re-indexing)
// Using parent_file_id property instead of NEXT_CHUNK relationships for simpler queries
const totalChunks = chunkEmbeddings.length;
for (const chunk of chunkEmbeddings) {
// Generate deterministic chunk ID based on content hash
// This ensures the same chunk always gets the same ID
const chunkId = generateChunkId(relativePath, chunk.chunkIndex, chunk.text);
await session.run(`
MATCH (f:File) WHERE id(f) = $fileNodeId
MERGE (c:FileChunk:Node {id: $chunkId})
SET
c.chunk_index = $chunkIndex,
c.text = $text,
c.start_offset = $startOffset,
c.end_offset = $endOffset,
c.embedding = $embedding,
c.embedding_dimensions = $dimensions,
c.embedding_model = $model,
c.type = 'file_chunk',
c.indexed_date = datetime(),
c.filePath = f.path,
c.fileName = f.name,
c.parent_file_id = $parentFileId,
c.total_chunks = $totalChunks,
c.has_next = $hasNext,
c.has_prev = $hasPrev
MERGE (f)-[:HAS_CHUNK {index: $chunkIndex}]->(c)
`, {
fileNodeId,
chunkId,
chunkIndex: chunk.chunkIndex,
text: chunk.text,
startOffset: chunk.startOffset,
endOffset: chunk.endOffset,
embedding: chunk.embedding,
dimensions: chunk.dimensions,
model: chunk.model,
parentFileId: fileNodeId,
totalChunks,
hasNext: chunk.chunkIndex < totalChunks - 1,
hasPrev: chunk.chunkIndex > 0
});
chunksCreated++;
}
console.log(`✅ Created ${chunksCreated} chunk embeddings for ${displayPath}`);
} else {
// Small file: Store embedding directly on File node with metadata enrichment
const enrichedContent = metadataPrefix + content;
const embedding = await this.embeddingsService.generateEmbedding(enrichedContent);
await session.run(`
MATCH (f:File) WHERE id(f) = $fileNodeId
SET
f.embedding = $embedding,
f.embedding_dimensions = $dimensions,
f.embedding_model = $model,
f.has_embedding = true
`, {
fileNodeId,
embedding: embedding.embedding,
dimensions: embedding.dimensions,
model: embedding.model
});
console.log(`✅ Created file embedding for ${displayPath}`);
}
} catch (error: any) {
console.warn(`⚠️ Failed to generate embeddings for ${displayPath}: ${error.message}`);
}
}
} else if (generateEmbeddings && hasExistingChunks) {
console.log(`⏭️ Skipping embeddings (already exist): ${displayPath}`);
}
return {
file_node_id: `file-${fileNodeId}`,
path: relativePath,
size_bytes: stats.size,
chunks_created: chunksCreated
};
} catch (error: any) {
// Provide specific error messages for different skip reasons
const relativePath = path.relative(rootPath, filePath);
if (error.message === 'Binary or non-indexable file') {
// Silently skip - these are expected
throw error;
}
if (error.message === 'Binary content detected') {
console.warn(`⚠️ Skipping file with binary content: ${relativePath}`);
throw new Error('Binary file');
}
// UTF-8 decode errors
if (error.code === 'ERR_INVALID_ARG_TYPE' || error.message?.includes('invalid')) {
console.warn(`⚠️ Skipping file (UTF-8 decode error): ${relativePath}`);
throw new Error('Binary file');
}
throw error;
} finally {
await session.close();
}
}
/**
* Check if content is actually text (not binary)
*
* Industry-standard approach:
* 1. Check for null bytes (definitive binary indicator)
* 2. Check for high concentration of control characters (0x00-0x08, 0x0E-0x1F)
* 3. Allow all valid Unicode including emojis, CJK, extended Latin, etc.
*
* This properly handles:
* - UTF-8 encoded files with emojis (🔧, 📄, etc.)
* - Files with non-ASCII characters (Chinese, Japanese, Arabic, etc.)
* - Files with special symbols and mathematical notation
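*
* @example
* // Illustrative results:
* // isTextContent('hello 🔧 world') -> true  (valid surrogate pair)
* // isTextContent('a\0b')           -> false (null byte)
* // isTextContent('\x01\x02\x03')   -> false (>10% control characters)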
*/
private isTextContent(content: string): boolean {
// Empty content is treated as text (no binary indicators)
if (content.length === 0) {
return true;
}
// Check for null bytes (definitive binary indicator)
if (content.includes('\0')) {
return false;
}
// Sample first 8KB for performance (industry standard sample size)
const sampleSize = Math.min(content.length, 8192);
let controlCharCount = 0;
for (let i = 0; i < sampleSize; i++) {
const code = content.charCodeAt(i);
// Count problematic control characters (0x00-0x08, 0x0E-0x1F)
// The excluded range 0x09-0x0D covers whitespace controls that legitimately
// appear in text files: tab, newline, vertical tab, form feed, carriage return
if ((code >= 0x00 && code <= 0x08) || (code >= 0x0E && code <= 0x1F)) {
controlCharCount++;
}
// Check for lone surrogate code units (invalid UTF-16)
// High surrogate (0xD800-0xDBFF) must be followed by low surrogate (0xDC00-0xDFFF)
if (code >= 0xD800 && code <= 0xDBFF) {
const nextCode = i + 1 < sampleSize ? content.charCodeAt(i + 1) : 0;
if (nextCode < 0xDC00 || nextCode > 0xDFFF) {
// Lone high surrogate - likely binary or corrupted
controlCharCount++;
} else {
// Valid surrogate pair - skip the low surrogate on next iteration
i++;
}
} else if (code >= 0xDC00 && code <= 0xDFFF) {
// Lone low surrogate without preceding high surrogate
controlCharCount++;
}
}
// If more than 10% control characters, likely binary
// This is more permissive than the old 85% printable threshold
const controlRatio = controlCharCount / sampleSize;
return controlRatio < 0.10;
}
/**
* Check if file should be skipped (binary, images, archives, sensitive files, etc.)
* Note: PDF and DOCX are in this list but handled separately via DocumentParser
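*
* @example
* // Override the default sensitive-file name list (comma-separated, values illustrative):
* // MIMIR_SENSITIVE_FILES=.env,.npmrc,secrets.json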
*/
private shouldSkipFile(filePath: string, extension: string): boolean {
// Binary and non-text file extensions to skip
const skipExtensions = new Set([
// Images
'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.svg', '.webp', '.tiff', '.tif',
// Videos
'.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.m4v',
// Audio
'.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac', '.wma',
// Archives
'.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
// Executables and binaries
'.exe', '.dll', '.so', '.dylib', '.bin', '.dat', '.app',
// Compiled/bytecode
'.pyc', '.pyo', '.class', '.o', '.obj', '.wasm',
// Documents (binary formats) - PDF/DOCX supported via DocumentParser
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
// Fonts
'.ttf', '.otf', '.woff', '.woff2', '.eot',
// Database files
'.db', '.sqlite', '.sqlite3', '.mdb',
// IDE/Editor files
'.swp', '.swo', '.DS_Store', '.idea',
// Lock files (often auto-generated)
'.lock',
// Other binary formats
'.pkl', '.pickle', '.parquet', '.avro', '.protobuf', '.pb',
// Sensitive file extensions (Industry Standard Security)
'.pem', '.key', '.p12', '.pfx', '.cer', '.crt', '.der', // Certificates & Private Keys
'.keystore', '.jks', '.bks', // Java keystores
'.ppk', '.pub', // SSH keys
'.credentials', '.secret', // Credential files
'.log' // Logs (may contain sensitive data)
]);
// Check extension
if (skipExtensions.has(extension)) {
return true;
}
// Skip specific file names that are binary or auto-generated
const fileName = path.basename(filePath);
const binaryFileNames = new Set([
'package-lock.json', // Too large and auto-generated
'yarn.lock', // Too large and auto-generated
'pnpm-lock.yaml', // Too large and auto-generated
'.DS_Store',
'Thumbs.db',
'desktop.ini'
]);
if (binaryFileNames.has(fileName)) {
return true;
}
// Skip sensitive files by name (Industry Standard Security)
// Configurable via MIMIR_SENSITIVE_FILES environment variable (comma-separated)
const defaultSensitiveFiles = [
'.env', '.env.local', '.env.development', '.env.production', '.env.test', '.env.staging', '.env.example', // Environment files
'.npmrc', '.yarnrc', '.pypirc', // Package manager configs with tokens
'.netrc', '_netrc', // FTP/HTTP credentials
'id_rsa', 'id_dsa', 'id_ecdsa', 'id_ed25519', // SSH private keys
'credentials', 'secrets.yml', 'secrets.yaml', 'secrets.json',
'master.key', 'production.key' // Rails secrets
];
const sensitiveFileNames = new Set(
process.env.MIMIR_SENSITIVE_FILES
? process.env.MIMIR_SENSITIVE_FILES.split(',').map(f => f.trim()).filter(f => f.length > 0)
: defaultSensitiveFiles
);
if (sensitiveFileNames.has(fileName)) {
return true;
}
// Skip files with sensitive patterns in name
const lowerFileName = fileName.toLowerCase();
const sensitivePatterns = [
'password', 'passwd', 'secret', 'credential', 'token', 'apikey', 'api_key', 'private_key'
];
for (const pattern of sensitivePatterns) {
if (lowerFileName.includes(pattern)) {
return true;
}
}
return false;
}
/**
* Detect language from file extension
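*
* @example
* // detectLanguage('/src/App.tsx') -> 'typescript'
* // detectLanguage('/Makefile')    -> 'generic' (no mapped extension)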
*/
private detectLanguage(filePath: string): string {
const ext = path.extname(filePath).toLowerCase();
const languageMap: Record<string, string> = {
'.ts': 'typescript',
'.tsx': 'typescript',
'.js': 'javascript',
'.jsx': 'javascript',
'.py': 'python',
'.java': 'java',
'.go': 'go',
'.rs': 'rust',
'.cpp': 'cpp',
'.c': 'c',
'.cs': 'csharp',
'.rb': 'ruby',
'.php': 'php',
'.md': 'markdown',
'.json': 'json',
'.yaml': 'yaml',
'.yml': 'yaml',
'.xml': 'xml',
'.html': 'html',
'.css': 'css',
'.scss': 'scss',
'.sql': 'sql'
};
return languageMap[ext] || 'generic';
}
/**
* Delete file node and all associated chunks from Neo4j
*
* Removes the File node and cascades to delete all FileChunk nodes
* and their relationships. Use this when files are deleted from disk
* or need to be removed from the index.
*
* @param filePath - Path matching the File node's stored `path` property
*   (the same absolute path that was passed to indexFile)
*
* @example
* // Delete a file from index when deleted from disk
* await fileIndexer.deleteFile('/Users/user/project/src/auth.ts');
* console.log('File removed from index');
*
* @example
* // Clean up after file move/rename
* await fileIndexer.deleteFile('/old/path/file.ts');
* await fileIndexer.indexFile('/new/path/file.ts', rootPath, true);
* console.log('File re-indexed at new location');
*
* @example
* // Batch delete multiple files
* const deletedFiles = ['/project/src/old1.ts', '/project/src/old2.ts', '/project/src/old3.ts'];
* for (const file of deletedFiles) {
* await fileIndexer.deleteFile(file);
* }
* console.log('Cleaned up', deletedFiles.length, 'files');
*/
async deleteFile(filePath: string): Promise<void> {
const session = this.driver.session();
try {
// DETACH DELETE automatically removes relationships and connected chunks
await session.run(`
MATCH (f:File {path: $path})
OPTIONAL MATCH (f)-[:HAS_CHUNK]->(c:FileChunk)
DETACH DELETE f, c
`, { path: filePath });
} finally {
await session.close();
}
}
/**
* Update file content and embeddings after file modification
*
* Re-indexes the file to update its content and, when requested, regenerate
* embeddings. indexFile detects whether the file was modified and regenerates
* chunks as needed. This is the recommended way to handle file changes.
*
* @param filePath - Absolute path to modified file
* @param rootPath - Root directory path
* @param generateEmbeddings - Whether to regenerate vector embeddings (default: false)
*
* @example
* // Update file content and embeddings after modification
* await fileIndexer.updateFile(
* '/Users/user/project/src/auth.ts',
* '/Users/user/project',
* true
* );
* console.log('File content and embeddings updated');
*
* @example
* // Handle file watcher events
* watcher.on('change', async (filePath) => {
* console.log('File changed:', filePath);
* await fileIndexer.updateFile(filePath, rootPath);
* console.log('Index updated');
* });
*
* @example
* // Batch update multiple changed files
* const changedFiles = await getModifiedFiles();
* for (const file of changedFiles) {
* await fileIndexer.updateFile(file, rootPath);
* }
* console.log('Updated', changedFiles.length, 'files');
*/
async updateFile(filePath: string, rootPath: string, generateEmbeddings: boolean = false): Promise<void> {
// Re-index the file; indexFile compares modification times and regenerates chunks as needed
await this.indexFile(filePath, rootPath, generateEmbeddings);
}
}