
M.I.M.I.R - Multi-agent Intelligent Memory & Insight Repository

by orneryd
FileIndexer.ts (39.8 kB)
// ============================================================================
// FileIndexer - Index files into Neo4j
// Phase 1: Basic file indexing with content
// Phase 2: Vector embeddings for semantic search
// Phase 3: Parse and extract functions/classes (future)
// ============================================================================

import { Driver } from 'neo4j-driver';
import { promises as fs } from 'fs';
import path from 'path';
import { createHash } from 'crypto';
import {
  EmbeddingsService,
  ChunkEmbeddingResult,
  formatMetadataForEmbedding,
  FileMetadata
} from './EmbeddingsService.js';
import { DocumentParser } from './DocumentParser.js';
import { getHostWorkspaceRoot } from '../utils/path-utils.js';
import { ImageProcessor } from './ImageProcessor.js';
import { VLService } from './VLService.js';
import { LLMConfigLoader } from '../config/LLMConfigLoader.js';

/**
 * Generate a deterministic hash-based ID for content.
 * This ensures idempotent re-indexing without duplicate creation issues.
 */
export function generateContentHash(content: string, prefix: string = ''): string {
  const hash = createHash('sha256').update(content).digest('hex').slice(0, 16);
  return prefix ? `${prefix}-${hash}` : hash;
}

/**
 * Generate a deterministic chunk ID based on file path and chunk content.
 */
export function generateChunkId(filePath: string, chunkIndex: number, chunkText: string): string {
  const contentHash = generateContentHash(chunkText);
  return `chunk-${generateContentHash(filePath)}-${chunkIndex}-${contentHash}`;
}

/**
 * Generate a deterministic file ID based on path.
 */
export function generateFileId(filePath: string): string {
  return `file-${generateContentHash(filePath)}`;
}
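// Illustrative sketch (values hypothetical): because the helpers above are
// pure functions of their inputs, re-running them over unchanged content
// yields identical IDs, which is what makes re-indexing idempotent.
//
//   const fileId = generateFileId('/workspace/project/src/auth.ts');
//   // e.g. 'file-3f2a9c1d8b7e6a54' - stable across runs
//
//   const chunkId = generateChunkId('/workspace/project/src/auth.ts', 0, 'export function login() {}');
//   // 'chunk-<pathHash>-0-<contentHash>' - changes only if the chunk text changes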
export interface IndexResult {
  file_node_id: string;
  path: string;
  size_bytes: number;
  chunks_created?: number;
}

export class FileIndexer {
  private embeddingsService: EmbeddingsService;
  private embeddingsInitialized: boolean = false;
  private embeddingsInitPromise: Promise<void> | null = null; // Mutex for initialization
  private documentParser: DocumentParser;
  private imageProcessor: ImageProcessor | null = null;
  private vlService: VLService | null = null;
  private configLoader: LLMConfigLoader;
  private isNornicDB: boolean = false;
  private providerDetected: boolean = false;

  constructor(private driver: Driver) {
    this.embeddingsService = new EmbeddingsService();
    this.documentParser = new DocumentParser();
    this.configLoader = LLMConfigLoader.getInstance();
  }

  /**
   * Detect database provider (NornicDB vs Neo4j).
   * Same detection logic as GraphManager for consistency.
   */
  private async detectDatabaseProvider(): Promise<void> {
    // Check for manual override
    const manualProvider = process.env.MIMIR_DATABASE_PROVIDER?.toLowerCase();
    if (manualProvider === 'nornicdb') {
      this.isNornicDB = true;
      return;
    } else if (manualProvider === 'neo4j') {
      this.isNornicDB = false;
      return;
    }

    // Auto-detect via server metadata
    const session = this.driver.session();
    try {
      const result = await session.run('RETURN 1 as test');
      const serverAgent = result.summary.server?.agent || '';
      if (serverAgent.toLowerCase().includes('nornicdb')) {
        this.isNornicDB = true;
      } else {
        this.isNornicDB = false;
      }
    } catch (error) {
      // Default to Neo4j on error
      this.isNornicDB = false;
    } finally {
      await session.close();
    }
  }

  /**
   * Initialize embeddings service (lazy loading).
   * Skips initialization if connected to NornicDB.
   * Uses a mutex pattern to prevent race conditions in concurrent calls.
   */
  private async initEmbeddings(): Promise<void> {
    // If already initialized, return immediately
    if (this.embeddingsInitialized) {
      return;
    }

    // If initialization is in progress, wait for it
    if (this.embeddingsInitPromise) {
      return this.embeddingsInitPromise;
    }

    // Start initialization and store the promise for concurrent callers
    this.embeddingsInitPromise = this.doInitEmbeddings();
    try {
      await this.embeddingsInitPromise;
    } finally {
      // Clear the promise after completion (success or failure)
      this.embeddingsInitPromise = null;
    }
  }

  /**
   * Internal initialization logic (called once via mutex)
   */
  private async doInitEmbeddings(): Promise<void> {
    // Detect provider on first call
    if (!this.providerDetected) {
      await this.detectDatabaseProvider();
      this.providerDetected = true;
      if (this.isNornicDB) {
        console.log('🗄️ FileIndexer: NornicDB detected - skipping embeddings service initialization');
      }
    }

    // Only initialize embeddings service for Neo4j
    if (!this.isNornicDB) {
      await this.embeddingsService.initialize();
    }

    this.embeddingsInitialized = true;
  }
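  // Illustrative sketch of the promise-mutex above (hypothetical standalone
  // names, not part of this class): concurrent callers share one in-flight
  // initialization instead of racing to initialize twice.
  //
  //   let initialized = false;
  //   let initPromise: Promise<void> | null = null;
  //   async function initOnce(doInit: () => Promise<void>): Promise<void> {
  //     if (initialized) return;
  //     if (!initPromise) initPromise = doInit().then(() => { initialized = true; });
  //     try {
  //       await initPromise;
  //     } finally {
  //       initPromise = null; // a failed init can be retried on the next call
  //     }
  //   }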
  /**
   * Retry wrapper for Neo4j transactions with exponential backoff.
   * Handles deadlocks and other transient database errors.
   *
   * @param fn - Async function to execute with retry logic
   * @param operation - Description of operation for logging
   * @param maxRetries - Maximum number of retry attempts (default: 3)
   * @returns Result of the function execution
   *
   * @example
   * await this.retryNeo4jTransaction(async () => {
   *   return await session.run('MERGE (n:Node {id: $id})', {id: '123'});
   * }, 'Create node', 3);
   */
  private async retryNeo4jTransaction<T>(
    fn: () => Promise<T>,
    operation: string,
    maxRetries: number = 3
  ): Promise<T> {
    let lastError: any;

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return await fn();
      } catch (error: any) {
        lastError = error;

        // Check for retryable Neo4j errors
        const isDeadlock =
          error.message?.includes('DeadlockDetected') ||
          error.message?.includes('can\'t acquire') ||
          error.message?.includes('ForsetiClient') ||
          error.code === 'Neo.TransientError.Transaction.DeadlockDetected';
        const isLockTimeout =
          error.message?.includes('LockClient') ||
          error.code === 'Neo.TransientError.Transaction.LockClientStopped';
        const isTransient = error.code?.startsWith('Neo.TransientError');
        const isRetryable = isDeadlock || isLockTimeout || isTransient;

        // Don't retry on final attempt or non-retryable errors
        if (!isRetryable || attempt === maxRetries) {
          throw error;
        }

        // Exponential backoff with jitter: 100ms, 200ms, 400ms, 800ms...
        const baseDelay = 100 * Math.pow(2, attempt);
        const jitter = Math.random() * 50; // Add 0-50ms random jitter
        const delayMs = Math.min(baseDelay + jitter, 2000);

        const errorType = isDeadlock ? 'deadlock' : isLockTimeout ? 'lock timeout' : 'transient';
        console.warn(
          `⚠️ ${operation} failed with Neo4j ${errorType} error ` +
          `(attempt ${attempt + 1}/${maxRetries + 1}). Retrying in ${Math.round(delayMs)}ms...`
        );

        await new Promise(resolve => setTimeout(resolve, delayMs));
      }
    }

    throw lastError;
  }
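  // Backoff sketch: with the default maxRetries = 3, the delay before each
  // retry works out to roughly 100ms, 200ms, 400ms (plus 0-50ms jitter, capped
  // at 2000ms). The jitter de-synchronizes competing writers so transactions
  // that deadlocked once are unlikely to collide again on retry.
  //
  //   for (let attempt = 0; attempt < 3; attempt++) {
  //     const delay = Math.min(100 * Math.pow(2, attempt) + Math.random() * 50, 2000);
  //     // attempt 0 -> ~100-150ms, attempt 1 -> ~200-250ms, attempt 2 -> ~400-450ms
  //   }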
  /**
   * Initialize image processing services (lazy loading)
   */
  private async initImageServices(): Promise<void> {
    const config = await this.configLoader.getEmbeddingsConfig();
    if (!config?.images?.enabled) {
      return;
    }

    // Initialize ImageProcessor
    if (!this.imageProcessor) {
      this.imageProcessor = new ImageProcessor({
        maxPixels: config.images.maxPixels,
        targetSize: config.images.targetSize,
        resizeQuality: config.images.resizeQuality
      });
    }

    // Initialize VLService if describe mode is enabled
    if (config.images.describeMode && config.vl && !this.vlService) {
      this.vlService = new VLService({
        provider: config.vl.provider,
        api: config.vl.api,
        apiPath: config.vl.apiPath,
        apiKey: config.vl.apiKey,
        model: config.vl.model,
        contextSize: config.vl.contextSize,
        maxTokens: config.vl.maxTokens,
        temperature: config.vl.temperature
      });
      console.log('🖼️ Image embedding services initialized (VL describe mode)');
    }
  }

  /**
   * Translate a container path to a host path,
   * e.g., /workspace/my-project/file.ts -> /Users/user/src/my-project/file.ts.
   * If not running in Docker, both paths are the same.
   */
  private translateToHostPath(containerPath: string): string {
    const workspaceRoot = process.env.WORKSPACE_ROOT;
    const hostWorkspaceRoot = process.env.HOST_WORKSPACE_ROOT;

    // If WORKSPACE_ROOT is not set, we're running locally - no translation needed
    if (!workspaceRoot) {
      return containerPath;
    }

    // If HOST_WORKSPACE_ROOT is not set but WORKSPACE_ROOT is, something is misconfigured
    if (!hostWorkspaceRoot) {
      console.warn('⚠️ WORKSPACE_ROOT is set but HOST_WORKSPACE_ROOT is not - path translation may fail');
      return containerPath;
    }

    // Expand tilde in HOST_WORKSPACE_ROOT for consistent storage.
    // Use getHostWorkspaceRoot(), which properly expands the tilde.
    const expandedHostRoot = getHostWorkspaceRoot();

    // Replace container workspace root with expanded host workspace root
    if (containerPath.startsWith(workspaceRoot)) {
      return containerPath.replace(workspaceRoot, expandedHostRoot);
    }

    return containerPath;
  }
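  // Illustrative mapping (hypothetical env values, mirroring the doc comment above):
  //   WORKSPACE_ROOT=/workspace  HOST_WORKSPACE_ROOT=~/src
  //   translateToHostPath('/workspace/my-project/file.ts')
  //     -> '/Users/user/src/my-project/file.ts'  (tilde expanded via getHostWorkspaceRoot())
  //   translateToHostPath('/tmp/outside.ts')
  //     -> '/tmp/outside.ts'  (unchanged: not under the workspace root)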
  /**
   * Index a single file into Neo4j with optional vector embeddings.
   *
   * Creates a File node in the graph database with metadata and content.
   * For large files with embeddings enabled, splits content into chunks
   * with individual embeddings for precise semantic search (industry standard).
   *
   * Indexing Strategy:
   * - **Small files** (<1000 chars): Single embedding on File node
   * - **Large files** (>1000 chars): Multiple FileChunk nodes with embeddings
   * - **No embeddings**: Full content stored on File node for full-text search
   *
   * Supported Formats:
   * - Text files (.ts, .js, .py, .md, .json, etc.)
   * - PDF documents (text extraction)
   * - DOCX documents (text extraction)
   * - Images (.png, .jpg, etc.) with VL description or multimodal embedding
   *
   * @param filePath - Absolute path to file
   * @param rootPath - Root directory path for calculating relative paths
   * @param generateEmbeddings - Whether to generate vector embeddings
   * @param watchConfigId - Optional WatchConfig node ID to link the file to
   * @returns Index result with file node ID, path, size, and chunk count
   * @throws {Error} If file is binary, non-indexable, or processing fails
   *
   * @example
   * // Index a TypeScript file without embeddings
   * const result = await fileIndexer.indexFile(
   *   '/Users/user/project/src/auth.ts',
   *   '/Users/user/project',
   *   false
   * );
   * console.log('Indexed:', result.path);
   * console.log('Size:', result.size_bytes, 'bytes');
   * // File content stored on File node for full-text search
   *
   * @example
   * // Index a large file with embeddings (chunked)
   * const result = await fileIndexer.indexFile(
   *   '/Users/user/project/docs/guide.md',
   *   '/Users/user/project',
   *   true
   * );
   * console.log('Created', result.chunks_created, 'chunks');
   * // Each chunk has its own embedding for precise semantic search
   *
   * @example
   * // Index a PDF document with embeddings
   * const result = await fileIndexer.indexFile(
   *   '/Users/user/project/docs/manual.pdf',
   *   '/Users/user/project',
   *   true
   * );
   * console.log('Extracted and indexed PDF:', result.path);
   * console.log('Chunks created:', result.chunks_created);
   *
   * @example
   * // Index an image with VL description
   * const result = await fileIndexer.indexFile(
   *   '/Users/user/project/images/diagram.png',
   *   '/Users/user/project',
   *   true
   * );
   * console.log('Image indexed with description:', result.path);
   * // VL model generates text description, then embeds it
   *
   * @example
   * // Handle indexing errors
   * try {
   *   await fileIndexer.indexFile(filePath, rootPath, true);
   * } catch (error) {
   *   if (error.message === 'Binary or non-indexable file') {
   *     console.log('Skipped binary file');
   *   } else {
   *     console.error('Indexing failed:', error.message);
   *   }
   * }
   */
  async indexFile(filePath: string, rootPath: string, generateEmbeddings: boolean = false, watchConfigId?: string): Promise<IndexResult> {
    const session = this.driver.session();
    let content: string = '';
    let isImage = false;

    // CRITICAL: Detect provider BEFORE making content storage decisions.
    // This ensures NornicDB detection happens before shouldStoreFullContent is evaluated.
    if (!this.providerDetected) {
      await this.detectDatabaseProvider();
      this.providerDetected = true;
      if (this.isNornicDB) {
        console.log('🗄️ FileIndexer: NornicDB detected - full content will be stored for native embedding');
      }
    }

    try {
      const relativePath = path.relative(rootPath, filePath);
      const extension = path.extname(filePath).toLowerCase();
      const binaryDoc = this.documentParser.isSupportedFormat(extension);

      // Check if this is an image file BEFORE the binary skip
      if (ImageProcessor.isImageFile(filePath) && generateEmbeddings) {
        await this.initImageServices();
        const config = await this.configLoader.getEmbeddingsConfig();

        if (config?.images?.enabled) {
          isImage = true;

          // For NornicDB: ALWAYS use VL description mode (NornicDB can only embed text).
          // For Neo4j: Use the configured mode (describeMode or direct multimodal).
          const useVLDescription = this.isNornicDB || config.images.describeMode;

          if (useVLDescription && this.vlService && this.imageProcessor) {
            // Path 1: VL Description Method (DEFAULT, REQUIRED for NornicDB).
            // Uses a VL model to generate a text description, then embeds the description.
            console.log(`🖼️ Processing image with VL description: ${relativePath}${this.isNornicDB ? ' (NornicDB requires text)' : ''}`);

            // 1. Prepare image (resize if needed)
            const processedImage = await this.imageProcessor.prepareImageForVL(filePath);
            if (processedImage.wasResized) {
              console.log(`   Resized from ${processedImage.originalSize.width}×${processedImage.originalSize.height} to ${processedImage.processedSize.width}×${processedImage.processedSize.height}`);
            }

            // 2. Create Data URL
            const dataURL = this.imageProcessor.createDataURL(processedImage.base64, processedImage.format);

            // 3. Get description from VL model
            const result = await this.vlService.describeImage(dataURL);
            content = result.description;
            console.log(`   Generated description (${content.length} chars) in ${result.processingTimeMs}ms`);
          } else if (!config.images.describeMode && !this.isNornicDB && this.imageProcessor) {
            // Path 2: Direct Multimodal Embedding (Neo4j only).
            // Sends the image directly to a multimodal embeddings endpoint.
            // NOTE: This path is NOT available for NornicDB (requires text content).
            console.log(`🖼️ Processing image with direct multimodal embedding: ${relativePath}`);

            // 1. Prepare image (resize if needed)
            const processedImage = await this.imageProcessor.prepareImageForVL(filePath);
            if (processedImage.wasResized) {
              console.log(`   Resized from ${processedImage.originalSize.width}×${processedImage.originalSize.height} to ${processedImage.processedSize.width}×${processedImage.processedSize.height}`);
            }

            // 2. Create Data URL for embedding
            const dataURL = this.imageProcessor.createDataURL(processedImage.base64, processedImage.format);

            // 3. Store the data URL as content - it will be sent to the embeddings
            // service, which handles multimodal input.
            content = dataURL;
            console.log(`   Prepared image for direct embedding (${processedImage.sizeBytes} bytes)`);
          } else if (this.isNornicDB && !this.vlService) {
            // NornicDB requires the VL service for image embedding (it can't do multimodal)
            console.warn(`⚠️ Skipping image ${relativePath}: NornicDB requires VL service for image descriptions`);
            throw new Error('Image indexing requires VL service for NornicDB (text-only embedding)');
          } else {
            // Missing required services
            const missingServices = [];
            if (!this.imageProcessor) missingServices.push('ImageProcessor');
            if (config.images.describeMode && !this.vlService) missingServices.push('VLService');
            throw new Error(`Image processing requires: ${missingServices.join(', ')}`);
          }
        } else {
          // Images disabled, skip
          throw new Error('Image indexing disabled');
        }
      } else if (binaryDoc) {
        // Extract text from PDF or DOCX
        const buffer = await fs.readFile(filePath);
        content = await this.documentParser.extractText(buffer, extension);
        console.log(`📄 Extracted ${content.length} chars from ${extension} document: ${relativePath}`);
      } else if (!this.shouldSkipFile(filePath, extension)) {
        // Read as plain text file
        content = await fs.readFile(filePath, 'utf-8');

        // Check if content is actually text (not binary masquerading as text)
        if (!this.isTextContent(content)) {
          throw new Error('Binary content detected');
        }
      } else {
        throw new Error('Binary or non-indexable file');
      }

      const stats = await fs.stat(filePath);
      const language = this.detectLanguage(filePath);

      // Check if file already has chunks
      let hasExistingChunks = false;
      if (generateEmbeddings) {
        const checkResult = await session.run(
          `MATCH (f:File {path: $path})-[:HAS_CHUNK]->(c:FileChunk)
           RETURN count(c) AS chunk_count, f.last_modified AS last_modified`,
          { path: filePath }
        );

        if (checkResult.records.length > 0) {
          const chunkCount = checkResult.records[0].get('chunk_count').toNumber();
          hasExistingChunks = chunkCount > 0;
          const existingModified = checkResult.records[0].get('last_modified');

          // Re-generate if the file was modified
          if (hasExistingChunks && existingModified) {
            const existingModifiedDate = new Date(existingModified);
            if (stats.mtime > existingModifiedDate) {
              console.log(`📝 File modified, regenerating chunks: ${relativePath}`);
              hasExistingChunks = false;

              // Delete old chunks (use filePath, which is the absolute container path)
              await session.run(
                `MATCH (f:File {path: $path})-[:HAS_CHUNK]->(c:FileChunk) DETACH DELETE c`,
                { path: filePath }
              );
            }
          }
        }
      }
      // Determine if file needs chunking (based on embeddings config chunk size)
      const needsChunking = generateEmbeddings && content.length > 1000; // Will be refined by EmbeddingsService

      // Storage strategy:
      // - If NornicDB → ALWAYS store full content (NornicDB handles chunking/embedding natively)
      // - If embeddings ENABLED and file is LARGE → Store in chunks (chunk nodes) + no content on File node
      // - If embeddings DISABLED → ALWAYS store full content on File node (enables full-text search)
      // - If embeddings ENABLED and file is SMALL → Store content on File node + embedding
      const shouldStoreFullContent = this.isNornicDB || !generateEmbeddings || !needsChunking;

      // For NornicDB: Enrich content with metadata BEFORE storing.
      // This gives NornicDB's embedding worker richer context for better embeddings
      // (the same enrichment that happens for Neo4j in the chunking/embedding path below).
      let contentToStore = content;
      if (this.isNornicDB && shouldStoreFullContent && generateEmbeddings) {
        const fileMetadata: FileMetadata = {
          name: path.basename(filePath),
          relativePath: relativePath,
          language: language,
          extension: extension,
          directory: path.dirname(relativePath),
          sizeBytes: stats.size
        };
        const metadataPrefix = formatMetadataForEmbedding(fileMetadata);
        contentToStore = metadataPrefix + content;
        console.log(`📝 Enriched content for NornicDB embedding: ${relativePath} (+${metadataPrefix.length} chars metadata)`);
      }

      // Create File node with BOTH container and host paths:
      //   f.path      = absolute container path (e.g., /app/docs/README.md)
      //   f.host_path = absolute host path (e.g., /Users/user/src/Mimir/docs/README.md)
      // When not in Docker, both paths are the same.

      // Convert container path to host path using environment variables
      const hostPath = this.translateToHostPath(filePath);

      // Use host path for logging (fall back to container path if not available)
      const displayPath = hostPath || filePath;
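      // Decision sketch (derived from the flags above, not from the original source):
      //   NornicDB, any embeddings, any size    -> full content on File node
      //   Neo4j,    embeddings off, any size    -> full content (full-text search)
      //   Neo4j,    embeddings on,  <=1000 ch   -> full content + File-level embedding
      //   Neo4j,    embeddings on,  >1000 ch    -> chunks only, File.content = null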
      // Wrap File node creation with retry logic to handle deadlocks
      const fileResult = await this.retryNeo4jTransaction(async () => {
        // Create File node and optionally link to WatchConfig
        const query = watchConfigId
          ? `
            MERGE (f:File:Node {path: $path})
            ON CREATE SET f.id = 'file-' + toString(timestamp()) + '-' + substring(randomUUID(), 0, 8)
            SET f.host_path = $host_path,
                f.name = $name,
                f.extension = $extension,
                f.language = $language,
                f.size_bytes = $size_bytes,
                f.line_count = $line_count,
                f.last_modified = $last_modified,
                f.indexed_date = datetime(),
                f.type = 'file',
                f.has_chunks = $has_chunks,
                f.content = $content
            WITH f
            MATCH (wc:WatchConfig {id: $watchConfigId})
            MERGE (wc)-[:WATCHES]->(f)
            MERGE (f)-[:WATCHED_BY]->(wc)
            RETURN f.path AS path, f.size_bytes AS size_bytes, id(f) AS node_id
          `
          : `
            MERGE (f:File:Node {path: $path})
            ON CREATE SET f.id = 'file-' + toString(timestamp()) + '-' + substring(randomUUID(), 0, 8)
            SET f.host_path = $host_path,
                f.name = $name,
                f.extension = $extension,
                f.language = $language,
                f.size_bytes = $size_bytes,
                f.line_count = $line_count,
                f.last_modified = $last_modified,
                f.indexed_date = datetime(),
                f.type = 'file',
                f.has_chunks = $has_chunks,
                f.content = $content
            RETURN f.path AS path, f.size_bytes AS size_bytes, id(f) AS node_id
          `;

        return await session.run(query, {
          path: filePath, // Stores the absolute container path
          host_path: hostPath,
          name: path.basename(filePath),
          extension: extension,
          language: language,
          size_bytes: stats.size,
          line_count: content.split('\n').length,
          last_modified: stats.mtime.toISOString(),
          has_chunks: needsChunking,
          content: shouldStoreFullContent ? contentToStore : null, // Enriched content for NornicDB, raw content for Neo4j
          watchConfigId: watchConfigId || null
        });
      }, `Create/update File node for ${relativePath}`);

      const fileNodeId = fileResult.records[0].get('node_id');
      let chunksCreated = 0;

      // Generate and store embeddings if enabled and not already present.
      // Skip embedding generation for NornicDB (the database handles it natively).
      if (generateEmbeddings && !hasExistingChunks && !this.isNornicDB) {
        await this.initEmbeddings();

        if (this.embeddingsService.isEnabled()) {
          try {
            // Prepare metadata for enrichment (ALL files get metadata enrichment)
            const fileMetadata: FileMetadata = {
              name: path.basename(filePath),
              relativePath: relativePath,
              language: language,
              extension: extension,
              directory: path.dirname(relativePath),
              sizeBytes: stats.size
            };

            // Format metadata as a natural language prefix
            const metadataPrefix = formatMetadataForEmbedding(fileMetadata);

            if (needsChunking) {
              // Large file: Generate separate chunk embeddings with metadata.
              // Prepend metadata to the FULL content before chunking.
              const enrichedContent = metadataPrefix + content;
              const chunkEmbeddings = await this.embeddingsService.generateChunkEmbeddings(enrichedContent);

              // Create FileChunk nodes with embeddings.
              // Using content-based hashing for deterministic IDs (idempotent re-indexing).
              // Using a parent_file_id property instead of NEXT_CHUNK relationships for simpler queries.
              const totalChunks = chunkEmbeddings.length;

              for (const chunk of chunkEmbeddings) {
                // Generate a deterministic chunk ID based on content hash.
                // This ensures the same chunk always gets the same ID.
                const chunkId = generateChunkId(relativePath, chunk.chunkIndex, chunk.text);

                await session.run(`
                  MATCH (f:File) WHERE id(f) = $fileNodeId
                  MERGE (c:FileChunk:Node {id: $chunkId})
                  SET c.chunk_index = $chunkIndex,
                      c.text = $text,
                      c.start_offset = $startOffset,
                      c.end_offset = $endOffset,
                      c.embedding = $embedding,
                      c.embedding_dimensions = $dimensions,
                      c.embedding_model = $model,
                      c.type = 'file_chunk',
                      c.indexed_date = datetime(),
                      c.filePath = f.path,
                      c.fileName = f.name,
                      c.parent_file_id = $parentFileId,
                      c.total_chunks = $totalChunks,
                      c.has_next = $hasNext,
                      c.has_prev = $hasPrev
                  MERGE (f)-[:HAS_CHUNK {index: $chunkIndex}]->(c)
                `, {
                  fileNodeId,
                  chunkId,
                  chunkIndex: chunk.chunkIndex,
                  text: chunk.text,
                  startOffset: chunk.startOffset,
                  endOffset: chunk.endOffset,
                  embedding: chunk.embedding,
                  dimensions: chunk.dimensions,
                  model: chunk.model,
                  parentFileId: fileNodeId,
                  totalChunks,
                  hasNext: chunk.chunkIndex < totalChunks - 1,
                  hasPrev: chunk.chunkIndex > 0
                });
                chunksCreated++;
              }

              console.log(`✅ Created ${chunksCreated} chunk embeddings for ${displayPath}`);
            } else {
              // Small file: Store embedding directly on the File node with metadata enrichment
              const enrichedContent = metadataPrefix + content;
              const embedding = await this.embeddingsService.generateEmbedding(enrichedContent);

              await session.run(`
                MATCH (f:File) WHERE id(f) = $fileNodeId
                SET f.embedding = $embedding,
                    f.embedding_dimensions = $dimensions,
                    f.embedding_model = $model,
                    f.has_embedding = true
              `, {
                fileNodeId,
                embedding: embedding.embedding,
                dimensions: embedding.dimensions,
                model: embedding.model
              });

              console.log(`✅ Created file embedding for ${displayPath}`);
            }
          } catch (error: any) {
            console.warn(`⚠️ Failed to generate embeddings for ${displayPath}: ${error.message}`);
          }
        }
      } else if (generateEmbeddings && hasExistingChunks) {
        console.log(`⏭️ Skipping embeddings (already exist): ${displayPath}`);
      }

      return {
        file_node_id: `file-${fileNodeId}`,
        path: relativePath,
        size_bytes: stats.size,
        chunks_created: chunksCreated
      };
    } catch (error: any) {
      // Provide specific error messages for different skip reasons
      const relativePath = path.relative(rootPath, filePath);

      if (error.message === 'Binary or non-indexable file') {
        // Silently skip - these are expected
        throw error;
      }

      if (error.message === 'Binary content detected') {
        console.warn(`⚠️ Skipping file with binary content: ${relativePath}`);
        throw new Error('Binary file');
      }

      // UTF-8 decode errors
      if (error.code === 'ERR_INVALID_ARG_TYPE' || error.message?.includes('invalid')) {
        console.warn(`⚠️ Skipping file (UTF-8 decode error): ${relativePath}`);
        throw new Error('Binary file');
      }

      throw error;
    } finally {
      await session.close();
    }
  }
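  // Illustrative read-side sketch (not part of the original file): the chunk
  // properties written above support ordered retrieval without NEXT_CHUNK
  // relationships. The query text is an assumption consistent with the schema
  // written by indexFile; the path is hypothetical.
  //
  //   const result = await session.run(
  //     `MATCH (f:File {path: $path})-[:HAS_CHUNK]->(c:FileChunk)
  //      RETURN c.chunk_index AS idx, c.text AS text
  //      ORDER BY c.chunk_index`,
  //     { path: '/workspace/project/docs/guide.md' }
  //   );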
  /**
   * Check if content is actually text (not binary).
   *
   * Industry-standard approach:
   * 1. Check for null bytes (definitive binary indicator)
   * 2. Check for a high concentration of control characters (0x00-0x08, 0x0E-0x1F)
   * 3. Allow all valid Unicode including emojis, CJK, extended Latin, etc.
   *
   * This properly handles:
   * - UTF-8 encoded files with emojis (🔧, 📄, etc.)
   * - Files with non-ASCII characters (Chinese, Japanese, Arabic, etc.)
   * - Files with special symbols and mathematical notation
   */
  private isTextContent(content: string): boolean {
    // Empty content is treated as text (no binary indicators)
    if (content.length === 0) {
      return true;
    }

    // Check for null bytes (definitive binary indicator)
    if (content.includes('\0')) {
      return false;
    }

    // Sample the first 8KB for performance (industry-standard sample size)
    const sampleSize = Math.min(content.length, 8192);
    let controlCharCount = 0;

    for (let i = 0; i < sampleSize; i++) {
      const code = content.charCodeAt(i);

      // Count problematic control characters (0x00-0x08, 0x0E-0x1F).
      // Exclude common whitespace: tab (0x09), newline (0x0A), carriage return (0x0D).
      // Also exclude form feed (0x0C), which appears in some text files.
      if ((code >= 0x00 && code <= 0x08) || (code >= 0x0E && code <= 0x1F)) {
        controlCharCount++;
      }

      // Check for lone surrogate code units (invalid UTF-16).
      // A high surrogate (0xD800-0xDBFF) must be followed by a low surrogate (0xDC00-0xDFFF).
      if (code >= 0xD800 && code <= 0xDBFF) {
        const nextCode = i + 1 < sampleSize ? content.charCodeAt(i + 1) : 0;
        if (nextCode < 0xDC00 || nextCode > 0xDFFF) {
          // Lone high surrogate - likely binary or corrupted
          controlCharCount++;
        } else {
          // Valid surrogate pair - skip the low surrogate on the next iteration
          i++;
        }
      } else if (code >= 0xDC00 && code <= 0xDFFF) {
        // Lone low surrogate without a preceding high surrogate
        controlCharCount++;
      }
    }

    // If more than 10% control characters, likely binary.
    // This is more permissive than the old 85% printable threshold.
    const controlRatio = controlCharCount / sampleSize;
    return controlRatio < 0.10;
  }
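  // Behavior sketch (hypothetical inputs): text with emojis or CJK passes, a
  // null byte fails immediately, and dense control characters trip the 10% ratio.
  //
  //   isTextContent('const x = 1; // 🔧 config')  // -> true
  //   isTextContent('日本語テキスト')               // -> true
  //   isTextContent('PK\u0003\u0004\0...')        // -> false (null byte)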
  /**
   * Check if a file should be skipped (binary, images, archives, sensitive files, etc.).
   * Note: PDF and DOCX are in this list but handled separately via DocumentParser.
   */
  private shouldSkipFile(filePath: string, extension: string): boolean {
    // Binary and non-text file extensions to skip
    const skipExtensions = new Set([
      // Images
      '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.svg', '.webp', '.tiff', '.tif',
      // Videos
      '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.m4v',
      // Audio
      '.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac', '.wma',
      // Archives
      '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
      // Executables and binaries
      '.exe', '.dll', '.so', '.dylib', '.bin', '.dat', '.app',
      // Compiled/bytecode
      '.pyc', '.pyo', '.class', '.o', '.obj', '.wasm',
      // Documents (binary formats) - PDF/DOCX supported via DocumentParser
      '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
      // Fonts
      '.ttf', '.otf', '.woff', '.woff2', '.eot',
      // Database files
      '.db', '.sqlite', '.sqlite3', '.mdb',
      // IDE/Editor files
      '.swp', '.swo', '.DS_Store', '.idea',
      // Lock files (often auto-generated)
      '.lock',
      // Other binary formats
      '.pkl', '.pickle', '.parquet', '.avro', '.protobuf', '.pb',
      // Sensitive file extensions (industry-standard security)
      '.pem', '.key', '.p12', '.pfx', '.cer', '.crt', '.der', // Certificates & private keys
      '.keystore', '.jks', '.bks', // Java keystores
      '.ppk', '.pub', // SSH keys
      '.credentials', '.secret', // Credential files
      '.log' // Logs (may contain sensitive data)
    ]);

    // Check extension
    if (skipExtensions.has(extension)) {
      return true;
    }

    // Skip files without an extension that are likely binary or auto-generated
    const fileName = path.basename(filePath);
    const binaryFileNames = new Set([
      'package-lock.json', // Too large and auto-generated
      'yarn.lock',         // Too large and auto-generated
      'pnpm-lock.yaml',    // Too large and auto-generated
      '.DS_Store',
      'Thumbs.db',
      'desktop.ini'
    ]);

    if (binaryFileNames.has(fileName)) {
      return true;
    }

    // Skip sensitive files by name (industry-standard security).
    // Configurable via the MIMIR_SENSITIVE_FILES environment variable (comma-separated).
    const defaultSensitiveFiles = [
      '.env', '.env.local', '.env.development', '.env.production', '.env.test', '.env.staging', '.env.example', // Environment files
      '.npmrc', '.yarnrc', '.pypirc', // Package manager configs with tokens
      '.netrc', '_netrc', // FTP/HTTP credentials
      'id_rsa', 'id_dsa', 'id_ecdsa', 'id_ed25519', // SSH private keys
      'credentials', 'secrets.yml', 'secrets.yaml', 'secrets.json',
      'master.key', 'production.key' // Rails secrets
    ];

    const sensitiveFileNames = new Set(
      process.env.MIMIR_SENSITIVE_FILES
        ? process.env.MIMIR_SENSITIVE_FILES.split(',').map(f => f.trim()).filter(f => f.length > 0)
        : defaultSensitiveFiles
    );

    if (sensitiveFileNames.has(fileName)) {
      return true;
    }

    // Skip files with sensitive patterns in the name
    const lowerFileName = fileName.toLowerCase();
    const sensitivePatterns = ['password', 'passwd', 'secret', 'credential', 'token', 'apikey', 'api_key', 'private_key'];
    for (const pattern of sensitivePatterns) {
      if (lowerFileName.includes(pattern)) {
        return true;
      }
    }

    return false;
  }

  /**
   * Detect language from file extension
   */
  private detectLanguage(filePath: string): string {
    const ext = path.extname(filePath).toLowerCase();
    const languageMap: Record<string, string> = {
      '.ts': 'typescript', '.tsx': 'typescript',
      '.js': 'javascript', '.jsx': 'javascript',
      '.py': 'python',
      '.java': 'java',
      '.go': 'go',
      '.rs': 'rust',
      '.cpp': 'cpp', '.c': 'c', '.cs': 'csharp',
      '.rb': 'ruby',
      '.php': 'php',
      '.md': 'markdown',
      '.json': 'json',
      '.yaml': 'yaml', '.yml': 'yaml',
      '.xml': 'xml',
      '.html': 'html',
      '.css': 'css', '.scss': 'scss',
      '.sql': 'sql'
    };
    return languageMap[ext] || 'generic';
  }
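  // Configuration sketch: setting MIMIR_SENSITIVE_FILES replaces the default
  // list entirely rather than extending it (see the ternary above), so include
  // any defaults you still want. Hypothetical value:
  //
  //   MIMIR_SENSITIVE_FILES=".env,.env.local,secrets.json,service-account.json"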
  /**
   * Delete a file node and all associated chunks from Neo4j.
   *
   * Removes the File node and cascades to delete all FileChunk nodes
   * and their relationships. Use this when files are deleted from disk
   * or need to be removed from the index.
   *
   * @param relativePath - Relative path to file (from root directory)
   *
   * @example
   * // Delete a file from the index when deleted from disk
   * await fileIndexer.deleteFile('src/auth.ts');
   * console.log('File removed from index');
   *
   * @example
   * // Clean up after a file move/rename
   * await fileIndexer.deleteFile('old/path/file.ts');
   * await fileIndexer.indexFile('/new/path/file.ts', rootPath, true);
   * console.log('File re-indexed at new location');
   *
   * @example
   * // Batch delete multiple files
   * const deletedFiles = ['src/old1.ts', 'src/old2.ts', 'src/old3.ts'];
   * for (const file of deletedFiles) {
   *   await fileIndexer.deleteFile(file);
   * }
   * console.log('Cleaned up', deletedFiles.length, 'files');
   */
  async deleteFile(relativePath: string): Promise<void> {
    const session = this.driver.session();
    try {
      // DETACH DELETE automatically removes relationships and connected chunks
      await session.run(`
        MATCH (f:File {path: $path})
        OPTIONAL MATCH (f)-[:HAS_CHUNK]->(c:FileChunk)
        DETACH DELETE f, c
      `, { path: relativePath });
    } finally {
      await session.close();
    }
  }

  /**
   * Update file content and embeddings after file modification.
   *
   * Re-indexes the file to update content and regenerate embeddings.
   * Automatically detects if the file was modified and regenerates chunks
   * if needed. This is the recommended way to handle file changes.
   *
   * @param filePath - Absolute path to modified file
   * @param rootPath - Root directory path
   *
   * @example
   * // Update file after modification
   * await fileIndexer.updateFile(
   *   '/Users/user/project/src/auth.ts',
   *   '/Users/user/project'
   * );
   * console.log('File content and embeddings updated');
   *
   * @example
   * // Handle file watcher events
   * watcher.on('change', async (filePath) => {
   *   console.log('File changed:', filePath);
   *   await fileIndexer.updateFile(filePath, rootPath);
   *   console.log('Index updated');
   * });
   *
   * @example
   * // Batch update multiple changed files
   * const changedFiles = await getModifiedFiles();
   * for (const file of changedFiles) {
   *   await fileIndexer.updateFile(file, rootPath);
   * }
   * console.log('Updated', changedFiles.length, 'files');
   */
  async updateFile(filePath: string, rootPath: string): Promise<void> {
    // Just re-index the file
    await this.indexFile(filePath, rootPath);
  }
}
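// End-to-end usage sketch (assumptions: a reachable Neo4j or NornicDB instance;
// the connection values below are hypothetical):
//
//   import neo4j from 'neo4j-driver';
//   import { FileIndexer } from './FileIndexer.js';
//
//   const driver = neo4j.driver('bolt://localhost:7687', neo4j.auth.basic('neo4j', 'password'));
//   const indexer = new FileIndexer(driver);
//
//   const result = await indexer.indexFile('/workspace/project/src/auth.ts', '/workspace/project', true);
//   console.log(result.path, result.chunks_created);
//
//   await driver.close();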
