// ============================================================================
// FileIndexer - Index files into the graph database (Neo4j or NornicDB)
// Phase 1: Basic file indexing with content
// Phase 2: Vector embeddings for semantic search
// Phase 3: Parse and extract functions/classes (future)
// ============================================================================
import { Driver } from 'neo4j-driver';
import { promises as fs } from 'fs';
import path from 'path';
import { createHash } from 'crypto';
import { EmbeddingsService, ChunkEmbeddingResult, formatMetadataForEmbedding, FileMetadata } from './EmbeddingsService.js';
import { DocumentParser } from './DocumentParser.js';
import { getHostWorkspaceRoot } from '../utils/path-utils.js';
import { ImageProcessor } from './ImageProcessor.js';
import { VLService } from './VLService.js';
import { LLMConfigLoader } from '../config/LLMConfigLoader.js';
/**
* Generate a deterministic hash-based ID for content
* This ensures idempotent re-indexing without duplicate creation issues
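*
* @example
* // Identical input always produces the same ID (first 16 hex chars of sha256)
* const a = generateContentHash('hello', 'note'); // 'note-2cf24dba5fb0a30e'
* const b = generateContentHash('hello', 'note');
* // a === b, so re-indexing unchanged content maps to the same node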
*/
export function generateContentHash(content: string, prefix: string = ''): string {
const hash = createHash('sha256').update(content).digest('hex').slice(0, 16);
return prefix ? `${prefix}-${hash}` : hash;
}
/**
* Generate a deterministic chunk ID based on file path and chunk content
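*
* @example
* // ID format: chunk-<pathHash>-<index>-<contentHash> (hash values illustrative)
* const id = generateChunkId('src/auth.ts', 0, 'export function login() {}');
* // e.g. 'chunk-a1b2c3d4e5f67890-0-0f1e2d3c4b5a6978'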
*/
export function generateChunkId(filePath: string, chunkIndex: number, chunkText: string): string {
const contentHash = generateContentHash(chunkText);
return `chunk-${generateContentHash(filePath)}-${chunkIndex}-${contentHash}`;
}
/**
* Generate a deterministic file ID based on path
*/
export function generateFileId(filePath: string): string {
return `file-${generateContentHash(filePath)}`;
}
export interface IndexResult {
file_node_id: string;
path: string;
size_bytes: number;
chunks_created?: number;
}
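/**
* Indexes files into the graph database, storing content and optional
* vector embeddings for semantic search.
*
* @example
* // Typical wiring (connection URL and credentials are illustrative):
* import neo4j from 'neo4j-driver';
* const driver = neo4j.driver('bolt://localhost:7687', neo4j.auth.basic('neo4j', 'password'));
* const indexer = new FileIndexer(driver);
* const result = await indexer.indexFile('/project/src/auth.ts', '/project', true);
* console.log(result.path, result.chunks_created);
*/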
export class FileIndexer {
private embeddingsService: EmbeddingsService;
private embeddingsInitialized: boolean = false;
private embeddingsInitPromise: Promise<void> | null = null; // Mutex for initialization
private documentParser: DocumentParser;
private imageProcessor: ImageProcessor | null = null;
private vlService: VLService | null = null;
private configLoader: LLMConfigLoader;
private isNornicDB: boolean = false;
private providerDetected: boolean = false;
constructor(private driver: Driver) {
this.embeddingsService = new EmbeddingsService();
this.documentParser = new DocumentParser();
this.configLoader = LLMConfigLoader.getInstance();
}
/**
* Detect database provider (NornicDB vs Neo4j)
* Same detection logic as GraphManager for consistency
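*
* @example
* // The manual override checked below skips the server round-trip entirely:
* // MIMIR_DATABASE_PROVIDER=nornicdb -> isNornicDB = true
* // MIMIR_DATABASE_PROVIDER=neo4j   -> isNornicDB = false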
*/
private async detectDatabaseProvider(): Promise<void> {
// Check for manual override
const manualProvider = process.env.MIMIR_DATABASE_PROVIDER?.toLowerCase();
if (manualProvider === 'nornicdb') {
this.isNornicDB = true;
return;
} else if (manualProvider === 'neo4j') {
this.isNornicDB = false;
return;
}
// Auto-detect via server metadata
const session = this.driver.session();
try {
const result = await session.run('RETURN 1 as test');
const serverAgent = result.summary.server?.agent || '';
this.isNornicDB = serverAgent.toLowerCase().includes('nornicdb');
} catch (error) {
// Default to Neo4j on error
this.isNornicDB = false;
} finally {
await session.close();
}
}
/**
* Initialize embeddings service (lazy loading)
* Skips initialization if connected to NornicDB
* Uses mutex pattern to prevent race conditions in concurrent calls
*/
private async initEmbeddings(): Promise<void> {
// If already initialized, return immediately
if (this.embeddingsInitialized) {
return;
}
// If initialization is in progress, wait for it
if (this.embeddingsInitPromise) {
return this.embeddingsInitPromise;
}
// Start initialization and store the promise for concurrent callers
this.embeddingsInitPromise = this.doInitEmbeddings();
try {
await this.embeddingsInitPromise;
} finally {
// Clear the promise after completion (success or failure)
this.embeddingsInitPromise = null;
}
}
/**
* Internal initialization logic (called once via mutex)
*/
private async doInitEmbeddings(): Promise<void> {
// Detect provider on first call
if (!this.providerDetected) {
await this.detectDatabaseProvider();
this.providerDetected = true;
if (this.isNornicDB) {
console.log('🗄️ FileIndexer: NornicDB detected - skipping embeddings service initialization');
}
}
// Only initialize embeddings service for Neo4j
if (!this.isNornicDB) {
await this.embeddingsService.initialize();
}
this.embeddingsInitialized = true;
}
/**
* Retry wrapper for Neo4j transactions with exponential backoff
* Handles deadlocks and other transient database errors
*
* @param fn - Async function to execute with retry logic
* @param operation - Description of operation for logging
* @param maxRetries - Maximum number of retry attempts (default: 3)
* @returns Result of the function execution
*
* @example
* await this.retryNeo4jTransaction(async () => {
* return await session.run('MERGE (n:Node {id: $id})', {id: '123'});
* }, 'Create node', 3);
*/
private async retryNeo4jTransaction<T>(
fn: () => Promise<T>,
operation: string,
maxRetries: number = 3
): Promise<T> {
let lastError: any;
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
return await fn();
} catch (error: any) {
lastError = error;
// Check for retryable Neo4j errors
const isDeadlock = error.message?.includes('DeadlockDetected') ||
error.message?.includes('can\'t acquire') ||
error.message?.includes('ForsetiClient') ||
error.code === 'Neo.TransientError.Transaction.DeadlockDetected';
const isLockTimeout = error.message?.includes('LockClient') ||
error.code === 'Neo.TransientError.Transaction.LockClientStopped';
const isTransient = error.code?.startsWith('Neo.TransientError');
const isRetryable = isDeadlock || isLockTimeout || isTransient;
// Don't retry on final attempt or non-retryable errors
if (!isRetryable || attempt === maxRetries) {
throw error;
}
// Exponential backoff with jitter: 100ms, 200ms, 400ms, 800ms...
const baseDelay = 100 * Math.pow(2, attempt);
const jitter = Math.random() * 50; // Add 0-50ms random jitter
const delayMs = Math.min(baseDelay + jitter, 2000);
const errorType = isDeadlock ? 'deadlock' : isLockTimeout ? 'lock timeout' : 'transient';
console.warn(
`⚠️ ${operation} failed with Neo4j ${errorType} error ` +
`(attempt ${attempt + 1}/${maxRetries + 1}). Retrying in ${Math.round(delayMs)}ms...`
);
await new Promise(resolve => setTimeout(resolve, delayMs));
}
}
throw lastError;
}
/**
* Initialize image processing services (lazy loading)
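*
* Reads the embeddings config loaded by LLMConfigLoader; a rough sketch of
* the fields consumed below (values illustrative, not the full schema):
*
* @example
* // {
* //   "images": { "enabled": true, "describeMode": true, "maxPixels": 1048576 },
* //   "vl": { "provider": "openai", "api": "http://localhost:8080", "model": "qwen2-vl" }
* // }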
*/
private async initImageServices(): Promise<void> {
const config = await this.configLoader.getEmbeddingsConfig();
if (!config?.images?.enabled) {
return;
}
// Initialize ImageProcessor
if (!this.imageProcessor) {
this.imageProcessor = new ImageProcessor({
maxPixels: config.images.maxPixels,
targetSize: config.images.targetSize,
resizeQuality: config.images.resizeQuality
});
}
// Initialize VLService if describe mode is enabled
if (config.images.describeMode && config.vl && !this.vlService) {
this.vlService = new VLService({
provider: config.vl.provider,
api: config.vl.api,
apiPath: config.vl.apiPath,
apiKey: config.vl.apiKey,
model: config.vl.model,
contextSize: config.vl.contextSize,
maxTokens: config.vl.maxTokens,
temperature: config.vl.temperature
});
console.log('🖼️ Image embedding services initialized (VL describe mode)');
}
}
/**
* Translate container path to host path
* e.g., /workspace/my-project/file.ts -> /Users/user/src/my-project/file.ts
* If not in Docker, both paths are the same
*/
private translateToHostPath(containerPath: string): string {
const workspaceRoot = process.env.WORKSPACE_ROOT;
const hostWorkspaceRoot = process.env.HOST_WORKSPACE_ROOT;
// If WORKSPACE_ROOT not set, we're running locally - no translation needed
if (!workspaceRoot) {
return containerPath;
}
// If HOST_WORKSPACE_ROOT not set but WORKSPACE_ROOT is, something is misconfigured
if (!hostWorkspaceRoot) {
console.warn('⚠️ WORKSPACE_ROOT is set but HOST_WORKSPACE_ROOT is not - path translation may fail');
return containerPath;
}
// Expand tilde in HOST_WORKSPACE_ROOT for consistent storage
// Use getHostWorkspaceRoot() which properly expands tilde
const expandedHostRoot = getHostWorkspaceRoot();
// Replace container workspace root with expanded host workspace root
if (containerPath.startsWith(workspaceRoot)) {
return containerPath.replace(workspaceRoot, expandedHostRoot);
}
return containerPath;
}
/**
* Index a single file into Neo4j with optional vector embeddings
*
* Creates a File node in the graph database with metadata and content.
* For large files with embeddings enabled, splits content into chunks
* with individual embeddings for precise semantic search (industry standard).
*
* Indexing Strategy:
* - **Small files** (≤1000 chars): Single embedding on File node
* - **Large files** (>1000 chars): Multiple FileChunk nodes with embeddings
* - **No embeddings**: Full content stored on File node for full-text search
*
* Supported Formats:
* - Text files (.ts, .js, .py, .md, .json, etc.)
* - PDF documents (text extraction)
* - DOCX documents (text extraction)
* - Images (.png, .jpg, etc.) with VL description or multimodal embedding
*
* @param filePath - Absolute path to file
* @param rootPath - Root directory path for calculating relative paths
* @param generateEmbeddings - Whether to generate vector embeddings
* @param watchConfigId - Optional WatchConfig node ID; when provided, the file is linked via WATCHES/WATCHED_BY relationships
* @returns Index result with file node ID, path, size, and chunk count
* @throws {Error} If file is binary, non-indexable, or processing fails
*
* @example
* // Index a TypeScript file without embeddings
* const result = await fileIndexer.indexFile(
* '/Users/user/project/src/auth.ts',
* '/Users/user/project',
* false
* );
* console.log('Indexed:', result.path);
* console.log('Size:', result.size_bytes, 'bytes');
* // File content stored on File node for full-text search
*
* @example
* // Index a large file with embeddings (chunked)
* const result = await fileIndexer.indexFile(
* '/Users/user/project/docs/guide.md',
* '/Users/user/project',
* true
* );
* console.log('Created', result.chunks_created, 'chunks');
* // Each chunk has its own embedding for precise semantic search
*
* @example
* // Index a PDF document with embeddings
* const result = await fileIndexer.indexFile(
* '/Users/user/project/docs/manual.pdf',
* '/Users/user/project',
* true
* );
* console.log('Extracted and indexed PDF:', result.path);
* console.log('Chunks created:', result.chunks_created);
*
* @example
* // Index an image with VL description
* const result = await fileIndexer.indexFile(
* '/Users/user/project/images/diagram.png',
* '/Users/user/project',
* true
* );
* console.log('Image indexed with description:', result.path);
* // VL model generates text description, then embeds it
*
* @example
* // Handle indexing errors
* try {
* await fileIndexer.indexFile(filePath, rootPath, true);
* } catch (error) {
* if (error.message === 'Binary or non-indexable file') {
* console.log('Skipped binary file');
* } else {
* console.error('Indexing failed:', error.message);
* }
* }
*/
async indexFile(filePath: string, rootPath: string, generateEmbeddings: boolean = false, watchConfigId?: string): Promise<IndexResult> {
const session = this.driver.session();
let content: string = '';
let isImage = false;
// CRITICAL: Detect provider BEFORE making content storage decisions
// This ensures NornicDB detection happens before shouldStoreFullContent is evaluated
if (!this.providerDetected) {
await this.detectDatabaseProvider();
this.providerDetected = true;
if (this.isNornicDB) {
console.log('🗄️ FileIndexer: NornicDB detected - full content will be stored for native embedding');
}
}
try {
const relativePath = path.relative(rootPath, filePath);
const extension = path.extname(filePath).toLowerCase();
const binaryDoc = this.documentParser.isSupportedFormat(extension);
// Check if this is an image file BEFORE the binary skip
if (ImageProcessor.isImageFile(filePath) && generateEmbeddings) {
await this.initImageServices();
const config = await this.configLoader.getEmbeddingsConfig();
if (config?.images?.enabled) {
isImage = true;
// For NornicDB: ALWAYS use VL description mode (NornicDB can only embed text)
// For Neo4j: Use configured mode (describeMode or direct multimodal)
const useVLDescription = this.isNornicDB || config.images.describeMode;
if (useVLDescription && this.vlService && this.imageProcessor) {
// Path 1: VL Description Method (DEFAULT, REQUIRED for NornicDB)
// Uses VL model to generate text description, then embeds the description
console.log(`🖼️ Processing image with VL description: ${relativePath}${this.isNornicDB ? ' (NornicDB requires text)' : ''}`);
// 1. Prepare image (resize if needed)
const processedImage = await this.imageProcessor.prepareImageForVL(filePath);
if (processedImage.wasResized) {
console.log(` Resized from ${processedImage.originalSize.width}×${processedImage.originalSize.height} to ${processedImage.processedSize.width}×${processedImage.processedSize.height}`);
}
// 2. Create Data URL
const dataURL = this.imageProcessor.createDataURL(processedImage.base64, processedImage.format);
// 3. Get description from VL model
const result = await this.vlService.describeImage(dataURL);
content = result.description;
console.log(` Generated description (${content.length} chars) in ${result.processingTimeMs}ms`);
} else if (!config.images.describeMode && !this.isNornicDB && this.imageProcessor) {
// Path 2: Direct Multimodal Embedding (Neo4j only)
// Sends image directly to multimodal embeddings endpoint
// NOTE: This path is NOT available for NornicDB (requires text content)
console.log(`🖼️ Processing image with direct multimodal embedding: ${relativePath}`);
// 1. Prepare image (resize if needed)
const processedImage = await this.imageProcessor.prepareImageForVL(filePath);
if (processedImage.wasResized) {
console.log(` Resized from ${processedImage.originalSize.width}×${processedImage.originalSize.height} to ${processedImage.processedSize.width}×${processedImage.processedSize.height}`);
}
// 2. Create Data URL for embedding
const dataURL = this.imageProcessor.createDataURL(processedImage.base64, processedImage.format);
// 3. Store the data URL as content - will be sent to embeddings service
// The embeddings service will handle multimodal input
content = dataURL;
console.log(` Prepared image for direct embedding (${processedImage.sizeBytes} bytes)`);
} else if (this.isNornicDB && !this.vlService) {
// NornicDB requires VL service for image embedding (can't do multimodal)
console.warn(`⚠️ Skipping image ${relativePath}: NornicDB requires VL service for image descriptions`);
throw new Error('Image indexing requires VL service for NornicDB (text-only embedding)');
} else {
// Missing required services
const missingServices: string[] = [];
if (!this.imageProcessor) missingServices.push('ImageProcessor');
if (config.images.describeMode && !this.vlService) missingServices.push('VLService');
throw new Error(`Image processing requires: ${missingServices.join(', ')}`);
}
} else {
// Images disabled, skip
throw new Error('Image indexing disabled');
}
} else if (binaryDoc) {
// Extract text from PDF or DOCX
const buffer = await fs.readFile(filePath);
content = await this.documentParser.extractText(buffer, extension);
console.log(`📄 Extracted ${content.length} chars from ${extension} document: ${relativePath}`);
} else if (!this.shouldSkipFile(filePath, extension)) {
// Read as plain text file
content = await fs.readFile(filePath, 'utf-8');
// Check if content is actually text (not binary masquerading as text)
if (!this.isTextContent(content)) {
throw new Error('Binary content detected');
}
} else {
throw new Error('Binary or non-indexable file');
}
const stats = await fs.stat(filePath);
const language = this.detectLanguage(filePath);
// Check if file already has chunks
let hasExistingChunks = false;
if (generateEmbeddings) {
const checkResult = await session.run(
`MATCH (f:File {path: $path})-[:HAS_CHUNK]->(c:FileChunk)
RETURN count(c) AS chunk_count, f.last_modified AS last_modified`,
{ path: filePath }
);
if (checkResult.records.length > 0) {
const chunkCount = checkResult.records[0].get('chunk_count').toNumber();
hasExistingChunks = chunkCount > 0;
const existingModified = checkResult.records[0].get('last_modified');
// Re-generate if file was modified
if (hasExistingChunks && existingModified) {
const existingModifiedDate = new Date(existingModified);
if (stats.mtime > existingModifiedDate) {
console.log(`📝 File modified, regenerating chunks: ${relativePath}`);
hasExistingChunks = false;
// Delete old chunks (use filePath which is the absolute container path)
await session.run(
`MATCH (f:File {path: $path})-[:HAS_CHUNK]->(c:FileChunk)
DETACH DELETE c`,
{ path: filePath }
);
}
}
}
}
// Determine if file needs chunking (heuristic size threshold; actual chunk
// boundaries are computed by EmbeddingsService)
const needsChunking = generateEmbeddings && content.length > 1000;
// Storage strategy:
// - If NornicDB → ALWAYS store full content (NornicDB handles chunking/embedding natively)
// - If embeddings ENABLED and file is LARGE → Store in chunks (chunk nodes) + no content on File node
// - If embeddings DISABLED → ALWAYS store full content on File node (enables full-text search)
// - If embeddings ENABLED and file is SMALL → Store content on File node + embedding
const shouldStoreFullContent = this.isNornicDB || !generateEmbeddings || !needsChunking;
// For NornicDB: Enrich content with metadata BEFORE storing
// This gives NornicDB's embedding worker richer context for better embeddings
// (Same enrichment that happens for Neo4j in the chunking/embedding path below)
let contentToStore = content;
if (this.isNornicDB && shouldStoreFullContent && generateEmbeddings) {
const fileMetadata: FileMetadata = {
name: path.basename(filePath),
relativePath: relativePath,
language: language,
extension: extension,
directory: path.dirname(relativePath),
sizeBytes: stats.size
};
const metadataPrefix = formatMetadataForEmbedding(fileMetadata);
contentToStore = metadataPrefix + content;
console.log(`📝 Enriched content for NornicDB embedding: ${relativePath} (+${metadataPrefix.length} chars metadata)`);
}
// Create File node with BOTH container and host paths
// f.path = absolute container path (e.g., /app/docs/README.md)
// f.host_path = absolute host path (e.g., /Users/user/src/Mimir/docs/README.md)
// When not in Docker, both paths are the same
// Convert container path to host path using environment variables
const hostPath = this.translateToHostPath(filePath);
// Use host path for logging (fall back to container path if not available)
const displayPath = hostPath || filePath;
// Wrap File node creation with retry logic to handle deadlocks
const fileResult = await this.retryNeo4jTransaction(async () => {
// Create File node and optionally link to WatchConfig
const query = watchConfigId ? `
MERGE (f:File:Node {path: $path})
ON CREATE SET f.id = 'file-' + toString(timestamp()) + '-' + substring(randomUUID(), 0, 8)
SET
f.host_path = $host_path,
f.name = $name,
f.extension = $extension,
f.language = $language,
f.size_bytes = $size_bytes,
f.line_count = $line_count,
f.last_modified = $last_modified,
f.indexed_date = datetime(),
f.type = 'file',
f.has_chunks = $has_chunks,
f.content = $content
WITH f
MATCH (wc:WatchConfig {id: $watchConfigId})
MERGE (wc)-[:WATCHES]->(f)
MERGE (f)-[:WATCHED_BY]->(wc)
RETURN f.path AS path, f.size_bytes AS size_bytes, id(f) AS node_id
` : `
MERGE (f:File:Node {path: $path})
ON CREATE SET f.id = 'file-' + toString(timestamp()) + '-' + substring(randomUUID(), 0, 8)
SET
f.host_path = $host_path,
f.name = $name,
f.extension = $extension,
f.language = $language,
f.size_bytes = $size_bytes,
f.line_count = $line_count,
f.last_modified = $last_modified,
f.indexed_date = datetime(),
f.type = 'file',
f.has_chunks = $has_chunks,
f.content = $content
RETURN f.path AS path, f.size_bytes AS size_bytes, id(f) AS node_id
`;
return await session.run(query, {
path: filePath, // Now stores absolute container path
host_path: hostPath,
name: path.basename(filePath),
extension: extension,
language: language,
size_bytes: stats.size,
line_count: content.split('\n').length,
last_modified: stats.mtime.toISOString(),
has_chunks: needsChunking,
content: shouldStoreFullContent ? contentToStore : null, // Store enriched content for NornicDB, raw content for Neo4j
watchConfigId: watchConfigId || null
});
}, `Create/update File node for ${relativePath}`);
const fileNodeId = fileResult.records[0].get('node_id');
let chunksCreated = 0;
// Generate and store embeddings if enabled and not already present
// Skip embedding generation for NornicDB (database handles it natively)
if (generateEmbeddings && !hasExistingChunks && !this.isNornicDB) {
await this.initEmbeddings();
if (this.embeddingsService.isEnabled()) {
try {
// Prepare metadata for enrichment (ALL files get metadata enrichment)
const fileMetadata: FileMetadata = {
name: path.basename(filePath),
relativePath: relativePath,
language: language,
extension: extension,
directory: path.dirname(relativePath),
sizeBytes: stats.size
};
// Format metadata as natural language prefix
const metadataPrefix = formatMetadataForEmbedding(fileMetadata);
if (needsChunking) {
// Large file: Generate separate chunk embeddings with metadata
// Prepend metadata to the FULL content before chunking
const enrichedContent = metadataPrefix + content;
const chunkEmbeddings = await this.embeddingsService.generateChunkEmbeddings(enrichedContent);
// Create FileChunk nodes with embeddings
// Using content-based hashing for deterministic IDs (idempotent re-indexing)
// Using parent_file_id property instead of NEXT_CHUNK relationships for simpler queries
const totalChunks = chunkEmbeddings.length;
for (const chunk of chunkEmbeddings) {
// Generate deterministic chunk ID based on content hash
// This ensures the same chunk always gets the same ID
const chunkId = generateChunkId(relativePath, chunk.chunkIndex, chunk.text);
await session.run(`
MATCH (f:File) WHERE id(f) = $fileNodeId
MERGE (c:FileChunk:Node {id: $chunkId})
SET
c.chunk_index = $chunkIndex,
c.text = $text,
c.start_offset = $startOffset,
c.end_offset = $endOffset,
c.embedding = $embedding,
c.embedding_dimensions = $dimensions,
c.embedding_model = $model,
c.type = 'file_chunk',
c.indexed_date = datetime(),
c.filePath = f.path,
c.fileName = f.name,
c.parent_file_id = $parentFileId,
c.total_chunks = $totalChunks,
c.has_next = $hasNext,
c.has_prev = $hasPrev
MERGE (f)-[:HAS_CHUNK {index: $chunkIndex}]->(c)
`, {
fileNodeId,
chunkId,
chunkIndex: chunk.chunkIndex,
text: chunk.text,
startOffset: chunk.startOffset,
endOffset: chunk.endOffset,
embedding: chunk.embedding,
dimensions: chunk.dimensions,
model: chunk.model,
parentFileId: fileNodeId,
totalChunks,
hasNext: chunk.chunkIndex < totalChunks - 1,
hasPrev: chunk.chunkIndex > 0
});
chunksCreated++;
}
console.log(`✅ Created ${chunksCreated} chunk embeddings for ${displayPath}`);
} else {
// Small file: Store embedding directly on File node with metadata enrichment
const enrichedContent = metadataPrefix + content;
const embedding = await this.embeddingsService.generateEmbedding(enrichedContent);
await session.run(`
MATCH (f:File) WHERE id(f) = $fileNodeId
SET
f.embedding = $embedding,
f.embedding_dimensions = $dimensions,
f.embedding_model = $model,
f.has_embedding = true
`, {
fileNodeId,
embedding: embedding.embedding,
dimensions: embedding.dimensions,
model: embedding.model
});
console.log(`✅ Created file embedding for ${displayPath}`);
}
} catch (error: any) {
console.warn(`⚠️ Failed to generate embeddings for ${displayPath}: ${error.message}`);
}
}
} else if (generateEmbeddings && hasExistingChunks) {
console.log(`⏭️ Skipping embeddings (already exist): ${displayPath}`);
}
return {
file_node_id: `file-${fileNodeId}`,
path: relativePath,
size_bytes: stats.size,
chunks_created: chunksCreated
};
} catch (error: any) {
// Provide specific error messages for different skip reasons
const relativePath = path.relative(rootPath, filePath);
if (error.message === 'Binary or non-indexable file') {
// Silently skip - these are expected
throw error;
}
if (error.message === 'Binary content detected') {
console.warn(`⚠️ Skipping file with binary content: ${relativePath}`);
throw new Error('Binary file');
}
// UTF-8 decode errors
if (error.code === 'ERR_INVALID_ARG_TYPE' || error.message?.includes('invalid')) {
console.warn(`⚠️ Skipping file (UTF-8 decode error): ${relativePath}`);
throw new Error('Binary file');
}
throw error;
} finally {
await session.close();
}
}
/**
* Check if content is actually text (not binary)
*
* Industry-standard approach:
* 1. Check for null bytes (definitive binary indicator)
* 2. Check for high concentration of control characters (0x00-0x08, 0x0E-0x1F)
* 3. Allow all valid Unicode including emojis, CJK, extended Latin, etc.
*
* This properly handles:
* - UTF-8 encoded files with emojis (🔧, 📄, etc.)
* - Files with non-ASCII characters (Chinese, Japanese, Arabic, etc.)
* - Files with special symbols and mathematical notation
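*
* @example
* // Illustrative results:
* // isTextContent('hello 🔧 world') -> true  (valid surrogate pair)
* // isTextContent('a\0b')           -> false (null byte)
* // isTextContent('\x01\x02\x03')   -> false (>10% control characters)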
*/
private isTextContent(content: string): boolean {
// Empty content is treated as text (no binary indicators)
if (content.length === 0) {
return true;
}
// Check for null bytes (definitive binary indicator)
if (content.includes('\0')) {
return false;
}
// Sample first 8KB for performance (industry standard sample size)
const sampleSize = Math.min(content.length, 8192);
let controlCharCount = 0;
for (let i = 0; i < sampleSize; i++) {
const code = content.charCodeAt(i);
// Count problematic control characters (0x00-0x08, 0x0E-0x1F)
// The excluded range 0x09-0x0D covers whitespace controls that legitimately
// appear in text files: tab, newline, vertical tab, form feed, carriage return
if ((code >= 0x00 && code <= 0x08) || (code >= 0x0E && code <= 0x1F)) {
controlCharCount++;
}
// Check for lone surrogate code units (invalid UTF-16)
// High surrogate (0xD800-0xDBFF) must be followed by low surrogate (0xDC00-0xDFFF)
if (code >= 0xD800 && code <= 0xDBFF) {
const nextCode = i + 1 < sampleSize ? content.charCodeAt(i + 1) : 0;
if (nextCode < 0xDC00 || nextCode > 0xDFFF) {
// Lone high surrogate - likely binary or corrupted
controlCharCount++;
} else {
// Valid surrogate pair - skip the low surrogate on next iteration
i++;
}
} else if (code >= 0xDC00 && code <= 0xDFFF) {
// Lone low surrogate without preceding high surrogate
controlCharCount++;
}
}
// If more than 10% control characters, likely binary
// This is more permissive than the old 85% printable threshold
const controlRatio = controlCharCount / sampleSize;
return controlRatio < 0.10;
}
/**
* Check if file should be skipped (binary, images, archives, sensitive files, etc.)
* Note: PDF and DOCX are in this list but handled separately via DocumentParser
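*
* @example
* // Override the default sensitive-file name list (comma-separated, values illustrative):
* // MIMIR_SENSITIVE_FILES=.env,.npmrc,secrets.json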
*/
private shouldSkipFile(filePath: string, extension: string): boolean {
// Binary and non-text file extensions to skip
const skipExtensions = new Set([
// Images
'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.svg', '.webp', '.tiff', '.tif',
// Videos
'.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.m4v',
// Audio
'.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac', '.wma',
// Archives
'.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
// Executables and binaries
'.exe', '.dll', '.so', '.dylib', '.bin', '.dat', '.app',
// Compiled/bytecode
'.pyc', '.pyo', '.class', '.o', '.obj', '.wasm',
// Documents (binary formats) - PDF/DOCX supported via DocumentParser
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
// Fonts
'.ttf', '.otf', '.woff', '.woff2', '.eot',
// Database files
'.db', '.sqlite', '.sqlite3', '.mdb',
// IDE/Editor files
'.swp', '.swo', '.DS_Store', '.idea',
// Lock files (often auto-generated)
'.lock',
// Other binary formats
'.pkl', '.pickle', '.parquet', '.avro', '.protobuf', '.pb',
// Sensitive file extensions (Industry Standard Security)
'.pem', '.key', '.p12', '.pfx', '.cer', '.crt', '.der', // Certificates & Private Keys
'.keystore', '.jks', '.bks', // Java keystores
'.ppk', '.pub', // SSH keys
'.credentials', '.secret', // Credential files
'.log' // Logs (may contain sensitive data)
]);
// Check extension
if (skipExtensions.has(extension)) {
return true;
}
// Skip specific file names that are binary or auto-generated
const fileName = path.basename(filePath);
const binaryFileNames = new Set([
'package-lock.json', // Too large and auto-generated
'yarn.lock', // Too large and auto-generated
'pnpm-lock.yaml', // Too large and auto-generated
'.DS_Store',
'Thumbs.db',
'desktop.ini'
]);
if (binaryFileNames.has(fileName)) {
return true;
}
// Skip sensitive files by name (Industry Standard Security)
// Configurable via MIMIR_SENSITIVE_FILES environment variable (comma-separated)
const defaultSensitiveFiles = [
'.env', '.env.local', '.env.development', '.env.production', '.env.test', '.env.staging', '.env.example', // Environment files
'.npmrc', '.yarnrc', '.pypirc', // Package manager configs with tokens
'.netrc', '_netrc', // FTP/HTTP credentials
'id_rsa', 'id_dsa', 'id_ecdsa', 'id_ed25519', // SSH private keys
'credentials', 'secrets.yml', 'secrets.yaml', 'secrets.json',
'master.key', 'production.key' // Rails secrets
];
const sensitiveFileNames = new Set(
process.env.MIMIR_SENSITIVE_FILES
? process.env.MIMIR_SENSITIVE_FILES.split(',').map(f => f.trim()).filter(f => f.length > 0)
: defaultSensitiveFiles
);
if (sensitiveFileNames.has(fileName)) {
return true;
}
// Skip files with sensitive patterns in name
const lowerFileName = fileName.toLowerCase();
const sensitivePatterns = [
'password', 'passwd', 'secret', 'credential', 'token', 'apikey', 'api_key', 'private_key'
];
for (const pattern of sensitivePatterns) {
if (lowerFileName.includes(pattern)) {
return true;
}
}
return false;
}
/**
* Detect language from file extension
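*
* @example
* // detectLanguage('/src/App.tsx') -> 'typescript'
* // detectLanguage('/Makefile')    -> 'generic' (no mapped extension)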
*/
private detectLanguage(filePath: string): string {
const ext = path.extname(filePath).toLowerCase();
const languageMap: Record<string, string> = {
'.ts': 'typescript',
'.tsx': 'typescript',
'.js': 'javascript',
'.jsx': 'javascript',
'.py': 'python',
'.java': 'java',
'.go': 'go',
'.rs': 'rust',
'.cpp': 'cpp',
'.c': 'c',
'.cs': 'csharp',
'.rb': 'ruby',
'.php': 'php',
'.md': 'markdown',
'.json': 'json',
'.yaml': 'yaml',
'.yml': 'yaml',
'.xml': 'xml',
'.html': 'html',
'.css': 'css',
'.scss': 'scss',
'.sql': 'sql'
};
return languageMap[ext] || 'generic';
}
/**
* Delete file node and all associated chunks from Neo4j
*
* Removes the File node and cascades to delete all FileChunk nodes
* and their relationships. Use this when files are deleted from disk
* or need to be removed from the index.
*
* @param filePath - Path matching the File node's stored `path` property
*   (the same absolute path that was passed to indexFile)
*
* @example
* // Delete a file from index when deleted from disk
* await fileIndexer.deleteFile('/Users/user/project/src/auth.ts');
* console.log('File removed from index');
*
* @example
* // Clean up after file move/rename
* await fileIndexer.deleteFile('/old/path/file.ts');
* await fileIndexer.indexFile('/new/path/file.ts', rootPath, true);
* console.log('File re-indexed at new location');
*
* @example
* // Batch delete multiple files
* const deletedFiles = ['/project/src/old1.ts', '/project/src/old2.ts', '/project/src/old3.ts'];
* for (const file of deletedFiles) {
* await fileIndexer.deleteFile(file);
* }
* console.log('Cleaned up', deletedFiles.length, 'files');
*/
async deleteFile(filePath: string): Promise<void> {
const session = this.driver.session();
try {
// DETACH DELETE automatically removes relationships and connected chunks
await session.run(`
MATCH (f:File {path: $path})
OPTIONAL MATCH (f)-[:HAS_CHUNK]->(c:FileChunk)
DETACH DELETE f, c
`, { path: filePath });
} finally {
await session.close();
}
}
/**
* Update file content and embeddings after file modification
*
* Re-indexes the file to update its content and, when requested, regenerate
* embeddings. indexFile detects whether the file was modified and regenerates
* chunks as needed. This is the recommended way to handle file changes.
*
* @param filePath - Absolute path to modified file
* @param rootPath - Root directory path
* @param generateEmbeddings - Whether to regenerate vector embeddings (default: false)
*
* @example
* // Update file content and embeddings after modification
* await fileIndexer.updateFile(
* '/Users/user/project/src/auth.ts',
* '/Users/user/project',
* true
* );
* console.log('File content and embeddings updated');
*
* @example
* // Handle file watcher events
* watcher.on('change', async (filePath) => {
* console.log('File changed:', filePath);
* await fileIndexer.updateFile(filePath, rootPath);
* console.log('Index updated');
* });
*
* @example
* // Batch update multiple changed files
* const changedFiles = await getModifiedFiles();
* for (const file of changedFiles) {
* await fileIndexer.updateFile(file, rootPath);
* }
* console.log('Updated', changedFiles.length, 'files');
*/
async updateFile(filePath: string, rootPath: string, generateEmbeddings: boolean = false): Promise<void> {
// Re-index the file; indexFile compares modification times and regenerates chunks as needed
await this.indexFile(filePath, rootPath, generateEmbeddings);
}
}