Skip to main content
Glama
orneryd

M.I.M.I.R - Multi-agent Intelligent Memory & Insight Repository

by orneryd
check-and-reset-embeddings.js (24.2 kB)
#!/usr/bin/env node
/**
 * Check and Reset Embeddings
 *
 * This script:
 * 1. Checks Neo4j vector index configuration
 * 2. Checks existing embeddings and their dimensions
 * 3. Compares with currently configured dimensions from environment
 * 4. Offers to reset the embedding space if mismatches are found
 *
 * Usage (via npm):
 *   npm run embeddings:check             # Check only
 *   npm run embeddings:reset             # Reset and regenerate embeddings
 *   npm run embeddings:force-reset       # Force reset without asking
 *   npm run embeddings:clear             # Clear embeddings without regenerating
 *   npm run embeddings:generate-missing  # Generate embeddings only for items without them
 *
 * Usage (direct):
 *   node scripts/check-and-reset-embeddings.js                       # Check only
 *   node scripts/check-and-reset-embeddings.js --reset               # Reset and regenerate embeddings
 *   node scripts/check-and-reset-embeddings.js --force               # Force reset without asking
 *   node scripts/check-and-reset-embeddings.js --reset --clear-only  # Clear embeddings without regenerating
 *   node scripts/check-and-reset-embeddings.js --generate-missing    # Generate embeddings only for items without them
 */

import neo4j from 'neo4j-driver';
import readline from 'readline';
import { EmbeddingsService } from '../build/indexing/EmbeddingsService.js';

// Configuration from environment or defaults
const NEO4J_URI = process.env.NEO4J_URI || 'bolt://localhost:7687';
const NEO4J_USER = process.env.NEO4J_USER || 'neo4j';
const NEO4J_PASSWORD = process.env.NEO4J_PASSWORD || 'password';
const CONFIGURED_DIMENSIONS = Number.parseInt(process.env.MIMIR_EMBEDDINGS_DIMENSIONS || '1024', 10);
const CONFIGURED_MODEL = process.env.MIMIR_EMBEDDINGS_MODEL || 'mxbai-embed-large';

// Ensure embeddings are enabled for regeneration
process.env.MIMIR_EMBEDDINGS_ENABLED = 'true';
process.env.MIMIR_FEATURE_VECTOR_EMBEDDINGS = 'true';
process.env.MIMIR_EMBEDDINGS_PROVIDER = process.env.MIMIR_EMBEDDINGS_PROVIDER || 'llama.cpp';
process.env.MIMIR_EMBEDDINGS_MODEL = CONFIGURED_MODEL;
process.env.MIMIR_EMBEDDINGS_DIMENSIONS = CONFIGURED_DIMENSIONS.toString();
process.env.OLLAMA_BASE_URL = process.env.OLLAMA_BASE_URL || 'http://localhost:11434/v1';

// Parse command line arguments
const args = process.argv.slice(2);
const shouldReset = args.includes('--reset') || args.includes('--force');
const forceReset = args.includes('--force');
const clearOnly = args.includes('--clear-only');
const generateMissing = args.includes('--generate-missing');

/**
 * Unwrap a Neo4j Integer-like value into a plain JS number.
 *
 * Any non-null object exposing a `toNumber()` method is converted; every
 * other value (plain numbers, strings, null, undefined) is returned as-is.
 *
 * @param {*} value - Raw value read from a Neo4j record.
 * @returns {*} The unwrapped number, or the original value.
 */
function toNumber(value) {
  if (value !== null && typeof value === 'object' && typeof value.toNumber === 'function') {
    return value.toNumber();
  }
  return value;
}

/**
 * Extract the raw embedding array from an embeddings-service result.
 *
 * The result may be the array itself or an object carrying it under
 * `embedding`.
 *
 * @param {*} embeddingResult - Value returned by `generateEmbedding`.
 * @returns {Array} The non-empty embedding array.
 * @throws {Error} If no non-empty array can be extracted.
 */
function extractEmbeddingVector(embeddingResult) {
  const embedding = Array.isArray(embeddingResult)
    ? embeddingResult
    : (embeddingResult?.embedding || embeddingResult);

  if (!Array.isArray(embedding) || embedding.length === 0) {
    throw new Error(`Invalid embedding format: expected array, got ${typeof embedding}`);
  }
  return embedding;
}

/**
 * Persist an embedding and its metadata on the node(s) with the given id.
 *
 * The match is label-agnostic because the finder queries in this script
 * select nodes of any label. Model and dimensions are always written so the
 * mismatch check does not flag the node on the next run.
 *
 * @param {object} session - Open Neo4j session.
 * @param {*} id - Value of the node's `id` property.
 * @param {Array} embedding - Embedding array to store.
 * @param {string} model - Model name recorded as `embedding_model`.
 * @returns {Promise<number>} Number of nodes updated.
 * @throws {Error} If no node with that id exists.
 */
async function storeEmbedding(session, id, embedding, model) {
  const result = await session.run(
    `
    MATCH (n {id: $id})
    SET n.embedding = $embedding,
        n.embedding_model = $model,
        n.embedding_dimensions = $dimensions,
        n.has_embedding = true
    REMOVE n.needs_embedding
    RETURN count(n) as updated
    `,
    { id, embedding, model, dimensions: embedding.length }
  );

  const updated = toNumber(result.records[0].get('updated'));
  if (updated === 0) {
    // Without this check a write that matched nothing would be reported as a success.
    throw new Error(`No node found with id ${id}`);
  }
  return updated;
}

/**
 * Prompt user for confirmation
 *
 * @param {string} query - Text shown to the user.
 * @returns {Promise<string>} The line the user typed.
 */
function askQuestion(query) {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  return new Promise((resolve) => {
    rl.question(query, (answer) => {
      rl.close();
      resolve(answer);
    });
  });
}

/**
 * Check vector index configuration
 *
 * Looks up the index named `node_embedding_index` and reports its type,
 * dimensions and similarity function. Errors are logged and returned in the
 * result rather than thrown.
 *
 * @param {object} session - Open Neo4j session.
 * @returns {Promise<{exists: boolean, dimensions: (number|null), error?: string}>}
 */
async function checkVectorIndex(session) {
  console.log('\n🔍 Checking vector index configuration...');

  try {
    const result = await session.run(`
      SHOW INDEXES
      YIELD name, type, labelsOrTypes, properties, options
      WHERE name = 'node_embedding_index'
      RETURN name, type, options
    `);

    if (result.records.length === 0) {
      console.log('⚠️ No vector index found (node_embedding_index does not exist)');
      return { exists: false, dimensions: null };
    }

    const record = result.records[0];
    const options = record.get('options');
    const indexConfig = options?.indexConfig || {};
    const dimensions = indexConfig['vector.dimensions'];

    console.log(`✅ Vector index exists: node_embedding_index`);
    console.log(` Type: ${record.get('type')}`);
    console.log(` Dimensions: ${dimensions}`);
    console.log(` Similarity: ${indexConfig['vector.similarity_function'] || 'cosine'}`);

    // parseInt stringifies its argument, so this handles plain numbers,
    // numeric strings and Neo4j Integer objects alike.
    return { exists: true, dimensions: Number.parseInt(dimensions, 10) };
  } catch (error) {
    console.error('❌ Error checking vector index:', error.message);
    return { exists: false, dimensions: null, error: error.message };
  }
}

/**
 * Check existing embeddings in database
 *
 * Counts nodes (any label) that carry an `embedding` property and groups
 * them by stored model / dimensions. Errors are logged and returned in the
 * result rather than thrown.
 *
 * @param {object} session - Open Neo4j session.
 * @returns {Promise<{count: number, models: Array, dimensions: Array, error?: string}>}
 */
async function checkExistingEmbeddings(session) {
  console.log('\n🔍 Checking existing embeddings...');

  try {
    // Count nodes with embeddings (any label)
    const countResult = await session.run(`
      MATCH (n)
      WHERE n.embedding IS NOT NULL
      RETURN count(n) as count
    `);
    const totalCount = toNumber(countResult.records[0].get('count'));

    if (totalCount === 0) {
      console.log('ℹ️ No embeddings found in database');
      return { count: 0, models: [], dimensions: [] };
    }

    // Get embedding statistics (any label)
    const statsResult = await session.run(`
      MATCH (n)
      WHERE n.embedding IS NOT NULL
      RETURN n.embedding_model as model,
             n.embedding_dimensions as dimensions,
             size(n.embedding) as actualDims,
             count(n) as count
      ORDER BY count DESC
    `);

    console.log(`✅ Found ${totalCount} nodes with embeddings:`);

    const models = [];
    const dimensions = [];

    for (const record of statsResult.records) {
      const model = record.get('model') || 'unknown';
      // Prefer stored dimensions, fall back to actual size of embedding array
      const dims = toNumber(record.get('dimensions')) || toNumber(record.get('actualDims')) || null;
      const count = toNumber(record.get('count'));

      models.push({ model, dimensions: dims, count });
      if (dims) {
        dimensions.push(dims);
      }

      console.log(` ${model} (${dims || '?'} dims): ${count} nodes`);
    }

    return { count: totalCount, models, dimensions };
  } catch (error) {
    console.error('❌ Error checking embeddings:', error.message);
    return { count: 0, models: [], dimensions: [], error: error.message };
  }
}

/**
 * Check for chunk embeddings
 *
 * Reports the largest group of `FileChunk` nodes (reached through a
 * `HAS_CHUNK` relationship) that have an embedding, together with its
 * dimensions. Errors are logged and returned in the result rather than thrown.
 *
 * @param {object} session - Open Neo4j session.
 * @returns {Promise<{count: number, dimensions: (number|null), error?: string}>}
 */
async function checkChunkEmbeddings(session) {
  console.log('\n🔍 Checking chunk embeddings...');

  try {
    // ORDER BY makes LIMIT 1 deterministic: the aggregate returns one row per
    // (dimensions, actualDims) group and we report the most common one.
    const result = await session.run(`
      MATCH (n)-[:HAS_CHUNK]->(c:FileChunk)
      WHERE c.embedding IS NOT NULL
      RETURN count(c) as count,
             c.embedding_dimensions as dimensions,
             size(c.embedding) as actualDims
      ORDER BY count DESC
      LIMIT 1
    `);

    if (result.records.length === 0) {
      console.log('ℹ️ No chunk embeddings found');
      return { count: 0, dimensions: null };
    }

    const record = result.records[0];
    const count = toNumber(record.get('count'));
    // Prefer stored dimensions, fall back to actual size of embedding array
    // (same rule as checkExistingEmbeddings).
    const dimensions = toNumber(record.get('dimensions')) || toNumber(record.get('actualDims')) || null;

    console.log(`✅ Found ${count} chunks with embeddings (${dimensions || '?'} dims)`);

    return { count, dimensions };
  } catch (error) {
    console.error('❌ Error checking chunk embeddings:', error.message);
    return { count: 0, dimensions: null, error: error.message };
  }
}

/**
 * Analyze mismatches and report
 *
 * Compares index, node and chunk dimensions against CONFIGURED_DIMENSIONS
 * and logs the outcome of each comparison.
 *
 * @param {{exists: boolean, dimensions: (number|null)}} indexInfo
 * @param {{dimensions: Array}} embeddingsInfo
 * @param {{count: number, dimensions: (number|null)}} chunksInfo
 * @returns {string[]} Subset of `['index', 'embeddings', 'chunks']` that mismatched.
 */
function analyzeMismatches(indexInfo, embeddingsInfo, chunksInfo) {
  console.log('\n📊 Analysis:');
  console.log(` Configured dimensions: ${CONFIGURED_DIMENSIONS}`);
  console.log(` Configured model: ${CONFIGURED_MODEL}`);

  const mismatches = [];

  // Check index dimensions
  if (indexInfo.exists && indexInfo.dimensions !== CONFIGURED_DIMENSIONS) {
    console.log(` ⚠️ Index dimensions (${indexInfo.dimensions}) != configured (${CONFIGURED_DIMENSIONS})`);
    mismatches.push('index');
  } else if (indexInfo.exists) {
    console.log(` ✅ Index dimensions match configured`);
  }

  // Check embedding dimensions
  const uniqueDims = [...new Set(embeddingsInfo.dimensions)];
  if (uniqueDims.length > 0) {
    const allMatch = uniqueDims.every((d) => d === CONFIGURED_DIMENSIONS);
    if (!allMatch) {
      console.log(` ⚠️ Embedding dimensions ${JSON.stringify(uniqueDims)} != configured (${CONFIGURED_DIMENSIONS})`);
      mismatches.push('embeddings');
    } else {
      console.log(` ✅ All embedding dimensions match configured`);
    }
  }

  // Check chunk dimensions
  if (chunksInfo.count > 0 && chunksInfo.dimensions !== CONFIGURED_DIMENSIONS) {
    console.log(` ⚠️ Chunk dimensions (${chunksInfo.dimensions}) != configured (${CONFIGURED_DIMENSIONS})`);
    mismatches.push('chunks');
  } else if (chunksInfo.count > 0) {
    console.log(` ✅ Chunk dimensions match configured`);
  }

  return mismatches;
}

/**
 * Find nodes with mismatched embeddings
 *
 * A node (any label) is mismatched when it has an embedding and its stored
 * dimensions, stored model, or actual array size differ from the configured
 * values (or the stored metadata is missing).
 *
 * @param {object} session - Open Neo4j session.
 * @returns {Promise<Array<object>>} Mismatched nodes with id, type, text fields and current metadata.
 * @throws Re-throws any query error after logging it.
 */
async function findMismatchedNodes(session) {
  console.log('\n🔍 Finding nodes with mismatched embeddings...');

  try {
    const result = await session.run(
      `
      MATCH (n)
      WHERE n.embedding IS NOT NULL
        AND (n.embedding_dimensions IS NULL
          OR n.embedding_dimensions <> $configuredDims
          OR n.embedding_model IS NULL
          OR n.embedding_model <> $configuredModel
          OR size(n.embedding) <> $configuredDims)
      RETURN n.id as id,
             n.type as type,
             n.title as title,
             n.content as content,
             n.text as text,
             n.embedding_dimensions as currentDims,
             n.embedding_model as currentModel,
             size(n.embedding) as actualDims
      ORDER BY n.type, n.id
      `,
      {
        configuredDims: CONFIGURED_DIMENSIONS,
        configuredModel: CONFIGURED_MODEL,
      }
    );

    const nodes = result.records.map((r) => ({
      id: r.get('id'),
      type: r.get('type'),
      title: r.get('title'),
      content: r.get('content'),
      text: r.get('text'),
      currentDims: toNumber(r.get('currentDims')) || null,
      currentModel: r.get('currentModel'),
    }));

    if (nodes.length === 0) {
      console.log(' ✅ No mismatched nodes found');
    } else {
      console.log(` ⚠️ Found ${nodes.length} nodes with mismatched embeddings`);
    }

    return nodes;
  } catch (error) {
    console.error('❌ Error finding mismatched nodes:', error.message);
    throw error;
  }
}

/**
 * Find nodes without embeddings
 *
 * Selects up to 10000 nodes that have a `type` property or the `FileChunk`
 * label, carry `content` or `text`, and have no (or an empty) embedding.
 *
 * @param {object} session - Open Neo4j session.
 * @returns {Promise<Array<{id: *, type: string, content: string}>>}
 */
async function findNodesWithoutEmbeddings(session) {
  console.log('\n🔍 Finding nodes and chunks without embeddings...');

  // Find both regular nodes and FileChunks without embeddings
  const result = await session.run(`
    MATCH (n)
    WHERE (n.type IS NOT NULL OR n:FileChunk)
      AND (n.embedding IS NULL OR size(n.embedding) = 0)
      AND (n.content IS NOT NULL OR n.text IS NOT NULL)
    RETURN n.id as id,
           labels(n)[0] as type,
           coalesce(n.content, n.text) as content
    LIMIT 10000
  `);

  const nodes = result.records.map((record) => ({
    id: record.get('id'),
    type: record.get('type'),
    content: record.get('content'),
  }));

  console.log(` Found ${nodes.length} nodes without embeddings`);
  return nodes;
}

/**
 * Generate embeddings for nodes that don't have them
 *
 * Each node is processed independently; a failure is logged and counted but
 * does not stop the run. After processing, aggregate statistics over all
 * stored embeddings are printed.
 *
 * @param {object} session - Open Neo4j session.
 * @returns {Promise<void>}
 * @throws {Error} If the embeddings service reports it is not enabled.
 */
async function generateMissingEmbeddings(session) {
  console.log('\n🔄 Generating missing embeddings...');

  // Initialize embeddings service
  const embeddingsService = new EmbeddingsService();
  await embeddingsService.initialize();

  if (!embeddingsService.isEnabled()) {
    console.error('❌ Embeddings service is not enabled. Check your configuration.');
    throw new Error('Embeddings service not enabled');
  }

  // Find nodes without embeddings
  const nodes = await findNodesWithoutEmbeddings(session);

  if (nodes.length === 0) {
    console.log(' ✅ All nodes already have embeddings');
    return;
  }

  console.log(` Model: ${CONFIGURED_MODEL}`);
  console.log(` Dimensions: ${CONFIGURED_DIMENSIONS}`);
  console.log(` Processing ${nodes.length} nodes...`);

  let processed = 0;
  let errors = 0;

  for (const node of nodes) {
    try {
      // The query guarantees non-null content, but it may still be an empty string.
      if (!node.content) continue;

      // Generate embedding
      const embeddingResult = await embeddingsService.generateEmbedding(node.content);
      const embedding = extractEmbeddingVector(embeddingResult);

      // Store the embedding together with its model/dimension metadata;
      // nodes lacking that metadata are treated as mismatched by
      // findMismatchedNodes.
      await storeEmbedding(session, node.id, embedding, embeddingResult.model || CONFIGURED_MODEL);

      processed++;
      if (processed % 10 === 0) {
        console.log(` Progress: ${processed}/${nodes.length}`);
      }
    } catch (error) {
      errors++;
      console.error(` ⚠️ Failed for node ${node.id} (${node.type}): ${error.message}`);
    }
  }

  console.log(`\n✅ Generated embeddings for ${processed} nodes (${errors} errors)`);

  // Verify embeddings were stored
  console.log('\n🔍 Verifying embeddings in database...');
  const verifyResult = await session.run(`
    MATCH (n)
    WHERE n.embedding IS NOT NULL
    RETURN count(n) as total,
           avg(size(n.embedding)) as avgDimensions,
           min(size(n.embedding)) as minDimensions,
           max(size(n.embedding)) as maxDimensions
  `);

  const verify = verifyResult.records[0];
  const minDims = toNumber(verify.get('minDimensions'));
  const maxDims = toNumber(verify.get('maxDimensions'));
  const avgDims = Math.round(toNumber(verify.get('avgDimensions')));
  console.log(` Total nodes with embeddings: ${toNumber(verify.get('total'))}`);
  console.log(` Embedding dimensions: min=${minDims}, max=${maxDims}, avg=${avgDims}`);

  if (errors > 0) {
    console.log(`\n⚠️ ${errors} nodes failed to generate embeddings`);
  }
}

/**
 * Regenerate embeddings for mismatched nodes
 *
 * Processes nodes sequentially with a 100 ms pause between successful
 * updates. Nodes without any text (content, text or title) are skipped and
 * counted as errors.
 *
 * @param {object} session - Open Neo4j session.
 * @param {Array<object>} nodes - Nodes as returned by `findMismatchedNodes`.
 * @returns {Promise<{successCount: number, errorCount: number}>}
 * @throws {Error} If the embeddings service reports it is not enabled.
 */
async function regenerateEmbeddings(session, nodes) {
  console.log('\n🔄 Regenerating embeddings...');

  // Initialize embeddings service
  const embeddingsService = new EmbeddingsService();
  await embeddingsService.initialize();

  if (!embeddingsService.isEnabled()) {
    console.error('❌ Embeddings service is not enabled. Check your configuration.');
    throw new Error('Embeddings service not enabled');
  }

  console.log(` Model: ${CONFIGURED_MODEL}`);
  console.log(` Dimensions: ${CONFIGURED_DIMENSIONS}`);
  console.log(` Processing ${nodes.length} nodes...\n`);

  let successCount = 0;
  let errorCount = 0;

  for (let i = 0; i < nodes.length; i++) {
    const node = nodes[i];
    const progress = `[${i + 1}/${nodes.length}]`;

    console.log(`${progress} Processing: ${node.type} - ${node.id}`);
    if (node.title) {
      console.log(` Title: ${node.title.substring(0, 60)}${node.title.length > 60 ? '...' : ''}`);
    }
    if (node.currentModel) {
      console.log(` Current: ${node.currentModel} (${node.currentDims || '?'} dims)`);
    }

    // Get text content for embedding
    const textContent = node.content || node.text || node.title || '';
    if (!textContent) {
      console.log(' ⚠️ No text content found, skipping');
      errorCount++;
      continue;
    }

    try {
      // Generate new embedding
      const embeddingResult = await embeddingsService.generateEmbedding(textContent);
      const embedding = extractEmbeddingVector(embeddingResult);

      // Update node with new embedding. The write is label-agnostic because
      // findMismatchedNodes selects nodes of any label; it throws when no
      // node was matched so the success count stays truthful.
      await storeEmbedding(session, node.id, embedding, embeddingResult.model || CONFIGURED_MODEL);

      // Report the actual vector length: a bare-array result has no `.dimensions` field.
      console.log(` ✅ Updated (${embedding.length} dims)`);
      successCount++;

      // Small delay to avoid overwhelming the service
      if (i < nodes.length - 1) {
        await new Promise((resolve) => setTimeout(resolve, 100));
      }
    } catch (error) {
      console.error(` ❌ Failed: ${error.message}`);
      errorCount++;
    }
  }

  console.log(`\n✅ Regeneration complete!`);
  console.log(` Success: ${successCount} nodes`);
  if (errorCount > 0) {
    console.log(` Errors: ${errorCount} nodes`);
  }

  return { successCount, errorCount };
}

/**
 * Reset embedding space (full reset)
 *
 * Drops and recreates `node_embedding_index` with CONFIGURED_DIMENSIONS,
 * then either regenerates mismatched embeddings (`regenerate = true`) or
 * removes all embedding properties (`regenerate = false`).
 *
 * @param {object} session - Open Neo4j session.
 * @param {boolean} [regenerate=true] - Regenerate instead of only clearing.
 * @returns {Promise<void>}
 * @throws Re-throws any error after logging it.
 */
async function resetEmbeddingSpace(session, regenerate = true) {
  console.log('\n🔄 Resetting embedding space...');

  try {
    // Step 1: Drop vector index
    console.log('\n1️⃣ Dropping vector index...');
    await session.run('DROP INDEX node_embedding_index IF EXISTS');
    console.log(' ✅ Vector index dropped');

    // Step 2: Create new vector index with configured dimensions.
    // The dimension is interpolated into the statement text; main() rejects
    // anything that is not a positive integer before this function is reached.
    console.log(`\n2️⃣ Creating vector index with ${CONFIGURED_DIMENSIONS} dimensions...`);
    await session.run(`
      CREATE VECTOR INDEX node_embedding_index IF NOT EXISTS
      FOR (n:Node) ON (n.embedding)
      OPTIONS {indexConfig: {
        \`vector.dimensions\`: ${CONFIGURED_DIMENSIONS},
        \`vector.similarity_function\`: 'cosine'
      }}
    `);
    console.log(' ✅ Vector index created');

    if (!regenerate) {
      // Step 3a: Clear embeddings from nodes (if not regenerating)
      console.log('\n3️⃣ Clearing embeddings from nodes (any label)...');
      const nodeResult = await session.run(`
        MATCH (n)
        WHERE n.embedding IS NOT NULL
        REMOVE n.embedding, n.embedding_model, n.embedding_dimensions, n.has_embedding
        RETURN count(n) as count
      `);
      const nodeCount = toNumber(nodeResult.records[0].get('count'));
      console.log(` ✅ Cleared embeddings from ${nodeCount} nodes`);

      // Step 4: Clear embeddings from chunks
      console.log('\n4️⃣ Clearing embeddings from FileChunks...');
      const chunkResult = await session.run(`
        MATCH (c:FileChunk)
        WHERE c.embedding IS NOT NULL
        REMOVE c.embedding, c.embedding_model, c.embedding_dimensions
        RETURN count(c) as count
      `);
      const chunkCount = toNumber(chunkResult.records[0].get('count'));
      console.log(` ✅ Cleared embeddings from ${chunkCount} chunks`);

      console.log('\n✅ Embedding space reset complete!');
      console.log('\n📝 Next steps:');
      console.log(' Re-index your files to generate new embeddings:');
      console.log(' npm run index-docs');
    } else {
      // Step 3b: Find and regenerate mismatched embeddings
      const nodes = await findMismatchedNodes(session);

      if (nodes.length > 0) {
        await regenerateEmbeddings(session, nodes);
      }

      console.log('\n✅ Embedding space reset complete!');
    }
  } catch (error) {
    console.error('\n❌ Error resetting embedding space:', error.message);
    throw error;
  }
}

/**
 * Main function
 *
 * Validates configuration, connects to Neo4j, then runs the mode selected by
 * the command line flags. The session and driver are always closed.
 *
 * @returns {Promise<void>}
 * @throws {Error} On invalid configuration or any unrecovered failure.
 */
async function main() {
  console.log('🔧 Mimir Embedding Space Checker\n');
  console.log('Configuration:');
  console.log(` Neo4j URI: ${NEO4J_URI}`);
  console.log(` Dimensions: ${CONFIGURED_DIMENSIONS}`);
  console.log(` Model: ${CONFIGURED_MODEL}`);

  // Fail fast on a bad dimension value: it is interpolated into the index
  // DDL, and a reset drops the old index before creating the new one.
  if (!Number.isInteger(CONFIGURED_DIMENSIONS) || CONFIGURED_DIMENSIONS <= 0) {
    throw new Error(
      `MIMIR_EMBEDDINGS_DIMENSIONS must be a positive integer (parsed: ${CONFIGURED_DIMENSIONS})`
    );
  }

  const driver = neo4j.driver(
    NEO4J_URI,
    neo4j.auth.basic(NEO4J_USER, NEO4J_PASSWORD)
  );

  const session = driver.session();

  try {
    // Handle generate-missing mode separately
    if (generateMissing) {
      console.log('\n📝 Generate Missing Mode: Will only generate embeddings for items without them\n');
      await generateMissingEmbeddings(session);
      // The final "Done!" is printed once by the top-level runner.
      return;
    }

    // Check current state
    const indexInfo = await checkVectorIndex(session);
    const embeddingsInfo = await checkExistingEmbeddings(session);
    const chunksInfo = await checkChunkEmbeddings(session);

    // Analyze mismatches
    const mismatches = analyzeMismatches(indexInfo, embeddingsInfo, chunksInfo);

    // If force or clear-only is set, proceed regardless of mismatches
    if (!forceReset && !shouldReset && mismatches.length === 0) {
      console.log('\n✅ All embeddings match configured dimensions. No action needed.');
      return;
    }

    // Report mismatches (if any)
    if (mismatches.length > 0) {
      console.log('\n⚠️ Mismatches detected:');
      for (const m of mismatches) {
        console.log(` - ${m}`);
      }
    } else if (forceReset || shouldReset) {
      console.log('\n✅ No mismatches detected, but proceeding with reset as requested...');
    }

    // Decide whether to reset
    let doReset = false;

    if (forceReset) {
      console.log('\n🔨 Force reset enabled. Proceeding without confirmation...');
      doReset = true;
    } else if (shouldReset) {
      if (clearOnly) {
        console.log('\n⚠️ WARNING: This will:');
        console.log(' 1. Drop the existing vector index');
        console.log(' 2. Clear all embeddings from nodes and chunks');
        console.log(' 3. Create a new vector index with configured dimensions');
        console.log(' 4. You will need to re-index files to generate new embeddings');
      } else {
        console.log('\n⚠️ WARNING: This will:');
        console.log(' 1. Drop the existing vector index');
        console.log(' 2. Create a new vector index with configured dimensions');
        console.log(' 3. Find all nodes with mismatched embeddings');
        console.log(' 4. Regenerate embeddings for those nodes (this may take a while)');
      }

      const answer = await askQuestion('\nProceed with reset? (yes/no): ');
      doReset = answer.toLowerCase() === 'yes' || answer.toLowerCase() === 'y';
    } else {
      console.log('\n💡 Tip: Run with --reset to fix mismatches automatically');
      console.log(' npm run embeddings:reset');
      console.log(' or: node scripts/check-and-reset-embeddings.js --reset');
      return;
    }

    if (doReset) {
      await resetEmbeddingSpace(session, !clearOnly);
    } else {
      console.log('\n❌ Reset cancelled by user.');
    }
  } catch (error) {
    console.error('\n❌ Fatal error:', error.message);
    throw error;
  } finally {
    // Close the driver even if closing the session fails.
    try {
      await session.close();
    } finally {
      await driver.close();
    }
  }
}

// Run the script
main()
  .then(() => {
    console.log('\n✅ Done!');
    process.exit(0);
  })
  .catch((error) => {
    console.error('\n❌ Failed:', error);
    process.exit(1);
  });

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/orneryd/Mimir'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.