index-vectors
Index project files for semantic search by creating vector embeddings, enabling AI-powered code and document retrieval within development tools.
Instructions
Index project files for semantic search using vector embeddings
Input Schema
The input parameters are listed below; the corresponding JSON Schema is defined in code (see Implementation Reference).
| Name | Required | Description | Default |
|---|---|---|---|
| path | No | Project path to index (defaults to current directory) | `process.cwd()` |
| provider | No | Embedding provider to use (defaults to configured provider) | |
| force | No | Force re-indexing of all files | `false` |
Implementation Reference
- src/handlers/vector.ts:36-84 (handler)Main handler function for the 'index-vectors' tool. Sets up configuration, embedding provider, checks for existing vectors, calls indexProject helper, and formats the response.export async function handleIndexVectors(args: IndexVectorsInput): Promise<string> { const configManager = new ConfigManager(); const config = await configManager.getConfig(); logger.log('Starting vector indexing...'); try { // Get embedding provider let provider: EmbeddingProvider; if (args.provider) { provider = new EmbeddingProvider({ provider: args.provider }, configManager); } else { provider = await getDefaultEmbeddingProvider(configManager); } // Get current vector count const currentCount = await getVectorCount(args.path); if (currentCount > 0 && !args.force) { return `Found ${currentCount} existing vectors. Use force=true to re-index.`; } // Index the project const messages: string[] = []; const result = await indexProject({ projectPath: args.path, provider, config: config.vectorConfig || VectorConfigSchema.parse({}), force: args.force, onProgress: (message) => { messages.push(message); logger.log(message); }, }); const totalCount = await getVectorCount(args.path); return `Indexing complete! Files indexed: ${result.filesIndexed} Chunks created: ${result.chunksCreated} Time: ${(result.timeMs / 1000).toFixed(1)}s Total vectors: ${totalCount} Progress: ${messages.join('\n')}`; } catch (error) { logger.error('Vector indexing failed:', error); throw new Error(`Indexing failed: ${error instanceof Error ? error.message : String(error)}`); } }
- src/handlers/vector.ts:10-15 (schema)Zod input schema defining parameters for the index-vectors tool: path, provider, and force flag.// Input schema for index-vectors tool export const IndexVectorsSchema = z.object({ path: z.string().default(process.cwd()), provider: z.enum(['openai', 'azure', 'gemini']).optional(), force: z.boolean().default(false), });
- src/server.ts:445-465 (registration)Tool registration in the MCP server, specifying title, description, schema, and handler wrapper that imports and calls handleIndexVectors.// Register vector indexing tools server.registerTool("index-vectors", { title: "Index Vectors", description: "Index project files for semantic search using vector embeddings", inputSchema: IndexVectorsSchema.shape, }, async (args) => { const { handleIndexVectors } = await import("./handlers/vector"); const result = await handleIndexVectors({ path: args.path || process.cwd(), provider: args.provider, force: args.force || false, }); return { content: [ { type: "text", text: result } ] }; });
- src/vector/indexer.ts:27-203 (helper)Core helper function implementing the vector indexing logic: scans files, splits into chunks, generates embeddings using the provider, and stores in vector database with deduplication and batching.export async function indexProject(options: IndexingOptions): Promise<IndexingResult> { const startTime = Date.now(); const { projectPath, provider, config, force = false, onProgress } = options; onProgress?.('Initializing vector database...'); const { db, client } = await getVectorDB(projectPath); // Update .gitignore if needed await updateGitignore(projectPath); // Get files to index onProgress?.('Scanning project files...'); const files = await getFilesToIndex(projectPath, config.filePatterns); if (files.length === 0) { logger.warn('No files found to index'); return { filesIndexed: 0, chunksCreated: 0, timeMs: Date.now() - startTime }; } onProgress?.(`Found ${files.length} files to process`); // Create text splitter const splitter = new RecursiveCharacterTextSplitter({ chunkSize: config.chunkSize, chunkOverlap: config.chunkOverlap, }); let filesIndexed = 0; let chunksCreated = 0; // Process files in batches for (let i = 0; i < files.length; i += config.batchSize) { const batch = files.slice(i, i + config.batchSize); const batchChunks: Array<{ id: string; relpath: string; chunk: string; hash: string; mtimeMs: number; embedding?: number[]; }> = []; // Process batch for (const filePath of batch) { try { const relPath = relative(projectPath, filePath); const stats = await stat(filePath); const content = await readFile(filePath, 'utf-8'); // Skip empty files if (!content.trim()) continue; // Split into chunks const chunks = await splitter.splitText(content); for (let idx = 0; idx < chunks.length; idx++) { const chunk = chunks[idx]; const id = `${relPath}#${idx}`; const hash = createHash('sha256').update(chunk).digest('hex'); // Check if chunk already exists with same hash if (!force) { const result = await client.execute({ sql: `SELECT 
hash, mtime_ms FROM vector_chunks WHERE id = ?`, args: [id] }); if (result.rows.length > 0) { const row = result.rows[0]; const existing = { hash: row[0] as string, mtime_ms: row[1] as number }; if (existing.hash === hash && existing.mtime_ms === stats.mtimeMs) { continue; // Skip unchanged chunk } } } batchChunks.push({ id, relpath: relPath, chunk, hash, mtimeMs: stats.mtimeMs, }); } filesIndexed++; } catch (error) { logger.error(`Error processing file ${filePath}:`, error); } } // Generate embeddings for batch if (batchChunks.length > 0) { onProgress?.(`Generating embeddings for batch ${Math.floor(i / config.batchSize) + 1}...`); try { const texts = batchChunks.map(c => c.chunk); const embeddings = await provider.getEmbeddings(texts); // Store chunks with embeddings (dual-table approach) for (let j = 0; j < batchChunks.length; j++) { const chunk = batchChunks[j]; const embedding = embeddings[j]; // Validate embedding dimensions if (embedding.length !== 1536) { logger.warn(`Embedding dimension mismatch for ${chunk.id}: expected 1536, got ${embedding.length}`); continue; } try { // 1. Insert/update metadata in main table const result = await client.execute({ sql: `INSERT OR REPLACE INTO vector_chunks (id, relpath, chunk, hash, mtime_ms) VALUES (?, ?, ?, ?, ?)`, args: [ chunk.id, chunk.relpath, chunk.chunk, chunk.hash, chunk.mtimeMs, ] }); // 2. Get the rowid for linking const rowidResult = await client.execute({ sql: `SELECT rowid FROM vector_chunks WHERE id = ?`, args: [chunk.id] }); if (rowidResult.rows.length > 0) { const rowid = rowidResult.rows[0][0] as number; // 3. 
Try to insert into VSS virtual table try { // VSS requires DELETE before INSERT for updates await client.execute({ sql: `DELETE FROM vss_vectors WHERE rowid = ?`, args: [rowid] }); await client.execute({ sql: `INSERT INTO vss_vectors (rowid, embedding) VALUES (?, ?)`, args: [rowid, new Float32Array(embedding).buffer] }); } catch (vssError) { // VSS not available, fallback to adding embedding to main table await client.execute({ sql: `UPDATE vector_chunks SET embedding = ? WHERE id = ?`, args: [float32ArrayToBuffer(embedding), chunk.id] }); } chunksCreated++; } } catch (error) { logger.error(`Error storing chunk ${chunk.id}:`, error); } } } catch (error) { logger.error('Error generating embeddings:', error); throw error; } } onProgress?.(`Processed ${Math.min(i + config.batchSize, files.length)} / ${files.length} files`); } const timeMs = Date.now() - startTime; onProgress?.(`Indexing complete: ${filesIndexed} files, ${chunksCreated} chunks in ${(timeMs / 1000).toFixed(1)}s`); return { filesIndexed, chunksCreated, timeMs }; }