import Database from 'better-sqlite3';
import { DocumentChunk, DocumentChunkOptimized, VectorIndexStatistics, StorageError, ContentMetadata } from '../types/index.js';
import { log } from './Logger.js';
import fs from 'node:fs';
import path from 'node:path';
// Database row interfaces for sqlite-vec
interface VecChunkRow {
chunk_id: string;
file_path: string;
chunk_index: number;
content: string;
chunk_offset: number;
token_count: number;
created_at: string;
distance?: number;
}
interface DocumentStats {
documents: number;
total_chunks: number;
total_tokens: number;
}
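// The SQL below assumes a sqlite-vec vec0 virtual table roughly like the
// following (an illustrative sketch, not the authoritative schema; the real
// DDL lives in the migration layer, and the embedding dimension must match
// the model's output, e.g. float[512] for universal-sentence-encoder):
//
//   CREATE VIRTUAL TABLE vec_chunks USING vec0(
//     embedding float[512],
//     chunk_id TEXT,
//     file_path TEXT,
//     chunk_index INTEGER,
//     content TEXT,
//     chunk_offset INTEGER,
//     token_count INTEGER,
//     created_at TEXT
//   );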
export class VectorRepository {
private db: Database.Database;
private dbPath: string;
constructor(db: Database.Database, dbPath: string) {
this.db = db;
this.dbPath = dbPath;
}
/**
* Store document chunks with their embeddings using sqlite-vec
* @param chunks Array of document chunks
* @returns Number of chunks stored
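*
* @example
* // Illustrative; assumes `repo` is a VectorRepository and `chunks` came
* // from an embedding pipeline, each with an `embedding` whose length
* // matches the vec0 column's declared dimension:
* const stored = await repo.storeChunks(chunks);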
*/
async storeChunks(chunks: DocumentChunk[]): Promise<number> {
if (chunks.length === 0) return 0;
const timer = log.time('store-chunks');
log.info('Starting sqlite-vec chunk storage', {
totalChunks: chunks.length,
uniqueFiles: new Set(chunks.map(c => c.filePath)).size
});
try {
const transaction = this.db.transaction((chunkList: DocumentChunk[]) => {
// Group chunks by file
const fileGroups = new Map<string, DocumentChunk[]>();
for (const chunk of chunkList) {
if (!fileGroups.has(chunk.filePath)) {
fileGroups.set(chunk.filePath, []);
}
fileGroups.get(chunk.filePath)!.push(chunk);
}
// Update documents table
const docStmt = this.db.prepare(`
INSERT OR REPLACE INTO documents
(file_path, file_name, last_modified, total_chunks, total_tokens, updated_at)
VALUES (?, ?, ?, ?, ?, datetime('now'))
`);
for (const [filePath, fileChunks] of fileGroups) {
const firstChunk = fileChunks[0];
docStmt.run(
filePath,
path.basename(filePath),
firstChunk.metadata.lastModified.toISOString(),
fileChunks.length,
fileChunks.reduce((sum, c) => sum + c.metadata.tokenCount, 0)
);
}
// Store chunks in vec0 table using sqlite-vec
// Use CAST to force SQLite INTEGER conversion from JavaScript numbers
const vecStmt = this.db.prepare(`
INSERT OR REPLACE INTO vec_chunks
(embedding, chunk_id, file_path, chunk_index, content, chunk_offset, token_count, created_at)
VALUES (?, ?, ?, CAST(? AS INTEGER), ?, CAST(? AS INTEGER), CAST(? AS INTEGER), ?)
`);
for (const chunk of chunkList) {
// sqlite-vec expects Float32Array for embeddings
const embedding = new Float32Array(chunk.embedding);
vecStmt.run(
embedding,
chunk.id,
chunk.filePath,
chunk.chunkIndex,
chunk.content,
chunk.metadata.chunkOffset,
chunk.metadata.tokenCount,
new Date().toISOString()
);
}
});
transaction(chunks);
timer();
log.info('sqlite-vec chunk storage completed', { totalStored: chunks.length });
return chunks.length;
} catch (error: any) {
log.error('sqlite-vec chunk storage failed', error);
throw new StorageError(
`Failed to store chunks: ${error.message}`,
error
);
}
}
/**
* Search for similar chunks using sqlite-vec native vector similarity search (ASYNC)
* @param queryEmbedding Query embedding vector
* @param limit Maximum results to return
* @param minScore Minimum similarity score (0-1)
* @returns Sorted array of matching chunks with scores
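*
* @example
* // Illustrative; `embed` stands in for whatever model produced the stored
* // vectors (its output dimension must match the index):
* const hits = await repo.searchSimilar(await embed('retry logic'), 5, 0.3);
* hits.forEach(h => log.debug('hit', { file: h.filePath, score: h.score }));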
*/
async searchSimilar(queryEmbedding: number[], limit: number = 10, minScore: number = 0.0): Promise<DocumentChunk[]> {
const timer = log.time('vector-search');
log.debug('Starting sqlite-vec native vector search', {
queryEmbeddingSize: queryEmbedding.length,
limit,
minScore
});
try {
return await new Promise((resolve, reject) => {
setImmediate(() => {
try {
// Convert query embedding to Float32Array for sqlite-vec
const queryVector = new Float32Array(queryEmbedding);
// Use sqlite-vec's MATCH syntax with k parameter for KNN search
const stmt = this.db.prepare(`
SELECT
chunk_id, file_path, chunk_index, content,
chunk_offset, token_count, created_at,
distance
FROM vec_chunks
WHERE embedding MATCH ?
AND k = ?
ORDER BY distance
`);
const allRows = stmt.all(
queryVector,
limit
) as VecChunkRow[];
// sqlite-vec reports a distance (smaller = more similar); convert it to a
// similarity so minScore keeps its documented 0-1 meaning. This assumes the
// embedding column uses cosine distance (similarity = 1 - distance).
const rows = allRows
.map(row => ({ ...row, score: 1 - (row.distance ?? 0) }))
.filter(row => row.score >= minScore);
timer();
log.info('sqlite-vec vector search completed', {
totalResults: rows.length,
topScore: rows[0]?.score ?? 0
});
const results = rows.map(row => ({
id: row.chunk_id,
filePath: row.file_path,
chunkIndex: row.chunk_index,
content: row.content,
embedding: [], // Don't return embeddings to save memory
score: row.score,
metadata: {
fileSize: 0,
lastModified: new Date(row.created_at),
chunkOffset: row.chunk_offset,
tokenCount: row.token_count
}
}));
resolve(results);
} catch (error: any) {
log.error('sqlite-vec search failed', error);
reject(new StorageError(
`Native vector search failed: ${error.message}`,
error
));
}
});
});
} catch (error: any) {
// Error already logged above in inner catch
throw error;
}
}
/**
* Get chunk by ID
* @param chunkId Chunk identifier
* @returns DocumentChunk or null if not found
*/
async getChunk(chunkId: string): Promise<DocumentChunk | null> {
try {
const stmt = this.db.prepare(`
SELECT chunk_id, file_path, chunk_index, content, chunk_offset, token_count, created_at
FROM vec_chunks
WHERE chunk_id = ?
`);
const row = stmt.get(chunkId) as VecChunkRow | undefined;
if (!row) {
return null;
}
return {
id: row.chunk_id,
filePath: row.file_path,
chunkIndex: row.chunk_index,
content: row.content,
embedding: [],
score: 0,
metadata: {
fileSize: 0,
lastModified: new Date(row.created_at),
chunkOffset: row.chunk_offset,
tokenCount: row.token_count
}
};
} catch (error: any) {
throw new StorageError(
`Failed to get chunk: ${error.message}`,
error
);
}
}
/**
* Get all chunks for a specific file (ASYNC)
* @param filePath File path
* @returns Array of document chunks
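*
* @example
* // Illustrative: read back one file's chunks in chunk order, e.g. to
* // reassemble its indexed text (returned chunks omit embeddings):
* const chunks = await repo.getFileChunks('/docs/guide.md');
* const text = chunks.map(c => c.content).join('');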
*/
async getFileChunks(filePath: string): Promise<DocumentChunk[]> {
try {
// Defer the synchronous query with setImmediate so pending events get a
// turn first; better-sqlite3 still blocks while the query itself runs
return await new Promise((resolve, reject) => {
setImmediate(() => {
try {
const stmt = this.db.prepare(`
SELECT chunk_id, file_path, chunk_index, content, chunk_offset, token_count, created_at
FROM vec_chunks
WHERE file_path = ? ORDER BY chunk_index
`);
const rows = stmt.all(filePath) as VecChunkRow[];
const results = rows.map(row => ({
id: row.chunk_id,
filePath: row.file_path,
chunkIndex: row.chunk_index,
content: row.content,
embedding: [],
score: 0,
metadata: {
fileSize: 0,
lastModified: new Date(row.created_at),
chunkOffset: row.chunk_offset,
tokenCount: row.token_count
}
}));
resolve(results);
} catch (error: any) {
reject(new StorageError(
`Failed to get file chunks: ${error.message}`,
error
));
}
});
});
} catch (error: any) {
// Already wrapped as a StorageError by the inner catch; rethrow as-is
throw error;
}
}
/**
* Delete chunks for a specific file
* @param filePath File path
* @returns Number of chunks deleted
*/
async deleteFile(filePath: string): Promise<number> {
try {
// Remove the file's rows from both tables in a single transaction so a
// failure cannot leave the index half-deleted
const deleteTx = this.db.transaction((fp: string) => {
const vecResult = this.db.prepare('DELETE FROM vec_chunks WHERE file_path = ?').run(fp);
this.db.prepare('DELETE FROM documents WHERE file_path = ?').run(fp);
return vecResult.changes;
});
return deleteTx(filePath);
} catch (error: any) {
throw new StorageError(
`Failed to delete file: ${error.message}`,
error
);
}
}
/**
* Get index statistics (ASYNC)
* @returns Statistics about the index
*/
async getStatistics(): Promise<VectorIndexStatistics> {
try {
// Defer the synchronous query with setImmediate so pending events get a
// turn first; better-sqlite3 still blocks while the query itself runs
return await new Promise((resolve, reject) => {
setImmediate(async () => {
try {
const docStats = this.db.prepare(`
SELECT COUNT(*) as documents,
SUM(total_chunks) as total_chunks,
SUM(total_tokens) as total_tokens
FROM documents
`).get() as DocumentStats | undefined;
let dbSize = 0;
try {
const stats = await fs.promises.stat(this.dbPath);
dbSize = stats.size;
} catch {
// The database file may not exist yet; fall back to a size of 0
}
const result = {
totalChunks: docStats?.total_chunks || 0,
totalFiles: docStats?.documents || 0,
totalTokens: docStats?.total_tokens || 0,
embeddingModel: 'universal-sentence-encoder',
lastUpdated: new Date(),
dbSize
};
resolve(result);
} catch (error: any) {
reject(new StorageError(
`Failed to get statistics: ${error.message}`,
error
));
}
});
});
} catch (error: any) {
// Already wrapped as a StorageError by the inner catch; rethrow as-is
throw error;
}
}
/**
* Store content metadata for enhanced document classification
* @param chunkId Chunk identifier
* @param metadata Enhanced content metadata
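*
* @example
* // Illustrative call; the field values are hypothetical and must satisfy
* // the ContentMetadata type from ../types/index.js:
* await repo.storeContentMetadata(chunk.id, {
*   contentType: 'code',
*   language: 'typescript',
*   domainTags: ['database', 'vector-search'],
*   qualityScore: 0.9,
*   sourceAuthority: 0.8,
*   fileExtension: '.ts',
*   hasComments: true,
*   hasDocumentation: true
* });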
*/
async storeContentMetadata(chunkId: string, metadata: ContentMetadata): Promise<void> {
try {
const stmt = this.db.prepare(`
INSERT OR REPLACE INTO content_metadata
(chunk_id, content_type, language, domain_tags, quality_score,
source_authority, file_extension, has_comments, has_documentation,
processed_content, raw_content, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))
`);
stmt.run(
chunkId,
metadata.contentType,
metadata.language,
JSON.stringify(metadata.domainTags),
metadata.qualityScore,
metadata.sourceAuthority,
metadata.fileExtension,
metadata.hasComments ? 1 : 0,
metadata.hasDocumentation ? 1 : 0,
metadata.processedContent || null,
metadata.rawContent || null
);
log.debug('Content metadata stored', {
chunkId: chunkId.substring(0, 16) + '...',
contentType: metadata.contentType,
language: metadata.language,
domainCount: metadata.domainTags.length
});
} catch (error: any) {
log.error('Failed to store content metadata', error, { chunkId });
throw new StorageError(
`Failed to store content metadata: ${error.message}`,
error
);
}
}
/**
* Get content metadata for a chunk
* @param chunkId Chunk identifier
* @returns Content metadata or null if not found
*/
async getContentMetadata(chunkId: string): Promise<ContentMetadata | null> {
try {
const stmt = this.db.prepare(`
SELECT content_type, language, domain_tags, quality_score,
source_authority, file_extension, has_comments,
has_documentation, processed_content, raw_content
FROM content_metadata
WHERE chunk_id = ?
`);
const row = stmt.get(chunkId) as any;
if (!row) {
return null;
}
return {
contentType: row.content_type,
language: row.language,
domainTags: JSON.parse(row.domain_tags || '[]'),
qualityScore: row.quality_score,
sourceAuthority: row.source_authority,
fileExtension: row.file_extension,
hasComments: row.has_comments === 1,
hasDocumentation: row.has_documentation === 1,
processedContent: row.processed_content,
rawContent: row.raw_content
};
} catch (error: any) {
log.error('Failed to get content metadata', error, { chunkId });
return null;
}
}
/**
* Enhanced search with content metadata support
* @param queryEmbedding Query embedding vector
* @param limit Maximum results to return
* @param minScore Minimum similarity score
* @returns Document chunks with enhanced metadata
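*
* @example
* // Illustrative: same calling convention as searchSimilar; results carry
* // contentMetadata only when a matching content_metadata row exists:
* const hits = await repo.searchSimilarWithMetadata(queryVec, 10, 0.2);
* const code = hits.filter(h => h.contentMetadata?.contentType === 'code');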
*/
async searchSimilarWithMetadata(queryEmbedding: number[], limit: number = 10, minScore: number = 0.0): Promise<DocumentChunkOptimized[]> {
const timer = log.time('enhanced-vector-search');
log.debug('Starting enhanced vector search with metadata', {
queryEmbeddingSize: queryEmbedding.length,
limit,
minScore
});
try {
return await new Promise((resolve, reject) => {
setImmediate(() => {
try {
const queryVector = new Float32Array(queryEmbedding);
// Join with content metadata for enhanced results
const stmt = this.db.prepare(`
SELECT
vc.chunk_id, vc.file_path, vc.chunk_index, vc.content,
vc.chunk_offset, vc.token_count, vc.created_at, vc.distance,
cm.content_type, cm.language, cm.domain_tags, cm.quality_score,
cm.source_authority, cm.file_extension, cm.has_comments,
cm.has_documentation, cm.processed_content, cm.raw_content
FROM vec_chunks vc
LEFT JOIN content_metadata cm ON vc.chunk_id = cm.chunk_id
WHERE vc.embedding MATCH ?
AND k = ?
ORDER BY vc.distance
`);
const allRows = stmt.all(queryVector, limit) as any[];
// As in searchSimilar: convert sqlite-vec's distance (smaller = more
// similar) into a similarity before applying minScore (assumes cosine
// distance on the embedding column)
const rows = allRows
.map(row => ({ ...row, score: 1 - (row.distance ?? 0) }))
.filter(row => row.score >= minScore);
timer();
log.info('Enhanced vector search completed', {
totalResults: rows.length,
topScore: rows[0]?.score ?? 0,
withMetadata: rows.filter(r => r.content_type).length
});
const results = rows.map(row => {
const result: DocumentChunkOptimized = {
id: row.chunk_id,
filePath: row.file_path,
chunkIndex: row.chunk_index,
content: row.content,
score: row.score,
metadata: {
fileSize: 0,
lastModified: new Date(row.created_at),
chunkOffset: row.chunk_offset,
tokenCount: row.token_count
}
};
// Add enhanced content metadata if available
if (row.content_type) {
result.contentMetadata = {
contentType: row.content_type,
language: row.language,
domainTags: JSON.parse(row.domain_tags || '[]'),
qualityScore: row.quality_score,
sourceAuthority: row.source_authority,
fileExtension: row.file_extension,
hasComments: row.has_comments === 1,
hasDocumentation: row.has_documentation === 1,
processedContent: row.processed_content,
rawContent: row.raw_content
};
}
return result;
});
resolve(results);
} catch (error: any) {
log.error('Enhanced vector search failed', error);
reject(new StorageError(
`Enhanced vector search failed: ${error.message}`,
error
));
}
});
});
} catch (error: any) {
// Already logged and wrapped as a StorageError in the inner catch
throw error;
}
}
/**
* Clear all vector data from index
*/
async clear(): Promise<void> {
try {
// Clear all three tables atomically so the index is never left half-cleared
this.db.transaction(() => {
this.db.exec('DELETE FROM vec_chunks');
this.db.exec('DELETE FROM documents');
this.db.exec('DELETE FROM content_metadata');
})();
} catch (error: any) {
throw new StorageError(
`Failed to clear vector index: ${error.message}`,
error
);
}
}
}
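// End-to-end wiring sketch (illustrative; assumes the sqlite-vec npm package,
// whose load(db) registers the extension on a better-sqlite3 connection, plus
// a hypothetical embed() that returns vectors of the indexed dimension):
//
//   import Database from 'better-sqlite3';
//   import * as sqliteVec from 'sqlite-vec';
//
//   const db = new Database('./index.db');
//   sqliteVec.load(db);
//   const repo = new VectorRepository(db, './index.db');
//   await repo.storeChunks(chunks);
//   const hits = await repo.searchSimilar(await embed('how to retry'), 5, 0.3);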