VectorRepository.ts (17.8 kB)
import Database from 'better-sqlite3';
import { DocumentChunk, DocumentChunkOptimized, VectorIndexStatistics, StorageError, ContentMetadata } from '../types/index.js';
import { log } from './Logger.js';
import fs from 'node:fs';
import path from 'node:path';

// Database row interfaces for sqlite-vec
interface VecChunkRow {
  chunk_id: string;
  file_path: string;
  chunk_index: number;
  content: string;
  chunk_offset: number;
  token_count: number;
  created_at: string;
  distance?: number;
}

interface DocumentStats {
  documents: number;
  total_chunks: number;
  total_tokens: number;
}

export class VectorRepository {
  private db: Database.Database;
  private dbPath: string;

  constructor(db: Database.Database, dbPath: string) {
    this.db = db;
    this.dbPath = dbPath;
  }

  /**
   * Store document chunks with their embeddings using sqlite-vec
   * @param chunks Array of document chunks
   * @returns Number of chunks stored
   */
  async storeChunks(chunks: DocumentChunk[]): Promise<number> {
    if (chunks.length === 0) return 0;

    const timer = log.time('store-chunks');
    log.info('Starting sqlite-vec chunk storage', {
      totalChunks: chunks.length,
      uniqueFiles: new Set(chunks.map(c => c.filePath)).size
    });

    try {
      const transaction = this.db.transaction((chunkList: DocumentChunk[]) => {
        // Group chunks by file
        const fileGroups = new Map<string, DocumentChunk[]>();
        for (const chunk of chunkList) {
          if (!fileGroups.has(chunk.filePath)) {
            fileGroups.set(chunk.filePath, []);
          }
          fileGroups.get(chunk.filePath)!.push(chunk);
        }

        // Update documents table
        const docStmt = this.db.prepare(`
          INSERT OR REPLACE INTO documents (file_path, file_name, last_modified, total_chunks, total_tokens, updated_at)
          VALUES (?, ?, ?, ?, ?, datetime('now'))
        `);

        for (const [filePath, fileChunks] of fileGroups) {
          const firstChunk = fileChunks[0];
          docStmt.run(
            filePath,
            path.basename(filePath),
            firstChunk.metadata.lastModified.toISOString(),
            fileChunks.length,
            fileChunks.reduce((sum, c) => sum + c.metadata.tokenCount, 0)
          );
        }

        // Store chunks in vec0 table using sqlite-vec
        // Use CAST to force SQLite INTEGER conversion from JavaScript numbers
        const vecStmt = this.db.prepare(`
          INSERT OR REPLACE INTO vec_chunks (embedding, chunk_id, file_path, chunk_index, content, chunk_offset, token_count, created_at)
          VALUES (?, ?, ?, CAST(? AS INTEGER), ?, CAST(? AS INTEGER), CAST(? AS INTEGER), ?)
        `);

        for (const chunk of chunkList) {
          // sqlite-vec expects Float32Array for embeddings
          const embedding = new Float32Array(chunk.embedding);
          vecStmt.run(
            embedding,
            chunk.id,
            chunk.filePath,
            chunk.chunkIndex,
            chunk.content,
            chunk.metadata.chunkOffset,
            chunk.metadata.tokenCount,
            new Date().toISOString()
          );
        }
      });

      transaction(chunks);

      timer();
      log.info('sqlite-vec chunk storage completed', {
        totalStored: chunks.length
      });

      return chunks.length;
    } catch (error: any) {
      log.error('sqlite-vec chunk storage failed', error);
      throw new StorageError(
        `Failed to store chunks: ${error.message}`,
        error
      );
    }
  }

  /**
   * Search for similar chunks using sqlite-vec native vector similarity search (ASYNC)
   * @param queryEmbedding Query embedding vector
   * @param limit Maximum results to return
   * @param minScore Minimum similarity score (0-1)
   * @returns Sorted array of matching chunks with scores
   */
  async searchSimilar(queryEmbedding: number[], limit: number = 10, minScore: number = 0.0): Promise<DocumentChunk[]> {
    const timer = log.time('vector-search');
    log.debug('Starting sqlite-vec native vector search', {
      queryEmbeddingSize: queryEmbedding.length,
      limit,
      minScore
    });

    try {
      return await new Promise((resolve, reject) => {
        setImmediate(() => {
          try {
            // Convert query embedding to Float32Array for sqlite-vec
            const queryVector = new Float32Array(queryEmbedding);

            // Use sqlite-vec's MATCH syntax with k parameter for KNN search
            const stmt = this.db.prepare(`
              SELECT chunk_id, file_path, chunk_index, content, chunk_offset, token_count, created_at, distance
              FROM vec_chunks
              WHERE embedding MATCH ? AND k = ?
              ORDER BY distance
            `);

            const allRows = stmt.all(
              queryVector,
              limit
            ) as VecChunkRow[];

            // Filter by minimum score after retrieval
            const rows = allRows.filter(row => (row.distance || 0) >= minScore);

            timer();
            log.info('sqlite-vec vector search completed', {
              totalResults: rows.length,
              topScore: rows[0]?.distance || 0
            });

            const results = rows.map(row => ({
              id: row.chunk_id,
              filePath: row.file_path,
              chunkIndex: row.chunk_index,
              content: row.content,
              embedding: [], // Don't return embeddings to save memory
              score: row.distance || 0,
              metadata: {
                fileSize: 0,
                lastModified: new Date(row.created_at),
                chunkOffset: row.chunk_offset,
                tokenCount: row.token_count
              }
            }));

            resolve(results);
          } catch (error: any) {
            log.error('sqlite-vec search failed', error);
            reject(new StorageError(
              `Native vector search failed: ${error.message}`,
              error
            ));
          }
        });
      });
    } catch (error: any) {
      // Error already logged above in inner catch
      throw error;
    }
  }

  /**
   * Get chunk by ID
   * @param chunkId Chunk identifier
   * @returns DocumentChunk or null if not found
   */
  async getChunk(chunkId: string): Promise<DocumentChunk | null> {
    try {
      const stmt = this.db.prepare(`
        SELECT chunk_id, file_path, chunk_index, content, chunk_offset, token_count, created_at
        FROM vec_chunks
        WHERE chunk_id = ?
      `);

      const row = stmt.get(chunkId) as VecChunkRow | undefined;

      if (!row) {
        return null;
      }

      return {
        id: row.chunk_id,
        filePath: row.file_path,
        chunkIndex: row.chunk_index,
        content: row.content,
        embedding: [],
        score: 0,
        metadata: {
          fileSize: 0,
          lastModified: new Date(row.created_at),
          chunkOffset: row.chunk_offset,
          tokenCount: row.token_count
        }
      };
    } catch (error: any) {
      throw new StorageError(
        `Failed to get chunk: ${error.message}`,
        error
      );
    }
  }

  /**
   * Get all chunks for a specific file (ASYNC)
   * @param filePath File path
   * @returns Array of document chunks
   */
  async getFileChunks(filePath: string): Promise<DocumentChunk[]> {
    try {
      // Use setImmediate to avoid blocking main thread (2025 best practice)
      return await new Promise((resolve, reject) => {
        setImmediate(() => {
          try {
            const stmt = this.db.prepare(`
              SELECT chunk_id, file_path, chunk_index, content, chunk_offset, token_count, created_at
              FROM vec_chunks
              WHERE file_path = ?
              ORDER BY chunk_index
            `);

            const rows = stmt.all(filePath) as VecChunkRow[];

            const results = rows.map(row => ({
              id: row.chunk_id,
              filePath: row.file_path,
              chunkIndex: row.chunk_index,
              content: row.content,
              embedding: [],
              score: 0,
              metadata: {
                fileSize: 0,
                lastModified: new Date(row.created_at),
                chunkOffset: row.chunk_offset,
                tokenCount: row.token_count
              }
            }));

            resolve(results);
          } catch (error: any) {
            reject(new StorageError(
              `Failed to get file chunks: ${error.message}`,
              error
            ));
          }
        });
      });
    } catch (error: any) {
      throw new StorageError(
        `Failed to get file chunks: ${error.message}`,
        error
      );
    }
  }

  /**
   * Delete chunks for a specific file
   * @param filePath File path
   * @returns Number of chunks deleted
   */
  async deleteFile(filePath: string): Promise<number> {
    try {
      // Delete from vec_chunks table
      const vecStmt = this.db.prepare('DELETE FROM vec_chunks WHERE file_path = ?');
      const vecResult = vecStmt.run(filePath);

      // Delete from documents table
      const docStmt = this.db.prepare('DELETE FROM documents WHERE file_path = ?');
      docStmt.run(filePath);

      return vecResult.changes;
    } catch (error: any) {
      throw new StorageError(
        `Failed to delete file: ${error.message}`,
        error
      );
    }
  }

  /**
   * Get index statistics (ASYNC)
   * @returns Statistics about the index
   */
  async getStatistics(): Promise<VectorIndexStatistics> {
    try {
      // Use setImmediate to avoid blocking main thread (2025 best practice)
      return await new Promise((resolve, reject) => {
        setImmediate(async () => {
          try {
            const docStats = this.db.prepare(`
              SELECT
                COUNT(*) as documents,
                SUM(total_chunks) as total_chunks,
                SUM(total_tokens) as total_tokens
              FROM documents
            `).get() as DocumentStats | undefined;

            let dbSize = 0;
            try {
              const stats = await fs.promises.stat(this.dbPath);
              dbSize = stats.size;
            } catch { }

            const result = {
              totalChunks: docStats?.total_chunks || 0,
              totalFiles: docStats?.documents || 0,
              totalTokens: docStats?.total_tokens || 0,
              embeddingModel: 'universal-sentence-encoder',
              lastUpdated: new Date(),
              dbSize
            };

            resolve(result);
          } catch (error: any) {
            reject(new StorageError(
              `Failed to get statistics: ${error.message}`,
              error
            ));
          }
        });
      });
    } catch (error: any) {
      throw new StorageError(
        `Failed to get statistics: ${error.message}`,
        error
      );
    }
  }

  /**
   * Store content metadata for enhanced document classification
   * @param chunkId Chunk identifier
   * @param metadata Enhanced content metadata
   */
  async storeContentMetadata(chunkId: string, metadata: ContentMetadata): Promise<void> {
    try {
      const stmt = this.db.prepare(`
        INSERT OR REPLACE INTO content_metadata
        (chunk_id, content_type, language, domain_tags, quality_score, source_authority,
         file_extension, has_comments, has_documentation, processed_content, raw_content, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'))
      `);

      stmt.run(
        chunkId,
        metadata.contentType,
        metadata.language,
        JSON.stringify(metadata.domainTags),
        metadata.qualityScore,
        metadata.sourceAuthority,
        metadata.fileExtension,
        metadata.hasComments ? 1 : 0,
        metadata.hasDocumentation ? 1 : 0,
        metadata.processedContent || null,
        metadata.rawContent || null
      );

      log.debug('Content metadata stored', {
        chunkId: chunkId.substring(0, 16) + '...',
        contentType: metadata.contentType,
        language: metadata.language,
        domainCount: metadata.domainTags.length
      });
    } catch (error: any) {
      log.error('Failed to store content metadata', error, { chunkId });
      throw new StorageError(
        `Failed to store content metadata: ${error.message}`,
        error
      );
    }
  }

  /**
   * Get content metadata for a chunk
   * @param chunkId Chunk identifier
   * @returns Content metadata or null if not found
   */
  async getContentMetadata(chunkId: string): Promise<ContentMetadata | null> {
    try {
      const stmt = this.db.prepare(`
        SELECT content_type, language, domain_tags, quality_score, source_authority,
               file_extension, has_comments, has_documentation, processed_content, raw_content
        FROM content_metadata
        WHERE chunk_id = ?
      `);

      const row = stmt.get(chunkId) as any;

      if (!row) {
        return null;
      }

      return {
        contentType: row.content_type,
        language: row.language,
        domainTags: JSON.parse(row.domain_tags || '[]'),
        qualityScore: row.quality_score,
        sourceAuthority: row.source_authority,
        fileExtension: row.file_extension,
        hasComments: row.has_comments === 1,
        hasDocumentation: row.has_documentation === 1,
        processedContent: row.processed_content,
        rawContent: row.raw_content
      };
    } catch (error: any) {
      log.error('Failed to get content metadata', error, { chunkId });
      return null;
    }
  }

  /**
   * Enhanced search with content metadata support
   * @param queryEmbedding Query embedding vector
   * @param limit Maximum results to return
   * @param minScore Minimum similarity score
   * @returns Document chunks with enhanced metadata
   */
  async searchSimilarWithMetadata(queryEmbedding: number[], limit: number = 10, minScore: number = 0.0): Promise<DocumentChunkOptimized[]> {
    const timer = log.time('enhanced-vector-search');
    log.debug('Starting enhanced vector search with metadata', {
      queryEmbeddingSize: queryEmbedding.length,
      limit,
      minScore
    });

    try {
      return await new Promise((resolve, reject) => {
        setImmediate(() => {
          try {
            const queryVector = new Float32Array(queryEmbedding);

            // Join with content metadata for enhanced results
            const stmt = this.db.prepare(`
              SELECT
                vc.chunk_id, vc.file_path, vc.chunk_index, vc.content,
                vc.chunk_offset, vc.token_count, vc.created_at, vc.distance,
                cm.content_type, cm.language, cm.domain_tags, cm.quality_score,
                cm.source_authority, cm.file_extension, cm.has_comments,
                cm.has_documentation, cm.processed_content, cm.raw_content
              FROM vec_chunks vc
              LEFT JOIN content_metadata cm ON vc.chunk_id = cm.chunk_id
              WHERE vc.embedding MATCH ? AND k = ?
              ORDER BY vc.distance
            `);

            const allRows = stmt.all(queryVector, limit) as any[];
            const rows = allRows.filter(row => (row.distance || 0) >= minScore);

            timer();
            log.info('Enhanced vector search completed', {
              totalResults: rows.length,
              topScore: rows[0]?.distance || 0,
              withMetadata: rows.filter(r => r.content_type).length
            });

            const results = rows.map(row => {
              const result: DocumentChunkOptimized = {
                id: row.chunk_id,
                filePath: row.file_path,
                chunkIndex: row.chunk_index,
                content: row.content,
                score: row.distance || 0,
                metadata: {
                  fileSize: 0,
                  lastModified: new Date(row.created_at),
                  chunkOffset: row.chunk_offset,
                  tokenCount: row.token_count
                }
              };

              // Add enhanced content metadata if available
              if (row.content_type) {
                result.contentMetadata = {
                  contentType: row.content_type,
                  language: row.language,
                  domainTags: JSON.parse(row.domain_tags || '[]'),
                  qualityScore: row.quality_score,
                  sourceAuthority: row.source_authority,
                  fileExtension: row.file_extension,
                  hasComments: row.has_comments === 1,
                  hasDocumentation: row.has_documentation === 1,
                  processedContent: row.processed_content,
                  rawContent: row.raw_content
                };
              }

              return result;
            });

            resolve(results);
          } catch (error: any) {
            log.error('Enhanced vector search failed', error);
            reject(new StorageError(
              `Enhanced vector search failed: ${error.message}`,
              error
            ));
          }
        });
      });
    } catch (error: any) {
      log.error('Enhanced vector search failed', error);
      throw new StorageError(
        `Enhanced vector search failed: ${error.message}`,
        error
      );
    }
  }

  /**
   * Clear all vector data from index
   */
  async clear(): Promise<void> {
    try {
      this.db.exec('DELETE FROM vec_chunks');
      this.db.exec('DELETE FROM documents');
      this.db.exec('DELETE FROM content_metadata');
    } catch (error: any) {
      throw new StorageError(
        `Failed to clear vector index: ${error.message}`,
        error
      );
    }
  }
}
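
Usage sketch (illustrative, not from the repo): this file assumes the vec_chunks vec0 virtual table and documents table already exist, so the DDL below is hypothetical. The 512-dimension embedding column is an assumption based on the universal-sentence-encoder model named in getStatistics(), and the stub chunk, file names, and embedding values are invented for the example.

import Database from 'better-sqlite3';
import * as sqliteVec from 'sqlite-vec';
import { VectorRepository } from './VectorRepository.js';

const db = new Database('index.db');
sqliteVec.load(db); // load the sqlite-vec extension into this connection

// Hypothetical schema; the project's real setup lives elsewhere in the repo.
// Metadata and auxiliary (+) columns require a recent sqlite-vec release.
db.exec(`
  CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0(
    chunk_id text primary key,
    embedding float[512],
    file_path text,
    chunk_index integer,
    +content text,
    chunk_offset integer,
    token_count integer,
    +created_at text
  );
  CREATE TABLE IF NOT EXISTS documents (
    file_path TEXT PRIMARY KEY, file_name TEXT, last_modified TEXT,
    total_chunks INTEGER, total_tokens INTEGER, updated_at TEXT
  );
`);

const repo = new VectorRepository(db, 'index.db');

// Store one invented chunk; a real embedding would come from the model.
await repo.storeChunks([{
  id: 'chunk-1',
  filePath: '/docs/example.md',
  chunkIndex: 0,
  content: 'Example chunk content',
  embedding: Array.from({ length: 512 }, () => 0.01),
  score: 0,
  metadata: { fileSize: 0, lastModified: new Date(), chunkOffset: 0, tokenCount: 3 }
}]);

// KNN search over the stored chunks.
const hits = await repo.searchSimilar(Array.from({ length: 512 }, () => 0.01), 5);
console.log(hits.map(h => ({ id: h.id, score: h.score })));

One caveat worth noting: sqlite-vec returns distances, where lower means more similar, and the class surfaces them unchanged as score. Despite the doc comment's "similarity score (0-1)", searchSimilar's minScore is therefore compared against the raw distance, so callers wanting a similarity cutoff need to account for that.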

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PatrickRuddiman/local-search-mcp'
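
The same lookup can be made from TypeScript; this sketch assumes only a global fetch (Node 18+) and, since the response shape is not documented here, simply logs the parsed body:

// Same request as the curl example above.
const res = await fetch('https://glama.ai/api/mcp/v1/servers/PatrickRuddiman/local-search-mcp');
if (!res.ok) throw new Error(`Request failed: HTTP ${res.status}`);
console.log(await res.json());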

If you have feedback or need assistance with the MCP directory API, please join our Discord server.