// VectorStore implementation with LanceDB integration
import { type Connection, Index, type Table, connect } from '@lancedb/lancedb'
// ============================================
// Constants
// ============================================
/**
* Standard deviation multiplier for detecting group boundaries.
* A gap is considered a "boundary" if it exceeds mean + k*std.
* A value of 1.5 means gaps more than 1.5 standard deviations above the mean are treated as boundaries.
*/
const GROUPING_BOUNDARY_STD_MULTIPLIER = 1.5
/** Multiplier for candidate count in hybrid search (to allow reranking) */
const HYBRID_SEARCH_CANDIDATE_MULTIPLIER = 2
/** FTS index name (bump version when changing tokenizer settings) */
const FTS_INDEX_NAME = 'fts_index_v2'
/** Threshold for cleaning up old index versions (1 minute) */
const FTS_CLEANUP_THRESHOLD_MS = 60 * 1000
// ============================================
// Type Definitions
// ============================================
/**
* Grouping mode for quality filtering
* - 'similar': Only return the most similar group (stops at first distance jump)
* - 'related': Include related groups (stops at second distance jump)
*/
export type GroupingMode = 'similar' | 'related'
/**
* VectorStore configuration
*/
export interface VectorStoreConfig {
/** LanceDB database path */
dbPath: string
/** Table name */
tableName: string
/** Maximum distance threshold for filtering results (optional) */
maxDistance?: number
/** Grouping mode for quality filtering (optional) */
grouping?: GroupingMode
/** Weight of the BM25 keyword boost in hybrid search (0.0 = vector only, 1.0 = maximum keyword boost; default 0.6) */
hybridWeight?: number
}
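/*
* Example configuration (illustrative values only; the path and table name are
* placeholders, not defaults used elsewhere in this file):
*
*   const config: VectorStoreConfig = {
*     dbPath: './data/lancedb',
*     tableName: 'chunks',
*     maxDistance: 0.8,    // drop candidates farther than this
*     grouping: 'similar', // keep only the closest group
*     hybridWeight: 0.6,   // default keyword-boost weight
*   }
*/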
/**
* Document metadata
*/
export interface DocumentMetadata {
/** File name */
fileName: string
/** File size in bytes */
fileSize: number
/** File type (extension) */
fileType: string
}
/**
* Vector chunk
*/
export interface VectorChunk {
/** Chunk ID (UUID) */
id: string
/** File path (absolute) */
filePath: string
/** Chunk index (zero-based) */
chunkIndex: number
/** Chunk text */
text: string
/** Embedding vector (dimension depends on model) */
vector: number[]
/** Metadata */
metadata: DocumentMetadata
/** Ingestion timestamp (ISO 8601 format) */
timestamp: string
}
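/*
* Example chunk (illustrative): this is the record shape handed to LanceDB; the
* table schema is inferred from the first inserted batch. The vector shown here
* is truncated for brevity and must match the embedding model's dimension.
*
*   const chunk: VectorChunk = {
*     id: '3f2c9d1e-...',
*     filePath: '/docs/design.md',
*     chunkIndex: 0,
*     text: 'Overview of the search architecture ...',
*     vector: [0.01, -0.02, 0.03],
*     metadata: { fileName: 'design.md', fileSize: 2048, fileType: 'md' },
*     timestamp: new Date().toISOString(),
*   }
*/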
/**
* Search result
*/
export interface SearchResult {
/** File path */
filePath: string
/** Chunk index */
chunkIndex: number
/** Chunk text */
text: string
/** Distance score from the dot-product search (lower = more similar; for normalized embeddings 0 = identical, 1 = orthogonal, 2 = opposite) */
score: number
/** Metadata */
metadata: DocumentMetadata
}
/**
* Raw result from LanceDB query (internal type)
*/
interface LanceDBRawResult {
filePath: string
chunkIndex: number
text: string
metadata: DocumentMetadata
_distance?: number
_score?: number
}
// ============================================
// Type Guards
// ============================================
/**
* Type guard for DocumentMetadata
*/
function isDocumentMetadata(value: unknown): value is DocumentMetadata {
if (typeof value !== 'object' || value === null) return false
const obj = value as Record<string, unknown>
return (
typeof obj['fileName'] === 'string' &&
typeof obj['fileSize'] === 'number' &&
typeof obj['fileType'] === 'string'
)
}
/**
* Type guard for LanceDB raw search result
*/
function isLanceDBRawResult(value: unknown): value is LanceDBRawResult {
if (typeof value !== 'object' || value === null) return false
const obj = value as Record<string, unknown>
return (
typeof obj['filePath'] === 'string' &&
typeof obj['chunkIndex'] === 'number' &&
typeof obj['text'] === 'string' &&
isDocumentMetadata(obj['metadata'])
)
}
/**
* Convert LanceDB raw result to SearchResult with type validation
* @throws DatabaseError if the result is invalid
*/
function toSearchResult(raw: unknown): SearchResult {
if (!isLanceDBRawResult(raw)) {
throw new DatabaseError('Invalid search result format from LanceDB')
}
return {
filePath: raw.filePath,
chunkIndex: raw.chunkIndex,
text: raw.text,
score: raw._distance ?? raw._score ?? 0,
metadata: raw.metadata,
}
}
// ============================================
// Error Classes
// ============================================
/**
* Database error
*/
export class DatabaseError extends Error {
constructor(
message: string,
public override readonly cause?: Error
) {
super(message)
this.name = 'DatabaseError'
}
}
// ============================================
// VectorStore Class
// ============================================
/**
* Vector storage class using LanceDB
*
* Responsibilities:
* - LanceDB operations (insert, delete, search)
* - Transaction handling (atomicity of delete→insert)
* - Metadata management
*/
export class VectorStore {
private db: Connection | null = null
private table: Table | null = null
private readonly config: VectorStoreConfig
private ftsEnabled = false
constructor(config: VectorStoreConfig) {
this.config = config
}
/**
* Initialize LanceDB and create table
*/
async initialize(): Promise<void> {
try {
// Connect to LanceDB
this.db = await connect(this.config.dbPath)
// Check table existence and create if needed
const tableNames = await this.db.tableNames()
if (tableNames.includes(this.config.tableName)) {
// Open existing table
this.table = await this.db.openTable(this.config.tableName)
console.error(`VectorStore: Opened existing table "${this.config.tableName}"`)
// Ensure FTS index exists (migration for existing databases)
await this.ensureFtsIndex()
} else {
// Defer table creation; the schema is inferred from the first inserted batch
console.error(
`VectorStore: Table "${this.config.tableName}" will be created on first data insertion`
)
}
console.error(`VectorStore initialized: ${this.config.dbPath}`)
} catch (error) {
throw new DatabaseError('Failed to initialize VectorStore', error as Error)
}
}
/**
* Delete all chunks for specified file path
*
* @param filePath - File path (absolute)
*/
async deleteChunks(filePath: string): Promise<void> {
if (!this.table) {
// The table doesn't exist yet, so there is nothing to delete; return normally
console.error('VectorStore: Skipping deletion as table does not exist')
return
}
try {
// Use LanceDB delete API to remove records matching filePath
// Escape single quotes to prevent SQL injection
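// e.g. "/docs/it's-notes.md" becomes '/docs/it''s-notes.md' in the filter below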
const escapedFilePath = filePath.replace(/'/g, "''")
// LanceDB's delete() does not throw when no rows match, so it can be called unconditionally
// Note: field names are case-sensitive; backtick-quote camelCase fields in the filter
await this.table.delete(`\`filePath\` = '${escapedFilePath}'`)
console.error(`VectorStore: Deleted chunks for file "${filePath}"`)
// Rebuild FTS index after deleting data
await this.rebuildFtsIndex()
} catch (error) {
// Log a warning for visibility
console.warn(`VectorStore: Error occurred while deleting file "${filePath}":`, error)
// Treat "not found"-style errors as a no-op (nothing to delete); rethrow anything else
const errorMessage = (error as Error).message.toLowerCase()
if (
!errorMessage.includes('not found') &&
!errorMessage.includes('does not exist') &&
!errorMessage.includes('no matching')
) {
throw new DatabaseError(`Failed to delete chunks for file: ${filePath}`, error as Error)
}
}
}
/**
* Batch insert vector chunks
*
* @param chunks - Array of vector chunks
*/
async insertChunks(chunks: VectorChunk[]): Promise<void> {
if (chunks.length === 0) {
return
}
try {
if (!this.table) {
// Create table on first insertion
if (!this.db) {
throw new DatabaseError('VectorStore is not initialized. Call initialize() first.')
}
// LanceDB's createTable API accepts data as Record<string, unknown>[]
const records = chunks.map((chunk) => chunk as unknown as Record<string, unknown>)
this.table = await this.db.createTable(this.config.tableName, records)
console.error(`VectorStore: Created table "${this.config.tableName}"`)
// Create FTS index for hybrid search
await this.ensureFtsIndex()
} else {
// Add data to existing table
const records = chunks.map((chunk) => chunk as unknown as Record<string, unknown>)
await this.table.add(records)
// Rebuild FTS index after adding new data
await this.rebuildFtsIndex()
}
console.error(`VectorStore: Inserted ${chunks.length} chunks`)
} catch (error) {
throw new DatabaseError('Failed to insert chunks', error as Error)
}
}
/**
* Ensure FTS index exists for hybrid search
* Creates an ngram-based index if it doesn't exist and drops superseded versions.
* Index-creation errors propagate to the caller (fail-fast); initialize() and
* insertChunks() wrap them in DatabaseError.
*/
private async ensureFtsIndex(): Promise<void> {
if (!this.table) {
return
}
// Check existing indices
const indices = await this.table.listIndices()
const existingFtsIndices = indices.filter((idx) => idx.indexType === 'FTS')
const hasExpectedIndex = existingFtsIndices.some((idx) => idx.name === FTS_INDEX_NAME)
if (hasExpectedIndex) {
this.ftsEnabled = true
return
}
// Create new FTS index with ngram tokenizer for multilingual support
// - min=2: Capture Japanese bi-grams (e.g., "東京", "設計")
// - max=3: Balance between precision and index size
// - prefixOnly=false: Generate ngrams from all positions for proper CJK support
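// e.g. (illustrative) "東京タワー" yields bi-grams 東京 / 京タ / タワ / ワー and tri-grams 東京タ / 京タワ / タワー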
await this.table.createIndex('text', {
config: Index.fts({
baseTokenizer: 'ngram',
ngramMinLength: 2,
ngramMaxLength: 3,
prefixOnly: false,
stem: false,
}),
name: FTS_INDEX_NAME,
})
this.ftsEnabled = true
console.error(`VectorStore: FTS index "${FTS_INDEX_NAME}" created successfully`)
// Drop old FTS indices
for (const idx of existingFtsIndices) {
if (idx.name !== FTS_INDEX_NAME) {
await this.table.dropIndex(idx.name)
console.error(`VectorStore: Dropped old FTS index "${idx.name}"`)
}
}
}
/**
* Rebuild FTS index after data changes (insert/delete)
* LanceDB OSS requires explicit optimize() call to update FTS index
* Also cleans up old index versions to prevent storage bloat
*/
private async rebuildFtsIndex(): Promise<void> {
if (!this.table || !this.ftsEnabled) {
return
}
// Optimize table and clean up old versions
const cleanupThreshold = new Date(Date.now() - FTS_CLEANUP_THRESHOLD_MS)
await this.table.optimize({ cleanupOlderThan: cleanupThreshold })
}
/**
* Apply grouping algorithm to filter results by detecting group boundaries.
*
* Uses statistical threshold (mean + k*std) to identify significant gaps (group boundaries).
* - 'similar': Returns only the first group (cuts at first boundary)
* - 'related': Returns up to 2 groups (cuts at second boundary)
*
* @param results - Search results sorted by distance (ascending)
* @param mode - Grouping mode ('similar' = 1 group, 'related' = 2 groups)
* @returns Filtered results
*/
private applyGrouping(results: SearchResult[], mode: GroupingMode): SearchResult[] {
if (results.length <= 1) return results
// Calculate gaps between consecutive results with their indices
const gaps: { index: number; gap: number }[] = []
for (let i = 0; i < results.length - 1; i++) {
const current = results[i]
const next = results[i + 1]
if (current !== undefined && next !== undefined) {
gaps.push({ index: i + 1, gap: next.score - current.score })
}
}
if (gaps.length === 0) return results
// Calculate statistical threshold to identify significant gaps (group boundaries)
const gapValues = gaps.map((g) => g.gap)
const mean = gapValues.reduce((a, b) => a + b, 0) / gapValues.length
const variance = gapValues.reduce((a, b) => a + (b - mean) ** 2, 0) / gapValues.length
const std = Math.sqrt(variance)
const threshold = mean + GROUPING_BOUNDARY_STD_MULTIPLIER * std
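// Worked example (illustrative): distances [0.10, 0.11, 0.12, 0.13, 0.14, 0.60]
// give gaps [0.01, 0.01, 0.01, 0.01, 0.46]; mean = 0.10, std = 0.18, so the
// threshold is 0.10 + 1.5 * 0.18 = 0.37 and only the 0.46 gap is a boundary
// (index 5), meaning 'similar' keeps the first five results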
// Find all significant gaps (group boundaries)
const boundaries = gaps.filter((g) => g.gap > threshold).map((g) => g.index)
// If no boundaries found, return all results
if (boundaries.length === 0) return results
// Determine how many groups to include based on mode
// 'similar': 1 group (cut at first boundary)
// 'related': 2 groups (cut at second boundary, or return all if only 1 boundary)
const groupsToInclude = mode === 'similar' ? 1 : 2
const boundaryIndex = groupsToInclude - 1
// If we don't have enough boundaries, return all results for 'related' mode
if (boundaryIndex >= boundaries.length) {
return mode === 'related' ? results : results.slice(0, boundaries[0])
}
// Cut at the appropriate boundary
return results.slice(0, boundaries[boundaryIndex])
}
/**
* Execute vector search with quality filtering
* Architecture: Semantic search → Filter (maxDistance, grouping) → Keyword boost
*
* This "prefetch then rerank" approach ensures:
* - maxDistance and grouping work on meaningful vector distances
* - Keyword matching acts as a boost, not a replacement for semantic similarity
*
* @param queryVector - Query vector (dimension depends on model)
* @param queryText - Optional query text for keyword boost (BM25)
* @param limit - Number of results to retrieve (default 10)
* @returns Array of search results (sorted by distance ascending, filtered by quality settings)
*/
async search(queryVector: number[], queryText?: string, limit = 10): Promise<SearchResult[]> {
if (!this.table) {
console.error('VectorStore: Returning empty results as table does not exist')
return []
}
if (limit < 1 || limit > 20) {
throw new DatabaseError(`Invalid limit: expected 1-20, got ${limit}`)
}
try {
// Step 1: Semantic (vector) search - always the primary search
const candidateLimit = limit * HYBRID_SEARCH_CANDIDATE_MULTIPLIER
let query = this.table.vectorSearch(queryVector).distanceType('dot').limit(candidateLimit)
// Apply distance threshold at query level
if (this.config.maxDistance !== undefined) {
query = query.distanceRange(undefined, this.config.maxDistance)
}
const vectorResults = await query.toArray()
// Convert to SearchResult format with type validation
let results: SearchResult[] = vectorResults.map((result) => toSearchResult(result))
// Step 2: Apply grouping filter on vector distances (before keyword boost)
// Grouping is meaningful only on semantic distances, not after keyword boost
if (this.config.grouping && results.length > 1) {
results = this.applyGrouping(results, this.config.grouping)
}
// Step 3: Apply keyword boost if enabled
const hybridWeight = this.config.hybridWeight ?? 0.6
if (this.ftsEnabled && queryText && queryText.trim().length > 0 && hybridWeight > 0) {
try {
// Get unique filePaths from vector results to filter FTS search
const uniqueFilePaths = [...new Set(results.map((r) => r.filePath))]
// Build WHERE clause with IN for targeted FTS search
// Use backticks for column name (required for camelCase in LanceDB)
const escapedPaths = uniqueFilePaths.map((p) => `'${p.replace(/'/g, "''")}'`)
const whereClause = `\`filePath\` IN (${escapedPaths.join(', ')})`
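// e.g. (illustrative) `filePath` IN ('/docs/a.md', '/docs/b.md')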
const ftsResults = await this.table
.search(queryText, 'fts', 'text')
.where(whereClause)
.select(['filePath', 'chunkIndex', 'text', 'metadata', '_score'])
.limit(results.length * 2) // Enough to cover all vector results
.toArray()
results = this.applyKeywordBoost(results, ftsResults, hybridWeight)
} catch (ftsError) {
console.error('VectorStore: FTS search failed, using vector-only results:', ftsError)
this.ftsEnabled = false
}
}
// Return top results after all filtering and boosting
return results.slice(0, limit)
} catch (error) {
throw new DatabaseError('Failed to search vectors', error as Error)
}
}
/**
* Apply keyword boost to rerank vector search results
* Uses multiplicative formula: final_distance = distance / (1 + keyword_normalized * weight)
*
* This proportional boost ensures:
* - Keyword matches improve ranking without dominating semantic similarity
* - Documents without keyword matches keep their original vector distance
* - Higher weight = stronger influence of keyword matching
*
* @param vectorResults - Results from vector search (already filtered by maxDistance/grouping)
* @param ftsResults - Raw FTS results with BM25 scores
* @param weight - Boost weight (0-1, from hybridWeight config)
*/
private applyKeywordBoost(
vectorResults: SearchResult[],
ftsResults: Record<string, unknown>[],
weight: number
): SearchResult[] {
// Build FTS score map with normalized scores (0-1)
let maxBm25Score = 0
for (const result of ftsResults) {
if (!result) continue
const score = (result['_score'] as number) ?? 0
if (score > maxBm25Score) maxBm25Score = score
}
const ftsScoreMap = new Map<string, number>()
for (const result of ftsResults) {
if (!result) continue
const key = `${result['filePath']}:${result['chunkIndex']}`
const rawScore = (result['_score'] as number) ?? 0
const normalized = maxBm25Score > 0 ? rawScore / maxBm25Score : 0
ftsScoreMap.set(key, normalized)
}
// Apply multiplicative boost to vector results
const boostedResults = vectorResults.map((result) => {
const key = `${result.filePath}:${result.chunkIndex}`
const keywordScore = ftsScoreMap.get(key) ?? 0
// Multiplicative boost: distance / (1 + keyword * weight)
// - If keyword matches (score=1) and weight=1: distance halved
// - If no keyword match (score=0): distance unchanged
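// e.g. with weight 0.6: distance 0.40 with keyword score 1.0 → 0.40 / 1.6 = 0.25,
// with keyword score 0.5 → 0.40 / 1.3 ≈ 0.31, and with no match it stays at 0.40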
const boostedDistance = result.score / (1 + keywordScore * weight)
return {
...result,
score: boostedDistance,
}
})
// Re-sort by boosted distance (ascending = better)
return boostedResults.sort((a, b) => a.score - b.score)
}
/**
* Get list of ingested files
*
* @returns Array of file information
*/
async listFiles(): Promise<{ filePath: string; chunkCount: number; timestamp: string }[]> {
if (!this.table) {
return [] // Return empty array if table doesn't exist
}
try {
// Retrieve all records
const allRecords = await this.table.query().toArray()
// Group by file path
const fileMap = new Map<string, { chunkCount: number; timestamp: string }>()
for (const record of allRecords) {
const filePath = record.filePath as string
const timestamp = record.timestamp as string
if (fileMap.has(filePath)) {
const fileInfo = fileMap.get(filePath)
if (fileInfo) {
fileInfo.chunkCount += 1
// Keep most recent timestamp
if (timestamp > fileInfo.timestamp) {
fileInfo.timestamp = timestamp
}
}
} else {
fileMap.set(filePath, { chunkCount: 1, timestamp })
}
}
// Convert Map to array of objects
return Array.from(fileMap.entries()).map(([filePath, info]) => ({
filePath,
chunkCount: info.chunkCount,
timestamp: info.timestamp,
}))
} catch (error) {
throw new DatabaseError('Failed to list files', error as Error)
}
}
/**
* Get system status
*
* @returns System status information
*/
async getStatus(): Promise<{
documentCount: number
chunkCount: number
memoryUsage: number
uptime: number
ftsIndexEnabled: boolean
searchMode: 'hybrid' | 'vector-only'
}> {
if (!this.table) {
return {
documentCount: 0,
chunkCount: 0,
memoryUsage: 0,
uptime: process.uptime(),
ftsIndexEnabled: false,
searchMode: 'vector-only',
}
}
try {
// Retrieve all records
const allRecords = await this.table.query().toArray()
const chunkCount = allRecords.length
// Count unique file paths
const uniqueFilePaths = new Set(allRecords.map((record) => record.filePath as string))
const documentCount = uniqueFilePaths.size
// Get memory usage (in MB)
const memoryUsage = process.memoryUsage().heapUsed / 1024 / 1024
// Get uptime (in seconds)
const uptime = process.uptime()
return {
documentCount,
chunkCount,
memoryUsage,
uptime,
ftsIndexEnabled: this.ftsEnabled,
searchMode:
this.ftsEnabled && (this.config.hybridWeight ?? 0.6) > 0 ? 'hybrid' : 'vector-only',
}
} catch (error) {
throw new DatabaseError('Failed to get status', error as Error)
}
}
}
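/*
* Usage sketch (illustrative; the path, chunk construction, and query vector are
* placeholders — embeddings must come from the same model and dimension that
* produced the stored vectors):
*
*   const store = new VectorStore({ dbPath: './data/lancedb', tableName: 'chunks' })
*   await store.initialize()
*
*   // Re-ingest a file: delete old chunks, then insert the new ones
*   await store.deleteChunks('/docs/design.md')
*   await store.insertChunks(chunks)
*
*   // Hybrid search: query vector plus the raw query text for the keyword boost
*   const results = await store.search(queryVector, 'vector database design', 5)
*
*   const status = await store.getStatus()
*/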