import sqlite3 from 'sqlite3';
import { open, Database } from 'sqlite';
import * as lancedb from '@lancedb/lancedb';
import { PhraseQuery, MatchQuery, BooleanQuery, Occur } from '@lancedb/lancedb';
import { Field, FixedSizeList, Float32, Schema, Utf8, Int32 } from 'apache-arrow';
import QuickLRU from 'quick-lru';
import { mkdir } from 'fs/promises';
import { dirname } from 'path';
import {
Collection,
CollectionWithDocuments,
DocumentMetadata,
ProcessedDocument,
SearchResult,
SearchOptions,
StorageProvider,
} from '../types.js';
import { EmbeddingsProvider } from '../embeddings/types.js';
import { logger } from '../util/logger.js';
import { escapeFilterValue } from '../util/security.js';
// The LanceDB client does not export its connection/table types directly,
// so derive them from the connect()/openTable() return types.
type LanceDBConnection = Awaited<ReturnType<typeof lancedb.connect>>;
type LanceDBTable = Awaited<ReturnType<LanceDBConnection['openTable']>>;
/**
 * One chunk row as stored in the LanceDB 'chunks' table.
 * Must stay in sync with the Arrow schema built in DocumentStore.initialize().
 */
type LanceDbRow = {
  url: string; // Parent document URL (joins to the SQLite `documents` table)
  title: string;
  content: string; // Chunk text; target of the FTS index
  path: string; // Location of the chunk within the document
  startLine: number;
  endLine: number;
  vector: number[]; // Embedding; length = EmbeddingsProvider.dimensions
  type: 'overview' | 'api' | 'example' | 'usage';
  lastUpdated: string; // ISO timestamp string
  version: string;
  framework: string;
  language: string;
  // Serialized JSON for code blocks and props
  codeBlocks: string;
  props: string;
};
/**
 * Query preprocessing result for improved search
 */
interface ProcessedQuery {
  /** Exact phrases to match, extracted from "quoted" substrings (quotes removed) */
  phrases: string[];
  /** Query text with quote characters stripped and whitespace trimmed */
  cleanedQuery: string;
  /** Original query, unmodified, kept for fallback */
  original: string;
}
/**
 * Preprocesses a search query - keeps it generic for any documentation type.
 * Only extracts explicitly quoted phrases, otherwise passes through to LanceDB's
 * built-in tokenization which handles stop words and stemming.
 */
function preprocessQuery(query: string): ProcessedQuery {
  // Collect the contents of every "quoted phrase" for exact matching.
  const phrases = [...query.matchAll(/"([^"]+)"/g)].map((m) => m[1]);
  // Drop the quote characters but keep the phrase text in the general query.
  const cleanedQuery = query.replace(/"([^"]+)"/g, '$1').trim();
  const result: ProcessedQuery = {
    phrases,
    cleanedQuery,
    original: query,
  };
  logger.debug('[QueryPreprocess] Processed query:', result);
  return result;
}
/**
 * StorageProvider backed by two stores: SQLite holds document metadata, tags
 * and collections; LanceDB holds per-chunk content and embedding vectors for
 * vector and full-text search.
 */
export class DocumentStore implements StorageProvider {
  // Both handles are assigned in initialize(); methods throw if used before then.
  private sqliteDb?: Database;
  private lanceConn?: LanceDBConnection;
  private lanceTable?: LanceDBTable;
  // LRU cache of search results, keyed by query text + serialized options.
  private readonly searchCache: QuickLRU<string, SearchResult[]>;
  // True once the LanceDB FTS index has been created (or found to already exist).
  private ftsIndexCreated = false;
  /**
   * @param dbPath - File path of the SQLite metadata database.
   * @param vectorDbPath - Directory used by LanceDB for the vector store.
   * @param embeddings - Embedding provider; its `dimensions` drives the vector schema.
   * @param maxCacheSize - Maximum entries kept in the search-result LRU cache.
   */
  constructor(
    private readonly dbPath: string,
    private readonly vectorDbPath: string,
    private readonly embeddings: EmbeddingsProvider,
    maxCacheSize: number = 1000
  ) {
    logger.debug(`[DocumentStore] Initializing with paths:`, {
      dbPath,
      vectorDbPath,
      maxCacheSize,
    });
    this.searchCache = new QuickLRU({ maxSize: maxCacheSize });
  }
  /**
   * Open and prepare both storage backends. Must complete before other methods.
   *
   * Order of operations:
   *  1. Create the SQLite parent directory and the LanceDB directory.
   *  2. Open SQLite and set pragmas (busy_timeout, WAL, foreign_keys).
   *  3. Create the base `documents` table, then apply pending migrations.
   *  4. Connect to LanceDB; create the `chunks` table (Arrow schema below) on
   *     first run, otherwise open it; ensure the FTS index exists.
   *
   * @throws Error identifying which stage failed (directories, SQLite, LanceDB).
   */
  async initialize(): Promise<void> {
    logger.debug(`[DocumentStore] Starting initialization with paths:`, {
      dbPath: this.dbPath,
      vectorDbPath: this.vectorDbPath,
    });
    try {
      // Create directories with error handling
      try {
        logger.debug(`[DocumentStore] Creating SQLite directory: ${dirname(this.dbPath)}`);
        await mkdir(dirname(this.dbPath), { recursive: true });
        logger.debug(`[DocumentStore] Creating LanceDB directory: ${this.vectorDbPath}`);
        await mkdir(this.vectorDbPath, { recursive: true });
      } catch (error) {
        logger.error('[DocumentStore] Error creating directories:', error);
        throw new Error(`Failed to create storage directories: ${error instanceof Error ? error.message : String(error)}`);
      }
      // Initialize SQLite with error handling
      try {
        logger.debug(`[DocumentStore] Opening SQLite database at ${this.dbPath}`);
        this.sqliteDb = await open({
          filename: this.dbPath,
          driver: sqlite3.Database,
        });
        logger.debug(`[DocumentStore] Configuring SQLite database`);
        // Wait up to 5s on a locked database instead of failing immediately.
        await this.sqliteDb.exec('PRAGMA busy_timeout = 5000;');
        // WAL journal allows concurrent readers during writes.
        await this.sqliteDb.exec('PRAGMA journal_mode = WAL;');
        // Required for the ON DELETE CASCADE constraints on tag/collection tables.
        await this.sqliteDb.exec('PRAGMA foreign_keys = ON;');
      } catch (error) {
        logger.error('[DocumentStore] Error initializing SQLite:', error);
        throw new Error(`Failed to initialize SQLite: ${error instanceof Error ? error.message : String(error)}`);
      }
      // Create base tables if they don't exist
      await this.sqliteDb.exec(`
CREATE TABLE IF NOT EXISTS documents (
url TEXT PRIMARY KEY,
title TEXT NOT NULL,
favicon TEXT,
last_indexed DATETIME NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_last_indexed ON documents(last_indexed);
`);
      // Run database migrations
      await this.runMigrations();
      // Initialize LanceDB with error handling
      try {
        logger.debug(`[DocumentStore] Connecting to LanceDB at ${this.vectorDbPath}`);
        this.lanceConn = await lancedb.connect(this.vectorDbPath);
        logger.debug(`[DocumentStore] Getting table list`);
        const tableNames = await this.lanceConn.tableNames();
        logger.debug(`[DocumentStore] Existing tables:`, tableNames);
        // Only create the table if it doesn't exist
        if (!tableNames.includes('chunks')) {
          logger.debug(`[DocumentStore] Creating chunks table with dimensions: ${this.embeddings.dimensions}`);
          // Define schema using Apache Arrow (must stay in sync with LanceDbRow)
          const vectorType = new FixedSizeList(this.embeddings.dimensions, new Field('item', new Float32(), true));
          const schema = new Schema([
            new Field('url', new Utf8(), false),
            new Field('title', new Utf8(), false),
            new Field('content', new Utf8(), false),
            new Field('path', new Utf8(), false),
            new Field('startLine', new Int32(), false),
            new Field('endLine', new Int32(), false),
            new Field('vector', vectorType, false),
            new Field('type', new Utf8(), false),
            new Field('lastUpdated', new Utf8(), false),
            new Field('version', new Utf8(), true),
            new Field('framework', new Utf8(), true),
            new Field('language', new Utf8(), true),
            // Flatten arrays to simple strings for better FTS support
            new Field('codeBlocks', new Utf8(), true),
            new Field('props', new Utf8(), true),
          ]);
          // Create empty table with schema
          this.lanceTable = await this.lanceConn.createEmptyTable('chunks', schema, { mode: 'create' });
          logger.debug(`[DocumentStore] New chunks table created successfully`);
          // Create FTS index for better text search
          await this.createFTSIndex();
        } else {
          logger.debug(`[DocumentStore] Using existing chunks table`);
          this.lanceTable = await this.lanceConn.openTable('chunks');
          // Try to create FTS index if it doesn't exist
          await this.createFTSIndex();
        }
        // Verify table is accessible
        const rowCount = await this.lanceTable.countRows();
        logger.debug(`[DocumentStore] Chunks table initialized, contains ${rowCount} rows`);
      } catch (error) {
        logger.error('[DocumentStore] Error initializing LanceDB:', error);
        throw new Error(`Failed to initialize LanceDB: ${error instanceof Error ? error.message : String(error)}`);
      }
      logger.debug(`[DocumentStore] All storage components initialized successfully`);
    } catch (error) {
      logger.error('[DocumentStore] Error initializing storage:', error);
      throw error;
    }
  }
  /**
   * Database migrations list.
   * Each migration has a unique version number and SQL statements to execute.
   * Migrations are applied in order and tracked in the schema_migrations table.
   *
   * To add a new migration:
   * 1. Add a new entry with the next version number
   * 2. Include a description for logging
   * 3. Add the SQL statement(s) to execute
   *
   * Note: runMigrations() splits each `sql` string on ';', so statements here
   * must not contain embedded semicolons (e.g. inside triggers or literals).
   */
  private static readonly MIGRATIONS: Array<{
    version: number;
    description: string;
    sql: string;
  }> = [
    {
      version: 1,
      description: 'Add authentication tracking columns',
      // ALTER TABLE has no IF NOT EXISTS; runMigrations() tolerates
      // "duplicate column name" errors so re-runs are safe.
      sql: `
ALTER TABLE documents ADD COLUMN requires_auth INTEGER DEFAULT 0;
ALTER TABLE documents ADD COLUMN auth_domain TEXT;
`,
    },
    {
      version: 2,
      description: 'Add document tags table',
      sql: `
CREATE TABLE IF NOT EXISTS document_tags (
url TEXT NOT NULL,
tag TEXT NOT NULL,
PRIMARY KEY (url, tag),
FOREIGN KEY (url) REFERENCES documents(url) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_document_tags_tag ON document_tags(tag);
`,
    },
    {
      version: 3,
      description: 'Add version column for versioned package documentation',
      sql: `
ALTER TABLE documents ADD COLUMN version TEXT;
CREATE INDEX IF NOT EXISTS idx_documents_version ON documents(version);
`,
    },
    {
      version: 4,
      description: 'Add collections feature for grouping documentation sites',
      sql: `
CREATE TABLE IF NOT EXISTS collections (
name TEXT PRIMARY KEY,
description TEXT,
created_at DATETIME NOT NULL,
updated_at DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS collection_documents (
collection_name TEXT NOT NULL,
url TEXT NOT NULL,
added_at DATETIME NOT NULL,
PRIMARY KEY (collection_name, url),
FOREIGN KEY (collection_name) REFERENCES collections(name) ON DELETE CASCADE,
FOREIGN KEY (url) REFERENCES documents(url) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_collection_documents_name ON collection_documents(collection_name);
CREATE INDEX IF NOT EXISTS idx_collection_documents_url ON collection_documents(url);
`,
    },
  ];
  /**
   * Run pending database migrations.
   * Creates a schema_migrations table to track applied migrations.
   * Only runs migrations that haven't been applied yet.
   *
   * Statements are executed one at a time (split on ';') and "duplicate column"
   * errors from ALTER TABLE are tolerated, so partially-applied migrations can
   * safely be re-run.
   *
   * @throws If SQLite is uninitialized or a migration statement fails.
   */
  private async runMigrations(): Promise<void> {
    if (!this.sqliteDb) {
      throw new Error('SQLite not initialized');
    }
    // Create migrations tracking table
    await this.sqliteDb.exec(`
CREATE TABLE IF NOT EXISTS schema_migrations (
version INTEGER PRIMARY KEY,
applied_at DATETIME NOT NULL,
description TEXT
);
`);
    // Get list of already applied migrations
    const applied = await this.sqliteDb.all<Array<{ version: number }>>('SELECT version FROM schema_migrations ORDER BY version');
    const appliedVersions = new Set(applied.map((row) => row.version));
    // Run pending migrations in order
    for (const migration of DocumentStore.MIGRATIONS) {
      if (appliedVersions.has(migration.version)) {
        logger.debug(`[DocumentStore] Migration ${migration.version} already applied: ${migration.description}`);
        continue;
      }
      logger.info(`[DocumentStore] Running migration ${migration.version}: ${migration.description}`);
      try {
        // Split SQL statements and execute each one
        // (SQLite doesn't support multiple statements in exec for ALTER TABLE)
        const statements = migration.sql
          .split(';')
          .map((s) => s.trim())
          .filter((s) => s.length > 0);
        for (const statement of statements) {
          try {
            await this.sqliteDb.exec(statement);
          } catch (error) {
            // Ignore "duplicate column" errors for ALTER TABLE ADD COLUMN
            // This handles cases where the column already exists
            const errorMsg = error instanceof Error ? error.message : String(error);
            if (errorMsg.includes('duplicate column name')) {
              logger.debug(`[DocumentStore] Column already exists, skipping: ${statement}`);
              continue;
            }
            throw error;
          }
        }
        // Record successful migration so it is never re-applied
        await this.sqliteDb.run('INSERT INTO schema_migrations (version, applied_at, description) VALUES (?, ?, ?)', [
          migration.version,
          new Date().toISOString(),
          migration.description,
        ]);
        logger.info(`[DocumentStore] ✓ Migration ${migration.version} completed`);
      } catch (error) {
        logger.error(`[DocumentStore] Migration ${migration.version} failed:`, error);
        throw new Error(`Database migration ${migration.version} failed: ${error instanceof Error ? error.message : String(error)}`);
      }
    }
  }
async addDocument(doc: ProcessedDocument): Promise<void> {
logger.debug(`[DocumentStore] Starting addDocument for:`, {
url: doc.metadata.url,
title: doc.metadata.title,
chunks: doc.chunks.length,
});
// Add diagnostic logging for vector dimensions
if (doc.chunks.length > 0) {
logger.debug(`[DocumentStore] Sample vector dimensions: ${doc.chunks[0].vector.length}`);
logger.debug(`[DocumentStore] Sample vector first 5 values: ${doc.chunks[0].vector.slice(0, 5)}`);
}
// Validate storage initialization
if (!this.sqliteDb) {
logger.debug('[DocumentStore] SQLite not initialized during addDocument');
throw new Error('SQLite storage not initialized');
}
if (!this.lanceTable) {
logger.debug('[DocumentStore] LanceDB not initialized during addDocument');
throw new Error('LanceDB storage not initialized');
}
try {
// Check if document already exists
const existing = await this.getDocument(doc.metadata.url);
if (existing) {
logger.debug(`[DocumentStore] Existing document found, will update:`, existing);
}
logger.debug(`[DocumentStore] Starting SQLite transaction`);
await this.sqliteDb.run('BEGIN TRANSACTION');
// Add metadata to SQLite
await this.sqliteDb.run(
'INSERT OR REPLACE INTO documents (url, title, favicon, last_indexed, requires_auth, auth_domain, version) VALUES (?, ?, ?, ?, ?, ?, ?)',
[
doc.metadata.url,
doc.metadata.title,
doc.metadata.favicon,
doc.metadata.lastIndexed.toISOString(),
doc.metadata.requiresAuth ? 1 : 0,
doc.metadata.authDomain || null,
doc.metadata.version || null,
]
);
logger.debug(
`[DocumentStore] Added metadata to SQLite (requiresAuth: ${doc.metadata.requiresAuth}, authDomain: ${doc.metadata.authDomain}, version: ${doc.metadata.version})`
);
// Delete existing chunks for this document (using escaped value to prevent injection)
await this.lanceTable.delete(`url = '${escapeFilterValue(doc.metadata.url)}'`);
logger.debug(`[DocumentStore] Deleted existing chunks`);
// Add new chunks to LanceDB
const rows = doc.chunks.map((chunk) => ({
url: doc.metadata.url,
title: doc.metadata.title,
content: chunk.content,
path: chunk.path,
startLine: chunk.startLine,
endLine: chunk.endLine,
vector: chunk.vector,
type: chunk.metadata.type,
lastUpdated: new Date().toISOString(),
version: '',
framework: '',
language: '',
// Serialize code blocks and props as JSON strings
codeBlocks: JSON.stringify(chunk.metadata.codeBlocks || []),
props: JSON.stringify(chunk.metadata.props || []),
})) as LanceDbRow[];
logger.debug(`[DocumentStore] Adding ${rows.length} chunks to LanceDB`);
await this.lanceTable.add(rows);
// Verify data was added
const rowCount = await this.lanceTable.countRows();
logger.debug(`[DocumentStore] Table now contains ${rowCount} rows`);
// Commit transaction
await this.sqliteDb.run('COMMIT');
logger.debug(`[DocumentStore] Committed transaction`);
// Clear search cache for this URL
this.clearCacheForUrl(doc.metadata.url);
} catch (error) {
// Rollback on error
if (this.sqliteDb) {
await this.sqliteDb.run('ROLLBACK');
}
logger.error('[DocumentStore] Error adding document:', error);
throw error;
}
}
  /**
   * Semantic (vector) search over chunks in LanceDB.
   *
   * @param queryVector - Query embedding; padded with zeros or truncated when
   *   its length differs from the provider's expected dimensions.
   * @param options - Honors limit, includeVectors, filterByType, filterByTags;
   *   textQuery only affects the empty-vector short-circuit here.
   * @returns Results mapped to SearchResult with JSON metadata fields parsed.
   * @throws If the LanceDB table is not initialized or the query fails.
   */
  async searchDocuments(queryVector: number[], options: SearchOptions = {}): Promise<SearchResult[]> {
    if (!this.lanceTable) {
      throw new Error('Storage not initialized');
    }
    const { limit = 10, includeVectors = false, filterByType, filterByTags, textQuery } = options;
    logger.debug(`[DocumentStore] Searching documents with vector:`, {
      dimensions: queryVector.length,
      limit,
      includeVectors,
      filterByType,
      filterByTags,
      hasTextQuery: !!textQuery,
    });
    // Add validation for query vector
    if (queryVector.length === 0 && !textQuery) {
      logger.debug('[DocumentStore] Empty query vector and no text query provided');
      return [];
    }
    // Log search parameters
    logger.debug(`[DocumentStore] Search parameters:`, {
      vectorDimensions: queryVector.length,
      expectedDimensions: this.embeddings.dimensions,
      limit,
      filterType: filterByType,
    });
    // Ensure vector dimensions match if provided
    if (queryVector.length > 0 && queryVector.length !== this.embeddings.dimensions) {
      logger.debug(`[DocumentStore] Vector dimension mismatch: got ${queryVector.length}, expected ${this.embeddings.dimensions}`);
      // Consider padding or truncating the vector to match expected dimensions
      if (queryVector.length < this.embeddings.dimensions) {
        // Pad the vector with zeros
        queryVector = [...queryVector, ...new Array(this.embeddings.dimensions - queryVector.length).fill(0)];
        logger.debug(`[DocumentStore] Padded vector to ${queryVector.length} dimensions`);
      } else {
        // Truncate the vector
        queryVector = queryVector.slice(0, this.embeddings.dimensions);
        logger.debug(`[DocumentStore] Truncated vector to ${queryVector.length} dimensions`);
      }
    }
    try {
      // Log query vector for debugging
      logger.debug(`[DocumentStore] Query vector first 5 values: ${queryVector.slice(0, 5)}`);
      // Ensure we have a valid query vector
      if (queryVector.length === 0) {
        logger.debug('[DocumentStore] Empty query vector provided for search');
        // Use a default vector of the correct dimension instead of an empty array
        queryVector = new Array(this.embeddings.dimensions).fill(0);
        logger.debug(`[DocumentStore] Using default zero vector with ${queryVector.length} dimensions`);
      }
      // If filtering by tags, get the list of URLs that have all those tags
      let tagFilteredUrls: string[] | undefined;
      if (filterByTags && filterByTags.length > 0) {
        tagFilteredUrls = await this.getUrlsByTags(filterByTags);
        if (tagFilteredUrls.length === 0) {
          // No documents match the tag filter, return empty results
          logger.debug(`[DocumentStore] No documents match tag filter in vector search:`, filterByTags);
          return [];
        }
        logger.debug(`[DocumentStore] Tag filter matched ${tagFilteredUrls.length} documents for vector search`);
      }
      // Create search query
      let query = this.lanceTable.search(queryVector).limit(limit);
      // Build WHERE conditions (values escaped to prevent filter injection)
      const conditions: string[] = [];
      if (filterByType) {
        conditions.push(`type = '${escapeFilterValue(filterByType)}'`);
      }
      if (tagFilteredUrls && tagFilteredUrls.length > 0) {
        const urlConditions = tagFilteredUrls.map((u) => `url = '${escapeFilterValue(u)}'`).join(' OR ');
        conditions.push(`(${urlConditions})`);
      }
      if (conditions.length > 0) {
        query = query.where(conditions.join(' AND '));
      }
      const results = await query.toArray();
      logger.debug(`[DocumentStore] Found ${results.length} results`);
      // Log the first result for debugging if available
      if (results.length > 0) {
        logger.debug(`[DocumentStore] First result:`, {
          id: results[0].id,
          score: results[0].score,
          hasVector: 'vector' in results[0],
          vectorType: typeof results[0].vector,
          vectorLength: Array.isArray(results[0].vector) ? results[0].vector.length : 'not an array',
        });
      }
      const searchResults = results.map((result: LanceDbRow & { id?: string; score?: number; _distance?: number }) => {
        // Log the raw result for debugging
        logger.debug(`[DocumentStore] Raw search result:`, {
          id: result.id,
          url: result.url,
          hasVector: !!result.vector,
          vectorType: result.vector ? typeof result.vector : 'undefined',
          vectorLength: result.vector ? (Array.isArray(result.vector) ? result.vector.length : 'not an array') : 0,
        });
        // Parse JSON fields (stored as serialized strings; malformed JSON -> undefined)
        let codeBlocks;
        let props;
        try {
          codeBlocks = result.codeBlocks ? JSON.parse(result.codeBlocks) : undefined;
        } catch {
          codeBlocks = undefined;
        }
        try {
          props = result.props ? JSON.parse(result.props) : undefined;
        } catch {
          props = undefined;
        }
        return {
          id: String(result.id || result.url),
          content: String(result.content),
          url: String(result.url),
          title: String(result.title),
          // NOTE(review): 1 - _distance assumes a distance where smaller means
          // closer and values are roughly in [0, 1]; confirm the table's metric.
          score: result._distance != null ? 1 - result._distance : (result.score ?? 0),
          ...(includeVectors && { vector: result.vector as number[] }),
          metadata: {
            type: (result.type || 'overview') as 'overview' | 'api' | 'example' | 'usage',
            path: String(result.path),
            lastUpdated: new Date(result.lastUpdated ? String(result.lastUpdated) : Date.now()),
            version: result.version as string | undefined,
            framework: result.framework as string | undefined,
            language: result.language as string | undefined,
            codeBlocks,
            props,
          },
        };
      });
      return searchResults;
    } catch (error) {
      logger.error('[DocumentStore] Error searching documents:', error);
      throw error;
    }
  }
/**
* Create full-text search index on the content field.
* Uses replace: true to prevent accumulation of old index copies.
*/
private async createFTSIndex(): Promise<void> {
if (!this.lanceTable || this.ftsIndexCreated) {
return;
}
try {
logger.debug('[DocumentStore] Creating FTS index on content field...');
await this.lanceTable.createIndex('content', {
config: lancedb.Index.fts(),
replace: true, // Replace existing index to prevent accumulation
});
this.ftsIndexCreated = true;
logger.debug('[DocumentStore] FTS index created successfully');
} catch (error) {
const err = error as Error;
if (err.message?.toLowerCase().includes('already exists')) {
logger.debug('[DocumentStore] FTS index already exists');
this.ftsIndexCreated = true;
} else {
logger.warn('[DocumentStore] Failed to create FTS index:', err.message);
// Don't throw - FTS is optional, we can fall back to vector search
}
}
}
  /**
   * Cached, layered text search.
   *
   * Strategies, tried in order:
   *  1. Quoted-phrase search (when the FTS index exists and the query contains
   *     "quoted phrases"): BooleanQuery of PhraseQuery (Must) + fuzzy
   *     MatchQuery (Should), merged with vector results via RRF.
   *  2. Standard hybrid: fuzzy MatchQuery on the cleaned query merged with
   *     vector results via RRF.
   *  3. Fallback: pure vector search through searchDocuments().
   *
   * @param query - Raw user query; quoted substrings become exact phrases.
   * @param options - Honors limit, filterByType, filterUrl, filterByTags.
   * @returns Ranked results; also stored in the LRU cache keyed by query+options.
   * @throws If storage is uninitialized or every strategy errors out.
   */
  async searchByText(query: string, options: SearchOptions = {}): Promise<SearchResult[]> {
    logger.debug(`[DocumentStore] Searching documents by text:`, { query, options });
    const cacheKey = `text:${query}:${JSON.stringify(options)}`;
    const cached = this.searchCache.get(cacheKey);
    if (cached) {
      logger.debug(`[DocumentStore] Returning cached results`);
      return cached;
    }
    const { limit = 10, filterByType, filterUrl, filterByTags } = options;
    // If filtering by tags, get the list of URLs that have all those tags
    let tagFilteredUrls: string[] | undefined;
    if (filterByTags && filterByTags.length > 0) {
      tagFilteredUrls = await this.getUrlsByTags(filterByTags);
      if (tagFilteredUrls.length === 0) {
        // No documents match the tag filter, cache and return empty results
        logger.debug(`[DocumentStore] No documents match tag filter:`, filterByTags);
        const emptyResults: SearchResult[] = [];
        this.searchCache.set(cacheKey, emptyResults);
        return emptyResults;
      }
      logger.debug(`[DocumentStore] Tag filter matched ${tagFilteredUrls.length} documents`);
    }
    // Build WHERE clause for filtering (using escaped values to prevent injection)
    const buildWhereClause = (): string | undefined => {
      const conditions: string[] = [];
      if (filterByType) {
        conditions.push(`type = '${escapeFilterValue(filterByType)}'`);
      }
      if (filterUrl) {
        // Filter by base URL - use LIKE to match URLs that start with the base URL
        // Escape the filterUrl and also escape LIKE wildcards within the value
        const escapedUrl = escapeFilterValue(filterUrl).replace(/%/g, '\\%').replace(/_/g, '\\_');
        conditions.push(`url LIKE '${escapedUrl}%'`);
      }
      if (tagFilteredUrls && tagFilteredUrls.length > 0) {
        // Filter to only include URLs that match the tag filter
        const urlConditions = tagFilteredUrls.map((u) => `url = '${escapeFilterValue(u)}'`).join(' OR ');
        conditions.push(`(${urlConditions})`);
      }
      return conditions.length > 0 ? conditions.join(' AND ') : undefined;
    };
    const whereClause = buildWhereClause();
    try {
      if (!this.lanceTable) {
        throw new Error('Storage not initialized');
      }
      // Preprocess query - only extracts quoted phrases, keeps everything else generic
      const processedQuery = preprocessQuery(query);
      // Generate embedding for vector search
      const queryVector = await this.embeddings.embed(query);
      logger.debug('[DocumentStore] Attempting hybrid search (FTS + vector with RRF)');
      // Strategy 1: If user provided quoted phrases, use phrase matching
      if (this.ftsIndexCreated && processedQuery.phrases.length > 0) {
        try {
          logger.debug('[DocumentStore] Using phrase-based search for quoted terms:', processedQuery.phrases);
          // Build boolean query: phrase matches (must) + general terms (should)
          const queries: [Occur, PhraseQuery | MatchQuery][] = [];
          // Add phrase queries for quoted phrases (exact match; slop 0 = terms must be adjacent)
          for (const phrase of processedQuery.phrases) {
            queries.push([Occur.Must, new PhraseQuery(phrase, 'content', { slop: 0 })]);
          }
          // Add fuzzy match for the overall cleaned query
          if (processedQuery.cleanedQuery) {
            queries.push([Occur.Should, new MatchQuery(processedQuery.cleanedQuery, 'content', { fuzziness: 1 })]);
          }
          const boolQuery = new BooleanQuery(queries);
          // Over-fetch (limit * 2) so RRF has enough candidates from each source
          let ftsQuery = this.lanceTable
            .query()
            .fullTextSearch(boolQuery)
            .limit(limit * 2);
          if (whereClause) {
            ftsQuery = ftsQuery.where(whereClause);
          }
          const ftsResults = await ftsQuery.toArray();
          logger.debug(`[DocumentStore] Phrase-based FTS returned ${ftsResults.length} results`);
          if (ftsResults.length > 0) {
            // Combine with vector search for semantic relevance
            let vectorQuery = this.lanceTable.search(queryVector).limit(limit * 2);
            if (whereClause) {
              vectorQuery = vectorQuery.where(whereClause);
            }
            const vectorResults = await vectorQuery.toArray();
            const mergedResults = this.mergeAndRankResults(ftsResults, vectorResults, limit);
            const searchResults = this.formatSearchResults(mergedResults);
            this.searchCache.set(cacheKey, searchResults);
            return searchResults;
          }
          // No phrase hits: fall through to the standard hybrid strategy below
        } catch (phraseError) {
          const err = phraseError as Error;
          logger.debug('[DocumentStore] Phrase-based search failed:', err.message);
        }
      }
      // Strategy 2: Standard hybrid search - FTS with fuzziness + vector search
      if (this.ftsIndexCreated) {
        try {
          // LanceDB's FTS already handles stop words and stemming
          // Add fuzziness for typo tolerance
          const matchQuery = new MatchQuery(processedQuery.cleanedQuery, 'content', { fuzziness: 1 });
          let ftsQuery = this.lanceTable
            .query()
            .fullTextSearch(matchQuery)
            .limit(limit * 2);
          if (whereClause) {
            ftsQuery = ftsQuery.where(whereClause);
          }
          const ftsResults = await ftsQuery.toArray();
          logger.debug(`[DocumentStore] FTS returned ${ftsResults.length} results`);
          // Always combine with vector search for best results
          let vectorQuery = this.lanceTable.search(queryVector).limit(limit * 2);
          if (whereClause) {
            vectorQuery = vectorQuery.where(whereClause);
          }
          const vectorResults = await vectorQuery.toArray();
          logger.debug(`[DocumentStore] Vector search returned ${vectorResults.length} results`);
          // Merge using RRF even if one is empty - ensures we get results
          const mergedResults = this.mergeAndRankResults(ftsResults, vectorResults, limit);
          if (mergedResults.length > 0) {
            const searchResults = this.formatSearchResults(mergedResults);
            this.searchCache.set(cacheKey, searchResults);
            return searchResults;
          }
        } catch (ftsError) {
          const err = ftsError as Error;
          logger.debug('[DocumentStore] FTS search failed, falling back to vector search:', err.message);
        }
      }
      // Strategy 3: Fallback to pure vector search (semantic similarity)
      logger.debug('[DocumentStore] Falling back to pure vector search');
      const results = await this.searchDocuments(queryVector, options);
      this.searchCache.set(cacheKey, results);
      return results;
    } catch (error) {
      logger.error('[DocumentStore] Error searching documents by text:', error);
      throw error;
    }
  }
/**
* Merge FTS and vector results using Reciprocal Rank Fusion (RRF)
*/
private mergeAndRankResults(
ftsResults: LanceDbRow[],
vectorResults: LanceDbRow[],
limit: number
): (LanceDbRow & { _rrfScore: number })[] {
const k = 60; // RRF constant
const scores = new Map<string, { result: LanceDbRow; score: number }>();
// Score FTS results
ftsResults.forEach((result, rank) => {
const key = `${result.url}:${result.path}:${result.startLine}`;
const rrfScore = 1 / (k + rank + 1);
scores.set(key, { result, score: rrfScore });
});
// Add/combine vector results
vectorResults.forEach((result, rank) => {
const key = `${result.url}:${result.path}:${result.startLine}`;
const rrfScore = 1 / (k + rank + 1);
if (scores.has(key)) {
// Combine scores if result appears in both
const existing = scores.get(key)!;
existing.score += rrfScore;
} else {
scores.set(key, { result, score: rrfScore });
}
});
// Sort by combined RRF score and return top results
return Array.from(scores.values())
.sort((a, b) => b.score - a.score)
.slice(0, limit)
.map((item) => ({ ...item.result, _rrfScore: item.score }));
}
/**
* Format raw LanceDB results into SearchResult objects
*/
private formatSearchResults(results: (LanceDbRow & { _rrfScore?: number; _distance?: number; _score?: number })[]): SearchResult[] {
return results.map((result) => {
let codeBlocks, props;
try {
codeBlocks = result.codeBlocks ? JSON.parse(result.codeBlocks) : undefined;
} catch {
codeBlocks = undefined;
}
try {
props = result.props ? JSON.parse(result.props) : undefined;
} catch {
props = undefined;
}
return {
id: String(result.url),
content: String(result.content),
url: String(result.url),
title: String(result.title),
score: result._rrfScore ?? (result._distance != null ? 1 - result._distance : (result._score ?? 0)),
metadata: {
type: (result.type || 'overview') as 'overview' | 'api' | 'example' | 'usage',
path: String(result.path),
lastUpdated: new Date(result.lastUpdated ? String(result.lastUpdated) : Date.now()),
version: result.version as string | undefined,
framework: result.framework as string | undefined,
language: result.language as string | undefined,
codeBlocks,
props,
},
};
});
}
async listDocuments(): Promise<DocumentMetadata[]> {
if (!this.sqliteDb) {
throw new Error('Storage not initialized');
}
logger.debug(`[DocumentStore] Listing documents`);
try {
const rows = await this.sqliteDb.all<
Array<{
url: string;
title: string;
favicon: string | null;
last_indexed: string;
requires_auth: number | null;
auth_domain: string | null;
version: string | null;
}>
>('SELECT url, title, favicon, last_indexed, requires_auth, auth_domain, version FROM documents ORDER BY last_indexed DESC');
logger.debug(`[DocumentStore] Found ${rows.length} documents`);
// Fetch tags for all documents
const tagsMap = await this.getAllDocumentTags();
return rows.map((row) => ({
url: row.url,
title: row.title,
favicon: row.favicon ?? undefined,
lastIndexed: new Date(row.last_indexed),
requiresAuth: row.requires_auth === 1,
authDomain: row.auth_domain ?? undefined,
tags: tagsMap.get(row.url) || [],
version: row.version ?? undefined,
}));
} catch (error) {
logger.error('[DocumentStore] Error listing documents:', error);
throw error;
}
}
/**
* Get all tags for all documents as a Map
*/
private async getAllDocumentTags(): Promise<Map<string, string[]>> {
if (!this.sqliteDb) {
return new Map();
}
const rows = await this.sqliteDb.all<Array<{ url: string; tag: string }>>('SELECT url, tag FROM document_tags ORDER BY url, tag');
const tagsMap = new Map<string, string[]>();
for (const row of rows) {
const existing = tagsMap.get(row.url) || [];
existing.push(row.tag);
tagsMap.set(row.url, existing);
}
return tagsMap;
}
async deleteDocument(url: string): Promise<void> {
if (!this.sqliteDb || !this.lanceTable) {
throw new Error('Storage not initialized');
}
logger.debug(`[DocumentStore] Deleting document: ${url}`);
try {
await this.sqliteDb.run('BEGIN TRANSACTION');
// Delete tags first (in case foreign key cascade isn't enabled)
await this.sqliteDb.run('DELETE FROM document_tags WHERE url = ?', [url]);
await this.sqliteDb.run('DELETE FROM documents WHERE url = ?', [url]);
await this.lanceTable.delete(`url = '${escapeFilterValue(url)}'`);
await this.sqliteDb.run('COMMIT');
// Clear cache for this URL
this.clearCacheForUrl(url);
logger.debug(`[DocumentStore] Document deleted successfully`);
} catch (error) {
if (this.sqliteDb) {
await this.sqliteDb.run('ROLLBACK');
}
logger.error('[DocumentStore] Error deleting document:', error);
throw error;
}
}
async getDocument(url: string): Promise<DocumentMetadata | null> {
if (!this.sqliteDb) {
throw new Error('Storage not initialized');
}
logger.debug(`[DocumentStore] Getting document: ${url}`);
try {
// Check if SQLite is properly initialized
if (!this.sqliteDb) {
logger.debug('[DocumentStore] SQLite not initialized during getDocument');
throw new Error('Storage not initialized');
}
// Log the query being executed
logger.debug(`[DocumentStore] Executing SQLite query for URL: ${url}`);
const row = await this.sqliteDb.get<{
url: string;
title: string;
favicon: string | null;
last_indexed: string;
requires_auth: number | null;
auth_domain: string | null;
version: string | null;
}>('SELECT url, title, favicon, last_indexed, requires_auth, auth_domain, version FROM documents WHERE url = ?', [url]);
if (!row) {
logger.debug(`[DocumentStore] Document not found in SQLite: ${url}`);
return null;
}
// Check if LanceDB has any chunks for this document
if (this.lanceTable) {
const chunks = await this.lanceTable.countRows(`url = '${escapeFilterValue(url)}'`);
logger.debug(`[DocumentStore] Found ${chunks} chunks in LanceDB for ${url}`);
}
// Fetch tags for this document
const tags = await this.getDocumentTags(url);
logger.debug(`[DocumentStore] Document found in SQLite:`, row);
return {
url: row.url,
title: row.title,
favicon: row.favicon ?? undefined,
lastIndexed: new Date(row.last_indexed),
requiresAuth: row.requires_auth === 1,
authDomain: row.auth_domain ?? undefined,
tags,
version: row.version ?? undefined,
};
} catch (error) {
logger.error('[DocumentStore] Error getting document:', error);
throw error;
}
}
/**
* Get tags for a specific document
*/
private async getDocumentTags(url: string): Promise<string[]> {
if (!this.sqliteDb) {
return [];
}
const rows = await this.sqliteDb.all<Array<{ tag: string }>>('SELECT tag FROM document_tags WHERE url = ? ORDER BY tag', [url]);
return rows.map((row) => row.tag);
}
/**
 * Set tags for a documentation site. Replaces any existing tags.
 * Tags are trimmed, lower-cased, and de-duplicated before insertion.
 * @param url - The URL of the documentation site
 * @param tags - Array of tags to assign (empty array removes all tags)
 * @throws Error when storage is uninitialized or the document does not exist
 */
async setTags(url: string, tags: string[]): Promise<void> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  logger.debug(`[DocumentStore] Setting tags for ${url}:`, tags);
  try {
    await this.sqliteDb.run('BEGIN TRANSACTION');
    // Existence check happens inside the transaction to avoid races
    const doc = await this.sqliteDb.get<{ url: string }>('SELECT url FROM documents WHERE url = ?', [url]);
    if (!doc) {
      await this.sqliteDb.run('ROLLBACK');
      throw new Error('Documentation not found');
    }
    // Replace the old tag set wholesale
    await this.sqliteDb.run('DELETE FROM document_tags WHERE url = ?', [url]);
    // Normalize, drop empties, and de-duplicate before inserting
    const normalized = new Set<string>();
    for (const raw of tags) {
      const tag = raw.trim().toLowerCase();
      if (tag.length > 0) {
        normalized.add(tag);
      }
    }
    for (const tag of normalized) {
      await this.sqliteDb.run('INSERT INTO document_tags (url, tag) VALUES (?, ?)', [url, tag]);
    }
    await this.sqliteDb.run('COMMIT');
    // Drop cached search results that may now carry stale tag data
    this.clearCacheForUrl(url);
    logger.debug(`[DocumentStore] Tags set successfully for ${url}`);
  } catch (error) {
    if (this.sqliteDb) {
      try {
        await this.sqliteDb.run('ROLLBACK');
      } catch {
        // Ignore rollback errors (transaction may already be rolled back)
      }
    }
    logger.error('[DocumentStore] Error setting tags:', error);
    throw error;
  }
}
/**
 * List all unique tags with their usage counts.
 * @returns Array of tags with counts, sorted by count descending (ties broken alphabetically)
 */
async listAllTags(): Promise<Array<{ tag: string; count: number }>> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  logger.debug(`[DocumentStore] Listing all tags`);
  try {
    const tagCounts = await this.sqliteDb.all<Array<{ tag: string; count: number }>>(
      'SELECT tag, COUNT(*) as count FROM document_tags GROUP BY tag ORDER BY count DESC, tag ASC'
    );
    logger.debug(`[DocumentStore] Found ${tagCounts.length} unique tags`);
    return tagCounts;
  } catch (error) {
    logger.error('[DocumentStore] Error listing tags:', error);
    throw error;
  }
}
/**
 * Get URLs of documents that have ALL of the specified tags.
 * @param tags - Array of tags that documents must have
 * @returns Array of matching document URLs
 */
async getUrlsByTags(tags: string[]): Promise<string[]> {
  if (!this.sqliteDb || tags.length === 0) {
    return [];
  }
  // Normalize the same way setTags does so lookups match stored values
  const normalizedTags = tags.map((t) => t.trim().toLowerCase()).filter((t) => t.length > 0);
  if (normalizedTags.length === 0) {
    return [];
  }
  logger.debug(`[DocumentStore] Getting URLs by tags:`, normalizedTags);
  try {
    // AND-semantics: a URL qualifies only when its distinct matching tag
    // count equals the number of requested tags
    const placeholders = normalizedTags.map(() => '?').join(', ');
    const query = `
      SELECT url FROM document_tags
      WHERE tag IN (${placeholders})
      GROUP BY url
      HAVING COUNT(DISTINCT tag) = ?
    `;
    const matches = await this.sqliteDb.all<Array<{ url: string }>>(query, [...normalizedTags, normalizedTags.length]);
    logger.debug(`[DocumentStore] Found ${matches.length} URLs matching all tags`);
    return matches.map((m) => m.url);
  } catch (error) {
    logger.error('[DocumentStore] Error getting URLs by tags:', error);
    throw error;
  }
}
// ============ Collection Methods ============
/**
 * Create a new collection.
 * @param name - Unique name for the collection
 * @param description - Optional description
 * @throws Error if a collection with this name already exists
 */
async createCollection(name: string, description?: string): Promise<void> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  const normalizedName = name.trim();
  const now = new Date().toISOString();
  logger.debug(`[DocumentStore] Creating collection: ${normalizedName}`);
  try {
    await this.sqliteDb.run(
      'INSERT INTO collections (name, description, created_at, updated_at) VALUES (?, ?, ?, ?)',
      [normalizedName, description || null, now, now]
    );
    logger.info(`[DocumentStore] Collection created: ${normalizedName}`);
  } catch (error) {
    // Translate a constraint violation into a friendlier duplicate-name error
    const message = (error as Error).message ?? '';
    const isDuplicate = message.includes('UNIQUE constraint failed') || message.includes('PRIMARY KEY constraint failed');
    if (isDuplicate) {
      throw new Error(`Collection "${normalizedName}" already exists`);
    }
    throw error;
  }
}
/**
 * Delete a collection.
 * Documents in the collection are NOT deleted, only the collection association.
 * @param name - Name of the collection to delete
 * @throws Error if collection doesn't exist
 */
async deleteCollection(name: string): Promise<void> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  const normalizedName = name.trim();
  logger.debug(`[DocumentStore] Deleting collection: ${normalizedName}`);
  const outcome = await this.sqliteDb.run('DELETE FROM collections WHERE name = ?', [normalizedName]);
  if (outcome.changes === 0) {
    throw new Error(`Collection "${normalizedName}" not found`);
  }
  // Membership rows in collection_documents are cascade-deleted via the FK
  logger.info(`[DocumentStore] Collection deleted: ${normalizedName}`);
}
/**
 * Update a collection's metadata.
 *
 * Renaming cannot be a simple UPDATE because collection_documents rows
 * reference the collection name via a foreign key, so a rename is done as
 * insert-new / copy-memberships / delete-old inside one transaction.
 *
 * @param name - Current name of the collection
 * @param updates - Fields to update (newName and/or description)
 * @throws Error if the collection doesn't exist or the new name is taken
 */
async updateCollection(name: string, updates: { newName?: string; description?: string }): Promise<void> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  const normalizedName = name.trim();
  const now = new Date().toISOString();
  // Check if collection exists
  const existing = await this.sqliteDb.get<{ name: string; description: string | null; created_at: string }>(
    'SELECT name, description, created_at FROM collections WHERE name = ?',
    [normalizedName]
  );
  if (!existing) {
    throw new Error(`Collection "${normalizedName}" not found`);
  }
  logger.debug(`[DocumentStore] Updating collection: ${normalizedName}`, updates);
  // Track whether a transaction was actually opened so the catch block does
  // not issue a spurious ROLLBACK for failures that happen outside one
  // (e.g. the duplicate-name check, or the description-only UPDATE path).
  let inTransaction = false;
  try {
    if (updates.newName !== undefined) {
      const newNormalizedName = updates.newName.trim();
      // Reject the rename up front if the target name is taken
      const existingNew = await this.sqliteDb.get<{ name: string }>('SELECT name FROM collections WHERE name = ?', [newNormalizedName]);
      if (existingNew) {
        throw new Error(`Collection "${newNormalizedName}" already exists`);
      }
      await this.sqliteDb.run('BEGIN TRANSACTION');
      inTransaction = true;
      // Create the renamed collection, preserving the original creation time
      const newDescription = updates.description ?? existing.description;
      await this.sqliteDb.run('INSERT INTO collections (name, description, created_at, updated_at) VALUES (?, ?, ?, ?)', [
        newNormalizedName,
        newDescription,
        existing.created_at,
        now,
      ]);
      // Re-point every membership row at the new collection name
      const docs = await this.sqliteDb.all<Array<{ url: string; added_at: string }>>(
        'SELECT url, added_at FROM collection_documents WHERE collection_name = ?',
        [normalizedName]
      );
      for (const doc of docs) {
        await this.sqliteDb.run('INSERT INTO collection_documents (collection_name, url, added_at) VALUES (?, ?, ?)', [
          newNormalizedName,
          doc.url,
          doc.added_at,
        ]);
      }
      // Deleting the old collection cascades away its old membership rows
      await this.sqliteDb.run('DELETE FROM collections WHERE name = ?', [normalizedName]);
      await this.sqliteDb.run('COMMIT');
      inTransaction = false;
    } else if (updates.description !== undefined) {
      // Description-only change: a single UPDATE, no transaction needed
      await this.sqliteDb.run('UPDATE collections SET description = ?, updated_at = ? WHERE name = ?', [
        updates.description,
        now,
        normalizedName,
      ]);
    }
    logger.info(`[DocumentStore] Collection updated: ${normalizedName}`);
  } catch (error) {
    if (inTransaction) {
      try {
        await this.sqliteDb.run('ROLLBACK');
      } catch {
        // Ignore rollback errors
      }
    }
    throw error;
  }
}
/**
 * List all collections with document counts.
 * @returns Array of collections sorted by name
 */
async listCollections(): Promise<Collection[]> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  logger.debug(`[DocumentStore] Listing collections`);
  type CollectionRow = {
    name: string;
    description: string | null;
    created_at: string;
    updated_at: string;
    document_count: number;
  };
  // LEFT JOIN so empty collections still appear with a count of zero
  const rows = await this.sqliteDb.all<CollectionRow[]>(`
    SELECT c.name, c.description, c.created_at, c.updated_at,
           COUNT(cd.url) as document_count
    FROM collections c
    LEFT JOIN collection_documents cd ON c.name = cd.collection_name
    GROUP BY c.name
    ORDER BY c.name ASC
  `);
  logger.debug(`[DocumentStore] Found ${rows.length} collections`);
  const collections: Collection[] = [];
  for (const row of rows) {
    collections.push({
      name: row.name,
      description: row.description ?? undefined,
      createdAt: new Date(row.created_at),
      updatedAt: new Date(row.updated_at),
      documentCount: row.document_count,
    });
  }
  return collections;
}
/**
 * Get a collection with its full list of documents.
 * @param name - Name of the collection
 * @returns Collection with documents, or null if not found
 */
async getCollection(name: string): Promise<CollectionWithDocuments | null> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  const normalizedName = name.trim();
  logger.debug(`[DocumentStore] Getting collection: ${normalizedName}`);
  // Collection metadata first; bail out early when it doesn't exist
  const meta = await this.sqliteDb.get<{
    name: string;
    description: string | null;
    created_at: string;
    updated_at: string;
  }>('SELECT name, description, created_at, updated_at FROM collections WHERE name = ?', [normalizedName]);
  if (!meta) {
    logger.debug(`[DocumentStore] Collection not found: ${normalizedName}`);
    return null;
  }
  type DocRow = {
    url: string;
    title: string;
    favicon: string | null;
    last_indexed: string;
    requires_auth: number | null;
    auth_domain: string | null;
    version: string | null;
  };
  const docRows = await this.sqliteDb.all<DocRow[]>(
    `
    SELECT d.url, d.title, d.favicon, d.last_indexed, d.requires_auth, d.auth_domain, d.version
    FROM documents d
    INNER JOIN collection_documents cd ON d.url = cd.url
    WHERE cd.collection_name = ?
    ORDER BY d.title ASC
    `,
    [normalizedName]
  );
  // One bulk tag lookup instead of a query per document
  const tagsMap = await this.getAllDocumentTags();
  const documents: DocumentMetadata[] = docRows.map((doc) => ({
    url: doc.url,
    title: doc.title,
    favicon: doc.favicon ?? undefined,
    lastIndexed: new Date(doc.last_indexed),
    requiresAuth: doc.requires_auth === 1,
    authDomain: doc.auth_domain ?? undefined,
    tags: tagsMap.get(doc.url) || [],
    version: doc.version ?? undefined,
  }));
  logger.debug(`[DocumentStore] Collection "${normalizedName}" has ${documents.length} documents`);
  return {
    name: meta.name,
    description: meta.description ?? undefined,
    createdAt: new Date(meta.created_at),
    updatedAt: new Date(meta.updated_at),
    documentCount: documents.length,
    documents,
  };
}
/**
 * Add documents to a collection.
 * @param name - Name of the collection
 * @param urls - URLs of documents to add
 * @returns Breakdown of which URLs were added, unknown, or already members
 * @throws Error if collection doesn't exist
 */
async addToCollection(name: string, urls: string[]): Promise<{ added: string[]; notFound: string[]; alreadyInCollection: string[] }> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  const normalizedName = name.trim();
  const now = new Date().toISOString();
  logger.debug(`[DocumentStore] Adding ${urls.length} documents to collection: ${normalizedName}`);
  const collection = await this.sqliteDb.get<{ name: string }>('SELECT name FROM collections WHERE name = ?', [normalizedName]);
  if (!collection) {
    throw new Error(`Collection "${normalizedName}" not found`);
  }
  const outcome: { added: string[]; notFound: string[]; alreadyInCollection: string[] } = {
    added: [],
    notFound: [],
    alreadyInCollection: [],
  };
  await this.sqliteDb.run('BEGIN TRANSACTION');
  try {
    for (const url of urls) {
      // Only documents that are actually indexed can join a collection
      const doc = await this.sqliteDb.get<{ url: string }>('SELECT url FROM documents WHERE url = ?', [url]);
      if (!doc) {
        outcome.notFound.push(url);
        continue;
      }
      try {
        await this.sqliteDb.run('INSERT INTO collection_documents (collection_name, url, added_at) VALUES (?, ?, ?)', [
          normalizedName,
          url,
          now,
        ]);
        outcome.added.push(url);
      } catch (insertError) {
        // A constraint violation means the document was already a member
        const message = (insertError as Error).message ?? '';
        if (message.includes('UNIQUE constraint failed') || message.includes('PRIMARY KEY constraint failed')) {
          outcome.alreadyInCollection.push(url);
        } else {
          throw insertError;
        }
      }
    }
    // Bump the collection's updated_at only when membership actually changed
    if (outcome.added.length > 0) {
      await this.sqliteDb.run('UPDATE collections SET updated_at = ? WHERE name = ?', [now, normalizedName]);
    }
    await this.sqliteDb.run('COMMIT');
    logger.info(`[DocumentStore] Added ${outcome.added.length} documents to collection "${normalizedName}"`);
    if (outcome.notFound.length > 0) {
      logger.debug(`[DocumentStore] Documents not found: ${outcome.notFound.join(', ')}`);
    }
    if (outcome.alreadyInCollection.length > 0) {
      logger.debug(`[DocumentStore] Documents already in collection: ${outcome.alreadyInCollection.join(', ')}`);
    }
    return outcome;
  } catch (error) {
    await this.sqliteDb.run('ROLLBACK');
    throw error;
  }
}
/**
 * Remove documents from a collection.
 * @param name - Name of the collection
 * @param urls - URLs of documents to remove
 * @returns Breakdown of which URLs were removed vs. not members
 * @throws Error if collection doesn't exist
 */
async removeFromCollection(name: string, urls: string[]): Promise<{ removed: string[]; notInCollection: string[] }> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  const normalizedName = name.trim();
  const now = new Date().toISOString();
  logger.debug(`[DocumentStore] Removing ${urls.length} documents from collection: ${normalizedName}`);
  const collection = await this.sqliteDb.get<{ name: string }>('SELECT name FROM collections WHERE name = ?', [normalizedName]);
  if (!collection) {
    throw new Error(`Collection "${normalizedName}" not found`);
  }
  const removed: string[] = [];
  const notInCollection: string[] = [];
  await this.sqliteDb.run('BEGIN TRANSACTION');
  try {
    for (const url of urls) {
      const deletion = await this.sqliteDb.run('DELETE FROM collection_documents WHERE collection_name = ? AND url = ?', [
        normalizedName,
        url,
      ]);
      // changes > 0 means the membership row actually existed
      const wasMember = (deletion.changes ?? 0) > 0;
      (wasMember ? removed : notInCollection).push(url);
    }
    // Bump the collection's updated_at only when membership actually changed
    if (removed.length > 0) {
      await this.sqliteDb.run('UPDATE collections SET updated_at = ? WHERE name = ?', [now, normalizedName]);
    }
    await this.sqliteDb.run('COMMIT');
    logger.info(`[DocumentStore] Removed ${removed.length} documents from collection "${normalizedName}"`);
    if (notInCollection.length > 0) {
      logger.debug(`[DocumentStore] Documents not in collection: ${notInCollection.join(', ')}`);
    }
    return { removed, notInCollection };
  } catch (error) {
    await this.sqliteDb.run('ROLLBACK');
    throw error;
  }
}
/**
 * Get URLs of all documents in a collection.
 * Used for search filtering.
 * @param name - Name of the collection
 * @returns Array of document URLs (empty when the collection has no members)
 */
async getCollectionUrls(name: string): Promise<string[]> {
  if (!this.sqliteDb) {
    throw new Error('Storage not initialized');
  }
  const collectionName = name.trim();
  logger.debug(`[DocumentStore] Getting URLs for collection: ${collectionName}`);
  const memberships = await this.sqliteDb.all<Array<{ url: string }>>(
    'SELECT url FROM collection_documents WHERE collection_name = ?',
    [collectionName]
  );
  return memberships.map(({ url }) => url);
}
private clearCacheForUrl(url: string): void {
// Clear all cache entries that might contain results for this URL
for (const key of this.searchCache.keys()) {
const results = this.searchCache.get(key);
if (results?.some((result) => result.url === url)) {
this.searchCache.delete(key);
}
}
}
/**
 * Validates that vectors are properly stored and retrievable from LanceDB.
 * @returns Promise<boolean> True if vectors are valid, false otherwise
 * @throws Error when storage has not been initialized
 */
async validateVectors(): Promise<boolean> {
  if (!this.lanceTable) {
    logger.debug('[DocumentStore] Cannot validate vectors: Storage not initialized');
    throw new Error('Storage not initialized');
  }
  try {
    const rowCount = await this.lanceTable.countRows();
    logger.debug(`[DocumentStore] Vector validation: Table contains ${rowCount} rows`);
    if (rowCount === 0) {
      logger.debug('[DocumentStore] Vector validation: No rows found in vector table');
      return false;
    }
    // Pull one row to inspect how the vector column actually comes back
    const sample = await this.lanceTable.query().limit(1).toArray();
    if (sample.length === 0) {
      logger.debug('[DocumentStore] Vector validation: No rows returned from query');
      return false;
    }
    const first = sample[0];
    const firstIsArray = Array.isArray(first.vector);
    logger.debug('[DocumentStore] Vector validation sample:', {
      hasVector: 'vector' in first,
      vectorType: typeof first.vector,
      isArray: firstIsArray,
      length: firstIsArray ? first.vector.length : 'N/A',
      sample: firstIsArray ? first.vector.slice(0, 5) : first.vector,
    });
    // Probe the index with a random query vector of the right dimension
    const testVector = Array.from({ length: this.embeddings.dimensions }, () => Math.random());
    logger.debug(`[DocumentStore] Testing vector search with random vector of length ${testVector.length}`);
    const searchResults = await this.lanceTable.search(testVector).limit(1).toArray();
    logger.debug(`[DocumentStore] Vector search test returned ${searchResults.length} results`);
    if (searchResults.length > 0) {
      const hit = searchResults[0];
      logger.debug('[DocumentStore] Vector search test result:', {
        score: hit.score,
        hasVector: 'vector' in hit,
        vectorLength: Array.isArray(hit.vector) ? hit.vector.length : 'N/A',
      });
    }
    // Even if scores are null, a non-empty search result means retrieval works
    return rowCount > 0 && sample.length > 0 && searchResults.length > 0;
  } catch (error) {
    logger.error('[DocumentStore] Error validating vectors:', error);
    return false;
  }
}
/**
 * Optimizes the LanceDB table by compacting data files and cleaning up old versions.
 * This helps reduce disk space usage by:
 * - Compacting small data fragments into larger files
 * - Removing deleted rows from storage
 * - Cleaning up old table versions
 *
 * Should be called periodically or after batch operations (add/delete).
 *
 * @returns Promise with optimization statistics
 */
async optimize(): Promise<{ compacted: boolean; cleanedUp: boolean; error?: string }> {
  if (!this.lanceTable) {
    logger.debug('[DocumentStore] Cannot optimize: Storage not initialized');
    return { compacted: false, cleanedUp: false, error: 'Storage not initialized' };
  }
  const outcome: { compacted: boolean; cleanedUp: boolean; error: string | undefined } = {
    compacted: false,
    cleanedUp: false,
    error: undefined,
  };
  try {
    logger.info('[DocumentStore] Starting optimization...');
    try {
      logger.debug('[DocumentStore] Running optimize with cleanup...');
      // cleanupOlderThan: new Date() prunes all old table versions immediately
      const stats = await this.lanceTable.optimize({
        cleanupOlderThan: new Date(),
        deleteUnverified: true,
      });
      outcome.compacted = true;
      outcome.cleanedUp = true;
      logger.debug('[DocumentStore] Optimization stats:', stats);
    } catch (optimizeError) {
      // A failed optimize pass is reported, not thrown — callers get a status object
      const message = (optimizeError as Error).message;
      logger.warn('[DocumentStore] Optimization failed:', message);
      outcome.error = `Optimization failed: ${message}`;
    }
    // Data layout may have changed, so cached search results could be stale
    this.searchCache.clear();
    logger.info('[DocumentStore] Optimization complete:', outcome);
    return outcome;
  } catch (error) {
    logger.error('[DocumentStore] Error during optimization:', error);
    return {
      compacted: false,
      cleanedUp: false,
      error: error instanceof Error ? error.message : String(error),
    };
  }
}
}