import { Pool, PoolConfig } from 'pg';
import { MetadataChunk, Reference } from '../chunking/types';
import { EmbeddingModel } from './embedding';
import logger from '../utils/logger';
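/**
 * A stored metadata chunk augmented with retrieval scores: `similarity` from
 * vector search, `rank` from keyword or hybrid scoring.
 */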
export interface SearchResult extends MetadataChunk {
similarity?: number;
rank?: number;
}
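/**
 * Stores metadata chunks in Postgres with pgvector embeddings and exposes
 * vector, full-text keyword, symbol, and hybrid search over them.
 *
 * @example
 * const store = new VectorStore(poolConfig, embeddingModel);
 * await store.storeBatch(chunks);
 * const results = await store.hybridSearch('user authentication flow', 5);
 */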
export class VectorStore {
private pool: Pool;
private embeddingModel: EmbeddingModel;
constructor(config: PoolConfig, embeddingModel: EmbeddingModel) {
this.pool = new Pool(config);
this.embeddingModel = embeddingModel;
}
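  /**
   * Embeds a single chunk and upserts it into `metadata_chunks`, keyed by chunk id.
   */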
async storeChunk(chunk: MetadataChunk): Promise<void> {
try {
const embedding = await this.embeddingModel.embed(chunk.content);
await this.pool.query(
`INSERT INTO metadata_chunks
(id, org_id, type, name, content, symbols, refs, path, raw, embedding)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
        ON CONFLICT (id) DO UPDATE SET
          content = EXCLUDED.content, symbols = EXCLUDED.symbols, refs = EXCLUDED.refs,
          embedding = EXCLUDED.embedding, updated_at = CURRENT_TIMESTAMP`,
[
chunk.id,
chunk.orgId,
chunk.type,
chunk.name,
chunk.content,
chunk.symbols,
JSON.stringify(chunk.references),
chunk.path,
JSON.stringify(chunk.raw),
`[${embedding.join(',')}]`
]
);
logger.debug(`Stored chunk ${chunk.id} with ${embedding.length}-dim embedding`);
} catch (error) {
logger.error(`Failed to store chunk ${chunk.id}`, { error });
throw error;
}
}
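  /**
   * Embeds and upserts a batch of chunks inside one transaction; if any insert
   * fails, the whole batch is rolled back.
   */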
  async storeBatch(chunks: MetadataChunk[]): Promise<void> {
    if (chunks.length === 0) return;
const embeddings = await this.embeddingModel.embedBatch(
chunks.map(c => c.content)
);
const client = await this.pool.connect();
try {
await client.query('BEGIN');
for (let i = 0; i < chunks.length; i++) {
const chunk = chunks[i];
const embedding = embeddings[i];
await client.query(
`INSERT INTO metadata_chunks
(id, org_id, type, name, content, symbols, refs, path, raw, embedding)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
          ON CONFLICT (id) DO UPDATE SET
            content = EXCLUDED.content, symbols = EXCLUDED.symbols, refs = EXCLUDED.refs,
            embedding = EXCLUDED.embedding, updated_at = CURRENT_TIMESTAMP`,
[
chunk.id,
chunk.orgId,
chunk.type,
chunk.name,
chunk.content,
chunk.symbols,
JSON.stringify(chunk.references),
chunk.path,
JSON.stringify(chunk.raw),
`[${embedding.join(',')}]`
]
);
}
await client.query('COMMIT');
logger.info(`Stored ${chunks.length} chunks in batch`);
} catch (error) {
await client.query('ROLLBACK');
logger.error('Batch store failed', { error });
throw error;
} finally {
client.release();
}
}
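  /**
   * Semantic search: embeds the query and orders rows by cosine distance
   * (`<=>`), reporting `1 - distance` as the similarity score.
   */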
async vectorSearch(query: string, limit: number = 10): Promise<SearchResult[]> {
try {
const embedding = await this.embeddingModel.embed(query);
const result = await this.pool.query(
`SELECT id, org_id, type, name, content, symbols, refs, path, raw,
1 - (embedding <=> $1) as similarity
FROM metadata_chunks
ORDER BY embedding <=> $1
LIMIT $2`,
[`[${embedding.join(',')}]`, limit]
);
return result.rows.map(row => this.mapRowToChunk(row));
} catch (error) {
logger.error('Vector search failed', { error, query });
throw error;
}
}
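  /**
   * Full-text search over chunk content using Postgres tsvector/tsquery,
   * ordered by `ts_rank`.
   */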
async keywordSearch(query: string, limit: number = 10): Promise<SearchResult[]> {
try {
const result = await this.pool.query(
`SELECT id, org_id, type, name, content, symbols, refs, path, raw,
ts_rank(to_tsvector('english', content), plainto_tsquery('english', $1)) as rank
FROM metadata_chunks
WHERE to_tsvector('english', content) @@ plainto_tsquery('english', $1)
ORDER BY rank DESC
LIMIT $2`,
[query, limit]
);
return result.rows.map(row => this.mapRowToChunk(row));
} catch (error) {
logger.error('Keyword search failed', { error, query });
throw error;
}
}
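  /**
   * Exact-match lookup of a symbol in the `symbols` array column.
   */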
async symbolSearch(symbol: string, limit: number = 10): Promise<SearchResult[]> {
try {
const result = await this.pool.query(
`SELECT id, org_id, type, name, content, symbols, refs, path, raw
FROM metadata_chunks
WHERE $1 = ANY(symbols)
ORDER BY name
LIMIT $2`,
[symbol, limit]
);
return result.rows.map(row => this.mapRowToChunk(row));
} catch (error) {
logger.error('Symbol search failed', { error, symbol });
throw error;
}
}
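  /**
   * Runs vector and keyword search in parallel and merges the results with a
   * 70/30 weighting (vector/keyword). Cosine similarity and `ts_rank` are on
   * different scales, so the combined rank is a heuristic ordering rather than
   * a calibrated score.
   */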
async hybridSearch(query: string, limit: number = 10): Promise<SearchResult[]> {
try {
// Get results from both vector and keyword search
const [vectorResults, keywordResults] = await Promise.all([
this.vectorSearch(query, limit * 2),
this.keywordSearch(query, limit * 2)
]);
// Combine and rerank results
const combinedResults = new Map<string, SearchResult>();
// Add vector results with similarity scores
      vectorResults.forEach(result => {
combinedResults.set(result.id, {
...result,
similarity: result.similarity || 0,
rank: (result.similarity || 0) * 0.7 // Weight vector search 70%
});
});
// Merge keyword results
      keywordResults.forEach(result => {
const existing = combinedResults.get(result.id);
if (existing) {
// Combine scores
existing.rank = (existing.rank || 0) + (result.rank || 0) * 0.3;
} else {
combinedResults.set(result.id, {
...result,
rank: (result.rank || 0) * 0.3 // Weight keyword search 30%
});
}
});
// Sort by combined rank and return top results
return Array.from(combinedResults.values())
.sort((a, b) => (b.rank || 0) - (a.rank || 0))
.slice(0, limit);
} catch (error) {
logger.error('Hybrid search failed', { error, query });
throw error;
}
}
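  /**
   * Converts a database row into a SearchResult, tolerating `refs`/`raw`
   * stored either as JSON text or as already-parsed json/jsonb values.
   */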
private mapRowToChunk(row: any): SearchResult {
let references: Reference[] = [];
let raw: any = {};
    // refs/raw were written with JSON.stringify, but json/jsonb columns come back
    // from pg already parsed, so only JSON.parse actual strings.
    try {
      references = typeof row.refs === 'string' ? JSON.parse(row.refs) : (row.refs || []);
    } catch (e) {
      logger.warn(`Failed to parse references for chunk ${row.id}`, { refs: row.refs });
    }
    try {
      raw = typeof row.raw === 'string' ? JSON.parse(row.raw) : (row.raw || {});
    } catch (e) {
      logger.warn(`Failed to parse raw data for chunk ${row.id}`, { raw: row.raw });
    }
return {
id: row.id,
orgId: row.org_id,
type: row.type,
name: row.name,
content: row.content,
symbols: row.symbols || [],
references,
path: row.path || '',
raw,
metadata: {
size: row.content?.length || 0,
lineCount: row.content?.split('\n').length || 0
},
similarity: row.similarity,
rank: row.rank
};
}
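  /**
   * Removes a single chunk by id.
   */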
async deleteChunk(chunkId: string): Promise<void> {
try {
await this.pool.query('DELETE FROM metadata_chunks WHERE id = $1', [chunkId]);
logger.debug(`Deleted chunk ${chunkId}`);
} catch (error) {
logger.error(`Failed to delete chunk ${chunkId}`, { error });
throw error;
}
}
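  /**
   * Closes the underlying connection pool; call once on shutdown.
   */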
async close(): Promise<void> {
await this.pool.end();
}
}