"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.addChunks = addChunks;
exports.clearStore = clearStore;
exports.getStoreSize = getStoreSize;
exports.computeTfIdfVectors = computeTfIdfVectors;
exports.embedTextsOpenAI = embedTextsOpenAI;
exports.searchByEmbedding = searchByEmbedding;
exports.searchByTfIdf = searchByTfIdf;
exports.hybridSearch = hybridSearch;
const axios_1 = __importDefault(require("axios"));
const db_1 = require("./db");
// NOTE: the database is NOT initialized when this module loads;
// initDb() is called explicitly in server.ts instead.
// initDb();
function addChunks(chunks, documentId) {
    // Persist a batch of chunks for one document. Each chunk gets a
    // deterministic id of the form `<documentId>_<index>` so ids never
    // collide across documents, plus the snake_case fields the DB expects.
    const rows = chunks.map((chunk, index) => ({
        ...chunk,
        id: `${documentId}_${index}`,
        document_id: documentId,
        page_number: chunk.pageNumber, // DB column is snake_case
    }));
    db_1.dbOps.addChunksBatch(rows);
}
// Remove all persisted chunks by delegating to the DB layer's reset.
function clearStore() {
    db_1.dbOps.reset();
}
// Number of chunks currently stored in the database.
function getStoreSize() {
    return db_1.dbOps.getAllChunks().length;
}
// Normalize text into lowercase word tokens: punctuation is stripped
// (so "foo-bar" becomes the single token "foobar"), then the string is
// split on whitespace runs with empty tokens discarded.
function tokenize(text) {
    const cleaned = text.toLowerCase().replace(/[^\w\s]/g, "");
    const parts = cleaned.split(/\s+/);
    return parts.filter((token) => token.length > 0);
}
/**
 * Compute a sparse TF-IDF vector for each chunk and store it on `chunk.tfidf`
 * (an object mapping term -> weight). The vocabulary/IDF is local to the
 * given batch of chunks.
 *
 * Fix: the previous version scanned every document's token array for every
 * vocabulary term (`tokens.includes(term)`), and recounted term frequency
 * with `tokens.filter(...)` per token occurrence — accidental O(V*D*T) /
 * O(T^2) work. This version builds per-document term counts and document
 * frequencies in a single pass each, producing identical numeric results.
 *
 * @param {Array<{text: string}>} chunks - mutated in place (tfidf added).
 */
function computeTfIdfVectors(chunks) {
    const totalDocs = chunks.length;
    // 1. Tokenize each chunk once and count term occurrences per document.
    const docsTokens = chunks.map((c) => tokenize(c.text));
    const docsCounts = docsTokens.map((tokens) => {
        const counts = new Map();
        tokens.forEach((t) => counts.set(t, (counts.get(t) || 0) + 1));
        return counts;
    });
    // 2. Document frequency: number of documents containing each term.
    const df = new Map();
    docsCounts.forEach((counts) => {
        counts.forEach((_, term) => df.set(term, (df.get(term) || 0) + 1));
    });
    // 3. Smoothed IDF: log(1 + N / (1 + df)) — never negative, no div-by-zero.
    const idf = new Map();
    df.forEach((docCount, term) => {
        idf.set(term, Math.log(1 + totalDocs / (1 + docCount)));
    });
    // 4. Sparse TF-IDF vector per chunk: tf = count / docLength.
    chunks.forEach((c, i) => {
        const tokens = docsTokens[i];
        const counts = docsCounts[i];
        const vec = {};
        counts.forEach((count, term) => {
            const tf = count / tokens.length;
            vec[term] = tf * (idf.get(term) || 0);
        });
        c.tfidf = vec;
    });
}
// cosine similarity
function cosine(a, b) {
let dot = 0;
let na = 0;
let nb = 0;
for (let i = 0; i < a.length; i++) {
dot += (a[i] || 0) * (b[i] || 0);
na += (a[i] || 0) * (a[i] || 0);
nb += (b[i] || 0) * (b[i] || 0);
}
if (na === 0 || nb === 0)
return 0;
return dot / (Math.sqrt(na) * Math.sqrt(nb));
}
// Fetch embedding vectors for `texts` from the OpenAI embeddings endpoint
// (model text-embedding-ada-002). Resolves to one embedding array per input
// text; logs and rethrows any request failure.
async function embedTextsOpenAI(texts, apiKey) {
    const url = "https://api.openai.com/v1/embeddings";
    const payload = {
        input: texts,
        model: "text-embedding-ada-002"
    };
    const config = {
        headers: {
            "Authorization": `Bearer ${apiKey}`,
            "Content-Type": "application/json"
        }
    };
    try {
        const response = await axios_1.default.post(url, payload, config);
        return response.data.data.map((d) => d.embedding);
    }
    catch (error) {
        console.error("Error fetching embeddings:", error.message);
        throw error;
    }
}
// Rank stored chunks by cosine similarity to `queryEmbedding` and return the
// top k as { c, score }. Chunks with no embedding, or whose embedding length
// differs from the query's, are skipped; `filter.documentId` optionally
// restricts results to one document.
function searchByEmbedding(queryEmbedding, k = 5, filter) {
    const candidates = [];
    for (const chunk of db_1.dbOps.getAllChunks()) {
        if (filter?.documentId && chunk.document_id !== filter.documentId) {
            continue;
        }
        if (!chunk.embedding || chunk.embedding.length !== queryEmbedding.length) {
            continue;
        }
        candidates.push({ c: chunk, score: cosine(chunk.embedding, queryEmbedding) });
    }
    candidates.sort((a, b) => b.score - a.score);
    return candidates.slice(0, k);
}
// Keyword search over stored chunks: score = (number of chunk tokens that
// appear in the query) / sqrt(chunk length). Returns the top k { c, score }.
// NOTE(review): despite the name, this scores raw token overlap and does not
// read the precomputed `tfidf` vectors.
function searchByTfIdf(query, k = 5, filter) {
    const queryTerms = new Set(tokenize(query));
    const ranked = db_1.dbOps.getAllChunks()
        .filter((chunk) => !(filter?.documentId && chunk.document_id !== filter.documentId))
        .map((chunk) => {
            const tokens = tokenize(chunk.text);
            let matches = 0;
            for (const token of tokens) {
                if (queryTerms.has(token)) {
                    matches += 1;
                }
            }
            // sqrt-length normalization; `|| 1` guards the empty-chunk case.
            return { c: chunk, score: matches / (Math.sqrt(tokens.length) || 1) };
        })
        .sort((a, b) => b.score - a.score);
    return ranked.slice(0, k);
}
// Combine lexical (token-overlap) and semantic (embedding) retrieval with
// Reciprocal Rank Fusion. Each ranker contributes 2k candidates so the
// fusion has enough overlap; if no apiKey is given, or the embedding call
// fails, results come from the lexical ranker alone.
async function hybridSearch(query, k = 5, filter, apiKey) {
    const lexical = searchByTfIdf(query, k * 2, filter);
    let semantic = [];
    if (apiKey) {
        try {
            const [queryEmbedding] = await embedTextsOpenAI([query], apiKey);
            semantic = searchByEmbedding(queryEmbedding, k * 2, filter);
        }
        catch (e) {
            console.error("Hybrid search: Embedding failed, falling back to TF-IDF only", e);
        }
    }
    // RRF: each list adds 1 / (K + rank + 1) for a chunk; scores sum across lists.
    const RRF_K = 60;
    const fusedScores = new Map(); // chunk id -> accumulated RRF score
    const chunksById = new Map();  // chunk id -> chunk object
    const accumulate = (results) => {
        results.forEach((item, rank) => {
            const id = item.c.id;
            chunksById.set(id, item.c);
            fusedScores.set(id, (fusedScores.get(id) || 0) + 1 / (RRF_K + rank + 1));
        });
    };
    accumulate(lexical);
    accumulate(semantic);
    return [...fusedScores.entries()]
        .map(([id, score]) => ({ c: chunksById.get(id), score }))
        .sort((a, b) => b.score - a.score)
        .slice(0, k);
}