"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.store = void 0;
exports.addChunks = addChunks;
exports.clearStore = clearStore;
exports.getStoreSize = getStoreSize;
exports.computeTfIdfVectors = computeTfIdfVectors;
exports.embedTextsOpenAI = embedTextsOpenAI;
exports.searchByEmbedding = searchByEmbedding;
exports.searchByTfIdf = searchByTfIdf;
const axios_1 = __importDefault(require("axios"));
exports.store = [];
/**
 * Appends the given chunk records to the shared in-memory store.
 * @param {Array<Object>} chunks - Chunk records to add.
 */
function addChunks(chunks) {
    for (const chunk of chunks) {
        exports.store.push(chunk);
    }
}
/**
 * Empties the shared store in place, preserving the array's identity so any
 * external references to `store` remain valid.
 */
function clearStore() {
    exports.store.splice(0, exports.store.length);
}
/**
 * @returns {number} Number of chunks currently held in the store.
 */
function getStoreSize() {
    const { length } = exports.store;
    return length;
}
/**
 * Splits text into lowercase word tokens, stripping punctuation.
 * @param {string} text - Raw input text.
 * @returns {string[]} Non-empty lowercase tokens.
 */
function tokenize(text) {
    return text
        .toLowerCase()
        .replace(/[^\w\s]/g, "")
        .split(/\s+/)
        .filter((x) => x.length > 0);
}
/**
 * Computes an L2-normalized TF-IDF vector for every chunk in `store`.
 *
 * IDF uses log(N / (1 + df)); note this yields a negative weight for terms
 * appearing in all documents (the original formula, kept for compatibility).
 * Side effect: each chunk gets its vector attached as `c.tfidf`.
 *
 * @param {Array<{text: string}>} store - Chunks to vectorize (mutated: gains `tfidf`).
 * @returns {{vocabulary: string[], vectors: number[][]}} Sorted vocabulary and
 *   one normalized vector per chunk, aligned with `store` order.
 */
function computeTfIdfVectors(store) {
    const docs = store.map((c) => tokenize(c.text));
    const vocabulary = Array.from(new Set(docs.flat())).sort();
    // Document frequencies in one pass over the corpus. The previous
    // implementation re-scanned every document for every vocabulary word
    // (O(|vocab| * |docs| * docLen)); this is O(total tokens).
    const df = new Map();
    for (const tokens of docs) {
        for (const t of new Set(tokens)) {
            df.set(t, (df.get(t) || 0) + 1);
        }
    }
    const idf = vocabulary.map((w) => Math.log(docs.length / (1 + (df.get(w) || 0))));
    // Per-document term frequencies -> weighted vector -> L2 normalization.
    // The `|| 1` guard keeps all-zero vectors at zero instead of NaN.
    const vectors = docs.map((tokens) => {
        const tf = {};
        tokens.forEach((t) => (tf[t] = (tf[t] || 0) + 1));
        const vec = vocabulary.map((w, i) => (tf[w] || 0) * idf[i]);
        const norm = Math.sqrt(vec.reduce((s, v) => s + v * v, 0)) || 1;
        return vec.map((v) => v / norm);
    });
    // Cache the vectors on the chunks so searches can reuse them.
    store.forEach((c, i) => (c.tfidf = vectors[i]));
    return { vocabulary, vectors };
}
// cosine similarity
/**
 * Cosine similarity between two numeric vectors.
 * Iterates over `a`'s length; missing entries in either vector count as 0.
 * @param {number[]} a - First vector (drives the iteration length).
 * @param {number[]} b - Second vector.
 * @returns {number} Similarity in [-1, 1]; 0 if either vector has zero magnitude.
 */
function cosine(a, b) {
    let dot = 0;
    let magA = 0;
    let magB = 0;
    for (let i = 0; i < a.length; i += 1) {
        const x = a[i] || 0;
        const y = b[i] || 0;
        dot += x * y;
        magA += x * x;
        magB += y * y;
    }
    if (magA === 0 || magB === 0) {
        return 0;
    }
    return dot / (Math.sqrt(magA) * Math.sqrt(magB));
}
/**
 * Fetches OpenAI embeddings for a batch of texts.
 * @param {string[]} texts - Inputs to embed (sent as one batched request).
 * @param {string} apiKey - OpenAI API key, sent as a Bearer token.
 * @returns {Promise<number[][]>} One embedding vector per input text.
 * @throws Re-throws any request error after logging its message.
 */
async function embedTextsOpenAI(texts, apiKey) {
    const url = "https://api.openai.com/v1/embeddings";
    const payload = {
        input: texts,
        model: "text-embedding-ada-002"
    };
    const requestConfig = {
        headers: {
            "Authorization": `Bearer ${apiKey}`,
            "Content-Type": "application/json"
        }
    };
    try {
        const response = await axios_1.default.post(url, payload, requestConfig);
        return response.data.data.map((d) => d.embedding);
    }
    catch (error) {
        console.error("Error fetching embeddings:", error.message);
        throw error;
    }
}
/**
 * Ranks stored chunks by cosine similarity to a query embedding.
 * Chunks with no embedding, or with a dimension mismatch, are skipped.
 * @param {number[]} queryEmbedding - Query vector.
 * @param {number} [k=5] - Maximum number of results to return.
 * @returns {Array<{c: Object, score: number}>} Top-k matches, best first.
 */
function searchByEmbedding(queryEmbedding, k = 5) {
    const candidates = exports.store.filter((c) => c.embedding && c.embedding.length === queryEmbedding.length);
    const ranked = candidates.map((c) => ({ c, score: cosine(c.embedding, queryEmbedding) }));
    ranked.sort((a, b) => b.score - a.score);
    return ranked.slice(0, k);
}
/**
 * Scores stored chunks against a free-text query by token overlap.
 *
 * Note: despite the name, this does not use the precomputed `tfidf` vectors
 * (the vocabulary is not persisted module-wide), so it falls back to counting
 * chunk tokens that appear in the query, normalized by sqrt(chunk length).
 *
 * @param {string} query - Free-text search query.
 * @param {number} [k=5] - Maximum number of results to return.
 * @returns {Array<{c: Object, score: number}>} Top-k matches, best first.
 */
function searchByTfIdf(query, k = 5) {
    const queryTokens = new Set(tokenize(query));
    const ranked = exports.store.map((c) => {
        const chunkTokens = tokenize(c.text);
        let matches = 0;
        for (const t of chunkTokens) {
            if (queryTokens.has(t)) {
                matches += 1;
            }
        }
        // sqrt-length normalization keeps long chunks from winning on size alone.
        const score = matches / (Math.sqrt(chunkTokens.length) || 1);
        return { c, score };
    });
    ranked.sort((a, b) => b.score - a.score);
    return ranked.slice(0, k);
}