// SemanticIndex.js
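/**
 * SemanticIndex: a lightweight in-memory TF-IDF index with cosine-similarity
 * search. Documents are tokenized, stop-worded, and crudely stemmed; vectors
 * are L2-normalized at build time.
 *
 * A minimal usage sketch (the fact IDs and strings are illustrative):
 *
 *   const index = new SemanticIndex();
 *   index.addDocument('fact-1', 'Cats are running through the garden');
 *   index.addDocument('fact-2', 'The database migration failed overnight');
 *   const hits = index.search('garden cats', { limit: 5 });
 *   // => [{ factId: 'fact-1', similarity: ... }]
 */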
export class SemanticIndex {
  constructor() {
    this.documents = new Map(); // factId -> { content, metadata, tokens }
    this.termFrequency = new Map(); // factId -> term -> frequency
    this.inverseDocumentFrequency = new Map(); // term -> idf score
    this.documentVectors = new Map(); // factId -> normalized TF-IDF vector
    this.vocabulary = new Set(); // all unique terms
    this.stopWords = new Set([
      'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
      'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
      'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
      'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
      'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her',
      'us', 'them',
    ]);
  }
  // Tokenize and clean text
  tokenize(text) {
    return text
      .toLowerCase()
      .replace(/[^\w\s]/g, ' ') // Replace punctuation with spaces
      .split(/\s+/)
      .filter(term => term.length > 2 && !this.stopWords.has(term))
      .map(term => this.stemTerm(term)); // Light suffix stemming
  }
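  // For example (the values follow directly from the rules above):
  //   tokenize('The cats are running!') -> ['cat', 'runn']
  // 'the'/'are' are stop words, short tokens are dropped, and the survivors
  // are stemmed by stemTerm below.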
  // Simple stemming (strip one common English suffix)
  stemTerm(term) {
    // Longer suffixes first, so 'ied'/'ies' win over 'ed'/'s'
    const suffixes = ['ing', 'ied', 'ies', 'ed', 'ly', 's'];
    for (const suffix of suffixes) {
      if (term.endsWith(suffix) && term.length > suffix.length + 2) {
        return term.slice(0, -suffix.length);
      }
    }
    return term;
  }
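  // For example: stemTerm('studied') -> 'stud', stemTerm('quickly') -> 'quick',
  // stemTerm('cats') -> 'cat'. The length guard keeps short words intact,
  // e.g. stemTerm('red') stays 'red'. This is a rough heuristic, not a real
  // stemmer like Porter's.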
  // Add a document to the index
  addDocument(factId, content, metadata = {}) {
    const tokens = this.tokenize(content);
    // Store document
    this.documents.set(factId, { content, metadata, tokens });
    // Calculate term frequencies for this document
    const tf = new Map();
    tokens.forEach(token => {
      tf.set(token, (tf.get(token) || 0) + 1);
      this.vocabulary.add(token);
    });
    this.termFrequency.set(factId, tf);
    // Rebuild IDF scores and vectors whenever the corpus changes
    this.buildInverseDocumentFrequency();
    this.buildDocumentVectors();
  }
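  // Note: every add triggers a full IDF/vector rebuild, which costs on the
  // order of documents x vocabulary. That is fine for small corpora; for
  // bulk loads, a batch variant that defers the rebuilds until the end
  // would avoid the quadratic total cost.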
  // Remove a document from the index
  removeDocument(factId) {
    const doc = this.documents.get(factId);
    if (!doc) return;
    // Remove from all data structures
    this.documents.delete(factId);
    this.termFrequency.delete(factId);
    this.documentVectors.delete(factId);
    // Rebuild vocabulary and IDF
    this.rebuildVocabulary();
    this.buildInverseDocumentFrequency();
    this.buildDocumentVectors();
  }
  // Rebuild vocabulary from all documents
  rebuildVocabulary() {
    this.vocabulary.clear();
    for (const tf of this.termFrequency.values()) {
      for (const term of tf.keys()) {
        this.vocabulary.add(term);
      }
    }
  }
  // Calculate inverse document frequency for all terms
  buildInverseDocumentFrequency() {
    this.inverseDocumentFrequency.clear();
    const totalDocs = this.documents.size;
    if (totalDocs === 0) return;
    for (const term of this.vocabulary) {
      // Count documents containing this term
      let docsWithTerm = 0;
      for (const tf of this.termFrequency.values()) {
        if (tf.has(term)) {
          docsWithTerm++;
        }
      }
      // Smoothed IDF: log((N + 1) / (df + 1)). The add-one smoothing keeps
      // the score finite and non-negative even for terms that appear in
      // every document (plain log(N / (df + 1)) can go negative).
      const idf = Math.log((totalDocs + 1) / (docsWithTerm + 1));
      this.inverseDocumentFrequency.set(term, idf);
    }
  }
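  // Worked example under this formula: with 4 documents, a term in exactly
  // one of them gets idf = ln(5/2) ~ 0.92, while a term in all four gets
  // idf = ln(5/5) = 0 and so contributes nothing to similarity.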
  // Build TF-IDF vectors for all documents
  buildDocumentVectors() {
    this.documentVectors.clear();
    for (const [factId, tf] of this.termFrequency) {
      const vector = new Map();
      let magnitude = 0;
      // Calculate TF-IDF for each term in the document
      for (const [term, frequency] of tf) {
        const idf = this.inverseDocumentFrequency.get(term) || 0;
        const tfidf = frequency * idf;
        vector.set(term, tfidf);
        magnitude += tfidf * tfidf;
      }
      // Normalize the vector to unit length
      magnitude = Math.sqrt(magnitude);
      if (magnitude > 0) {
        for (const [term, tfidf] of vector) {
          vector.set(term, tfidf / magnitude);
        }
      }
      this.documentVectors.set(factId, vector);
    }
  }
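  // Because every stored vector is L2-normalized here, the cosine of two
  // stored vectors is just their dot product; calculateSimilarity below
  // still recomputes magnitudes defensively, so it also works on vectors
  // that were never normalized.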
  // Calculate cosine similarity between query and document vectors
  calculateSimilarity(queryVector, docVector) {
    let dotProduct = 0;
    let queryMagnitude = 0;
    let docMagnitude = 0;
    // Calculate dot product and magnitudes
    for (const [term, queryTfidf] of queryVector) {
      const docTfidf = docVector.get(term) || 0;
      dotProduct += queryTfidf * docTfidf;
      queryMagnitude += queryTfidf * queryTfidf;
    }
    for (const docTfidf of docVector.values()) {
      docMagnitude += docTfidf * docTfidf;
    }
    queryMagnitude = Math.sqrt(queryMagnitude);
    docMagnitude = Math.sqrt(docMagnitude);
    if (queryMagnitude === 0 || docMagnitude === 0) {
      return 0;
    }
    return dotProduct / (queryMagnitude * docMagnitude);
  }
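  // Tiny numeric check: for unit vectors q = {dog: 0.707, run: 0.707} and
  // d = {dog: 0.6, cat: 0.8}, the only shared term is 'dog', so the
  // similarity is 0.707 * 0.6 ~ 0.42.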
  // Convert query text to a normalized TF-IDF vector
  queryToVector(queryText) {
    const tokens = this.tokenize(queryText);
    const tf = new Map();
    // Calculate term frequencies for the query
    tokens.forEach(token => {
      tf.set(token, (tf.get(token) || 0) + 1);
    });
    // Convert to TF-IDF vector
    const vector = new Map();
    let magnitude = 0;
    for (const [term, frequency] of tf) {
      const idf = this.inverseDocumentFrequency.get(term) || 0;
      const tfidf = frequency * idf;
      vector.set(term, tfidf);
      magnitude += tfidf * tfidf;
    }
    // Normalize
    magnitude = Math.sqrt(magnitude);
    if (magnitude > 0) {
      for (const [term, tfidf] of vector) {
        vector.set(term, tfidf / magnitude);
      }
    }
    return vector;
  }
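  // Query terms missing from the vocabulary get idf 0, so they carry weight
  // 0 and simply cannot match; a query made up entirely of unseen terms
  // produces the zero vector, which scores 0 against every document and
  // falls below any positive threshold.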
  // Search for similar documents
  search(queryText, options = {}) {
    const {
      limit = 10,
      threshold = 0.1,
      includeScores = true,
      includeMetadata = false,
    } = options;
    if (this.documents.size === 0) {
      return [];
    }
    const queryVector = this.queryToVector(queryText);
    const results = [];
    // Calculate similarity with all documents
    for (const [factId, docVector] of this.documentVectors) {
      const similarity = this.calculateSimilarity(queryVector, docVector);
      if (similarity >= threshold) {
        const result = { factId, similarity };
        if (includeMetadata) {
          const doc = this.documents.get(factId);
          result.metadata = doc.metadata;
          result.content = doc.content;
        }
        results.push(result);
      }
    }
    // Sort by similarity (descending) and limit results
    results.sort((a, b) => b.similarity - a.similarity);
    const top = results.slice(0, limit);
    // Honor includeScores: the score is needed for sorting, so strip it
    // from the returned objects only after the fact when the caller opts out
    if (!includeScores) {
      for (const result of top) {
        delete result.similarity;
      }
    }
    return top;
  }
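  // Illustrative call (the option values are just examples):
  //   index.search('database migration', { limit: 5, threshold: 0.2,
  //                                        includeMetadata: true })
  //   // => [{ factId, similarity, metadata, content }, ...]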
  // Get the terms that are most important for a document
  getDocumentKeywords(factId, topN = 10) {
    const vector = this.documentVectors.get(factId);
    if (!vector) return [];
    return Array.from(vector.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, topN)
      .map(([term, score]) => ({ term, score }));
  }
  // Find documents similar to a given document
  findSimilarDocuments(factId, options = {}) {
    const docVector = this.documentVectors.get(factId);
    if (!docVector) return [];
    const { limit = 5, threshold = 0.3 } = options;
    const results = [];
    for (const [otherFactId, otherVector] of this.documentVectors) {
      if (otherFactId === factId) continue;
      const similarity = this.calculateSimilarity(docVector, otherVector);
      if (similarity >= threshold) {
        results.push({ factId: otherFactId, similarity });
      }
    }
    results.sort((a, b) => b.similarity - a.similarity);
    return results.slice(0, limit);
  }
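  // For instance, index.findSimilarDocuments('fact-1', { limit: 3 }) returns
  // up to three other documents whose cosine similarity to 'fact-1' is at
  // least the threshold (0.3 by default).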
  // Get statistics about the index
  getStats() {
    return {
      totalDocuments: this.documents.size,
      vocabularySize: this.vocabulary.size,
      averageDocumentLength: this.getAverageDocumentLength(),
      topTerms: this.getTopTerms(10),
    };
  }
  // Average token count per indexed document
  getAverageDocumentLength() {
    if (this.documents.size === 0) return 0;
    let totalLength = 0;
    for (const doc of this.documents.values()) {
      totalLength += doc.tokens.length;
    }
    return totalLength / this.documents.size;
  }
  // Most frequent terms across the whole corpus
  getTopTerms(topN = 10) {
    const termCounts = new Map();
    for (const tf of this.termFrequency.values()) {
      for (const [term, count] of tf) {
        termCounts.set(term, (termCounts.get(term) || 0) + count);
      }
    }
    return Array.from(termCounts.entries())
      .sort((a, b) => b[1] - a[1])
      .slice(0, topN)
      .map(([term, count]) => ({ term, count }));
  }
  // Update the index when a document changes
  updateDocument(factId, newContent, newMetadata = {}) {
    this.removeDocument(factId);
    this.addDocument(factId, newContent, newMetadata);
  }
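  // Note: update is remove + add, so it pays for two full IDF/vector
  // rebuilds; acceptable here, but a combined rebuild would halve the work.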
  // Clear the entire index
  clear() {
    this.documents.clear();
    this.termFrequency.clear();
    this.inverseDocumentFrequency.clear();
    this.documentVectors.clear();
    this.vocabulary.clear();
  }
  // Export index data for persistence (documentVectors are omitted; they
  // are deterministically rebuilt on import)
  export() {
    return {
      documents: Array.from(this.documents.entries()),
      termFrequency: Array.from(this.termFrequency.entries()).map(([factId, tf]) => [
        factId,
        Array.from(tf.entries()),
      ]),
      inverseDocumentFrequency: Array.from(this.inverseDocumentFrequency.entries()),
      vocabulary: Array.from(this.vocabulary),
    };
  }
  // Import index data from persistence
  import(data) {
    this.clear();
    // Restore documents
    for (const [factId, doc] of data.documents) {
      this.documents.set(factId, doc);
    }
    // Restore term frequencies
    for (const [factId, tfArray] of data.termFrequency) {
      this.termFrequency.set(factId, new Map(tfArray));
    }
    // Restore IDF
    this.inverseDocumentFrequency = new Map(data.inverseDocumentFrequency);
    // Restore vocabulary
    this.vocabulary = new Set(data.vocabulary);
    // Rebuild document vectors from the restored state
    this.buildDocumentVectors();
  }
}
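// A minimal persistence round trip, assuming document content and metadata
// are JSON-safe; the 'saved' variable is illustrative:
//
//   const saved = JSON.stringify(index.export());
//   const restored = new SemanticIndex();
//   restored.import(JSON.parse(saved));
//   restored.search('garden cats'); // same results as the original index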