import { VectorStore, SecurityDocument, SearchResult } from "../types.js";
import * as fs from "fs/promises";
import * as path from "path";
import natural from "natural";
// Module-level NLP helpers from `natural`: the TF-IDF scorer class and a
// single shared word tokenizer (stateless, so one instance is enough).
const { TfIdf } = natural;
const tokenizer = new natural.WordTokenizer();
/**
 * A lightweight, file-backed document store that ranks security documents
 * with TF-IDF scoring (via `natural`) rather than true vector embeddings.
 *
 * Documents are persisted as pretty-printed JSON under
 * `~/.security-mcp/documents.json`; the in-memory TF-IDF index is rebuilt
 * whenever the document set changes. Logging goes to stderr
 * (`console.error`) so stdout stays clean for protocol traffic.
 */
export class SimpleVectorStore implements VectorStore {
  private documents: SecurityDocument[] = [];
  // In-memory TF-IDF index; rebuilt by buildIndex(). Null until the first
  // build so misuse fails a guard instead of an untyped `any` access.
  private tfidf: InstanceType<typeof TfIdf> | null = null;
  // Absolute path of the on-disk JSON cache.
  private readonly indexPath: string;

  constructor() {
    const homeDir = process.env.HOME || process.env.USERPROFILE || "/tmp";
    this.indexPath = path.join(homeDir, ".security-mcp", "documents.json");
  }

  /**
   * Loads any previously cached documents from disk and builds the TF-IDF
   * index. A missing or unreadable cache is non-fatal (starts fresh).
   *
   * @throws Re-throws unexpected filesystem errors, e.g. when the cache
   *   directory cannot be created.
   */
  async initialize(): Promise<void> {
    try {
      // Ensure the cache directory exists before any read/write.
      await fs.mkdir(path.dirname(this.indexPath), { recursive: true });
      try {
        const data = await fs.readFile(this.indexPath, "utf-8");
        const parsed = JSON.parse(data);
        // Dates round-trip through JSON as ISO strings; revive them here.
        this.documents = parsed.map((doc: any) => ({
          ...doc,
          lastUpdated: new Date(doc.lastUpdated),
        }));
        console.error(
          `Loaded ${this.documents.length} documents from cache`
        );
      } catch (error) {
        // Deliberate best-effort: a missing or corrupt cache just means we
        // start with an empty store.
        console.error(
          "No existing index found or error reading it, starting fresh"
        );
        this.documents = [];
      }
      this.buildIndex();
    } catch (error) {
      console.error("Error initializing vector store:", error);
      throw error;
    }
  }

  /**
   * Rebuilds the TF-IDF index from scratch. Title and category are
   * concatenated with the content so they contribute to term matching.
   */
  private buildIndex() {
    this.tfidf = new TfIdf();
    for (const doc of this.documents) {
      this.tfidf.addDocument(`${doc.title} ${doc.category} ${doc.content}`);
    }
  }

  /**
   * Upserts documents (matched by `id`), rebuilds the TF-IDF index, and
   * persists the full document set to disk.
   */
  async addDocuments(documents: SecurityDocument[]): Promise<void> {
    // Index existing positions once so each upsert is O(1) instead of a
    // linear findIndex scan per incoming document.
    const positionById = new Map(this.documents.map((d, i) => [d.id, i]));
    for (const doc of documents) {
      const existingIndex = positionById.get(doc.id);
      if (existingIndex !== undefined) {
        // Update existing document in place.
        this.documents[existingIndex] = doc;
      } else {
        // Record the new position so duplicate ids within this batch
        // update rather than append (matches the original scan behavior).
        positionById.set(doc.id, this.documents.length);
        this.documents.push(doc);
      }
    }
    this.buildIndex();
    await this.save();
  }

  /** Persists the current document set as pretty-printed JSON. */
  private async save(): Promise<void> {
    // Guard against save() running before initialize() created the cache
    // directory (e.g. addDocuments() called first) — would be ENOENT.
    await fs.mkdir(path.dirname(this.indexPath), { recursive: true });
    await fs.writeFile(
      this.indexPath,
      JSON.stringify(this.documents, null, 2),
      "utf-8"
    );
    console.error(`Saved ${this.documents.length} documents to cache`);
  }

  /**
   * Ranks all documents against `query` via TF-IDF and returns up to
   * `limit` results, scores normalized so the best match is 1.0. Each
   * result carries a query-relevant excerpt of its document.
   */
  async search(query: string, limit: number = 5): Promise<SearchResult[]> {
    if (this.documents.length === 0 || !this.tfidf) {
      return [];
    }
    const results: SearchResult[] = [];
    this.tfidf.tfidfs(query, (i: number, measure: number) => {
      // Skip documents with no term overlap at all.
      if (measure > 0) {
        const doc = this.documents[i];
        results.push({
          document: doc,
          score: measure,
          relevantChunk: this.extractRelevantChunk(doc.content, query),
        });
      }
    });
    // Sort by raw score descending, then normalize against the top score
    // (maxScore > 0 is guaranteed by the measure > 0 filter above).
    results.sort((a, b) => b.score - a.score);
    if (results.length > 0) {
      const maxScore = results[0].score;
      for (const result of results) {
        result.score = result.score / maxScore;
      }
    }
    return results.slice(0, limit);
  }

  /**
   * Extracts a short excerpt of `content` relevant to `query`: the three
   * sentences with the highest query-term overlap, rejoined in original
   * reading order and truncated to `chunkSize` characters. Falls back to
   * the head of the document when no sentence matches the query.
   */
  private extractRelevantChunk(
    content: string,
    query: string,
    chunkSize: number = 500
  ): string {
    // Guard against tokenize() returning null on degenerate input, and
    // drop very short tokens ("a", "of", ...) that match everywhere.
    const queryTokens = (tokenizer.tokenize(query.toLowerCase()) ?? []).filter(
      (t) => t.length > 2
    );
    const sentences = content.split(/[.!?]\s+/);
    // Score each sentence by how many query terms it contains.
    const scoredSentences = sentences.map((sentence, idx) => {
      const lowerSentence = sentence.toLowerCase();
      let score = 0;
      for (const token of queryTokens) {
        if (lowerSentence.includes(token)) {
          score += 1;
        }
      }
      return { sentence, score, idx };
    });
    // Keep only sentences that actually matched; without this filter the
    // top-3 pick would return arbitrary (irrelevant) leading sentences
    // and the fallback below could never trigger.
    const topSentences = scoredSentences
      .filter((s) => s.score > 0)
      .sort((a, b) => b.score - a.score)
      .slice(0, 3)
      .sort((a, b) => a.idx - b.idx); // restore original reading order
    let chunk = topSentences.map((s) => s.sentence).join(". ");
    if (chunk.length > chunkSize) {
      chunk = chunk.substring(0, chunkSize) + "...";
    }
    if (chunk) {
      return chunk;
    }
    // No overlap with the query: return the head of the document, adding
    // an ellipsis only when it was actually truncated.
    return content.length > chunkSize
      ? content.substring(0, chunkSize) + "..."
      : content;
  }
}