import { createHash } from 'node:crypto';
/**
 * Compute a fingerprint of `text` for exact-match deduplication.
 *
 * @param text - Content to fingerprint; string input is hashed as UTF-8
 *   (Node's default encoding for `Hash.update`).
 * @returns The SHA-256 digest as a hex string.
 */
export function contentHash(text: string): string {
  const hasher = createHash('sha256');
  hasher.update(text);
  return hasher.digest('hex');
}
/**
 * Jaccard similarity of two texts, measured over their sets of distinct tokens
 * as produced by `tokenize`: |A ∩ B| / |A ∪ B|.
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns A value in [0, 1]. 1 means the token sets are identical; two texts
 *   that both produce no tokens are treated as identical (1).
 */
export function jaccardSimilarity(a: string, b: string): number {
  const left = new Set(tokenize(a));
  const right = new Set(tokenize(b));

  const shared = [...left].filter((token) => right.has(token)).length;
  const unionSize = left.size + right.size - shared;

  // The union is empty only when neither text produced any token.
  if (unionSize === 0) return 1;
  return shared / unionSize;
}
/**
 * Split text into lowercase word tokens.
 *
 * A token is a maximal run of Unicode letters, combining marks, digits, or
 * underscores; every other character acts as a separator. For ASCII-only
 * input this is the same as splitting on `\W+`.
 *
 * @param text - Text to tokenize.
 * @returns The tokens in order of appearance (duplicates kept); empty when
 *   the text contains no word characters.
 */
function tokenize(text: string): string[] {
  // `\W` only recognises ASCII word characters, so accented or non-Latin
  // letters would be treated as separators ("café" -> "caf", CJK text -> no
  // tokens at all). Unicode property escapes (which require the `u` flag)
  // keep such words intact.
  return text
    .toLowerCase()
    .split(/[^\p{L}\p{M}\p{N}_]+/u)
    .filter(Boolean);
}
/**
 * Remove near-duplicate text chunks, keeping the first occurrence of each.
 *
 * Chunks are visited in order; a chunk is dropped when its Jaccard similarity
 * to any already-kept chunk is at or above `threshold`. The input array is
 * not modified.
 *
 * @param chunks - Text chunks to filter.
 * @param threshold - Similarity (as returned by `jaccardSimilarity`) at or
 *   above which a chunk counts as a duplicate. Defaults to 0.85.
 * @returns A new array with the surviving chunks in their original order.
 */
export function dedup(chunks: string[], threshold = 0.85): string[] {
  const kept: string[] = [];

  for (const candidate of chunks) {
    let duplicate = false;
    for (const survivor of kept) {
      if (jaccardSimilarity(survivor, candidate) >= threshold) {
        duplicate = true;
        break; // one sufficiently similar survivor is enough
      }
    }
    if (!duplicate) kept.push(candidate);
  }

  return kept;
}