Skip to main content
Glama
DuplicateFinder.ts • 3.4 kB
/**
 * DuplicateFinder - Find near-duplicate content sections
 */

import { WritingStorage } from "../storage/WritingStorage.js";
import { paginateResults } from "../utils/pagination.js";

/** Maximum number of paragraph characters copied into `DuplicateMatch.content`. */
const PREVIEW_LENGTH = 100;

/** Separator used to break paragraph text into words. */
const WORD_SEPARATOR = /\s+/;

/** A pair of paragraphs, taken from two different files, judged to be near-duplicates. */
export interface DuplicateMatch {
  /** `file_path` of the first file of the pair. */
  file1: string;
  /** `file_path` of the second file of the pair. */
  file2: string;
  /**
   * Preview of the matching paragraph as it appears in `file1`: the first
   * 100 characters, followed by "..." only when the paragraph was truncated.
   */
  content: string;
  /** Word-set overlap of the two paragraphs (intersection size / union size). */
  similarity: number;
  /** 1-based line on which the paragraph starts in `file1`. */
  location1: { line: number };
  /** 1-based line on which the paragraph starts in `file2`. */
  location2: { line: number };
}

/** A blank-line-delimited paragraph and the 1-based line it starts on. */
interface Paragraph {
  text: string;
  line: number;
}

/** A paragraph together with its pre-computed lower-cased word set. */
interface IndexedParagraph extends Paragraph {
  words: ReadonlySet<string>;
}

/** A file reduced to the paragraphs that are eligible for comparison. */
interface IndexedFile {
  filePath: string;
  paragraphs: IndexedParagraph[];
}

export class DuplicateFinder {
  constructor(private storage: WritingStorage) {}

  /**
   * Compares every pair of distinct files returned by `storage.getAllFiles()`
   * and reports paragraph pairs whose word-set similarity reaches the threshold.
   * Paragraphs are never compared against paragraphs of the same file.
   *
   * @param options.scope Accepted for API compatibility; not used by this method.
   * @param options.similarityThreshold Minimum similarity for a match (default 0.8).
   * @param options.minLength Paragraphs shorter than this many characters are ignored (default 50).
   * @param options.limit Forwarded unchanged to `paginateResults`.
   * @returns Matches sorted by similarity, highest first, after `paginateResults` is applied.
   */
  async findDuplicates(
    options: {
      scope?: string;
      similarityThreshold?: number;
      minLength?: number;
      limit?: number;
    } = {}
  ): Promise<DuplicateMatch[]> {
    const { similarityThreshold = 0.8, minLength = 50, limit } = options;

    const files = await this.storage.getAllFiles();

    // Split, filter and tokenize each file once up front instead of repeating
    // that work for every file pair and every paragraph pair.
    const indexed: IndexedFile[] = files.map(
      (file: { file_path: string; content: string }) =>
        this.indexFile(file, minLength)
    );

    const matches: DuplicateMatch[] = [];
    for (let i = 0; i < indexed.length; i++) {
      for (let j = i + 1; j < indexed.length; j++) {
        // Matches are appended one by one; spreading a large array into
        // push(...) can exceed the engine's argument-count limit.
        this.collectMatches(indexed[i], indexed[j], similarityThreshold, matches);
      }
    }

    // Sort by similarity (highest first) before pagination
    matches.sort((a, b) => b.similarity - a.similarity);

    return paginateResults(matches, limit);
  }

  /**
   * Splits a file into paragraphs, drops those shorter than `minLength`
   * characters and attaches each remaining paragraph's word set.
   */
  private indexFile(
    file: { file_path: string; content: string },
    minLength: number
  ): IndexedFile {
    const paragraphs = this.splitParagraphs(file.content)
      .filter((paragraph) => !(paragraph.text.length < minLength))
      .map((paragraph) => ({
        ...paragraph,
        words: new Set(paragraph.text.toLowerCase().split(WORD_SEPARATOR)),
      }));

    return { filePath: file.file_path, paragraphs };
  }

  /**
   * Appends to `out` one match for every (paragraph of `first`, paragraph of
   * `second`) pair whose similarity is at least `threshold`, in nested
   * iteration order (paragraphs of `first` outermost).
   */
  private collectMatches(
    first: IndexedFile,
    second: IndexedFile,
    threshold: number,
    out: DuplicateMatch[]
  ): void {
    for (const para1 of first.paragraphs) {
      for (const para2 of second.paragraphs) {
        const similarity = this.wordSetSimilarity(para1.words, para2.words);
        if (similarity >= threshold) {
          out.push({
            file1: first.filePath,
            file2: second.filePath,
            content: this.preview(para1.text),
            similarity,
            location1: { line: para1.line },
            location2: { line: para2.line },
          });
        }
      }
    }
  }

  /** Returns `text` unchanged when short enough, otherwise its first 100 characters plus "...". */
  private preview(text: string): string {
    return text.length > PREVIEW_LENGTH
      ? text.substring(0, PREVIEW_LENGTH) + "..."
      : text;
  }

  /**
   * Groups consecutive non-blank lines into paragraphs. Each line is trimmed
   * and the lines of a paragraph are joined with single spaces. `line` is the
   * 1-based number of the line that follows the most recent blank line
   * (or 1 when no blank line precedes the paragraph).
   */
  private splitParagraphs(content: string): Paragraph[] {
    const paragraphs: Paragraph[] = [];
    let current = "";
    let startLine = 1;

    const flush = (): void => {
      if (current) {
        paragraphs.push({ text: current.trim(), line: startLine });
        current = "";
      }
    };

    content.split("\n").forEach((rawLine, index) => {
      const line = rawLine.trim();
      if (line.length === 0) {
        flush();
        // `index` is 0-based, so the line after this blank one is index + 2.
        startLine = index + 2;
      } else {
        current += " " + line;
      }
    });

    // Emit the trailing paragraph when the content does not end with a blank line.
    flush();

    return paragraphs;
  }

  /**
   * Jaccard similarity of two word sets: |A ∩ B| / |A ∪ B|.
   * Returns 0 when both sets are empty.
   */
  private wordSetSimilarity(
    a: ReadonlySet<string>,
    b: ReadonlySet<string>
  ): number {
    // Probe the larger set with the members of the smaller one.
    const [smaller, larger] = a.size <= b.size ? [a, b] : [b, a];

    let shared = 0;
    for (const word of smaller) {
      if (larger.has(word)) {
        shared++;
      }
    }

    const unionSize = a.size + b.size - shared;
    return unionSize === 0 ? 0 : shared / unionSize;
  }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/xiaolai/claude-writers-aid-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.