find_duplicates
Detect similar files, functions, or code spans with configurable similarity thresholds to identify duplication in your codebase.
Instructions
Detect similar files/functions/code spans with similarity thresholds.
Input Schema
TableJSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| findType | Yes | What to find: files (similar files), functions (similar functions), code (duplicate code spans) | |
| fileName | No | For files: name or path of file to find similar files for | |
| symbol | No | For functions: function name to find similar functions for | |
| filePath | No | For functions: file containing the reference function (optional) | |
| minLines | No | For code: minimum lines for duplicate (default: 8) | |
| minSimilarity | No | Minimum similarity threshold 0-1 (default: 0.6) | |
| maxResults | No | Maximum results to return (default: 25) | |
| includeContent | No | Include content analysis (default: false) | |
| extensions | No | File extensions to scan (default: ts,tsx,js,jsx,py) |
Implementation Reference
- src/tools/code-analysis.ts:787-984 (handler)The implementation of 'duplicateCodeFinder', which uses token winnowing to identify and group duplicate code spans across the workspace.
duplicateCodeFinder(options?: { rootDir?: string; minLines?: number; kTokens?: number; window?: number; maxReports?: number; extensions?: string[]; maxFiles?: number; timeoutMs?: number; }): DuplicateCodeGroup[] { const minLines = options?.minLines ?? 8; const kTokens = Math.max(2, options?.kTokens ?? 25); const window = Math.max(1, options?.window ?? 4); const maxReports = options?.maxReports ?? 100; const extensions = options?.extensions ?? ['ts', 'tsx', 'js', 'jsx', 'py']; const rootDir = options?.rootDir ?? this.config.getDefaultWorkspaceRoot(); const root = this.config.resolveWorkspacePath(rootDir); if (!this.config.isPathAllowed(root)) { throw new Error(`Access denied: Path '${rootDir}' is not in the allowlist`); } const maxFiles = Math.max(1, options?.maxFiles ?? 5000); const deadlineMs = typeof options?.timeoutMs === 'number' ? Date.now() + Math.max(0, options.timeoutMs) : null; const timedOut = () => deadlineMs !== null && Date.now() > deadlineMs; const extSet = new Set(extensions.map((e) => (e.startsWith('.') ? e : `.${e}`))); // Collect files const allFiles = walkDirectory(root, maxFiles); const files = allFiles.filter((f) => extSet.has(extname(f.path).toLowerCase())); // Tokenize files const fileTokens: string[][] = []; const fileTokLines: number[][] = []; const fileKgrams: string[][] = []; const fileFps: Array<Array<[bigint, number]>> = []; const filePaths: string[] = []; for (const file of files) { if (timedOut()) break; const content = readTextFile(file.path, 2 * 1024 * 1024); if (!content) { fileTokens.push([]); fileTokLines.push([]); fileKgrams.push([]); fileFps.push([]); filePaths.push(file.path); continue; } const ext = extname(file.path).toLowerCase(); const language = ext === '.py' ? 'python' : 'typescript'; const { tokens, lines } = tokenize(content, language); const kgs = kgrams(tokens, kTokens); const fps = winnowHashes(kgs, window); fileTokens.push(tokens); fileTokLines.push(lines); fileKgrams.push(kgs); fileFps.push(fps); filePaths.push(file.path); } // If we couldn't finish tokenizing, don't attempt partial matching (too low signal). if (timedOut()) { return []; } // Build fingerprint index const index = new Map<string, Array<[number, number]>>(); for (let fi = 0; fi < fileFps.length; fi++) { if (timedOut()) break; const seen = new Set<string>(); for (const [hash, pos] of fileFps[fi]) { const key = `${hash}:${pos}`; if (seen.has(key)) continue; seen.add(key); const hashKey = hash.toString(); if (!index.has(hashKey)) { index.set(hashKey, []); } index.get(hashKey)!.push([fi, pos]); } } // Find matches const groups = new Map<string, DuplicateCodeGroup>(); const seenSpans = new Set<string>(); for (const [, occurrences] of index) { if (timedOut()) break; if (occurrences.length < 2) continue; for (let i = 0; i < occurrences.length; i++) { if (timedOut()) break; for (let j = i + 1; j < occurrences.length; j++) { if (timedOut()) break; const [fi, posI] = occurrences[i]; const [fj, posJ] = occurrences[j]; const toksI = fileTokens[fi]; const toksJ = fileTokens[fj]; if (!toksI.length || !toksJ.length) continue; // Extend match let startI = posI; let startJ = posJ; let endI = Math.min(posI + kTokens - 1, toksI.length - 1); let endJ = Math.min(posJ + kTokens - 1, toksJ.length - 1); // Extend backward while (startI > 0 && startJ > 0 && toksI[startI - 1] === toksJ[startJ - 1]) { startI--; startJ--; } // Extend forward while ( endI + 1 < toksI.length && endJ + 1 < toksJ.length && toksI[endI + 1] === toksJ[endJ + 1] ) { endI++; endJ++; } // Get line ranges const linesI = fileTokLines[fi]; const linesJ = fileTokLines[fj]; const liStart = linesI[startI] ?? 1; const liEnd = linesI[endI] ?? liStart; const ljStart = linesJ[startJ] ?? 1; const ljEnd = linesJ[endJ] ?? ljStart; // Check min lines if (liEnd - liStart + 1 < minLines) continue; if (ljEnd - ljStart + 1 < minLines) continue; // Dedupe spans const spanKey = [ Math.min(fi, fj), Math.min(liStart, ljStart), Math.max(fi, fj), Math.max(liEnd, ljEnd), ].join(':'); if (seenSpans.has(spanKey)) continue; seenSpans.add(spanKey); // Group by content hash const normSeq = toksI .slice(startI, endI + 1) .join(' ') .slice(0, 4000); const gkey = blakeHash(normSeq).toString(); const relI = relative(root, filePaths[fi]); const relJ = relative(root, filePaths[fj]); const occList = [ { path: relI, start_line: liStart, end_line: liEnd }, { path: relJ, start_line: ljStart, end_line: ljEnd }, ]; const score = endI - startI + 1 + (endJ - startJ + 1); if (!groups.has(gkey)) { groups.set(gkey, { score, occurrences: occList, reasons: [`matched k-grams with winnowing hash`], }); } else { const group = groups.get(gkey)!; const existing = new Set( group.occurrences.map((o) => `${o.path}:${o.start_line}:${o.end_line}`) ); for (const occ of occList) { const key = `${occ.path}:${occ.start_line}:${occ.end_line}`; if (!existing.has(key)) { group.occurrences.push(occ); existing.add(key); } } group.score = Math.max(group.score, score); } } } } const results = Array.from(groups.values()).sort((a, b) => b.score - a.score); return results.slice(0, maxReports); }