Skip to main content
Glama
rodhayl
by rodhayl

find_duplicates

Detect similar files, functions, or code spans with configurable similarity thresholds to identify duplication in your codebase.

Instructions

Detect similar files/functions/code spans with similarity thresholds.

Input Schema

Table / JSON Schema

| Name | Required | Description | Default |
| --- | --- | --- | --- |
| findType | Yes | What to find: `files` (similar files), `functions` (similar functions), `code` (duplicate code spans) | — |
| fileName | No | For `files`: name or path of the file to find similar files for | — |
| symbol | No | For `functions`: function name to find similar functions for | — |
| filePath | No | For `functions`: file containing the reference function (optional) | — |
| minLines | No | For `code`: minimum number of lines for a duplicate | 8 |
| minSimilarity | No | Minimum similarity threshold (0–1) | 0.6 |
| maxResults | No | Maximum number of results to return | 25 |
| includeContent | No | Include content analysis | false |
| extensions | No | File extensions to scan | ts,tsx,js,jsx,py |

Implementation Reference

  • The implementation of 'duplicateCodeFinder', which uses token winnowing to identify and group duplicate code spans across the workspace.
    /**
     * Detect duplicate code spans across the workspace using k-gram token
     * fingerprinting with winnowing.
     *
     * Pipeline: collect files under `rootDir` filtered by extension → tokenize
     * each file → build k-grams and winnowed fingerprints → index fingerprints
     * by hash → for every pair of files sharing a fingerprint, extend the match
     * token-by-token in both directions → report matches of at least `minLines`
     * lines, grouped by a hash of the matched token sequence.
     *
     * @param options.rootDir    Workspace-relative root to scan (default: configured workspace root).
     * @param options.minLines   Minimum line length of a reported duplicate (default: 8).
     * @param options.kTokens    k-gram size in tokens, clamped to >= 2 (default: 25).
     * @param options.window     Winnowing window size, clamped to >= 1 (default: 4).
     * @param options.maxReports Maximum number of duplicate groups returned (default: 100).
     * @param options.extensions File extensions to scan, with or without leading dot (default: ts,tsx,js,jsx,py).
     * @param options.maxFiles   Cap on files walked, clamped to >= 1 (default: 5000).
     * @param options.timeoutMs  Soft deadline; scanning stops (and returns partial or empty
     *                           results) once exceeded. Omit for no deadline.
     * @returns Duplicate groups sorted by descending score (total matched token count).
     * @throws Error if the resolved root is not in the configured path allowlist.
     */
    duplicateCodeFinder(options?: {
      rootDir?: string;
      minLines?: number;
      kTokens?: number;
      window?: number;
      maxReports?: number;
      extensions?: string[];
      maxFiles?: number;
      timeoutMs?: number;
    }): DuplicateCodeGroup[] {
      const minLines = options?.minLines ?? 8;
      const kTokens = Math.max(2, options?.kTokens ?? 25);
      const window = Math.max(1, options?.window ?? 4);
      const maxReports = options?.maxReports ?? 100;
      const extensions = options?.extensions ?? ['ts', 'tsx', 'js', 'jsx', 'py'];
      const rootDir = options?.rootDir ?? this.config.getDefaultWorkspaceRoot();
      const root = this.config.resolveWorkspacePath(rootDir);

      if (!this.config.isPathAllowed(root)) {
        throw new Error(`Access denied: Path '${rootDir}' is not in the allowlist`);
      }

      const maxFiles = Math.max(1, options?.maxFiles ?? 5000);
      const deadlineMs =
        typeof options?.timeoutMs === 'number' ? Date.now() + Math.max(0, options.timeoutMs) : null;
      const timedOut = () => deadlineMs !== null && Date.now() > deadlineMs;

      // Normalize extensions so both 'ts' and '.ts' are accepted.
      const extSet = new Set(extensions.map((e) => (e.startsWith('.') ? e : `.${e}`)));

      // Collect candidate files.
      const allFiles = walkDirectory(root, maxFiles);
      const files = allFiles.filter((f) => extSet.has(extname(f.path).toLowerCase()));

      // Per-file parallel arrays; indices into these are the "file ids" used below.
      const fileTokens: string[][] = [];
      const fileTokLines: number[][] = [];
      const fileKgrams: string[][] = [];
      const fileFps: Array<Array<[bigint, number]>> = [];
      const filePaths: string[] = [];

      for (const file of files) {
        if (timedOut()) break;
        const content = readTextFile(file.path, 2 * 1024 * 1024);
        if (!content) {
          // Keep the parallel arrays aligned even for unreadable/oversized files.
          fileTokens.push([]);
          fileTokLines.push([]);
          fileKgrams.push([]);
          fileFps.push([]);
          filePaths.push(file.path);
          continue;
        }

        const ext = extname(file.path).toLowerCase();
        // Only two tokenizer modes exist; everything non-Python goes through
        // the TypeScript tokenizer (it also covers plain JS).
        const language = ext === '.py' ? 'python' : 'typescript';
        const { tokens, lines } = tokenize(content, language);

        const kgs = kgrams(tokens, kTokens);
        const fps = winnowHashes(kgs, window);

        fileTokens.push(tokens);
        fileTokLines.push(lines);
        fileKgrams.push(kgs);
        fileFps.push(fps);
        filePaths.push(file.path);
      }

      // If we couldn't finish tokenizing, don't attempt partial matching (too low signal).
      if (timedOut()) {
        return [];
      }

      // Build fingerprint index: hash -> list of [fileIndex, tokenPosition].
      const index = new Map<string, Array<[number, number]>>();
      for (let fi = 0; fi < fileFps.length; fi++) {
        if (timedOut()) break;
        const seen = new Set<string>();
        for (const [hash, pos] of fileFps[fi]) {
          // Winnowing can select the same (hash, pos) more than once; skip repeats.
          const key = `${hash}:${pos}`;
          if (seen.has(key)) continue;
          seen.add(key);

          const hashKey = hash.toString();
          if (!index.has(hashKey)) {
            index.set(hashKey, []);
          }
          index.get(hashKey)!.push([fi, pos]);
        }
      }

      // Find matches: every pair of occurrences of a shared fingerprint seeds a match.
      const groups = new Map<string, DuplicateCodeGroup>();
      const seenSpans = new Set<string>();

      for (const [, occurrences] of index) {
        if (timedOut()) break;
        if (occurrences.length < 2) continue;

        for (let i = 0; i < occurrences.length; i++) {
          if (timedOut()) break;
          for (let j = i + 1; j < occurrences.length; j++) {
            if (timedOut()) break;
            const [fi, posI] = occurrences[i];
            const [fj, posJ] = occurrences[j];

            const toksI = fileTokens[fi];
            const toksJ = fileTokens[fj];
            if (!toksI.length || !toksJ.length) continue;

            // Seed the match at the shared k-gram, then greedily extend.
            let startI = posI;
            let startJ = posJ;
            let endI = Math.min(posI + kTokens - 1, toksI.length - 1);
            let endJ = Math.min(posJ + kTokens - 1, toksJ.length - 1);

            // Extend backward while the preceding tokens agree.
            while (startI > 0 && startJ > 0 && toksI[startI - 1] === toksJ[startJ - 1]) {
              startI--;
              startJ--;
            }

            // Extend forward while the following tokens agree.
            while (
              endI + 1 < toksI.length &&
              endJ + 1 < toksJ.length &&
              toksI[endI + 1] === toksJ[endJ + 1]
            ) {
              endI++;
              endJ++;
            }

            // Map token positions back to source line ranges.
            const linesI = fileTokLines[fi];
            const linesJ = fileTokLines[fj];

            const liStart = linesI[startI] ?? 1;
            const liEnd = linesI[endI] ?? liStart;
            const ljStart = linesJ[startJ] ?? 1;
            const ljEnd = linesJ[endJ] ?? ljStart;

            // Both sides must meet the minimum-lines threshold.
            if (liEnd - liStart + 1 < minLines) continue;
            if (ljEnd - ljStart + 1 < minLines) continue;

            // Dedupe identical pair-spans. BUG FIX: the previous key took
            // min/max of the two files' line numbers independently of which
            // file index sorted first, so distinct span pairs between the same
            // two files could collide on one key and a real duplicate was
            // silently dropped. Keep each span's lines attached to its own
            // file index and order the two sides canonically instead.
            const spanA = `${fi}:${liStart}:${liEnd}`;
            const spanB = `${fj}:${ljStart}:${ljEnd}`;
            const spanKey = spanA <= spanB ? `${spanA}|${spanB}` : `${spanB}|${spanA}`;

            if (seenSpans.has(spanKey)) continue;
            seenSpans.add(spanKey);

            // Group by a hash of the matched token sequence (capped to bound
            // hashing cost on very long matches).
            const normSeq = toksI
              .slice(startI, endI + 1)
              .join(' ')
              .slice(0, 4000);
            const gkey = blakeHash(normSeq).toString();

            const relI = relative(root, filePaths[fi]);
            const relJ = relative(root, filePaths[fj]);

            const occList = [
              { path: relI, start_line: liStart, end_line: liEnd },
              { path: relJ, start_line: ljStart, end_line: ljEnd },
            ];

            // Score = total matched tokens across both sides.
            const score = endI - startI + 1 + (endJ - startJ + 1);

            if (!groups.has(gkey)) {
              groups.set(gkey, {
                score,
                occurrences: occList,
                reasons: [`matched k-grams with winnowing hash`],
              });
            } else {
              // Merge new occurrences into the existing group, deduped by span.
              const group = groups.get(gkey)!;
              const existing = new Set(
                group.occurrences.map((o) => `${o.path}:${o.start_line}:${o.end_line}`)
              );
              for (const occ of occList) {
                const key = `${occ.path}:${occ.start_line}:${occ.end_line}`;
                if (!existing.has(key)) {
                  group.occurrences.push(occ);
                  existing.add(key);
                }
              }
              group.score = Math.max(group.score, score);
            }
          }
        }
      }

      // Best (largest) duplicates first, capped at maxReports.
      const results = Array.from(groups.values()).sort((a, b) => b.score - a.score);
      return results.slice(0, maxReports);
    }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/rodhayl/mcpLocalHelper'

If you have feedback or need assistance with the MCP directory API, please join our Discord server