#!/usr/bin/env node
/**
* Duplicate File Detector for ChromaDB
*
* Finds duplicate files using multiple strategies:
* - Exact hash matching (MD5/SHA-1/SHA-256)
* - Partial hashing (file size + sampled chunks) for fast scans of large files
* - Basic perceptual hashing for images (samples compressed image data)
*
* Perfect for cleaning up photo libraries, downloads, etc.
*/
import { createHash } from 'crypto';
import { readFile, stat, open } from 'fs/promises';
import { extname, basename } from 'path';
import { scanDirectory, getFileCategory } from './batch-processor.js';
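/*
 * Quick start (illustrative sketch; the './downloads' path and the module
 * filename are assumptions -- adjust both to your project):
 *
 *   import dedupe from './duplicate-detector.js';
 *
 *   const result = await dedupe.findDuplicates('./downloads');
 *   console.log(dedupe.generateReport(result));
 */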
/**
* Calculate file hash
* @param {string} filePath - Path to file
* @param {string} algorithm - Hash algorithm (md5, sha1, sha256)
* @returns {Promise<string>} Hex hash string
*/
export async function calculateFileHash(filePath, algorithm = 'md5') {
try {
const buffer = await readFile(filePath);
return createHash(algorithm).update(buffer).digest('hex');
} catch (error) {
throw new Error(`Failed to hash file: ${error.message}`);
}
}
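/*
 * Example usage (hypothetical path). Note that readFile loads the entire
 * file into memory, so prefer calculatePartialHash below for huge files:
 *
 *   const md5 = await calculateFileHash('./photo.jpg');            // default md5
 *   const sha = await calculateFileHash('./photo.jpg', 'sha256');
 */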
/**
 * Calculate a partial hash (file size + first/middle/last chunks)
 * Reads only a few chunks instead of the whole file, so it is much faster
 * than a full hash for large files while still catching nearly all duplicates
 * @param {string} filePath - Path to file
 * @param {number} chunkSize - Size of each chunk to hash (default 64KB)
 * @returns {Promise<string>} Partial hash
 */
export async function calculatePartialHash(filePath, chunkSize = 65536) {
  let handle;
  try {
    handle = await open(filePath, 'r');
    const { size } = await handle.stat();
    const hash = createHash('md5');
    // Mix in the file size so same-prefix files of different lengths differ
    hash.update(Buffer.from(size.toString()));
    // Hash the first chunk
    if (size > 0) {
      const first = Buffer.alloc(Math.min(chunkSize, size));
      await handle.read(first, 0, first.length, 0);
      hash.update(first);
    }
    // Hash the last chunk if it does not overlap the first
    if (size > chunkSize * 2) {
      const last = Buffer.alloc(chunkSize);
      await handle.read(last, 0, chunkSize, size - chunkSize);
      hash.update(last);
    }
    // Hash a middle chunk for extra certainty
    if (size > chunkSize * 3) {
      const midStart = Math.floor(size / 2) - Math.floor(chunkSize / 2);
      const mid = Buffer.alloc(chunkSize);
      await handle.read(mid, 0, chunkSize, midStart);
      hash.update(mid);
    }
    return hash.digest('hex');
  } catch (error) {
    throw new Error(`Failed to calculate partial hash: ${error.message}`);
  } finally {
    await handle?.close();
  }
}
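/*
 * Example usage (hypothetical path). A partial hash touches at most three
 * chunks plus the size, so multi-GB files hash almost instantly. Treat a
 * matching partial hash as a candidate and confirm with calculateFileHash
 * before deleting anything:
 *
 *   const quick = await calculatePartialHash('./movie.mkv');          // 64KB chunks
 *   const finer = await calculatePartialHash('./movie.mkv', 262144);  // 256KB chunks
 */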
/**
 * Simple perceptual-style hash for images
 * Builds a cheap fingerprint by sampling the compressed image data
 * (JPEG scan data or PNG IDAT chunks). Byte-identical copies match;
 * re-encoded or resized images generally will not. A true perceptual
 * hash would decode the pixels and hash DCT coefficients.
 * @param {string} filePath - Path to image
 * @returns {Promise<string>} Perceptual hash
 */
export async function calculateImagePerceptualHash(filePath) {
try {
const buffer = await readFile(filePath);
const ext = extname(filePath).toLowerCase();
// For JPEG, fingerprint the entropy-coded scan data that follows the
// SOS (0xFFDA) marker. This is a basic implementation - a real
// perceptual hash would decode pixels and use DCT coefficients
if (ext === '.jpg' || ext === '.jpeg') {
// Find start of scan data
let scanStart = -1;
for (let i = 0; i < buffer.length - 1; i++) {
if (buffer[i] === 0xFF && buffer[i + 1] === 0xDA) {
scanStart = i + 2;
break;
}
}
if (scanStart > 0) {
// Sample scan data at regular intervals
const hash = createHash('md5');
const sampleSize = Math.min(1024, buffer.length - scanStart);
const step = Math.max(1, Math.floor((buffer.length - scanStart) / sampleSize));
for (let i = scanStart; i < buffer.length && i < scanStart + sampleSize * step; i += step) {
hash.update(Buffer.from([buffer[i]]));
}
return 'pjpg_' + hash.digest('hex').slice(0, 16);
}
}
// For PNG, sample IDAT chunks
if (ext === '.png') {
const hash = createHash('md5');
// Find IDAT chunks and sample them
for (let i = 8; i < buffer.length - 12; ) {
const length = buffer.readUInt32BE(i);
const type = buffer.slice(i + 4, i + 8).toString('ascii');
if (type === 'IDAT') {
// Sample this chunk
const chunkData = buffer.slice(i + 8, i + 8 + Math.min(length, 256));
hash.update(chunkData);
}
i += 12 + length; // Skip 4-byte length + 4-byte type + data + 4-byte CRC
}
return 'ppng_' + hash.digest('hex').slice(0, 16);
}
// Fallback to partial hash for other formats
return 'pgen_' + await calculatePartialHash(filePath);
} catch {
// Fall back to a partial content hash if image parsing fails
return 'perr_' + await calculatePartialHash(filePath);
}
}
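/*
 * Example usage (hypothetical paths). The prefix records which strategy
 * produced the fingerprint: 'pjpg_' (JPEG scan data), 'ppng_' (PNG IDAT),
 * 'pgen_' (non-image fallback), 'perr_' (parse error fallback):
 *
 *   const a = await calculateImagePerceptualHash('./img/cat.jpg');   // 'pjpg_...'
 *   const b = await calculateImagePerceptualHash('./img/cat.png');   // 'ppng_...'
 */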
/**
* Scan a directory and find all duplicate files
* @param {string} dirPath - Directory to scan
* @param {object} options - Scan options
* @returns {Promise<object>} Duplicate groups and statistics
*/
export async function findDuplicates(dirPath, options = {}) {
const {
recursive = true,
categories = null,
extensions = null,
hashMethod = 'partial', // 'full', 'partial', 'perceptual'
maxFiles = 5000,
minSize = 1, // Minimum file size in bytes
onProgress = null
} = options;
// Scan for files
const files = await scanDirectory(dirPath, {
recursive,
categories,
extensions,
maxFiles
});
// Group files by size first (quick pre-filter)
const sizeGroups = new Map();
let processed = 0;
for (const file of files) {
try {
const stats = await stat(file);
if (stats.size < minSize) continue;
const size = stats.size;
if (!sizeGroups.has(size)) {
sizeGroups.set(size, []);
}
sizeGroups.get(size).push(file);
processed++;
if (onProgress && processed % 100 === 0) {
onProgress({ phase: 'sizing', processed, total: files.length });
}
} catch {
// Skip files we can't stat
}
}
// Filter to only sizes with potential duplicates
const potentialDuplicates = [];
for (const fileList of sizeGroups.values()) {
  if (fileList.length > 1) {
    potentialDuplicates.push(...fileList);
  }
}
// Calculate hashes for potential duplicates
const hashGroups = new Map();
processed = 0;
for (const file of potentialDuplicates) {
try {
let hash;
switch (hashMethod) {
  case 'full':
    hash = await calculateFileHash(file);
    break;
  case 'perceptual': {
    // Braces scope the declaration to this case (avoids no-case-declarations)
    const category = getFileCategory(file);
    if (category.type === 'images') {
      hash = await calculateImagePerceptualHash(file);
    } else {
      hash = await calculatePartialHash(file);
    }
    break;
  }
  case 'partial':
  default:
    hash = await calculatePartialHash(file);
}
if (!hashGroups.has(hash)) {
hashGroups.set(hash, []);
}
hashGroups.get(hash).push(file);
processed++;
if (onProgress && processed % 50 === 0) {
onProgress({ phase: 'hashing', processed, total: potentialDuplicates.length });
}
} catch {
// Skip files we can't hash
}
}
// Build result groups
const duplicateGroups = [];
let totalDuplicates = 0;
let totalWastedSpace = 0;
for (const [hash, fileList] of hashGroups) {
if (fileList.length > 1) {
// Get file info for each duplicate
const groupFiles = await Promise.all(fileList.map(async (file) => {
try {
const stats = await stat(file);
return {
path: file,
filename: basename(file),
size: stats.size,
modified: stats.mtime.toISOString()
};
} catch {
return { path: file, filename: basename(file) };
}
}));
// Sort by modification date (oldest first)
groupFiles.sort((a, b) => {
if (!a.modified) return 1;
if (!b.modified) return -1;
return new Date(a.modified) - new Date(b.modified);
});
const wastedSpace = groupFiles.slice(1).reduce((sum, f) => sum + (f.size || 0), 0);
duplicateGroups.push({
hash,
count: fileList.length,
files: groupFiles,
original: groupFiles[0], // Oldest file assumed to be original
duplicates: groupFiles.slice(1),
wastedSpace,
wastedSpaceHuman: formatBytes(wastedSpace)
});
totalDuplicates += fileList.length - 1;
totalWastedSpace += wastedSpace;
}
}
// Sort groups by wasted space (largest first)
duplicateGroups.sort((a, b) => b.wastedSpace - a.wastedSpace);
return {
scanned: files.length,
potentialDuplicates: potentialDuplicates.length,
duplicateGroups: duplicateGroups.length,
totalDuplicates,
totalWastedSpace,
totalWastedSpaceHuman: formatBytes(totalWastedSpace),
hashMethod,
groups: duplicateGroups
};
}
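/*
 * Example usage (illustrative; the directory is hypothetical and the
 * 'images' category name assumes scanDirectory in batch-processor.js
 * accepts it):
 *
 *   const result = await findDuplicates('./photos', {
 *     categories: ['images'],
 *     hashMethod: 'perceptual',
 *     minSize: 1024, // ignore files under 1KB
 *     onProgress: ({ phase, processed, total }) =>
 *       console.log(`${phase}: ${processed}/${total}`)
 *   });
 *   console.log(`${result.totalDuplicates} duplicates, ` +
 *     `${result.totalWastedSpaceHuman} reclaimable`);
 */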
/**
* Find duplicates within a ChromaDB collection
* @param {ChromaClient} client - ChromaDB client
* @param {string} collectionName - Collection to check
* @returns {Promise<object>} Duplicate information
*/
export async function findCollectionDuplicates(client, collectionName) {
try {
const collection = await client.getCollection({ name: collectionName });
const data = await collection.get();
if (!data.ids || data.ids.length === 0) {
return { duplicates: [], count: 0 };
}
// Group by file path if available
const pathGroups = new Map();
for (let i = 0; i < data.ids.length; i++) {
const meta = data.metadatas[i] || {};
const path = meta.full_path || meta.filename || data.ids[i];
if (!pathGroups.has(path)) {
pathGroups.set(path, []);
}
pathGroups.get(path).push({
id: data.ids[i],
metadata: meta,
content: data.documents[i]?.slice(0, 100)
});
}
const duplicates = [];
for (const [path, entries] of pathGroups) {
if (entries.length > 1) {
duplicates.push({
path,
count: entries.length,
entries
});
}
}
return {
collection: collectionName,
totalDocuments: data.ids.length,
duplicateGroups: duplicates.length,
duplicates
};
} catch (error) {
throw new Error(`Failed to check collection duplicates: ${error.message}`);
}
}
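/*
 * Example usage (assumes a ChromaDB server at localhost:8000 and an
 * existing collection named 'documents' -- both are assumptions):
 *
 *   import { ChromaClient } from 'chromadb';
 *
 *   const client = new ChromaClient({ path: 'http://localhost:8000' });
 *   const dupes = await findCollectionDuplicates(client, 'documents');
 *   console.log(`${dupes.duplicateGroups} duplicate groups ` +
 *     `across ${dupes.totalDocuments} documents`);
 */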
/**
* Compare two files for similarity
* @param {string} file1 - First file path
* @param {string} file2 - Second file path
* @returns {Promise<object>} Comparison result
*/
export async function compareFiles(file1, file2) {
try {
const [stats1, stats2] = await Promise.all([
stat(file1),
stat(file2)
]);
const result = {
file1: { path: file1, size: stats1.size, modified: stats1.mtime },
file2: { path: file2, size: stats2.size, modified: stats2.mtime },
sameSize: stats1.size === stats2.size,
exactMatch: false,
partialMatch: false,
perceptualMatch: false
};
// For images, compare perceptual hashes even when sizes differ,
// since similar images rarely match byte-for-byte
const cat1 = getFileCategory(file1);
const cat2 = getFileCategory(file2);
if (cat1.type === 'images' && cat2.type === 'images') {
  const [phash1, phash2] = await Promise.all([
    calculateImagePerceptualHash(file1),
    calculateImagePerceptualHash(file2)
  ]);
  result.perceptualMatch = phash1 === phash2;
  result.perceptualHash1 = phash1;
  result.perceptualHash2 = phash2;
}
// Files of different sizes cannot be exact duplicates
if (!result.sameSize) {
  return result;
}
// Check partial hash
const [partial1, partial2] = await Promise.all([
  calculatePartialHash(file1),
  calculatePartialHash(file2)
]);
result.partialMatch = partial1 === partial2;
// If the partial hashes match, verify with a full hash
if (result.partialMatch) {
  const [full1, full2] = await Promise.all([
    calculateFileHash(file1),
    calculateFileHash(file2)
  ]);
  result.exactMatch = full1 === full2;
  result.hash1 = full1;
  result.hash2 = full2;
}
return result;
} catch (error) {
throw new Error(`Failed to compare files: ${error.message}`);
}
}
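/*
 * Example usage (hypothetical paths). The checks escalate from cheap to
 * expensive: size, then partial hash, then full hash, with perceptual
 * hashes for image pairs:
 *
 *   const cmp = await compareFiles('./a/img_001.jpg', './b/img_001 copy.jpg');
 *   if (cmp.exactMatch) console.log('Byte-for-byte identical');
 *   else if (cmp.perceptualMatch) console.log('Same image data, different bytes');
 */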
/**
* Format a byte count as a human-readable string (e.g. 1536 -> "1.5 KB")
*/
function formatBytes(bytes) {
if (bytes === 0) return '0 B';
const k = 1024;
const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
const i = Math.min(sizes.length - 1, Math.floor(Math.log(bytes) / Math.log(k)));
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
/**
* Generate deduplication report
* @param {object} duplicateResult - Result from findDuplicates
* @returns {string} Human-readable report
*/
export function generateReport(duplicateResult) {
const lines = [
'='.repeat(60),
'DUPLICATE FILE REPORT',
'='.repeat(60),
'',
`Files scanned: ${duplicateResult.scanned}`,
`Potential duplicates (same size): ${duplicateResult.potentialDuplicates}`,
`Duplicate groups found: ${duplicateResult.duplicateGroups}`,
`Total duplicate files: ${duplicateResult.totalDuplicates}`,
`Wasted space: ${duplicateResult.totalWastedSpaceHuman}`,
`Hash method: ${duplicateResult.hashMethod}`,
'',
];
if (duplicateResult.groups.length > 0) {
lines.push('-'.repeat(60));
lines.push('DUPLICATE GROUPS (sorted by wasted space)');
lines.push('-'.repeat(60));
lines.push('');
for (const group of duplicateResult.groups.slice(0, 20)) {
lines.push(`[${group.count} files, ${group.wastedSpaceHuman} wasted]`);
lines.push(` Original: ${group.original.path}`);
for (const dup of group.duplicates) {
lines.push(` Duplicate: ${dup.path}`);
}
lines.push('');
}
if (duplicateResult.groups.length > 20) {
lines.push(`... and ${duplicateResult.groups.length - 20} more groups`);
}
} else {
lines.push('No duplicates found!');
}
return lines.join('\n');
}
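/*
 * Example usage (hypothetical output path): render a report and save it:
 *
 *   import { writeFile } from 'fs/promises';
 *
 *   const result = await findDuplicates('./downloads');
 *   await writeFile('./duplicate-report.txt', generateReport(result));
 */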
export default {
calculateFileHash,
calculatePartialHash,
calculateImagePerceptualHash,
findDuplicates,
findCollectionDuplicates,
compareFiles,
generateReport
};