#!/usr/bin/env node
/**
* Duplicate File Detector for ChromaDB
*
* Finds duplicate files using multiple strategies:
* - Exact hash matching (MD5/SHA-1/SHA-256)
* - Partial hashing (file size + sampled chunks) for fast scans of large files
* - Basic perceptual hashing for images (samples compressed image data)
*
* Perfect for cleaning up photo libraries, downloads, etc.
*/
import { createHash } from 'crypto';
import { readFile, stat, open } from 'fs/promises';
import { extname, basename } from 'path';
import { scanDirectory, getFileCategory } from './batch-processor.js';
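/*
 * Quick start (illustrative sketch; the './downloads' path and the module
 * filename are assumptions -- adjust both to your project):
 *
 *   import dedupe from './duplicate-detector.js';
 *
 *   const result = await dedupe.findDuplicates('./downloads');
 *   console.log(dedupe.generateReport(result));
 */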
/**
* Calculate file hash
* @param {string} filePath - Path to file
* @param {string} algorithm - Hash algorithm (md5, sha1, sha256)
* @returns {Promise<string>} Hex hash string
*/
export async function calculateFileHash(filePath, algorithm = 'md5') {
try {
const buffer = await readFile(filePath);
return createHash(algorithm).update(buffer).digest('hex');
} catch (error) {
throw new Error(`Failed to hash file: ${error.message}`);
}
}
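/*
 * Example usage (hypothetical path). Note that readFile loads the entire
 * file into memory, so prefer calculatePartialHash below for huge files:
 *
 *   const md5 = await calculateFileHash('./photo.jpg');            // default md5
 *   const sha = await calculateFileHash('./photo.jpg', 'sha256');
 */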
/**
 * Calculate a partial hash (file size + first/middle/last chunks)
 * Reads only a few chunks instead of the whole file, so it is much faster
 * than a full hash for large files while still catching nearly all duplicates
 * @param {string} filePath - Path to file
 * @param {number} chunkSize - Size of each chunk to hash (default 64KB)
 * @returns {Promise<string>} Partial hash
 */
export async function calculatePartialHash(filePath, chunkSize = 65536) {
  let handle;
  try {
    handle = await open(filePath, 'r');
    const { size } = await handle.stat();
    const hash = createHash('md5');
    // Mix in the file size so same-prefix files of different lengths differ
    hash.update(Buffer.from(size.toString()));
    // Hash the first chunk
    if (size > 0) {
      const first = Buffer.alloc(Math.min(chunkSize, size));
      await handle.read(first, 0, first.length, 0);
      hash.update(first);
    }
    // Hash the last chunk if it does not overlap the first
    if (size > chunkSize * 2) {
      const last = Buffer.alloc(chunkSize);
      await handle.read(last, 0, chunkSize, size - chunkSize);
      hash.update(last);
    }
    // Hash a middle chunk for extra certainty
    if (size > chunkSize * 3) {
      const midStart = Math.floor(size / 2) - Math.floor(chunkSize / 2);
      const mid = Buffer.alloc(chunkSize);
      await handle.read(mid, 0, chunkSize, midStart);
      hash.update(mid);
    }
    return hash.digest('hex');
  } catch (error) {
    throw new Error(`Failed to calculate partial hash: ${error.message}`);
  } finally {
    await handle?.close();
  }
}
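/*
 * Example usage (hypothetical path). A partial hash touches at most three
 * chunks plus the size, so multi-GB files hash almost instantly. Treat a
 * matching partial hash as a candidate and confirm with calculateFileHash
 * before deleting anything:
 *
 *   const quick = await calculatePartialHash('./movie.mkv');          // 64KB chunks
 *   const finer = await calculatePartialHash('./movie.mkv', 262144);  // 256KB chunks
 */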
/**
 * Simple perceptual-style hash for images
 * Builds a cheap fingerprint by sampling the compressed image data
 * (JPEG scan data or PNG IDAT chunks). Byte-identical copies match;
 * re-encoded or resized images generally will not. A true perceptual
 * hash would decode the pixels and hash DCT coefficients.
 * @param {string} filePath - Path to image
 * @returns {Promise<string>} Perceptual hash
 */
export async function calculateImagePerceptualHash(filePath) {
try {
const buffer = await readFile(filePath);
const ext = extname(filePath).toLowerCase();
// For JPEG, fingerprint the entropy-coded scan data that follows the
// SOS (0xFFDA) marker. This is a basic implementation - a real
// perceptual hash would decode pixels and use DCT coefficients
if (ext === '.jpg' || ext === '.jpeg') {
// Find start of scan data
let scanStart = -1;
for (let i = 0; i < buffer.length - 1; i++) {
if (buffer[i] === 0xFF && buffer[i + 1] === 0xDA) {
scanStart = i + 2;
break;
}
}
if (scanStart > 0) {
// Sample scan data at regular intervals
const hash = createHash('md5');
const sampleSize = Math.min(1024, buffer.length - scanStart);
const step = Math.max(1, Math.floor((buffer.length - scanStart) / sampleSize));
for (let i = scanStart; i < buffer.length && i < scanStart + sampleSize * step; i += step) {
hash.update(Buffer.from([buffer[i]]));
}
return 'pjpg_' + hash.digest('hex').slice(0, 16);
}
}
// For PNG, sample IDAT chunks
if (ext === '.png') {
const hash = createHash('md5');
// Find IDAT chunks and sample them
for (let i = 8; i < buffer.length - 12; ) {
const length = buffer.readUInt32BE(i);
const type = buffer.slice(i + 4, i + 8).toString('ascii');
if (type === 'IDAT') {
// Sample this chunk
const chunkData = buffer.slice(i + 8, i + 8 + Math.min(length, 256));
hash.update(chunkData);
}
i += 12 + length; // Skip 4-byte length + 4-byte type + data + 4-byte CRC
}
return 'ppng_' + hash.digest('hex').slice(0, 16);
}
// Fallback to partial hash for other formats
return 'pgen_' + await calculatePartialHash(filePath);
} catch {
// Fall back to a partial content hash if image parsing fails
return 'perr_' + await calculatePartialHash(filePath);
}
}
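/*
 * Example usage (hypothetical paths). The prefix records which strategy
 * produced the fingerprint: 'pjpg_' (JPEG scan data), 'ppng_' (PNG IDAT),
 * 'pgen_' (non-image fallback), 'perr_' (parse error fallback):
 *
 *   const a = await calculateImagePerceptualHash('./img/cat.jpg');   // 'pjpg_...'
 *   const b = await calculateImagePerceptualHash('./img/cat.png');   // 'ppng_...'
 */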
/**
* Scan a directory and find all duplicate files
* @param {string} dirPath - Directory to scan
* @param {object} options - Scan options
* @returns {Promise<object>} Duplicate groups and statistics
*/
export async function findDuplicates(dirPath, options = {}) {
const {
recursive = true,
categories = null,
extensions = null,
hashMethod = 'partial', // 'full', 'partial', 'perceptual'
maxFiles = 5000,
minSize = 1, // Minimum file size in bytes
onProgress = null
} = options;
// Scan for files
const files = await scanDirectory(dirPath, {
recursive,
categories,
extensions,
maxFiles
});
// Group files by size first (quick pre-filter)
const sizeGroups = new Map();
let processed = 0;
for (const file of files) {
try {
const stats = await stat(file);
if (stats.size < minSize) continue;
const size = stats.size;
if (!sizeGroups.has(size)) {
sizeGroups.set(size, []);
}
sizeGroups.get(size).push(file);
processed++;
if (onProgress && processed % 100 === 0) {
onProgress({ phase: 'sizing', processed, total: files.length });
}
} catch {
// Skip files we can't stat
}
}
// Filter to only sizes with potential duplicates
const potentialDuplicates = [];
for (const fileList of sizeGroups.values()) {
  if (fileList.length > 1) {
    potentialDuplicates.push(...fileList);
  }
}
// Calculate hashes for potential duplicates
const hashGroups = new Map();
processed = 0;
for (const file of potentialDuplicates) {
try {
let hash;
switch (hashMethod) {
  case 'full':
    hash = await calculateFileHash(file);
    break;
  case 'perceptual': {
    // Braces scope the declaration to this case (avoids no-case-declarations)
    const category = getFileCategory(file);
    if (category.type === 'images') {
      hash = await calculateImagePerceptualHash(file);
    } else {
      hash = await calculatePartialHash(file);
    }
    break;
  }
  case 'partial':
  default:
    hash = await calculatePartialHash(file);
}
if (!hashGroups.has(hash)) {
hashGroups.set(hash, []);
}
hashGroups.get(hash).push(file);
processed++;
if (onProgress && processed % 50 === 0) {
onProgress({ phase: 'hashing', processed, total: potentialDuplicates.length });
}
} catch {
// Skip files we can't hash
}
}
// Build result groups
const duplicateGroups = [];
let totalDuplicates = 0;
let totalWastedSpace = 0;
for (const [hash, fileList] of hashGroups) {
if (fileList.length > 1) {
// Get file info for each duplicate
const groupFiles = await Promise.all(fileList.map(async (file) => {
try {
const stats = await stat(file);
return {
path: file,
filename: basename(file),
size: stats.size,
modified: stats.mtime.toISOString()
};
} catch {
return { path: file, filename: basename(file) };
}
}));
// Sort by modification date (oldest first)
groupFiles.sort((a, b) => {
if (!a.modified) return 1;
if (!b.modified) return -1;
return new Date(a.modified) - new Date(b.modified);
});
const wastedSpace = groupFiles.slice(1).reduce((sum, f) => sum + (f.size || 0), 0);
duplicateGroups.push({
hash,
count: fileList.length,
files: groupFiles,
original: groupFiles[0], // Oldest file assumed to be original
duplicates: groupFiles.slice(1),
wastedSpace,
wastedSpaceHuman: formatBytes(wastedSpace)
});
totalDuplicates += fileList.length - 1;
totalWastedSpace += wastedSpace;
}
}
// Sort groups by wasted space (largest first)
duplicateGroups.sort((a, b) => b.wastedSpace - a.wastedSpace);
return {
scanned: files.length,
potentialDuplicates: potentialDuplicates.length,
duplicateGroups: duplicateGroups.length,
totalDuplicates,
totalWastedSpace,
totalWastedSpaceHuman: formatBytes(totalWastedSpace),
hashMethod,
groups: duplicateGroups
};
}
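/*
 * Example usage (illustrative; the directory is hypothetical and the
 * 'images' category name assumes scanDirectory in batch-processor.js
 * accepts it):
 *
 *   const result = await findDuplicates('./photos', {
 *     categories: ['images'],
 *     hashMethod: 'perceptual',
 *     minSize: 1024, // ignore files under 1KB
 *     onProgress: ({ phase, processed, total }) =>
 *       console.log(`${phase}: ${processed}/${total}`)
 *   });
 *   console.log(`${result.totalDuplicates} duplicates, ` +
 *     `${result.totalWastedSpaceHuman} reclaimable`);
 */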
/**
* Find duplicates within a ChromaDB collection
* @param {ChromaClient} client - ChromaDB client
* @param {string} collectionName - Collection to check
* @returns {Promise<object>} Duplicate information
*/
export async function findCollectionDuplicates(client, collectionName) {
try {
const collection = await client.getCollection({ name: collectionName });
const data = await collection.get();
if (!data.ids || data.ids.length === 0) {
return { duplicates: [], count: 0 };
}
// Group by file path if available
const pathGroups = new Map();
for (let i = 0; i < data.ids.length; i++) {
const meta = data.metadatas[i] || {};
const path = meta.full_path || meta.filename || data.ids[i];
if (!pathGroups.has(path)) {
pathGroups.set(path, []);
}
pathGroups.get(path).push({
id: data.ids[i],
metadata: meta,
content: data.documents[i]?.slice(0, 100)
});
}
const duplicates = [];
for (const [path, entries] of pathGroups) {
if (entries.length > 1) {
duplicates.push({
path,
count: entries.length,
entries
});
}
}
return {
collection: collectionName,
totalDocuments: data.ids.length,
duplicateGroups: duplicates.length,
duplicates
};
} catch (error) {
throw new Error(`Failed to check collection duplicates: ${error.message}`);
}
}
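/*
 * Example usage (assumes a ChromaDB server at localhost:8000 and an
 * existing collection named 'documents' -- both are assumptions):
 *
 *   import { ChromaClient } from 'chromadb';
 *
 *   const client = new ChromaClient({ path: 'http://localhost:8000' });
 *   const dupes = await findCollectionDuplicates(client, 'documents');
 *   console.log(`${dupes.duplicateGroups} duplicate groups ` +
 *     `across ${dupes.totalDocuments} documents`);
 */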
/**
* Compare two files for similarity
* @param {string} file1 - First file path
* @param {string} file2 - Second file path
* @returns {Promise<object>} Comparison result
*/
export async function compareFiles(file1, file2) {
try {
const [stats1, stats2] = await Promise.all([
stat(file1),
stat(file2)
]);
const result = {
file1: { path: file1, size: stats1.size, modified: stats1.mtime },
file2: { path: file2, size: stats2.size, modified: stats2.mtime },
sameSize: stats1.size === stats2.size,
exactMatch: false,
partialMatch: false,
perceptualMatch: false
};
// For images, compare perceptual hashes even when sizes differ,
// since similar images rarely match byte-for-byte
const cat1 = getFileCategory(file1);
const cat2 = getFileCategory(file2);
if (cat1.type === 'images' && cat2.type === 'images') {
  const [phash1, phash2] = await Promise.all([
    calculateImagePerceptualHash(file1),
    calculateImagePerceptualHash(file2)
  ]);
  result.perceptualMatch = phash1 === phash2;
  result.perceptualHash1 = phash1;
  result.perceptualHash2 = phash2;
}
// Files of different sizes cannot be exact duplicates
if (!result.sameSize) {
  return result;
}
// Check partial hash
const [partial1, partial2] = await Promise.all([
  calculatePartialHash(file1),
  calculatePartialHash(file2)
]);
result.partialMatch = partial1 === partial2;
// If the partial hashes match, verify with a full hash
if (result.partialMatch) {
  const [full1, full2] = await Promise.all([
    calculateFileHash(file1),
    calculateFileHash(file2)
  ]);
  result.exactMatch = full1 === full2;
  result.hash1 = full1;
  result.hash2 = full2;
}
return result;
} catch (error) {
throw new Error(`Failed to compare files: ${error.message}`);
}
}
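/*
 * Example usage (hypothetical paths). The checks escalate from cheap to
 * expensive: size, then partial hash, then full hash, with perceptual
 * hashes for image pairs:
 *
 *   const cmp = await compareFiles('./a/img_001.jpg', './b/img_001 copy.jpg');
 *   if (cmp.exactMatch) console.log('Byte-for-byte identical');
 *   else if (cmp.perceptualMatch) console.log('Same image data, different bytes');
 */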
/**
* Format a byte count as a human-readable string (e.g. 1536 -> "1.5 KB")
*/
function formatBytes(bytes) {
if (bytes === 0) return '0 B';
const k = 1024;
const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
const i = Math.min(sizes.length - 1, Math.floor(Math.log(bytes) / Math.log(k)));
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}
/**
* Generate deduplication report
* @param {object} duplicateResult - Result from findDuplicates
* @returns {string} Human-readable report
*/
export function generateReport(duplicateResult) {
const lines = [
'='.repeat(60),
'DUPLICATE FILE REPORT',
'='.repeat(60),
'',
`Files scanned: ${duplicateResult.scanned}`,
`Potential duplicates (same size): ${duplicateResult.potentialDuplicates}`,
`Duplicate groups found: ${duplicateResult.duplicateGroups}`,
`Total duplicate files: ${duplicateResult.totalDuplicates}`,
`Wasted space: ${duplicateResult.totalWastedSpaceHuman}`,
`Hash method: ${duplicateResult.hashMethod}`,
'',
];
if (duplicateResult.groups.length > 0) {
lines.push('-'.repeat(60));
lines.push('DUPLICATE GROUPS (sorted by wasted space)');
lines.push('-'.repeat(60));
lines.push('');
for (const group of duplicateResult.groups.slice(0, 20)) {
lines.push(`[${group.count} files, ${group.wastedSpaceHuman} wasted]`);
lines.push(` Original: ${group.original.path}`);
for (const dup of group.duplicates) {
lines.push(` Duplicate: ${dup.path}`);
}
lines.push('');
}
if (duplicateResult.groups.length > 20) {
lines.push(`... and ${duplicateResult.groups.length - 20} more groups`);
}
} else {
lines.push('No duplicates found!');
}
return lines.join('\n');
}
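/*
 * Example usage (hypothetical output path): render a report and save it:
 *
 *   import { writeFile } from 'fs/promises';
 *
 *   const result = await findDuplicates('./downloads');
 *   await writeFile('./duplicate-report.txt', generateReport(result));
 */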
export default {
calculateFileHash,
calculatePartialHash,
calculateImagePerceptualHash,
findDuplicates,
findCollectionDuplicates,
compareFiles,
generateReport
};