ChromaDB Local MCP Server

batch-processor.js•15.2 KiB

#!/usr/bin/env node /** * Fast Batch File Processor for ChromaDB * * Handles rapid ingestion of: * - Photos (JPEG, PNG, HEIC, RAW) with EXIF extraction * - CAD files (DXF, DWG, STEP, STL, OBJ) * - Documents (PDF, TXT, MD, JSON, YAML) * - Code files (JS, TS, PY, etc.) * * Features: * - Parallel processing with configurable concurrency * - Automatic metadata extraction + EXIF for photos * - Progress tracking * - Temporary collections for quick load/unload */ import { readdir, stat, readFile } from 'fs/promises'; import { join, extname, basename, dirname, relative } from 'path'; import { createHash } from 'crypto'; // Lazy load EXIF extractor to avoid circular deps let exifExtractor = null; async function getExifExtractor() { if (!exifExtractor) { exifExtractor = await import('./exif-extractor.js'); } return exifExtractor; } // File type configurations export const FILE_TYPES = { images: { extensions: ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.heic', '.heif', '.tiff', '.tif', '.raw', '.cr2', '.nef', '.arw'], category: 'image', extractText: false }, cad: { extensions: ['.dxf', '.dwg', '.step', '.stp', '.stl', '.obj', '.iges', '.igs', '.fbx', '.3ds', '.blend', '.skp', '.fcstd', '.scad'], category: 'cad', extractText: false }, documents: { extensions: ['.pdf', '.txt', '.md', '.markdown', '.rst', '.doc', '.docx', '.rtf', '.odt'], category: 'document', extractText: true }, data: { extensions: ['.json', '.yaml', '.yml', '.xml', '.csv', '.tsv', '.toml', '.ini', '.conf', '.config'], category: 'data', extractText: true }, code: { extensions: ['.js', '.ts', '.jsx', '.tsx', '.py', '.rb', '.go', '.rs', '.java', '.kt', '.swift', '.c', '.cpp', '.h', '.hpp', '.cs', '.php', '.vue', '.svelte', '.html', '.css', '.scss', '.sass', '.less', '.sql', '.sh', '.bash', '.zsh', '.ps1', '.bat'], category: 'code', extractText: true } }; // Get file category based on extension export function getFileCategory(filePath) { const ext = extname(filePath).toLowerCase(); for (const [type, config] of Object.entries(FILE_TYPES)) { if (config.extensions.includes(ext)) { return { type, ...config }; } } return { type: 'unknown', category: 'unknown', extractText: false }; } // Generate unique document ID from file path export function generateDocId(filePath, prefix = 'file') { const hash = createHash('md5').update(filePath).digest('hex').slice(0, 8); const name = basename(filePath).replace(/[^a-zA-Z0-9]/g, '_').slice(0, 32); return `${prefix}_${name}_${hash}`; } // Extract metadata from file stats async function extractFileMetadata(filePath) { try { const stats = await stat(filePath); const ext = extname(filePath).toLowerCase(); const category = getFileCategory(filePath); return { filename: basename(filePath), extension: ext, directory: dirname(filePath), size_bytes: stats.size, size_human: formatBytes(stats.size), created_at: stats.birthtime?.toISOString() || stats.ctime.toISOString(), modified_at: stats.mtime.toISOString(), file_type: category.type, category: category.category, is_binary: !category.extractText }; } catch (error) { return { filename: basename(filePath), extension: extname(filePath).toLowerCase(), error: error.message }; } } // Format bytes to human readable function formatBytes(bytes) { if (bytes === 0) return '0 B'; const k = 1024; const sizes = ['B', 'KB', 'MB', 'GB', 'TB']; const i = Math.floor(Math.log(bytes) / Math.log(k)); return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]; } // Extract EXIF-like metadata from images (with full EXIF for JPEG/TIFF) async function extractImageMetadata(filePath, includeExif = true) { const base = await extractFileMetadata(filePath); const ext = extname(filePath).toLowerCase(); // Try to read first bytes for basic image info try { const buffer = await readFile(filePath); const info = { ...base, file_signature: buffer.slice(0, 4).toString('hex') }; // PNG dimensions if (ext === '.png' && buffer.length > 24) { info.width = buffer.readUInt32BE(16); info.height = buffer.readUInt32BE(20); } // JPEG dimensions (simplified) if ((ext === '.jpg' || ext === '.jpeg') && buffer.length > 2) { // Look for SOF0 marker for (let i = 0; i < buffer.length - 10; i++) { if (buffer[i] === 0xFF && (buffer[i + 1] === 0xC0 || buffer[i + 1] === 0xC2)) { info.height = buffer.readUInt16BE(i + 5); info.width = buffer.readUInt16BE(i + 7); break; } } } // Extract full EXIF data for supported formats if (includeExif && ['.jpg', '.jpeg', '.tiff', '.tif'].includes(ext)) { try { const exifModule = await getExifExtractor(); const exif = await exifModule.extractExif(filePath); if (exif.hasExif) { // Add camera info if (exif.camera?.make) info.camera_make = exif.camera.make; if (exif.camera?.model) info.camera_model = exif.camera.model; // Add lens info if (exif.lens?.model) info.lens_model = exif.lens.model; if (exif.lens?.focalLength) info.focal_length = exif.lens.focalLength; if (exif.lens?.focalLength35mm) info.focal_length_35mm = exif.lens.focalLength35mm; // Add exposure info if (exif.exposure?.aperture) info.aperture = exif.exposure.aperture; if (exif.exposure?.time) info.shutter_speed = exif.exposure.time; if (exif.exposure?.iso) info.iso = exif.exposure.iso; if (exif.exposure?.flash) info.flash = exif.exposure.flash; // Add date/time if (exif.datetime?.original) info.date_taken = exif.datetime.original; // Add GPS if (exif.gps) { info.gps_latitude = exif.gps.latitude; info.gps_longitude = exif.gps.longitude; info.gps_location = `${exif.gps.latitude}, ${exif.gps.longitude}`; if (exif.gps.altitude) info.gps_altitude = exif.gps.altitude; } // Use EXIF dimensions if available if (exif.image?.width) info.width = exif.image.width; if (exif.image?.height) info.height = exif.image.height; if (exif.image?.orientation) info.orientation = exif.image.orientation; info.has_exif = true; } } catch (exifError) { // EXIF extraction failed, continue with basic info info.exif_error = exifError.message; } } return info; } catch (error) { return { ...base, read_error: error.message }; } } // Extract CAD file metadata async function extractCADMetadata(filePath) { const base = await extractFileMetadata(filePath); const ext = extname(filePath).toLowerCase(); const info = { ...base, cad_format: ext.slice(1).toUpperCase() }; // Try to extract basic info from text-based CAD formats try { if (['.dxf', '.obj', '.stl'].includes(ext)) { const content = await readFile(filePath, 'utf-8'); const lines = content.split('\n').slice(0, 100); if (ext === '.stl') { info.is_ascii = content.toLowerCase().startsWith('solid'); const solidMatch = content.match(/solid\s+(\S+)/i); if (solidMatch) info.solid_name = solidMatch[1]; } if (ext === '.obj') { const vertices = content.match(/^v\s/gm); const faces = content.match(/^f\s/gm); info.vertex_count = vertices?.length || 0; info.face_count = faces?.length || 0; } if (ext === '.dxf') { info.has_entities = content.includes('ENTITIES'); info.has_blocks = content.includes('BLOCKS'); } } } catch (error) { // Binary file or read error - that's okay } return info; } // Read text content from file async function readTextContent(filePath, maxSize = 1024 * 100) { // 100KB max try { const stats = await stat(filePath); if (stats.size > maxSize) { const buffer = await readFile(filePath); return buffer.slice(0, maxSize).toString('utf-8') + '\n\n[... truncated ...]'; } return await readFile(filePath, 'utf-8'); } catch (error) { return `[Error reading file: ${error.message}]`; } } // Process a single file for ChromaDB ingestion export async function processFile(filePath, options = {}) { const { includeContent = true, maxContentSize = 100 * 1024, // 100KB basePath = null } = options; const category = getFileCategory(filePath); let metadata; let content; // Extract type-specific metadata switch (category.type) { case 'images': metadata = await extractImageMetadata(filePath); // For images, content is a description content = `Image file: ${metadata.filename}\nDimensions: ${metadata.width || 'unknown'}x${metadata.height || 'unknown'}\nSize: ${metadata.size_human}\nFormat: ${metadata.extension}`; break; case 'cad': metadata = await extractCADMetadata(filePath); content = `CAD file: ${metadata.filename}\nFormat: ${metadata.cad_format}\nSize: ${metadata.size_human}`; if (metadata.vertex_count) content += `\nVertices: ${metadata.vertex_count}, Faces: ${metadata.face_count}`; break; default: metadata = await extractFileMetadata(filePath); if (category.extractText && includeContent) { content = await readTextContent(filePath, maxContentSize); } else { content = `File: ${metadata.filename}\nType: ${metadata.file_type}\nSize: ${metadata.size_human}`; } } // Add relative path if basePath provided if (basePath) { metadata.relative_path = relative(basePath, filePath); } metadata.full_path = filePath; metadata.processed_at = new Date().toISOString(); return { id: generateDocId(filePath), content, metadata }; } // Scan directory for files matching criteria export async function scanDirectory(dirPath, options = {}) { const { recursive = true, extensions = null, // null = all, or array like ['.jpg', '.png'] categories = null, // null = all, or array like ['images', 'cad'] maxFiles = 10000, excludePatterns = [/node_modules/, /\.git/, /\.DS_Store/, /__pycache__/] } = options; const files = []; async function scan(currentPath) { if (files.length >= maxFiles) return; try { const entries = await readdir(currentPath, { withFileTypes: true }); for (const entry of entries) { if (files.length >= maxFiles) break; const fullPath = join(currentPath, entry.name); // Check exclude patterns if (excludePatterns.some(pattern => pattern.test(fullPath))) { continue; } if (entry.isDirectory() && recursive) { await scan(fullPath); } else if (entry.isFile()) { const ext = extname(entry.name).toLowerCase(); const category = getFileCategory(fullPath); // Filter by extension if (extensions && !extensions.includes(ext)) { continue; } // Filter by category if (categories && !categories.includes(category.type)) { continue; } files.push(fullPath); } } } catch (error) { console.error(`Error scanning ${currentPath}: ${error.message}`); } } await scan(dirPath); return files; } // Batch process files with progress tracking export async function batchProcessFiles(files, options = {}) { const { concurrency = 10, onProgress = null, includeContent = true, maxContentSize = 100 * 1024, basePath = null } = options; const results = []; const errors = []; let processed = 0; // Process in batches for concurrency control for (let i = 0; i < files.length; i += concurrency) { const batch = files.slice(i, i + concurrency); const batchResults = await Promise.all( batch.map(async (file) => { try { const result = await processFile(file, { includeContent, maxContentSize, basePath }); return { success: true, result }; } catch (error) { return { success: false, file, error: error.message }; } }) ); for (const res of batchResults) { if (res.success) { results.push(res.result); } else { errors.push({ file: res.file, error: res.error }); } processed++; if (onProgress) { onProgress({ processed, total: files.length, percent: Math.round((processed / files.length) * 100), current: res.success ? res.result?.metadata?.filename : res.file }); } } } return { results, errors, stats: { total: files.length, processed: results.length, failed: errors.length } }; } // Export collection to JSON export async function exportCollection(client, collectionName) { try { const collection = await client.getCollection({ name: collectionName }); const data = await collection.get(); return { name: collectionName, exported_at: new Date().toISOString(), count: data.ids.length, documents: data.ids.map((id, idx) => ({ id, content: data.documents[idx], metadata: data.metadatas[idx] })) }; } catch (error) { throw new Error(`Failed to export collection: ${error.message}`); } } // Import collection from JSON export async function importCollection(client, data, options = {}) { const { overwrite = false, collectionName = null } = options; const name = collectionName || data.name; try { if (overwrite) { try { await client.deleteCollection({ name }); } catch (e) { // Collection might not exist } } const collection = await client.getOrCreateCollection({ name }); // Batch insert const batchSize = 100; let imported = 0; for (let i = 0; i < data.documents.length; i += batchSize) { const batch = data.documents.slice(i, i + batchSize); await collection.add({ ids: batch.map(d => d.id), documents: batch.map(d => d.content), metadatas: batch.map(d => d.metadata || {}) }); imported += batch.length; } return { success: true, collection: name, imported }; } catch (error) { throw new Error(`Failed to import collection: ${error.message}`); } } // Quick stats about a directory export async function getDirectoryStats(dirPath, options = {}) { const files = await scanDirectory(dirPath, { ...options, maxFiles: 100000 }); const stats = { total_files: files.length, by_category: {}, by_extension: {}, total_size: 0 }; for (const file of files) { const category = getFileCategory(file); const ext = extname(file).toLowerCase(); stats.by_category[category.type] = (stats.by_category[category.type] || 0) + 1; stats.by_extension[ext] = (stats.by_extension[ext] || 0) + 1; try { const fileStat = await stat(file); stats.total_size += fileStat.size; } catch (e) { // Ignore stat errors } } stats.total_size_human = formatBytes(stats.total_size); return stats; } export default { FILE_TYPES, getFileCategory, generateDocId, processFile, scanDirectory, batchProcessFiles, exportCollection, importCollection, getDirectoryStats };

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/vespo92/chromadblocal-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

batch-processor.js•15.2 KiB