Local Search MCP Server

FileProcessor.ts•10.4 KiB

import { promises as fs } from 'fs'; import path from 'path'; import { FileProcessingError, ContentMetadata } from '../types/index.js'; import { log } from './Logger.js'; import { ContentClassifier } from './ContentClassifier.js'; import { ContentEnhancer } from './ContentEnhancer.js'; import { DomainExtractor } from './DomainExtractor.js'; export class FileProcessor { private supportedExtensions: Set<string>; private maxFileSizeMB: number; private contentClassifier: ContentClassifier; private contentEnhancer: ContentEnhancer; private domainExtractor: DomainExtractor | null = null; constructor(maxFileSizeMB: number = 1024, database?: any) { log.debug('Initializing FileProcessor', { maxFileSizeMB }); this.maxFileSizeMB = maxFileSizeMB; this.supportedExtensions = new Set([ '.txt', '.md', '.json', '.yaml', '.yml', '.rst', '.js', '.ts', '.py', '.java', '.c', '.cpp', '.h', '.css', '.scss', '.html', '.xml', '.csv' ]); // Initialize content processing components this.contentClassifier = new ContentClassifier(); this.contentEnhancer = new ContentEnhancer(); // Initialize domain extractor if database is provided if (database) { this.domainExtractor = new DomainExtractor(database); } log.debug('FileProcessor initialized successfully', { supportedExtensions: Array.from(this.supportedExtensions), maxFileSizeMB: this.maxFileSizeMB, hasContentClassifier: !!this.contentClassifier, hasContentEnhancer: !!this.contentEnhancer, hasDomainExtractor: !!this.domainExtractor }); } isFileSupported(filePath: string): boolean { const ext = path.extname(filePath).toLowerCase(); return this.supportedExtensions.has(ext); } /** * Extract text content from a file * @param filePath Absolute path to the file * @returns Extracted text content */ async extractText(filePath: string): Promise<string> { const timer = log.time(`file-extract-${path.basename(filePath)}`); try { log.debug('Starting file text extraction', { filePath, basename: path.basename(filePath) }); // Check if file exists and get stats const stats = await fs.stat(filePath); if (stats.isDirectory()) { log.warn('Attempted to extract text from directory', { filePath }); throw new FileProcessingError(`Path is a directory: ${filePath}`); } const maxSizeBytes = this.maxFileSizeMB * 1024 * 1024; if (stats.size > maxSizeBytes) { log.warn('File size exceeds limit', { filePath, size: stats.size, sizeMB: (stats.size / 1024 / 1024).toFixed(1), maxSizeMB: this.maxFileSizeMB }); throw new FileProcessingError(`File too large: ${filePath} (${(stats.size / 1024 / 1024).toFixed(1)}MB exceeds ${this.maxFileSizeMB}MB limit)`); } log.debug('File validation passed', { filePath, sizeBytes: stats.size, modified: stats.mtime.toISOString() }); const content = await fs.readFile(filePath, 'utf-8'); const ext = path.extname(filePath).toLowerCase(); log.debug('Raw content read successfully', { filePath, contentLength: content.length, extension: ext }); // Process content based on file type const processed = this.postProcessContent(content, ext); timer(); log.info('File text extraction completed', { filePath, originalLength: content.length, processedLength: processed.length, extension: ext, changed: processed.length !== content.length }); return processed; } catch (error: any) { log.error('File text extraction failed', error, { filePath, basename: path.basename(filePath) }); if (error instanceof FileProcessingError) { throw error; } throw new FileProcessingError( `Failed to read file ${filePath}: ${error.message}`, error ); } } /** * Post-process content based on file type * @param content Raw text content * @param extension File extension * @returns Processed text content */ private postProcessContent(content: string, extension: string): string { // Remove comments for certain file types switch (extension) { case '.js': case '.ts': case '.java': case '.c': case '.cpp': return this.removeCodeComments(content); case '.css': case '.scss': return this.removeCssComments(content); case '.html': return this.removeHtmlComments(content); case '.json': try { // Pretty-print JSON for better parsing const parsed = JSON.parse(content); return JSON.stringify(parsed, null, 2); } catch { return content; } case '.xml': return this.removeXmlComments(content); default: return content; } } /** * Remove code-style comments */ private removeCodeComments(content: string): string { return content .replace(/\/\*[\s\S]*?\*\//g, '') // Multi-line comments .replace(/\/\/.*$/gm, '') // Single-line comments .replace(/^#!/gm, '') // Shebang lines .replace(/^\s*#.*$/gm, ''); // Python style comments } /** * Remove CSS comments */ private removeCssComments(content: string): string { return content.replace(/\/\*[\s\S]*?\*\//g, ''); } /** * Remove HTML comments */ private removeHtmlComments(content: string): string { return content.replace(//g, ''); } /** * Remove XML comments */ private removeXmlComments(content: string): string { return content.replace(//g, ''); } /** * Get file metadata * @param filePath Absolute file path * @returns File metadata */ async getFileMetadata(filePath: string): Promise<{ size: number; lastModified: Date; isText: boolean; }> { try { const timer = log.time(`file-metadata-${path.basename(filePath)}`); log.debug('Getting file metadata', { filePath }); const stats = await fs.stat(filePath); const metadata = { size: stats.size, lastModified: stats.mtime, isText: this.isLikelyText(stats.size) }; timer(); log.debug('File metadata retrieved', { filePath, size: stats.size, sizeKB: (stats.size / 1024).toFixed(1), lastModified: stats.mtime.toISOString(), isText: metadata.isText }); return metadata; } catch (error: any) { log.error('Failed to get file metadata', error, { filePath }); throw error; } } /** * Check if file is likely text-based * @param fileSize Size in bytes * @returns true if likely text */ private isLikelyText(fileSize: number): boolean { // Simple heuristic: assume files under 100KB are text // In production, you'd check the MIME type or first few bytes return fileSize < 100 * 1024; } /** * Process file with enhanced content classification and domain detection * @param filePath Absolute path to the file * @returns Enhanced processing result with metadata */ async processFileWithMetadata(filePath: string): Promise<{ content: string; enhancedContent: string; metadata: ContentMetadata; domains: string[]; }> { const timer = log.time(`enhanced-process-${path.basename(filePath)}`); try { log.debug('Starting enhanced file processing', { filePath }); // Extract raw text content const rawContent = await this.extractText(filePath); // Classify content to get type and quality metrics const classificationResult = await this.contentClassifier.classifyContent( rawContent, filePath, path.extname(filePath) ); // Enhance content based on its classification const enhancementResult = await this.contentEnhancer.enhanceContent( rawContent, classificationResult.classification, filePath ); // Extract domain information if domain extractor is available let domains: string[] = []; if (this.domainExtractor) { domains = await this.domainExtractor.extractDomainTags( enhancementResult.processedContent, filePath, classificationResult.classification.language ); } // Get file stats for metadata const stats = await fs.stat(filePath); // Construct comprehensive metadata const metadata: ContentMetadata = { contentType: classificationResult.classification.contentType, language: classificationResult.classification.language, domainTags: domains, qualityScore: classificationResult.quality.score, sourceAuthority: classificationResult.authority.score, processedContent: enhancementResult.processedContent, rawContent: rawContent, fileExtension: path.extname(filePath), hasComments: enhancementResult.hasComments, hasDocumentation: enhancementResult.hasDocumentation }; timer(); log.info('Enhanced file processing completed', { filePath, contentType: classificationResult.classification.contentType, language: classificationResult.classification.language, qualityScore: classificationResult.quality.score, authorityScore: classificationResult.authority.score, domains: domains.length, originalLength: rawContent.length, enhancedLength: enhancementResult.processedContent.length, tokenCount: FileProcessor.countTokens(enhancementResult.processedContent) }); return { content: rawContent, enhancedContent: enhancementResult.processedContent, metadata, domains }; } catch (error: any) { log.error('Enhanced file processing failed', error, { filePath }); throw error; } } /** * Count tokens in text (simple implementation) * @param text Text content * @returns Approximate token count */ static countTokens(text: string): number { // Simple approximation: 1 token ≈ 4 characters for English return Math.ceil(text.length / 4); } /** * Validate and normalize file path * @param filePath Input file path * @returns Normalized absolute path */ static normalizePath(filePath: string): string { return path.resolve(filePath); } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PatrickRuddiman/local-search-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

FileProcessor.ts•10.4 KiB