// file-processor.ts (13 kB)
import { z } from 'zod';
import { getConfig } from './config.js';
/**
 * Outcome of processing one file.
 *
 * The functions in this file set `content` (and `metadata`, when it was
 * requested) on success, and set `error` on failure.
 */
export interface FileProcessingResult {
/** True when the file passed every check. */
success: boolean;
/** Path exactly as supplied by the caller. */
filePath: string;
/** The file content that was processed; set on success. */
content?: string;
/** Derived metadata; only present when the caller asked for it. */
metadata?: FileMetadata;
/** Failure description; set when `success` is false. */
error?: {
/**
 * Machine-readable code. Codes produced in this file: INVALID_FILE_PATH,
 * FILE_TOO_LARGE, EXTENSION_NOT_ALLOWED, FILE_EXCLUDED, PROCESSING_ERROR,
 * BATCH_PROCESSING_ERROR.
 */
code: string;
/** Human-readable explanation. */
message: string;
/** Extra context; its shape depends on `code` (or is the caught error itself). */
details?: any;
};
}
/**
 * Descriptive information about a file, derived from its path and
 * (optionally) its content.
 */
export interface FileMetadata {
/** Last segment of the path (the file name). */
name: string;
/** Lower-cased extension including the leading dot, or '' when the name has none. */
extension: string;
/** UTF-8 byte length of the content; 0 when no content was supplied. */
size: number;
/** Coarse category derived from `mimeType`. */
type: 'text' | 'binary' | 'image' | 'archive' | 'unknown';
/** MIME-style label looked up from the extension. */
mimeType?: string;
/** Set to 'utf8' for text files whose content was supplied. */
encoding?: string;
/** Number of '\n'-separated lines; only set for text files whose content was supplied. */
lineCount?: number;
/** Programming language looked up from the extension, when known. */
language?: string;
/** Not populated by extractFileMetadata in this file. */
lastModified?: string;
/** Not populated by extractFileMetadata in this file. */
checksum?: string;
}
/**
 * Zod schema for caller-supplied file paths.
 *
 * Only relative paths that cannot climb out of their base directory are
 * accepted. Backslashes are treated as separators, so Windows-style input is
 * held to the same rules. A path is rejected when it
 *  - has a `..` segment in any position (a bare `..`, a leading `../`, or a
 *    trailing `/..`),
 *  - contains `../` anywhere,
 *  - is absolute (leading `/`, or a drive prefix such as `C:/`), or
 *  - contains an empty segment (`//`).
 */
const FilePathSchema = z.string().refine(
  (path) => {
    // Normalise separators first so every check below only has to consider `/`.
    const normalizedPath = path.replace(/\\/g, '/');

    // Segment-wise check: catches `..` and `dir/..`, which a plain
    // substring search for `../` lets through.
    const hasParentSegment = normalizedPath.split('/').includes('..');

    // `C:\foo` becomes `C:/foo` after normalisation and is just as absolute
    // as a path with a leading slash.
    const isAbsolute =
      normalizedPath.startsWith('/') || /^[a-zA-Z]:\//.test(normalizedPath);

    return (
      !hasParentSegment &&
      // Retained from the earlier rule set: keeps rejecting names such as `foo../bar`.
      !normalizedPath.includes('../') &&
      !isAbsolute &&
      !normalizedPath.includes('//')
    );
  },
  { message: 'Invalid file path: potential security risk detected' }
);
/**
 * Options understood by processSingleFile and batchProcessFiles.
 *
 * Both functions accept a `Partial` of this interface, so every field may be
 * omitted by callers even though it is declared as required here.
 */
export interface BatchProcessingOptions {
/**
 * Number of files batchProcessFiles handles per batch. When falsy,
 * `config.limits.maxConcurrentRequests` is used instead.
 */
maxConcurrent: number;
/**
 * Batch processing continues past failures unless this is explicitly `false`,
 * in which case the first failure rejects the batch call.
 */
continueOnError: boolean;
/** Paths are validated unless this is explicitly `false`. */
validatePaths: boolean;
/** When truthy, successful results carry a `metadata` object. */
includeMetadata: boolean;
/**
 * Largest accepted content size in UTF-8 bytes. When falsy,
 * `config.limits.maxFileSize` is used instead.
 */
maxFileSize: number;
/** Extensions to accept, e.g. '.ts'. When omitted or empty, every extension is accepted. */
allowedExtensions?: string[];
/**
 * Regular-expression sources. Each is compiled with `new RegExp` and tested
 * against the file path; any match fails the file with FILE_EXCLUDED.
 */
excludePatterns?: string[];
}
/**
 * Extension -> MIME-style label table used for file type detection.
 *
 * Keys are lower-case extensions including the leading dot, which is the form
 * extractFileMetadata produces before looking a file up here. Extensions that
 * are missing from the table fall back to 'application/octet-stream' at the
 * lookup site. The label is stored as `mimeType` and passed to
 * categorizeFileType to derive the coarse file type.
 *
 * NOTE(review): many values (e.g. 'text/python') look like project-local
 * labels rather than registered media types — confirm before exposing them
 * to consumers that expect registered values.
 */
const FILE_TYPE_MAP: Record<string, string> = {
// Text files
'.txt': 'text/plain',
'.md': 'text/markdown',
'.json': 'application/json',
'.yaml': 'application/yaml',
'.yml': 'application/yaml',
'.xml': 'application/xml',
'.csv': 'text/csv',
'.log': 'text/plain',
// Code files
'.js': 'text/javascript',
'.jsx': 'text/javascript',
'.ts': 'text/typescript',
'.tsx': 'text/typescript',
'.py': 'text/python',
'.java': 'text/java',
'.cpp': 'text/cpp',
'.c': 'text/c',
'.h': 'text/c',
'.go': 'text/go',
'.rs': 'text/rust',
'.php': 'text/php',
'.rb': 'text/ruby',
'.swift': 'text/swift',
'.kt': 'text/kotlin',
'.scala': 'text/scala',
'.clj': 'text/clojure',
'.html': 'text/html',
'.css': 'text/css',
'.scss': 'text/scss',
'.sass': 'text/sass',
'.less': 'text/less',
'.sql': 'text/sql',
'.sh': 'text/shell',
'.bash': 'text/shell',
'.zsh': 'text/shell',
'.fish': 'text/shell',
'.ps1': 'text/powershell',
'.r': 'text/r',
'.m': 'text/matlab',
'.pl': 'text/perl',
'.lua': 'text/lua',
'.dart': 'text/dart',
'.elm': 'text/elm',
'.ex': 'text/elixir',
'.exs': 'text/elixir',
'.erl': 'text/erlang',
'.hrl': 'text/erlang',
'.fs': 'text/fsharp',
'.fsx': 'text/fsharp',
'.ml': 'text/ocaml',
'.mli': 'text/ocaml',
'.hs': 'text/haskell',
'.lhs': 'text/haskell',
'.jl': 'text/julia',
'.nim': 'text/nim',
'.nims': 'text/nim',
'.cr': 'text/crystal',
'.d': 'text/d',
'.zig': 'text/zig',
'.v': 'text/v',
'.vsh': 'text/v',
// Configuration files
'.toml': 'application/toml',
'.ini': 'text/plain',
'.cfg': 'text/plain',
'.conf': 'text/plain',
'.env': 'text/plain',
'.properties': 'text/plain',
// Documentation
'.rst': 'text/restructuredtext',
'.adoc': 'text/asciidoc',
'.tex': 'text/latex',
// Images
'.jpg': 'image/jpeg',
'.jpeg': 'image/jpeg',
'.png': 'image/png',
'.gif': 'image/gif',
'.bmp': 'image/bmp',
'.svg': 'image/svg+xml',
'.webp': 'image/webp',
'.ico': 'image/x-icon',
// Archives
'.zip': 'application/zip',
'.tar': 'application/tar',
'.gz': 'application/gzip',
'.7z': 'application/x-7z-compressed',
'.rar': 'application/vnd.rar',
'.bz2': 'application/bzip2',
'.xz': 'application/xz',
// Binary
'.exe': 'application/octet-stream',
'.dll': 'application/octet-stream',
'.so': 'application/octet-stream',
'.dylib': 'application/octet-stream',
'.bin': 'application/octet-stream',
};
/**
 * Extension -> programming language name table.
 *
 * Keys are lower-case extensions including the leading dot. Only source-code
 * extensions are listed; extractFileMetadata leaves `language` undefined for
 * any extension that has no entry here.
 */
const LANGUAGE_MAP: Record<string, string> = {
'.js': 'javascript',
'.jsx': 'javascript',
'.ts': 'typescript',
'.tsx': 'typescript',
'.py': 'python',
'.java': 'java',
'.cpp': 'cpp',
'.c': 'c',
'.h': 'c',
'.go': 'go',
'.rs': 'rust',
'.php': 'php',
'.rb': 'ruby',
'.swift': 'swift',
'.kt': 'kotlin',
'.scala': 'scala',
'.clj': 'clojure',
'.html': 'html',
'.css': 'css',
'.scss': 'scss',
'.sass': 'sass',
'.less': 'less',
'.sql': 'sql',
'.sh': 'shell',
'.bash': 'shell',
'.zsh': 'shell',
'.fish': 'shell',
'.ps1': 'powershell',
'.r': 'r',
'.m': 'matlab',
'.pl': 'perl',
'.lua': 'lua',
'.dart': 'dart',
'.elm': 'elm',
'.ex': 'elixir',
'.exs': 'elixir',
'.erl': 'erlang',
'.hrl': 'erlang',
'.fs': 'fsharp',
'.fsx': 'fsharp',
'.ml': 'ocaml',
'.mli': 'ocaml',
'.hs': 'haskell',
'.lhs': 'haskell',
'.jl': 'julia',
'.nim': 'nim',
'.nims': 'nim',
'.cr': 'crystal',
'.d': 'd',
'.zig': 'zig',
'.v': 'v',
'.vsh': 'v',
};
/**
 * Maps a MIME-style label onto one of the coarse file categories.
 *
 * Checks run in the order text, image, archive, binary, and the first match
 * wins. As a consequence `image/svg+xml` is reported as `text`, because it
 * contains `xml`.
 *
 * @param mimeType - Label to classify, e.g. `application/json`.
 * @returns The matching category, or `'unknown'` when nothing matches.
 */
function categorizeFileType(mimeType: string): 'text' | 'binary' | 'image' | 'archive' | 'unknown' {
  const containsAny = (markers: readonly string[]): boolean =>
    markers.some((marker) => mimeType.includes(marker));

  // Structured-text formats that do not use the `text/` prefix.
  // `toml` is included so `application/toml` is not left as 'unknown'.
  if (mimeType.startsWith('text/') || containsAny(['json', 'xml', 'yaml', 'toml'])) {
    return 'text';
  }

  if (mimeType.startsWith('image/')) {
    return 'image';
  }

  // `zip` also covers gzip/bzip2 and `compress` covers x-7z-compressed.
  // RAR and XZ labels contain none of those, so they are matched explicitly
  // with narrow markers to avoid accidental substring hits.
  if (containsAny(['zip', 'tar', 'compress', 'vnd.rar', 'x-rar', '/xz', 'x-xz'])) {
    return 'archive';
  }

  if (containsAny(['octet-stream', 'binary'])) {
    return 'binary';
  }

  return 'unknown';
}
/**
 * Checks a caller-supplied path against `FilePathSchema`.
 *
 * Validation failures are reported through the return value rather than by
 * throwing.
 *
 * @param filePath - Path to validate.
 * @returns `{ valid: true }` when the path is accepted; otherwise
 *   `{ valid: false, error }`, where `error` is the first validation message.
 */
export function validateFilePath(filePath: string): { valid: boolean; error?: string } {
  // safeParse returns a result object, so no try/catch is needed for the
  // expected "invalid path" outcome.
  const parsed = FilePathSchema.safeParse(filePath);
  if (parsed.success) {
    return { valid: true };
  }

  // `issues` is the issue list on ZodError; fall back to a generic message
  // rather than dereferencing a missing first entry.
  return {
    valid: false,
    error: parsed.error.issues[0]?.message ?? 'Invalid file path',
  };
}
/**
 * Derives metadata for a file from its path and, when supplied, its content.
 *
 * The extension is the final `.xxx` part of the file name, lower-cased, so a
 * dotfile such as `.env` is treated as having the extension `.env`. The MIME
 * label and language are looked up from that extension; unknown extensions
 * get `application/octet-stream` and no language.
 *
 * @param filePath - Path of the file; `/` and `\` are both treated as separators.
 * @param content - Optional text content, used for `size`, `lineCount` and `encoding`.
 * @returns The metadata. `size` is 0 when `content` is missing or empty;
 *   `lineCount` and `encoding` are only set for text files with non-empty content.
 */
export function extractFileMetadata(filePath: string, content?: string): FileMetadata {
  // Split on both separators: the path validation in this file also treats
  // `\` as a separator, so `dir\file.ts` must yield the name `file.ts`.
  const name = filePath.split(/[\\/]/).pop() || filePath;

  const extensionMatch = name.match(/\.[^.]+$/);
  const extension = extensionMatch ? extensionMatch[0].toLowerCase() : '';

  const mimeType = FILE_TYPE_MAP[extension] ?? 'application/octet-stream';
  const type = categorizeFileType(mimeType);
  const language = LANGUAGE_MAP[extension];

  const metadata: FileMetadata = {
    name,
    extension,
    size: content ? Buffer.byteLength(content, 'utf8') : 0,
    type,
    mimeType,
    language,
  };

  // Line count and encoding only make sense for text we actually have.
  if (type === 'text' && content) {
    metadata.lineCount = content.split('\n').length;
    metadata.encoding = 'utf8';
  }

  return metadata;
}
/**
 * Runs the per-file checks on in-memory content and reports the outcome as a
 * `FileProcessingResult`.
 *
 * Checks run in this order and the first failure is returned:
 *  1. path validation (`INVALID_FILE_PATH`), skipped only when
 *     `options.validatePaths` is `false`
 *  2. size limit in UTF-8 bytes (`FILE_TOO_LARGE`)
 *  3. extension allow-list (`EXTENSION_NOT_ALLOWED`)
 *  4. exclude patterns (`FILE_EXCLUDED`)
 *
 * Anything thrown while checking (for example an exclude pattern that is not
 * a valid regular expression) is reported as `PROCESSING_ERROR`. Note that
 * `getConfig()` is called before the try block, so an exception from it
 * rejects the returned promise instead.
 *
 * @param filePath - Path of the file being processed.
 * @param content - File content as text.
 * @param options - Optional overrides. `allowedExtensions` entries are matched
 *   case-insensitively and may be written with or without the leading dot.
 * @returns The processing result; `metadata` is attached only when
 *   `options.includeMetadata` is truthy.
 */
export async function processSingleFile(
  filePath: string,
  content: string,
  options: Partial<BatchProcessingOptions> = {}
): Promise<FileProcessingResult> {
  const config = getConfig();

  // Builds a failure result so every early exit has the same shape.
  const fail = (code: string, message: string, details?: unknown): FileProcessingResult => ({
    success: false,
    filePath,
    error: details === undefined ? { code, message } : { code, message, details },
  });

  try {
    // Validate file path unless explicitly disabled.
    if (options.validatePaths !== false) {
      const validation = validateFilePath(filePath);
      if (!validation.valid) {
        return fail('INVALID_FILE_PATH', validation.error || 'Invalid file path');
      }
    }

    // Check file size. `||` is deliberate: a missing or zero option falls
    // back to the configured limit.
    const fileSize = Buffer.byteLength(content, 'utf8');
    const maxSize = options.maxFileSize || config.limits.maxFileSize;
    if (fileSize > maxSize) {
      return fail(
        'FILE_TOO_LARGE',
        `File size (${fileSize} bytes) exceeds maximum allowed size (${maxSize} bytes)`,
        { fileSize, maxSize }
      );
    }

    // Check allowed extensions.
    const { allowedExtensions, excludePatterns } = options;
    if (allowedExtensions && allowedExtensions.length > 0) {
      const { extension } = extractFileMetadata(filePath);

      // The extracted extension is lower-case with a leading dot, so bring
      // the allow-list into the same form ('TS' -> '.ts'). An empty entry is
      // kept as-is so it still matches files without an extension.
      const normalizedAllowed = allowedExtensions.map((entry) => {
        const lower = entry.toLowerCase();
        return lower === '' || lower.startsWith('.') ? lower : `.${lower}`;
      });

      if (!normalizedAllowed.includes(extension)) {
        return fail('EXTENSION_NOT_ALLOWED', `File extension '${extension}' is not allowed`, {
          extension,
          allowedExtensions,
        });
      }
    }

    // Check exclude patterns. An invalid pattern makes `new RegExp` throw,
    // which the catch below turns into PROCESSING_ERROR.
    if (excludePatterns && excludePatterns.length > 0) {
      const shouldExclude = excludePatterns.some((pattern) => new RegExp(pattern).test(filePath));
      if (shouldExclude) {
        return fail('FILE_EXCLUDED', `File matches exclude pattern`, {
          filePath,
          excludePatterns,
        });
      }
    }

    // Extract metadata only if requested.
    const metadata = options.includeMetadata ? extractFileMetadata(filePath, content) : undefined;

    return {
      success: true,
      filePath,
      content,
      metadata,
    };
  } catch (error) {
    return {
      success: false,
      filePath,
      error: {
        code: 'PROCESSING_ERROR',
        message: error instanceof Error ? error.message : 'Unknown processing error',
        details: error,
      },
    };
  }
}
/**
 * Processes many files in fixed-size batches and summarises the outcome.
 *
 * Files are handled `maxConcurrent` at a time; a batch must finish before the
 * next one starts. Results keep the order of `files`.
 *
 * @param files - Files to process, each with a path and its text content.
 * @param options - Optional overrides, passed on to `processSingleFile`.
 *   `maxConcurrent` falls back to `config.limits.maxConcurrentRequests` when
 *   falsy, and any value below 1 (or not a number) is treated as 1.
 *   `continueOnError` defaults to `true`.
 * @returns Per-file results plus a summary of counts and error messages.
 * @throws When `continueOnError` is `false`, the returned promise rejects on
 *   the first failure: with `Processing failed for <path>: <message>` when a
 *   file fails its checks, or with the original error when processing itself
 *   threw.
 */
export async function batchProcessFiles(
  files: Array<{ path: string; content: string }>,
  options: Partial<BatchProcessingOptions> = {}
): Promise<{
  results: FileProcessingResult[];
  summary: {
    total: number;
    successful: number;
    failed: number;
    errors: Array<{ filePath: string; error: string }>;
  };
}> {
  const config = getConfig();
  const continueOnError = options.continueOnError !== false;

  // The batch size is also the loop step. A step that is negative would make
  // the loop never terminate, and a NaN/undefined step would end it after an
  // empty first batch, so anything below 1 is clamped to 1.
  const requestedConcurrency = options.maxConcurrent || config.limits.maxConcurrentRequests;
  const maxConcurrent = requestedConcurrency >= 1 ? Math.floor(requestedConcurrency) : 1;

  const results: FileProcessingResult[] = [];
  const errors: Array<{ filePath: string; error: string }> = [];

  // Process files in batches to respect the concurrency limit.
  for (let i = 0; i < files.length; i += maxConcurrent) {
    const batch = files.slice(i, i + maxConcurrent);

    const batchResults = await Promise.all(
      batch.map(async (file): Promise<FileProcessingResult> => {
        let result: FileProcessingResult;
        try {
          result = await processSingleFile(file.path, file.content, options);
        } catch (error) {
          // processSingleFile rejected instead of returning a failure result.
          if (!continueOnError) {
            throw error;
          }
          result = {
            success: false,
            filePath: file.path,
            error: {
              code: 'BATCH_PROCESSING_ERROR',
              message: error instanceof Error ? error.message : 'Unknown batch processing error',
              details: error,
            },
          };
        }

        if (!result.success && result.error) {
          if (!continueOnError) {
            throw new Error(`Processing failed for ${file.path}: ${result.error.message}`);
          }
          // Recorded exactly once per failed file.
          errors.push({ filePath: file.path, error: result.error.message });
        }

        return result;
      })
    );

    results.push(...batchResults);
  }

  const successful = results.filter((r) => r.success).length;

  return {
    results,
    summary: {
      total: files.length,
      successful,
      failed: results.length - successful,
      errors,
    },
  };
}
/**
 * Keeps only the files whose extension appears in `allowedExtensions`.
 *
 * Matching is case-insensitive and the leading dot is optional, so `'.ts'`,
 * `'ts'` and `'.TS'` all select `foo.ts`. An empty-string entry selects files
 * that have no extension.
 *
 * @param files - Candidate files.
 * @param allowedExtensions - Extensions to keep; an empty list keeps nothing.
 * @returns A new array containing the matching files in their original order.
 */
export function filterFilesByExtension(
  files: Array<{ path: string; content: string }>,
  allowedExtensions: string[]
): Array<{ path: string; content: string }> {
  // extractFileMetadata yields lower-case, dot-prefixed extensions, so bring
  // the allow-list into the same form once, up front.
  const allowed = new Set(
    allowedExtensions.map((entry) => {
      const lower = entry.toLowerCase();
      return lower === '' || lower.startsWith('.') ? lower : `.${lower}`;
    })
  );

  return files.filter((file) => allowed.has(extractFileMetadata(file.path).extension));
}
/**
 * Aggregates size, line and distribution figures over processing results.
 *
 * Only results that succeeded and carry metadata contribute; everything else
 * is ignored.
 *
 * @param results - Results to summarise.
 * @returns Total byte size, total line count, and per-language / per-type
 *   file counts. Files without a language are left out of
 *   `languageDistribution`.
 */
export function getFileStatistics(results: FileProcessingResult[]): {
  totalSize: number;
  totalLines: number;
  languageDistribution: Record<string, number>;
  typeDistribution: Record<string, number>;
} {
  const languageDistribution: Record<string, number> = {};
  const typeDistribution: Record<string, number> = {};
  let totalSize = 0;
  let totalLines = 0;

  // Adds one to the tally kept under `key`.
  const bump = (tally: Record<string, number>, key: string): void => {
    tally[key] = (tally[key] || 0) + 1;
  };

  for (const entry of results) {
    const info = entry.metadata;
    if (!entry.success || !info) {
      continue;
    }

    totalSize += info.size;
    totalLines += info.lineCount || 0;

    if (info.language) {
      bump(languageDistribution, info.language);
    }
    bump(typeDistribution, info.type);
  }

  return { totalSize, totalLines, languageDistribution, typeDistribution };
}