Open Search MCP

MIT License

Overview InspectNew Endpoints Schema Related Servers Reviews Score

open-search-mcp
src
content-processing

document-processor.ts•15.2 kB

/**
 * Document processing engine - handles multiple document formats.
 * Covers content extraction and analysis for PDF, Word, Excel, PowerPoint
 * and similar formats.
 *
 * NOTE: the PDF / Word / Excel / PowerPoint processors in this file are
 * placeholders that return fixed text; only plain-text files are actually read.
 */

import fs from 'fs/promises';
import path from 'path';
import { createReadStream } from 'fs';
import { Logger } from '../utils/logger.js';

export interface DocumentMetadata {
  filename: string;
  fileSize: number;
  mimeType: string;
  format: DocumentFormat;
  pageCount?: number;
  wordCount?: number;
  author?: string;
  title?: string;
  subject?: string;
  creator?: string;
  createdDate?: string;
  modifiedDate?: string;
  language?: string;
  extractedAt: string;
}

export interface ProcessedDocument {
  metadata: DocumentMetadata;
  content: {
    text: string;
    markdown?: string;
    html?: string;
    structured?: any;
  };
  pages?: DocumentPage[];
  images?: ExtractedImage[];
  tables?: ExtractedTable[];
  links?: ExtractedLink[];
  quality: {
    score: number;
    factors: QualityFactor[];
  };
  processingTime: number;
}

export interface DocumentPage {
  pageNumber: number;
  text: string;
  images?: ExtractedImage[];
  tables?: ExtractedTable[];
  boundingBox?: BoundingBox;
}

export interface ExtractedImage {
  id: string;
  description?: string;
  base64?: string;
  url?: string;
  boundingBox?: BoundingBox;
  pageNumber?: number;
}

export interface ExtractedTable {
  id: string;
  headers: string[];
  rows: string[][];
  caption?: string;
  pageNumber?: number;
  boundingBox?: BoundingBox;
}

export interface ExtractedLink {
  text: string;
  url: string;
  pageNumber?: number;
}

export interface BoundingBox {
  x: number;
  y: number;
  width: number;
  height: number;
}

export interface QualityFactor {
  name: string;
  score: number;
  description: string;
}

export type DocumentFormat =
  | 'pdf'
  | 'docx'
  | 'doc'
  | 'xlsx'
  | 'xls'
  | 'pptx'
  | 'ppt'
  | 'txt'
  | 'rtf'
  | 'odt'
  | 'ods'
  | 'odp'
  | 'unknown';

export interface ProcessingOptions {
  extractImages?: boolean;
  extractTables?: boolean;
  extractLinks?: boolean;
  ocrEnabled?: boolean;
  preserveFormatting?: boolean;
  pageRange?: {
    start: number;
    end: number;
  };
  outputFormats?: ('text' | 'markdown' | 'html' | 'structured')[];
  quality?: 'fast' | 'balanced' | 'high';
}

/** Intermediate result produced by a format processor, before quality scoring. */
interface ExtractedContent {
  content: ProcessedDocument['content'];
  pages?: DocumentPage[];
  images?: ExtractedImage[];
  tables?: ExtractedTable[];
  links?: ExtractedLink[];
}

/** Batch size used by `processDocuments` when no usable `concurrency` is given. */
const DEFAULT_BATCH_CONCURRENCY = 3;

/** Largest file `validateFile` accepts (100MB). */
const MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024;

/** Number of leading bytes read when sniffing the format from the file header. */
const HEADER_SNIFF_BYTES = 16;

/** MIME type reported for each known document format. */
const MIME_TYPES: Record<DocumentFormat, string> = {
  pdf: 'application/pdf',
  docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  doc: 'application/msword',
  xlsx: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  xls: 'application/vnd.ms-excel',
  pptx: 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  ppt: 'application/vnd.ms-powerpoint',
  txt: 'text/plain',
  rtf: 'application/rtf',
  odt: 'application/vnd.oasis.opendocument.text',
  ods: 'application/vnd.oasis.opendocument.spreadsheet',
  odp: 'application/vnd.oasis.opendocument.presentation',
  unknown: 'application/octet-stream'
};

/** Characters that must be replaced before text is embedded in HTML. */
const HTML_ESCAPES: Record<string, string> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;'
};

/** Escapes HTML-significant characters so document text cannot inject markup. */
function escapeHtml(text: string): string {
  return text.replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch] ?? ch);
}

export class DocumentProcessor {
  private readonly logger: Logger;
  private readonly supportedFormats: Set<string>;

  constructor() {
    this.logger = new Logger('DocumentProcessor');
    this.supportedFormats = new Set([
      'pdf', 'docx', 'doc', 'xlsx', 'xls', 'pptx', 'ppt',
      'txt', 'rtf', 'odt', 'ods', 'odp'
    ]);
  }

  /**
   * Process a single document file.
   *
   * @param filePath - Path of the file to process.
   * @param options - Extraction and output options.
   * @returns The extracted content, metadata, quality score and elapsed time.
   * @throws Error with message `Failed to process document: ...` when any step fails.
   */
  async processDocument(
    filePath: string,
    options: ProcessingOptions = {}
  ): Promise<ProcessedDocument> {
    const startTime = Date.now();

    try {
      this.logger.info(`Processing document: ${filePath}`);

      // 1. Validate that the file exists and is acceptable
      await this.validateFile(filePath);

      // 2. Detect the document format
      const format = await this.detectFormat(filePath);

      // 3. Extract basic metadata
      const metadata = await this.extractMetadata(filePath, format);

      // 4. Pick a processor for the format
      const processor = this.getProcessor(format);

      // 5. Extract content
      const extractedContent = await processor.extract(filePath, options);

      // 6. Post-process and assess quality
      const processedContent = await this.postProcess(extractedContent, options);
      const quality = this.assessQuality(processedContent, metadata);

      const processingTime = Date.now() - startTime;

      const result: ProcessedDocument = {
        metadata,
        content: processedContent.content,
        pages: processedContent.pages,
        images: processedContent.images,
        tables: processedContent.tables,
        links: processedContent.links,
        quality,
        processingTime
      };

      this.logger.info(`Document processed successfully in ${processingTime}ms`);
      return result;
    } catch (error) {
      this.logger.error(`Document processing failed for ${filePath}:`, error);
      throw new Error(`Failed to process document: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Process several documents in fixed-size batches.
   *
   * Files that fail are logged and omitted, so the result may be shorter than
   * `filePaths`; the relative order of successful results is preserved.
   *
   * @param filePaths - Paths of the files to process.
   * @param options - Processing options plus an optional `concurrency` (batch size).
   *   Values below 1 or non-numeric values fall back to the default of 3;
   *   fractional values are rounded down.
   * @returns The successfully processed documents.
   */
  async processDocuments(
    filePaths: string[],
    options: ProcessingOptions & { concurrency?: number } = {}
  ): Promise<ProcessedDocument[]> {
    // A negative step would never advance past the end of the list (infinite
    // loop) and a fractional one would produce overlapping/empty slices.
    const requested = options.concurrency;
    const concurrency =
      typeof requested === 'number' && requested >= 1
        ? Math.floor(requested)
        : DEFAULT_BATCH_CONCURRENCY;
    const results: ProcessedDocument[] = [];

    this.logger.info(`Starting batch document processing: ${filePaths.length} files`);

    // Process batch by batch
    for (let i = 0; i < filePaths.length; i += concurrency) {
      const batch = filePaths.slice(i, i + concurrency);
      const batchPromises = batch.map(async (filePath) => {
        try {
          return await this.processDocument(filePath, options);
        } catch (error) {
          this.logger.warn(`Failed to process ${filePath}:`, error);
          return null;
        }
      });

      const batchResults = await Promise.allSettled(batchPromises);

      for (const outcome of batchResults) {
        if (outcome.status === 'fulfilled' && outcome.value) {
          results.push(outcome.value);
        }
      }
    }

    this.logger.info(`Batch processing completed: ${results.length}/${filePaths.length} successful`);
    return results;
  }

  /**
   * Process a document held in memory.
   *
   * The buffer is written to a temporary file under `<cwd>/temp`, processed,
   * and the temporary file is removed afterwards (best effort).
   *
   * @param buffer - Raw document bytes.
   * @param filename - Original file name; only its final path component is used,
   *   and its extension drives format detection.
   * @param options - Extraction and output options.
   * @returns The processed document, with `metadata.filename` set to the
   *   sanitised original name rather than the temporary file name.
   * @throws Error if writing the temporary file or processing it fails.
   */
  async processDocumentFromBuffer(
    buffer: Buffer,
    filename: string,
    options: ProcessingOptions = {}
  ): Promise<ProcessedDocument> {
    // Keep only the last path component (treating "\" as a separator too) so a
    // name like "../../x" cannot make us write or unlink outside the temp dir.
    const safeName = path.basename(filename.replace(/\\/g, '/')) || 'document';
    // The random suffix avoids collisions between calls in the same millisecond.
    const uniquePrefix = `${Date.now()}_${Math.random().toString(36).slice(2, 10)}`;
    const tempPath = path.join(process.cwd(), 'temp', `${uniquePrefix}_${safeName}`);

    try {
      // Make sure the temp directory exists
      await fs.mkdir(path.dirname(tempPath), { recursive: true });

      // Write the temp file
      await fs.writeFile(tempPath, buffer);

      // Process the document
      const result = await this.processDocument(tempPath, options);

      // Report the caller's file name, not the internal temp file name.
      return { ...result, metadata: { ...result.metadata, filename: safeName } };
    } finally {
      // Clean up the temp file
      try {
        await fs.unlink(tempPath);
      } catch (error) {
        this.logger.warn(`Failed to cleanup temp file ${tempPath}:`, error);
      }
    }
  }

  /**
   * Validate that the path is a non-empty regular file of at most 100MB.
   *
   * @throws Error with message `File validation failed: ...` for any violation
   *   or if the file cannot be stat'ed.
   */
  private async validateFile(filePath: string): Promise<void> {
    try {
      const stats = await fs.stat(filePath);

      if (!stats.isFile()) {
        throw new Error('Path is not a file');
      }

      if (stats.size === 0) {
        throw new Error('File is empty');
      }

      if (stats.size > MAX_FILE_SIZE_BYTES) {
        throw new Error('File too large (max 100MB)');
      }
    } catch (error) {
      throw new Error(`File validation failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Detect the document format.
   *
   * A recognised file extension wins; otherwise the first bytes of the file
   * are checked for known signatures.
   *
   * @returns The detected format, or `'unknown'` if nothing matches.
   */
  private async detectFormat(filePath: string): Promise<DocumentFormat> {
    const ext = path.extname(filePath).toLowerCase().slice(1);

    if (this.supportedFormats.has(ext)) {
      return ext as DocumentFormat;
    }

    // Fall back to sniffing the file header
    const buffer = Buffer.alloc(HEADER_SNIFF_BYTES);
    const file = await fs.open(filePath, 'r');

    try {
      await file.read(buffer, 0, HEADER_SNIFF_BYTES, 0);

      // PDF magic number
      if (buffer.toString('ascii', 0, 4) === '%PDF') {
        return 'pdf';
      }

      // Office documents (ZIP-based)
      if (buffer[0] === 0x50 && buffer[1] === 0x4B) {
        return 'docx'; // Could be docx, xlsx, pptx
      }

      // Legacy Office documents
      if (buffer[0] === 0xD0 && buffer[1] === 0xCF) {
        return 'doc'; // Could be doc, xls, ppt
      }
    } finally {
      await file.close();
    }

    return 'unknown';
  }

  /**
   * Build basic metadata from file-system stats, then merge in any detailed
   * metadata. A failure while extracting detailed metadata is logged and ignored.
   */
  private async extractMetadata(filePath: string, format: DocumentFormat): Promise<DocumentMetadata> {
    const stats = await fs.stat(filePath);
    const filename = path.basename(filePath);

    const metadata: DocumentMetadata = {
      filename,
      fileSize: stats.size,
      mimeType: this.getMimeType(format),
      format,
      createdDate: stats.birthtime.toISOString(),
      modifiedDate: stats.mtime.toISOString(),
      extractedAt: new Date().toISOString()
    };

    // Try to extract more detailed metadata
    try {
      const detailedMetadata = await this.extractDetailedMetadata(filePath, format);
      Object.assign(metadata, detailedMetadata);
    } catch (error) {
      this.logger.warn(`Failed to extract detailed metadata for ${filePath}:`, error);
    }

    return metadata;
  }

  /**
   * Pick the processor for a format. Formats without a dedicated processor
   * (including rtf/odt/ods/odp and `unknown`) get the generic one.
   */
  private getProcessor(format: DocumentFormat): DocumentFormatProcessor {
    switch (format) {
      case 'pdf':
        return new PDFProcessor();
      case 'docx':
      case 'doc':
        return new WordProcessor();
      case 'xlsx':
      case 'xls':
        return new ExcelProcessor();
      case 'pptx':
      case 'ppt':
        return new PowerPointProcessor();
      case 'txt':
        return new TextProcessor();
      default:
        return new GenericProcessor();
    }
  }

  /**
   * Add the requested alternative output formats to the extracted content.
   *
   * Returns a new object with its own `content`; the processor's result is
   * not modified.
   */
  private async postProcess(
    extractedContent: ExtractedContent,
    options: ProcessingOptions
  ): Promise<ExtractedContent> {
    // Copy `content` as well: a plain spread would share the nested object and
    // the assignments below would leak into the caller's extraction result.
    const result: ExtractedContent = {
      ...extractedContent,
      content: { ...extractedContent.content }
    };
    const requested = options.outputFormats ?? [];
    const text = extractedContent.content.text;

    // Generate the additional formats
    if (requested.includes('markdown')) {
      result.content.markdown = this.convertToMarkdown(text);
    }

    if (requested.includes('html')) {
      result.content.html = this.convertToHTML(text);
    }

    if (requested.includes('structured')) {
      result.content.structured = this.extractStructuredData(extractedContent);
    }

    return result;
  }

  /**
   * Score the document from four weighted factors: content length (max 0.3),
   * tables/images (max 0.3), metadata completeness (max 0.3) and format
   * support (0.2). The total is capped at 1.
   */
  private assessQuality(
    content: ExtractedContent,
    metadata: DocumentMetadata
  ): { score: number; factors: QualityFactor[] } {
    const factors: QualityFactor[] = [];
    let totalScore = 0;

    // Content length: full marks at 1000 characters or more
    const textLength = content.content.text.length;
    const lengthScore = Math.min(textLength / 1000, 1) * 0.3;
    factors.push({
      name: 'content_length',
      score: lengthScore,
      description: `Document has ${textLength} characters`
    });
    totalScore += lengthScore;

    // Structured content: 0.1 per table, 0.05 per image
    const tableCount = content.tables?.length ?? 0;
    const imageCount = content.images?.length ?? 0;
    const structureScore = Math.min(tableCount * 0.1 + imageCount * 0.05, 0.3);
    factors.push({
      name: 'structure',
      score: structureScore,
      description: `Document has ${tableCount} tables and ${imageCount} images`
    });
    totalScore += structureScore;

    // Metadata completeness: 0.1 each for title, author and subject
    const metadataScore =
      (metadata.title ? 0.1 : 0) +
      (metadata.author ? 0.1 : 0) +
      (metadata.subject ? 0.1 : 0);
    factors.push({
      name: 'metadata',
      score: metadataScore,
      description: 'Metadata completeness'
    });
    totalScore += metadataScore;

    // Format support
    const formatSupported = metadata.format !== 'unknown';
    const formatScore = formatSupported ? 0.2 : 0;
    factors.push({
      name: 'format_support',
      score: formatScore,
      // Do not claim support for a format we failed to identify.
      description: formatSupported
        ? `Format ${metadata.format} is supported`
        : `Format ${metadata.format} is not supported`
    });
    totalScore += formatScore;

    return {
      score: Math.min(totalScore, 1),
      factors
    };
  }

  /** Look up the MIME type for a format, defaulting to `application/octet-stream`. */
  private getMimeType(format: DocumentFormat): string {
    return MIME_TYPES[format] ?? 'application/octet-stream';
  }

  /**
   * Placeholder: currently returns no additional metadata.
   * A real implementation should use a format-specific library
   * (for example pdf-parse, mammoth, xlsx).
   */
  private async extractDetailedMetadata(
    filePath: string,
    format: DocumentFormat
  ): Promise<Partial<DocumentMetadata>> {
    return {};
  }

  /**
   * Placeholder text-to-Markdown conversion: returns the text unchanged.
   * (The previous implementation replaced "\n\n" with itself, which had no effect.)
   */
  private convertToMarkdown(text: string): string {
    return text;
  }

  /**
   * Simple text-to-HTML conversion: escapes HTML-significant characters,
   * turns newlines into `<br>` and wraps the result in a `<div>`.
   */
  private convertToHTML(text: string): string {
    // Escape first; document text is untrusted and must not inject markup.
    return `<div>${escapeHtml(text).replace(/\n/g, '<br>')}</div>`;
  }

  /** Bundle text, tables, images and links into one plain object. */
  private extractStructuredData(content: ExtractedContent): Record<string, unknown> {
    return {
      text: content.content.text,
      tables: content.tables ?? [],
      images: content.images ?? [],
      links: content.links ?? []
    };
  }
}

/**
 * Contract implemented by every format-specific processor.
 */
interface DocumentFormatProcessor {
  extract(filePath: string, options: ProcessingOptions): Promise<ExtractedContent>;
}

/**
 * Concrete processors (simplified versions).
 */

/** Placeholder PDF processor; should be backed by pdf-parse or a similar library. */
class PDFProcessor implements DocumentFormatProcessor {
  async extract(filePath: string, options: ProcessingOptions): Promise<ExtractedContent> {
    return {
      content: { text: 'PDF content placeholder' },
      pages: [],
      images: [],
      tables: [],
      links: []
    };
  }
}

/** Placeholder Word processor; should be backed by mammoth or a similar library. */
class WordProcessor implements DocumentFormatProcessor {
  async extract(filePath: string, options: ProcessingOptions): Promise<ExtractedContent> {
    return {
      content: { text: 'Word document content placeholder' },
      pages: [],
      images: [],
      tables: [],
      links: []
    };
  }
}

/** Placeholder Excel processor; should be backed by xlsx or a similar library. */
class ExcelProcessor implements DocumentFormatProcessor {
  async extract(filePath: string, options: ProcessingOptions): Promise<ExtractedContent> {
    return {
      content: { text: 'Excel content placeholder' },
      pages: [],
      images: [],
      tables: [],
      links: []
    };
  }
}

/** Placeholder PowerPoint processor; should be backed by a suitable library. */
class PowerPointProcessor implements DocumentFormatProcessor {
  async extract(filePath: string, options: ProcessingOptions): Promise<ExtractedContent> {
    return {
      content: { text: 'PowerPoint content placeholder' },
      pages: [],
      images: [],
      tables: [],
      links: []
    };
  }
}

/** Reads a plain-text file as UTF-8 and returns its contents as the document text. */
class TextProcessor implements DocumentFormatProcessor {
  async extract(filePath: string, options: ProcessingOptions): Promise<ExtractedContent> {
    const text = await fs.readFile(filePath, 'utf-8');
    return {
      content: { text },
      pages: [],
      images: [],
      tables: [],
      links: []
    };
  }
}

/** Fallback for formats without a dedicated processor; returns a fixed notice as text. */
class GenericProcessor implements DocumentFormatProcessor {
  async extract(filePath: string, options: ProcessingOptions): Promise<ExtractedContent> {
    return {
      content: { text: 'Unsupported format' },
      pages: [],
      images: [],
      tables: [],
      links: []
    };
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/flyanima/open-search-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.