Skip to main content
Glama
little2512
by little2512
server.js (28.5 kB)
#!/usr/bin/env node
/**
 * Word document reader MCP server (stdio transport).
 *
 * Exposes tools that read .docx files with mammoth, extract tables and
 * embedded images (with OCR), keep results in an on-disk cache and an
 * in-memory store, and search them through a simple inverted index.
 *
 * The same file is also used as the worker-thread entry point for chunk
 * processing, so everything that must only happen once per process (signal
 * handlers, connecting the stdio transport) is guarded by `isMainThread`.
 */
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
  CallToolRequestSchema,
  ListToolsRequestSchema,
  ReadResourceRequestSchema,
  ListResourcesRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import fs from 'fs-extra';
import path from 'path';
import crypto from 'crypto';
import mammoth from 'mammoth';
import Tesseract from 'tesseract.js';
import NodeCache from 'node-cache';
import sharp from 'sharp';
import { Worker, isMainThread, parentPort, workerData } from 'worker_threads';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const server = new Server(
  {
    name: "word-doc-reader",
    version: "2.0.0",
  },
  {
    capabilities: {
      tools: {},
      resources: {},
    },
  }
);

// In-memory document store, keyed by the caller-supplied memoryKey.
const documentCache = new NodeCache({
  stdTTL: 3600, // entries live for one hour
  checkperiod: 600, // expired entries are swept every ten minutes
  useClones: false
});

/**
 * Wraps a string in the single-text-item result shape returned by every tool.
 * @param {string} text
 * @returns {{content: Array<{type: string, text: string}>}}
 */
function toolText(text) {
  return { content: [{ type: "text", text }] };
}

/**
 * In-memory inverted index over document text (Chinese n-grams, lowercase
 * English words and digit runs).
 */
class DocumentIndexer {
  constructor() {
    this.index = new Map(); // word -> Set of documentIds
    this.documents = new Map(); // documentId -> document metadata
    this.lastUpdated = Date.now();
  }

  /**
   * Indexes `content` under `documentId` and stores `metadata` alongside the
   * extracted word count and the indexing timestamp.
   * @param {string} documentId
   * @param {string} content
   * @param {object} metadata
   */
  addDocument(documentId, content, metadata) {
    const words = this.extractWords(content);

    // Update the inverted index.
    for (const word of words) {
      let postings = this.index.get(word);
      if (!postings) {
        postings = new Set();
        this.index.set(word, postings);
      }
      postings.add(documentId);
    }

    this.documents.set(documentId, {
      ...metadata,
      wordCount: words.length,
      lastIndexed: Date.now()
    });
    this.lastUpdated = Date.now();
  }

  /**
   * Tokenizes text into index terms: every 2-, 3- and 4-character window of
   * each run of CJK characters, then lowercase ASCII-letter words, then digit
   * runs (in that order).
   * @param {string} text
   * @returns {string[]}
   */
  extractWords(text) {
    const source = String(text ?? '');
    const words = [];

    // Naive Chinese segmentation: sliding windows of length 2..4 over each run.
    for (const run of source.match(/[\u4e00-\u9fff]+/g) ?? []) {
      for (let i = 0; i < run.length; i++) {
        for (let len = 2; len <= 4 && i + len <= run.length; len++) {
          words.push(run.slice(i, i + len));
        }
      }
    }

    // Plain loops instead of push(...matches): spreading a very large array
    // into a call can exceed the engine's argument limit on big documents.
    for (const word of source.match(/[a-zA-Z]+/g) ?? []) {
      words.push(word.toLowerCase());
    }
    for (const digits of source.match(/\d+/g) ?? []) {
      words.push(digits);
    }

    return words;
  }

  /**
   * Scores each indexed document by the number of query terms it contains.
   * @param {string} query
   * @returns {Array<{documentId: string, score: number, document: object}>}
   *   Matches sorted by descending score.
   */
  search(query) {
    const queryWords = this.extractWords(String(query ?? '').toLowerCase());
    const documentScores = new Map();

    for (const word of queryWords) {
      const docs = this.index.get(word);
      if (!docs) continue;
      for (const docId of docs) {
        documentScores.set(docId, (documentScores.get(docId) || 0) + 1);
      }
    }

    // Sort by relevance. The original referenced an undefined `documentId`
    // here, which made every search with at least one hit throw.
    return Array.from(documentScores.entries())
      .sort((a, b) => b[1] - a[1])
      .map(([docId, score]) => ({
        documentId: docId,
        score,
        document: this.documents.get(docId)
      }));
  }

  /** Removes every indexed term and document. */
  clear() {
    this.index.clear();
    this.documents.clear();
    this.lastUpdated = Date.now();
  }

  /**
   * @returns {{totalWords: number, totalDocuments: number, lastUpdated: number}}
   */
  getStats() {
    return {
      totalWords: this.index.size,
      totalDocuments: this.documents.size,
      lastUpdated: this.lastUpdated
    };
  }
}

const documentIndexer = new DocumentIndexer();

/**
 * Splits a file into byte chunks and extracts text from each chunk in a
 * worker thread.
 */
class LargeDocumentProcessor {
  constructor() {
    this.maxFileSize = 10 * 1024 * 1024; // 10MB
    this.maxPages = 100;
  }

  /**
   * @param {string} filePath
   * @returns {Promise<boolean>} true when the file is larger than maxFileSize.
   */
  async isLargeDocument(filePath) {
    const stats = await fs.stat(filePath);
    return stats.size > this.maxFileSize;
  }

  /**
   * Reads the file, slices it into `options.chunkSize` byte chunks (1MB by
   * default), processes all chunks in parallel workers and merges the results.
   *
   * NOTE(review): a .docx file is a ZIP archive, so an arbitrary byte slice of
   * it is presumably not parseable by mammoth on its own — confirm with a real
   * >10MB document. A failed chunk rejects the whole call rather than being
   * merged as empty text.
   *
   * @param {string} filePath
   * @param {{chunkSize?: number}} [options]
   * @returns {Promise<object>} merged result, see mergeResults.
   */
  async processInChunks(filePath, options = {}) {
    const chunkSize = options.chunkSize || 1024 * 1024; // 1MB chunks
    const buffer = await fs.readFile(filePath);

    const chunks = [];
    for (let offset = 0; offset < buffer.length; offset += chunkSize) {
      chunks.push(buffer.subarray(offset, offset + chunkSize));
    }

    // Process all chunks in parallel; Promise.all keeps them in chunk order.
    const results = await Promise.all(
      chunks.map((chunk, index) => this.processChunk(chunk, index, options))
    );
    return this.mergeResults(results);
  }

  /**
   * Runs one chunk through a worker thread started from this same file.
   * @param {Buffer} chunk
   * @param {number} index
   * @param {object} options
   * @returns {Promise<{index: number, text: string, success: boolean}>}
   */
  processChunk(chunk, index, options) {
    return new Promise((resolve, reject) => {
      const worker = new Worker(__filename, {
        workerData: {
          type: 'process_chunk',
          chunk: chunk.toString('base64'),
          index,
          options
        }
      });

      worker.once('message', (message) => {
        if (message?.success === false) {
          reject(new Error(`文档分块 ${index} 处理失败: ${message.error}`));
        } else {
          resolve(message);
        }
      });
      worker.once('error', reject);
      // Covers a worker that dies without posting anything; this is a no-op
      // when the promise has already been settled by the handlers above.
      worker.once('exit', (code) => {
        if (code !== 0) {
          reject(new Error(`文档分块 ${index} 的工作线程异常退出 (code ${code})`));
        }
      });
    });
  }

  /**
   * Concatenates chunk texts and collects any tables/images into one result.
   * @param {Array<object>} results
   * @returns {{text: string, tables: Array, images: Array, metadata: object}}
   */
  mergeResults(results) {
    const merged = {
      text: '',
      tables: [],
      images: [],
      metadata: {
        chunksProcessed: results.length,
        totalLength: 0
      }
    };

    for (const result of results) {
      merged.text += result.text || '';
      if (result.tables) merged.tables.push(...result.tables);
      if (result.images) merged.images.push(...result.images);
      merged.metadata.totalLength += result.text?.length || 0;
    }

    return merged;
  }
}

const largeDocProcessor = new LargeDocumentProcessor();

/**
 * Extracts tables (via mammoth's HTML output) and embedded images (with OCR)
 * from a .docx buffer.
 */
class DocumentAnalyzer {
  constructor() {
    this.ocrWorker = null;
  }

  /**
   * Lazily creates the shared Tesseract worker for Simplified Chinese + English.
   * NOTE(review): loadLanguage/initialize are the older tesseract.js worker
   * API — confirm against the tesseract.js version pinned in package.json.
   */
  async initializeOCR() {
    if (!this.ocrWorker) {
      this.ocrWorker = await Tesseract.createWorker();
      await this.ocrWorker.loadLanguage('chi_sim+eng');
      await this.ocrWorker.initialize('chi_sim+eng');
    }
  }

  /**
   * Converts the document to HTML and parses every <table> element.
   * Best-effort: on failure the error is logged and [] is returned.
   * @param {Buffer} docxBuffer
   * @returns {Promise<Array<{rows: string[][], html: string}>>}
   */
  async extractTables(docxBuffer) {
    try {
      const result = await mammoth.convertToHtml({ buffer: docxBuffer });
      const html = result.value;

      // Simple regex-based table scan; tables without any row are dropped.
      const tables = [];
      const tableRegex = /<table[^>]*>([\s\S]*?)<\/table>/gi;
      let match;
      while ((match = tableRegex.exec(html)) !== null) {
        const tableData = this.parseTableHtml(match[0]);
        if (tableData.rows.length > 0) {
          tables.push(tableData);
        }
      }
      return tables;
    } catch (error) {
      console.error('表格提取错误:', error);
      return [];
    }
  }

  /**
   * Parses one <table> fragment into rows of plain-text cells.
   * @param {string} tableHtml
   * @returns {{rows: string[][], html: string}}
   */
  parseTableHtml(tableHtml) {
    const rows = [];
    const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
    let rowMatch;

    while ((rowMatch = rowRegex.exec(tableHtml)) !== null) {
      const cells = [];
      const cellRegex = /<(?:t[dh])(?:[^>]*)>([\s\S]*?)<\/t[dh]>/gi;
      let cellMatch;

      while ((cellMatch = cellRegex.exec(rowMatch[1])) !== null) {
        // Strip nested tags and collapse whitespace.
        const cellText = cellMatch[1]
          .replace(/<[^>]*>/g, '')
          .replace(/\s+/g, ' ')
          .trim();
        cells.push(cellText);
      }

      if (cells.length > 0) {
        rows.push(cells);
      }
    }

    return { rows, html: tableHtml };
  }

  /**
   * Unzips the .docx, writes every image under word/media/ to
   * `<outputDir>/temp_images` and runs OCR on it.
   * Best-effort: on failure the error is logged and [] is returned.
   * @param {Buffer} docxBuffer
   * @param {string} outputDir
   * @returns {Promise<Array<object>>} one descriptor per extracted image.
   */
  async extractImages(docxBuffer, outputDir) {
    try {
      const tempDir = path.join(outputDir, 'temp_images');
      await fs.ensureDir(tempDir);

      // A dynamic import yields the module namespace; loadAsync is on the
      // JSZip class itself, so prefer the default export when there is one.
      const jszipModule = await import('jszip');
      const JSZip = jszipModule.default ?? jszipModule;
      const zip = await JSZip.loadAsync(docxBuffer);

      const images = [];
      // Filter by path prefix: the previous `zip.folder('word/media').files`
      // iterated every entry, which presumably covered the whole archive.
      for (const [filename, file] of Object.entries(zip.files)) {
        if (file.dir || !filename.startsWith('word/media/') || !this.isImageFile(filename)) {
          continue;
        }

        // basename keeps the write inside tempDir whatever the entry name is.
        const imagePath = path.join(tempDir, path.basename(filename));
        const imageBuffer = await file.async('nodebuffer');
        await fs.writeFile(imagePath, imageBuffer);

        const ocrText = await this.performOCR(imageBuffer);

        images.push({
          filename,
          path: imagePath,
          size: imageBuffer.length,
          format: path.extname(filename).substring(1),
          ocrText: ocrText,
          extractedAt: new Date().toISOString()
        });
      }

      return images;
    } catch (error) {
      console.error('图片提取错误:', error);
      return [];
    }
  }

  /**
   * @param {string} filename
   * @returns {boolean} true when the extension is a supported image type.
   */
  isImageFile(filename) {
    const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'];
    const ext = path.extname(filename).toLowerCase();
    return imageExtensions.includes(ext);
  }

  /**
   * Pre-processes the image with sharp and returns the recognized text.
   * Best-effort: on failure the error is logged and '' is returned.
   * @param {Buffer} imageBuffer
   * @returns {Promise<string>}
   */
  async performOCR(imageBuffer) {
    try {
      await this.initializeOCR();

      // Downscale (never enlarge), sharpen and normalise before recognition.
      const processedImage = await sharp(imageBuffer)
        .resize({ width: 2000, withoutEnlargement: true })
        .sharpen()
        .normalise()
        .png()
        .toBuffer();

      const { data: { text } } = await this.ocrWorker.recognize(processedImage);
      return text.trim();
    } catch (error) {
      console.error('OCR处理错误:', error);
      return '';
    }
  }

  /** Terminates the OCR worker if one was created. */
  async cleanup() {
    if (this.ocrWorker) {
      await this.ocrWorker.terminate();
      this.ocrWorker = null;
    }
  }
}

const documentAnalyzer = new DocumentAnalyzer();

/**
 * On-disk JSON cache of analysis results, stored in `.cache` next to this
 * file and keyed by file path, mtime and size.
 */
class CacheManager {
  constructor() {
    this.cacheDir = path.join(__dirname, '.cache');
    this.metadataFile = path.join(this.cacheDir, 'metadata.json');
  }

  /** Ensures the cache directory and an (initially empty) metadata file exist. */
  async initialize() {
    await fs.ensureDir(this.cacheDir);
    if (!(await fs.pathExists(this.metadataFile))) {
      await fs.writeJson(this.metadataFile, {});
    }
  }

  /**
   * Derives the cache key from path + mtime + size, so a modified file gets a
   * new key. MD5 is used only as a fingerprint here.
   * @param {string} filePath
   * @returns {string} hex digest
   */
  getCacheKey(filePath) {
    const stats = fs.statSync(filePath);
    const keyData = `${filePath}-${stats.mtime.getTime()}-${stats.size}`;
    return crypto.createHash('md5').update(keyData).digest('hex');
  }

  /**
   * @param {string} filePath
   * @returns {Promise<object|null>} cached analysis result, or null on a miss.
   */
  async get(filePath) {
    await this.initialize();
    const cacheKey = this.getCacheKey(filePath);
    const cacheFile = path.join(this.cacheDir, `${cacheKey}.json`);

    const metadata = await fs.readJson(this.metadataFile);
    if (metadata[cacheKey] && await fs.pathExists(cacheFile)) {
      return await fs.readJson(cacheFile);
    }
    return null;
  }

  /**
   * Writes `data` to the cache and records it in the metadata file.
   * @param {string} filePath
   * @param {object} data
   */
  async set(filePath, data) {
    await this.initialize();
    const cacheKey = this.getCacheKey(filePath);
    const cacheFile = path.join(this.cacheDir, `${cacheKey}.json`);

    const metadata = await fs.readJson(this.metadataFile);
    await fs.writeJson(cacheFile, data);

    metadata[cacheKey] = {
      filePath,
      cachedAt: new Date().toISOString(),
      size: JSON.stringify(data).length
    };
    await fs.writeJson(this.metadataFile, metadata);
  }

  /** Deletes every cache entry and resets the metadata file. */
  async clear() {
    await this.initialize();
    const files = await fs.readdir(this.cacheDir);

    for (const file of files) {
      if (file !== 'metadata.json') {
        await fs.remove(path.join(this.cacheDir, file));
      }
    }
    await fs.writeJson(this.metadataFile, {});
  }

  /**
   * @returns {Promise<{totalCached: number, totalSize: number, files: object}>}
   */
  async getStats() {
    await this.initialize();
    const metadata = await fs.readJson(this.metadataFile);
    const keys = Object.keys(metadata);

    return {
      totalCached: keys.length,
      totalSize: keys.reduce((sum, key) => sum + (metadata[key].size || 0), 0),
      files: metadata
    };
  }
}

const cacheManager = new CacheManager();

// Worker-thread entry point. The fields are read straight from workerData,
// matching what LargeDocumentProcessor.processChunk sends (the original looked
// for a nested `data` object that was never provided).
if (!isMainThread && workerData?.type === 'process_chunk') {
  const { chunk, index } = workerData;

  mammoth.extractRawText({ buffer: Buffer.from(chunk, 'base64') })
    .then(result => {
      parentPort.postMessage({
        index,
        text: result.value,
        success: true
      });
    })
    .catch(error => {
      parentPort.postMessage({
        index,
        error: error.message,
        success: false
      });
    });
}

// Tool definitions.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  return {
    tools: [
      {
        name: "read_word_document",
        description: "增强版Word文档读取器,支持表格提取、图片OCR分析和缓存优化",
        inputSchema: {
          type: "object",
          properties: {
            filePath: {
              type: "string",
              description: "Word文档的文件路径"
            },
            memoryKey: {
              type: "string",
              description: "用于存储的内存键名,便于后续检索",
              default: "default"
            },
            documentType: {
              type: "string",
              description: "文档类型",
              enum: ["ui-component", "api-doc", "common-doc", "other"],
              default: "common-doc"
            },
            extractTables: {
              type: "boolean",
              description: "是否提取表格",
              default: true
            },
            extractImages: {
              type: "boolean",
              description: "是否提取图片并进行OCR分析",
              default: true
            },
            useCache: {
              type: "boolean",
              description: "是否使用缓存",
              default: true
            },
            outputDir: {
              type: "string",
              description: "图片和临时文件输出目录",
              default: "./output"
            }
          },
          required: ["filePath"]
        }
      },
      {
        name: "search_documents",
        description: "全文索引搜索,支持中英文混合搜索",
        inputSchema: {
          type: "object",
          properties: {
            query: {
              type: "string",
              description: "搜索关键词"
            },
            documentType: {
              type: "string",
              description: "限制搜索的文档类型",
              enum: ["ui-component", "api-doc", "common-doc", "other"]
            },
            limit: {
              type: "number",
              description: "返回结果数量限制",
              default: 10
            }
          },
          required: ["query"]
        }
      },
      {
        name: "get_cache_stats",
        description: "获取缓存统计信息",
        inputSchema: {
          type: "object",
          properties: {}
        }
      },
      {
        name: "clear_cache",
        description: "清空所有缓存",
        inputSchema: {
          type: "object",
          properties: {
            type: {
              type: "string",
              description: "清除类型:all, document, index",
              enum: ["all", "document", "index"],
              default: "all"
            }
          }
        }
      },
      {
        name: "list_stored_documents",
        description: "列出所有已存储的文档",
        inputSchema: {
          type: "object",
          properties: {
            documentType: {
              type: "string",
              description: "筛选特定类型的文档",
              enum: ["ui-component", "api-doc", "common-doc", "other"]
            }
          }
        }
      },
      {
        name: "get_stored_document",
        description: "获取已存储的文档内容",
        inputSchema: {
          type: "object",
          properties: {
            memoryKey: {
              type: "string",
              description: "要获取的文档内存键名"
            }
          },
          required: ["memoryKey"]
        }
      },
      {
        name: "clear_memory",
        description: "清除指定的内存内容",
        inputSchema: {
          type: "object",
          properties: {
            memoryKey: {
              type: "string",
              description: "要清除的内存键名,如果不提供则清除所有"
            }
          }
        }
      }
    ]
  };
});

// Tool execution. Diagnostics go to stderr only: on the stdio transport,
// stdout carries the JSON-RPC stream and must not receive log lines.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name } = request.params;
  // `arguments` is optional; default it so tools without required
  // parameters can still destructure their options.
  const args = request.params.arguments ?? {};

  try {
    switch (name) {
      case "read_word_document": {
        const {
          filePath,
          memoryKey = "default",
          documentType = "common-doc",
          extractTables = true,
          extractImages = true,
          useCache = true,
          outputDir = "./output"
        } = args;

        if (!fs.existsSync(filePath)) {
          throw new Error(`文件不存在: ${filePath}`);
        }

        // NOTE(review): .doc is accepted here, but mammoth is documented for
        // .docx input — confirm whether legacy .doc files can actually be read.
        const fileExt = path.extname(filePath).toLowerCase();
        if (fileExt !== '.docx' && fileExt !== '.doc') {
          throw new Error(`不支持的文件格式: ${fileExt}。仅支持 .docx 和 .doc 文件`);
        }

        // Try the on-disk cache first.
        // NOTE(review): the cache key ignores extractTables/extractImages, so
        // a result cached with extraction disabled is returned as-is later.
        let analysisResult = null;
        if (useCache) {
          analysisResult = await cacheManager.get(filePath);
          if (analysisResult) {
            console.error(`使用缓存数据: ${filePath}`);
          }
        }

        if (!analysisResult) {
          console.error(`分析文档: ${filePath}`);

          if (await largeDocProcessor.isLargeDocument(filePath)) {
            console.error('检测到大文档,使用并行处理');
            analysisResult = await largeDocProcessor.processInChunks(filePath, {
              extractTables,
              extractImages
            });
          } else {
            // Standard single-pass processing.
            const docxBuffer = await fs.readFile(filePath);
            const textResult = await mammoth.extractRawText({ buffer: docxBuffer });

            analysisResult = {
              text: textResult.value,
              tables: [],
              images: [],
              metadata: {
                filePath,
                documentType,
                processedAt: new Date().toISOString(),
                fileSize: docxBuffer.length
              }
            };

            if (extractTables) {
              analysisResult.tables = await documentAnalyzer.extractTables(docxBuffer);
            }

            if (extractImages) {
              await fs.ensureDir(outputDir);
              analysisResult.images = await documentAnalyzer.extractImages(docxBuffer, outputDir);
            }
          }

          if (useCache) {
            await cacheManager.set(filePath, analysisResult);
          }
        }

        // Each read gets a fresh index id.
        // NOTE(review): earlier index entries for the same memoryKey are not
        // removed, so repeated reads accumulate in search results.
        const documentId = `${memoryKey}_${Date.now()}`;

        // Index body text, table cells and OCR text together.
        const indexContent = [
          analysisResult.text,
          analysisResult.tables.map(t => t.rows.flat().join(' ')).join(' '),
          analysisResult.images.map(i => i.ocrText).join(' ')
        ].join(' ');

        documentIndexer.addDocument(documentId, indexContent, {
          memoryKey,
          filePath,
          documentType,
          tablesCount: analysisResult.tables.length,
          imagesCount: analysisResult.images.length
        });

        // Keep the full result in memory for the get/list tools.
        const documentData = {
          ...analysisResult,
          documentId,
          memoryKey,
          filePath,
          documentType,
          timestamp: new Date().toISOString()
        };
        documentCache.set(memoryKey, documentData);

        // Build the summary response.
        let responseText = `成功读取并分析Word文档:\n`;
        responseText += `文件路径: ${filePath}\n`;
        responseText += `文档类型: ${documentType}\n`;
        responseText += `内存键: ${memoryKey}\n`;
        responseText += `内容长度: ${analysisResult.text.length} 字符\n`;
        responseText += `表格数量: ${analysisResult.tables.length}\n`;
        responseText += `图片数量: ${analysisResult.images.length}\n`;

        if (analysisResult.tables.length > 0) {
          responseText += `\n表格预览:\n`;
          analysisResult.tables.slice(0, 2).forEach((table, index) => {
            responseText += `表格${index + 1}: ${table.rows.length}行 x ${table.rows[0]?.length || 0}列\n`;
            if (table.rows.length > 0) {
              responseText += `示例行: ${table.rows[0]?.slice(0, 3).join(' | ')}\n`;
            }
          });
        }

        if (analysisResult.images.length > 0) {
          responseText += `\n图片OCR结果预览:\n`;
          analysisResult.images.slice(0, 2).forEach((image, index) => {
            const ocrPreview = image.ocrText.substring(0, 100);
            responseText += `图片${index + 1} (${image.filename}): ${ocrPreview}${image.ocrText.length > 100 ? '...' : ''}\n`;
          });
        }

        responseText += `\n文本内容预览:\n${analysisResult.text.substring(0, 300)}...`;

        return toolText(responseText);
      }

      case "search_documents": {
        const { query, documentType, limit = 10 } = args;

        const searchResults = documentIndexer.search(query);

        // Optional filter by document type.
        const filteredResults = documentType
          ? searchResults.filter(result => result.document.documentType === documentType)
          : searchResults;

        const limitedResults = filteredResults.slice(0, limit);

        if (limitedResults.length === 0) {
          return toolText(`未找到包含关键词 "${query}" 的文档`);
        }

        const resultsText = limitedResults.map((result, index) => {
          const doc = result.document;
          return `${index + 1}. 相关度: ${result.score}\n 内存键: ${doc.memoryKey}\n 文件: ${doc.filePath}\n 类型: ${doc.documentType}\n 表格数: ${doc.tablesCount}\n 图片数: ${doc.imagesCount}\n 最后索引: ${new Date(doc.lastIndexed).toLocaleString()}`;
        }).join('\n\n');

        return toolText(
          `搜索结果 "${query}" (找到 ${limitedResults.length} 个匹配,共 ${filteredResults.length} 个):\n\n${resultsText}`
        );
      }

      case "get_cache_stats": {
        const documentStats = await cacheManager.getStats();
        const indexStats = documentIndexer.getStats();

        const statsText = `缓存统计信息:\n\n` +
          `文档缓存:\n` +
          `- 缓存文件数: ${documentStats.totalCached}\n` +
          `- 总缓存大小: ${(documentStats.totalSize / 1024 / 1024).toFixed(2)} MB\n\n` +
          `全文索引:\n` +
          `- 索引词汇数: ${indexStats.totalWords}\n` +
          `- 索引文档数: ${indexStats.totalDocuments}\n` +
          `- 最后更新: ${new Date(indexStats.lastUpdated).toLocaleString()}`;

        return toolText(statsText);
      }

      case "clear_cache": {
        const { type = "all" } = args;

        let clearedMessage;
        switch (type) {
          case "document":
            await cacheManager.clear();
            clearedMessage = "已清空文档缓存";
            break;
          case "index":
            documentIndexer.clear();
            clearedMessage = "已清空全文索引";
            break;
          case "all":
          default:
            await cacheManager.clear();
            documentIndexer.clear();
            documentCache.flushAll();
            clearedMessage = "已清空所有缓存(文档缓存、全文索引、内存缓存)";
            break;
        }

        return toolText(clearedMessage);
      }

      case "list_stored_documents": {
        const { documentType } = args;
        const docs = [];

        for (const key of documentCache.keys()) {
          const doc = documentCache.get(key);
          // An entry can expire between keys() and get(); skip it.
          if (!doc) continue;
          if (!documentType || doc.documentType === documentType) {
            docs.push(doc);
          }
        }

        const docList = docs.map(doc =>
          `- 内存键: ${doc.memoryKey}\n 文件路径: ${doc.filePath}\n 文档类型: ${doc.documentType}\n 存储时间: ${doc.timestamp}\n 内容长度: ${doc.text?.length || 0} 字符\n 表格数: ${doc.tables?.length || 0}\n 图片数: ${doc.images?.length || 0}`
        ).join('\n\n');

        return toolText(`已存储的文档 (${docs.length} 个):\n\n${docList || "暂无存储的文档"}`);
      }

      case "get_stored_document": {
        const { memoryKey } = args;
        const doc = documentCache.get(memoryKey);

        if (!doc) {
          throw new Error(`未找到内存键为 "${memoryKey}" 的文档`);
        }

        let responseText = `文档内容 (内存键: ${memoryKey}):\n\n`;
        responseText += `文件路径: ${doc.filePath}\n`;
        responseText += `文档类型: ${doc.documentType}\n`;
        responseText += `处理时间: ${doc.timestamp}\n\n`;

        // Body text.
        if (doc.text) {
          responseText += `【文本内容】\n${doc.text}\n\n`;
        }

        // Tables, one line per row.
        if (doc.tables && doc.tables.length > 0) {
          responseText += `【表格内容】(${doc.tables.length} 个)\n`;
          doc.tables.forEach((table, index) => {
            responseText += `\n表格${index + 1}:\n`;
            table.rows.forEach((row, rowIndex) => {
              responseText += `行${rowIndex + 1}: ${row.join(' | ')}\n`;
            });
          });
        }

        // OCR text of embedded images.
        if (doc.images && doc.images.length > 0) {
          responseText += `【图片OCR内容】(${doc.images.length} 个)\n`;
          doc.images.forEach((image, index) => {
            responseText += `\n图片${index + 1} (${image.filename}):\n${image.ocrText}\n`;
          });
        }

        return toolText(responseText);
      }

      case "clear_memory": {
        const { memoryKey } = args;

        if (memoryKey) {
          const removed = documentCache.del(memoryKey);
          if (removed > 0) {
            return toolText(`已清除内存键 "${memoryKey}" 的内容`);
          }
          throw new Error(`未找到内存键 "${memoryKey}" 的内容`);
        }

        // No key given: drop everything.
        const count = documentCache.keys().length;
        documentCache.flushAll();
        return toolText(`已清除所有内存内容 (共 ${count} 个文档)`);
      }

      default:
        throw new Error(`未知工具: ${name}`);
    }
  } catch (error) {
    console.error('工具执行错误:', error);
    return {
      content: [
        {
          type: "text",
          text: `错误: ${error.message}`
        }
      ],
      isError: true
    };
  }
});

/** Connects the server to the stdio transport. */
async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error("Word Document Reader MCP server running on stdio");
}

/** Graceful shutdown: release the OCR worker, then exit even if that fails. */
async function shutdown() {
  console.error('\n正在关闭服务器...');
  try {
    await documentAnalyzer.cleanup();
  } finally {
    process.exit(0);
  }
}

// Exported for tests.
export { DocumentIndexer, CacheManager, LargeDocumentProcessor, DocumentAnalyzer };

// Worker threads re-execute this file; only the main thread may own the
// signal handlers and the stdio transport.
if (isMainThread) {
  process.on('SIGINT', shutdown);
  process.on('SIGTERM', shutdown);

  main().catch((error) => {
    console.error("服务器启动失败:", error);
    process.exit(1);
  });
}

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/little2512/word-doc-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.