Word Document Reader MCP Server

Overview Inspect Schema Related Servers Score Discussions

word-doc-mcp

server.js•28.5 kB

#!/usr/bin/env node
/**
 * Word Document Reader MCP server.
 *
 * Exposes tools over the MCP stdio transport for reading Word documents
 * (text, tables, embedded images with OCR), caching the analysis on disk,
 * keeping the latest result per "memory key" in memory, and searching an
 * in-memory inverted index of everything that has been read.
 *
 * The same file doubles as the worker-thread entry point used by
 * LargeDocumentProcessor (see the `!isMainThread` section below).
 */
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
  CallToolRequestSchema,
  ListToolsRequestSchema,
  ReadResourceRequestSchema,
  ListResourcesRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import fs from 'fs-extra';
import path from 'path';
import crypto from 'crypto';
import mammoth from 'mammoth';
import Tesseract from 'tesseract.js';
import NodeCache from 'node-cache';
import sharp from 'sharp';
import { Worker, isMainThread, parentPort, workerData } from 'worker_threads';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const server = new Server(
  {
    name: "word-doc-reader",
    version: "2.0.0",
  },
  {
    capabilities: {
      tools: {},
      resources: {},
    },
  }
);

// In-memory cache configuration
const documentCache = new NodeCache({
  stdTTL: 3600, // entries live for one hour
  checkperiod: 600, // sweep expired entries every 10 minutes
  useClones: false
});

/**
 * In-memory full-text index: an inverted index from token to the set of
 * document ids containing it, plus per-document metadata.
 */
class DocumentIndexer {
  constructor() {
    this.index = new Map(); // word -> Set of documentIds
    this.documents = new Map(); // documentId -> document metadata
    this.lastUpdated = Date.now();
  }

  /**
   * Add a document to the index.
   *
   * @param {string} documentId - Unique id for this indexed document.
   * @param {string} content - Text to tokenize and index.
   * @param {object} metadata - Stored alongside `wordCount` and `lastIndexed`.
   */
  addDocument(documentId, content, metadata) {
    const words = this.extractWords(content);

    // Update the inverted index
    for (const word of words) {
      let postings = this.index.get(word);
      if (!postings) {
        postings = new Set();
        this.index.set(word, postings);
      }
      postings.add(documentId);
    }

    // Store document metadata
    this.documents.set(documentId, {
      ...metadata,
      wordCount: words.length,
      lastIndexed: Date.now()
    });
    this.lastUpdated = Date.now();
  }

  /**
   * Tokenize mixed Chinese/English text.
   *
   * Chinese runs are expanded into every 2-, 3- and 4-character substring
   * (a naive n-gram segmentation); English words are lower-cased; digit runs
   * are kept verbatim.
   *
   * @param {string} text
   * @returns {string[]} Tokens (may contain duplicates).
   */
  extractWords(text) {
    const chinesePattern = /[\u4e00-\u9fff]+/g;
    const englishPattern = /[a-zA-Z]+/g;
    const numberPattern = /\d+/g;
    const words = [];

    // Chinese: simple n-gram segmentation (could be replaced by a real segmenter)
    const chineseMatches = text.match(chinesePattern) || [];
    for (const run of chineseMatches) {
      for (let i = 0; i < run.length; i++) {
        for (let len = 2; len <= 4 && i + len <= run.length; len++) {
          // slice() replaces the deprecated substr(); same substring.
          words.push(run.slice(i, i + len));
        }
      }
    }

    // English words
    const englishMatches = text.match(englishPattern) || [];
    words.push(...englishMatches.map(word => word.toLowerCase()));

    // Numbers
    const numberMatches = text.match(numberPattern) || [];
    words.push(...numberMatches);

    return words.filter(word => word.length > 0);
  }

  /**
   * Search the index. A document's score is the number of query tokens
   * (counting repeats) that appear in it.
   *
   * @param {string} query
   * @returns {{documentId: string, score: number, document: object}[]}
   *   Matches sorted by descending score.
   */
  search(query) {
    const queryWords = this.extractWords(query.toLowerCase());
    const documentScores = new Map();

    for (const word of queryWords) {
      const docs = this.index.get(word);
      if (!docs) continue;
      for (const docId of docs) {
        documentScores.set(docId, (documentScores.get(docId) ?? 0) + 1);
      }
    }

    // Sort by relevance. The property must be spelled out: a bare
    // `documentId` shorthand would reference an undefined variable here.
    return Array.from(documentScores.entries())
      .sort((a, b) => b[1] - a[1])
      .map(([docId, score]) => ({
        documentId: docId,
        score,
        document: this.documents.get(docId)
      }));
  }

  /** Drop every indexed token and document. */
  clear() {
    this.index.clear();
    this.documents.clear();
    this.lastUpdated = Date.now();
  }

  /**
   * @returns {{totalWords: number, totalDocuments: number, lastUpdated: number}}
   */
  getStats() {
    return {
      totalWords: this.index.size,
      totalDocuments: this.documents.size,
      lastUpdated: this.lastUpdated
    };
  }
}

const documentIndexer = new DocumentIndexer();

/**
 * Handles documents above a size threshold by splitting the file into byte
 * chunks and handing each chunk to a worker thread.
 *
 * NOTE(review): a .docx file is a zip archive, so byte-sliced chunks are
 * presumably not parseable by mammoth on their own; failed chunks contribute
 * empty text in mergeResults(). Confirm whether this path ever yields text.
 */
class LargeDocumentProcessor {
  constructor() {
    this.maxFileSize = 10 * 1024 * 1024; // 10MB
    this.maxPages = 100;
  }

  /**
   * @param {string} filePath
   * @returns {Promise<boolean>} True when the file is larger than `maxFileSize`.
   */
  async isLargeDocument(filePath) {
    const stats = await fs.stat(filePath);
    return stats.size > this.maxFileSize;
  }

  /**
   * Read the file, split it into `options.chunkSize`-byte slices (1 MB by
   * default), process all slices concurrently and merge the results.
   *
   * @param {string} filePath
   * @param {object} [options]
   * @returns {Promise<object>} Merged result, see mergeResults().
   */
  async processInChunks(filePath, options = {}) {
    const chunkSize = options.chunkSize || 1024 * 1024; // 1MB chunks
    const buffer = await fs.readFile(filePath);
    const chunks = [];

    for (let i = 0; i < buffer.length; i += chunkSize) {
      chunks.push(buffer.slice(i, i + chunkSize));
    }

    // Process chunks in parallel; Promise.all keeps them in chunk order.
    const results = await Promise.all(
      chunks.map((chunk, index) => this.processChunk(chunk, index, options))
    );

    return this.mergeResults(results);
  }

  /**
   * Process one chunk in a worker thread running this same file.
   *
   * @param {Buffer} chunk
   * @param {number} index - Position of the chunk in the file.
   * @param {object} options
   * @returns {Promise<object>} The message posted back by the worker.
   */
  processChunk(chunk, index, options) {
    return new Promise((resolve, reject) => {
      const worker = new Worker(__filename, {
        workerData: {
          type: 'process_chunk',
          chunk: chunk.toString('base64'),
          index,
          options
        }
      });

      worker.once('message', (result) => {
        resolve(result);
        // One message per worker: release the thread as soon as it answered.
        void worker.terminate();
      });
      worker.once('error', reject);
      // Without this the promise would hang forever if the worker dies
      // before posting a message. A no-op once the promise has settled.
      worker.once('exit', (code) => {
        if (code !== 0) {
          reject(new Error(`Worker 异常退出，退出码: ${code}`));
        }
      });
    });
  }

  /**
   * Concatenate per-chunk results into a single analysis object.
   *
   * @param {object[]} results
   * @returns {{text: string, tables: object[], images: object[], metadata: object}}
   */
  mergeResults(results) {
    const merged = {
      text: '',
      tables: [],
      images: [],
      metadata: {
        chunksProcessed: results.length,
        totalLength: 0
      }
    };

    for (const result of results) {
      merged.text += result.text || '';
      if (result.tables) merged.tables.push(...result.tables);
      if (result.images) merged.images.push(...result.images);
      merged.metadata.totalLength += result.text?.length || 0;
    }

    return merged;
  }
}

const largeDocProcessor = new LargeDocumentProcessor();

/**
 * Extracts tables (via mammoth's HTML output) and embedded images (via the
 * docx zip container) and runs OCR on the images.
 */
class DocumentAnalyzer {
  constructor() {
    this.ocrWorker = null;
  }

  /**
   * Lazily create the shared Tesseract worker for Simplified Chinese + English.
   *
   * NOTE(review): loadLanguage()/initialize() belong to older tesseract.js
   * worker APIs — confirm against the installed version.
   */
  async initializeOCR() {
    if (!this.ocrWorker) {
      this.ocrWorker = await Tesseract.createWorker();
      await this.ocrWorker.loadLanguage('chi_sim+eng');
      await this.ocrWorker.initialize('chi_sim+eng');
    }
  }

  /**
   * Convert the document to HTML and parse every <table> element.
   *
   * @param {Buffer} docxBuffer
   * @returns {Promise<{rows: string[][], html: string}[]>} Non-empty tables;
   *   an empty array when extraction fails (the error is logged).
   */
  async extractTables(docxBuffer) {
    try {
      const result = await mammoth.convertToHtml({ buffer: docxBuffer });
      const html = result.value;

      // Simple regex-based HTML table parsing
      const tables = [];
      const tableRegex = /<table[^>]*>([\s\S]*?)<\/table>/gi;
      let match;

      while ((match = tableRegex.exec(html)) !== null) {
        const tableHtml = match[0];
        const tableData = this.parseTableHtml(tableHtml);
        if (tableData.rows.length > 0) {
          tables.push(tableData);
        }
      }

      return tables;
    } catch (error) {
      console.error('表格提取错误:', error);
      return [];
    }
  }

  /**
   * Parse one <table> HTML fragment into rows of plain-text cells.
   *
   * @param {string} tableHtml
   * @returns {{rows: string[][], html: string}}
   */
  parseTableHtml(tableHtml) {
    const rows = [];
    const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
    let rowMatch;

    while ((rowMatch = rowRegex.exec(tableHtml)) !== null) {
      const cells = [];
      const cellRegex = /<(?:t[dh])(?:[^>]*)>([\s\S]*?)<\/t[dh]>/gi;
      let cellMatch;

      while ((cellMatch = cellRegex.exec(rowMatch[1])) !== null) {
        // Strip nested tags and collapse whitespace
        const cellText = cellMatch[1]
          .replace(/<[^>]*>/g, '')
          .replace(/\s+/g, ' ')
          .trim();
        cells.push(cellText);
      }

      if (cells.length > 0) {
        rows.push(cells);
      }
    }

    return { rows, html: tableHtml };
  }

  /**
   * Write every image stored under `word/media/` in the docx archive to
   * `<outputDir>/temp_images` and OCR it.
   *
   * @param {Buffer} docxBuffer
   * @param {string} outputDir
   * @returns {Promise<object[]>} One record per image; an empty array when
   *   extraction fails (the error is logged).
   */
  async extractImages(docxBuffer, outputDir) {
    try {
      // Create the temporary directory
      const tempDir = path.join(outputDir, 'temp_images');
      await fs.ensureDir(tempDir);

      // A docx file is a zip archive; open it with jszip. A dynamic import
      // of a CommonJS package exposes the class as the `default` export.
      const jszipModule = await import('jszip');
      const JSZip = jszipModule.default ?? jszipModule;
      const zip = await JSZip.loadAsync(docxBuffer);

      // `zip.files` is keyed by full archive path, so filter on the media
      // folder prefix explicitly.
      const mediaEntries = Object.values(zip.files).filter(
        (entry) =>
          !entry.dir &&
          entry.name.startsWith('word/media/') &&
          this.isImageFile(entry.name)
      );

      const images = [];
      for (const entry of mediaEntries) {
        // Save the image
        const imagePath = path.join(tempDir, path.basename(entry.name));
        const imageBuffer = await entry.async('nodebuffer');
        await fs.writeFile(imagePath, imageBuffer);

        // Run OCR
        const ocrText = await this.performOCR(imageBuffer);

        images.push({
          filename: entry.name,
          path: imagePath,
          size: imageBuffer.length,
          format: path.extname(entry.name).substring(1),
          ocrText: ocrText,
          extractedAt: new Date().toISOString()
        });
      }

      return images;
    } catch (error) {
      console.error('图片提取错误:', error);
      return [];
    }
  }

  /**
   * @param {string} filename
   * @returns {boolean} True for the supported image extensions (case-insensitive).
   */
  isImageFile(filename) {
    const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'];
    const ext = path.extname(filename).toLowerCase();
    return imageExtensions.includes(ext);
  }

  /**
   * Pre-process an image with sharp and recognise its text.
   *
   * @param {Buffer} imageBuffer
   * @returns {Promise<string>} Trimmed OCR text, or '' when OCR fails
   *   (the error is logged).
   */
  async performOCR(imageBuffer) {
    try {
      await this.initializeOCR();

      // Image pre-processing
      const processedImage = await sharp(imageBuffer)
        .resize({ width: 2000, withoutEnlargement: true })
        .sharpen()
        .normalise()
        .png()
        .toBuffer();

      const { data: { text } } = await this.ocrWorker.recognize(processedImage);
      return text.trim();
    } catch (error) {
      console.error('OCR处理错误:', error);
      return '';
    }
  }

  /** Terminate the OCR worker, if one was created. */
  async cleanup() {
    if (this.ocrWorker) {
      await this.ocrWorker.terminate();
      this.ocrWorker = null;
    }
  }
}

const documentAnalyzer = new DocumentAnalyzer();

/**
 * On-disk cache of analysis results, stored as JSON files in `.cache` next
 * to this script and tracked in `metadata.json`.
 */
class CacheManager {
  constructor() {
    this.cacheDir = path.join(__dirname, '.cache');
    this.metadataFile = path.join(this.cacheDir, 'metadata.json');
  }

  /** Ensure the cache directory and metadata file exist. */
  async initialize() {
    await fs.ensureDir(this.cacheDir);
    if (!(await fs.pathExists(this.metadataFile))) {
      await fs.writeJson(this.metadataFile, {});
    }
  }

  /**
   * Derive the cache key from path, mtime and size, so that a modified file
   * gets a different key.
   *
   * @param {string} filePath
   * @returns {string} Hex MD5 digest (used as an identifier, not for security).
   */
  getCacheKey(filePath) {
    const stats = fs.statSync(filePath);
    const keyData = `${filePath}-${stats.mtime.getTime()}-${stats.size}`;
    return crypto.createHash('md5').update(keyData).digest('hex');
  }

  /**
   * @param {string} filePath
   * @returns {Promise<object|null>} Cached analysis, or null on a miss.
   */
  async get(filePath) {
    await this.initialize();
    const cacheKey = this.getCacheKey(filePath);
    const cacheFile = path.join(this.cacheDir, `${cacheKey}.json`);
    const metadata = await fs.readJson(this.metadataFile);

    if (metadata[cacheKey] && await fs.pathExists(cacheFile)) {
      return await fs.readJson(cacheFile);
    }
    return null;
  }

  /**
   * Store an analysis result and record it in the metadata file.
   *
   * @param {string} filePath
   * @param {object} data - JSON-serialisable analysis result.
   */
  async set(filePath, data) {
    await this.initialize();
    const cacheKey = this.getCacheKey(filePath);
    const cacheFile = path.join(this.cacheDir, `${cacheKey}.json`);
    const metadata = await fs.readJson(this.metadataFile);

    await fs.writeJson(cacheFile, data);
    metadata[cacheKey] = {
      filePath,
      cachedAt: new Date().toISOString(),
      size: JSON.stringify(data).length
    };
    await fs.writeJson(this.metadataFile, metadata);
  }

  /** Remove every cached file and reset the metadata file. */
  async clear() {
    await this.initialize();
    const files = await fs.readdir(this.cacheDir);
    for (const file of files) {
      if (file !== 'metadata.json') {
        await fs.remove(path.join(this.cacheDir, file));
      }
    }
    await fs.writeJson(this.metadataFile, {});
  }

  /**
   * @returns {Promise<{totalCached: number, totalSize: number, files: object}>}
   */
  async getStats() {
    await this.initialize();
    const metadata = await fs.readJson(this.metadataFile);
    const keys = Object.keys(metadata);
    return {
      totalCached: keys.length,
      totalSize: keys.reduce((sum, key) => sum + (metadata[key].size || 0), 0),
      files: metadata
    };
  }
}

const cacheManager = new CacheManager();

// Worker-thread entry point. processChunk() passes
// { type, chunk, index, options } directly as workerData, so the fields are
// read from workerData itself (there is no nested `data` object).
if (!isMainThread) {
  const { type, chunk, index } = workerData ?? {};
  if (type === 'process_chunk') {
    mammoth.extractRawText({ buffer: Buffer.from(chunk, 'base64') })
      .then(result => {
        parentPort.postMessage({
          index,
          text: result.value,
          success: true
        });
      })
      .catch(error => {
        parentPort.postMessage({
          index,
          error: error.message,
          success: false
        });
      });
  }
}

// Tool definitions
server.setRequestHandler(ListToolsRequestSchema, async () => {
  return {
    tools: [
      {
        name: "read_word_document",
        description: "增强版Word文档读取器，支持表格提取、图片OCR分析和缓存优化",
        inputSchema: {
          type: "object",
          properties: {
            filePath: {
              type: "string",
              description: "Word文档的文件路径"
            },
            memoryKey: {
              type: "string",
              description: "用于存储的内存键名，便于后续检索",
              default: "default"
            },
            documentType: {
              type: "string",
              description: "文档类型",
              enum: ["ui-component", "api-doc", "common-doc", "other"],
              default: "common-doc"
            },
            extractTables: {
              type: "boolean",
              description: "是否提取表格",
              default: true
            },
            extractImages: {
              type: "boolean",
              description: "是否提取图片并进行OCR分析",
              default: true
            },
            useCache: {
              type: "boolean",
              description: "是否使用缓存",
              default: true
            },
            outputDir: {
              type: "string",
              description: "图片和临时文件输出目录",
              default: "./output"
            }
          },
          required: ["filePath"]
        }
      },
      {
        name: "search_documents",
        description: "全文索引搜索，支持中英文混合搜索",
        inputSchema: {
          type: "object",
          properties: {
            query: {
              type: "string",
              description: "搜索关键词"
            },
            documentType: {
              type: "string",
              description: "限制搜索的文档类型",
              enum: ["ui-component", "api-doc", "common-doc", "other"]
            },
            limit: {
              type: "number",
              description: "返回结果数量限制",
              default: 10
            }
          },
          required: ["query"]
        }
      },
      {
        name: "get_cache_stats",
        description: "获取缓存统计信息",
        inputSchema: {
          type: "object",
          properties: {}
        }
      },
      {
        name: "clear_cache",
        description: "清空所有缓存",
        inputSchema: {
          type: "object",
          properties: {
            type: {
              type: "string",
              description: "清除类型：all, document, index",
              enum: ["all", "document", "index"],
              default: "all"
            }
          }
        }
      },
      {
        name: "list_stored_documents",
        description: "列出所有已存储的文档",
        inputSchema: {
          type: "object",
          properties: {
            documentType: {
              type: "string",
              description: "筛选特定类型的文档",
              enum: ["ui-component", "api-doc", "common-doc", "other"]
            }
          }
        }
      },
      {
        name: "get_stored_document",
        description: "获取已存储的文档内容",
        inputSchema: {
          type: "object",
          properties: {
            memoryKey: {
              type: "string",
              description: "要获取的文档内存键名"
            }
          },
          required: ["memoryKey"]
        }
      },
      {
        name: "clear_memory",
        description: "清除指定的内存内容",
        inputSchema: {
          type: "object",
          properties: {
            memoryKey: {
              type: "string",
              description: "要清除的内存键名，如果不提供则清除所有"
            }
          }
        }
      }
    ]
  };
});

// Tool execution
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  // `arguments` is optional in a tools/call request; default to {} so tools
  // without required parameters can be called with no arguments at all.
  const { name, arguments: args = {} } = request.params;

  try {
    switch (name) {
      case "read_word_document": {
        const {
          filePath,
          memoryKey = "default",
          documentType = "common-doc",
          extractTables = true,
          extractImages = true,
          useCache = true,
          outputDir = "./output"
        } = args;

        if (!fs.existsSync(filePath)) {
          throw new Error(`文件不存在: ${filePath}`);
        }

        const fileExt = path.extname(filePath).toLowerCase();
        if (fileExt !== '.docx' && fileExt !== '.doc') {
          throw new Error(`不支持的文件格式: ${fileExt}。仅支持 .docx 和 .doc 文件`);
        }

        // Check the on-disk cache.
        // Diagnostics go to stderr: with the stdio transport, stdout carries
        // the JSON-RPC stream and must not receive log output.
        let analysisResult = null;
        if (useCache) {
          analysisResult = await cacheManager.get(filePath);
          if (analysisResult) {
            console.error(`使用缓存数据: ${filePath}`);
          }
        }

        if (!analysisResult) {
          console.error(`分析文档: ${filePath}`);

          // Read the file
          const docxBuffer = await fs.readFile(filePath);

          // Large documents take the chunked path
          if (await largeDocProcessor.isLargeDocument(filePath)) {
            console.error('检测到大文档，使用并行处理');
            analysisResult = await largeDocProcessor.processInChunks(filePath, {
              extractTables,
              extractImages
            });
          } else {
            // Standard processing
            const textResult = await mammoth.extractRawText({ buffer: docxBuffer });
            analysisResult = {
              text: textResult.value,
              tables: [],
              images: [],
              metadata: {
                filePath,
                documentType,
                processedAt: new Date().toISOString(),
                fileSize: docxBuffer.length
              }
            };

            // Extract tables
            if (extractTables) {
              analysisResult.tables = await documentAnalyzer.extractTables(docxBuffer);
            }

            // Extract images
            if (extractImages) {
              await fs.ensureDir(outputDir);
              analysisResult.images = await documentAnalyzer.extractImages(docxBuffer, outputDir);
            }
          }

          // Cache the result
          if (useCache) {
            await cacheManager.set(filePath, analysisResult);
          }
        }

        // Document id used by the full-text index
        const documentId = `${memoryKey}_${Date.now()}`;

        // Update the full-text index with text, table cells and OCR text
        const indexContent = analysisResult.text + ' ' +
          analysisResult.tables.map(t => t.rows.join(' ')).join(' ') + ' ' +
          analysisResult.images.map(i => i.ocrText).join(' ');

        documentIndexer.addDocument(documentId, indexContent, {
          memoryKey,
          filePath,
          documentType,
          tablesCount: analysisResult.tables.length,
          imagesCount: analysisResult.images.length
        });

        // Keep the result in memory under the caller's key
        const documentData = {
          ...analysisResult,
          documentId,
          memoryKey,
          filePath,
          documentType,
          timestamp: new Date().toISOString()
        };
        documentCache.set(memoryKey, documentData);

        // Build the response
        let responseText = `成功读取并分析Word文档:\n`;
        responseText += `文件路径: ${filePath}\n`;
        responseText += `文档类型: ${documentType}\n`;
        responseText += `内存键: ${memoryKey}\n`;
        responseText += `内容长度: ${analysisResult.text.length} 字符\n`;
        responseText += `表格数量: ${analysisResult.tables.length}\n`;
        responseText += `图片数量: ${analysisResult.images.length}\n`;

        if (analysisResult.tables.length > 0) {
          responseText += `\n表格预览:\n`;
          analysisResult.tables.slice(0, 2).forEach((table, index) => {
            responseText += `表格${index + 1}: ${table.rows.length}行 x ${table.rows[0]?.length || 0}列\n`;
            if (table.rows.length > 0) {
              responseText += `示例行: ${table.rows[0]?.slice(0, 3).join(' | ')}\n`;
            }
          });
        }

        if (analysisResult.images.length > 0) {
          responseText += `\n图片OCR结果预览:\n`;
          analysisResult.images.slice(0, 2).forEach((image, index) => {
            const ocrPreview = image.ocrText.substring(0, 100);
            responseText += `图片${index + 1} (${image.filename}): ${ocrPreview}${image.ocrText.length > 100 ? '...' : ''}\n`;
          });
        }

        responseText += `\n文本内容预览:\n${analysisResult.text.substring(0, 300)}...`;

        return {
          content: [
            {
              type: "text",
              text: responseText
            }
          ]
        };
      }

      case "search_documents": {
        const { query, documentType, limit = 10 } = args;
        const searchResults = documentIndexer.search(query);

        // Filter by document type
        const filteredResults = documentType
          ? searchResults.filter(result => result.document.documentType === documentType)
          : searchResults;

        const limitedResults = filteredResults.slice(0, limit);

        if (limitedResults.length === 0) {
          return {
            content: [
              {
                type: "text",
                text: `未找到包含关键词 "${query}" 的文档`
              }
            ]
          };
        }

        const resultsText = limitedResults.map((result, index) => {
          const doc = result.document;
          return `${index + 1}. 相关度: ${result.score}\n 内存键: ${doc.memoryKey}\n 文件: ${doc.filePath}\n 类型: ${doc.documentType}\n 表格数: ${doc.tablesCount}\n 图片数: ${doc.imagesCount}\n 最后索引: ${new Date(doc.lastIndexed).toLocaleString()}`;
        }).join('\n\n');

        return {
          content: [
            {
              type: "text",
              text: `搜索结果 "${query}" (找到 ${limitedResults.length} 个匹配，共 ${filteredResults.length} 个):\n\n${resultsText}`
            }
          ]
        };
      }

      case "get_cache_stats": {
        const documentStats = await cacheManager.getStats();
        const indexStats = documentIndexer.getStats();

        const statsText = `缓存统计信息:\n\n` +
          `文档缓存:\n` +
          `- 缓存文件数: ${documentStats.totalCached}\n` +
          `- 总缓存大小: ${(documentStats.totalSize / 1024 / 1024).toFixed(2)} MB\n\n` +
          `全文索引:\n` +
          `- 索引词汇数: ${indexStats.totalWords}\n` +
          `- 索引文档数: ${indexStats.totalDocuments}\n` +
          `- 最后更新: ${new Date(indexStats.lastUpdated).toLocaleString()}`;

        return {
          content: [
            {
              type: "text",
              text: statsText
            }
          ]
        };
      }

      case "clear_cache": {
        const { type = "all" } = args;
        let clearedMessage;

        switch (type) {
          case "document":
            await cacheManager.clear();
            clearedMessage = "已清空文档缓存";
            break;
          case "index":
            documentIndexer.clear();
            clearedMessage = "已清空全文索引";
            break;
          case "all":
          default:
            await cacheManager.clear();
            documentIndexer.clear();
            documentCache.flushAll();
            clearedMessage = "已清空所有缓存（文档缓存、全文索引、内存缓存）";
            break;
        }

        return {
          content: [
            {
              type: "text",
              text: clearedMessage
            }
          ]
        };
      }

      case "list_stored_documents": {
        const { documentType } = args;
        const docs = [];

        for (const key of documentCache.keys()) {
          const doc = documentCache.get(key);
          // An entry can expire between keys() and get(); skip it.
          if (!doc) continue;
          if (!documentType || doc.documentType === documentType) {
            docs.push(doc);
          }
        }

        const docList = docs.map(doc =>
          `- 内存键: ${doc.memoryKey}\n 文件路径: ${doc.filePath}\n 文档类型: ${doc.documentType}\n 存储时间: ${doc.timestamp}\n 内容长度: ${doc.text?.length || 0} 字符\n 表格数: ${doc.tables?.length || 0}\n 图片数: ${doc.images?.length || 0}`
        ).join('\n\n');

        return {
          content: [
            {
              type: "text",
              text: `已存储的文档 (${docs.length} 个):\n\n${docList || "暂无存储的文档"}`
            }
          ]
        };
      }

      case "get_stored_document": {
        const { memoryKey } = args;
        const doc = documentCache.get(memoryKey);

        if (!doc) {
          throw new Error(`未找到内存键为 "${memoryKey}" 的文档`);
        }

        let responseText = `文档内容 (内存键: ${memoryKey}):\n\n`;
        responseText += `文件路径: ${doc.filePath}\n`;
        responseText += `文档类型: ${doc.documentType}\n`;
        responseText += `处理时间: ${doc.timestamp}\n\n`;

        // Text content
        if (doc.text) {
          responseText += `【文本内容】\n${doc.text}\n\n`;
        }

        // Table content
        if (doc.tables && doc.tables.length > 0) {
          responseText += `【表格内容】(${doc.tables.length} 个)\n`;
          doc.tables.forEach((table, index) => {
            responseText += `\n表格${index + 1}:\n`;
            table.rows.forEach((row, rowIndex) => {
              responseText += `行${rowIndex + 1}: ${row.join(' | ')}\n`;
            });
          });
        }

        // Image OCR content
        if (doc.images && doc.images.length > 0) {
          responseText += `【图片OCR内容】(${doc.images.length} 个)\n`;
          doc.images.forEach((image, index) => {
            responseText += `\n图片${index + 1} (${image.filename}):\n${image.ocrText}\n`;
          });
        }

        return {
          content: [
            {
              type: "text",
              text: responseText
            }
          ]
        };
      }

      case "clear_memory": {
        const { memoryKey } = args;

        if (memoryKey) {
          const removed = documentCache.del(memoryKey);
          if (removed > 0) {
            return {
              content: [
                {
                  type: "text",
                  text: `已清除内存键 "${memoryKey}" 的内容`
                }
              ]
            };
          } else {
            throw new Error(`未找到内存键 "${memoryKey}" 的内容`);
          }
        } else {
          // Clear everything held in memory
          const count = documentCache.keys().length;
          documentCache.flushAll();
          return {
            content: [
              {
                type: "text",
                text: `已清除所有内存内容 (共 ${count} 个文档)`
              }
            ]
          };
        }
      }

      default:
        throw new Error(`未知工具: ${name}`);
    }
  } catch (error) {
    console.error('工具执行错误:', error);
    return {
      content: [
        {
          type: "text",
          text: `错误: ${error.message}`
        }
      ],
      isError: true
    };
  }
});

/** Connect the MCP server to the stdio transport. */
async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error("Word Document Reader MCP server running on stdio");
}

/** Release the OCR worker, then exit. */
async function shutdown() {
  console.error('\n正在关闭服务器...');
  await documentAnalyzer.cleanup();
  process.exit(0);
}

// Graceful shutdown
process.on('SIGINT', shutdown);
process.on('SIGTERM', shutdown);

// Exported for tests
export { DocumentIndexer, CacheManager, LargeDocumentProcessor, DocumentAnalyzer };

// Only the main thread runs the server. This file is also loaded as the
// chunk worker, which must not open a second stdio transport.
if (isMainThread) {
  main().catch((error) => {
    console.error("服务器启动失败:", error);
    process.exit(1);
  });
}

Implementation Reference

Latest Blog Posts

Model Context Protocol Proxies: Enabling Enterprise Control with Virtual MCPs
By Om-Shree-0709 on December 9, 2025.
AI Security
Virtual MCP
Kubernetes Operator
The State of MCP in 2025: Who's Building What and Why It Matters
By punkpeye on December 7, 2025.
mcp
startups
MCP hosting with persistent storage
By punkpeye on December 6, 2025.
changelog

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/little2512/word-doc-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.