read_word_document
Extract text, tables, and perform OCR analysis on images from Word documents for processing and retrieval.
Instructions
增强版Word文档读取器,支持表格提取、图片OCR分析和缓存优化
Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| filePath | Yes | Word文档的文件路径 | |
| memoryKey | No | 用于存储的内存键名,便于后续检索 | default |
| documentType | No | 文档类型(可选值:ui-component、api-doc、common-doc、other) | common-doc |
| extractTables | No | 是否提取表格 | true |
| extractImages | No | 是否提取图片并进行OCR分析 | true |
| useCache | No | 是否使用缓存 | true |
| outputDir | No | 图片和临时文件输出目录 | ./output |
Implementation Reference
- server-basic.js:36-60 (registration) — Registration of read_word_document tool in ListToolsRequestSchema handler, including name, description, and input schema.

  ```js
  {
    name: "read_word_document",
    description: "读取Word文档内容并存储到内存中",
    inputSchema: {
      type: "object",
      properties: {
        filePath: {
          type: "string",
          description: "Word文档的文件路径"
        },
        memoryKey: {
          type: "string",
          description: "用于存储的内存键名,便于后续检索",
          default: "default"
        },
        documentType: {
          type: "string",
          description: "文档类型,如 'ui-component', 'api-doc', 'common-doc'",
          enum: ["ui-component", "api-doc", "common-doc", "other"],
          default: "common-doc"
        }
      },
      required: ["filePath"]
    }
  },
  ```
- server-basic.js:131-168 (handler) — Basic implementation of read_word_document handler: checks file existence and extension, extracts raw text using mammoth, stores in type-specific memory maps, returns success message with preview.

  ```js
  case "read_word_document": {
    const { filePath, memoryKey = "default", documentType = "common-doc" } = args;

    if (!fs.existsSync(filePath)) {
      throw new Error(`文件不存在: ${filePath}`);
    }

    const fileExt = path.extname(filePath).toLowerCase();
    if (fileExt !== '.docx' && fileExt !== '.doc') {
      throw new Error(`不支持的文件格式: ${fileExt}。仅支持 .docx 和 .doc 文件`);
    }

    const result = await mammoth.extractRawText({ path: filePath });
    const documentData = {
      content: result.value,
      filePath,
      documentType,
      timestamp: new Date().toISOString(),
      memoryKey
    };

    // 根据文档类型存储到不同的内存区域
    if (documentType === 'ui-component') {
      uiComponentMemory.set(memoryKey, documentData);
    } else {
      documentMemory.set(memoryKey, documentData);
    }

    return {
      content: [
        {
          type: "text",
          text: `成功读取并存储Word文档:\n文件路径: ${filePath}\n文档类型: ${documentType}\n内存键: ${memoryKey}\n内容长度: ${result.value.length} 字符\n\n文档内容预览:\n${result.value.substring(0, 500)}...`
        }
      ]
    };
  }
  ```
- server.js:474-517 (registration) — Registration of enhanced read_word_document tool in ListToolsRequestSchema handler, with extended input schema supporting table/image extraction, caching, etc.

  ```js
    name: "read_word_document",
    description: "增强版Word文档读取器,支持表格提取、图片OCR分析和缓存优化",
    inputSchema: {
      type: "object",
      properties: {
        filePath: {
          type: "string",
          description: "Word文档的文件路径"
        },
        memoryKey: {
          type: "string",
          description: "用于存储的内存键名,便于后续检索",
          default: "default"
        },
        documentType: {
          type: "string",
          description: "文档类型",
          enum: ["ui-component", "api-doc", "common-doc", "other"],
          default: "common-doc"
        },
        extractTables: {
          type: "boolean",
          description: "是否提取表格",
          default: true
        },
        extractImages: {
          type: "boolean",
          description: "是否提取图片并进行OCR分析",
          default: true
        },
        useCache: {
          type: "boolean",
          description: "是否使用缓存",
          default: true
        },
        outputDir: {
          type: "string",
          description: "图片和临时文件输出目录",
          default: "./output"
        }
      },
      required: ["filePath"]
    }
  },
  ```
- server.js:616-757 (handler) — Advanced implementation of read_word_document handler: supports caching, large file chunking, table extraction, image OCR with Tesseract, full-text indexing, stores enhanced analysis results in cache and memory, returns detailed preview.

  ```js
  case "read_word_document": {
    const {
      filePath,
      memoryKey = "default",
      documentType = "common-doc",
      extractTables = true,
      extractImages = true,
      useCache = true,
      outputDir = "./output"
    } = args;

    if (!fs.existsSync(filePath)) {
      throw new Error(`文件不存在: ${filePath}`);
    }

    const fileExt = path.extname(filePath).toLowerCase();
    if (fileExt !== '.docx' && fileExt !== '.doc') {
      throw new Error(`不支持的文件格式: ${fileExt}。仅支持 .docx 和 .doc 文件`);
    }

    // 检查缓存
    let analysisResult = null;
    if (useCache) {
      analysisResult = await cacheManager.get(filePath);
      if (analysisResult) {
        console.log(`使用缓存数据: ${filePath}`);
      }
    }

    if (!analysisResult) {
      console.log(`分析文档: ${filePath}`);

      // 读取文件
      const docxBuffer = await fs.readFile(filePath);

      // 检查是否为大文档
      if (await largeDocProcessor.isLargeDocument(filePath)) {
        console.log('检测到大文档,使用并行处理');
        analysisResult = await largeDocProcessor.processInChunks(filePath, {
          extractTables,
          extractImages
        });
      } else {
        // 标准处理
        const textResult = await mammoth.extractRawText({ buffer: docxBuffer });
        analysisResult = {
          text: textResult.value,
          tables: [],
          images: [],
          metadata: {
            filePath,
            documentType,
            processedAt: new Date().toISOString(),
            fileSize: docxBuffer.length
          }
        };

        // 提取表格
        if (extractTables) {
          analysisResult.tables = await documentAnalyzer.extractTables(docxBuffer);
        }

        // 提取图片
        if (extractImages) {
          await fs.ensureDir(outputDir);
          analysisResult.images = await documentAnalyzer.extractImages(docxBuffer, outputDir);
        }
      }

      // 缓存结果
      if (useCache) {
        await cacheManager.set(filePath, analysisResult);
      }
    }

    // 生成文档ID用于索引
    const documentId = `${memoryKey}_${Date.now()}`;

    // 更新全文索引
    const indexContent = analysisResult.text + ' ' +
      analysisResult.tables.map(t => t.rows.join(' ')).join(' ') + ' ' +
      analysisResult.images.map(i => i.ocrText).join(' ');

    documentIndexer.addDocument(documentId, indexContent, {
      memoryKey,
      filePath,
      documentType,
      tablesCount: analysisResult.tables.length,
      imagesCount: analysisResult.images.length
    });

    // 存储到内存(兼容现有功能)
    const documentData = {
      ...analysisResult,
      documentId,
      memoryKey,
      filePath,
      documentType,
      timestamp: new Date().toISOString()
    };
    documentCache.set(memoryKey, documentData);

    // 构建响应
    let responseText = `成功读取并分析Word文档:\n`;
    responseText += `文件路径: ${filePath}\n`;
    responseText += `文档类型: ${documentType}\n`;
    responseText += `内存键: ${memoryKey}\n`;
    responseText += `内容长度: ${analysisResult.text.length} 字符\n`;
    responseText += `表格数量: ${analysisResult.tables.length}\n`;
    responseText += `图片数量: ${analysisResult.images.length}\n`;

    if (analysisResult.tables.length > 0) {
      responseText += `\n表格预览:\n`;
      analysisResult.tables.slice(0, 2).forEach((table, index) => {
        responseText += `表格${index + 1}: ${table.rows.length}行 x ${table.rows[0]?.length || 0}列\n`;
        if (table.rows.length > 0) {
          responseText += `示例行: ${table.rows[0]?.slice(0, 3).join(' | ')}\n`;
        }
      });
    }

    if (analysisResult.images.length > 0) {
      responseText += `\n图片OCR结果预览:\n`;
      analysisResult.images.slice(0, 2).forEach((image, index) => {
        const ocrPreview = image.ocrText.substring(0, 100);
        responseText += `图片${index + 1} (${image.filename}): ${ocrPreview}${image.ocrText.length > 100 ? '...' : ''}\n`;
      });
    }

    responseText += `\n文本内容预览:\n${analysisResult.text.substring(0, 300)}...`;

    return {
      content: [
        {
          type: "text",
          text: responseText
        }
      ]
    };
  }
  ```