parse_word
Extract text, tables, and images from Word documents using the MCP Development Framework’s parsing tool for efficient content analysis and retrieval.
Instructions
解析Word文档内容,提取文本、表格和图片信息
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| file_path | Yes | Word文档的本地路径,例如'/path/to/document.docx' | |
Implementation Reference
# mcp_tool/tools/word_tool.py:21-28 (registration)
# Tool registration: @ToolRegistry.register decorator on WordTool class with name="parse_word"
@ToolRegistry.register
class WordTool(BaseTool):
    """
    Tool for parsing Word documents: extracts text content, tables and image information.
    Supports the .docx and .doc (Word 97-2003) formats.
    """
    # Name under which the tool is exposed.
    name = "parse_word"
# mcp_tool/tools/word_tool.py:30-39 (schema)
# Input schema definition: requires 'file_path' string parameter
# JSON-Schema-style description of the tool arguments: one required string, "file_path".
input_schema = {
    "type": "object",
    "required": ["file_path"],
    "properties": {
        "file_path": {
            "type": "string",
            "description": "Word文档的本地路径,例如'/path/to/document.docx'",
        }
    },
}
# mcp_tool/tools/word_tool.py:41-61 (handler)
# Handler entry point: execute method validates input, processes file_path, and calls the core parser
async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:
    """
    Parse the Word document referenced by the tool arguments.

    Args:
        arguments: Argument dictionary; must contain the 'file_path' key.

    Returns:
        The list of content items produced by ``_parse_word_document``, or a
        single text item carrying an error message when 'file_path' is missing.
    """
    if "file_path" in arguments:
        # The path goes through process_file_path first; per the original
        # comment this supports translating mounted-directory paths.
        resolved_path = self.process_file_path(arguments["file_path"])
        return await self._parse_word_document(resolved_path)

    missing_argument = types.TextContent(
        type="text",
        text="错误: 缺少必要参数 'file_path'"
    )
    return [missing_argument]
# mcp_tool/tools/word_tool.py:234-454 (handler)
# Core handler implementation: Parses Word documents (.docx/.doc), extracts properties, paragraphs,
# tables as Markdown, images as base64, handles .doc conversion via LibreOffice
async def _parse_word_document(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:
    """
    Parse a Word document (.docx or .doc) into a list of content items.

    A .doc file is first converted with ``_convert_doc_to_docx``; the directory
    holding the converted file is removed before returning. The result contains,
    in order: a header with the file size, the core document properties (when
    any are set), the paragraphs and tables rendered as Markdown, and the
    extracted images.

    Args:
        file_path: Path of the Word document.

    Returns:
        List of text/image content items. Failures are reported as text items
        in the returned list rather than raised.
    """
    results = []
    temp_docx_path = None

    # Reject paths that do not exist.
    if not os.path.exists(file_path):
        return [types.TextContent(
            type="text",
            text=f"错误: 文件不存在: {file_path}\n请检查路径是否正确,并确保文件可访问。"
        )]

    # Only the .docx and .doc extensions are accepted.
    if not file_path.lower().endswith(('.docx', '.doc')):
        return [types.TextContent(
            type="text",
            text=f"错误: 不支持的文件格式: {file_path}\n仅支持.docx和.doc格式的Word文档。"
        )]

    try:
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Legacy .doc (Word 97-2003) files are converted to .docx first.
        if file_path.lower().endswith('.doc'):
            results.append(types.TextContent(
                type="text",
                text=f"# Word文档解析 (Word 97-2003 格式)\n\n文件大小: {file_size_mb:.2f} MB"
            ))

            # The conversion needs LibreOffice.
            if not self._is_libreoffice_installed():
                return [types.TextContent(
                    type="text",
                    text="错误: 无法解析Word 97-2003 (.doc)格式。\n"
                         "系统未安装LibreOffice,无法进行格式转换。\n"
                         "请安装LibreOffice后重试,或将文档另存为.docx格式。"
                )]

            try:
                results.append(types.TextContent(
                    type="text",
                    text="正在使用LibreOffice转换文档格式,请稍候..."
                ))

                temp_docx_path = self._convert_doc_to_docx(file_path)
                # Continue parsing with the converted file.
                file_path = temp_docx_path

                results.append(types.TextContent(
                    type="text",
                    text="文档格式转换完成,继续解析...\n"
                ))
            except Exception as e:
                return results + [types.TextContent(
                    type="text",
                    text=f"错误: {str(e)}\n"
                         f"建议:\n"
                         f"1. 确保已正确安装LibreOffice且可通过命令行访问\n"
                         f"2. 尝试手动将文档转换为.docx格式后重试\n"
                         f"3. 检查文档是否加密或损坏"
                )]
        else:
            results.append(types.TextContent(
                type="text",
                text=f"# Word文档解析\n\n文件大小: {file_size_mb:.2f} MB"
            ))

        doc = docx.Document(file_path)

        # Collect the core properties that are set; timestamps are stringified.
        core = doc.core_properties
        properties = {}
        for label, attr, as_text in (
            ('标题', 'title', False),
            ('作者', 'author', False),
            ('创建时间', 'created', True),
            ('修改时间', 'modified', True),
            ('备注', 'comments', False),
        ):
            value = getattr(core, attr, None)
            if value:
                properties[label] = str(value) if as_text else value

        if properties:
            properties_text = "## 文档属性\n\n" + "".join(
                f"- {key}: {value}\n" for key, value in properties.items()
            )
            results.append(types.TextContent(
                type="text",
                text=properties_text
            ))

        # Build the Markdown body from parts and join once at the end.
        content_parts = ["## 文档内容\n\n"]

        # The heading counts every paragraph; only non-blank ones are emitted.
        paragraphs = doc.paragraphs
        content_parts.append(f"### 段落 (共{len(paragraphs)}个)\n\n")
        content_parts.extend(
            f"{para.text}\n\n" for para in paragraphs if para.text.strip()
        )

        tables = doc.tables
        if tables:
            content_parts.append(f"### 表格 (共{len(tables)}个)\n\n")

            for i, table in enumerate(tables):
                content_parts.append(f"#### 表格 {i+1}\n\n")

                # Flatten each cell to one line and escape '|' so that cell
                # text cannot be mistaken for a Markdown column separator.
                rows = [
                    [
                        cell.text.replace('\n', ' ').replace('|', '\\|').strip()
                        for cell in row.cells
                    ]
                    for row in table.rows
                ]

                if rows:
                    # The first row serves as the Markdown header row.
                    content_parts.append("| " + " | ".join(rows[0]) + " |\n")
                    content_parts.append("| " + " | ".join(["---"] * len(rows[0])) + " |\n")
                    content_parts.extend(
                        "| " + " | ".join(row) + " |\n" for row in rows[1:]
                    )
                    content_parts.append("\n")

        results.append(types.TextContent(
            type="text",
            text="".join(content_parts)
        ))

        # Image extraction is best-effort: a failure is reported as a warning
        # and does not discard the text already collected.
        try:
            images = self._extract_images_from_word(doc)

            if images:
                image_info = f"## 图片信息\n\n文档中包含 {len(images)} 张图片。\n\n"
                results.append(types.TextContent(
                    type="text",
                    text=image_info
                ))

                for i, (_image_id, image_bytes) in enumerate(images):
                    try:
                        mime_type = self._get_image_mime_type(image_bytes)
                        image_base64 = self._encode_image_base64(image_bytes)
                        results.append(types.TextContent(
                            type="text",
                            text=f"### 图片 {i+1}\n\n"
                        ))
                        results.append(types.ImageContent(
                            type="image",
                            data=image_base64,
                            mimeType=mime_type
                        ))
                    except Exception as e:
                        # Report the failing image and carry on with the rest.
                        results.append(types.TextContent(
                            type="text",
                            text=f"注意: 图片 {i+1} 处理失败: {str(e)}"
                        ))
            else:
                results.append(types.TextContent(
                    type="text",
                    text="## 图片信息\n\n文档中未包含图片或嵌入对象均不是有效图片。"
                ))
        except Exception as img_error:
            results.append(types.TextContent(
                type="text",
                text=f"警告: 提取图片信息时出错: {str(img_error)}"
            ))

        results.append(types.TextContent(
            type="text",
            text="Word文档处理完成!"
        ))

        return results
    except Exception as e:
        error_details = traceback.format_exc()
        return [types.TextContent(
            type="text",
            text=f"错误: 解析Word文档失败: {str(e)}\n"
                 f"可能的原因:\n"
                 f"1. 文件格式不兼容或已损坏\n"
                 f"2. 文件受密码保护\n"
                 f"3. 文件包含不支持的内容\n\n"
                 f"详细错误信息: {error_details}"
        )]
    finally:
        # Remove the directory that holds the converted .docx file.
        # NOTE(review): assumes _convert_doc_to_docx writes into a dedicated
        # temporary directory - confirm against that helper.
        if temp_docx_path and os.path.exists(temp_docx_path):
            try:
                temp_dir = os.path.dirname(temp_docx_path)
                shutil.rmtree(temp_dir, ignore_errors=True)
            except Exception:
                # Cleanup is best-effort; errors here are deliberately ignored.
                pass
# mcp_tool/tools/word_tool.py:127-155 (helper)
# Helper function to extract and validate images from Word document, filtering invalid embedded objects
def _extract_images_from_word(self, doc: Document) -> List[Tuple[str, bytes]]:
    """
    Collect the images referenced by a Word document.

    Only relationships whose type contains "image" and whose payload passes
    ``_is_valid_image`` are kept (the original comment describes this as
    filtering out embedded external documents).

    Args:
        doc: Word document object.

    Returns:
        List of (relationship id, image bytes) pairs.
    """
    collected = []

    for relationship in doc.part.rels.values():
        try:
            # Skip every relationship that is not an image relationship.
            if "image" not in relationship.reltype:
                continue

            payload = relationship.target_part.blob
            relationship_id = relationship.rId

            # Drop payloads that do not validate as an image.
            if not self._is_valid_image(payload):
                continue

            collected.append((relationship_id, payload))
        except Exception:
            # Best-effort: a relationship that cannot be read is skipped.
            continue

    return collected