MCP Development Framework

{ "sourceFile": "mcp_simple_tool/tools/word_tool.py", "activeCommit": 0, "commits": [ { "activePatchIndex": 0, "patches": [ { "date": 1741337157526, "content": "Index: \n===================================================================\n--- \n+++ \n" } ], "date": 1741337157526, "name": "Commit-0", "content": "\"\"\"\nWord文档解析工具,用于解析Word文档内容\n\"\"\"\n\nimport os\nimport traceback\nfrom typing import Dict, List, Any\nimport docx\nimport mcp.types as types\nfrom . import BaseTool, ToolRegistry\n\n@ToolRegistry.register\nclass WordTool(BaseTool):\n \"\"\"\n 用于解析Word文档的工具,提取文本内容、表格和图片信息\n \"\"\"\n \n name = \"word\"\n description = \"解析Word文档内容,提取文本、表格和图片信息\"\n input_schema = {\n \"type\": \"object\",\n \"required\": [\"file_path\"],\n \"properties\": {\n \"file_path\": {\n \"type\": \"string\",\n \"description\": \"Word文档的本地路径,例如'/path/to/document.docx'\",\n }\n },\n }\n \n async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 解析Word文档\n \n Args:\n arguments: 参数字典,必须包含'file_path'键\n \n Returns:\n Word文档内容列表\n \"\"\"\n if \"file_path\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n text=\"错误: 缺少必要参数 'file_path'\"\n )]\n \n return await self._parse_word_document(arguments[\"file_path\"])\n \n async def _parse_word_document(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 解析Word文档内容\n \n Args:\n file_path: Word文档路径\n \n Returns:\n Word文档内容列表\n \"\"\"\n results = []\n \n # 检查文件是否存在\n if not os.path.exists(file_path):\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n )]\n \n # 检查文件扩展名\n if not file_path.lower().endswith(('.docx', '.doc')):\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: 不支持的文件格式: {file_path}\\n仅支持.docx和.doc格式的Word文档。\"\n )]\n \n try:\n # 添加文件信息\n file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n results.append(types.TextContent(\n type=\"text\",\n text=f\"# Word文档解析\\n\\n文件大小: {file_size_mb:.2f} MB\"\n ))\n \n # 打开Word文档\n doc = docx.Document(file_path)\n \n # 提取文档属性\n properties = {}\n if hasattr(doc.core_properties, 'title') and doc.core_properties.title:\n properties['标题'] = doc.core_properties.title\n if hasattr(doc.core_properties, 'author') and doc.core_properties.author:\n properties['作者'] = doc.core_properties.author\n if hasattr(doc.core_properties, 'created') and doc.core_properties.created:\n properties['创建时间'] = str(doc.core_properties.created)\n if hasattr(doc.core_properties, 'modified') and doc.core_properties.modified:\n properties['修改时间'] = str(doc.core_properties.modified)\n if hasattr(doc.core_properties, 'comments') and doc.core_properties.comments:\n properties['备注'] = doc.core_properties.comments\n \n # 添加文档属性信息\n if properties:\n properties_text = \"## 文档属性\\n\\n\"\n for key, value in properties.items():\n properties_text += f\"- {key}: {value}\\n\"\n results.append(types.TextContent(\n type=\"text\",\n text=properties_text\n ))\n \n # 提取文档内容\n content_text = \"## 文档内容\\n\\n\"\n \n # 处理段落\n paragraphs_count = len(doc.paragraphs)\n content_text += f\"### 段落 (共{paragraphs_count}个)\\n\\n\"\n \n for i, para in enumerate(doc.paragraphs):\n if para.text.strip(): # 只处理非空段落\n content_text += f\"{para.text}\\n\\n\"\n \n # 处理表格\n tables_count = len(doc.tables)\n if tables_count > 0:\n content_text += f\"### 表格 (共{tables_count}个)\\n\\n\"\n \n for i, table in enumerate(doc.tables):\n content_text += f\"#### 表格 {i+1}\\n\\n\"\n \n # 创建Markdown表格\n rows = []\n for row in table.rows:\n cells = [cell.text.replace('\\n', ' ').strip() for cell in row.cells]\n rows.append(cells)\n \n if rows:\n # 表头\n content_text += \"| \" + \" | \".join(rows[0]) + \" |\\n\"\n # 分隔线\n content_text += \"| \" + \" | \".join([\"---\"] * len(rows[0])) + \" |\\n\"\n # 表格内容\n for row in rows[1:]:\n content_text += \"| \" + \" | \".join(row) + \" |\\n\"\n \n content_text += \"\\n\"\n \n # 添加文档内容\n results.append(types.TextContent(\n type=\"text\",\n text=content_text\n ))\n \n # 提取图片信息\n try:\n # 计算文档中的图片数量\n image_count = 0\n for rel in doc.part.rels.values():\n if \"image\" in rel.target_ref:\n image_count += 1\n \n if image_count > 0:\n image_info = f\"## 图片信息\\n\\n文档中包含 {image_count} 张图片。\\n\\n\"\n image_info += \"注意:当前仅提供图片数量信息,不提取图片内容。如需查看图片,请直接打开原始文档。\\n\"\n \n results.append(types.TextContent(\n type=\"text\",\n text=image_info\n ))\n except Exception as img_error:\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 提取图片信息时出错: {str(img_error)}\"\n ))\n \n # 添加处理完成的提示\n results.append(types.TextContent(\n type=\"text\",\n text=\"Word文档处理完成!\"\n ))\n \n return results\n except Exception as e:\n error_details = traceback.format_exc()\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: 解析Word文档失败: {str(e)}\\n\"\n f\"可能的原因:\\n\"\n f\"1. 文件格式不兼容或已损坏\\n\"\n f\"2. 文件受密码保护\\n\"\n f\"3. 文件包含不支持的内容\\n\\n\"\n f\"详细错误信息: {error_details}\"\n )] " } ] }