MCP Development Framework

{ "sourceFile": "mcp_simple_tool/tools/pdf_tool.py", "activeCommit": 0, "commits": [ { "activePatchIndex": 3, "patches": [ { "date": 1741332266981, "content": "Index: \n===================================================================\n--- \n+++ \n" }, { "date": 1741333308024, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -1,74 +1,39 @@\n-\"\"\"\n-PDF文档解析工具,提供完整的PDF解析功能,包括文本和图像提取\n-\"\"\"\n-\n import os\n import tempfile\n import shutil\n import PyPDF2\n+from pdf2image import convert_from_path\n+from PIL import Image\n import pymupdf4llm\n-import traceback\n-from typing import Dict, List, Any\n import mcp.types as types\n-from .base import BaseTool\n-from .utils.pdf_helpers import extract_images_from_pdf\n+from . import BaseTool, ToolRegistry\n \n-\n+@ToolRegistry.register\n class PdfTool(BaseTool):\n- \"\"\"\n- 用于解析PDF文档的工具,提供完整的文本和图像提取功能\n- \"\"\"\n+ \"\"\"PDF解析工具,用于解析PDF文件并提取文本和图片\"\"\"\n+ name = \"file\"\n+ description = \"解析PDF文件并提取文本和图片内容\"\n+ input_schema = {\n+ \"type\": \"object\",\n+ \"required\": [\"file_path\"],\n+ \"properties\": {\n+ \"file_path\": {\n+ \"type\": \"string\",\n+ \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n+ }\n+ },\n+ }\n \n- @property\n- def name(self) -> str:\n- return \"file\"\n- \n- @property\n- def description(self) -> str:\n- return \"解析PDF文档并提取文本和图片内容\"\n- \n- @property\n- def input_schema(self) -> Dict[str, Any]:\n- return {\n- \"type\": \"object\",\n- \"required\": [\"file_path\"],\n- \"properties\": {\n- \"file_path\": {\n- \"type\": \"string\",\n- \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n- }\n- },\n- }\n- \n- async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- \"\"\"\n- 解析PDF文档\n- \n- Args:\n- arguments: 参数字典,必须包含'file_path'键\n- \n- Returns:\n- PDF内容列表,包括文本和图像信息\n- \"\"\"\n+ async def execute(self, arguments: dict) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n+ \"\"\"解析PDF文件并提取文本和图片\"\"\"\n if \"file_path\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n- text=\"错误: 缺少必要参数 'file_path'\"\n+ text=\"Error: Missing required argument 'file_path'\"\n )]\n- \n- return await self._parse_pdf(arguments[\"file_path\"])\n- \n- async def _parse_pdf(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- \"\"\"\n- 解析PDF文档,提取文本和图像\n- \n- Args:\n- file_path: PDF文件路径\n \n- Returns:\n- PDF内容列表,包括文本和图像信息\n- \"\"\"\n+ file_path = arguments[\"file_path\"]\n results = []\n \n # 添加初始状态提示\n results.append(types.TextContent(\n@@ -136,56 +101,144 @@\n type=\"text\",\n text=md_content\n ))\n \n- # 使用PyMuPDF提取图片(更高级的图片提取方法)\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"正在使用高级方法提取图片,这可能需要一些时间...\"\n- ))\n+ # 处理提取的图像\n+ image_files = []\n+ for root, dirs, files in os.walk(image_path):\n+ for file in files:\n+ if file.lower().endswith(('.jpg', '.jpeg', '.png')):\n+ image_files.append(os.path.join(root, file))\n \n- # 提取图片\n- image_info = await extract_images_from_pdf(file_path, image_path)\n- \n # 添加图像信息\n- if image_info:\n- image_markdown = \"\\n## 提取的图像信息\\n\\n\"\n- image_markdown += f\"共提取了 {len(image_info)} 张图像。\\n\\n\"\n+ if image_files:\n+ image_info = \"\\n## 提取的图像信息\\n\\n\"\n+ image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n \n- for i, img_data in enumerate(image_info):\n- image_markdown += f\"### 图像 {i+1}\\n\\n\"\n- image_markdown += f\"- 文件名: {img_data['filename']}\\n\"\n- image_markdown += f\"- 页码: {img_data['page']}\\n\"\n- image_markdown += f\"- 尺寸: {img_data['width']}x{img_data['height']} 像素\\n\"\n- image_markdown += f\"- 格式: {img_data['format']}\\n\"\n- image_markdown += f\"- 大小: {img_data['size_bytes'] / 1024:.2f} KB\\n\"\n- \n- if 'type' in img_data and img_data['type'] == 'rendered_page':\n- image_markdown += f\"- 类型: 渲染页面\\n\"\n- else:\n- image_markdown += f\"- 类型: 嵌入图像\\n\"\n- \n- image_markdown += \"\\n---\\n\\n\"\n+ for i, img_file in enumerate(image_files):\n+ try:\n+ with Image.open(img_file) as img:\n+ width, height = img.size\n+ format_name = img.format\n+ \n+ image_info += f\"### 图像 {i+1}\\n\\n\"\n+ image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n+ image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_info += f\"- 格式: {format_name}\\n\\n\"\n+ image_info += \"---\\n\\n\"\n+ except Exception as e:\n+ image_info += f\"### 图像 {i+1}\\n\\n\"\n+ image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n+ image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n+ image_info += \"---\\n\\n\"\n \n results.append(types.TextContent(\n type=\"text\",\n- text=image_markdown\n+ text=image_info\n ))\n else:\n results.append(types.TextContent(\n type=\"text\",\n text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n ))\n \n except Exception as extract_error:\n- # 如果PymuPDF4llm提取失败,记录错误并尝试使用备用方法\n+ # 如果PymuPDF4llm提取失败,回退到原来的方法\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用备用方法...\"\n ))\n \n- # 这里可以添加备用提取方法,如PyPDF2等\n- # 为简化起见,这个部分省略\n+ # 使用PyPDF2提取文本\n+ text_content = \"\"\n+ with open(file_path, 'rb') as file:\n+ reader = PyPDF2.PdfReader(file)\n+ num_pages = len(reader.pages)\n+ \n+ # 添加PDF元数据\n+ text_content += f\"# PDF文档信息\\n\\n\"\n+ text_content += f\"- 页数: {num_pages}\\n\"\n+ if reader.metadata:\n+ for key, value in reader.metadata.items():\n+ if key.startswith('/'):\n+ key = key[1:] # 移除前导斜杠\n+ if value and str(value).strip():\n+ text_content += f\"- {key}: {value}\\n\"\n+ \n+ # 提取文本 - 限制页数以提高性能\n+ max_pages_to_process = min(num_pages, 30) # 限制处理的最大页数\n+ if max_pages_to_process < num_pages:\n+ text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages_to_process} 页内容。\\n\"\n+ \n+ text_content += \"\\n## 内容摘要\\n\\n\"\n+ \n+ # 逐页提取文本\n+ for page_num in range(max_pages_to_process):\n+ # 添加进度提示\n+ if page_num % 5 == 0 and page_num > 0:\n+ progress_msg = f\"已处理 {page_num}/{max_pages_to_process} 页...\"\n+ results.append(types.TextContent(type=\"text\", text=progress_msg))\n+ \n+ page = reader.pages[page_num]\n+ page_text = page.extract_text()\n+ if page_text:\n+ text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n+ text_content += page_text + \"\\n\"\n+ \n+ # 添加文本内容到结果\n+ results.append(types.TextContent(type=\"text\", text=text_content))\n+ \n+ # 尝试使用pdf2image提取图片\n+ try:\n+ # 限制处理的页数以提高性能\n+ max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n+ \n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"正在提取图片,这可能需要一些时间...\"\n+ ))\n+ \n+ # 转换PDF页面为图片并保存\n+ images = convert_from_path(\n+ file_path, \n+ dpi=150, \n+ fmt=\"jpg\", \n+ first_page=1, \n+ last_page=max_img_pages,\n+ thread_count=2 # 使用多线程加速\n+ )\n+ \n+ # 处理每个页面图片\n+ image_markdown = \"\\n## 图片内容\\n\\n\"\n+ for i, img in enumerate(images):\n+ # 保存图片到临时目录\n+ img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n+ img.save(img_path, \"JPEG\", quality=80)\n+ \n+ # 获取图片尺寸\n+ width, height = img.size\n+ \n+ # 添加图片信息到Markdown\n+ image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n+ image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_markdown += f\"- 格式: JPEG\\n\"\n+ image_markdown += f\"- DPI: 150\\n\\n\"\n+ \n+ # 添加分隔线\n+ image_markdown += \"---\\n\\n\"\n+ \n+ # 添加图片信息到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_markdown\n+ ))\n+ except Exception as img_error:\n+ # 如果图片提取失败,添加错误信息但继续\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n+ f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n+ ))\n \n # 清理临时目录\n shutil.rmtree(temp_dir)\n \n@@ -200,8 +253,9 @@\n # 确保清理临时目录\n if 'temp_dir' in locals() and os.path.exists(temp_dir):\n shutil.rmtree(temp_dir)\n \n+ import traceback\n error_details = traceback.format_exc()\n return [\n types.TextContent(\n type=\"text\",\n" }, { "date": 1741494795896, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -1,13 +1,18 @@\n+\"\"\"\n+PDF快速预览工具,仅提取文本内容,适用于大型PDF文件\n+\"\"\"\n+\n import os\n import tempfile\n import shutil\n import PyPDF2\n-from pdf2image import convert_from_path\n-from PIL import Image\n import pymupdf4llm\n+import traceback\n+from typing import Dict, List, Any, Optional\n import mcp.types as types\n from . import BaseTool, ToolRegistry\n+from .image_recognition_tool import ImageRecognizer\n \n @ToolRegistry.register\n class PdfTool(BaseTool):\n \"\"\"PDF解析工具,用于解析PDF文件并提取文本和图片\"\"\"\n@@ -19,29 +24,89 @@\n \"properties\": {\n \"file_path\": {\n \"type\": \"string\",\n \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n+ },\n+ \"recognize_images\": {\n+ \"type\": \"boolean\",\n+ \"description\": \"是否使用大模型识别图片内容\",\n+ },\n+ \"image_save_dir\": {\n+ \"type\": \"string\",\n+ \"description\": \"图片保存目录,默认为/host_images\",\n+ },\n+ \"api_key\": {\n+ \"type\": \"string\",\n+ \"description\": \"大模型API密钥,用于图像识别\",\n+ },\n+ \"api_base_url\": {\n+ \"type\": \"string\",\n+ \"description\": \"大模型API基础URL,用于图像识别\",\n }\n },\n }\n \n- async def execute(self, arguments: dict) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- \"\"\"解析PDF文件并提取文本和图片\"\"\"\n+ async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n+ \"\"\"\n+ 解析PDF文档\n+ \n+ Args:\n+ arguments: 参数字典,必须包含'file_path'键\n+ \n+ Returns:\n+ PDF文本内容列表\n+ \"\"\"\n if \"file_path\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n- text=\"Error: Missing required argument 'file_path'\"\n+ text=\"错误: 缺少必要参数 'file_path'\"\n )]\n+ \n+ # 获取参数\n+ file_path = arguments[\"file_path\"]\n+ recognize_images = arguments.get(\"recognize_images\", False)\n+ image_save_dir = arguments.get(\"image_save_dir\", \"/host_images\")\n+ api_key = arguments.get(\"api_key\", os.environ.get(\"LLM_API_KEY\", \"\"))\n+ api_base_url = arguments.get(\"api_base_url\", os.environ.get(\"LLM_API_BASE_URL\", \"api.openai.com\"))\n+ \n+ # 如果启用图像识别但没有提供API密钥,返回错误\n+ if recognize_images and not api_key:\n+ return [types.TextContent(\n+ type=\"text\",\n+ text=\"错误: 启用图像识别功能需要提供API密钥,请通过参数提供或设置环境变量 LLM_API_KEY\"\n+ )]\n+ \n+ return await self._parse_pdf(\n+ file_path, \n+ recognize_images=recognize_images,\n+ image_save_dir=image_save_dir,\n+ api_key=api_key,\n+ api_base_url=api_base_url\n+ )\n+ \n+ async def _parse_pdf(\n+ self, \n+ file_path: str, \n+ recognize_images: bool = False,\n+ image_save_dir: str = \"/host_images\",\n+ api_key: str = \"\",\n+ api_base_url: str = \"api.openai.com\"\n+ ) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n+ \"\"\"\n+ 解析PDF文件并提取文本和图片\n+ \n+ Args:\n+ file_path: PDF文件路径\n+ recognize_images: 是否使用大模型识别图片内容\n+ image_save_dir: 图片保存目录\n+ api_key: 大模型API密钥\n+ api_base_url: 大模型API基础URL\n \n- file_path = arguments[\"file_path\"]\n+ Returns:\n+ PDF内容列表\n+ \"\"\"\n results = []\n \n- # 添加初始状态提示\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"开始处理PDF文件,请稍候...\"\n- ))\n- \n # 检查文件是否存在\n if not os.path.exists(file_path):\n return [types.TextContent(\n type=\"text\",\n@@ -53,8 +118,16 @@\n temp_dir = tempfile.mkdtemp()\n image_path = os.path.join(temp_dir, \"images\")\n os.makedirs(image_path, exist_ok=True)\n \n+ # 确保图片保存目录存在\n+ if recognize_images:\n+ os.makedirs(image_save_dir, exist_ok=True)\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"已启用图像识别功能,图片将保存到 {image_save_dir} 目录\"\n+ ))\n+ \n # 添加文件大小信息\n file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n results.append(types.TextContent(\n type=\"text\",\n@@ -113,29 +186,79 @@\n if image_files:\n image_info = \"\\n## 提取的图像信息\\n\\n\"\n image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n \n- for i, img_file in enumerate(image_files):\n- try:\n- with Image.open(img_file) as img:\n- width, height = img.size\n- format_name = img.format\n+ # 如果启用了图像识别\n+ if recognize_images and api_key:\n+ image_info += \"正在使用大模型识别图片内容,这可能需要一些时间...\\n\\n\"\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_info\n+ ))\n+ \n+ # 创建图像识别器\n+ recognizer = ImageRecognizer(api_base_url, api_key)\n+ \n+ # 处理每张图片\n+ for i, img_file in enumerate(image_files):\n+ try:\n+ # 复制图片到保存目录\n+ img_filename = f\"pdf_image_{os.path.basename(file_path).replace('.pdf', '')}_{i+1}.jpg\"\n+ saved_img_path = os.path.join(image_save_dir, img_filename)\n+ shutil.copy2(img_file, saved_img_path)\n \n+ # 识别图片内容\n+ recognition_result = await recognizer.recognize_image(\n+ saved_img_path, \n+ \"请详细描述这张图片的内容,包括图表、文字和视觉元素\"\n+ )\n+ \n+ # 添加识别结果\n+ if recognition_result:\n+ image_recognition_info = f\"### 图像 {i+1} 识别结果\\n\\n\"\n+ image_recognition_info += f\"- 文件名: {img_filename}\\n\"\n+ image_recognition_info += f\"- 保存路径: {saved_img_path}\\n\\n\"\n+ image_recognition_info += f\"**识别内容**:\\n\\n{recognition_result}\\n\\n\"\n+ image_recognition_info += \"---\\n\\n\"\n+ \n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_recognition_info\n+ ))\n+ else:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"### 图像 {i+1} 识别失败\\n\\n- 文件名: {img_filename}\\n- 保存路径: {saved_img_path}\\n\\n---\\n\\n\"\n+ ))\n+ except Exception as img_rec_error:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"### 图像 {i+1} 处理错误\\n\\n错误信息: {str(img_rec_error)}\\n\\n---\\n\\n\"\n+ ))\n+ else:\n+ # 不进行图像识别,只显示图片信息\n+ for i, img_file in enumerate(image_files):\n+ try:\n+ from PIL import Image\n+ with Image.open(img_file) as img:\n+ width, height = img.size\n+ format_name = img.format\n+ \n+ image_info += f\"### 图像 {i+1}\\n\\n\"\n+ image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n+ image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_info += f\"- 格式: {format_name}\\n\\n\"\n+ image_info += \"---\\n\\n\"\n+ except Exception as e:\n image_info += f\"### 图像 {i+1}\\n\\n\"\n image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_info += f\"- 格式: {format_name}\\n\\n\"\n+ image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n image_info += \"---\\n\\n\"\n- except Exception as e:\n- image_info += f\"### 图像 {i+1}\\n\\n\"\n- image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n- image_info += \"---\\n\\n\"\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_info\n- ))\n+ \n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_info\n+ ))\n else:\n results.append(types.TextContent(\n type=\"text\",\n text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n@@ -197,8 +320,9 @@\n text=f\"正在提取图片,这可能需要一些时间...\"\n ))\n \n # 转换PDF页面为图片并保存\n+ from pdf2image import convert_from_path\n images = convert_from_path(\n file_path, \n dpi=150, \n fmt=\"jpg\", \n@@ -208,30 +332,88 @@\n )\n \n # 处理每个页面图片\n image_markdown = \"\\n## 图片内容\\n\\n\"\n- for i, img in enumerate(images):\n- # 保存图片到临时目录\n- img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n- img.save(img_path, \"JPEG\", quality=80)\n+ \n+ # 如果启用了图像识别\n+ if recognize_images and api_key:\n+ image_markdown += \"正在使用大模型识别图片内容,这可能需要一些时间...\\n\\n\"\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_markdown\n+ ))\n \n- # 获取图片尺寸\n- width, height = img.size\n+ # 创建图像识别器\n+ recognizer = ImageRecognizer(api_base_url, api_key)\n \n- # 添加图片信息到Markdown\n- image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n- image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 150\\n\\n\"\n+ # 处理每张图片\n+ for i, img in enumerate(images):\n+ try:\n+ # 保存图片到临时目录和保存目录\n+ temp_img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n+ img.save(temp_img_path, \"JPEG\", quality=80)\n+ \n+ # 复制到保存目录\n+ img_filename = f\"pdf_image_{os.path.basename(file_path).replace('.pdf', '')}_{i+1}.jpg\"\n+ saved_img_path = os.path.join(image_save_dir, img_filename)\n+ shutil.copy2(temp_img_path, saved_img_path)\n+ \n+ # 获取图片尺寸\n+ width, height = img.size\n+ \n+ # 识别图片内容\n+ recognition_result = await recognizer.recognize_image(\n+ saved_img_path, \n+ \"请详细描述这张图片的内容,包括图表、文字和视觉元素\"\n+ )\n+ \n+ # 添加识别结果\n+ if recognition_result:\n+ image_recognition_info = f\"### 第 {i+1} 页图片识别结果\\n\\n\"\n+ image_recognition_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_recognition_info += f\"- 格式: JPEG\\n\"\n+ image_recognition_info += f\"- 保存路径: {saved_img_path}\\n\\n\"\n+ image_recognition_info += f\"**识别内容**:\\n\\n{recognition_result}\\n\\n\"\n+ image_recognition_info += \"---\\n\\n\"\n+ \n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_recognition_info\n+ ))\n+ else:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"### 第 {i+1} 页图片识别失败\\n\\n- 尺寸: {width}x{height} 像素\\n- 格式: JPEG\\n- 保存路径: {saved_img_path}\\n\\n---\\n\\n\"\n+ ))\n+ except Exception as img_rec_error:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"### 第 {i+1} 页图片处理错误\\n\\n错误信息: {str(img_rec_error)}\\n\\n---\\n\\n\"\n+ ))\n+ else:\n+ # 不进行图像识别,只显示图片信息\n+ for i, img in enumerate(images):\n+ # 保存图片到临时目录\n+ img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n+ img.save(img_path, \"JPEG\", quality=80)\n+ \n+ # 获取图片尺寸\n+ width, height = img.size\n+ \n+ # 添加图片信息到Markdown\n+ image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n+ image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_markdown += f\"- 格式: JPEG\\n\"\n+ image_markdown += f\"- DPI: 150\\n\\n\"\n+ \n+ # 添加分隔线\n+ image_markdown += \"---\\n\\n\"\n \n- # 添加分隔线\n- image_markdown += \"---\\n\\n\"\n- \n- # 添加图片信息到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n+ # 添加图片信息到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_markdown\n+ ))\n except Exception as img_error:\n # 如果图片提取失败,添加错误信息但继续\n results.append(types.TextContent(\n type=\"text\",\n" }, { "date": 1741495204531, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -1,452 +1,1 @@\n-\"\"\"\n-PDF快速预览工具,仅提取文本内容,适用于大型PDF文件\n-\"\"\"\n-\n-import os\n-import tempfile\n-import shutil\n-import PyPDF2\n-import pymupdf4llm\n-import traceback\n-from typing import Dict, List, Any, Optional\n-import mcp.types as types\n-from . import BaseTool, ToolRegistry\n-from .image_recognition_tool import ImageRecognizer\n-\n-@ToolRegistry.register\n-class PdfTool(BaseTool):\n- \"\"\"PDF解析工具,用于解析PDF文件并提取文本和图片\"\"\"\n- name = \"file\"\n- description = \"解析PDF文件并提取文本和图片内容\"\n- input_schema = {\n- \"type\": \"object\",\n- \"required\": [\"file_path\"],\n- \"properties\": {\n- \"file_path\": {\n- \"type\": \"string\",\n- \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n- },\n- \"recognize_images\": {\n- \"type\": \"boolean\",\n- \"description\": \"是否使用大模型识别图片内容\",\n- },\n- \"image_save_dir\": {\n- \"type\": \"string\",\n- \"description\": \"图片保存目录,默认为/host_images\",\n- },\n- \"api_key\": {\n- \"type\": \"string\",\n- \"description\": \"大模型API密钥,用于图像识别\",\n- },\n- \"api_base_url\": {\n- \"type\": \"string\",\n- \"description\": \"大模型API基础URL,用于图像识别\",\n- }\n- },\n- }\n- \n- async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- \"\"\"\n- 解析PDF文档\n- \n- Args:\n- arguments: 参数字典,必须包含'file_path'键\n- \n- Returns:\n- PDF文本内容列表\n- \"\"\"\n- if \"file_path\" not in arguments:\n- return [types.TextContent(\n- type=\"text\",\n- text=\"错误: 缺少必要参数 'file_path'\"\n- )]\n- \n- # 获取参数\n- file_path = arguments[\"file_path\"]\n- recognize_images = arguments.get(\"recognize_images\", False)\n- image_save_dir = arguments.get(\"image_save_dir\", \"/host_images\")\n- api_key = arguments.get(\"api_key\", os.environ.get(\"LLM_API_KEY\", \"\"))\n- api_base_url = arguments.get(\"api_base_url\", os.environ.get(\"LLM_API_BASE_URL\", \"api.openai.com\"))\n- \n- # 如果启用图像识别但没有提供API密钥,返回错误\n- if recognize_images and not api_key:\n- return [types.TextContent(\n- type=\"text\",\n- text=\"错误: 启用图像识别功能需要提供API密钥,请通过参数提供或设置环境变量 LLM_API_KEY\"\n- )]\n- \n- return await self._parse_pdf(\n- file_path, \n- recognize_images=recognize_images,\n- image_save_dir=image_save_dir,\n- api_key=api_key,\n- api_base_url=api_base_url\n- )\n- \n- async def _parse_pdf(\n- self, \n- file_path: str, \n- recognize_images: bool = False,\n- image_save_dir: str = \"/host_images\",\n- api_key: str = \"\",\n- api_base_url: str = \"api.openai.com\"\n- ) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- \"\"\"\n- 解析PDF文件并提取文本和图片\n- \n- Args:\n- file_path: PDF文件路径\n- recognize_images: 是否使用大模型识别图片内容\n- image_save_dir: 图片保存目录\n- api_key: 大模型API密钥\n- api_base_url: 大模型API基础URL\n- \n- Returns:\n- PDF内容列表\n- \"\"\"\n- results = []\n- \n- # 检查文件是否存在\n- if not os.path.exists(file_path):\n- return [types.TextContent(\n- type=\"text\",\n- text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n- )]\n- \n- try:\n- # 创建临时目录用于存储图片\n- temp_dir = tempfile.mkdtemp()\n- image_path = os.path.join(temp_dir, \"images\")\n- os.makedirs(image_path, exist_ok=True)\n- \n- # 确保图片保存目录存在\n- if recognize_images:\n- os.makedirs(image_save_dir, exist_ok=True)\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"已启用图像识别功能,图片将保存到 {image_save_dir} 目录\"\n- ))\n- \n- # 添加文件大小信息\n- file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"文件大小: {file_size_mb:.2f} MB\"\n- ))\n- \n- # 对大文件提供警告\n- if file_size_mb > 10:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 文件较大 ({file_size_mb:.2f} MB),处理可能需要较长时间。\"\n- ))\n- \n- # 使用PymuPDF4llm提取PDF内容(包括文本和图像)\n- try:\n- # 获取PDF页数\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n- num_pages = len(reader.pages)\n- \n- # 限制处理的页数\n- max_pages = min(num_pages, 30)\n- pages_to_process = list(range(max_pages))\n- \n- # 使用PymuPDF4llm提取内容\n- md_content = pymupdf4llm.to_markdown(\n- doc=file_path,\n- pages=pages_to_process,\n- page_chunks=True,\n- write_images=True,\n- image_path=image_path,\n- image_format=\"jpg\",\n- dpi=150\n- )\n- \n- # 如果处理的页数少于总页数,添加提示\n- if max_pages < num_pages:\n- md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n- else:\n- md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n- \n- # 添加提取的内容到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=md_content\n- ))\n- \n- # 处理提取的图像\n- image_files = []\n- for root, dirs, files in os.walk(image_path):\n- for file in files:\n- if file.lower().endswith(('.jpg', '.jpeg', '.png')):\n- image_files.append(os.path.join(root, file))\n- \n- # 添加图像信息\n- if image_files:\n- image_info = \"\\n## 提取的图像信息\\n\\n\"\n- image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n- \n- # 如果启用了图像识别\n- if recognize_images and api_key:\n- image_info += \"正在使用大模型识别图片内容,这可能需要一些时间...\\n\\n\"\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_info\n- ))\n- \n- # 创建图像识别器\n- recognizer = ImageRecognizer(api_base_url, api_key)\n- \n- # 处理每张图片\n- for i, img_file in enumerate(image_files):\n- try:\n- # 复制图片到保存目录\n- img_filename = f\"pdf_image_{os.path.basename(file_path).replace('.pdf', '')}_{i+1}.jpg\"\n- saved_img_path = os.path.join(image_save_dir, img_filename)\n- shutil.copy2(img_file, saved_img_path)\n- \n- # 识别图片内容\n- recognition_result = await recognizer.recognize_image(\n- saved_img_path, \n- \"请详细描述这张图片的内容,包括图表、文字和视觉元素\"\n- )\n- \n- # 添加识别结果\n- if recognition_result:\n- image_recognition_info = f\"### 图像 {i+1} 识别结果\\n\\n\"\n- image_recognition_info += f\"- 文件名: {img_filename}\\n\"\n- image_recognition_info += f\"- 保存路径: {saved_img_path}\\n\\n\"\n- image_recognition_info += f\"**识别内容**:\\n\\n{recognition_result}\\n\\n\"\n- image_recognition_info += \"---\\n\\n\"\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_recognition_info\n- ))\n- else:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"### 图像 {i+1} 识别失败\\n\\n- 文件名: {img_filename}\\n- 保存路径: {saved_img_path}\\n\\n---\\n\\n\"\n- ))\n- except Exception as img_rec_error:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"### 图像 {i+1} 处理错误\\n\\n错误信息: {str(img_rec_error)}\\n\\n---\\n\\n\"\n- ))\n- else:\n- # 不进行图像识别,只显示图片信息\n- for i, img_file in enumerate(image_files):\n- try:\n- from PIL import Image\n- with Image.open(img_file) as img:\n- width, height = img.size\n- format_name = img.format\n- \n- image_info += f\"### 图像 {i+1}\\n\\n\"\n- image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_info += f\"- 格式: {format_name}\\n\\n\"\n- image_info += \"---\\n\\n\"\n- except Exception as e:\n- image_info += f\"### 图像 {i+1}\\n\\n\"\n- image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n- image_info += \"---\\n\\n\"\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_info\n- ))\n- else:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n- ))\n- \n- except Exception as extract_error:\n- # 如果PymuPDF4llm提取失败,回退到原来的方法\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用备用方法...\"\n- ))\n- \n- # 使用PyPDF2提取文本\n- text_content = \"\"\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n- num_pages = len(reader.pages)\n- \n- # 添加PDF元数据\n- text_content += f\"# PDF文档信息\\n\\n\"\n- text_content += f\"- 页数: {num_pages}\\n\"\n- if reader.metadata:\n- for key, value in reader.metadata.items():\n- if key.startswith('/'):\n- key = key[1:] # 移除前导斜杠\n- if value and str(value).strip():\n- text_content += f\"- {key}: {value}\\n\"\n- \n- # 提取文本 - 限制页数以提高性能\n- max_pages_to_process = min(num_pages, 30) # 限制处理的最大页数\n- if max_pages_to_process < num_pages:\n- text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages_to_process} 页内容。\\n\"\n- \n- text_content += \"\\n## 内容摘要\\n\\n\"\n- \n- # 逐页提取文本\n- for page_num in range(max_pages_to_process):\n- # 添加进度提示\n- if page_num % 5 == 0 and page_num > 0:\n- progress_msg = f\"已处理 {page_num}/{max_pages_to_process} 页...\"\n- results.append(types.TextContent(type=\"text\", text=progress_msg))\n- \n- page = reader.pages[page_num]\n- page_text = page.extract_text()\n- if page_text:\n- text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n- text_content += page_text + \"\\n\"\n- \n- # 添加文本内容到结果\n- results.append(types.TextContent(type=\"text\", text=text_content))\n- \n- # 尝试使用pdf2image提取图片\n- try:\n- # 限制处理的页数以提高性能\n- max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"正在提取图片,这可能需要一些时间...\"\n- ))\n- \n- # 转换PDF页面为图片并保存\n- from pdf2image import convert_from_path\n- images = convert_from_path(\n- file_path, \n- dpi=150, \n- fmt=\"jpg\", \n- first_page=1, \n- last_page=max_img_pages,\n- thread_count=2 # 使用多线程加速\n- )\n- \n- # 处理每个页面图片\n- image_markdown = \"\\n## 图片内容\\n\\n\"\n- \n- # 如果启用了图像识别\n- if recognize_images and api_key:\n- image_markdown += \"正在使用大模型识别图片内容,这可能需要一些时间...\\n\\n\"\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n- \n- # 创建图像识别器\n- recognizer = ImageRecognizer(api_base_url, api_key)\n- \n- # 处理每张图片\n- for i, img in enumerate(images):\n- try:\n- # 保存图片到临时目录和保存目录\n- temp_img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n- img.save(temp_img_path, \"JPEG\", quality=80)\n- \n- # 复制到保存目录\n- img_filename = f\"pdf_image_{os.path.basename(file_path).replace('.pdf', '')}_{i+1}.jpg\"\n- saved_img_path = os.path.join(image_save_dir, img_filename)\n- shutil.copy2(temp_img_path, saved_img_path)\n- \n- # 获取图片尺寸\n- width, height = img.size\n- \n- # 识别图片内容\n- recognition_result = await recognizer.recognize_image(\n- saved_img_path, \n- \"请详细描述这张图片的内容,包括图表、文字和视觉元素\"\n- )\n- \n- # 添加识别结果\n- if recognition_result:\n- image_recognition_info = f\"### 第 {i+1} 页图片识别结果\\n\\n\"\n- image_recognition_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_recognition_info += f\"- 格式: JPEG\\n\"\n- image_recognition_info += f\"- 保存路径: {saved_img_path}\\n\\n\"\n- image_recognition_info += f\"**识别内容**:\\n\\n{recognition_result}\\n\\n\"\n- image_recognition_info += \"---\\n\\n\"\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_recognition_info\n- ))\n- else:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"### 第 {i+1} 页图片识别失败\\n\\n- 尺寸: {width}x{height} 像素\\n- 格式: JPEG\\n- 保存路径: {saved_img_path}\\n\\n---\\n\\n\"\n- ))\n- except Exception as img_rec_error:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"### 第 {i+1} 页图片处理错误\\n\\n错误信息: {str(img_rec_error)}\\n\\n---\\n\\n\"\n- ))\n- else:\n- # 不进行图像识别,只显示图片信息\n- for i, img in enumerate(images):\n- # 保存图片到临时目录\n- img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n- img.save(img_path, \"JPEG\", quality=80)\n- \n- # 获取图片尺寸\n- width, height = img.size\n- \n- # 添加图片信息到Markdown\n- image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n- image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 150\\n\\n\"\n- \n- # 添加分隔线\n- image_markdown += \"---\\n\\n\"\n- \n- # 添加图片信息到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n- except Exception as img_error:\n- # 如果图片提取失败,添加错误信息但继续\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n- f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n- ))\n- \n- # 清理临时目录\n- shutil.rmtree(temp_dir)\n- \n- # 添加处理完成的提示\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"PDF处理完成!\"\n- ))\n- \n- return results\n- except Exception as e:\n- # 确保清理临时目录\n- if 'temp_dir' in locals() and os.path.exists(temp_dir):\n- shutil.rmtree(temp_dir)\n- \n- import traceback\n- error_details = traceback.format_exc()\n- return [\n- types.TextContent(\n- type=\"text\",\n- text=f\"错误: 解析PDF文件失败: {str(e)}\\n\"\n- f\"可能的原因:\\n\"\n- f\"1. 文件格式不兼容\\n\"\n- f\"2. 文件已加密或受密码保护\\n\"\n- f\"3. 系统缺少必要的依赖(如poppler-utils)\\n\"\n- f\"4. 文件太大,处理超时\\n\\n\"\n- f\"详细错误信息: {error_details}\"\n- )\n- ] \n\\ No newline at end of file\n+ \n\\ No newline at end of file\n" } ], "date": 1741332266981, "name": "Commit-0", "content": "\"\"\"\nPDF文档解析工具,提供完整的PDF解析功能,包括文本和图像提取\n\"\"\"\n\nimport os\nimport tempfile\nimport shutil\nimport PyPDF2\nimport pymupdf4llm\nimport traceback\nfrom typing import Dict, List, Any\nimport mcp.types as types\nfrom .base import BaseTool\nfrom .utils.pdf_helpers import extract_images_from_pdf\n\n\nclass PdfTool(BaseTool):\n \"\"\"\n 用于解析PDF文档的工具,提供完整的文本和图像提取功能\n \"\"\"\n \n @property\n def name(self) -> str:\n return \"file\"\n \n @property\n def description(self) -> str:\n return \"解析PDF文档并提取文本和图片内容\"\n \n @property\n def input_schema(self) -> Dict[str, Any]:\n return {\n \"type\": \"object\",\n \"required\": [\"file_path\"],\n \"properties\": {\n \"file_path\": {\n \"type\": \"string\",\n \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n }\n },\n }\n \n async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 解析PDF文档\n \n Args:\n arguments: 参数字典,必须包含'file_path'键\n \n Returns:\n PDF内容列表,包括文本和图像信息\n \"\"\"\n if \"file_path\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n text=\"错误: 缺少必要参数 'file_path'\"\n )]\n \n return await self._parse_pdf(arguments[\"file_path\"])\n \n async def _parse_pdf(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 解析PDF文档,提取文本和图像\n \n Args:\n file_path: PDF文件路径\n \n Returns:\n PDF内容列表,包括文本和图像信息\n \"\"\"\n results = []\n \n # 添加初始状态提示\n results.append(types.TextContent(\n type=\"text\",\n text=\"开始处理PDF文件,请稍候...\"\n ))\n \n # 检查文件是否存在\n if not os.path.exists(file_path):\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n )]\n \n try:\n # 创建临时目录用于存储图片\n temp_dir = tempfile.mkdtemp()\n image_path = os.path.join(temp_dir, \"images\")\n os.makedirs(image_path, exist_ok=True)\n \n # 添加文件大小信息\n file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n results.append(types.TextContent(\n type=\"text\",\n text=f\"文件大小: {file_size_mb:.2f} MB\"\n ))\n \n # 对大文件提供警告\n if file_size_mb > 10:\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 文件较大 ({file_size_mb:.2f} MB),处理可能需要较长时间。\"\n ))\n \n # 使用PymuPDF4llm提取PDF内容(包括文本和图像)\n try:\n # 获取PDF页数\n with open(file_path, 'rb') as file:\n reader = PyPDF2.PdfReader(file)\n num_pages = len(reader.pages)\n \n # 限制处理的页数\n max_pages = min(num_pages, 30)\n pages_to_process = list(range(max_pages))\n \n # 使用PymuPDF4llm提取内容\n md_content = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n page_chunks=True,\n write_images=True,\n image_path=image_path,\n image_format=\"jpg\",\n dpi=150\n )\n \n # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n else:\n md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n \n # 添加提取的内容到结果\n results.append(types.TextContent(\n type=\"text\",\n text=md_content\n ))\n \n # 使用PyMuPDF提取图片(更高级的图片提取方法)\n results.append(types.TextContent(\n type=\"text\",\n text=\"正在使用高级方法提取图片,这可能需要一些时间...\"\n ))\n \n # 提取图片\n image_info = await extract_images_from_pdf(file_path, image_path)\n \n # 添加图像信息\n if image_info:\n image_markdown = \"\\n## 提取的图像信息\\n\\n\"\n image_markdown += f\"共提取了 {len(image_info)} 张图像。\\n\\n\"\n \n for i, img_data in enumerate(image_info):\n image_markdown += f\"### 图像 {i+1}\\n\\n\"\n image_markdown += f\"- 文件名: {img_data['filename']}\\n\"\n image_markdown += f\"- 页码: {img_data['page']}\\n\"\n image_markdown += f\"- 尺寸: {img_data['width']}x{img_data['height']} 像素\\n\"\n image_markdown += f\"- 格式: {img_data['format']}\\n\"\n image_markdown += f\"- 大小: {img_data['size_bytes'] / 1024:.2f} KB\\n\"\n \n if 'type' in img_data and img_data['type'] == 'rendered_page':\n image_markdown += f\"- 类型: 渲染页面\\n\"\n else:\n image_markdown += f\"- 类型: 嵌入图像\\n\"\n \n image_markdown += \"\\n---\\n\\n\"\n \n results.append(types.TextContent(\n type=\"text\",\n text=image_markdown\n ))\n else:\n results.append(types.TextContent(\n type=\"text\",\n text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n ))\n \n except Exception as extract_error:\n # 如果PymuPDF4llm提取失败,记录错误并尝试使用备用方法\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用备用方法...\"\n ))\n \n # 这里可以添加备用提取方法,如PyPDF2等\n # 为简化起见,这个部分省略\n \n # 清理临时目录\n shutil.rmtree(temp_dir)\n \n # 添加处理完成的提示\n results.append(types.TextContent(\n type=\"text\",\n text=\"PDF处理完成!\"\n ))\n \n return results\n except Exception as e:\n # 确保清理临时目录\n if 'temp_dir' in locals() and os.path.exists(temp_dir):\n shutil.rmtree(temp_dir)\n \n error_details = traceback.format_exc()\n return [\n types.TextContent(\n type=\"text\",\n text=f\"错误: 解析PDF文件失败: {str(e)}\\n\"\n f\"可能的原因:\\n\"\n f\"1. 文件格式不兼容\\n\"\n f\"2. 文件已加密或受密码保护\\n\"\n f\"3. 系统缺少必要的依赖(如poppler-utils)\\n\"\n f\"4. 文件太大,处理超时\\n\\n\"\n f\"详细错误信息: {error_details}\"\n )\n ] " } ] }