MCP Development Framework

{ "sourceFile": "mcp_tool/tools/quick_pdf_tool.py", "activeCommit": 0, "commits": [ { "activePatchIndex": 1, "patches": [ { "date": 1741523138473, "content": "Index: \n===================================================================\n--- \n+++ \n" }, { "date": 1741523195946, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -6,17 +6,12 @@\n import fitz # PyMuPDF\n import PyPDF2\n import pymupdf4llm\n import traceback\n-import time\n-import asyncio\n from typing import Dict, List, Any\n import mcp.types as types\n from . import BaseTool, ToolRegistry\n \n-# 快速PDF处理超时时间(秒)\n-QUICK_PDF_TIMEOUT = int(os.environ.get('QUICK_PDF_TIMEOUT', '120')) # 默认2分钟\n-\n @ToolRegistry.register\n class QuickPdfTool(BaseTool):\n \"\"\"\n 用于快速预览PDF文件的工具,仅提取文本内容,不处理图片\n@@ -30,12 +25,8 @@\n \"properties\": {\n \"file_path\": {\n \"type\": \"string\",\n \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n- },\n- \"max_pages\": {\n- \"type\": \"integer\",\n- \"description\": \"最大处理页数,默认为50页\",\n }\n },\n }\n \n@@ -54,20 +45,16 @@\n type=\"text\",\n text=\"错误: 缺少必要参数 'file_path'\"\n )]\n \n- # 获取最大处理页数\n- max_pages = arguments.get(\"max_pages\", 50)\n- \n- return await self._quick_preview_pdf(arguments[\"file_path\"], max_pages)\n+ return await self._quick_preview_pdf(arguments[\"file_path\"])\n \n- async def _quick_preview_pdf(self, file_path: str, max_pages: int = 50) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n+ async def _quick_preview_pdf(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 快速预览PDF文件内容,不包含图片处理\n \n Args:\n file_path: PDF文件路径\n- max_pages: 最大处理页数\n \n Returns:\n PDF文本内容列表\n \"\"\"\n@@ -79,11 +66,8 @@\n type=\"text\",\n text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n )]\n \n- # 设置处理开始时间\n- start_time = time.time()\n- \n try:\n # 添加文件信息\n file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n results.append(types.TextContent(\n@@ -96,31 +80,12 @@\n reader = PyPDF2.PdfReader(file)\n num_pages = len(reader.pages)\n \n # 限制处理的页数\n- max_pages = min(num_pages, max_pages)\n+ max_pages = min(num_pages, 50) # 快速模式可以处理更多页\n pages_to_process = list(range(max_pages))\n \n- # 对大文件提供警告\n- if file_size_mb > 20:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 文件较大 ({file_size_mb:.2f} MB),处理可能需要较长时间。\"\n- ))\n- # 对于特别大的文件,进一步限制页数\n- if file_size_mb > 50 and max_pages > 20:\n- max_pages = 20\n- pages_to_process = list(range(max_pages))\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"由于文件过大,已自动限制处理页数为 {max_pages} 页。\"\n- ))\n- \n try:\n- # 检查是否超时\n- if time.time() - start_time > QUICK_PDF_TIMEOUT:\n- raise TimeoutError(f\"PDF处理超时,已经用时 {time.time() - start_time:.1f} 秒\")\n- \n # 尝试使用PyMuPDF提取文本(通常比PyPDF2更快更准确)\n pdf_document = fitz.open(file_path)\n \n # 提取文本内容\n@@ -144,13 +109,8 @@\n text_content += \"\\n## 内容摘要\\n\\n\"\n \n # 逐页提取文本\n for page_num in range(max_pages):\n- # 检查是否超时\n- if time.time() - start_time > QUICK_PDF_TIMEOUT:\n- text_content += f\"\\n> 警告: 处理超时,剩余 {max_pages - page_num} 页未处理。\\n\"\n- break\n- \n page = pdf_document[page_num]\n page_text = page.get_text()\n \n if page_text.strip():\n@@ -164,27 +124,16 @@\n type=\"text\",\n text=text_content\n ))\n \n- except TimeoutError as timeout_error:\n- # 如果处理超时,添加错误信息但继续返回已处理的结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: PDF处理超时: {str(timeout_error)}\\n\"\n- f\"已返回部分处理结果。您可以尝试减少处理的页数。\"\n- ))\n except Exception as pymupdf_error:\n # 如果PyMuPDF提取失败,回退到PymuPDF4llm\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 使用PyMuPDF提取内容失败: {str(pymupdf_error)}\\n正在尝试使用备用方法...\"\n ))\n \n try:\n- # 检查是否超时\n- if time.time() - start_time > QUICK_PDF_TIMEOUT:\n- raise TimeoutError(f\"PDF处理超时,已经用时 {time.time() - start_time:.1f} 秒\")\n- \n # 使用PymuPDF4llm提取内容,但不提取图像\n md_content = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n@@ -202,88 +151,54 @@\n results.append(types.TextContent(\n type=\"text\",\n text=md_content\n ))\n- except TimeoutError as timeout_error:\n- # 如果处理超时,添加错误信息但继续返回已处理的结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: PDF处理超时: {str(timeout_error)}\\n\"\n- f\"已返回部分处理结果。您可以尝试减少处理的页数。\"\n- ))\n except Exception as extract_error:\n # 如果PymuPDF4llm提取失败,回退到原来的方法\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用最后的备用方法...\"\n ))\n \n- try:\n- # 检查是否超时\n- if time.time() - start_time > QUICK_PDF_TIMEOUT:\n- raise TimeoutError(f\"PDF处理超时,已经用时 {time.time() - start_time:.1f} 秒\")\n+ # 使用PyPDF2提取文本\n+ text_content = \"\"\n+ with open(file_path, 'rb') as file:\n+ reader = PyPDF2.PdfReader(file)\n \n- # 使用PyPDF2提取文本\n- text_content = \"\"\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n- \n- # 添加PDF元数据\n- text_content += f\"## PDF文档信息\\n\\n\"\n- text_content += f\"- 页数: {num_pages}\\n\"\n- if reader.metadata:\n- for key, value in reader.metadata.items():\n- if key.startswith('/'):\n- key = key[1:]\n- if value and str(value).strip():\n- text_content += f\"- {key}: {value}\\n\"\n- \n- # 限制处理的页数\n- if max_pages < num_pages:\n- text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\"\n- \n- text_content += \"\\n## 内容摘要\\n\\n\"\n- \n- # 逐页提取文本\n- for page_num in range(max_pages):\n- # 检查是否超时\n- if time.time() - start_time > QUICK_PDF_TIMEOUT:\n- text_content += f\"\\n> 警告: 处理超时,剩余 {max_pages - page_num} 页未处理。\\n\"\n- break\n- \n- page = reader.pages[page_num]\n- page_text = page.extract_text()\n- if page_text:\n- text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n- text_content += page_text + \"\\n\"\n+ # 添加PDF元数据\n+ text_content += f\"## PDF文档信息\\n\\n\"\n+ text_content += f\"- 页数: {num_pages}\\n\"\n+ if reader.metadata:\n+ for key, value in reader.metadata.items():\n+ if key.startswith('/'):\n+ key = key[1:]\n+ if value and str(value).strip():\n+ text_content += f\"- {key}: {value}\\n\"\n \n- # 添加文本内容到结果\n- results.append(types.TextContent(type=\"text\", text=text_content))\n- except TimeoutError as timeout_error:\n- # 如果处理超时,添加错误信息但继续返回已处理的结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: PDF处理超时: {str(timeout_error)}\\n\"\n- f\"已返回部分处理结果。您可以尝试减少处理的页数。\"\n- ))\n+ # 限制处理的页数\n+ if max_pages < num_pages:\n+ text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\"\n+ \n+ text_content += \"\\n## 内容摘要\\n\\n\"\n+ \n+ # 逐页提取文本\n+ for page_num in range(max_pages):\n+ page = reader.pages[page_num]\n+ page_text = page.extract_text()\n+ if page_text:\n+ text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n+ text_content += page_text + \"\\n\"\n+ \n+ # 添加文本内容到结果\n+ results.append(types.TextContent(type=\"text\", text=text_content))\n \n- # 添加提示信息和处理时间\n- elapsed_time = time.time() - start_time\n+ # 添加提示信息\n results.append(types.TextContent(\n type=\"text\",\n- text=f\"\\n## 注意\\n\\n快速预览完成!总耗时: {elapsed_time:.1f} 秒\\n\\n如需查看图片内容,请使用完整的PDF解析工具。\"\n+ text=\"\\n## 注意\\n\\n快速预览完成!如需查看图片内容,请使用完整的PDF解析工具。\"\n ))\n \n return results\n- except asyncio.TimeoutError:\n- # 处理异步超时\n- return [types.TextContent(\n- type=\"text\",\n- text=f\"错误: PDF处理超时,已超过 {QUICK_PDF_TIMEOUT} 秒。\\n\"\n- f\"请尝试以下方法:\\n\"\n- f\"1. 减少处理的页数(使用max_pages参数)\\n\"\n- f\"2. 使用更小的PDF文件\"\n- )]\n except Exception as e:\n error_details = traceback.format_exc()\n return [types.TextContent(\n type=\"text\",\n" } ], "date": 1741523138473, "name": "Commit-0", "content": "\"\"\"\nPDF快速预览工具,仅提取文本内容,适用于大型PDF文件\n\"\"\"\n\nimport os\nimport fitz # PyMuPDF\nimport PyPDF2\nimport pymupdf4llm\nimport traceback\nimport time\nimport asyncio\nfrom typing import Dict, List, Any\nimport mcp.types as types\nfrom . import BaseTool, ToolRegistry\n\n# 快速PDF处理超时时间(秒)\nQUICK_PDF_TIMEOUT = int(os.environ.get('QUICK_PDF_TIMEOUT', '120')) # 默认2分钟\n\n@ToolRegistry.register\nclass QuickPdfTool(BaseTool):\n \"\"\"\n 用于快速预览PDF文件的工具,仅提取文本内容,不处理图片\n \"\"\"\n \n name = \"quick_pdf\"\n description = \"快速预览PDF文档内容(仅提取文本,不包含图片)\"\n input_schema = {\n \"type\": \"object\",\n \"required\": [\"file_path\"],\n \"properties\": {\n \"file_path\": {\n \"type\": \"string\",\n \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n },\n \"max_pages\": {\n \"type\": \"integer\",\n \"description\": \"最大处理页数,默认为50页\",\n }\n },\n }\n \n async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 快速预览PDF文档\n \n Args:\n arguments: 参数字典,必须包含'file_path'键\n \n Returns:\n PDF文本内容列表\n \"\"\"\n if \"file_path\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n text=\"错误: 缺少必要参数 'file_path'\"\n )]\n \n # 获取最大处理页数\n max_pages = arguments.get(\"max_pages\", 50)\n \n return await self._quick_preview_pdf(arguments[\"file_path\"], max_pages)\n \n async def _quick_preview_pdf(self, file_path: str, max_pages: int = 50) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 快速预览PDF文件内容,不包含图片处理\n \n Args:\n file_path: PDF文件路径\n max_pages: 最大处理页数\n \n Returns:\n PDF文本内容列表\n \"\"\"\n results = []\n \n # 检查文件是否存在\n if not os.path.exists(file_path):\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n )]\n \n # 设置处理开始时间\n start_time = time.time()\n \n try:\n # 添加文件信息\n file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n results.append(types.TextContent(\n type=\"text\",\n text=f\"# 快速预览模式 - 仅提取文本内容\\n\\n文件大小: {file_size_mb:.2f} MB\"\n ))\n \n # 获取PDF页数\n with open(file_path, 'rb') as file:\n reader = PyPDF2.PdfReader(file)\n num_pages = len(reader.pages)\n \n # 限制处理的页数\n max_pages = min(num_pages, max_pages)\n pages_to_process = list(range(max_pages))\n \n # 对大文件提供警告\n if file_size_mb > 20:\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 文件较大 ({file_size_mb:.2f} MB),处理可能需要较长时间。\"\n ))\n # 对于特别大的文件,进一步限制页数\n if file_size_mb > 50 and max_pages > 20:\n max_pages = 20\n pages_to_process = list(range(max_pages))\n results.append(types.TextContent(\n type=\"text\",\n text=f\"由于文件过大,已自动限制处理页数为 {max_pages} 页。\"\n ))\n \n try:\n # 检查是否超时\n if time.time() - start_time > QUICK_PDF_TIMEOUT:\n raise TimeoutError(f\"PDF处理超时,已经用时 {time.time() - start_time:.1f} 秒\")\n \n # 尝试使用PyMuPDF提取文本(通常比PyPDF2更快更准确)\n pdf_document = fitz.open(file_path)\n \n # 提取文本内容\n text_content = \"\"\n \n # 添加PDF元数据\n text_content += f\"## PDF文档信息\\n\\n\"\n text_content += f\"- 页数: {num_pages}\\n\"\n \n # 从PyMuPDF获取元数据\n metadata = pdf_document.metadata\n if metadata:\n for key, value in metadata.items():\n if value and str(value).strip():\n text_content += f\"- {key}: {value}\\n\"\n \n # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\"\n \n text_content += \"\\n## 内容摘要\\n\\n\"\n \n # 逐页提取文本\n for page_num in range(max_pages):\n # 检查是否超时\n if time.time() - start_time > QUICK_PDF_TIMEOUT:\n text_content += f\"\\n> 警告: 处理超时,剩余 {max_pages - page_num} 页未处理。\\n\"\n break\n \n page = pdf_document[page_num]\n page_text = page.get_text()\n \n if page_text.strip():\n text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n text_content += page_text.strip() + \"\\n\"\n \n pdf_document.close()\n \n # 添加提取的内容到结果\n results.append(types.TextContent(\n type=\"text\",\n text=text_content\n ))\n \n except TimeoutError as timeout_error:\n # 如果处理超时,添加错误信息但继续返回已处理的结果\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: PDF处理超时: {str(timeout_error)}\\n\"\n f\"已返回部分处理结果。您可以尝试减少处理的页数。\"\n ))\n except Exception as pymupdf_error:\n # 如果PyMuPDF提取失败,回退到PymuPDF4llm\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 使用PyMuPDF提取内容失败: {str(pymupdf_error)}\\n正在尝试使用备用方法...\"\n ))\n \n try:\n # 检查是否超时\n if time.time() - start_time > QUICK_PDF_TIMEOUT:\n raise TimeoutError(f\"PDF处理超时,已经用时 {time.time() - start_time:.1f} 秒\")\n \n # 使用PymuPDF4llm提取内容,但不提取图像\n md_content = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n page_chunks=True,\n write_images=False # 不提取图像\n )\n \n # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n else:\n md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n \n # 添加提取的内容到结果\n results.append(types.TextContent(\n type=\"text\",\n text=md_content\n ))\n except TimeoutError as timeout_error:\n # 如果处理超时,添加错误信息但继续返回已处理的结果\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: PDF处理超时: {str(timeout_error)}\\n\"\n f\"已返回部分处理结果。您可以尝试减少处理的页数。\"\n ))\n except Exception as extract_error:\n # 如果PymuPDF4llm提取失败,回退到原来的方法\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用最后的备用方法...\"\n ))\n \n try:\n # 检查是否超时\n if time.time() - start_time > QUICK_PDF_TIMEOUT:\n raise TimeoutError(f\"PDF处理超时,已经用时 {time.time() - start_time:.1f} 秒\")\n \n # 使用PyPDF2提取文本\n text_content = \"\"\n with open(file_path, 'rb') as file:\n reader = PyPDF2.PdfReader(file)\n \n # 添加PDF元数据\n text_content += f\"## PDF文档信息\\n\\n\"\n text_content += f\"- 页数: {num_pages}\\n\"\n if reader.metadata:\n for key, value in reader.metadata.items():\n if key.startswith('/'):\n key = key[1:]\n if value and str(value).strip():\n text_content += f\"- {key}: {value}\\n\"\n \n # 限制处理的页数\n if max_pages < num_pages:\n text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\"\n \n text_content += \"\\n## 内容摘要\\n\\n\"\n \n # 逐页提取文本\n for page_num in range(max_pages):\n # 检查是否超时\n if time.time() - start_time > QUICK_PDF_TIMEOUT:\n text_content += f\"\\n> 警告: 处理超时,剩余 {max_pages - page_num} 页未处理。\\n\"\n break\n \n page = reader.pages[page_num]\n page_text = page.extract_text()\n if page_text:\n text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n text_content += page_text + \"\\n\"\n \n # 添加文本内容到结果\n results.append(types.TextContent(type=\"text\", text=text_content))\n except TimeoutError as timeout_error:\n # 如果处理超时,添加错误信息但继续返回已处理的结果\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: PDF处理超时: {str(timeout_error)}\\n\"\n f\"已返回部分处理结果。您可以尝试减少处理的页数。\"\n ))\n \n # 添加提示信息和处理时间\n elapsed_time = time.time() - start_time\n results.append(types.TextContent(\n type=\"text\",\n text=f\"\\n## 注意\\n\\n快速预览完成!总耗时: {elapsed_time:.1f} 秒\\n\\n如需查看图片内容,请使用完整的PDF解析工具。\"\n ))\n \n return results\n except asyncio.TimeoutError:\n # 处理异步超时\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: PDF处理超时,已超过 {QUICK_PDF_TIMEOUT} 秒。\\n\"\n f\"请尝试以下方法:\\n\"\n f\"1. 减少处理的页数(使用max_pages参数)\\n\"\n f\"2. 使用更小的PDF文件\"\n )]\n except Exception as e:\n error_details = traceback.format_exc()\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: 快速预览PDF失败: {str(e)}\\n详细错误信息: {error_details}\"\n )] " } ] }