MCP Development Framework
by aigo666
{
"sourceFile": "mcp_tool/tools/pdf_tool.py",
"activeCommit": 0,
"commits": [
{
"activePatchIndex": 7,
"patches": [
{
"date": 1741521106081,
"content": "Index: \n===================================================================\n--- \n+++ \n"
},
{
"date": 1741522329863,
"content": "Index: \n===================================================================\n--- \n+++ \n@@ -13,8 +13,10 @@\n # 图像保存目录,与Docker挂载卷对应\n IMAGE_SAVE_DIR = os.environ.get('IMAGE_SAVE_DIR', '/img')\n # 是否启用图像识别\n ENABLE_IMAGE_RECOGNITION = os.environ.get('ENABLE_IMAGE_RECOGNITION', 'true').lower() == 'true'\n+# 最大处理图像数量\n+MAX_IMAGES_TO_PROCESS = int(os.environ.get('MAX_IMAGES_TO_PROCESS', '10'))\n \n @ToolRegistry.register\n class PdfTool(BaseTool):\n \"\"\"PDF解析工具,用于解析PDF文件并提取文本和图片\"\"\"\n@@ -30,8 +32,12 @@\n },\n \"enable_image_recognition\": {\n \"type\": \"boolean\",\n \"description\": \"是否启用图像识别功能,默认为环境变量设置或true\",\n+ },\n+ \"max_images\": {\n+ \"type\": \"integer\",\n+ \"description\": \"最大处理图像数量,默认为环境变量设置或10\",\n }\n },\n }\n \n@@ -45,8 +51,10 @@\n \n file_path = arguments[\"file_path\"]\n # 获取是否启用图像识别的参数,默认使用环境变量设置\n enable_image_recognition = arguments.get(\"enable_image_recognition\", ENABLE_IMAGE_RECOGNITION)\n+ # 获取最大处理图像数量\n+ max_images = arguments.get(\"max_images\", MAX_IMAGES_TO_PROCESS)\n \n results = []\n \n # 添加初始状态提示\n@@ -133,8 +141,16 @@\n for file in files:\n if file.lower().endswith(('.jpg', '.jpeg', '.png')):\n image_files.append(os.path.join(root, file))\n \n+ # 限制处理的图像数量\n+ if len(image_files) > max_images:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 提取的图像数量 ({len(image_files)}) 超过了最大处理限制 ({max_images}),仅处理前 {max_images} 张图像。\"\n+ ))\n+ image_files = image_files[:max_images]\n+ \n # 添加图像信息\n if image_files:\n image_info = \"\\n## 提取的图像信息\\n\\n\"\n image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n@@ -142,8 +158,11 @@\n # 如果启用了图像识别,添加提示\n if enable_image_recognition:\n image_info += \"图像识别功能已启用,将尝试识别图像内容。\\n\\n\"\n \n+ recognition_success_count = 0\n+ recognition_fail_count = 0\n+ \n for i, img_file in enumerate(image_files):\n try:\n with Image.open(img_file) as img:\n width, height = img.size\n@@ -168,20 +187,39 @@\n recognition_result = recognize_image(saved_img_path)\n \n if recognition_result:\n image_info += f\"\\n**图像识别结果**:\\n\\n{recognition_result}\\n\"\n+ recognition_success_count += 1\n else:\n image_info += \"\\n**图像识别失败**\\n\"\n+ image_info += \"可能原因:API请求超时、图像格式不支持或识别服务暂时不可用。\\n\"\n+ recognition_fail_count += 1\n except Exception as recog_error:\n image_info += f\"\\n**图像识别过程中出错**: {str(recog_error)}\\n\"\n+ recognition_fail_count += 1\n \n image_info += \"\\n---\\n\\n\"\n except Exception as e:\n image_info += f\"### 图像 {i+1}\\n\\n\"\n image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n image_info += \"---\\n\\n\"\n \n+ # 添加图像识别统计信息\n+ if enable_image_recognition:\n+ image_info += f\"\\n### 图像识别统计\\n\\n\"\n+ image_info += f\"- 总图像数: {len(image_files)}\\n\"\n+ image_info += f\"- 成功识别: {recognition_success_count}\\n\"\n+ image_info += f\"- 识别失败: {recognition_fail_count}\\n\"\n+ \n+ if recognition_fail_count > 0:\n+ image_info += \"\\n如果图像识别失败,可能是由于以下原因:\\n\"\n+ image_info += \"1. API请求超时 - 服务器响应时间过长\\n\"\n+ image_info += \"2. 图像格式不支持 - 某些特殊格式可能无法被识别\\n\"\n+ image_info += \"3. 识别服务暂时不可用 - API服务可能暂时中断\\n\"\n+ image_info += \"4. 图像质量问题 - 图像分辨率过低或内容不清晰\\n\"\n+ image_info += \"\\n您可以尝试重新运行工具,或者调整超时设置后再试。\\n\"\n+ \n results.append(types.TextContent(\n type=\"text\",\n text=image_info\n ))\n@@ -256,15 +294,26 @@\n last_page=max_img_pages,\n thread_count=2 # 使用多线程加速\n )\n \n+ # 限制处理的图像数量\n+ if len(images) > max_images:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 提取的图像数量 ({len(images)}) 超过了最大处理限制 ({max_images}),仅处理前 {max_images} 张图像。\"\n+ ))\n+ images = images[:max_images]\n+ \n # 处理每个页面图片\n image_markdown = \"\\n## 图片内容\\n\\n\"\n \n # 如果启用了图像识别,添加提示\n if enable_image_recognition:\n image_markdown += \"图像识别功能已启用,将尝试识别图像内容。\\n\\n\"\n \n+ recognition_success_count = 0\n+ recognition_fail_count = 0\n+ \n for i, img in enumerate(images):\n # 保存图片到临时目录\n img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n img.save(img_path, \"JPEG\", quality=80)\n@@ -292,16 +341,35 @@\n recognition_result = recognize_image(saved_img_path)\n \n if recognition_result:\n image_markdown += f\"\\n**图像识别结果**:\\n\\n{recognition_result}\\n\"\n+ recognition_success_count += 1\n else:\n image_markdown += \"\\n**图像识别失败**\\n\"\n+ image_markdown += \"可能原因:API请求超时、图像格式不支持或识别服务暂时不可用。\\n\"\n+ recognition_fail_count += 1\n except Exception as recog_error:\n image_markdown += f\"\\n**图像识别过程中出错**: {str(recog_error)}\\n\"\n+ recognition_fail_count += 1\n \n # 添加分隔线\n image_markdown += \"\\n---\\n\\n\"\n \n+ # 添加图像识别统计信息\n+ if enable_image_recognition:\n+ image_markdown += f\"\\n### 图像识别统计\\n\\n\"\n+ image_markdown += f\"- 总图像数: {len(images)}\\n\"\n+ image_markdown += f\"- 成功识别: {recognition_success_count}\\n\"\n+ image_markdown += f\"- 识别失败: {recognition_fail_count}\\n\"\n+ \n+ if recognition_fail_count > 0:\n+ image_markdown += \"\\n如果图像识别失败,可能是由于以下原因:\\n\"\n+ image_markdown += \"1. API请求超时 - 服务器响应时间过长\\n\"\n+ image_markdown += \"2. 图像格式不支持 - 某些特殊格式可能无法被识别\\n\"\n+ image_markdown += \"3. 识别服务暂时不可用 - API服务可能暂时中断\\n\"\n+ image_markdown += \"4. 图像质量问题 - 图像分辨率过低或内容不清晰\\n\"\n+ image_markdown += \"\\n您可以尝试重新运行工具,或者调整超时设置后再试。\\n\"\n+ \n # 添加图片信息到结果\n results.append(types.TextContent(\n type=\"text\",\n text=image_markdown\n"
},
{
"date": 1741523026140,
"content": "Index: \n===================================================================\n--- \n+++ \n@@ -2,8 +2,10 @@\n import tempfile\n import shutil\n import uuid\n import PyPDF2\n+import time\n+import asyncio\n from pdf2image import convert_from_path\n from PIL import Image\n import pymupdf4llm\n import mcp.types as types\n@@ -15,8 +17,12 @@\n # 是否启用图像识别\n ENABLE_IMAGE_RECOGNITION = os.environ.get('ENABLE_IMAGE_RECOGNITION', 'true').lower() == 'true'\n # 最大处理图像数量\n MAX_IMAGES_TO_PROCESS = int(os.environ.get('MAX_IMAGES_TO_PROCESS', '10'))\n+# PDF处理总超时时间(秒)\n+PDF_PROCESSING_TIMEOUT = int(os.environ.get('PDF_PROCESSING_TIMEOUT', '300')) # 默认5分钟\n+# 是否启用快速模式(减少处理量以提高速度)\n+FAST_MODE = os.environ.get('PDF_FAST_MODE', 'false').lower() == 'true'\n \n @ToolRegistry.register\n class PdfTool(BaseTool):\n \"\"\"PDF解析工具,用于解析PDF文件并提取文本和图片\"\"\"\n@@ -36,8 +42,12 @@\n },\n \"max_images\": {\n \"type\": \"integer\",\n \"description\": \"最大处理图像数量,默认为环境变量设置或10\",\n+ },\n+ \"fast_mode\": {\n+ \"type\": \"boolean\",\n+ \"description\": \"是否启用快速模式,减少处理量以提高速度,默认为环境变量设置\",\n }\n },\n }\n \n@@ -53,8 +63,10 @@\n # 获取是否启用图像识别的参数,默认使用环境变量设置\n enable_image_recognition = arguments.get(\"enable_image_recognition\", ENABLE_IMAGE_RECOGNITION)\n # 获取最大处理图像数量\n max_images = arguments.get(\"max_images\", MAX_IMAGES_TO_PROCESS)\n+ # 获取是否启用快速模式\n+ fast_mode = arguments.get(\"fast_mode\", FAST_MODE)\n \n results = []\n \n # 添加初始状态提示\n@@ -69,14 +81,17 @@\n type=\"text\",\n text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n )]\n \n+ # 创建临时目录用于存储图片\n+ temp_dir = tempfile.mkdtemp()\n+ image_path = os.path.join(temp_dir, \"images\")\n+ os.makedirs(image_path, exist_ok=True)\n+ \n+ # 设置处理开始时间\n+ start_time = time.time()\n+ \n try:\n- # 创建临时目录用于存储图片\n- temp_dir = tempfile.mkdtemp()\n- image_path = os.path.join(temp_dir, \"images\")\n- os.makedirs(image_path, exist_ok=True)\n- \n # 确保图像保存目录存在\n if enable_image_recognition and not os.path.exists(IMAGE_SAVE_DIR):\n try:\n os.makedirs(IMAGE_SAVE_DIR, exist_ok=True)\n@@ -99,20 +114,42 @@\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 文件较大 ({file_size_mb:.2f} MB),处理可能需要较长时间。\"\n ))\n+ # 对于大文件,自动启用快速模式\n+ if file_size_mb > 20 and not fast_mode:\n+ fast_mode = True\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=\"已自动启用快速模式以提高处理速度。\"\n+ ))\n \n+ # 如果启用了快速模式,减少处理量\n+ if fast_mode:\n+ max_pages = 10 # 快速模式下最多处理10页\n+ max_images = min(max_images, 3) # 快速模式下最多处理3张图像\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"快速模式已启用: 最多处理{max_pages}页和{max_images}张图像。\"\n+ ))\n+ else:\n+ max_pages = 30 # 正常模式下最多处理30页\n+ \n # 使用PymuPDF4llm提取PDF内容(包括文本和图像)\n try:\n # 获取PDF页数\n with open(file_path, 'rb') as file:\n reader = PyPDF2.PdfReader(file)\n num_pages = len(reader.pages)\n \n # 限制处理的页数\n- max_pages = min(num_pages, 30)\n+ max_pages = min(num_pages, max_pages)\n pages_to_process = list(range(max_pages))\n \n+ # 检查是否超时\n+ if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n+ raise TimeoutError(f\"PDF处理超时,已经用时 {time.time() - start_time:.1f} 秒\")\n+ \n # 使用PymuPDF4llm提取内容\n md_content = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n@@ -162,8 +199,13 @@\n recognition_success_count = 0\n recognition_fail_count = 0\n \n for i, img_file in enumerate(image_files):\n+ # 检查是否超时\n+ if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n+ image_info += f\"\\n**警告: 处理超时,剩余 {len(image_files) - i} 张图像未处理**\\n\"\n+ break\n+ \n try:\n with Image.open(img_file) as img:\n width, height = img.size\n format_name = img.format\n@@ -228,8 +270,15 @@\n type=\"text\",\n text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n ))\n \n+ except TimeoutError as timeout_error:\n+ # 如果处理超时,添加错误信息但继续返回已处理的结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: PDF处理超时: {str(timeout_error)}\\n\"\n+ f\"已返回部分处理结果。您可以尝试使用快速模式或减少处理的页数和图像数量。\"\n+ ))\n except Exception as extract_error:\n # 如果PymuPDF4llm提取失败,回退到原来的方法\n results.append(types.TextContent(\n type=\"text\",\n@@ -252,16 +301,21 @@\n if value and str(value).strip():\n text_content += f\"- {key}: {value}\\n\"\n \n # 提取文本 - 限制页数以提高性能\n- max_pages_to_process = min(num_pages, 30) # 限制处理的最大页数\n+ max_pages_to_process = min(num_pages, max_pages) # 限制处理的最大页数\n if max_pages_to_process < num_pages:\n text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages_to_process} 页内容。\\n\"\n \n text_content += \"\\n## 内容摘要\\n\\n\"\n \n # 逐页提取文本\n for page_num in range(max_pages_to_process):\n+ # 检查是否超时\n+ if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n+ text_content += f\"\\n> 警告: 处理超时,剩余 {max_pages_to_process - page_num} 页未处理。\\n\"\n+ break\n+ \n # 添加进度提示\n if page_num % 5 == 0 and page_num > 0:\n progress_msg = f\"已处理 {page_num}/{max_pages_to_process} 页...\"\n results.append(types.TextContent(type=\"text\", text=progress_msg))\n@@ -274,125 +328,154 @@\n \n # 添加文本内容到结果\n results.append(types.TextContent(type=\"text\", text=text_content))\n \n- # 尝试使用pdf2image提取图片\n- try:\n- # 限制处理的页数以提高性能\n- max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n- \n+ # 如果已经超时,跳过图像提取\n+ if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n results.append(types.TextContent(\n type=\"text\",\n- text=f\"正在提取图片,这可能需要一些时间...\"\n+ text=\"警告: 处理已超时,跳过图像提取步骤。\"\n ))\n- \n- # 转换PDF页面为图片并保存\n- images = convert_from_path(\n- file_path, \n- dpi=150, \n- fmt=\"jpg\", \n- first_page=1, \n- last_page=max_img_pages,\n- thread_count=2 # 使用多线程加速\n- )\n- \n- # 限制处理的图像数量\n- if len(images) > max_images:\n+ else:\n+ # 尝试使用pdf2image提取图片\n+ try:\n+ # 限制处理的页数以提高性能\n+ max_img_pages = min(num_pages, 5 if fast_mode else 10) # 限制处理图片的最大页数\n+ \n results.append(types.TextContent(\n type=\"text\",\n- text=f\"警告: 提取的图像数量 ({len(images)}) 超过了最大处理限制 ({max_images}),仅处理前 {max_images} 张图像。\"\n+ text=f\"正在提取图片,这可能需要一些时间...\"\n ))\n- images = images[:max_images]\n- \n- # 处理每个页面图片\n- image_markdown = \"\\n## 图片内容\\n\\n\"\n- \n- # 如果启用了图像识别,添加提示\n- if enable_image_recognition:\n- image_markdown += \"图像识别功能已启用,将尝试识别图像内容。\\n\\n\"\n- \n- recognition_success_count = 0\n- recognition_fail_count = 0\n- \n- for i, img in enumerate(images):\n- # 保存图片到临时目录\n- img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n- img.save(img_path, \"JPEG\", quality=80)\n \n- # 获取图片尺寸\n- width, height = img.size\n+ # 转换PDF页面为图片并保存\n+ images = convert_from_path(\n+ file_path, \n+ dpi=150, \n+ fmt=\"jpg\", \n+ first_page=1, \n+ last_page=max_img_pages,\n+ thread_count=2 # 使用多线程加速\n+ )\n \n- # 添加图片信息到Markdown\n- image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n- image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 150\\n\"\n+ # 限制处理的图像数量\n+ if len(images) > max_images:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 提取的图像数量 ({len(images)}) 超过了最大处理限制 ({max_images}),仅处理前 {max_images} 张图像。\"\n+ ))\n+ images = images[:max_images]\n \n- # 如果启用了图像识别,保存图像并进行识别\n+ # 处理每个页面图片\n+ image_markdown = \"\\n## 图片内容\\n\\n\"\n+ \n+ # 如果启用了图像识别,添加提示\n if enable_image_recognition:\n- try:\n- # 生成唯一文件名\n- unique_filename = f\"{uuid.uuid4().hex}_page_{i+1}.jpg\"\n- saved_img_path = os.path.join(IMAGE_SAVE_DIR, unique_filename)\n+ image_markdown += \"图像识别功能已启用,将尝试识别图像内容。\\n\\n\"\n+ \n+ recognition_success_count = 0\n+ recognition_fail_count = 0\n+ \n+ for i, img in enumerate(images):\n+ # 检查是否超时\n+ if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n+ image_markdown += f\"\\n**警告: 处理超时,剩余 {len(images) - i} 张图像未处理**\\n\"\n+ break\n \n- # 复制图像到保存目录\n- shutil.copy2(img_path, saved_img_path)\n- \n- # 进行图像识别\n- recognition_result = recognize_image(saved_img_path)\n- \n- if recognition_result:\n- image_markdown += f\"\\n**图像识别结果**:\\n\\n{recognition_result}\\n\"\n- recognition_success_count += 1\n- else:\n- image_markdown += \"\\n**图像识别失败**\\n\"\n- image_markdown += \"可能原因:API请求超时、图像格式不支持或识别服务暂时不可用。\\n\"\n+ # 保存图片到临时目录\n+ img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n+ img.save(img_path, \"JPEG\", quality=80)\n+ \n+ # 获取图片尺寸\n+ width, height = img.size\n+ \n+ # 添加图片信息到Markdown\n+ image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n+ image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_markdown += f\"- 格式: JPEG\\n\"\n+ image_markdown += f\"- DPI: 150\\n\"\n+ \n+ # 如果启用了图像识别,保存图像并进行识别\n+ if enable_image_recognition:\n+ try:\n+ # 生成唯一文件名\n+ unique_filename = f\"{uuid.uuid4().hex}_page_{i+1}.jpg\"\n+ saved_img_path = os.path.join(IMAGE_SAVE_DIR, unique_filename)\n+ \n+ # 复制图像到保存目录\n+ shutil.copy2(img_path, saved_img_path)\n+ \n+ # 进行图像识别\n+ recognition_result = recognize_image(saved_img_path)\n+ \n+ if recognition_result:\n+ image_markdown += f\"\\n**图像识别结果**:\\n\\n{recognition_result}\\n\"\n+ recognition_success_count += 1\n+ else:\n+ image_markdown += \"\\n**图像识别失败**\\n\"\n+ image_markdown += \"可能原因:API请求超时、图像格式不支持或识别服务暂时不可用。\\n\"\n+ recognition_fail_count += 1\n+ except Exception as recog_error:\n+ image_markdown += f\"\\n**图像识别过程中出错**: {str(recog_error)}\\n\"\n recognition_fail_count += 1\n- except Exception as recog_error:\n- image_markdown += f\"\\n**图像识别过程中出错**: {str(recog_error)}\\n\"\n- recognition_fail_count += 1\n+ \n+ # 添加分隔线\n+ image_markdown += \"\\n---\\n\\n\"\n \n- # 添加分隔线\n- image_markdown += \"\\n---\\n\\n\"\n- \n- # 添加图像识别统计信息\n- if enable_image_recognition:\n- image_markdown += f\"\\n### 图像识别统计\\n\\n\"\n- image_markdown += f\"- 总图像数: {len(images)}\\n\"\n- image_markdown += f\"- 成功识别: {recognition_success_count}\\n\"\n- image_markdown += f\"- 识别失败: {recognition_fail_count}\\n\"\n+ # 添加图像识别统计信息\n+ if enable_image_recognition:\n+ image_markdown += f\"\\n### 图像识别统计\\n\\n\"\n+ image_markdown += f\"- 总图像数: {len(images)}\\n\"\n+ image_markdown += f\"- 成功识别: {recognition_success_count}\\n\"\n+ image_markdown += f\"- 识别失败: {recognition_fail_count}\\n\"\n+ \n+ if recognition_fail_count > 0:\n+ image_markdown += \"\\n如果图像识别失败,可能是由于以下原因:\\n\"\n+ image_markdown += \"1. API请求超时 - 服务器响应时间过长\\n\"\n+ image_markdown += \"2. 图像格式不支持 - 某些特殊格式可能无法被识别\\n\"\n+ image_markdown += \"3. 识别服务暂时不可用 - API服务可能暂时中断\\n\"\n+ image_markdown += \"4. 图像质量问题 - 图像分辨率过低或内容不清晰\\n\"\n+ image_markdown += \"\\n您可以尝试重新运行工具,或者调整超时设置后再试。\\n\"\n \n- if recognition_fail_count > 0:\n- image_markdown += \"\\n如果图像识别失败,可能是由于以下原因:\\n\"\n- image_markdown += \"1. API请求超时 - 服务器响应时间过长\\n\"\n- image_markdown += \"2. 图像格式不支持 - 某些特殊格式可能无法被识别\\n\"\n- image_markdown += \"3. 识别服务暂时不可用 - API服务可能暂时中断\\n\"\n- image_markdown += \"4. 图像质量问题 - 图像分辨率过低或内容不清晰\\n\"\n- image_markdown += \"\\n您可以尝试重新运行工具,或者调整超时设置后再试。\\n\"\n- \n- # 添加图片信息到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n- except Exception as img_error:\n- # 如果图片提取失败,添加错误信息但继续\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n- f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n- ))\n+ # 添加图片信息到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_markdown\n+ ))\n+ except Exception as img_error:\n+ # 如果图片提取失败,添加错误信息但继续\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n+ f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n+ ))\n \n # 清理临时目录\n shutil.rmtree(temp_dir)\n \n- # 添加处理完成的提示\n+ # 添加处理完成的提示和处理时间\n+ elapsed_time = time.time() - start_time\n results.append(types.TextContent(\n type=\"text\",\n- text=\"PDF处理完成!\"\n+ text=f\"PDF处理完成!总耗时: {elapsed_time:.1f} 秒\"\n ))\n \n return results\n+ except asyncio.TimeoutError:\n+ # 处理异步超时\n+ if 'temp_dir' in locals() and os.path.exists(temp_dir):\n+ shutil.rmtree(temp_dir)\n+ \n+ return [\n+ types.TextContent(\n+ type=\"text\",\n+ text=f\"错误: PDF处理超时,已超过 {PDF_PROCESSING_TIMEOUT} 秒。\\n\"\n+ f\"请尝试以下方法:\\n\"\n+ f\"1. 使用快速模式处理文件\\n\"\n+ f\"2. 减少处理的页数和图像数量\\n\"\n+ f\"3. 使用更小的PDF文件\\n\"\n+ f\"4. 使用quick_pdf工具进行快速预览\"\n+ )\n+ ]\n except Exception as e:\n # 确保清理临时目录\n if 'temp_dir' in locals() and os.path.exists(temp_dir):\n shutil.rmtree(temp_dir)\n@@ -407,7 +490,11 @@\n f\"1. 文件格式不兼容\\n\"\n f\"2. 文件已加密或受密码保护\\n\"\n f\"3. 系统缺少必要的依赖(如poppler-utils)\\n\"\n f\"4. 文件太大,处理超时\\n\\n\"\n+ f\"建议:\\n\"\n+ f\"1. 使用quick_pdf工具进行快速预览\\n\"\n+ f\"2. 使用fast_mode=true参数减少处理量\\n\"\n+ f\"3. 减小文件大小或分割文件\\n\\n\"\n f\"详细错误信息: {error_details}\"\n )\n ] \n\\ No newline at end of file\n"
},
{
"date": 1741523195526,
"content": "Index: \n===================================================================\n--- \n+++ \n@@ -1,29 +1,14 @@\n import os\n import tempfile\n import shutil\n-import uuid\n import PyPDF2\n-import time\n-import asyncio\n from pdf2image import convert_from_path\n from PIL import Image\n import pymupdf4llm\n import mcp.types as types\n from . import BaseTool, ToolRegistry\n-from .image_recognition import recognize_image\n \n-# 图像保存目录,与Docker挂载卷对应\n-IMAGE_SAVE_DIR = os.environ.get('IMAGE_SAVE_DIR', '/img')\n-# 是否启用图像识别\n-ENABLE_IMAGE_RECOGNITION = os.environ.get('ENABLE_IMAGE_RECOGNITION', 'true').lower() == 'true'\n-# 最大处理图像数量\n-MAX_IMAGES_TO_PROCESS = int(os.environ.get('MAX_IMAGES_TO_PROCESS', '10'))\n-# PDF处理总超时时间(秒)\n-PDF_PROCESSING_TIMEOUT = int(os.environ.get('PDF_PROCESSING_TIMEOUT', '300')) # 默认5分钟\n-# 是否启用快速模式(减少处理量以提高速度)\n-FAST_MODE = os.environ.get('PDF_FAST_MODE', 'false').lower() == 'true'\n-\n @ToolRegistry.register\n class PdfTool(BaseTool):\n \"\"\"PDF解析工具,用于解析PDF文件并提取文本和图片\"\"\"\n name = \"file\"\n@@ -34,20 +19,8 @@\n \"properties\": {\n \"file_path\": {\n \"type\": \"string\",\n \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n- },\n- \"enable_image_recognition\": {\n- \"type\": \"boolean\",\n- \"description\": \"是否启用图像识别功能,默认为环境变量设置或true\",\n- },\n- \"max_images\": {\n- \"type\": \"integer\",\n- \"description\": \"最大处理图像数量,默认为环境变量设置或10\",\n- },\n- \"fast_mode\": {\n- \"type\": \"boolean\",\n- \"description\": \"是否启用快速模式,减少处理量以提高速度,默认为环境变量设置\",\n }\n },\n }\n \n@@ -59,15 +32,8 @@\n text=\"Error: Missing required argument 'file_path'\"\n )]\n \n file_path = arguments[\"file_path\"]\n- # 获取是否启用图像识别的参数,默认使用环境变量设置\n- enable_image_recognition = arguments.get(\"enable_image_recognition\", ENABLE_IMAGE_RECOGNITION)\n- # 获取最大处理图像数量\n- max_images = arguments.get(\"max_images\", MAX_IMAGES_TO_PROCESS)\n- # 获取是否启用快速模式\n- fast_mode = arguments.get(\"fast_mode\", FAST_MODE)\n- \n results = []\n \n # 添加初始状态提示\n results.append(types.TextContent(\n@@ -81,27 +47,13 @@\n type=\"text\",\n text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n )]\n \n- # 创建临时目录用于存储图片\n- temp_dir = tempfile.mkdtemp()\n- image_path = os.path.join(temp_dir, \"images\")\n- os.makedirs(image_path, exist_ok=True)\n- \n- # 设置处理开始时间\n- start_time = time.time()\n- \n try:\n- # 确保图像保存目录存在\n- if enable_image_recognition and not os.path.exists(IMAGE_SAVE_DIR):\n- try:\n- os.makedirs(IMAGE_SAVE_DIR, exist_ok=True)\n- except Exception as e:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 无法创建图像保存目录 {IMAGE_SAVE_DIR}: {str(e)}\\n图像识别功能可能不可用。\"\n- ))\n- enable_image_recognition = False\n+ # 创建临时目录用于存储图片\n+ temp_dir = tempfile.mkdtemp()\n+ image_path = os.path.join(temp_dir, \"images\")\n+ os.makedirs(image_path, exist_ok=True)\n \n # 添加文件大小信息\n file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n results.append(types.TextContent(\n@@ -114,42 +66,20 @@\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 文件较大 ({file_size_mb:.2f} MB),处理可能需要较长时间。\"\n ))\n- # 对于大文件,自动启用快速模式\n- if file_size_mb > 20 and not fast_mode:\n- fast_mode = True\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"已自动启用快速模式以提高处理速度。\"\n- ))\n \n- # 如果启用了快速模式,减少处理量\n- if fast_mode:\n- max_pages = 10 # 快速模式下最多处理10页\n- max_images = min(max_images, 3) # 快速模式下最多处理3张图像\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"快速模式已启用: 最多处理{max_pages}页和{max_images}张图像。\"\n- ))\n- else:\n- max_pages = 30 # 正常模式下最多处理30页\n- \n # 使用PymuPDF4llm提取PDF内容(包括文本和图像)\n try:\n # 获取PDF页数\n with open(file_path, 'rb') as file:\n reader = PyPDF2.PdfReader(file)\n num_pages = len(reader.pages)\n \n # 限制处理的页数\n- max_pages = min(num_pages, max_pages)\n+ max_pages = min(num_pages, 30)\n pages_to_process = list(range(max_pages))\n \n- # 检查是否超时\n- if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n- raise TimeoutError(f\"PDF处理超时,已经用时 {time.time() - start_time:.1f} 秒\")\n- \n # 使用PymuPDF4llm提取内容\n md_content = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n@@ -178,90 +108,30 @@\n for file in files:\n if file.lower().endswith(('.jpg', '.jpeg', '.png')):\n image_files.append(os.path.join(root, file))\n \n- # 限制处理的图像数量\n- if len(image_files) > max_images:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 提取的图像数量 ({len(image_files)}) 超过了最大处理限制 ({max_images}),仅处理前 {max_images} 张图像。\"\n- ))\n- image_files = image_files[:max_images]\n- \n # 添加图像信息\n if image_files:\n image_info = \"\\n## 提取的图像信息\\n\\n\"\n image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n \n- # 如果启用了图像识别,添加提示\n- if enable_image_recognition:\n- image_info += \"图像识别功能已启用,将尝试识别图像内容。\\n\\n\"\n- \n- recognition_success_count = 0\n- recognition_fail_count = 0\n- \n for i, img_file in enumerate(image_files):\n- # 检查是否超时\n- if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n- image_info += f\"\\n**警告: 处理超时,剩余 {len(image_files) - i} 张图像未处理**\\n\"\n- break\n- \n try:\n with Image.open(img_file) as img:\n width, height = img.size\n format_name = img.format\n \n image_info += f\"### 图像 {i+1}\\n\\n\"\n image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_info += f\"- 格式: {format_name}\\n\"\n- \n- # 如果启用了图像识别,保存图像并进行识别\n- if enable_image_recognition:\n- try:\n- # 生成唯一文件名\n- unique_filename = f\"{uuid.uuid4().hex}_{os.path.basename(img_file)}\"\n- saved_img_path = os.path.join(IMAGE_SAVE_DIR, unique_filename)\n- \n- # 复制图像到保存目录\n- shutil.copy2(img_file, saved_img_path)\n- \n- # 进行图像识别\n- recognition_result = recognize_image(saved_img_path)\n- \n- if recognition_result:\n- image_info += f\"\\n**图像识别结果**:\\n\\n{recognition_result}\\n\"\n- recognition_success_count += 1\n- else:\n- image_info += \"\\n**图像识别失败**\\n\"\n- image_info += \"可能原因:API请求超时、图像格式不支持或识别服务暂时不可用。\\n\"\n- recognition_fail_count += 1\n- except Exception as recog_error:\n- image_info += f\"\\n**图像识别过程中出错**: {str(recog_error)}\\n\"\n- recognition_fail_count += 1\n- \n- image_info += \"\\n---\\n\\n\"\n+ image_info += f\"- 格式: {format_name}\\n\\n\"\n+ image_info += \"---\\n\\n\"\n except Exception as e:\n image_info += f\"### 图像 {i+1}\\n\\n\"\n image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n image_info += \"---\\n\\n\"\n \n- # 添加图像识别统计信息\n- if enable_image_recognition:\n- image_info += f\"\\n### 图像识别统计\\n\\n\"\n- image_info += f\"- 总图像数: {len(image_files)}\\n\"\n- image_info += f\"- 成功识别: {recognition_success_count}\\n\"\n- image_info += f\"- 识别失败: {recognition_fail_count}\\n\"\n- \n- if recognition_fail_count > 0:\n- image_info += \"\\n如果图像识别失败,可能是由于以下原因:\\n\"\n- image_info += \"1. API请求超时 - 服务器响应时间过长\\n\"\n- image_info += \"2. 图像格式不支持 - 某些特殊格式可能无法被识别\\n\"\n- image_info += \"3. 识别服务暂时不可用 - API服务可能暂时中断\\n\"\n- image_info += \"4. 图像质量问题 - 图像分辨率过低或内容不清晰\\n\"\n- image_info += \"\\n您可以尝试重新运行工具,或者调整超时设置后再试。\\n\"\n- \n results.append(types.TextContent(\n type=\"text\",\n text=image_info\n ))\n@@ -270,15 +140,8 @@\n type=\"text\",\n text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n ))\n \n- except TimeoutError as timeout_error:\n- # 如果处理超时,添加错误信息但继续返回已处理的结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: PDF处理超时: {str(timeout_error)}\\n\"\n- f\"已返回部分处理结果。您可以尝试使用快速模式或减少处理的页数和图像数量。\"\n- ))\n except Exception as extract_error:\n # 如果PymuPDF4llm提取失败,回退到原来的方法\n results.append(types.TextContent(\n type=\"text\",\n@@ -301,21 +164,16 @@\n if value and str(value).strip():\n text_content += f\"- {key}: {value}\\n\"\n \n # 提取文本 - 限制页数以提高性能\n- max_pages_to_process = min(num_pages, max_pages) # 限制处理的最大页数\n+ max_pages_to_process = min(num_pages, 30) # 限制处理的最大页数\n if max_pages_to_process < num_pages:\n text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages_to_process} 页内容。\\n\"\n \n text_content += \"\\n## 内容摘要\\n\\n\"\n \n # 逐页提取文本\n for page_num in range(max_pages_to_process):\n- # 检查是否超时\n- if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n- text_content += f\"\\n> 警告: 处理超时,剩余 {max_pages_to_process - page_num} 页未处理。\\n\"\n- break\n- \n # 添加进度提示\n if page_num % 5 == 0 and page_num > 0:\n progress_msg = f\"已处理 {page_num}/{max_pages_to_process} 页...\"\n results.append(types.TextContent(type=\"text\", text=progress_msg))\n@@ -328,154 +186,70 @@\n \n # 添加文本内容到结果\n results.append(types.TextContent(type=\"text\", text=text_content))\n \n- # 如果已经超时,跳过图像提取\n- if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n+ # 尝试使用pdf2image提取图片\n+ try:\n+ # 限制处理的页数以提高性能\n+ max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n+ \n results.append(types.TextContent(\n type=\"text\",\n- text=\"警告: 处理已超时,跳过图像提取步骤。\"\n+ text=f\"正在提取图片,这可能需要一些时间...\"\n ))\n- else:\n- # 尝试使用pdf2image提取图片\n- try:\n- # 限制处理的页数以提高性能\n- max_img_pages = min(num_pages, 5 if fast_mode else 10) # 限制处理图片的最大页数\n+ \n+ # 转换PDF页面为图片并保存\n+ images = convert_from_path(\n+ file_path, \n+ dpi=150, \n+ fmt=\"jpg\", \n+ first_page=1, \n+ last_page=max_img_pages,\n+ thread_count=2 # 使用多线程加速\n+ )\n+ \n+ # 处理每个页面图片\n+ image_markdown = \"\\n## 图片内容\\n\\n\"\n+ for i, img in enumerate(images):\n+ # 保存图片到临时目录\n+ img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n+ img.save(img_path, \"JPEG\", quality=80)\n \n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"正在提取图片,这可能需要一些时间...\"\n- ))\n+ # 获取图片尺寸\n+ width, height = img.size\n \n- # 转换PDF页面为图片并保存\n- images = convert_from_path(\n- file_path, \n- dpi=150, \n- fmt=\"jpg\", \n- first_page=1, \n- last_page=max_img_pages,\n- thread_count=2 # 使用多线程加速\n- )\n+ # 添加图片信息到Markdown\n+ image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n+ image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_markdown += f\"- 格式: JPEG\\n\"\n+ image_markdown += f\"- DPI: 150\\n\\n\"\n \n- # 限制处理的图像数量\n- if len(images) > max_images:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 提取的图像数量 ({len(images)}) 超过了最大处理限制 ({max_images}),仅处理前 {max_images} 张图像。\"\n- ))\n- images = images[:max_images]\n- \n- # 处理每个页面图片\n- image_markdown = \"\\n## 图片内容\\n\\n\"\n- \n- # 如果启用了图像识别,添加提示\n- if enable_image_recognition:\n- image_markdown += \"图像识别功能已启用,将尝试识别图像内容。\\n\\n\"\n- \n- recognition_success_count = 0\n- recognition_fail_count = 0\n- \n- for i, img in enumerate(images):\n- # 检查是否超时\n- if time.time() - start_time > PDF_PROCESSING_TIMEOUT:\n- image_markdown += f\"\\n**警告: 处理超时,剩余 {len(images) - i} 张图像未处理**\\n\"\n- break\n- \n- # 保存图片到临时目录\n- img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n- img.save(img_path, \"JPEG\", quality=80)\n- \n- # 获取图片尺寸\n- width, height = img.size\n- \n- # 添加图片信息到Markdown\n- image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n- image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 150\\n\"\n- \n- # 如果启用了图像识别,保存图像并进行识别\n- if enable_image_recognition:\n- try:\n- # 生成唯一文件名\n- unique_filename = f\"{uuid.uuid4().hex}_page_{i+1}.jpg\"\n- saved_img_path = os.path.join(IMAGE_SAVE_DIR, unique_filename)\n- \n- # 复制图像到保存目录\n- shutil.copy2(img_path, saved_img_path)\n- \n- # 进行图像识别\n- recognition_result = recognize_image(saved_img_path)\n- \n- if recognition_result:\n- image_markdown += f\"\\n**图像识别结果**:\\n\\n{recognition_result}\\n\"\n- recognition_success_count += 1\n- else:\n- image_markdown += \"\\n**图像识别失败**\\n\"\n- image_markdown += \"可能原因:API请求超时、图像格式不支持或识别服务暂时不可用。\\n\"\n- recognition_fail_count += 1\n- except Exception as recog_error:\n- image_markdown += f\"\\n**图像识别过程中出错**: {str(recog_error)}\\n\"\n- recognition_fail_count += 1\n- \n- # 添加分隔线\n- image_markdown += \"\\n---\\n\\n\"\n- \n- # 添加图像识别统计信息\n- if enable_image_recognition:\n- image_markdown += f\"\\n### 图像识别统计\\n\\n\"\n- image_markdown += f\"- 总图像数: {len(images)}\\n\"\n- image_markdown += f\"- 成功识别: {recognition_success_count}\\n\"\n- image_markdown += f\"- 识别失败: {recognition_fail_count}\\n\"\n- \n- if recognition_fail_count > 0:\n- image_markdown += \"\\n如果图像识别失败,可能是由于以下原因:\\n\"\n- image_markdown += \"1. API请求超时 - 服务器响应时间过长\\n\"\n- image_markdown += \"2. 图像格式不支持 - 某些特殊格式可能无法被识别\\n\"\n- image_markdown += \"3. 识别服务暂时不可用 - API服务可能暂时中断\\n\"\n- image_markdown += \"4. 图像质量问题 - 图像分辨率过低或内容不清晰\\n\"\n- image_markdown += \"\\n您可以尝试重新运行工具,或者调整超时设置后再试。\\n\"\n- \n- # 添加图片信息到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n- except Exception as img_error:\n- # 如果图片提取失败,添加错误信息但继续\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n- f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n- ))\n+ # 添加分隔线\n+ image_markdown += \"---\\n\\n\"\n+ \n+ # 添加图片信息到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_markdown\n+ ))\n+ except Exception as img_error:\n+ # 如果图片提取失败,添加错误信息但继续\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n+ f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n+ ))\n \n # 清理临时目录\n shutil.rmtree(temp_dir)\n \n- # 添加处理完成的提示和处理时间\n- elapsed_time = time.time() - start_time\n+ # 添加处理完成的提示\n results.append(types.TextContent(\n type=\"text\",\n- text=f\"PDF处理完成!总耗时: {elapsed_time:.1f} 秒\"\n+ text=\"PDF处理完成!\"\n ))\n \n return results\n- except asyncio.TimeoutError:\n- # 处理异步超时\n- if 'temp_dir' in locals() and os.path.exists(temp_dir):\n- shutil.rmtree(temp_dir)\n- \n- return [\n- types.TextContent(\n- type=\"text\",\n- text=f\"错误: PDF处理超时,已超过 {PDF_PROCESSING_TIMEOUT} 秒。\\n\"\n- f\"请尝试以下方法:\\n\"\n- f\"1. 使用快速模式处理文件\\n\"\n- f\"2. 减少处理的页数和图像数量\\n\"\n- f\"3. 使用更小的PDF文件\\n\"\n- f\"4. 使用quick_pdf工具进行快速预览\"\n- )\n- ]\n except Exception as e:\n # 确保清理临时目录\n if 'temp_dir' in locals() and os.path.exists(temp_dir):\n shutil.rmtree(temp_dir)\n@@ -490,11 +264,7 @@\n f\"1. 文件格式不兼容\\n\"\n f\"2. 文件已加密或受密码保护\\n\"\n f\"3. 系统缺少必要的依赖(如poppler-utils)\\n\"\n f\"4. 文件太大,处理超时\\n\\n\"\n- f\"建议:\\n\"\n- f\"1. 使用quick_pdf工具进行快速预览\\n\"\n- f\"2. 使用fast_mode=true参数减少处理量\\n\"\n- f\"3. 减小文件大小或分割文件\\n\\n\"\n f\"详细错误信息: {error_details}\"\n )\n ] \n\\ No newline at end of file\n"
},
{
"date": 1741662681764,
"content": "Index: \n===================================================================\n--- \n+++ \n@@ -1,270 +1,196 @@\n+\"\"\"\n+PDF解析工具,用于解析PDF文件内容,支持快速预览和完整解析两种模式\n+\"\"\"\n+\n import os\n import tempfile\n import shutil\n+import fitz # PyMuPDF\n import PyPDF2\n-from pdf2image import convert_from_path\n-from PIL import Image\n import pymupdf4llm\n+import traceback\n+from typing import Dict, List, Any\n import mcp.types as types\n from . import BaseTool, ToolRegistry\n+from PIL import Image\n \n @ToolRegistry.register\n class PdfTool(BaseTool):\n- \"\"\"PDF解析工具,用于解析PDF文件并提取文本和图片\"\"\"\n- name = \"file\"\n- description = \"解析PDF文件并提取文本和图片内容\"\n+ \"\"\"\n+ PDF解析工具,支持两种模式:\n+ 1. 快速预览模式:仅提取文本内容,适用于大型PDF文件\n+ 2. 完整解析模式:提取文本和图片内容,提供更详细的文档分析\n+ \"\"\"\n+ \n+ name = \"pdf\"\n+ description = \"解析PDF文件内容,支持快速预览和完整解析两种模式\"\n input_schema = {\n \"type\": \"object\",\n \"required\": [\"file_path\"],\n \"properties\": {\n \"file_path\": {\n \"type\": \"string\",\n \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n+ },\n+ \"mode\": {\n+ \"type\": \"string\",\n+ \"description\": \"解析模式:'quick'(仅文本)或'full'(文本和图片),默认为'full'\",\n+ \"enum\": [\"quick\", \"full\"],\n+ \"default\": \"full\"\n }\n },\n }\n \n- async def execute(self, arguments: dict) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- \"\"\"解析PDF文件并提取文本和图片\"\"\"\n+ async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n+ \"\"\"\n+ 解析PDF文件\n+ \n+ Args:\n+ arguments: 参数字典,必须包含'file_path'键,可选'mode'键\n+ \n+ Returns:\n+ PDF内容列表\n+ \"\"\"\n if \"file_path\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n- text=\"Error: Missing required argument 'file_path'\"\n+ text=\"错误: 缺少必要参数 'file_path'\"\n )]\n- \n+ \n file_path = arguments[\"file_path\"]\n- results = []\n+ mode = arguments.get(\"mode\", \"full\")\n \n- # 添加初始状态提示\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"开始处理PDF文件,请稍候...\"\n- ))\n- \n # 检查文件是否存在\n if not os.path.exists(file_path):\n return [types.TextContent(\n type=\"text\",\n- text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n+ text=f\"错误: 文件不存在: {file_path}\"\n )]\n+ \n+ # 检查文件扩展名\n+ if not file_path.lower().endswith('.pdf'):\n+ return [types.TextContent(\n+ type=\"text\",\n+ text=f\"错误: 文件不是PDF格式: {file_path}\"\n+ )]\n \n try:\n- # 创建临时目录用于存储图片\n+ if mode == \"quick\":\n+ return await self._quick_preview_pdf(file_path)\n+ else:\n+ return await self._full_parse_pdf(file_path)\n+ except Exception as e:\n+ error_details = traceback.format_exc()\n+ return [types.TextContent(\n+ type=\"text\",\n+ text=f\"错误: 处理PDF文件时发生错误: {str(e)}\\n{error_details}\"\n+ )]\n+ \n+ async def _quick_preview_pdf(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n+ \"\"\"\n+ 快速预览PDF文件,仅提取文本内容\n+ \"\"\"\n+ try:\n+ # 使用PyMuPDF提取文本\n+ doc = fitz.open(file_path)\n+ text_content = []\n+ \n+ # 添加文件信息\n+ text_content.append(f\"文件名: {os.path.basename(file_path)}\")\n+ text_content.append(f\"页数: {doc.page_count}\")\n+ text_content.append(\"---\")\n+ \n+ # 提取每页文本\n+ for page_num in range(doc.page_count):\n+ page = doc[page_num]\n+ text = page.get_text()\n+ if text.strip():\n+ text_content.append(f\"第{page_num + 1}页:\")\n+ text_content.append(text)\n+ text_content.append(\"---\")\n+ \n+ doc.close()\n+ \n+ return [types.TextContent(\n+ type=\"text\",\n+ text=\"\\n\".join(text_content)\n+ )]\n+ \n+ except Exception as e:\n+ error_details = traceback.format_exc()\n+ return [types.TextContent(\n+ type=\"text\",\n+ text=f\"错误: 快速预览PDF时发生错误: {str(e)}\\n{error_details}\"\n+ )]\n+ \n+ async def _full_parse_pdf(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n+ \"\"\"\n+ 完整解析PDF文件,提取文本和图片内容\n+ \"\"\"\n+ results = []\n+ temp_dir = None\n+ \n+ try:\n+ # 创建临时目录存储图片\n temp_dir = tempfile.mkdtemp()\n- image_path = os.path.join(temp_dir, \"images\")\n- os.makedirs(image_path, exist_ok=True)\n \n- # 添加文件大小信息\n- file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n+ # 使用PyMuPDF提取文本和图片\n+ doc = fitz.open(file_path)\n+ \n+ # 添加文件信息\n results.append(types.TextContent(\n type=\"text\",\n- text=f\"文件大小: {file_size_mb:.2f} MB\"\n+ text=f\"文件名: {os.path.basename(file_path)}\\n页数: {doc.page_count}\\n---\"\n ))\n \n- # 对大文件提供警告\n- if file_size_mb > 10:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 文件较大 ({file_size_mb:.2f} MB),处理可能需要较长时间。\"\n- ))\n- \n- # 使用PymuPDF4llm提取PDF内容(包括文本和图像)\n- try:\n- # 获取PDF页数\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n- num_pages = len(reader.pages)\n+ # 处理每一页\n+ for page_num in range(doc.page_count):\n+ page = doc[page_num]\n \n- # 限制处理的页数\n- max_pages = min(num_pages, 30)\n- pages_to_process = list(range(max_pages))\n- \n- # 使用PymuPDF4llm提取内容\n- md_content = pymupdf4llm.to_markdown(\n- doc=file_path,\n- pages=pages_to_process,\n- page_chunks=True,\n- write_images=True,\n- image_path=image_path,\n- image_format=\"jpg\",\n- dpi=150\n- )\n- \n- # 如果处理的页数少于总页数,添加提示\n- if max_pages < num_pages:\n- md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n- else:\n- md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n- \n- # 添加提取的内容到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=md_content\n- ))\n- \n- # 处理提取的图像\n- image_files = []\n- for root, dirs, files in os.walk(image_path):\n- for file in files:\n- if file.lower().endswith(('.jpg', '.jpeg', '.png')):\n- image_files.append(os.path.join(root, file))\n- \n- # 添加图像信息\n- if image_files:\n- image_info = \"\\n## 提取的图像信息\\n\\n\"\n- image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n- \n- for i, img_file in enumerate(image_files):\n- try:\n- with Image.open(img_file) as img:\n- width, height = img.size\n- format_name = img.format\n- \n- image_info += f\"### 图像 {i+1}\\n\\n\"\n- image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_info += f\"- 格式: {format_name}\\n\\n\"\n- image_info += \"---\\n\\n\"\n- except Exception as e:\n- image_info += f\"### 图像 {i+1}\\n\\n\"\n- image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n- image_info += \"---\\n\\n\"\n- \n+ # 提取文本\n+ text = page.get_text()\n+ if text.strip():\n results.append(types.TextContent(\n type=\"text\",\n- text=image_info\n+ text=f\"第{page_num + 1}页:\\n{text}\\n---\"\n ))\n- else:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n- ))\n- \n- except Exception as extract_error:\n- # 如果PymuPDF4llm提取失败,回退到原来的方法\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用备用方法...\"\n- ))\n \n- # 使用PyPDF2提取文本\n- text_content = \"\"\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n- num_pages = len(reader.pages)\n- \n- # 添加PDF元数据\n- text_content += f\"# PDF文档信息\\n\\n\"\n\\ No newline at end of file\n- text_content += f\"- 页数: {num_pages}\\n\"\n- if reader.metadata:\n- for key, value in reader.metadata.items():\n- if key.startswith('/'):\n- key = key[1:] # 移除前导斜杠\n- if value and str(value).strip():\n- text_content += f\"- {key}: {value}\\n\"\n- \n- # 提取文本 - 限制页数以提高性能\n- max_pages_to_process = min(num_pages, 30) # 限制处理的最大页数\n- if max_pages_to_process < num_pages:\n- text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages_to_process} 页内容。\\n\"\n- \n- text_content += \"\\n## 内容摘要\\n\\n\"\n- \n- # 逐页提取文本\n- for page_num in range(max_pages_to_process):\n- # 添加进度提示\n- if page_num % 5 == 0 and page_num > 0:\n- progress_msg = f\"已处理 {page_num}/{max_pages_to_process} 页...\"\n- results.append(types.TextContent(type=\"text\", text=progress_msg))\n+ # 提取图片\n+ image_list = page.get_images()\n+ for img_idx, img_info in enumerate(image_list):\n+ try:\n+ xref = img_info[0]\n+ base_image = doc.extract_image(xref)\n+ image_bytes = base_image[\"image\"]\n \n- page = reader.pages[page_num]\n- page_text = page.extract_text()\n- if page_text:\n- text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n- text_content += page_text + \"\\n\"\n- \n- # 添加文本内容到结果\n- results.append(types.TextContent(type=\"text\", text=text_content))\n- \n- # 尝试使用pdf2image提取图片\n- try:\n- # 限制处理的页数以提高性能\n- max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"正在提取图片,这可能需要一些时间...\"\n- ))\n- \n- # 转换PDF页面为图片并保存\n- images = convert_from_path(\n- file_path, \n- dpi=150, \n- fmt=\"jpg\", \n- first_page=1, \n- last_page=max_img_pages,\n- thread_count=2 # 使用多线程加速\n- )\n- \n- # 处理每个页面图片\n- image_markdown = \"\\n## 图片内容\\n\\n\"\n- for i, img in enumerate(images):\n- # 保存图片到临时目录\n- img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n- img.save(img_path, \"JPEG\", quality=80)\n+ # 保存图片到临时文件\n+ img_temp_path = os.path.join(temp_dir, f\"page_{page_num + 1}_img_{img_idx + 1}.png\")\n+ with open(img_temp_path, \"wb\") as img_file:\n+ img_file.write(image_bytes)\n \n- # 获取图片尺寸\n- width, height = img.size\n- \n- # 添加图片信息到Markdown\n- image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n- image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 150\\n\\n\"\n- \n- # 添加分隔线\n- image_markdown += \"---\\n\\n\"\n- \n- # 添加图片信息到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n- except Exception as img_error:\n- # 如果图片提取失败,添加错误信息但继续\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n- f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n- ))\n+ # 添加图片到结果\n+ results.append(types.ImageContent(\n+ type=\"image\",\n+ title=f\"第{page_num + 1}页 图片{img_idx + 1}\",\n+ image_data=image_bytes\n+ ))\n+ except Exception as img_error:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 提取第{page_num + 1}页图片{img_idx + 1}时出错: {str(img_error)}\"\n+ ))\n \n- # 清理临时目录\n- shutil.rmtree(temp_dir)\n+ doc.close()\n+ return results\n \n- # 添加处理完成的提示\n- results.append(types.TextContent(\n+ except Exception as e:\n+ error_details = traceback.format_exc()\n+ return [types.TextContent(\n type=\"text\",\n- text=\"PDF处理完成!\"\n- ))\n+ text=f\"错误: 完整解析PDF时发生错误: {str(e)}\\n{error_details}\"\n+ )]\n \n- return results\n- except Exception as e:\n- # 确保清理临时目录\n- if 'temp_dir' in locals() and os.path.exists(temp_dir):\n- shutil.rmtree(temp_dir)\n- \n- import traceback\n- error_details = traceback.format_exc()\n- return [\n- types.TextContent(\n- type=\"text\",\n- text=f\"错误: 解析PDF文件失败: {str(e)}\\n\"\n- f\"可能的原因:\\n\"\n- f\"1. 文件格式不兼容\\n\"\n- f\"2. 文件已加密或受密码保护\\n\"\n- f\"3. 系统缺少必要的依赖(如poppler-utils)\\n\"\n- f\"4. 文件太大,处理超时\\n\\n\"\n- f\"详细错误信息: {error_details}\"\n- )\n- ] \n+ finally:\n+ # 清理临时目录\n+ if temp_dir and os.path.exists(temp_dir):\n+ shutil.rmtree(temp_dir) \n\\ No newline at end of file\n"
},
{
"date": 1741664919653,
"content": "Index: \n===================================================================\n--- \n+++ \n@@ -8,12 +8,13 @@\n import fitz # PyMuPDF\n import PyPDF2\n import pymupdf4llm\n import traceback\n-from typing import Dict, List, Any\n+from typing import Dict, List, Any, Tuple\n import mcp.types as types\n from . import BaseTool, ToolRegistry\n from PIL import Image\n+import io\n \n @ToolRegistry.register\n class PdfTool(BaseTool):\n \"\"\"\n@@ -39,8 +40,14 @@\n \"default\": \"full\"\n }\n },\n }\n+\n+ # 图片处理配置\n+ MAX_IMAGE_SIZE = (1920, 1080) # 最大图片尺寸\n+ MIN_IMAGE_SIZE = (50, 50) # 最小图片尺寸\n+ QUALITY = 85 # JPEG压缩质量\n+ MAX_IMAGE_COUNT = 50 # 每个PDF最大图片数\n \n async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 解析PDF文件\n@@ -121,15 +128,53 @@\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: 快速预览PDF时发生错误: {str(e)}\\n{error_details}\"\n )]\n+\n+ def _process_image(self, image_bytes: bytes) -> Tuple[bytes, str]:\n+ \"\"\"\n+ 处理图片:调整大小、优化质量、转换格式\n+ \n+ Args:\n+ image_bytes: 原始图片字节数据\n+ \n+ Returns:\n+ 处理后的图片字节数据和MIME类型\n+ \"\"\"\n+ try:\n+ # 从字节数据创建图片对象\n+ image = Image.open(io.BytesIO(image_bytes))\n+ \n+ # 检查图片尺寸是否在允许范围内\n+ width, height = image.size\n+ if width < self.MIN_IMAGE_SIZE[0] or height < self.MIN_IMAGE_SIZE[1]:\n+ raise ValueError(f\"图片尺寸过小: {width}x{height}\")\n+ \n+ # 调整图片大小(如果需要)\n+ if width > self.MAX_IMAGE_SIZE[0] or height > self.MAX_IMAGE_SIZE[1]:\n+ image.thumbnail(self.MAX_IMAGE_SIZE, Image.Resampling.LANCZOS)\n+ \n+ # 转换为RGB模式(如果需要)\n+ if image.mode in ('RGBA', 'P'):\n+ image = image.convert('RGB')\n+ \n+ # 保存为JPEG格式(优化质量)\n+ output = io.BytesIO()\n+ image.save(output, format='JPEG', quality=self.QUALITY, optimize=True)\n+ processed_bytes = output.getvalue()\n+ \n+ return processed_bytes, 'image/jpeg'\n+ \n+ except Exception as e:\n+ raise ValueError(f\"图片处理失败: {str(e)}\")\n \n async def _full_parse_pdf(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 完整解析PDF文件,提取文本和图片内容\n \"\"\"\n results = []\n temp_dir = None\n+ image_count = 0\n \n try:\n # 创建临时目录存储图片\n temp_dir = tempfile.mkdtemp()\n@@ -157,24 +202,43 @@\n \n # 提取图片\n image_list = page.get_images()\n for img_idx, img_info in enumerate(image_list):\n+ # 检查是否超过最大图片数量限制\n+ if image_count >= self.MAX_IMAGE_COUNT:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 已达到最大图片数量限制({self.MAX_IMAGE_COUNT}),跳过剩余图片\"\n+ ))\n+ break\n+ \n try:\n xref = img_info[0]\n base_image = doc.extract_image(xref)\n image_bytes = base_image[\"image\"]\n \n- # 保存图片到临时文件\n- img_temp_path = os.path.join(temp_dir, f\"page_{page_num + 1}_img_{img_idx + 1}.png\")\n- with open(img_temp_path, \"wb\") as img_file:\n- img_file.write(image_bytes)\n- \n- # 添加图片到结果\n- results.append(types.ImageContent(\n- type=\"image\",\n- title=f\"第{page_num + 1}页 图片{img_idx + 1}\",\n- image_data=image_bytes\n- ))\n+ # 处理图片\n+ try:\n+ processed_image, mime_type = self._process_image(image_bytes)\n+ \n+ # 添加图片到结果\n+ results.append(types.ImageContent(\n+ type=\"image\",\n+ title=f\"第{page_num + 1}页 图片{img_idx + 1}\",\n+ image_data=processed_image,\n+ mime_type=mime_type\n+ ))\n+ \n+ image_count += 1\n+ \n+ except ValueError as ve:\n+ # 图片处理失败,记录警告但继续处理\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 第{page_num + 1}页图片{img_idx + 1}处理失败: {str(ve)}\"\n+ ))\n+ continue\n+ \n except Exception as img_error:\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 提取第{page_num + 1}页图片{img_idx + 1}时出错: {str(img_error)}\"\n"
},
{
"date": 1741664975374,
"content": "Index: \n===================================================================\n--- \n+++ \n@@ -8,13 +8,12 @@\n import fitz # PyMuPDF\n import PyPDF2\n import pymupdf4llm\n import traceback\n-from typing import Dict, List, Any, Tuple\n+from typing import Dict, List, Any\n import mcp.types as types\n from . import BaseTool, ToolRegistry\n from PIL import Image\n-import io\n \n @ToolRegistry.register\n class PdfTool(BaseTool):\n \"\"\"\n@@ -40,14 +39,8 @@\n \"default\": \"full\"\n }\n },\n }\n-\n- # 图片处理配置\n- MAX_IMAGE_SIZE = (1920, 1080) # 最大图片尺寸\n- MIN_IMAGE_SIZE = (50, 50) # 最小图片尺寸\n- QUALITY = 85 # JPEG压缩质量\n- MAX_IMAGE_COUNT = 50 # 每个PDF最大图片数\n \n async def execute(self, arguments: Dict[str, Any]) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 解析PDF文件\n@@ -128,53 +121,15 @@\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: 快速预览PDF时发生错误: {str(e)}\\n{error_details}\"\n )]\n-\n- def _process_image(self, image_bytes: bytes) -> Tuple[bytes, str]:\n- \"\"\"\n- 处理图片:调整大小、优化质量、转换格式\n- \n- Args:\n- image_bytes: 原始图片字节数据\n- \n- Returns:\n- 处理后的图片字节数据和MIME类型\n- \"\"\"\n- try:\n- # 从字节数据创建图片对象\n- image = Image.open(io.BytesIO(image_bytes))\n- \n- # 检查图片尺寸是否在允许范围内\n- width, height = image.size\n- if width < self.MIN_IMAGE_SIZE[0] or height < self.MIN_IMAGE_SIZE[1]:\n- raise ValueError(f\"图片尺寸过小: {width}x{height}\")\n- \n- # 调整图片大小(如果需要)\n- if width > self.MAX_IMAGE_SIZE[0] or height > self.MAX_IMAGE_SIZE[1]:\n- image.thumbnail(self.MAX_IMAGE_SIZE, Image.Resampling.LANCZOS)\n- \n- # 转换为RGB模式(如果需要)\n- if image.mode in ('RGBA', 'P'):\n- image = image.convert('RGB')\n- \n- # 保存为JPEG格式(优化质量)\n- output = io.BytesIO()\n- image.save(output, format='JPEG', quality=self.QUALITY, optimize=True)\n- processed_bytes = output.getvalue()\n- \n- return processed_bytes, 'image/jpeg'\n- \n- except Exception as e:\n- raise ValueError(f\"图片处理失败: {str(e)}\")\n \n async def _full_parse_pdf(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 完整解析PDF文件,提取文本和图片内容\n \"\"\"\n results = []\n temp_dir = None\n- image_count = 0\n \n try:\n # 创建临时目录存储图片\n temp_dir = tempfile.mkdtemp()\n@@ -202,43 +157,24 @@\n \n # 提取图片\n image_list = page.get_images()\n for img_idx, img_info in enumerate(image_list):\n- # 检查是否超过最大图片数量限制\n- if image_count >= self.MAX_IMAGE_COUNT:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 已达到最大图片数量限制({self.MAX_IMAGE_COUNT}),跳过剩余图片\"\n- ))\n- break\n- \n try:\n xref = img_info[0]\n base_image = doc.extract_image(xref)\n image_bytes = base_image[\"image\"]\n \n- # 处理图片\n- try:\n- processed_image, mime_type = self._process_image(image_bytes)\n- \n- # 添加图片到结果\n- results.append(types.ImageContent(\n- type=\"image\",\n- title=f\"第{page_num + 1}页 图片{img_idx + 1}\",\n- image_data=processed_image,\n- mime_type=mime_type\n- ))\n- \n- image_count += 1\n- \n- except ValueError as ve:\n- # 图片处理失败,记录警告但继续处理\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 第{page_num + 1}页图片{img_idx + 1}处理失败: {str(ve)}\"\n- ))\n- continue\n- \n+ # 保存图片到临时文件\n+ img_temp_path = os.path.join(temp_dir, f\"page_{page_num + 1}_img_{img_idx + 1}.png\")\n+ with open(img_temp_path, \"wb\") as img_file:\n+ img_file.write(image_bytes)\n+ \n+ # 添加图片到结果\n+ results.append(types.ImageContent(\n+ type=\"image\",\n+ title=f\"第{page_num + 1}页 图片{img_idx + 1}\",\n+ image_data=image_bytes\n+ ))\n except Exception as img_error:\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 提取第{page_num + 1}页图片{img_idx + 1}时出错: {str(img_error)}\"\n"
},
{
"date": 1741665090940,
"content": "Index: \n===================================================================\n--- \n+++ \n@@ -12,8 +12,10 @@\n from typing import Dict, List, Any\n import mcp.types as types\n from . import BaseTool, ToolRegistry\n from PIL import Image\n+import io\n+import pytesseract\n \n @ToolRegistry.register\n class PdfTool(BaseTool):\n \"\"\"\n@@ -122,8 +124,35 @@\n type=\"text\",\n text=f\"错误: 快速预览PDF时发生错误: {str(e)}\\n{error_details}\"\n )]\n \n+ async def _analyze_image(self, image_bytes: bytes, lang: str = 'chi_sim+eng') -> str:\n+ \"\"\"\n+ 分析图片内容,识别文字和场景\n+\n+ Args:\n+ image_bytes: 图片的二进制数据\n+ lang: OCR语言,默认中文简体+英文\n+\n+ Returns:\n+ str: 图片分析结果\n+ \"\"\"\n+ try:\n+ # 将二进制数据转换为PIL Image对象\n+ image = Image.open(io.BytesIO(image_bytes))\n+ \n+ # 进行OCR文字识别\n+ text = pytesseract.image_to_string(image, lang=lang)\n+ \n+ # 如果识别出文字,返回结果\n+ if text.strip():\n+ return f\"图片中识别出的文字:\\n{text.strip()}\"\n+ else:\n+ return \"未在图片中识别出文字\"\n+ \n+ except Exception as e:\n+ return f\"图片分析失败: {str(e)}\"\n+\n async def _full_parse_pdf(self, file_path: str) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"\n 完整解析PDF文件,提取文本和图片内容\n \"\"\"\n@@ -167,14 +196,21 @@\n img_temp_path = os.path.join(temp_dir, f\"page_{page_num + 1}_img_{img_idx + 1}.png\")\n with open(img_temp_path, \"wb\") as img_file:\n img_file.write(image_bytes)\n \n- # 添加图片到结果\n+ # 分析图片内容\n+ image_analysis = await self._analyze_image(image_bytes)\n+ \n+ # 添加图片和分析结果到结果列表\n results.append(types.ImageContent(\n type=\"image\",\n title=f\"第{page_num + 1}页 图片{img_idx + 1}\",\n image_data=image_bytes\n ))\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"第{page_num + 1}页 图片{img_idx + 1}分析结果:\\n{image_analysis}\\n---\"\n+ ))\n except Exception as img_error:\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 提取第{page_num + 1}页图片{img_idx + 1}时出错: {str(img_error)}\"\n"
}
],
"date": 1741521106081,
"name": "Commit-0",
"content": "import os\nimport tempfile\nimport shutil\nimport uuid\nimport PyPDF2\nfrom pdf2image import convert_from_path\nfrom PIL import Image\nimport pymupdf4llm\nimport mcp.types as types\nfrom . import BaseTool, ToolRegistry\nfrom .image_recognition import recognize_image\n\n# 图像保存目录,与Docker挂载卷对应\nIMAGE_SAVE_DIR = os.environ.get('IMAGE_SAVE_DIR', '/img')\n# 是否启用图像识别\nENABLE_IMAGE_RECOGNITION = os.environ.get('ENABLE_IMAGE_RECOGNITION', 'true').lower() == 'true'\n\n@ToolRegistry.register\nclass PdfTool(BaseTool):\n \"\"\"PDF解析工具,用于解析PDF文件并提取文本和图片\"\"\"\n name = \"file\"\n description = \"解析PDF文件并提取文本和图片内容\"\n input_schema = {\n \"type\": \"object\",\n \"required\": [\"file_path\"],\n \"properties\": {\n \"file_path\": {\n \"type\": \"string\",\n \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n },\n \"enable_image_recognition\": {\n \"type\": \"boolean\",\n \"description\": \"是否启用图像识别功能,默认为环境变量设置或true\",\n }\n },\n }\n \n async def execute(self, arguments: dict) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"解析PDF文件并提取文本和图片\"\"\"\n if \"file_path\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n text=\"Error: Missing required argument 'file_path'\"\n )]\n \n file_path = arguments[\"file_path\"]\n # 获取是否启用图像识别的参数,默认使用环境变量设置\n enable_image_recognition = arguments.get(\"enable_image_recognition\", ENABLE_IMAGE_RECOGNITION)\n \n results = []\n \n # 添加初始状态提示\n results.append(types.TextContent(\n type=\"text\",\n text=\"开始处理PDF文件,请稍候...\"\n ))\n \n # 检查文件是否存在\n if not os.path.exists(file_path):\n return [types.TextContent(\n type=\"text\",\n text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n )]\n \n try:\n # 创建临时目录用于存储图片\n temp_dir = tempfile.mkdtemp()\n image_path = os.path.join(temp_dir, \"images\")\n os.makedirs(image_path, exist_ok=True)\n \n # 确保图像保存目录存在\n if enable_image_recognition and not os.path.exists(IMAGE_SAVE_DIR):\n try:\n os.makedirs(IMAGE_SAVE_DIR, exist_ok=True)\n except Exception as e:\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 无法创建图像保存目录 {IMAGE_SAVE_DIR}: {str(e)}\\n图像识别功能可能不可用。\"\n ))\n enable_image_recognition = False\n \n # 添加文件大小信息\n file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n results.append(types.TextContent(\n type=\"text\",\n text=f\"文件大小: {file_size_mb:.2f} MB\"\n ))\n \n # 对大文件提供警告\n if file_size_mb > 10:\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 文件较大 ({file_size_mb:.2f} MB),处理可能需要较长时间。\"\n ))\n \n # 使用PymuPDF4llm提取PDF内容(包括文本和图像)\n try:\n # 获取PDF页数\n with open(file_path, 'rb') as file:\n reader = PyPDF2.PdfReader(file)\n num_pages = len(reader.pages)\n \n # 限制处理的页数\n max_pages = min(num_pages, 30)\n pages_to_process = list(range(max_pages))\n \n # 使用PymuPDF4llm提取内容\n md_content = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n page_chunks=True,\n write_images=True,\n image_path=image_path,\n image_format=\"jpg\",\n dpi=150\n )\n \n # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n else:\n md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n \n # 添加提取的内容到结果\n results.append(types.TextContent(\n type=\"text\",\n text=md_content\n ))\n \n # 处理提取的图像\n image_files = []\n for root, dirs, files in os.walk(image_path):\n for file in files:\n if file.lower().endswith(('.jpg', '.jpeg', '.png')):\n image_files.append(os.path.join(root, file))\n \n # 添加图像信息\n if image_files:\n image_info = \"\\n## 提取的图像信息\\n\\n\"\n image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n \n # 如果启用了图像识别,添加提示\n if enable_image_recognition:\n image_info += \"图像识别功能已启用,将尝试识别图像内容。\\n\\n\"\n \n for i, img_file in enumerate(image_files):\n try:\n with Image.open(img_file) as img:\n width, height = img.size\n format_name = img.format\n \n image_info += f\"### 图像 {i+1}\\n\\n\"\n image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n image_info += f\"- 格式: {format_name}\\n\"\n \n # 如果启用了图像识别,保存图像并进行识别\n if enable_image_recognition:\n try:\n # 生成唯一文件名\n unique_filename = f\"{uuid.uuid4().hex}_{os.path.basename(img_file)}\"\n saved_img_path = os.path.join(IMAGE_SAVE_DIR, unique_filename)\n \n # 复制图像到保存目录\n shutil.copy2(img_file, saved_img_path)\n \n # 进行图像识别\n recognition_result = recognize_image(saved_img_path)\n \n if recognition_result:\n image_info += f\"\\n**图像识别结果**:\\n\\n{recognition_result}\\n\"\n else:\n image_info += \"\\n**图像识别失败**\\n\"\n except Exception as recog_error:\n image_info += f\"\\n**图像识别过程中出错**: {str(recog_error)}\\n\"\n \n image_info += \"\\n---\\n\\n\"\n except Exception as e:\n image_info += f\"### 图像 {i+1}\\n\\n\"\n image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n image_info += \"---\\n\\n\"\n \n results.append(types.TextContent(\n type=\"text\",\n text=image_info\n ))\n else:\n results.append(types.TextContent(\n type=\"text\",\n text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n ))\n \n except Exception as extract_error:\n # 如果PymuPDF4llm提取失败,回退到原来的方法\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用备用方法...\"\n ))\n \n # 使用PyPDF2提取文本\n text_content = \"\"\n with open(file_path, 'rb') as file:\n reader = PyPDF2.PdfReader(file)\n num_pages = len(reader.pages)\n \n # 添加PDF元数据\n text_content += f\"# PDF文档信息\\n\\n\"\n text_content += f\"- 页数: {num_pages}\\n\"\n if reader.metadata:\n for key, value in reader.metadata.items():\n if key.startswith('/'):\n key = key[1:] # 移除前导斜杠\n if value and str(value).strip():\n text_content += f\"- {key}: {value}\\n\"\n \n # 提取文本 - 限制页数以提高性能\n max_pages_to_process = min(num_pages, 30) # 限制处理的最大页数\n if max_pages_to_process < num_pages:\n text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages_to_process} 页内容。\\n\"\n \n text_content += \"\\n## 内容摘要\\n\\n\"\n \n # 逐页提取文本\n for page_num in range(max_pages_to_process):\n # 添加进度提示\n if page_num % 5 == 0 and page_num > 0:\n progress_msg = f\"已处理 {page_num}/{max_pages_to_process} 页...\"\n results.append(types.TextContent(type=\"text\", text=progress_msg))\n \n page = reader.pages[page_num]\n page_text = page.extract_text()\n if page_text:\n text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n text_content += page_text + \"\\n\"\n \n # 添加文本内容到结果\n results.append(types.TextContent(type=\"text\", text=text_content))\n \n # 尝试使用pdf2image提取图片\n try:\n # 限制处理的页数以提高性能\n max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n \n results.append(types.TextContent(\n type=\"text\",\n text=f\"正在提取图片,这可能需要一些时间...\"\n ))\n \n # 转换PDF页面为图片并保存\n images = convert_from_path(\n file_path, \n dpi=150, \n fmt=\"jpg\", \n first_page=1, \n last_page=max_img_pages,\n thread_count=2 # 使用多线程加速\n )\n \n # 处理每个页面图片\n image_markdown = \"\\n## 图片内容\\n\\n\"\n \n # 如果启用了图像识别,添加提示\n if enable_image_recognition:\n image_markdown += \"图像识别功能已启用,将尝试识别图像内容。\\n\\n\"\n \n for i, img in enumerate(images):\n # 保存图片到临时目录\n img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n img.save(img_path, \"JPEG\", quality=80)\n \n # 获取图片尺寸\n width, height = img.size\n \n # 添加图片信息到Markdown\n image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n image_markdown += f\"- 格式: JPEG\\n\"\n image_markdown += f\"- DPI: 150\\n\"\n \n # 如果启用了图像识别,保存图像并进行识别\n if enable_image_recognition:\n try:\n # 生成唯一文件名\n unique_filename = f\"{uuid.uuid4().hex}_page_{i+1}.jpg\"\n saved_img_path = os.path.join(IMAGE_SAVE_DIR, unique_filename)\n \n # 复制图像到保存目录\n shutil.copy2(img_path, saved_img_path)\n \n # 进行图像识别\n recognition_result = recognize_image(saved_img_path)\n \n if recognition_result:\n image_markdown += f\"\\n**图像识别结果**:\\n\\n{recognition_result}\\n\"\n else:\n image_markdown += \"\\n**图像识别失败**\\n\"\n except Exception as recog_error:\n image_markdown += f\"\\n**图像识别过程中出错**: {str(recog_error)}\\n\"\n \n # 添加分隔线\n image_markdown += \"\\n---\\n\\n\"\n \n # 添加图片信息到结果\n results.append(types.TextContent(\n type=\"text\",\n text=image_markdown\n ))\n except Exception as img_error:\n # 如果图片提取失败,添加错误信息但继续\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n ))\n \n # 清理临时目录\n shutil.rmtree(temp_dir)\n \n # 添加处理完成的提示\n results.append(types.TextContent(\n type=\"text\",\n text=\"PDF处理完成!\"\n ))\n \n return results\n except Exception as e:\n # 确保清理临时目录\n if 'temp_dir' in locals() and os.path.exists(temp_dir):\n shutil.rmtree(temp_dir)\n \n import traceback\n error_details = traceback.format_exc()\n return [\n types.TextContent(\n type=\"text\",\n text=f\"错误: 解析PDF文件失败: {str(e)}\\n\"\n f\"可能的原因:\\n\"\n f\"1. 文件格式不兼容\\n\"\n f\"2. 文件已加密或受密码保护\\n\"\n f\"3. 系统缺少必要的依赖(如poppler-utils)\\n\"\n f\"4. 文件太大,处理超时\\n\\n\"\n f\"详细错误信息: {error_details}\"\n )\n ] "
}
]
}