MCP Development Framework

{ "sourceFile": "mcp_simple_tool/tools/utils/pdf_helpers.py", "activeCommit": 0, "commits": [ { "activePatchIndex": 1, "patches": [ { "date": 1741332207363, "content": "Index: \n===================================================================\n--- \n+++ \n" }, { "date": 1741332589773, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -1,101 +1,1 @@\n-\"\"\"\n-PDF处理相关的辅助函数\n-\"\"\"\n-\n-import os\n-import tempfile\n-import fitz # PyMuPDF\n-from typing import List, Dict, Any\n-from PIL import Image\n-\n-\n-async def extract_images_from_pdf(file_path: str, output_dir: str) -> List[Dict[str, Any]]:\n- \"\"\"\n- 使用PyMuPDF (fitz) 从PDF中提取图片,这比pdf2image更高效且能提取嵌入图片\n- \n- Args:\n- file_path: PDF文件路径\n- output_dir: 图片输出目录\n- \n- Returns:\n- 提取的图片信息列表\n- \"\"\"\n- image_info = []\n- \n- try:\n- # 打开PDF文件\n- pdf_document = fitz.open(file_path)\n- \n- # 遍历每一页\n- for page_index in range(len(pdf_document)):\n- page = pdf_document[page_index]\n- \n- # 获取页面上的图片\n- image_list = page.get_images(full=True)\n- \n- # 遍历页面上的每个图片\n- for img_index, img in enumerate(image_list):\n- xref = img[0] # 图片的xref号\n- base_image = pdf_document.extract_image(xref)\n- image_bytes = base_image[\"image\"]\n- image_ext = base_image[\"ext\"] # 图片扩展名\n- \n- # 保存图片到文件\n- image_filename = f\"page_{page_index + 1}_img_{img_index + 1}.{image_ext}\"\n- image_path = os.path.join(output_dir, image_filename)\n- \n- with open(image_path, \"wb\") as img_file:\n- img_file.write(image_bytes)\n- \n- # 获取图片信息\n- with Image.open(image_path) as pil_img:\n- width, height = pil_img.size\n- format_name = pil_img.format\n- \n- # 添加图片信息到列表\n- image_info.append({\n- \"filename\": image_filename,\n- \"path\": image_path,\n- \"page\": page_index + 1,\n- \"width\": width,\n- \"height\": height,\n- \"format\": format_name,\n- \"size_bytes\": len(image_bytes)\n- })\n- \n- # 如果没有找到嵌入图片,尝试渲染页面为图片\n- if not image_info:\n- for page_index in range(len(pdf_document)):\n- page = pdf_document[page_index]\n- \n- # 将页面渲染为图片\n- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x缩放以获得更好的质量\n- image_filename = f\"page_{page_index + 1}_rendered.png\"\n- image_path = os.path.join(output_dir, image_filename)\n- \n- # 保存渲染的图片\n- pix.save(image_path)\n- \n- # 获取图片信息\n- with Image.open(image_path) as pil_img:\n- width, height = pil_img.size\n- format_name = pil_img.format\n- \n- # 添加图片信息到列表\n- image_info.append({\n- \"filename\": image_filename,\n- \"path\": image_path,\n- \"page\": page_index + 1,\n- \"width\": width,\n- \"height\": height,\n- \"format\": format_name,\n- \"size_bytes\": os.path.getsize(image_path),\n- \"type\": \"rendered_page\"\n- })\n- \n- pdf_document.close()\n- return image_info\n- \n- except Exception as e:\n- print(f\"提取图片时出错: {str(e)}\")\n- return [] \n\\ No newline at end of file\n+ \n\\ No newline at end of file\n" } ], "date": 1741332207363, "name": "Commit-0", "content": "\"\"\"\nPDF处理相关的辅助函数\n\"\"\"\n\nimport os\nimport tempfile\nimport fitz # PyMuPDF\nfrom typing import List, Dict, Any\nfrom PIL import Image\n\n\nasync def extract_images_from_pdf(file_path: str, output_dir: str) -> List[Dict[str, Any]]:\n \"\"\"\n 使用PyMuPDF (fitz) 从PDF中提取图片,这比pdf2image更高效且能提取嵌入图片\n \n Args:\n file_path: PDF文件路径\n output_dir: 图片输出目录\n \n Returns:\n 提取的图片信息列表\n \"\"\"\n image_info = []\n \n try:\n # 打开PDF文件\n pdf_document = fitz.open(file_path)\n \n # 遍历每一页\n for page_index in range(len(pdf_document)):\n page = pdf_document[page_index]\n \n # 获取页面上的图片\n image_list = page.get_images(full=True)\n \n # 遍历页面上的每个图片\n for img_index, img in enumerate(image_list):\n xref = img[0] # 图片的xref号\n base_image = pdf_document.extract_image(xref)\n image_bytes = base_image[\"image\"]\n image_ext = base_image[\"ext\"] # 图片扩展名\n \n # 保存图片到文件\n image_filename = f\"page_{page_index + 1}_img_{img_index + 1}.{image_ext}\"\n image_path = os.path.join(output_dir, image_filename)\n \n with open(image_path, \"wb\") as img_file:\n img_file.write(image_bytes)\n \n # 获取图片信息\n with Image.open(image_path) as pil_img:\n width, height = pil_img.size\n format_name = pil_img.format\n \n # 添加图片信息到列表\n image_info.append({\n \"filename\": image_filename,\n \"path\": image_path,\n \"page\": page_index + 1,\n \"width\": width,\n \"height\": height,\n \"format\": format_name,\n \"size_bytes\": len(image_bytes)\n })\n \n # 如果没有找到嵌入图片,尝试渲染页面为图片\n if not image_info:\n for page_index in range(len(pdf_document)):\n page = pdf_document[page_index]\n \n # 将页面渲染为图片\n pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x缩放以获得更好的质量\n image_filename = f\"page_{page_index + 1}_rendered.png\"\n image_path = os.path.join(output_dir, image_filename)\n \n # 保存渲染的图片\n pix.save(image_path)\n \n # 获取图片信息\n with Image.open(image_path) as pil_img:\n width, height = pil_img.size\n format_name = pil_img.format\n \n # 添加图片信息到列表\n image_info.append({\n \"filename\": image_filename,\n \"path\": image_path,\n \"page\": page_index + 1,\n \"width\": width,\n \"height\": height,\n \"format\": format_name,\n \"size_bytes\": os.path.getsize(image_path),\n \"type\": \"rendered_page\"\n })\n \n pdf_document.close()\n return image_info\n \n except Exception as e:\n print(f\"提取图片时出错: {str(e)}\")\n return [] " } ] }