MCP Development Framework

  • .lh - per-file local edit-history snapshots (JSON patch records)
  • mcp_simple_tool - the MCP server implementation (server.py)
Local history for `mcp_simple_tool/server.py`, recorded under `.lh`. Each entry below is one saved patch against that file (timestamps are Unix epoch milliseconds); together they trace how the server's website-fetch and PDF-parsing tools evolved.

- 1741245145750: Initial, empty snapshot; no content changes.
- 1741246414992: Reworked `parse_pdf` for large files. It now emits an initial status message and the file size, warns above 10 MB, caps text extraction at 30 pages with a progress message every 5 pages, and caps image extraction at 10 pages. Pages are rendered with `convert_from_path` at DPI 100 using `first_page`/`last_page` and `thread_count=2`, downscaled to at most 1000 px on the longest side with `Image.LANCZOS`, saved as JPEG at quality 70, base64-encoded and returned as `types.ImageContent` with a per-page caption (both paths are sketched below). Error handling gains a dedicated `PyPDF2.errors.PdfReadError` branch for invalid or corrupted files, and the generic handler now lists likely causes (incompatible format, encryption or password protection, missing poppler-utils, timeout on very large files) together with a full `traceback.format_exc()` dump.
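As a rough illustration of the text-extraction half of that patch, here is a minimal sketch using PyPDF2 with the same 30-page cap; the standalone function shape and names are illustrative, not the actual `parse_pdf` signature.

```python
import PyPDF2

MAX_PAGES = 30  # cap from the patch; keeps very large PDFs responsive


def extract_text_summary(file_path: str) -> str:
    """Return PDF metadata plus per-page text, capped at MAX_PAGES pages."""
    with open(file_path, "rb") as fh:
        reader = PyPDF2.PdfReader(fh)
        num_pages = len(reader.pages)

        lines = ["PDF document info:", f"pages: {num_pages}"]
        if reader.metadata:
            for key, value in reader.metadata.items():
                key = key.lstrip("/")  # metadata keys look like "/Title"
                if value and str(value).strip():
                    lines.append(f"{key}: {value}")

        pages_to_read = min(num_pages, MAX_PAGES)
        if pages_to_read < num_pages:
            lines.append(f"note: only the first {pages_to_read} pages are processed")

        for page_num in range(pages_to_read):
            page_text = reader.pages[page_num].extract_text()
            if page_text:
                lines.append(f"--- page {page_num + 1} ---")
                lines.append(page_text)

    return "\n".join(lines)
```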
- 1741246437464: Hardened the SSE transport. `handle_sse` raises the request timeout to 300 s via `request.scope["timeout"]`, the Starlette app gains `CORSMiddleware` allowing all origins, methods and headers, and `uvicorn.run` passes `timeout_keep_alive=300` so long-running PDF jobs do not drop the connection (wiring sketched below).
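The SSE wiring from that patch follows the pattern below. It is assembled from the diff and assumes the `mcp` Python SDK's low-level `Server` and `SseServerTransport`; the module paths and the `request._send` access are taken from the history rather than re-verified here.

```python
from mcp.server.lowlevel import Server
from mcp.server.sse import SseServerTransport
from starlette.applications import Starlette
from starlette.middleware import Middleware
from starlette.middleware.cors import CORSMiddleware
from starlette.routing import Mount, Route
import uvicorn

app = Server("mcp-website-fetcher")  # tool handlers are registered on this instance
sse = SseServerTransport("/messages/")


async def handle_sse(request):
    # Give slow PDF jobs up to 5 minutes before the request times out.
    request.scope["timeout"] = 300
    async with sse.connect_sse(request.scope, request.receive, request._send) as streams:
        await app.run(streams[0], streams[1], app.create_initialization_options())


middleware = [
    Middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]),
]

starlette_app = Starlette(
    debug=True,
    routes=[
        Route("/sse", endpoint=handle_sse),
        Mount("/messages/", app=sse.handle_post_message),
    ],
    middleware=middleware,
)

if __name__ == "__main__":
    # Keep idle connections alive long enough for large files.
    uvicorn.run(starlette_app, host="0.0.0.0", port=8000, timeout_keep_alive=300)
```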
- 1741246474561: Added `quick_preview_pdf`, a text-only preview that skips image processing entirely: it reports the file size, prints the same metadata block, reads up to 50 pages, and returns `traceback.format_exc()` details on failure. It is exposed as a new `quick_pdf` tool (same `file_path` schema as the full parser) in both the `fetch_tool` dispatcher and `list_tools` (registration pattern sketched below).
- 1741248700920: Fixed the `types.ImageContent` keyword argument from `mime_type` to `mimeType`.
- 1741249445874: Tightened image extraction for speed: at most 5 pages, DPI 72, images downscaled to 800 px, JPEG quality 50, a base64 round-trip check before each image is emitted, and per-image try/except blocks so one bad page cannot abort the run.
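Tool registration and dispatch use the low-level MCP decorators shown in this sketch; the decorator names and the `types.Tool` schema come straight from the diffs, while the stubbed preview body is a placeholder for the real `quick_preview_pdf`.

```python
import mcp.types as types
from mcp.server.lowlevel import Server

app = Server("mcp-website-fetcher")


async def quick_preview_pdf(file_path: str) -> list[types.TextContent]:
    # Stand-in for the text-only preview implemented in server.py.
    return [types.TextContent(type="text", text=f"Preview of {file_path}")]


@app.call_tool()
async def fetch_tool(name: str, arguments: dict) -> list[types.TextContent]:
    # Dispatch on the tool name; each tool validates its own required argument.
    if name == "quick_pdf":
        if "file_path" not in arguments:
            return [types.TextContent(type="text",
                                      text="Error: Missing required argument 'file_path'")]
        return await quick_preview_pdf(arguments["file_path"])
    return [types.TextContent(type="text", text=f"Error: Unknown tool: {name}")]


@app.list_tools()
async def list_tools() -> list[types.Tool]:
    return [
        types.Tool(
            name="quick_pdf",
            description="Quickly preview a PDF file (text only, no images)",
            inputSchema={
                "type": "object",
                "required": ["file_path"],
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Local path to the PDF, e.g. '/path/to/document.pdf'",
                    }
                },
            },
        ),
    ]
```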
- 1741249744854: Reverted the previous tuning: back to 10 image pages, DPI 100, a 1000 px size cap, JPEG quality 70 and the simpler per-image flow (this revert also reintroduced the `mime_type` spelling). The pipeline both patches keep adjusting is sketched below.
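A sketch of that image pipeline, with the pdf2image/PIL calls and the `ImageContent(type=..., data=..., mimeType=...)` shape taken from the history (bare base64 without a `data:` prefix is where later patches settle); the limits shown are the post-revert values, and `pdf2image` needs poppler-utils installed on the system.

```python
import base64
import io

import mcp.types as types
from pdf2image import convert_from_path  # requires poppler-utils
from PIL import Image

MAX_IMG_PAGES = 10
MAX_SIDE = 1000      # longest edge after downscaling
JPEG_QUALITY = 70
DPI = 100


def render_pdf_pages(file_path: str) -> list[types.ImageContent]:
    """Render the first pages of a PDF and return them as base64 ImageContent items."""
    images = convert_from_path(
        file_path, dpi=DPI, fmt="jpeg",
        first_page=1, last_page=MAX_IMG_PAGES, thread_count=2,
    )
    contents = []
    for img in images:
        width, height = img.size
        if max(width, height) > MAX_SIDE:
            scale = MAX_SIDE / max(width, height)
            img = img.resize((int(width * scale), int(height * scale)), Image.LANCZOS)
        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=JPEG_QUALITY)
        img_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        contents.append(types.ImageContent(type="image", data=img_base64, mimeType="image/jpeg"))
    return contents
```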
- 1741250125141: Removed the `mood` tool and its long question description from dispatch and `list_tools`; renamed the `mcp_fetch` tool to `url`.
- 1741250141431: Deleted the now-unused `check_mood` helper ("I'm feeling great and happy to help you! ❤️").
- 1741250204548: Fixed `mime_type` back to `mimeType` on `ImageContent`.
- 1741250768436: Tried passing the image as a `url=` data URI instead of `data`/`mimeType`.
- 1741251115553: Reverted to `data` (bare base64, no `data:` prefix) plus `mimeType`.
- 1741251434733: Stopped returning image payloads altogether: each page now yields a text block with its dimensions, format and DPI, with a note that images are described as text because of display limitations.
- 1741251607044: Added `tempfile`/`shutil` imports, rendered page images (DPI back to 150) into a temporary directory, reworked all output as Markdown (document-info heading, metadata as bullet lists, size warnings as blockquotes, a per-page "image content" section with a placeholder description), and removed the temporary directory afterwards, including on the image-extraction error path; see the try/finally sketch below.
- 1741251641256: Applied the same Markdown formatting to `quick_preview_pdf` (document info, per-page `###` headings, closing note pointing at the full parser for images).
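One detail worth calling out from those patches is the temporary-directory lifecycle; the sketch below uses a try/finally form (a simplification: in the history, cleanup first ran only on the success and image-error paths and was later moved into the outer exception handler).

```python
import os
import shutil
import tempfile


def with_scratch_dir(file_path: str) -> str:
    """Create a scratch directory for rendered page images and always clean it up."""
    temp_dir = tempfile.mkdtemp()
    image_path = os.path.join(temp_dir, "images")
    os.makedirs(image_path, exist_ok=True)
    try:
        # ... render pages of file_path and write JPEGs under image_path here ...
        return f"images for {file_path} written under {image_path}"
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
```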
- 1741252175481: Switched `parse_pdf` to `pymupdf4llm`. Content now comes from `pymupdf4llm.to_markdown(doc=file_path, pages=..., page_chunks=True, write_images=True, image_path=..., image_format="jpg", dpi=150)` for at most 30 pages, the written image files are then walked and listed with their size and format via PIL (or a "no images extracted" note), and the old PyPDF2 + pdf2image path is kept as a fallback that runs when the pymupdf4llm call raises. Temporary-directory cleanup moves into the outer exception handler (guarded by `'temp_dir' in locals()`), and the dedicated `PdfReadError` branch is dropped. The extraction call is sketched below.
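A minimal sketch of that extraction path, assuming the `pymupdf4llm` package and the keyword arguments used in the history (`pages`, `page_chunks`, `write_images`, `image_path`, `image_format`, `dpi`); with `page_chunks=True` the call returns a list of per-page dictionaries, which the final patch in this history joins by their `text` fields.

```python
import os
import tempfile

import pymupdf4llm


def pdf_to_markdown(file_path: str, max_pages: int = 30) -> str:
    """Convert the first pages of a PDF to Markdown, writing images to a scratch dir."""
    image_path = os.path.join(tempfile.mkdtemp(), "images")
    os.makedirs(image_path, exist_ok=True)

    # server.py clamps the page range with PyPDF2 first (min(num_pages, 30));
    # here max_pages is assumed not to exceed the document length.
    chunks = pymupdf4llm.to_markdown(
        doc=file_path,
        pages=list(range(max_pages)),
        page_chunks=True,      # one dict per page instead of one big string
        write_images=True,     # save rendered images under image_path
        image_path=image_path,
        image_format="jpg",
        dpi=150,
    )

    if isinstance(chunks, list):  # page_chunks=True -> list of {"text": ..., ...} dicts
        return "\n\n".join(c.get("text", "") for c in chunks if isinstance(c, dict))
    return chunks                 # defensive: treat a plain string as already-joined Markdown
```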
- 1741252208365: Gave `quick_preview_pdf` the same treatment: `pymupdf4llm.to_markdown(..., write_images=False)` over at most 50 pages, with the PyPDF2 text loop retained as the fallback and the closing note kept.
- 1741258854279: Added an `analyze_image` helper built on OpenCV, NumPy and pytesseract: it OCRs each extracted image (`lang='chi_sim+eng'`), classifies it as photo, chart/graphic or text image from edge counts, Hough circles/lines and colour spread, scores sharpness with the variance of the Laplacian, and both the pymupdf4llm image listing and the pdf2image fallback report the results (recognised text in fenced blocks plus a one-line description per type). The heuristics are sketched below.
- 1741259379143: Removed `analyze_image` and its `cv2`/`numpy`/`pytesseract`/`json` imports again, restoring the plain filename/size/format listing.
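For reference, the heuristics in that (later removed) helper look roughly like this; `opencv-python` and `pytesseract` plus a Tesseract install with the `chi_sim` language data are assumed, and the thresholds are the ones appearing in the history rather than tuned values.

```python
import cv2
import numpy as np
import pytesseract


def analyze_image(image_path: str) -> dict:
    """OCR an image and classify it with simple edge/colour/blur heuristics."""
    img = cv2.imread(image_path)
    if img is None:
        return {"error": "could not read image", "path": image_path}

    height, width, _ = img.shape
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    try:
        text = pytesseract.image_to_string(gray, lang="chi_sim+eng").strip()
    except Exception as exc:  # e.g. missing language data
        text = f"OCR failed: {exc}"

    edges = cv2.Canny(gray, 100, 200)
    edge_count = int(np.count_nonzero(edges))
    color_std = float(np.std(img))

    image_type = "unknown"
    if color_std > 50 and edge_count > width * height * 0.05:
        image_type = "photo"
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 100, minLineLength=100, maxLineGap=10)
    if lines is not None and len(lines) > 10 and edge_count > width * height * 0.01:
        image_type = "chart/graphic"
    if text and len(text) > 50:
        image_type = "text image"

    blur_score = float(cv2.Laplacian(gray, cv2.CV_64F).var())  # higher = sharper
    return {
        "size": f"{width}x{height}",
        "type": image_type,
        "blur_score": round(blur_score, 2),
        "text": text or "no text detected",
    }
```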
- 1741259498505: Reintroduced the analysis as an optional feature: `cv2`/`numpy` and `pytesseract` are imported inside try/except and gated behind `CV2_AVAILABLE` and `TESSERACT_AVAILABLE` flags, render DPI rises to 200 and JPEG quality to 85, and each image gets an OCR text block, an edge-density and colour-count type guess, basic colour/grayscale mode info, and a Laplacian sharpness score whenever the libraries are present (gating pattern sketched below).
- 1741259592230: Backed all of that out once more and restored DPI 150 / JPEG quality 80 with the plain metadata listing.
基本图像特征描述\n- image_info += \"**基本图像特征:**\\n\\n\"\n- \n- # 检查是否为彩色图像\n- if img.mode == \"RGB\" or img.mode == \"RGBA\":\n- image_info += \"- 彩色图像\\n\"\n- elif img.mode == \"L\":\n- image_info += \"- 灰度图像\\n\"\n- elif img.mode == \"1\":\n- image_info += \"- 二值图像(黑白)\\n\"\n- else:\n- image_info += f\"- 图像模式: {img.mode}\\n\"\n- \n- # 检查图像清晰度(简单启发式)\n- if CV2_AVAILABLE:\n- try:\n- laplacian_var = cv2.Laplacian(gray, cv2.CV_64F).var()\n- if laplacian_var < 100:\n- image_info += \"- 图像可能较为模糊\\n\"\n- elif laplacian_var > 500:\n- image_info += \"- 图像较为清晰\\n\"\n- image_info += f\"- 清晰度评分: {laplacian_var:.2f}\\n\"\n- except:\n- pass\n- \n- image_info += \"\\n---\\n\\n\"\n+ image_info += \"---\\n\\n\"\n except Exception as e:\n image_info += f\"### 图像 {i+1}\\n\\n\"\n image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n@@ -315,9 +220,9 @@\n \n # 转换PDF页面为图片并保存\n images = convert_from_path(\n file_path, \n- dpi=200, # 提高DPI以获取更清晰的图像\n+ dpi=150, \n fmt=\"jpg\", \n first_page=1, \n last_page=max_img_pages,\n thread_count=2 # 使用多线程加速\n@@ -327,29 +232,19 @@\n image_markdown = \"\\n## 图片内容\\n\\n\"\n for i, img in enumerate(images):\n # 保存图片到临时目录\n img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n- img.save(img_path, \"JPEG\", quality=85) # 提高质量\n+ img.save(img_path, \"JPEG\", quality=80)\n \n # 获取图片尺寸\n width, height = img.size\n \n # 添加图片信息到Markdown\n image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 200\\n\\n\"\n+ image_markdown += f\"- DPI: 150\\n\\n\"\n \n- # 尝试分析图片内容\n- if TESSERACT_AVAILABLE:\n- try:\n- text_in_image = pytesseract.image_to_string(img)\n- if text_in_image.strip():\n- image_markdown += \"**图像中的文本内容:**\\n\\n\"\n- image_markdown += f\"```\\n{text_in_image.strip()}\\n```\\n\\n\"\n- except:\n- pass\n- \n # 添加分隔线\n image_markdown += \"---\\n\\n\"\n \n # 添加图片信息到结果\n" }, { "date": 1741259709571, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -100,25 +100,42 @@\n max_pages = min(num_pages, 30)\n pages_to_process = list(range(max_pages))\n \n # 使用PymuPDF4llm提取内容\n- md_content = pymupdf4llm.to_markdown(\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=\"正在使用PymuPDF4llm提取PDF内容,包括文本和图像...\"\n+ ))\n+ \n+ # 提取PDF内容,包括图像\n+ md_content_chunks = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n page_chunks=True,\n write_images=True,\n image_path=image_path,\n image_format=\"jpg\",\n- dpi=150\n+ dpi=200\n )\n \n+ # 处理提取的文本内容\n+ if isinstance(md_content_chunks, list):\n+ # 如果返回的是分块内容\n+ md_content = \"\"\n+ for chunk in md_content_chunks:\n+ if isinstance(chunk, dict) and 'text' in chunk:\n+ md_content += chunk['text'] + \"\\n\\n\"\n+ else:\n+ # 如果返回的是单个字符串\n+ md_content = md_content_chunks\n+ \n # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n else:\n md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n \n- # 添加提取的内容到结果\n+ # 添加提取的文本内容到结果\n results.append(types.TextContent(\n type=\"text\",\n text=md_content\n ))\n@@ -131,38 +148,67 @@\n image_files.append(os.path.join(root, file))\n \n # 添加图像信息\n if image_files:\n- image_info = \"\\n## 提取的图像信息\\n\\n\"\n- image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"\\n## 提取的图像\\n\\n共提取了 {len(image_files)} 张图像。\"\n+ ))\n \n+ # 
处理每个图像\n for i, img_file in enumerate(image_files):\n try:\n with Image.open(img_file) as img:\n width, height = img.size\n format_name = img.format\n \n- image_info += f\"### 图像 {i+1}\\n\\n\"\n- image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_info += f\"- 格式: {format_name}\\n\\n\"\n- image_info += \"---\\n\\n\"\n- except Exception as e:\n- image_info += f\"### 图像 {i+1}\\n\\n\"\n- image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n- image_info += \"---\\n\\n\"\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_info\n- ))\n+ # 将图像转换为base64编码\n+ with open(img_file, \"rb\") as img_binary:\n+ img_data = img_binary.read()\n+ img_base64 = base64.b64encode(img_data).decode('utf-8')\n+ \n+ # 添加图像内容到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"\\n### 图像 {i+1}\\n\\n- 文件名: {os.path.basename(img_file)}\\n- 尺寸: {width}x{height} 像素\\n- 格式: {format_name}\\n\"\n+ ))\n+ \n+ # 添加图像数据\n+ results.append(types.ImageContent(\n+ type=\"image\",\n+ data=img_base64,\n+ mimeType=f\"image/{format_name.lower()}\"\n+ ))\n+ except Exception as img_error:\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"\\n### 图像 {i+1}\\n\\n- 文件名: {os.path.basename(img_file)}\\n- 错误: 无法处理图像: {str(img_error)}\\n\"\n+ ))\n else:\n results.append(types.TextContent(\n type=\"text\",\n text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n ))\n \n+ # 如果md_content_chunks包含图像信息,添加到结果\n+ if isinstance(md_content_chunks, list):\n+ image_info = \"\\n## 图像详细信息\\n\\n\"\n+ for i, chunk in enumerate(md_content_chunks):\n+ if isinstance(chunk, dict) and 'images' in chunk and chunk['images']:\n+ image_info += f\"### 第 {i+1} 块内容中的图像\\n\\n\"\n+ for j, img_info in enumerate(chunk['images']):\n+ image_info += f\"- 图像 {j+1}:\\n\"\n+ for key, value in img_info.items():\n+ if key != 'image': # 排除二进制图像数据\n+ image_info += f\" - {key}: {value}\\n\"\n+ image_info += \"\\n\"\n+ \n+ if image_info != \"\\n## 图像详细信息\\n\\n\":\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_info\n+ ))\n+ \n except Exception as extract_error:\n # 如果PymuPDF4llm提取失败,回退到原来的方法\n results.append(types.TextContent(\n type=\"text\",\n@@ -228,31 +274,34 @@\n thread_count=2 # 使用多线程加速\n )\n \n # 处理每个页面图片\n- image_markdown = \"\\n## 图片内容\\n\\n\"\n for i, img in enumerate(images):\n # 保存图片到临时目录\n img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n img.save(img_path, \"JPEG\", quality=80)\n \n # 获取图片尺寸\n width, height = img.size\n \n- # 添加图片信息到Markdown\n- image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n- image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 150\\n\\n\"\n+ # 添加图片信息\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"\\n### 第 {i+1} 页图片\\n\\n- 尺寸: {width}x{height} 像素\\n- 格式: JPEG\\n- DPI: 150\\n\"\n+ ))\n \n- # 添加分隔线\n- image_markdown += \"---\\n\\n\"\n- \n- # 添加图片信息到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n+ # 将图片转换为base64编码\n+ img_buffer = io.BytesIO()\n+ img.save(img_buffer, format=\"JPEG\", quality=70)\n+ img_data = img_buffer.getvalue()\n+ img_base64 = base64.b64encode(img_data).decode('utf-8')\n+ \n+ # 添加图片数据\n+ results.append(types.ImageContent(\n+ type=\"image\",\n+ data=img_base64,\n+ mimeType=\"image/jpeg\"\n+ ))\n except Exception as img_error:\n # 如果图片提取失败,添加错误信息但继续\n results.append(types.TextContent(\n type=\"text\",\n" }, { "date": 
1741259743066, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -371,15 +371,31 @@\n pages_to_process = list(range(max_pages))\n \n try:\n # 使用PymuPDF4llm提取内容,但不提取图像\n- md_content = pymupdf4llm.to_markdown(\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=\"正在使用PymuPDF4llm提取PDF文本内容...\"\n+ ))\n+ \n+ md_content_chunks = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n page_chunks=True,\n write_images=False # 不提取图像\n )\n \n+ # 处理提取的文本内容\n+ if isinstance(md_content_chunks, list):\n+ # 如果返回的是分块内容\n+ md_content = \"\"\n+ for chunk in md_content_chunks:\n+ if isinstance(chunk, dict) and 'text' in chunk:\n+ md_content += chunk['text'] + \"\\n\\n\"\n+ else:\n+ # 如果返回的是单个字符串\n+ md_content = md_content_chunks\n+ \n # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n else:\n" }, { "date": 1741260271788, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -100,42 +100,25 @@\n max_pages = min(num_pages, 30)\n pages_to_process = list(range(max_pages))\n \n # 使用PymuPDF4llm提取内容\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"正在使用PymuPDF4llm提取PDF内容,包括文本和图像...\"\n- ))\n- \n- # 提取PDF内容,包括图像\n- md_content_chunks = pymupdf4llm.to_markdown(\n+ md_content = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n page_chunks=True,\n write_images=True,\n image_path=image_path,\n image_format=\"jpg\",\n- dpi=200\n+ dpi=150\n )\n \n- # 处理提取的文本内容\n- if isinstance(md_content_chunks, list):\n- # 如果返回的是分块内容\n- md_content = \"\"\n- for chunk in md_content_chunks:\n- if isinstance(chunk, dict) and 'text' in chunk:\n- md_content += chunk['text'] + \"\\n\\n\"\n- else:\n- # 如果返回的是单个字符串\n- md_content = md_content_chunks\n- \n # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n else:\n md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n \n- # 添加提取的文本内容到结果\n+ # 添加提取的内容到结果\n results.append(types.TextContent(\n type=\"text\",\n text=md_content\n ))\n@@ -148,67 +131,38 @@\n image_files.append(os.path.join(root, file))\n \n # 添加图像信息\n if image_files:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"\\n## 提取的图像\\n\\n共提取了 {len(image_files)} 张图像。\"\n- ))\n+ image_info = \"\\n## 提取的图像信息\\n\\n\"\n+ image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n \n- # 处理每个图像\n for i, img_file in enumerate(image_files):\n try:\n with Image.open(img_file) as img:\n width, height = img.size\n format_name = img.format\n \n- # 将图像转换为base64编码\n- with open(img_file, \"rb\") as img_binary:\n- img_data = img_binary.read()\n- img_base64 = base64.b64encode(img_data).decode('utf-8')\n- \n- # 添加图像内容到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"\\n### 图像 {i+1}\\n\\n- 文件名: {os.path.basename(img_file)}\\n- 尺寸: {width}x{height} 像素\\n- 格式: {format_name}\\n\"\n- ))\n- \n- # 添加图像数据\n- results.append(types.ImageContent(\n- type=\"image\",\n- data=img_base64,\n- mimeType=f\"image/{format_name.lower()}\"\n- ))\n- except Exception as img_error:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"\\n### 图像 {i+1}\\n\\n- 文件名: {os.path.basename(img_file)}\\n- 错误: 无法处理图像: {str(img_error)}\\n\"\n- ))\n+ image_info += f\"### 图像 {i+1}\\n\\n\"\n+ image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n+ image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ 
image_info += f\"- 格式: {format_name}\\n\\n\"\n+ image_info += \"---\\n\\n\"\n+ except Exception as e:\n+ image_info += f\"### 图像 {i+1}\\n\\n\"\n+ image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n+ image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n+ image_info += \"---\\n\\n\"\n+ \n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_info\n+ ))\n else:\n results.append(types.TextContent(\n type=\"text\",\n text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n ))\n \n- # 如果md_content_chunks包含图像信息,添加到结果\n- if isinstance(md_content_chunks, list):\n- image_info = \"\\n## 图像详细信息\\n\\n\"\n- for i, chunk in enumerate(md_content_chunks):\n- if isinstance(chunk, dict) and 'images' in chunk and chunk['images']:\n- image_info += f\"### 第 {i+1} 块内容中的图像\\n\\n\"\n- for j, img_info in enumerate(chunk['images']):\n- image_info += f\"- 图像 {j+1}:\\n\"\n- for key, value in img_info.items():\n- if key != 'image': # 排除二进制图像数据\n- image_info += f\" - {key}: {value}\\n\"\n- image_info += \"\\n\"\n- \n- if image_info != \"\\n## 图像详细信息\\n\\n\":\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_info\n- ))\n- \n except Exception as extract_error:\n # 如果PymuPDF4llm提取失败,回退到原来的方法\n results.append(types.TextContent(\n type=\"text\",\n@@ -274,34 +228,31 @@\n thread_count=2 # 使用多线程加速\n )\n \n # 处理每个页面图片\n+ image_markdown = \"\\n## 图片内容\\n\\n\"\n for i, img in enumerate(images):\n # 保存图片到临时目录\n img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n img.save(img_path, \"JPEG\", quality=80)\n \n # 获取图片尺寸\n width, height = img.size\n \n- # 添加图片信息\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"\\n### 第 {i+1} 页图片\\n\\n- 尺寸: {width}x{height} 像素\\n- 格式: JPEG\\n- DPI: 150\\n\"\n- ))\n+ # 添加图片信息到Markdown\n+ image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n+ image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_markdown += f\"- 格式: JPEG\\n\"\n+ image_markdown += f\"- DPI: 150\\n\\n\"\n \n- # 将图片转换为base64编码\n- img_buffer = io.BytesIO()\n- img.save(img_buffer, format=\"JPEG\", quality=70)\n- img_data = img_buffer.getvalue()\n- img_base64 = base64.b64encode(img_data).decode('utf-8')\n- \n- # 添加图片数据\n- results.append(types.ImageContent(\n- type=\"image\",\n- data=img_base64,\n- mimeType=\"image/jpeg\"\n- ))\n+ # 添加分隔线\n+ image_markdown += \"---\\n\\n\"\n+ \n+ # 添加图片信息到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_markdown\n+ ))\n except Exception as img_error:\n # 如果图片提取失败,添加错误信息但继续\n results.append(types.TextContent(\n type=\"text\",\n@@ -371,31 +322,15 @@\n pages_to_process = list(range(max_pages))\n \n try:\n # 使用PymuPDF4llm提取内容,但不提取图像\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"正在使用PymuPDF4llm提取PDF文本内容...\"\n- ))\n- \n- md_content_chunks = pymupdf4llm.to_markdown(\n+ md_content = pymupdf4llm.to_markdown(\n doc=file_path,\n pages=pages_to_process,\n page_chunks=True,\n write_images=False # 不提取图像\n )\n \n- # 处理提取的文本内容\n- if isinstance(md_content_chunks, list):\n- # 如果返回的是分块内容\n- md_content = \"\"\n- for chunk in md_content_chunks:\n- if isinstance(chunk, dict) and 'text' in chunk:\n- md_content += chunk['text'] + \"\\n\\n\"\n- else:\n- # 如果返回的是单个字符串\n- md_content = md_content_chunks\n- \n # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n else:\n" }, { "date": 1741260698302, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -8,13 +8,14 @@\n import io\n import tempfile\n import 
shutil\n import pathlib\n-from typing import List, Dict, Any, Optional\n+from typing import List, Dict, Any, Optional, Tuple\n import PyPDF2\n from pdf2image import convert_from_path\n from PIL import Image\n import pymupdf4llm\n+import fitz # PyMuPDF\n \n \n async def fetch_website(\n url: str,\n@@ -49,8 +50,100 @@\n text=f\"Error: Failed to fetch website: {str(e)}\"\n )]\n \n \n+async def extract_images_from_pdf(file_path: str, output_dir: str) -> List[Dict[str, Any]]:\n+ \"\"\"\n+ 使用PyMuPDF (fitz) 从PDF中提取图片,这比pdf2image更高效且能提取嵌入图片\n+ \n+ Args:\n+ file_path: PDF文件路径\n+ output_dir: 图片输出目录\n+ \n+ Returns:\n+ 提取的图片信息列表\n+ \"\"\"\n+ image_info = []\n+ \n+ try:\n+ # 打开PDF文件\n+ pdf_document = fitz.open(file_path)\n+ \n+ # 遍历每一页\n+ for page_index in range(len(pdf_document)):\n+ page = pdf_document[page_index]\n+ \n+ # 获取页面上的图片\n+ image_list = page.get_images(full=True)\n+ \n+ # 遍历页面上的每个图片\n+ for img_index, img in enumerate(image_list):\n+ xref = img[0] # 图片的xref号\n+ base_image = pdf_document.extract_image(xref)\n+ image_bytes = base_image[\"image\"]\n+ image_ext = base_image[\"ext\"] # 图片扩展名\n+ \n+ # 保存图片到文件\n+ image_filename = f\"page_{page_index + 1}_img_{img_index + 1}.{image_ext}\"\n+ image_path = os.path.join(output_dir, image_filename)\n+ \n+ with open(image_path, \"wb\") as img_file:\n+ img_file.write(image_bytes)\n+ \n+ # 获取图片信息\n+ with Image.open(image_path) as pil_img:\n+ width, height = pil_img.size\n+ format_name = pil_img.format\n+ \n+ # 添加图片信息到列表\n+ image_info.append({\n+ \"filename\": image_filename,\n+ \"path\": image_path,\n+ \"page\": page_index + 1,\n+ \"width\": width,\n+ \"height\": height,\n+ \"format\": format_name,\n+ \"size_bytes\": len(image_bytes)\n+ })\n+ \n+ # 如果没有找到嵌入图片,尝试渲染页面为图片\n+ if not image_info:\n+ for page_index in range(len(pdf_document)):\n+ page = pdf_document[page_index]\n+ \n+ # 将页面渲染为图片\n+ pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x缩放以获得更好的质量\n+ image_filename = f\"page_{page_index + 1}_rendered.png\"\n+ image_path = os.path.join(output_dir, image_filename)\n+ \n+ # 保存渲染的图片\n+ pix.save(image_path)\n+ \n+ # 获取图片信息\n+ with Image.open(image_path) as pil_img:\n+ width, height = pil_img.size\n+ format_name = pil_img.format\n+ \n+ # 添加图片信息到列表\n+ image_info.append({\n+ \"filename\": image_filename,\n+ \"path\": image_path,\n+ \"page\": page_index + 1,\n+ \"width\": width,\n+ \"height\": height,\n+ \"format\": format_name,\n+ \"size_bytes\": os.path.getsize(image_path),\n+ \"type\": \"rendered_page\"\n+ })\n+ \n+ pdf_document.close()\n+ return image_info\n+ \n+ except Exception as e:\n+ print(f\"提取图片时出错: {str(e)}\")\n+ return []\n+\n+\n async def parse_pdf(\n file_path: str,\n ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"Parse a PDF file and extract text and images with optimized performance using PymuPDF4llm.\"\"\"\n@@ -122,40 +215,40 @@\n type=\"text\",\n text=md_content\n ))\n \n- # 处理提取的图像\n- image_files = []\n- for root, dirs, files in os.walk(image_path):\n- for file in files:\n- if file.lower().endswith(('.jpg', '.jpeg', '.png')):\n- image_files.append(os.path.join(root, file))\n+ # 使用PyMuPDF提取图片(更高级的图片提取方法)\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=\"正在使用高级方法提取图片,这可能需要一些时间...\"\n+ ))\n \n+ # 提取图片\n+ image_info = await extract_images_from_pdf(file_path, image_path)\n+ \n # 添加图像信息\n- if image_files:\n- image_info = \"\\n## 提取的图像信息\\n\\n\"\n- image_info += f\"共提取了 {len(image_files)} 张图像。\\n\\n\"\n+ if image_info:\n+ image_markdown = \"\\n## 提取的图像信息\\n\\n\"\n+ image_markdown += f\"共提取了 
{len(image_info)} 张图像。\\n\\n\"\n \n- for i, img_file in enumerate(image_files):\n- try:\n- with Image.open(img_file) as img:\n- width, height = img.size\n- format_name = img.format\n- \n- image_info += f\"### 图像 {i+1}\\n\\n\"\n- image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_info += f\"- 格式: {format_name}\\n\\n\"\n- image_info += \"---\\n\\n\"\n- except Exception as e:\n- image_info += f\"### 图像 {i+1}\\n\\n\"\n- image_info += f\"- 文件名: {os.path.basename(img_file)}\\n\"\n- image_info += f\"- 错误: 无法读取图像信息: {str(e)}\\n\\n\"\n- image_info += \"---\\n\\n\"\n+ for i, img_data in enumerate(image_info):\n+ image_markdown += f\"### 图像 {i+1}\\n\\n\"\n+ image_markdown += f\"- 文件名: {img_data['filename']}\\n\"\n+ image_markdown += f\"- 页码: {img_data['page']}\\n\"\n+ image_markdown += f\"- 尺寸: {img_data['width']}x{img_data['height']} 像素\\n\"\n+ image_markdown += f\"- 格式: {img_data['format']}\\n\"\n+ image_markdown += f\"- 大小: {img_data['size_bytes'] / 1024:.2f} KB\\n\"\n+ \n+ if 'type' in img_data and img_data['type'] == 'rendered_page':\n+ image_markdown += f\"- 类型: 渲染页面\\n\"\n+ else:\n+ image_markdown += f\"- 类型: 嵌入图像\\n\"\n+ \n+ image_markdown += \"\\n---\\n\\n\"\n \n results.append(types.TextContent(\n type=\"text\",\n- text=image_info\n+ text=image_markdown\n ))\n else:\n results.append(types.TextContent(\n type=\"text\",\n@@ -207,59 +300,149 @@\n \n # 添加文本内容到结果\n results.append(types.TextContent(type=\"text\", text=text_content))\n \n- # 尝试使用pdf2image提取图片\n+ # 尝试使用PyMuPDF提取图片\n try:\n- # 限制处理的页数以提高性能\n- max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n- \n results.append(types.TextContent(\n type=\"text\",\n- text=f\"正在提取图片,这可能需要一些时间...\"\n+ text=\"正在使用备用方法提取图片,这可能需要一些时间...\"\n ))\n \n- # 转换PDF页面为图片并保存\n- images = convert_from_path(\n- file_path, \n- dpi=150, \n- fmt=\"jpg\", \n- first_page=1, \n- last_page=max_img_pages,\n- thread_count=2 # 使用多线程加速\n- )\n+ # 提取图片\n+ image_info = await extract_images_from_pdf(file_path, image_path)\n \n- # 处理每个页面图片\n- image_markdown = \"\\n## 图片内容\\n\\n\"\n- for i, img in enumerate(images):\n- # 保存图片到临时目录\n- img_path = os.path.join(temp_dir, f\"page_{i+1}.jpg\")\n- img.save(img_path, \"JPEG\", quality=80)\n+ # 添加图像信息\n+ if image_info:\n+ image_markdown = \"\\n## 提取的图像信息\\n\\n\"\n+ image_markdown += f\"共提取了 {len(image_info)} 张图像。\\n\\n\"\n \n- # 获取图片尺寸\n- width, height = img.size\n+ for i, img_data in enumerate(image_info):\n+ image_markdown += f\"### 图像 {i+1}\\n\\n\"\n+ image_markdown += f\"- 文件名: {img_data['filename']}\\n\"\n+ image_markdown += f\"- 页码: {img_data['page']}\\n\"\n+ image_markdown += f\"- 尺寸: {img_data['width']}x{img_data['height']} 像素\\n\"\n+ image_markdown += f\"- 格式: {img_data['format']}\\n\"\n+ image_markdown += f\"- 大小: {img_data['size_bytes'] / 1024:.2f} KB\\n\"\n+ \n+ if 'type' in img_data and img_data['type'] == 'rendered_page':\n+ image_markdown += f\"- 类型: 渲染页面\\n\"\n+ else:\n+ image_markdown += f\"- 类型: 嵌入图像\\n\"\n+ \n+ image_markdown += \"\\n---\\n\\n\"\n \n- # 添加图片信息到Markdown\n- image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n- image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 150\\n\\n\"\n- \n- # 添加分隔线\n- image_markdown += \"---\\n\\n\"\n- \n- # 添加图片信息到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_markdown\n+ ))\n+ else:\n+ # 如果PyMuPDF也无法提取图片,尝试使用pdf2image\n+ try:\n+ # 限制处理的页数以提高性能\n+ max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n+ \n+ 
results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"正在使用最后的备用方法提取图片,这可能需要一些时间...\"\n+ ))\n+ \n+ # 转换PDF页面为图片并保存\n+ images = convert_from_path(\n+ file_path, \n+ dpi=150, \n+ fmt=\"jpg\", \n+ first_page=1, \n+ last_page=max_img_pages,\n+ thread_count=2 # 使用多线程加速\n+ )\n+ \n+ # 处理每个页面图片\n+ image_markdown = \"\\n## 图片内容\\n\\n\"\n+ for i, img in enumerate(images):\n+ # 保存图片到临时目录\n+ img_path = os.path.join(image_path, f\"page_{i+1}.jpg\")\n+ img.save(img_path, \"JPEG\", quality=80)\n+ \n+ # 获取图片尺寸\n+ width, height = img.size\n+ \n+ # 添加图片信息到Markdown\n+ image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n+ image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_markdown += f\"- 格式: JPEG\\n\"\n+ image_markdown += f\"- DPI: 150\\n\"\n+ image_markdown += f\"- 类型: 渲染页面\\n\\n\"\n+ \n+ # 添加分隔线\n+ image_markdown += \"---\\n\\n\"\n+ \n+ # 添加图片信息到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_markdown\n+ ))\n+ except Exception as img_error:\n+ # 如果图片提取失败,添加错误信息但继续\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n+ f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n+ ))\n+ except Exception as pymupdf_error:\n results.append(types.TextContent(\n type=\"text\",\n- text=image_markdown\n+ text=f\"警告: 使用PyMuPDF提取图片失败: {str(pymupdf_error)}\\n\"\n+ f\"正在尝试使用最后的备用方法...\"\n ))\n- except Exception as img_error:\n- # 如果图片提取失败,添加错误信息但继续\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n- f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n- ))\n+ \n+ # 尝试使用pdf2image作为最后的备用方法\n+ try:\n+ # 限制处理的页数以提高性能\n+ max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n+ \n+ # 转换PDF页面为图片并保存\n+ images = convert_from_path(\n+ file_path, \n+ dpi=150, \n+ fmt=\"jpg\", \n+ first_page=1, \n+ last_page=max_img_pages,\n+ thread_count=2 # 使用多线程加速\n+ )\n+ \n+ # 处理每个页面图片\n+ image_markdown = \"\\n## 图片内容\\n\\n\"\n+ for i, img in enumerate(images):\n+ # 保存图片到临时目录\n+ img_path = os.path.join(image_path, f\"page_{i+1}.jpg\")\n+ img.save(img_path, \"JPEG\", quality=80)\n+ \n+ # 获取图片尺寸\n+ width, height = img.size\n+ \n+ # 添加图片信息到Markdown\n+ image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n+ image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n+ image_markdown += f\"- 格式: JPEG\\n\"\n+ image_markdown += f\"- DPI: 150\\n\"\n+ image_markdown += f\"- 类型: 渲染页面\\n\\n\"\n+ \n+ # 添加分隔线\n+ image_markdown += \"---\\n\\n\"\n+ \n+ # 添加图片信息到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=image_markdown\n+ ))\n+ except Exception as img_error:\n+ # 如果图片提取失败,添加错误信息但继续\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n+ f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n+ ))\n \n # 清理临时目录\n shutil.rmtree(temp_dir)\n \n" }, { "date": 1741260735410, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -504,65 +504,113 @@\n max_pages = min(num_pages, 50) # 快速模式可以处理更多页\n pages_to_process = list(range(max_pages))\n \n try:\n- # 使用PymuPDF4llm提取内容,但不提取图像\n- md_content = pymupdf4llm.to_markdown(\n- doc=file_path,\n- pages=pages_to_process,\n- page_chunks=True,\n- write_images=False # 不提取图像\n- )\n+ # 尝试使用PyMuPDF提取文本(通常比PyPDF2更快更准确)\n+ pdf_document = fitz.open(file_path)\n \n+ # 提取文本内容\n+ text_content = \"\"\n+ \n+ # 添加PDF元数据\n+ text_content += f\"## PDF文档信息\\n\\n\"\n+ text_content += f\"- 页数: {num_pages}\\n\"\n+ \n+ # 从PyMuPDF获取元数据\n+ metadata = pdf_document.metadata\n+ if metadata:\n+ for key, value in metadata.items():\n+ if value 
and str(value).strip():\n+ text_content += f\"- {key}: {value}\\n\"\n+ \n # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n- md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n- else:\n- md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n+ text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\"\n \n+ text_content += \"\\n## 内容摘要\\n\\n\"\n+ \n+ # 逐页提取文本\n+ for page_num in range(max_pages):\n+ page = pdf_document[page_num]\n+ page_text = page.get_text()\n+ \n+ if page_text.strip():\n+ text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n+ text_content += page_text.strip() + \"\\n\"\n+ \n+ pdf_document.close()\n+ \n # 添加提取的内容到结果\n results.append(types.TextContent(\n type=\"text\",\n- text=md_content\n+ text=text_content\n ))\n- except Exception as extract_error:\n- # 如果PymuPDF4llm提取失败,回退到原来的方法\n+ \n+ except Exception as pymupdf_error:\n+ # 如果PyMuPDF提取失败,回退到PymuPDF4llm\n results.append(types.TextContent(\n type=\"text\",\n- text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用备用方法...\"\n+ text=f\"警告: 使用PyMuPDF提取内容失败: {str(pymupdf_error)}\\n正在尝试使用备用方法...\"\n ))\n \n- # 使用PyPDF2提取文本\n- text_content = \"\"\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n+ try:\n+ # 使用PymuPDF4llm提取内容,但不提取图像\n+ md_content = pymupdf4llm.to_markdown(\n+ doc=file_path,\n+ pages=pages_to_process,\n+ page_chunks=True,\n+ write_images=False # 不提取图像\n+ )\n \n- # 添加PDF元数据\n- text_content += f\"## PDF文档信息\\n\\n\"\n- text_content += f\"- 页数: {num_pages}\\n\"\n- if reader.metadata:\n- for key, value in reader.metadata.items():\n- if key.startswith('/'):\n- key = key[1:]\n- if value and str(value).strip():\n- text_content += f\"- {key}: {value}\\n\"\n- \n- # 限制处理的页数\n+ # 如果处理的页数少于总页数,添加提示\n if max_pages < num_pages:\n- text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\"\n+ md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n+ else:\n+ md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n \n- text_content += \"\\n## 内容摘要\\n\\n\"\n+ # 添加提取的内容到结果\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=md_content\n+ ))\n+ except Exception as extract_error:\n+ # 如果PymuPDF4llm提取失败,回退到原来的方法\n+ results.append(types.TextContent(\n+ type=\"text\",\n+ text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用最后的备用方法...\"\n+ ))\n \n- # 逐页提取文本\n- for page_num in range(max_pages):\n- page = reader.pages[page_num]\n- page_text = page.extract_text()\n- if page_text:\n- text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n- text_content += page_text + \"\\n\"\n- \n- # 添加文本内容到结果\n- results.append(types.TextContent(type=\"text\", text=text_content))\n+ # 使用PyPDF2提取文本\n+ text_content = \"\"\n+ with open(file_path, 'rb') as file:\n+ reader = PyPDF2.PdfReader(file)\n+ \n+ # 添加PDF元数据\n+ text_content += f\"## PDF文档信息\\n\\n\"\n+ text_content += f\"- 页数: {num_pages}\\n\"\n+ if reader.metadata:\n+ for key, value in reader.metadata.items():\n+ if key.startswith('/'):\n+ key = key[1:]\n+ if value and str(value).strip():\n+ text_content += f\"- {key}: {value}\\n\"\n+ \n+ # 限制处理的页数\n+ if max_pages < num_pages:\n+ text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\"\n+ \n+ text_content += \"\\n## 内容摘要\\n\\n\"\n+ \n+ # 逐页提取文本\n+ for page_num in range(max_pages):\n+ page = reader.pages[page_num]\n+ page_text = page.extract_text()\n+ if page_text:\n+ text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n+ text_content += page_text + \"\\n\"\n+ \n+ # 添加文本内容到结果\n+ 
results.append(types.TextContent(type=\"text\", text=text_content))\n \n # 添加提示信息\n results.append(types.TextContent(\n type=\"text\",\n" }, { "date": 1741332401240, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -1,777 +1,54 @@\n-import anyio\n+\"\"\"\n+MCP服务器主文件,处理命令行参数并启动相应的适配器\n+\"\"\"\n+\n import click\n-import httpx\n-import mcp.types as types\n-from mcp.server.lowlevel import Server\n-import os\n-import base64\n-import io\n-import tempfile\n-import shutil\n-import pathlib\n-from typing import List, Dict, Any, Optional, Tuple\n-import PyPDF2\n-from pdf2image import convert_from_path\n-from PIL import Image\n-import pymupdf4llm\n-import fitz # PyMuPDF\n+from typing import Optional\n+from .tools import all_tools\n+from .core.sse_adapter import SseAdapter\n+from .core.stdio_adapter import StdioAdapter\n \n \n-async def fetch_website(\n- url: str,\n-) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- headers = {\n- \"User-Agent\": \"MCP Test Server (github.com/modelcontextprotocol/python-sdk)\"\n- }\n- try:\n- timeout = httpx.Timeout(10.0, connect=5.0)\n- async with httpx.AsyncClient(\n- follow_redirects=True, \n- headers=headers,\n- timeout=timeout\n- ) as client:\n- response = await client.get(url)\n- response.raise_for_status()\n- return [types.TextContent(type=\"text\", text=response.text)]\n- except httpx.TimeoutException:\n- return [types.TextContent(\n- type=\"text\",\n- text=\"Error: Request timed out while trying to fetch the website.\"\n- )]\n- except httpx.HTTPStatusError as e:\n- return [types.TextContent(\n- type=\"text\",\n- text=(f\"Error: HTTP {e.response.status_code} \"\n- \"error while fetching the website.\")\n- )]\n- except Exception as e:\n- return [types.TextContent(\n- type=\"text\",\n- text=f\"Error: Failed to fetch website: {str(e)}\"\n- )]\n+APP_NAME = \"mcp-development-framework\"\n \n \n-async def extract_images_from_pdf(file_path: str, output_dir: str) -> List[Dict[str, Any]]:\n- \"\"\"\n- 使用PyMuPDF (fitz) 从PDF中提取图片,这比pdf2image更高效且能提取嵌入图片\n- \n- Args:\n- file_path: PDF文件路径\n- output_dir: 图片输出目录\n- \n- Returns:\n- 提取的图片信息列表\n- \"\"\"\n- image_info = []\n- \n- try:\n- # 打开PDF文件\n- pdf_document = fitz.open(file_path)\n- \n- # 遍历每一页\n- for page_index in range(len(pdf_document)):\n- page = pdf_document[page_index]\n- \n- # 获取页面上的图片\n- image_list = page.get_images(full=True)\n- \n- # 遍历页面上的每个图片\n- for img_index, img in enumerate(image_list):\n- xref = img[0] # 图片的xref号\n- base_image = pdf_document.extract_image(xref)\n- image_bytes = base_image[\"image\"]\n- image_ext = base_image[\"ext\"] # 图片扩展名\n- \n- # 保存图片到文件\n- image_filename = f\"page_{page_index + 1}_img_{img_index + 1}.{image_ext}\"\n- image_path = os.path.join(output_dir, image_filename)\n- \n- with open(image_path, \"wb\") as img_file:\n- img_file.write(image_bytes)\n- \n- # 获取图片信息\n- with Image.open(image_path) as pil_img:\n- width, height = pil_img.size\n- format_name = pil_img.format\n- \n- # 添加图片信息到列表\n- image_info.append({\n- \"filename\": image_filename,\n- \"path\": image_path,\n- \"page\": page_index + 1,\n- \"width\": width,\n- \"height\": height,\n- \"format\": format_name,\n- \"size_bytes\": len(image_bytes)\n- })\n- \n- # 如果没有找到嵌入图片,尝试渲染页面为图片\n- if not image_info:\n- for page_index in range(len(pdf_document)):\n- page = pdf_document[page_index]\n- \n- # 将页面渲染为图片\n- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # 2x缩放以获得更好的质量\n- image_filename = f\"page_{page_index + 1}_rendered.png\"\n- image_path = 
os.path.join(output_dir, image_filename)\n- \n- # 保存渲染的图片\n- pix.save(image_path)\n- \n- # 获取图片信息\n- with Image.open(image_path) as pil_img:\n- width, height = pil_img.size\n- format_name = pil_img.format\n- \n- # 添加图片信息到列表\n- image_info.append({\n- \"filename\": image_filename,\n- \"path\": image_path,\n- \"page\": page_index + 1,\n- \"width\": width,\n- \"height\": height,\n- \"format\": format_name,\n- \"size_bytes\": os.path.getsize(image_path),\n- \"type\": \"rendered_page\"\n- })\n- \n- pdf_document.close()\n- return image_info\n- \n- except Exception as e:\n- print(f\"提取图片时出错: {str(e)}\")\n- return []\n-\n-\n-async def parse_pdf(\n- file_path: str,\n-) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- \"\"\"Parse a PDF file and extract text and images with optimized performance using PymuPDF4llm.\"\"\"\n- results = []\n- \n- # 添加初始状态提示\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"开始处理PDF文件,请稍候...\"\n- ))\n- \n- # 检查文件是否存在\n- if not os.path.exists(file_path):\n- return [types.TextContent(\n- type=\"text\",\n- text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n- )]\n- \n- try:\n- # 创建临时目录用于存储图片\n- temp_dir = tempfile.mkdtemp()\n- image_path = os.path.join(temp_dir, \"images\")\n- os.makedirs(image_path, exist_ok=True)\n- \n- # 添加文件大小信息\n- file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"文件大小: {file_size_mb:.2f} MB\"\n- ))\n- \n- # 对大文件提供警告\n- if file_size_mb > 10:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 文件较大 ({file_size_mb:.2f} MB),处理可能需要较长时间。\"\n- ))\n- \n- # 使用PymuPDF4llm提取PDF内容(包括文本和图像)\n- try:\n- # 获取PDF页数\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n- num_pages = len(reader.pages)\n- \n- # 限制处理的页数\n- max_pages = min(num_pages, 30)\n- pages_to_process = list(range(max_pages))\n- \n- # 使用PymuPDF4llm提取内容\n- md_content = pymupdf4llm.to_markdown(\n- doc=file_path,\n- pages=pages_to_process,\n- page_chunks=True,\n- write_images=True,\n- image_path=image_path,\n- image_format=\"jpg\",\n- dpi=150\n- )\n- \n- # 如果处理的页数少于总页数,添加提示\n- if max_pages < num_pages:\n- md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n- else:\n- md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n- \n- # 添加提取的内容到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=md_content\n- ))\n- \n- # 使用PyMuPDF提取图片(更高级的图片提取方法)\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"正在使用高级方法提取图片,这可能需要一些时间...\"\n- ))\n- \n- # 提取图片\n- image_info = await extract_images_from_pdf(file_path, image_path)\n- \n- # 添加图像信息\n- if image_info:\n- image_markdown = \"\\n## 提取的图像信息\\n\\n\"\n- image_markdown += f\"共提取了 {len(image_info)} 张图像。\\n\\n\"\n- \n- for i, img_data in enumerate(image_info):\n- image_markdown += f\"### 图像 {i+1}\\n\\n\"\n- image_markdown += f\"- 文件名: {img_data['filename']}\\n\"\n- image_markdown += f\"- 页码: {img_data['page']}\\n\"\n- image_markdown += f\"- 尺寸: {img_data['width']}x{img_data['height']} 像素\\n\"\n- image_markdown += f\"- 格式: {img_data['format']}\\n\"\n- image_markdown += f\"- 大小: {img_data['size_bytes'] / 1024:.2f} KB\\n\"\n- \n- if 'type' in img_data and img_data['type'] == 'rendered_page':\n- image_markdown += f\"- 类型: 渲染页面\\n\"\n- else:\n- image_markdown += f\"- 类型: 嵌入图像\\n\"\n- \n- image_markdown += \"\\n---\\n\\n\"\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n- else:\n- results.append(types.TextContent(\n- 
type=\"text\",\n- text=\"\\n## 图像信息\\n\\n未从PDF中提取到任何图像。\"\n- ))\n- \n- except Exception as extract_error:\n- # 如果PymuPDF4llm提取失败,回退到原来的方法\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用备用方法...\"\n- ))\n- \n- # 使用PyPDF2提取文本\n- text_content = \"\"\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n- num_pages = len(reader.pages)\n- \n- # 添加PDF元数据\n- text_content += f\"# PDF文档信息\\n\\n\"\n- text_content += f\"- 页数: {num_pages}\\n\"\n- if reader.metadata:\n- for key, value in reader.metadata.items():\n- if key.startswith('/'):\n- key = key[1:] # 移除前导斜杠\n- if value and str(value).strip():\n- text_content += f\"- {key}: {value}\\n\"\n- \n- # 提取文本 - 限制页数以提高性能\n- max_pages_to_process = min(num_pages, 30) # 限制处理的最大页数\n- if max_pages_to_process < num_pages:\n- text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages_to_process} 页内容。\\n\"\n- \n- text_content += \"\\n## 内容摘要\\n\\n\"\n- \n- # 逐页提取文本\n- for page_num in range(max_pages_to_process):\n- # 添加进度提示\n- if page_num % 5 == 0 and page_num > 0:\n- progress_msg = f\"已处理 {page_num}/{max_pages_to_process} 页...\"\n- results.append(types.TextContent(type=\"text\", text=progress_msg))\n- \n- page = reader.pages[page_num]\n- page_text = page.extract_text()\n- if page_text:\n- text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n- text_content += page_text + \"\\n\"\n- \n- # 添加文本内容到结果\n- results.append(types.TextContent(type=\"text\", text=text_content))\n- \n- # 尝试使用PyMuPDF提取图片\n- try:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"正在使用备用方法提取图片,这可能需要一些时间...\"\n- ))\n- \n- # 提取图片\n- image_info = await extract_images_from_pdf(file_path, image_path)\n- \n- # 添加图像信息\n- if image_info:\n- image_markdown = \"\\n## 提取的图像信息\\n\\n\"\n- image_markdown += f\"共提取了 {len(image_info)} 张图像。\\n\\n\"\n- \n- for i, img_data in enumerate(image_info):\n- image_markdown += f\"### 图像 {i+1}\\n\\n\"\n- image_markdown += f\"- 文件名: {img_data['filename']}\\n\"\n- image_markdown += f\"- 页码: {img_data['page']}\\n\"\n- image_markdown += f\"- 尺寸: {img_data['width']}x{img_data['height']} 像素\\n\"\n- image_markdown += f\"- 格式: {img_data['format']}\\n\"\n- image_markdown += f\"- 大小: {img_data['size_bytes'] / 1024:.2f} KB\\n\"\n- \n- if 'type' in img_data and img_data['type'] == 'rendered_page':\n- image_markdown += f\"- 类型: 渲染页面\\n\"\n- else:\n- image_markdown += f\"- 类型: 嵌入图像\\n\"\n- \n- image_markdown += \"\\n---\\n\\n\"\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n- else:\n- # 如果PyMuPDF也无法提取图片,尝试使用pdf2image\n- try:\n- # 限制处理的页数以提高性能\n- max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n- \n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"正在使用最后的备用方法提取图片,这可能需要一些时间...\"\n- ))\n- \n- # 转换PDF页面为图片并保存\n- images = convert_from_path(\n- file_path, \n- dpi=150, \n- fmt=\"jpg\", \n- first_page=1, \n- last_page=max_img_pages,\n- thread_count=2 # 使用多线程加速\n- )\n- \n- # 处理每个页面图片\n- image_markdown = \"\\n## 图片内容\\n\\n\"\n- for i, img in enumerate(images):\n- # 保存图片到临时目录\n- img_path = os.path.join(image_path, f\"page_{i+1}.jpg\")\n- img.save(img_path, \"JPEG\", quality=80)\n- \n- # 获取图片尺寸\n- width, height = img.size\n- \n- # 添加图片信息到Markdown\n- image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n- image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 150\\n\"\n- image_markdown += f\"- 类型: 渲染页面\\n\\n\"\n- \n- # 添加分隔线\n- image_markdown += \"---\\n\\n\"\n- \n- # 添加图片信息到结果\n- 
results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n- except Exception as img_error:\n- # 如果图片提取失败,添加错误信息但继续\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n- f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n- ))\n- except Exception as pymupdf_error:\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 使用PyMuPDF提取图片失败: {str(pymupdf_error)}\\n\"\n- f\"正在尝试使用最后的备用方法...\"\n- ))\n- \n- # 尝试使用pdf2image作为最后的备用方法\n- try:\n- # 限制处理的页数以提高性能\n- max_img_pages = min(num_pages, 10) # 限制处理图片的最大页数\n- \n- # 转换PDF页面为图片并保存\n- images = convert_from_path(\n- file_path, \n- dpi=150, \n- fmt=\"jpg\", \n- first_page=1, \n- last_page=max_img_pages,\n- thread_count=2 # 使用多线程加速\n- )\n- \n- # 处理每个页面图片\n- image_markdown = \"\\n## 图片内容\\n\\n\"\n- for i, img in enumerate(images):\n- # 保存图片到临时目录\n- img_path = os.path.join(image_path, f\"page_{i+1}.jpg\")\n- img.save(img_path, \"JPEG\", quality=80)\n- \n- # 获取图片尺寸\n- width, height = img.size\n- \n- # 添加图片信息到Markdown\n- image_markdown += f\"### 第 {i+1} 页图片\\n\\n\"\n- image_markdown += f\"- 尺寸: {width}x{height} 像素\\n\"\n- image_markdown += f\"- 格式: JPEG\\n\"\n- image_markdown += f\"- DPI: 150\\n\"\n- image_markdown += f\"- 类型: 渲染页面\\n\\n\"\n- \n- # 添加分隔线\n- image_markdown += \"---\\n\\n\"\n- \n- # 添加图片信息到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=image_markdown\n- ))\n- except Exception as img_error:\n- # 如果图片提取失败,添加错误信息但继续\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 无法提取图片: {str(img_error)}\\n\"\n- f\"这可能是由于PDF文件结构复杂或缺少必要的系统依赖(如poppler-utils)。\"\n- ))\n- \n- # 清理临时目录\n- shutil.rmtree(temp_dir)\n- \n- # 添加处理完成的提示\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"PDF处理完成!\"\n- ))\n- \n- return results\n- except Exception as e:\n- # 确保清理临时目录\n- if 'temp_dir' in locals() and os.path.exists(temp_dir):\n- shutil.rmtree(temp_dir)\n- \n- import traceback\n- error_details = traceback.format_exc()\n- return [\n- types.TextContent(\n- type=\"text\",\n- text=f\"错误: 解析PDF文件失败: {str(e)}\\n\"\n- f\"可能的原因:\\n\"\n- f\"1. 文件格式不兼容\\n\"\n- f\"2. 文件已加密或受密码保护\\n\"\n- f\"3. 系统缺少必要的依赖(如poppler-utils)\\n\"\n- f\"4. 
文件太大,处理超时\\n\\n\"\n- f\"详细错误信息: {error_details}\"\n- )\n- ]\n-\n-\n-async def quick_preview_pdf(\n- file_path: str,\n-) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- \"\"\"快速预览PDF文件内容,不包含图片处理,适用于大文件。\"\"\"\n- results = []\n- \n- # 检查文件是否存在\n- if not os.path.exists(file_path):\n- return [types.TextContent(\n- type=\"text\",\n- text=f\"错误: 文件不存在: {file_path}\\n请检查路径是否正确,并确保文件可访问。\"\n- )]\n- \n- try:\n- # 添加文件信息\n- file_size_mb = os.path.getsize(file_path) / (1024 * 1024)\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"# 快速预览模式 - 仅提取文本内容\\n\\n文件大小: {file_size_mb:.2f} MB\"\n- ))\n- \n- # 获取PDF页数\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n- num_pages = len(reader.pages)\n- \n- # 限制处理的页数\n- max_pages = min(num_pages, 50) # 快速模式可以处理更多页\n- pages_to_process = list(range(max_pages))\n- \n- try:\n- # 尝试使用PyMuPDF提取文本(通常比PyPDF2更快更准确)\n- pdf_document = fitz.open(file_path)\n- \n- # 提取文本内容\n- text_content = \"\"\n- \n- # 添加PDF元数据\n- text_content += f\"## PDF文档信息\\n\\n\"\n- text_content += f\"- 页数: {num_pages}\\n\"\n- \n- # 从PyMuPDF获取元数据\n- metadata = pdf_document.metadata\n- if metadata:\n- for key, value in metadata.items():\n- if value and str(value).strip():\n- text_content += f\"- {key}: {value}\\n\"\n- \n- # 如果处理的页数少于总页数,添加提示\n- if max_pages < num_pages:\n- text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\"\n- \n- text_content += \"\\n## 内容摘要\\n\\n\"\n- \n- # 逐页提取文本\n- for page_num in range(max_pages):\n- page = pdf_document[page_num]\n- page_text = page.get_text()\n- \n- if page_text.strip():\n- text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n- text_content += page_text.strip() + \"\\n\"\n- \n- pdf_document.close()\n- \n- # 添加提取的内容到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=text_content\n- ))\n- \n- except Exception as pymupdf_error:\n- # 如果PyMuPDF提取失败,回退到PymuPDF4llm\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 使用PyMuPDF提取内容失败: {str(pymupdf_error)}\\n正在尝试使用备用方法...\"\n- ))\n- \n- try:\n- # 使用PymuPDF4llm提取内容,但不提取图像\n- md_content = pymupdf4llm.to_markdown(\n- doc=file_path,\n- pages=pages_to_process,\n- page_chunks=True,\n- write_images=False # 不提取图像\n- )\n- \n- # 如果处理的页数少于总页数,添加提示\n- if max_pages < num_pages:\n- md_content = f\"# PDF文档内容(前{max_pages}页)\\n\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\\n{md_content}\"\n- else:\n- md_content = f\"# PDF文档内容\\n\\n{md_content}\"\n- \n- # 添加提取的内容到结果\n- results.append(types.TextContent(\n- type=\"text\",\n- text=md_content\n- ))\n- except Exception as extract_error:\n- # 如果PymuPDF4llm提取失败,回退到原来的方法\n- results.append(types.TextContent(\n- type=\"text\",\n- text=f\"警告: 使用PymuPDF4llm提取内容失败: {str(extract_error)}\\n正在尝试使用最后的备用方法...\"\n- ))\n- \n- # 使用PyPDF2提取文本\n- text_content = \"\"\n- with open(file_path, 'rb') as file:\n- reader = PyPDF2.PdfReader(file)\n- \n- # 添加PDF元数据\n- text_content += f\"## PDF文档信息\\n\\n\"\n- text_content += f\"- 页数: {num_pages}\\n\"\n- if reader.metadata:\n- for key, value in reader.metadata.items():\n- if key.startswith('/'):\n- key = key[1:]\n- if value and str(value).strip():\n- text_content += f\"- {key}: {value}\\n\"\n- \n- # 限制处理的页数\n- if max_pages < num_pages:\n- text_content += f\"\\n> 注意: 由于文件较大,仅处理前 {max_pages} 页内容。\\n\"\n- \n- text_content += \"\\n## 内容摘要\\n\\n\"\n- \n- # 逐页提取文本\n- for page_num in range(max_pages):\n- page = reader.pages[page_num]\n- page_text = page.extract_text()\n- if page_text:\n- text_content += f\"\\n### 第 {page_num + 1} 页\\n\\n\"\n- text_content += page_text + 
\"\\n\"\n- \n- # 添加文本内容到结果\n- results.append(types.TextContent(type=\"text\", text=text_content))\n- \n- # 添加提示信息\n- results.append(types.TextContent(\n- type=\"text\",\n- text=\"\\n## 注意\\n\\n快速预览完成!如需查看图片内容,请使用完整的PDF解析工具。\"\n- ))\n- \n- return results\n- except Exception as e:\n- import traceback\n- error_details = traceback.format_exc()\n- return [types.TextContent(\n- type=\"text\",\n- text=f\"错误: 快速预览PDF失败: {str(e)}\\n详细错误信息: {error_details}\"\n- )]\n-\n-\n @click.command()\n @click.option(\"--port\", default=8000, help=\"Port to listen on for SSE\")\n @click.option(\n \"--transport\",\n type=click.Choice([\"stdio\", \"sse\"]),\n default=\"stdio\",\n help=\"Transport type\",\n )\n-def main(port: int, transport: str) -> int:\n- app = Server(\"mcp-website-fetcher\")\n+@click.option(\n+ \"--host\",\n+ default=\"0.0.0.0\",\n+ help=\"Host to bind to (only used with SSE transport)\",\n+)\n+def main(port: int, transport: str, host: str) -> int:\n+ \"\"\"\n+ 启动MCP服务器\n \n- @app.call_tool()\n- async def fetch_tool( # type: ignore[unused-function]\n- name: str, arguments: dict\n- ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n- if name == \"url\":\n- if \"url\" not in arguments:\n- return [types.TextContent(\n- type=\"text\",\n- text=\"Error: Missing required argument 'url'\"\n- )]\n- return await fetch_website(arguments[\"url\"])\n- elif name == \"file\":\n- if \"file_path\" not in arguments:\n- return [types.TextContent(\n- type=\"text\",\n- text=\"Error: Missing required argument 'file_path'\"\n- )]\n- return await parse_pdf(arguments[\"file_path\"])\n- elif name == \"quick_pdf\":\n- if \"file_path\" not in arguments:\n- return [types.TextContent(\n- type=\"text\",\n- text=\"Error: Missing required argument 'file_path'\"\n- )]\n- return await quick_preview_pdf(arguments[\"file_path\"])\n- else:\n- return [types.TextContent(\n- type=\"text\",\n- text=f\"Error: Unknown tool: {name}\"\n- )]\n+ Args:\n+ port: SSE传输的端口\n+ transport: 传输类型,可以是stdio或sse\n+ host: 主机地址,仅在使用SSE传输时有效\n \n- @app.list_tools()\n- async def list_tools() -> list[types.Tool]: # type: ignore[unused-function]\n- return [\n- types.Tool(\n- name=\"url\",\n- description=\"Fetches a website and returns its content\",\n- inputSchema={\n- \"type\": \"object\",\n- \"required\": [\"url\"],\n- \"properties\": {\n- \"url\": {\n- \"type\": \"string\",\n- \"description\": \"URL to fetch\",\n- }\n- },\n- },\n- ),\n- types.Tool(\n- name=\"file\",\n- description=\"解析PDF文件并提取文本和图片内容\",\n- inputSchema={\n- \"type\": \"object\",\n- \"required\": [\"file_path\"],\n- \"properties\": {\n- \"file_path\": {\n- \"type\": \"string\",\n- \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n- }\n- },\n- },\n- ),\n- types.Tool(\n- name=\"quick_pdf\",\n- description=\"快速预览PDF文件内容(仅文本,无图片)\",\n- inputSchema={\n- \"type\": \"object\",\n- \"required\": [\"file_path\"],\n- \"properties\": {\n- \"file_path\": {\n- \"type\": \"string\",\n- \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n- }\n- },\n- },\n- )\n- ]\n-\n+ Returns:\n+ 退出代码\n+ \"\"\"\n+ print(f\"Starting {APP_NAME} with {transport} transport...\")\n+ \n+ # 加载所有工具\n+ tools = all_tools\n+ print(f\"Loaded {len(tools)} tools: {', '.join(tool.name for tool in tools)}\")\n+ \n+ # 根据传输类型启动相应的适配器\n if transport == \"sse\":\n- from mcp.server.sse import SseServerTransport\n- from starlette.applications import Starlette\n- from starlette.routing import Mount, Route\n- from starlette.middleware import Middleware\n- from starlette.middleware.cors import 
CORSMiddleware\n-\n- sse = SseServerTransport(\"/messages/\")\n-\n- async def handle_sse(request):\n- # 增加超时时间,以便处理大型文件\n- request.scope[\"timeout\"] = 300 # 设置为5分钟\n- async with sse.connect_sse(\n- request.scope, request.receive, request._send\n- ) as streams:\n- await app.run(\n- streams[0], streams[1], app.create_initialization_options()\n- )\n-\n- # 添加CORS中间件以允许跨域请求\n- middleware = [\n- Middleware(\n- CORSMiddleware,\n- allow_origins=[\"*\"],\n- allow_methods=[\"*\"],\n- allow_headers=[\"*\"],\n- )\n- ]\n-\n- starlette_app = Starlette(\n- debug=True,\n- routes=[\n- Route(\"/sse\", endpoint=handle_sse),\n- Mount(\"/messages/\", app=sse.handle_post_message),\n- ],\n- middleware=middleware,\n- )\n-\n- import uvicorn\n-\n- # 增加uvicorn的超时设置\n- uvicorn.run(\n- starlette_app, \n- host=\"0.0.0.0\", \n- port=port,\n- timeout_keep_alive=300, # 增加保持连接的超时时间\n- )\n+ adapter = SseAdapter(APP_NAME, tools, host=host, port=port)\n+ adapter.run()\n else:\n- from mcp.server.stdio import stdio_server\n-\n- async def arun():\n- async with stdio_server() as streams:\n- await app.run(\n- streams[0], streams[1], app.create_initialization_options()\n- )\n-\n- anyio.run(arun)\n-\n+ adapter = StdioAdapter(APP_NAME, tools)\n+ adapter.run()\n+ \n return 0\n" }, { "date": 1741332423791, "content": "Index: \n===================================================================\n--- \n+++ \n@@ -1,1 +1,62 @@\n- \n\\ No newline at end of file\n+'''\n+Author: 刘彦志 yanzhiliu@trip.com\n+Date: 2025-03-06 14:57:07\n+LastEditors: 刘彦志 yanzhiliu@trip.com\n+LastEditTime: 2025-03-07 15:26:41\n+FilePath: /weaviate-mcp-server/mcp_simple_tool/server.py\n+Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE\n+'''\n+\"\"\"\n+MCP服务器主文件,处理命令行参数并启动相应的适配器\n+\"\"\"\n+\n+import click\n+from typing import Optional\n+from .tools import all_tools\n+from .core.sse_adapter import SseAdapter\n+from .core.stdio_adapter import StdioAdapter\n+\n+\n+APP_NAME = \"mcp-development-framework\"\n+\n+\n+@click.command()\n+@click.option(\"--port\", default=8000, help=\"Port to listen on for SSE\")\n+@click.option(\n+ \"--transport\",\n+ type=click.Choice([\"stdio\", \"sse\"]),\n+ default=\"stdio\",\n+ help=\"Transport type\",\n+)\n+@click.option(\n+ \"--host\",\n+ default=\"0.0.0.0\",\n+ help=\"Host to bind to (only used with SSE transport)\",\n+)\n+def main(port: int, transport: str, host: str) -> int:\n+ \"\"\"\n+ 启动MCP服务器\n+\n+ Args:\n+ port: SSE传输的端口\n+ transport: 传输类型,可以是stdio或sse\n+ host: 主机地址,仅在使用SSE传输时有效\n+\n+ Returns:\n+ 退出代码\n+ \"\"\"\n+ print(f\"Starting {APP_NAME} with {transport} transport...\")\n+ \n+ # 加载所有工具\n+ tools = all_tools\n+ print(f\"Loaded {len(tools)} tools: {', '.join(tool.name for tool in tools)}\")\n+ \n+ # 根据传输类型启动相应的适配器\n+ if transport == \"sse\":\n+ adapter = SseAdapter(APP_NAME, tools, host=host, port=port)\n+ adapter.run()\n+ else:\n+ adapter = StdioAdapter(APP_NAME, tools)\n+ adapter.run()\n+ \n+ return 0\n" } ], "date": 1741245145750, "name": "Commit-0", "content": "import anyio\nimport click\nimport httpx\nimport mcp.types as types\nfrom mcp.server.lowlevel import Server\nimport os\nimport base64\nimport io\nfrom typing import List, Dict, Any, Optional\nimport PyPDF2\nfrom pdf2image import convert_from_path\nfrom PIL import Image\n\n\nasync def fetch_website(\n url: str,\n) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n headers = {\n \"User-Agent\": \"MCP Test Server (github.com/modelcontextprotocol/python-sdk)\"\n }\n 
try:\n timeout = httpx.Timeout(10.0, connect=5.0)\n async with httpx.AsyncClient(\n follow_redirects=True, \n headers=headers,\n timeout=timeout\n ) as client:\n response = await client.get(url)\n response.raise_for_status()\n return [types.TextContent(type=\"text\", text=response.text)]\n except httpx.TimeoutException:\n return [types.TextContent(\n type=\"text\",\n text=\"Error: Request timed out while trying to fetch the website.\"\n )]\n except httpx.HTTPStatusError as e:\n return [types.TextContent(\n type=\"text\",\n text=(f\"Error: HTTP {e.response.status_code} \"\n \"error while fetching the website.\")\n )]\n except Exception as e:\n return [types.TextContent(\n type=\"text\",\n text=f\"Error: Failed to fetch website: {str(e)}\"\n )]\n\n\nasync def check_mood(\n question: str,\n) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"Check server's mood - always responds cheerfully with a heart.\"\"\"\n msg: str = \"I'm feeling great and happy to help you! ❤️\"\n return [types.TextContent(type=\"text\", text=msg)]\n\n\nasync def parse_pdf(\n file_path: str,\n) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n \"\"\"Parse a PDF file and extract text and images.\"\"\"\n results = []\n \n # Check if file exists\n if not os.path.exists(file_path):\n return [types.TextContent(\n type=\"text\",\n text=f\"Error: 文件不存在: {file_path}\"\n )]\n \n try:\n # Extract text from PDF\n text_content = \"\"\n with open(file_path, 'rb') as file:\n reader = PyPDF2.PdfReader(file)\n num_pages = len(reader.pages)\n \n # Add PDF metadata\n text_content += f\"PDF文档信息:\\n\"\n text_content += f\"页数: {num_pages}\\n\"\n if reader.metadata:\n for key, value in reader.metadata.items():\n if key.startswith('/'):\n key = key[1:] # Remove leading slash\n if value and str(value).strip():\n text_content += f\"{key}: {value}\\n\"\n text_content += \"\\n内容摘要:\\n\"\n \n # Extract text from each page\n for page_num in range(num_pages):\n page = reader.pages[page_num]\n page_text = page.extract_text()\n if page_text:\n text_content += f\"\\n--- 第 {page_num + 1} 页 ---\\n\"\n text_content += page_text + \"\\n\"\n \n # Add text content to results\n results.append(types.TextContent(type=\"text\", text=text_content))\n \n # Extract images from PDF\n try:\n # Convert PDF pages to images\n images = convert_from_path(file_path, dpi=150, fmt=\"jpeg\")\n \n # Process each page image\n for i, img in enumerate(images):\n # Save image to bytes buffer\n img_buffer = io.BytesIO()\n img.save(img_buffer, format=\"JPEG\")\n img_data = img_buffer.getvalue()\n \n # Encode image to base64\n img_base64 = base64.b64encode(img_data).decode('utf-8')\n \n # Add image content to results\n results.append(types.ImageContent(\n type=\"image\",\n data=f\"data:image/jpeg;base64,{img_base64}\",\n mime_type=\"image/jpeg\"\n ))\n except Exception as img_error:\n # If image extraction fails, add error message but continue\n results.append(types.TextContent(\n type=\"text\",\n text=f\"警告: 无法提取图片: {str(img_error)}\"\n ))\n \n return results\n except Exception as e:\n return [types.TextContent(\n type=\"text\",\n text=f\"Error: 解析PDF文件失败: {str(e)}\"\n )]\n\n\n@click.command()\n@click.option(\"--port\", default=8000, help=\"Port to listen on for SSE\")\n@click.option(\n \"--transport\",\n type=click.Choice([\"stdio\", \"sse\"]),\n default=\"stdio\",\n help=\"Transport type\",\n)\ndef main(port: int, transport: str) -> int:\n app = Server(\"mcp-website-fetcher\")\n\n mood_description: str = (\n \"Ask this MCP 
server about its mood! You can phrase your question \"\n \"in any way you like - 'How are you?', 'What's your mood?', or even \"\n \"'Are you having a good day?'. The server will always respond with \"\n \"a cheerful message and a heart ❤️\"\n )\n\n @app.call_tool()\n async def fetch_tool( # type: ignore[unused-function]\n name: str, arguments: dict\n ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:\n if name == \"mcp_fetch\":\n if \"url\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n text=\"Error: Missing required argument 'url'\"\n )]\n return await fetch_website(arguments[\"url\"])\n elif name == \"mood\":\n if \"question\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n text=\"Error: Missing required argument 'question'\"\n )]\n return await check_mood(arguments[\"question\"])\n elif name == \"file\":\n if \"file_path\" not in arguments:\n return [types.TextContent(\n type=\"text\",\n text=\"Error: Missing required argument 'file_path'\"\n )]\n return await parse_pdf(arguments[\"file_path\"])\n else:\n return [types.TextContent(\n type=\"text\",\n text=f\"Error: Unknown tool: {name}\"\n )]\n\n @app.list_tools()\n async def list_tools() -> list[types.Tool]: # type: ignore[unused-function]\n return [\n types.Tool(\n name=\"mcp_fetch\",\n description=\"Fetches a website and returns its content\",\n inputSchema={\n \"type\": \"object\",\n \"required\": [\"url\"],\n \"properties\": {\n \"url\": {\n \"type\": \"string\",\n \"description\": \"URL to fetch\",\n }\n },\n },\n ),\n types.Tool(\n name=\"mood\",\n description=\"Ask the server about its mood - it's always happy!\",\n inputSchema={\n \"type\": \"object\",\n \"required\": [\"question\"],\n \"properties\": {\n \"question\": {\n \"type\": \"string\",\n \"description\": mood_description,\n }\n },\n },\n ),\n types.Tool(\n name=\"file\",\n description=\"解析PDF文件并提取文本和图片内容\",\n inputSchema={\n \"type\": \"object\",\n \"required\": [\"file_path\"],\n \"properties\": {\n \"file_path\": {\n \"type\": \"string\",\n \"description\": \"PDF文件的本地路径,例如'/path/to/document.pdf'\",\n }\n },\n },\n )\n ]\n\n if transport == \"sse\":\n from mcp.server.sse import SseServerTransport\n from starlette.applications import Starlette\n from starlette.routing import Mount, Route\n\n sse = SseServerTransport(\"/messages/\")\n\n async def handle_sse(request):\n async with sse.connect_sse(\n request.scope, request.receive, request._send\n ) as streams:\n await app.run(\n streams[0], streams[1], app.create_initialization_options()\n )\n\n starlette_app = Starlette(\n debug=True,\n routes=[\n Route(\"/sse\", endpoint=handle_sse),\n Mount(\"/messages/\", app=sse.handle_post_message),\n ],\n )\n\n import uvicorn\n\n uvicorn.run(starlette_app, host=\"0.0.0.0\", port=port)\n else:\n from mcp.server.stdio import stdio_server\n\n async def arun():\n async with stdio_server() as streams:\n await app.run(\n streams[0], streams[1], app.create_initialization_options()\n )\n\n anyio.run(arun)\n\n return 0\n" } ] }
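
The patch history above converges on one core technique for the `file` tool: extract the images embedded in a PDF with PyMuPDF (`fitz`) first, and only fall back to rendering whole pages (via `get_pixmap`, or `pdf2image`/poppler as a last resort) when no embedded images are found. The sketch below is a condensed, standalone reconstruction of that helper — roughly the `extract_images_from_pdf` function introduced in the later patches — not the exact code shipped in `server.py`; the demo file name and output directory in the `__main__` block are placeholders.

```python
"""Standalone sketch: embedded-image extraction with a rendered-page fallback."""

import os
from typing import Any, Dict, List

import fitz  # PyMuPDF
from PIL import Image


def extract_pdf_images(pdf_path: str, output_dir: str) -> List[Dict[str, Any]]:
    """Write every image found in pdf_path to output_dir and return metadata."""
    os.makedirs(output_dir, exist_ok=True)
    images: List[Dict[str, Any]] = []

    doc = fitz.open(pdf_path)
    try:
        # Pass 1: images embedded in the PDF, referenced by each page.
        for page_index in range(len(doc)):
            for img_index, img in enumerate(doc[page_index].get_images(full=True)):
                xref = img[0]                      # xref of the image object
                base = doc.extract_image(xref)     # {"image": bytes, "ext": "jpeg", ...}
                out_path = os.path.join(
                    output_dir,
                    f"page_{page_index + 1}_img_{img_index + 1}.{base['ext']}",
                )
                with open(out_path, "wb") as fh:
                    fh.write(base["image"])
                with Image.open(out_path) as pil_img:
                    width, height = pil_img.size
                images.append({
                    "path": out_path,
                    "page": page_index + 1,
                    "width": width,
                    "height": height,
                    "size_bytes": len(base["image"]),
                    "type": "embedded",
                })

        # Pass 2: no embedded images found, so render each page as a bitmap.
        if not images:
            for page_index in range(len(doc)):
                pix = doc[page_index].get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom
                out_path = os.path.join(output_dir, f"page_{page_index + 1}_rendered.png")
                pix.save(out_path)
                images.append({
                    "path": out_path,
                    "page": page_index + 1,
                    "width": pix.width,
                    "height": pix.height,
                    "size_bytes": os.path.getsize(out_path),
                    "type": "rendered_page",
                })
    finally:
        doc.close()
    return images


if __name__ == "__main__":
    # Hypothetical paths; point these at a real PDF before running.
    for info in extract_pdf_images("document.pdf", "/tmp/pdf_images"):
        print(info)
```

The ordering mirrors the rationale recorded in the patches: extracting embedded images is cheaper than rasterizing pages and does not depend on external system packages, whereas the `pdf2image` path both renders every page at a fixed DPI and requires poppler-utils to be installed, so it is kept only as the final fallback.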