PDF MCP Flow

extract_pdf.py•3.92 KiB

import sys
import os
import argparse

import fitz  # PyMuPDF


def _save_page_images(doc, page, page_num, img_dir):
    """Write every image referenced by ``page`` into ``img_dir``.

    Files are named ``page_<page_num>_img_<n>.<ext>``. A failure on one image
    is reported as a warning and does not stop the remaining images.
    """
    image_list = page.get_images()
    if not image_list:
        return

    print(f" [Found {len(image_list)} images]")
    for j, img in enumerate(image_list):
        try:
            # The first item of each image entry is used as the xref that
            # extract_image() expects.
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            filename = f"page_{page_num}_img_{j+1}.{image_ext}"
            filepath = os.path.join(img_dir, filename)
            with open(filepath, "wb") as img_file:
                img_file.write(image_bytes)
        except Exception as img_err:
            # Best effort: one bad image must not abort the whole extraction.
            print(f" Warning: Failed to extract image {j+1} on page {page_num}: {img_err}")


def extract_pdf_content(file_path, start_page=1, end_page=None, output_dir="extracted_output"):
    """
    Extract the text and images of a page range from a PDF.

    Text is written to ``<output_dir>/content.txt`` (one banner per page) and
    images to ``<output_dir>/images``. Progress and errors are reported on
    stdout; the function always returns ``None``.

    :param file_path: Path to the PDF file.
    :param start_page: First page to extract (1-based). Values below 1 are
        treated as 1.
    :param end_page: Last page to extract (inclusive). ``None`` or a value
        past the end of the document means "up to the last page".
    :param output_dir: Directory that receives ``content.txt`` and ``images/``.
    """
    if not os.path.exists(file_path):
        print(f"Error: 文件不存在 - {file_path}")
        return

    print(f"Processing: {file_path}")

    # Create the image output directory (this also creates output_dir).
    img_dir = os.path.join(output_dir, "images")
    os.makedirs(img_dir, exist_ok=True)

    # Text output file.
    text_output_file = os.path.join(output_dir, "content.txt")

    try:
        # Use the document as a context manager so it is closed even when
        # extraction fails part-way through.
        with fitz.open(file_path) as doc:
            total_pages = len(doc)

            # Normalise the requested page range.
            if end_page is None or end_page > total_pages:
                end_page = total_pages
            first_page = max(1, start_page)

            if first_page > end_page:
                # Nothing to extract: say so instead of silently producing
                # an empty content.txt and reporting success.
                print(
                    f"Error: invalid page range {start_page} to {end_page} "
                    f"(Total: {total_pages})"
                )
                return

            start_idx = first_page - 1
            end_idx = end_page

            print(f"Extracting pages: {first_page} to {end_page} (Total: {total_pages})")
            print(f"Images will be saved to: {img_dir}")
            print(f"Text will be saved to: {text_output_file}")

            with open(text_output_file, "w", encoding="utf-8") as text_file:
                for i in range(start_idx, end_idx):
                    page_num = i + 1
                    page = doc[i]

                    header = f"\n{'='*20} Page {page_num} {'='*20}\n"
                    print(header.strip())
                    text_file.write(header)

                    # 1. Extract text.
                    text = page.get_text()
                    # Light clean-up: replace anything that cannot be encoded
                    # as UTF-8 so that writing the file cannot fail on it.
                    safe_text = text.encode('utf-8', errors='replace').decode('utf-8')

                    if safe_text.strip():
                        # Only a short preview goes to the console; the file
                        # receives the full page text.
                        if len(safe_text) > 100:
                            preview = safe_text[:100] + "..."
                        else:
                            preview = safe_text
                        print(preview)
                        text_file.write(safe_text + "\n")
                    else:
                        print("(No text content)")
                        text_file.write("(No text content)\n")

                    # 2. Extract images.
                    _save_page_images(doc, page, page_num, img_dir)

        print(f"\nDone! All content saved to '{output_dir}'")

    except Exception as e:
        # Top-level boundary for the CLI: report the failure instead of
        # surfacing a traceback.
        print(f"Error processing PDF: {e}")


def main():
    """Parse command-line arguments and run the extraction."""
    parser = argparse.ArgumentParser(description="Extract text and images from PDF.")
    parser.add_argument("file", help="Path to the PDF file")
    parser.add_argument("-s", "--start", type=int, default=1, help="Start page number (default: 1)")
    parser.add_argument("-e", "--end", type=int, help="End page number (default: last page)")
    parser.add_argument("-o", "--output", default="extracted_output", help="Output directory (default: extracted_output)")

    args = parser.parse_args()

    extract_pdf_content(args.file, args.start, args.end, args.output)


if __name__ == "__main__":
    main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Dublin1231/PDF_MCP_Flow'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

extract_pdf.py•3.92 KiB