pdf_extract.py (7.65 kB)
"""PDF 文本提取 - 使用 PyMuPDF4LLM""" import re from dataclasses import dataclass, field from pathlib import Path from typing import Any import pymupdf # PyMuPDF (fitz) import pymupdf4llm @dataclass class PdfMetadata: """PDF 元数据""" title: str | None = None authors: str | None = None year: int | None = None subject: str | None = None keywords: str | None = None creator: str | None = None producer: str | None = None @dataclass class PageText: """单页文本""" page_num: int # 从 1 开始 text: str is_empty: bool tables: list[dict] = field(default_factory=list) images: list[dict] = field(default_factory=list) @dataclass class PdfExtractResult: """PDF 提取结果""" total_pages: int pages: list[PageText] empty_pages: int metadata: PdfMetadata = field(default_factory=PdfMetadata) @property def all_text(self) -> str: """获取所有页面的合并文本""" return "\n\n".join( f"[Page {p.page_num}]\n{p.text}" for p in self.pages if not p.is_empty ) @property def all_markdown(self) -> str: """获取所有页面的 Markdown 文本""" return "\n\n---\n\n".join( p.text for p in self.pages if not p.is_empty ) def _parse_year_from_date(date_str: str | None) -> int | None: """从 PDF 日期字符串中解析年份 PDF 日期格式通常为: D:YYYYMMDDHHmmSSOHH'mm' 例如: D:20231215143022+08'00' """ if not date_str: return None # 尝试匹配 D:YYYY 或直接 YYYY 开头的格式 match = re.search(r'D?:?(\d{4})', date_str) if match: year = int(match.group(1)) # 合理的年份范围检查 if 1900 <= year <= 2100: return year return None def _clean_metadata_string(value: str | None) -> str | None: """清理元数据字符串,去除空白和无效值""" if not value: return None cleaned = value.strip() # 过滤掉明显无效的值 if not cleaned or cleaned.lower() in ('unknown', 'none', 'null', ''): return None return cleaned def extract_pdf_metadata(file_path: str | Path) -> PdfMetadata: """从 PDF 文件提取元数据 使用 PyMuPDF 读取 PDF 的 Document Info Dictionary。 Args: file_path: PDF 文件路径 Returns: PdfMetadata 包含标题、作者、年份等信息 """ file_path = Path(file_path) if not file_path.exists(): raise FileNotFoundError(f"PDF file not found: {file_path}") metadata = PdfMetadata() try: doc = pymupdf.open(str(file_path)) info = doc.metadata if info: # 提取标题 metadata.title = _clean_metadata_string(info.get("title")) # 提取作者 metadata.authors = _clean_metadata_string(info.get("author")) # 提取年份 - 优先使用 creationDate,其次 modDate year = _parse_year_from_date(info.get("creationDate")) if not year: year = _parse_year_from_date(info.get("modDate")) metadata.year = year # 其他元数据 metadata.subject = _clean_metadata_string(info.get("subject")) metadata.keywords = _clean_metadata_string(info.get("keywords")) metadata.creator = _clean_metadata_string(info.get("creator")) metadata.producer = _clean_metadata_string(info.get("producer")) doc.close() except Exception: # 元数据提取失败不应该阻断主流程 pass return metadata def extract_pdf( file_path: str | Path, *, table_strategy: str = "lines_strict", ignore_images: bool = True, show_progress: bool = False, ) -> PdfExtractResult: """从 PDF 文件提取文本(使用 PyMuPDF4LLM) PyMuPDF4LLM 优势: - 输出 LLM 优化的 Markdown 格式 - 智能表格检测和格式化 - 保留文档结构(标题、列表等) - 更好的多栏布局处理 Args: file_path: PDF 文件路径 table_strategy: 表格检测策略 - "lines_strict": 仅检测有可见线条的表格(推荐) - "lines": 检测有线条的表格(更宽松) - "text": 基于文本对齐检测表格 - "explicit": 仅检测明确标记的表格 ignore_images: 是否忽略图像(默认 True,加快处理速度) show_progress: 是否显示进度条 Returns: PdfExtractResult 包含所有页面的 Markdown 文本 """ file_path = Path(file_path) if not file_path.exists(): raise FileNotFoundError(f"PDF file not found: {file_path}") if file_path.suffix.lower() != ".pdf": raise ValueError(f"Not a PDF file: {file_path}") # 使用 page_chunks=True 获取每页的独立结果 page_data = pymupdf4llm.to_markdown( str(file_path), page_chunks=True, 
table_strategy=table_strategy, ignore_images=ignore_images, show_progress=show_progress, ) pages = [] empty_count = 0 for chunk in page_data: chunk_metadata = chunk.get("metadata", {}) page_num = chunk_metadata.get("page", 0) + 1 # 转换为从 1 开始 text = chunk.get("text", "").strip() tables = chunk.get("tables", []) images = chunk.get("images", []) is_empty = len(text) < 10 # 少于 10 个字符视为空页 if is_empty: empty_count += 1 pages.append(PageText( page_num=page_num, text=text, is_empty=is_empty, tables=tables, images=images, )) # 提取 PDF 元数据 pdf_metadata = extract_pdf_metadata(file_path) return PdfExtractResult( total_pages=len(page_data), pages=pages, empty_pages=empty_count, metadata=pdf_metadata, ) def extract_pdf_pages(file_path: str | Path) -> list[tuple[int, str]]: """从 PDF 文件提取文本(简化版本) Args: file_path: PDF 文件路径 Returns: 列表,每项为 (page_num, text) 元组,page_num 从 1 开始 """ result = extract_pdf(file_path) return [ (p.page_num, p.text) for p in result.pages if not p.is_empty ] def extract_pdf_to_markdown( file_path: str | Path, *, pages: list[int] | None = None, write_images: bool = False, image_path: str | None = None, dpi: int = 150, ) -> str: """将 PDF 转换为单个 Markdown 字符串 Args: file_path: PDF 文件路径 pages: 要处理的页面列表(从 0 开始),None 表示全部 write_images: 是否保存图像文件 image_path: 图像保存路径 dpi: 图像分辨率 Returns: Markdown 格式的文本 """ file_path = Path(file_path) if not file_path.exists(): raise FileNotFoundError(f"PDF file not found: {file_path}") kwargs: dict[str, Any] = { "table_strategy": "lines_strict", } if pages is not None: kwargs["pages"] = pages if write_images: kwargs["write_images"] = True kwargs["dpi"] = dpi if image_path: kwargs["image_path"] = image_path return pymupdf4llm.to_markdown(str(file_path), **kwargs)
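

# Usage sketch: a minimal example of how the helpers above might be called.
# "paper.pdf" is a placeholder path, not something shipped with the module;
# adjust the path and options to your own documents.
if __name__ == "__main__":
    result = extract_pdf("paper.pdf", show_progress=True)
    print(f"Pages: {result.total_pages} ({result.empty_pages} empty)")
    print(f"Title: {result.metadata.title}, year: {result.metadata.year}")

    # Per-page access: page numbers are 1-based and empty pages are skipped.
    for page_num, text in extract_pdf_pages("paper.pdf"):
        print(f"[Page {page_num}] {text[:100]}")

    # Whole-document Markdown, limited here to the first two pages
    # (the `pages` indices passed through to pymupdf4llm are 0-based).
    markdown = extract_pdf_to_markdown("paper.pdf", pages=[0, 1])
    print(markdown[:500])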
