parser.py (4.44 kB)
import os
from typing import Any, Dict, Optional

import fitz

from .loader import PDFLoader
from .extractor import TextExtractor
from .image import ImageExtractor
from .table import TableExtractor


class PDFParser:
    def __init__(self):
        self.loader = PDFLoader()
        self.text_extractor = TextExtractor()
        self.image_extractor = ImageExtractor()
        # Table extraction can be slow, so it may be better made optional or on-demand.
        self.table_extractor = TableExtractor()

    async def parse(
        self,
        source: str,
        page_range: Optional[str] = None,
        extract_images: bool = False,
        force_ocr: bool = False,
    ) -> Dict[str, Any]:
        """
        Main entry point to parse a PDF.

        Args:
            source: URL or local path.
            page_range: String like "1-5", "10", or None for all pages.
            extract_images: Whether to extract images.
            force_ocr: Whether to force OCR even when a text layer exists.

        Returns:
            Dict containing metadata and content (markdown).
        """
        # 1. Load document
        doc = await self.loader.load(source)
        try:
            # 2. Parse page range
            pages = self._parse_page_range(doc, page_range)

            # 3. Extract text (Markdown)
            text_md = self.text_extractor.extract_text(doc, pages, force_ocr=force_ocr)

            # 4. Extract images (optional)
            images_data = []
            if extract_images:
                images_data = self.image_extractor.extract_images(doc, pages)
                # Simplified approach: append the image markdown at the end of the
                # document rather than interpolating it into the text.
                if images_data:
                    text_md += "\n\n## Extracted Images\n"
                    for img in images_data:
                        text_md += f"\n{img['markdown']}\n"

            # 5. Extract tables (optional enhancement)
            # pdfplumber needs a file path (or file-like object), but the loader may
            # have opened the document from a URL stream. To stay robust, save
            # stream-backed documents to a temporary file and hand that path to
            # pdfplumber instead of skipping table extraction for URLs.
            temp_pdf_path = None
            if doc.name and os.path.exists(doc.name):
                # It's a local file
                pdf_path = doc.name
            else:
                # It's a stream (URL); save it to a temp file
                import tempfile

                with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                    doc.save(tmp.name)
                    pdf_path = tmp.name
                    temp_pdf_path = tmp.name

            tables_md = self.table_extractor.extract_tables(pdf_path, pages)
            if tables_md:
                text_md += "\n\n## Extracted Tables\n" + "\n\n".join(tables_md)

            # Clean up the temp file
            if temp_pdf_path and os.path.exists(temp_pdf_path):
                os.remove(temp_pdf_path)

            # 6. Construct final result
            metadata = {
                "page_count": len(doc),
                "title": doc.metadata.get("title", ""),
                "author": doc.metadata.get("author", ""),
                "source": source,
            }

            return {
                "metadata": metadata,
                "content": text_md,
                "images": [img["path"] for img in images_data],
            }
        finally:
            doc.close()

    def _parse_page_range(self, doc: fitz.Document, range_str: Optional[str]) -> range:
        """Parse a string range such as "1-3" into range(0, 3)."""
        total_pages = len(doc)
        if not range_str:
            return range(total_pages)
        try:
            if "-" in range_str:
                start, end = map(int, range_str.split("-"))
                # Users think 1-indexed and end-inclusive; Python ranges are
                # 0-indexed and end-exclusive, so shift the start and clamp both ends.
                return range(max(0, start - 1), min(total_pages, end))
            else:
                page = int(range_str)
                return range(max(0, page - 1), min(total_pages, page))
        except ValueError:
            # Fall back to all pages on a malformed range string
            return range(total_pages)
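For context, here is a minimal usage sketch. Because parser.py uses relative imports (from .loader import ...), it must live inside a package; the package name readpdf below is a hypothetical placeholder, as is the sample.pdf path, and neither is confirmed by this file. Since parse is a coroutine (the loader fetches URLs asynchronously), it has to be driven from an event loop, for example with asyncio.run.

# Minimal usage sketch, assuming parser.py sits in a package named "readpdf"
# (hypothetical name; adjust the import to the server's actual package tree)
# and that PDFLoader.load accepts a local file path as well as a URL.
import asyncio

from readpdf.parser import PDFParser


async def main() -> None:
    parser = PDFParser()
    # Parse pages 1-3 of a local PDF; "sample.pdf" is a placeholder path.
    result = await parser.parse(
        "sample.pdf",
        page_range="1-3",
        extract_images=True,
    )
    print(result["metadata"])       # page_count, title, author, source
    print(result["content"][:500])  # markdown text (plus image/table sections)
    print(result["images"])         # paths of extracted images, if any


if __name__ == "__main__":
    asyncio.run(main())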
