PDF Reader MCP Server

Overview Inspect Schema Related Servers Score Discussions

parser.py•4.44 kB

import os
import tempfile
from typing import Dict, Any, List, Optional

import fitz

from .loader import PDFLoader
from .extractor import TextExtractor
from .image import ImageExtractor
from .table import TableExtractor


class PDFParser:
    """Coordinate loading a PDF and extracting its text, images and tables.

    The heavy lifting is delegated to the loader / extractor helpers created
    in ``__init__``; this class only sequences them and assembles the result.
    """

    def __init__(self):
        self.loader = PDFLoader()
        self.text_extractor = TextExtractor()
        self.image_extractor = ImageExtractor()
        # Table extraction can be slow, so we might want to make it optional
        # or on-demand.
        self.table_extractor = TableExtractor()

    async def parse(
        self,
        source: str,
        page_range: Optional[str] = None,
        extract_images: bool = False,
        force_ocr: bool = False,
    ) -> Dict[str, Any]:
        """
        Main entry point to parse a PDF.

        Args:
            source: URL or local path.
            page_range: String like "1-5", "10", or None for all.
            extract_images: Whether to extract images.
            force_ocr: Forwarded unchanged to the text extractor's
                ``extract_text`` call (presumably forces OCR even when a
                text layer exists -- confirm against TextExtractor).

        Returns:
            Dict with three keys: ``metadata`` (page count, title, author,
            source), ``content`` (the extracted markdown, with image and
            table sections appended when present) and ``images`` (the
            ``path`` entry of every extracted image).
        """
        # 1. Load Document
        doc = await self.loader.load(source)

        try:
            # 2. Parse Page Range
            pages = self._parse_page_range(doc, page_range)

            # 3. Extract Text (Markdown)
            text_md = self.text_extractor.extract_text(
                doc, pages, force_ocr=force_ocr
            )

            # 4. Extract Images (Optional)
            images_data = []
            if extract_images:
                images_data = self.image_extractor.extract_images(doc, pages)
                # Simplified approach: image markdown is appended at the end
                # of the content rather than interpolated at its position.
                if images_data:
                    text_md += "\n\n## Extracted Images\n" + "".join(
                        f"\n{img['markdown']}\n" for img in images_data
                    )

            # 5. Extract Tables
            tables_md = self._extract_tables(doc, pages)
            if tables_md:
                text_md += "\n\n## Extracted Tables\n" + "\n\n".join(tables_md)

            # 6. Construct Final Result
            metadata = {
                "page_count": len(doc),
                "title": doc.metadata.get("title", ""),
                "author": doc.metadata.get("author", ""),
                "source": source,
            }

            return {
                "metadata": metadata,
                "content": text_md,
                "images": [img["path"] for img in images_data],
            }

        finally:
            doc.close()

    def _extract_tables(self, doc: fitz.Document, pages: range):
        """Run the table extractor against a file path for ``doc``.

        The table extractor is called with a filesystem path (the original
        notes say it is backed by pdfplumber, which needs a path or
        file-like object). If ``doc.name`` points at an existing file that
        path is used directly; otherwise the document is saved to a
        temporary PDF which is always removed afterwards, even when saving
        or extraction raises.

        Returns:
            Whatever ``table_extractor.extract_tables`` returns; the caller
            treats it as an iterable of markdown strings.
        """
        if doc.name and os.path.exists(doc.name):
            # It's a local file
            return self.table_extractor.extract_tables(doc.name, pages)

        # It's a stream (URL): materialise it on disk. mkstemp + close gives
        # a path with no open handle, so saving to it also works on
        # platforms that refuse to reopen a file that is still open.
        fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf")
        os.close(fd)
        try:
            doc.save(temp_pdf_path)
            return self.table_extractor.extract_tables(temp_pdf_path, pages)
        finally:
            # Cleanup runs on success and on failure alike.
            try:
                os.remove(temp_pdf_path)
            except FileNotFoundError:
                pass

    def _parse_page_range(
        self, doc: fitz.Document, range_str: Optional[str]
    ) -> range:
        """
        Parse a 1-based page string into a 0-based ``range`` of page indices.

        "1-3" becomes ``range(0, 3)`` and "10" becomes ``range(9, 10)``;
        both ends are clamped to the document's page count. An empty/None
        string, or one that cannot be parsed as integers, selects all pages.
        """
        total_pages = len(doc)
        if not range_str:
            return range(total_pages)

        try:
            if "-" in range_str:
                start, end = map(int, range_str.split("-"))
                # Adjust to 0-indexed; the end is inclusive in the user's
                # mind and exclusive in a Python range, so it stays as-is.
                return range(max(0, start - 1), min(total_pages, end))

            page = int(range_str)
            return range(max(0, page - 1), min(total_pages, page))
        except ValueError:
            # Fallback to all pages on error
            return range(total_pages)

Implementation Reference

read_pdf

Latest Blog Posts

What Is Context Bloat in MCP?
By Om-Shree-0709 on December 16, 2025.
mcp
Context Bloat
MCP Moves to the Linux Foundation: Neutral Stewardship for Agentic Infrastructure
By Om-Shree-0709 on December 15, 2025.
mcp
anthropic
Linux Foundation
Code Execution with MCP: Architecting Agentic Efficiency
By Om-Shree-0709 on December 14, 2025.
mcp
Token bloat

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/rexfelix/readPDF_mcp_server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.