import os
import fitz
from typing import Dict, Any, List, Optional
from .loader import PDFLoader
from .extractor import TextExtractor
from .image import ImageExtractor
from .table import TableExtractor
class PDFParser:
    """Parse a PDF into markdown content plus basic document metadata.

    The parser wires together a loader, a text extractor, an image
    extractor and a table extractor, and exposes a single async entry
    point, :meth:`parse`.
    """

    def __init__(self):
        self.loader = PDFLoader()
        self.text_extractor = TextExtractor()
        self.image_extractor = ImageExtractor()
        # Table extraction can be slow, so we might want to make it
        # optional or on-demand in the future.
        self.table_extractor = TableExtractor()

    async def parse(
        self,
        source: str,
        page_range: Optional[str] = None,
        extract_images: bool = False,
        force_ocr: bool = False,
    ) -> Dict[str, Any]:
        """Parse a PDF and return its metadata and markdown content.

        Args:
            source: URL or local path, passed to the loader as-is.
            page_range: String like "1-5", "10", or None for all pages
                (1-based, end inclusive).
            extract_images: Whether to run the image extractor and append
                an "Extracted Images" section to the content.
            force_ocr: Forwarded to the text extractor's ``force_ocr``
                keyword.

        Returns:
            Dict with keys ``"metadata"`` (page_count, title, author,
            source), ``"content"`` (markdown string) and ``"images"``
            (list of the ``"path"`` values of the extracted images).
        """
        # 1. Load document. It is closed in the ``finally`` below no matter
        # which extraction step fails.
        doc = await self.loader.load(source)
        try:
            # 2. Resolve the user-facing page range into 0-based indices.
            pages = self._parse_page_range(doc, page_range)

            # 3. Extract text (markdown).
            text_md = self.text_extractor.extract_text(
                doc, pages, force_ocr=force_ocr
            )

            # 4. Extract images (optional); their markdown is appended in a
            # dedicated section at the end of the content.
            images_data = []
            if extract_images:
                images_data = self.image_extractor.extract_images(doc, pages)
                if images_data:
                    text_md += "\n\n## Extracted Images\n" + "".join(
                        f"\n{img['markdown']}\n" for img in images_data
                    )

            # 5. Extract tables and append them as their own section.
            tables_md = self._extract_tables_markdown(doc, pages)
            if tables_md:
                text_md += "\n\n## Extracted Tables\n" + "\n\n".join(tables_md)

            # 6. Construct the final result.
            metadata = {
                "page_count": len(doc),
                "title": doc.metadata.get("title", ""),
                "author": doc.metadata.get("author", ""),
                "source": source,
            }
            return {
                "metadata": metadata,
                "content": text_md,
                "images": [img["path"] for img in images_data],
            }
        finally:
            doc.close()

    def _extract_tables_markdown(self, doc: "fitz.Document", pages: range):
        """Run the table extractor against a file path for ``doc``.

        The table extractor is given a filesystem path. When ``doc.name``
        points at an existing file that path is used directly; otherwise
        (e.g. the document was opened from a stream) the document is saved
        to a temporary file first. The temporary file is always removed,
        including when saving or table extraction raises.

        Returns:
            Whatever ``self.table_extractor.extract_tables`` returns.
        """
        if doc.name and os.path.exists(doc.name):
            # Already backed by a local file: no temporary copy needed.
            return self.table_extractor.extract_tables(doc.name, pages)

        import tempfile

        # mkstemp + close gives us a reserved path with no open handle, so
        # doc.save() can write to it on platforms that refuse to reopen an
        # open temporary file.
        fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf")
        os.close(fd)
        try:
            doc.save(temp_pdf_path)
            return self.table_extractor.extract_tables(temp_pdf_path, pages)
        finally:
            # Cleanup must not depend on extraction succeeding, otherwise
            # every failed parse leaks a temp PDF.
            if os.path.exists(temp_pdf_path):
                os.remove(temp_pdf_path)

    def _parse_page_range(
        self, doc: "fitz.Document", range_str: Optional[str]
    ) -> range:
        """Convert a 1-based page spec into a 0-based ``range``.

        ``"1-3"`` becomes ``range(0, 3)`` and ``"10"`` becomes
        ``range(9, 10)``. Bounds are clamped to ``[0, len(doc)]``, so a spec
        entirely past the last page (or with start > end) yields an empty
        range. An empty/None spec, or one whose numbers cannot be parsed,
        falls back to all pages.
        """
        total_pages = len(doc)
        all_pages = range(total_pages)
        if not range_str:
            return all_pages

        try:
            if "-" in range_str:
                # A spec with more than one "-" fails to unpack with
                # ValueError and therefore also falls back to all pages.
                start, end = map(int, range_str.split("-"))
            else:
                start = end = int(range_str)
        except ValueError:
            # Best-effort fallback: unparseable input means "all pages".
            return all_pages

        # User input is 1-based with an inclusive end; Python ranges are
        # 0-based with an exclusive end.
        return range(max(0, start - 1), min(total_pages, end))