"""PDF processing module for page-by-page OCR."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
import pypdfium2 as pdfium
from PIL import Image
from .ocr import OCRResult, ocr_image_pil
@dataclass
class PDFOCRConfig:
    """
    Settings that control how a PDF is rasterized and OCR'd.

    Attributes:
        dpi: Resolution used when rendering each page to an image.
        languages: Language list handed to the OCR call; None means no
            explicit list is supplied.
        page_separator: Template inserted between pages of the combined
            text. It is expanded with ``str.format(page=<1-based number>)``.
    """

    # Rendering resolution in dots per inch.
    dpi: int = 200

    # Passed through unchanged to the OCR function.
    languages: list[str] | None = None

    # "{page}" is substituted with the page number of the page that follows.
    page_separator: str = "\n\n===== Page {page} =====\n\n"
def render_pdf_page_to_image(
    pdf_doc: pdfium.PdfDocument, page_index: int, dpi: int = 200
) -> Image.Image:
    """
    Rasterize one page of an open PDF document into a PIL image.

    Args:
        pdf_doc: Open pypdfium2 document to take the page from
        page_index: Zero-based index of the page to render
        dpi: Target resolution in dots per inch

    Returns:
        The rendered page as a PIL Image
    """
    target_page = pdf_doc[page_index]
    # The renderer takes a scale factor, not a DPI value; 72 DPI is treated
    # as scale 1.0, so the factor is simply dpi / 72.
    return target_page.render(scale=dpi / 72).to_pil()
def ocr_pdf(
    pdf_path: Path,
    config: PDFOCRConfig | None = None,
    pages: list[int] | None = None,
) -> tuple[str, list[OCRResult]]:
    """
    Perform OCR on a PDF file, extracting text from all or specific pages.

    Each selected page is rendered to an image at ``config.dpi`` and passed
    to the OCR function together with ``config.languages``. The document is
    closed before returning, including when rendering or OCR raises.

    Args:
        pdf_path: Path to the PDF file
        config: OCR configuration options. If None, a default
            PDFOCRConfig is used.
        pages: Optional list of 1-based page numbers to process, handled
            in the order given. Numbers outside 1..page count are
            silently skipped. If None, all pages are processed.

    Returns:
        Tuple of (combined_text, list of OCRResult per page). In the
        combined text, ``config.page_separator`` (formatted with the page
        number) precedes every processed page except the first.
    """
    if config is None:
        config = PDFOCRConfig()

    results: list[OCRResult] = []
    text_parts: list[str] = []

    pdf_doc = pdfium.PdfDocument(str(pdf_path))
    try:
        total_pages = len(pdf_doc)

        # Determine which pages to process (convert to 0-based indices);
        # out-of-range requests are dropped rather than raising.
        if pages is not None:
            page_indices = [p - 1 for p in pages if 1 <= p <= total_pages]
        else:
            page_indices = list(range(total_pages))

        for idx in page_indices:
            page_num = idx + 1  # 1-based for display

            pil_image = render_pdf_page_to_image(pdf_doc, idx, config.dpi)
            page_text = ocr_image_pil(pil_image, config.languages)

            results.append(
                OCRResult(
                    page_number=page_num,
                    text=page_text,
                    # Vision framework doesn't expose overall confidence,
                    # so a fixed placeholder value is recorded.
                    confidence=1.0,
                )
            )

            # Add separator before text (except for first page)
            if text_parts:
                text_parts.append(config.page_separator.format(page=page_num))
            text_parts.append(page_text)
    finally:
        # Release the underlying file deterministically instead of waiting
        # for the document object to be finalized.
        pdf_doc.close()

    return "".join(text_parts), results
def ocr_pdf_to_file(
    pdf_path: Path,
    output_path: Path | None = None,
    config: PDFOCRConfig | None = None,
    pages: list[int] | None = None,
) -> Path:
    """
    Run OCR over a PDF and write the combined text to a file.

    Args:
        pdf_path: Location of the PDF to read
        output_path: Where to write the text. When None, the PDF path with
            its suffix replaced by ".txt" is used.
        config: OCR configuration options, forwarded to ocr_pdf
        pages: Optional list of 1-based page numbers, forwarded to ocr_pdf

    Returns:
        Path of the text file that was written
    """
    destination = pdf_path.with_suffix(".txt") if output_path is None else output_path

    # Only the combined text is needed here; per-page results are discarded.
    ocr_output = ocr_pdf(pdf_path, config, pages)
    destination.write_text(ocr_output[0], encoding="utf-8")
    return destination