# pdf.py
"""
PDF processing utilities for MCP Invoice.
Provides functionality for merging PDFs and converting images to PDF.
"""
import os
import io
import logging
from pathlib import Path
from typing import List, Union, Tuple, Optional
from PIL import Image
from PyPDF2 import PdfReader, PdfWriter, PdfMerger
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class PDFProcessor:
    """PDF processing utilities for MCP Invoice.

    Merges PDF files and images (``.jpg``/``.jpeg``/``.png``) into a single
    PDF. Images are converted in memory to one-page PDFs whose pixel width
    corresponds to A4 width at the configured DPI.
    """

    # A4 paper width in inches (210 mm); used to size converted images.
    _A4_WIDTH_INCHES = 8.27
    # Lower-case suffixes (with leading dot) that ``merge_pdfs`` accepts.
    # These are real tuples: a bare ``('.pdf')`` is just a string, and ``in``
    # on a string is a substring test that also matches '' and '.p'.
    _PDF_EXTENSIONS = ('.pdf',)
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

    def __init__(self, dpi: int = 100):
        """Initialize the PDF processor.

        Args:
            dpi: DPI for image conversion (default: 100)
        """
        self.dpi = dpi

    @classmethod
    def _file_kind(cls, path: Path) -> Optional[str]:
        """Classify a path by its (case-insensitive) file extension.

        Args:
            path: Path whose suffix should be inspected

        Returns:
            ``'pdf'`` for PDF files, ``'image'`` for supported image files,
            or ``None`` when the extension is missing or unsupported
        """
        suffix = path.suffix.lower()
        if suffix in cls._PDF_EXTENSIONS:
            return 'pdf'
        if suffix in cls._IMAGE_EXTENSIONS:
            return 'image'
        return None

    def _scaled_size(self, width: int, height: int) -> Tuple[int, int]:
        """Compute the pixel size of an image scaled to A4 width.

        The aspect ratio is preserved; both the width and the scaled height
        are truncated to whole pixels.

        Args:
            width: Original image width in pixels
            height: Original image height in pixels

        Returns:
            ``(new_width, new_height)`` in pixels
        """
        a4_width_px = int(self._A4_WIDTH_INCHES * self.dpi)
        scale = a4_width_px / float(width)
        # Very wide images would otherwise truncate to a zero height, which
        # the resize step cannot handle; keep at least one pixel row.
        new_height = max(1, int(float(height) * scale))
        return a4_width_px, new_height

    def _convert_image_to_pdf(self, image_path: Union[str, Path]) -> io.BytesIO:
        """Convert an image to a PDF in memory.

        Args:
            image_path: Path to the image file

        Returns:
            BytesIO object containing the PDF data, positioned at offset 0

        Raises:
            FileNotFoundError: If ``image_path`` does not exist
            Exception: Any error raised while opening, resizing or saving
                the image is logged and re-raised unchanged
        """
        img_path = Path(image_path)
        if not img_path.exists():
            raise FileNotFoundError(f"Image file not found: {img_path}")

        try:
            # The context manager closes the underlying file handle as soon
            # as the pixel data has been copied into the resized image.
            with Image.open(img_path) as source:
                # Normalize to RGB before writing the PDF page.
                rgb = source if source.mode == 'RGB' else source.convert('RGB')
                resized = rgb.resize(
                    self._scaled_size(rgb.width, rgb.height), Image.LANCZOS
                )

            # Create in-memory PDF
            pdf_bytes = io.BytesIO()
            resized.save(pdf_bytes, format='PDF', resolution=self.dpi)
            pdf_bytes.seek(0)
            return pdf_bytes
        except Exception as e:
            logger.error("Error converting image to PDF: %s", e)
            raise

    def merge_pdfs(self, file_paths: List[Union[str, Path]], output_path: Union[str, Path]) -> str:
        """Merge multiple files (PDFs and images) into a single PDF file.

        Missing files and files with unsupported extensions are skipped with
        a warning rather than aborting the merge.

        Args:
            file_paths: List of file paths to merge (PDFs and images)
            output_path: Path where the merged PDF should be saved

        Returns:
            Path to the merged PDF file

        Raises:
            Exception: Any error raised while appending or writing is logged
                and re-raised unchanged
        """
        output_file = Path(output_path)

        # Ensure output directory exists (exist_ok avoids a check-then-create race)
        output_file.parent.mkdir(parents=True, exist_ok=True)

        merger = PdfMerger()
        merged_count = 0
        try:
            for file_path in file_paths:
                path = Path(file_path)
                if not path.exists():
                    logger.warning("File not found: %s, skipping", path)
                    continue

                kind = self._file_kind(path)
                if kind == 'pdf':
                    # Add PDF directly
                    merger.append(str(path))
                elif kind == 'image':
                    # Convert image to PDF and add
                    merger.append(self._convert_image_to_pdf(path))
                else:
                    logger.warning(
                        "Unsupported file type: %s, skipping: %s",
                        path.suffix.lower(),
                        path,
                    )
                    continue
                merged_count += 1

            # Write merged PDF to output file
            merger.write(str(output_file))
            # Report the number of inputs actually merged, not merely requested.
            logger.info("Successfully merged %d files into: %s", merged_count, output_file)
            return str(output_file)
        except Exception as e:
            logger.error("Error merging PDF files: %s", e)
            raise
        finally:
            # Release the merger's resources on both success and failure.
            merger.close()

    def get_pdf_page_count(self, pdf_path: Union[str, Path]) -> int:
        """Get the number of pages in a PDF file.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Number of pages

        Raises:
            Exception: Any error raised while opening or parsing the file is
                logged and re-raised unchanged
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PdfReader(file)
                return len(reader.pages)
        except Exception as e:
            logger.error("Error getting PDF page count: %s", e)
            raise