import os
import tempfile
from urllib.parse import urlparse
import httpx
import fitz # PyMuPDF
class PDFLoader:
    """Open PDF documents from a local file path or from a URL."""

    def __init__(self) -> None:
        """Create a loader; it holds no configuration or state."""

    async def load(self, source: str) -> "fitz.Document":
        """
        Load a PDF from a local path or a URL.

        A source that parses with both a scheme and a network location
        (e.g. ``https://host/file.pdf``) is downloaded; anything else is
        treated as a local filesystem path.

        Args:
            source: Local file path or URL string.

        Returns:
            fitz.Document: Opened PDF document object.

        Raises:
            FileNotFoundError: If a local path does not exist.
            ValueError: If the local file or the downloaded content cannot
                be opened as a PDF.
            httpx.HTTPError: If the URL download fails (including non-2xx
                responses, via ``raise_for_status``).
        """
        if self._is_url(source):
            return await self._load_from_url(source)
        return self._load_from_local(source)

    def _is_url(self, source: str) -> bool:
        """Return True when *source* has both a URL scheme and a netloc.

        Requiring a netloc keeps Windows drive paths such as ``C:\\a.pdf``
        (which parse with scheme ``c``) and ``file:///...`` paths on the
        local branch.
        """
        try:
            parsed = urlparse(source)
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. an unbalanced
            # IPv6 bracket); such a string is not a usable URL.
            return False
        return bool(parsed.scheme and parsed.netloc)

    def _load_from_local(self, path: str) -> "fitz.Document":
        """Open the PDF stored at *path*.

        Raises:
            FileNotFoundError: If nothing exists at *path*.
            ValueError: If the file exists but cannot be opened; the
                underlying error is attached as ``__cause__``.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"PDF file not found at: {path}")
        try:
            return fitz.open(path)
        except Exception as e:
            raise ValueError(f"Failed to open local PDF: {e}") from e

    async def _load_from_url(self, url: str) -> "fitz.Document":
        """Download *url* (following redirects) and open the body as a PDF.

        Raises:
            httpx.HTTPError: If the request fails or returns an error status.
            ValueError: If the downloaded bytes cannot be opened as a PDF;
                the underlying error is attached as ``__cause__``.
        """
        async with httpx.AsyncClient() as client:
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            # The body is fully buffered at this point, so parsing can
            # happen after the client (and its connection) is released.
            pdf_bytes = response.content

        try:
            # Open directly from the in-memory bytes; the explicit filetype
            # hint is kept because a byte stream has no file name to go by.
            return fitz.open(stream=pdf_bytes, filetype="pdf")
        except Exception as e:
            raise ValueError(f"Failed to open PDF from URL: {e}") from e