"""PDF parsing and receipt extraction logic."""
import re
from datetime import datetime
from pathlib import Path
from typing import Optional
import pdfplumber
from .models import Receipt, LineItem
# Regexes for the store names this parser recognises. identify_store returns
# the first entry (in list order) that matches, so the longer
# "costco wholesale" form is listed ahead of plain "costco".
STORE_PATTERNS = [
    r"\bwalmart\b",
    r"\bcostco\s*wholesale\b",
    r"\bcostco\b",
    r"\btarget\b",
    r"\bwhole\s*foods\b",
    r"\btrader\s*joe'?s\b",
    r"\bsafeway\b",
    r"\bkroger\b",
    r"\bpublix\b",
]

# Supported date layouts. Each regex exposes exactly three capture groups;
# extract_date tells the layouts apart by checking whether the first group
# is four digits long (year first) or not (month first).
DATE_PATTERNS = [
    # Month, day, year separated by "/" or "-"; the year has 2 to 4 digits.
    r"(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})",
    # Four-digit year, month, day separated by "/" or "-".
    r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})",
]

# A monetary amount: optional "$", optional whitespace, then digits with
# exactly two decimals. Only the numeric part is captured (group 1).
PRICE_PATTERN = r"\$?\s*(\d+\.\d{2})"
def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract the text of every page of a PDF file.

    Pages for which pdfplumber returns no text are skipped; the text of the
    remaining pages is joined with newlines.

    Args:
        pdf_path: Path to the PDF file. A plain string is accepted as well
            and converted to a Path.

    Returns:
        Extracted text as a single string

    Raises:
        FileNotFoundError: If the path does not exist
        ValueError: If no page yields any text
        Exception: Errors raised by pdfplumber while opening or reading the
            file are not caught here and propagate to the caller
    """
    # Coerce up front so direct callers may pass a str; without this,
    # .exists() below raises AttributeError for string arguments.
    pdf_path = Path(pdf_path)

    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # extract_text() may give back nothing for a page; skip those.
            if page_text:
                page_texts.append(page_text)

    if not page_texts:
        raise ValueError("No text could be extracted from PDF")

    return "\n".join(page_texts)
def identify_store(text: str) -> str:
    """Identify the store name from receipt text.

    Only the first 10 lines are searched. The patterns in STORE_PATTERNS are
    tried in list order and the first one that matches decides the result.

    Args:
        text: Receipt text

    Returns:
        The matched store name with each word capitalized and runs of
        whitespace collapsed to single spaces (e.g. "Trader Joe's"), or
        "Unknown Store" when no pattern matches
    """
    header = "\n".join(text.split("\n")[:10]).lower()

    for pattern in STORE_PATTERNS:
        match = re.search(pattern, header, re.IGNORECASE)
        if match:
            # str.title() would upper-case the letter after an apostrophe
            # ("Trader Joe'S"). Capitalizing whitespace-separated words
            # avoids that and also normalises newlines or repeated spaces
            # that a pattern may have matched between words.
            words = match.group(0).split()
            return " ".join(word.capitalize() for word in words)

    return "Unknown Store"
def extract_date(text: str) -> str:
    """Extract the purchase date from receipt text.

    Only the first 20 lines are searched. For each regex in DATE_PATTERNS
    (in list order) every candidate match is tried in order of appearance,
    and the first candidate that forms a valid calendar date is returned.

    Two-digit years below 50 are read as 20xx, the others as 19xx.

    Args:
        text: Receipt text

    Returns:
        Date in ISO format (YYYY-MM-DD), or today's date (datetime.now())
        if no valid date is found
    """
    header = "\n".join(text.split("\n")[:20])

    for pattern in DATE_PATTERNS:
        # Forbid a digit directly before or after the candidate. Without
        # these guards the month-first pattern matches "12-11-10" inside
        # "2012-11-10" and reports 2010-12-11.
        guarded = rf"(?<!\d)(?:{pattern})(?!\d)"

        # Try every candidate, not just the first, so that one invalid
        # match (e.g. month 24) does not hide a valid date later on.
        for match in re.finditer(guarded, header):
            try:
                first, second, third = match.groups()
                if len(first) == 4:  # year-first layout
                    year, month, day = first, second, third
                else:  # month-first layout
                    month, day, year = first, second, third

                year_number = int(year)
                if year_number < 100:
                    year_number += 2000 if year_number < 50 else 1900

                # Constructing the datetime validates month/day ranges.
                parsed = datetime(year_number, int(month), int(day))
            except ValueError:
                continue
            return parsed.strftime("%Y-%m-%d")

    # Default to the current date if nothing usable was found
    return datetime.now().strftime("%Y-%m-%d")
def extract_totals(text: str) -> dict:
    """Extract subtotal, tax, and total from receipt text.

    Only the last 30 lines are searched, lower-cased. Each amount is the
    first occurrence of its label ("total", "sub total"/"subtotal", "tax")
    followed by an optional colon and a price matching PRICE_PATTERN.

    Args:
        text: Receipt text

    Returns:
        Dictionary with keys "subtotal", "tax" and "total"; each value is a
        float, or None when that amount was not found
    """
    result = {"subtotal": None, "tax": None, "total": None}

    tail = "\n".join(text.split("\n")[-30:]).lower()

    # Total: the optional first group swallows a leading "sub" so that the
    # word "total" inside "sub total 10.00" is recognised as part of the
    # subtotal label and skipped instead of being reported as the total.
    # PRICE_PATTERN contributes the second group (the amount).
    total_regex = rf"(?:^|\s)(sub[ \t]*)?total\s*:?\s*{PRICE_PATTERN}"
    for candidate in re.finditer(total_regex, tail, re.MULTILINE):
        if candidate.group(1) is None:
            result["total"] = float(candidate.group(2))
            break

    # Subtotal
    subtotal_match = re.search(
        rf"(?:^|\s)sub\s*total\s*:?\s*{PRICE_PATTERN}", tail, re.MULTILINE
    )
    if subtotal_match:
        result["subtotal"] = float(subtotal_match.group(1))

    # Tax
    tax_match = re.search(
        rf"(?:^|\s)tax\s*:?\s*{PRICE_PATTERN}", tail, re.MULTILINE
    )
    if tax_match:
        result["tax"] = float(tax_match.group(1))

    return result
def extract_line_items(text: str) -> list[dict]:
    """Collect purchased items from receipt text.

    Each line is stripped, then discarded when it is shorter than five
    characters (which includes empty lines) or contains one of the
    summary/payment keywords. Remaining lines are matched against three
    layouts; the first layout that matches wins:

    1. QTY NAME [$]PRICE
    2. NAME $PRICE
    3. NAME PRICE, with at least two whitespace characters before the price

    Lines matching none of the layouts are ignored.

    Args:
        text: Receipt text

    Returns:
        List of dictionaries with item_name, quantity, price. quantity is
        1.0 for layouts that carry no quantity.
    """
    skip_words = ("total", "subtotal", "tax", "payment", "change", "card")

    quantity_layout = re.compile(
        r"^(\d+)\s+(.+?)\s+\$?\s*(\d+\.\d{2})\s*$", re.IGNORECASE
    )
    plain_layouts = (
        re.compile(r"^(.+?)\s+\$\s*(\d+\.\d{2})\s*$", re.IGNORECASE),
        re.compile(r"^(.+?)\s{2,}\$?\s*(\d+\.\d{2})\s*$", re.IGNORECASE),
    )

    def parse_line(candidate):
        # Return the item dict for one cleaned line, or None if no layout fits.
        found = quantity_layout.match(candidate)
        if found:
            count, label, amount = found.groups()
            return {
                "item_name": label.strip(),
                "quantity": float(count),
                "price": float(amount),
            }
        for layout in plain_layouts:
            found = layout.match(candidate)
            if found:
                label, amount = found.groups()
                return {
                    "item_name": label.strip(),
                    "quantity": 1.0,
                    "price": float(amount),
                }
        return None

    items = []
    for raw_line in text.split("\n"):
        candidate = raw_line.strip()
        if len(candidate) < 5:
            continue
        lowered = candidate.lower()
        if any(word in lowered for word in skip_words):
            continue
        entry = parse_line(candidate)
        if entry is not None:
            items.append(entry)
    return items
def parse_receipt(text: str) -> tuple[Receipt, list[dict]]:
    """Turn raw receipt text into a Receipt plus its line items.

    Store name, purchase date and totals come from identify_store,
    extract_date and extract_totals; the items come from
    extract_line_items. The total is validated before the Receipt is built.

    Args:
        text: Raw receipt text

    Returns:
        Tuple of (Receipt object, list of item dicts)

    Raises:
        ValueError: If no total was found or the total is not positive
    """
    store = identify_store(text)
    date_iso = extract_date(text)
    amounts = extract_totals(text)

    # A receipt without a positive total is rejected outright.
    grand_total = amounts["total"]
    if grand_total is None or grand_total <= 0:
        raise ValueError("Could not extract valid total from receipt")

    receipt = Receipt(
        store_name=store,
        purchase_date=date_iso,
        subtotal=amounts["subtotal"],
        tax=amounts["tax"],
        total=grand_total,
    )
    return receipt, extract_line_items(text)
def parse_pdf_receipt(pdf_path: Path) -> tuple[Receipt, list[dict]]:
    """Read a PDF receipt from disk and parse it.

    The text is obtained with extract_text_from_pdf and handed to
    parse_receipt; exceptions from either call propagate unchanged.

    Args:
        pdf_path: Path to the PDF file; a str is converted to a Path first

    Returns:
        Tuple of (Receipt object, list of item dicts)

    Raises:
        FileNotFoundError: If the PDF doesn't exist
        ValueError: If parsing fails
    """
    source = Path(pdf_path) if isinstance(pdf_path, str) else pdf_path
    return parse_receipt(extract_text_from_pdf(source))