Skip to main content
Glama
Sharan0402

Expense Tracker MCP Server

by Sharan0402
pdf_parser.py (7.64 kB)
"""PDF parsing and receipt extraction logic."""

import re
from datetime import datetime
from pathlib import Path
from typing import Optional

import pdfplumber

from .models import Receipt, LineItem

# Known store name patterns
STORE_PATTERNS = [
    r"\bwalmart\b",
    r"\bcostco\s*wholesale\b",
    r"\bcostco\b",
    r"\btarget\b",
    r"\bwhole\s*foods\b",
    r"\btrader\s*joe'?s\b",
    r"\bsafeway\b",
    r"\bkroger\b",
    r"\bpublix\b",
]

# Date patterns (multiple formats)
DATE_PATTERNS = [
    r"(\d{1,2})[/-](\d{1,2})[/-](\d{2,4})",  # MM/DD/YYYY or MM-DD-YYYY
    r"(\d{4})[/-](\d{1,2})[/-](\d{1,2})",  # YYYY-MM-DD or YYYY/MM/DD
]

# Price pattern
PRICE_PATTERN = r"\$?\s*(\d+\.\d{2})"

# "total <price>" with an optional "sub" prefix captured in group 1, so a
# "sub total 12.34" line can be told apart from the real total (group 2 is
# the amount).
_TOTAL_RE = re.compile(
    rf"(?:^|\s)(sub\s*)?total\s*:?\s*{PRICE_PATTERN}", re.MULTILINE
)
_SUBTOTAL_RE = re.compile(
    rf"(?:^|\s)sub\s*total\s*:?\s*{PRICE_PATTERN}", re.MULTILINE
)
_TAX_RE = re.compile(rf"(?:^|\s)tax\s*:?\s*{PRICE_PATTERN}", re.MULTILINE)

# Item line patterns, tried in order; the first one that matches a line wins.
_ITEM_PATTERNS = [
    # Pattern 1: [qty] ITEM_NAME $PRICE
    re.compile(r"^(\d+)\s+(.+?)\s+\$?\s*(\d+\.\d{2})\s*$"),
    # Pattern 2: ITEM_NAME $PRICE
    re.compile(r"^(.+?)\s+\$\s*(\d+\.\d{2})\s*$"),
    # Pattern 3: ITEM_NAME PRICE (multiple spaces)
    re.compile(r"^(.+?)\s{2,}\$?\s*(\d+\.\d{2})\s*$"),
]

# Lines containing any of these substrings are treated as summary/payment
# lines rather than purchased items.
_NON_ITEM_KEYWORDS = ("total", "subtotal", "tax", "payment", "change", "card")


def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract text from PDF file.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Text of all pages that yielded text, joined with newlines

    Raises:
        FileNotFoundError: If PDF doesn't exist
        ValueError: If no page yields any text
        Exception: Errors raised by pdfplumber while opening or reading
            the file are not caught here and propagate to the caller
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    text_content = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # Pages without extractable text are skipped.
            if text:
                text_content.append(text)

    if not text_content:
        raise ValueError("No text could be extracted from PDF")

    return "\n".join(text_content)


def identify_store(text: str) -> str:
    """Identify store name from receipt text.

    Only the first 10 lines are searched. Patterns in STORE_PATTERNS are
    tried in order and the first one that matches wins.

    Args:
        text: Receipt text

    Returns:
        Store name with each word capitalized and whitespace collapsed to
        single spaces (e.g. "Trader Joe's"), or "Unknown Store"
    """
    # Check first 10 lines for store name
    header = "\n".join(text.split("\n")[:10])

    for pattern in STORE_PATTERNS:
        match = re.search(pattern, header, re.IGNORECASE)
        if match:
            # Capitalize word by word instead of str.title(): title() also
            # upper-cases the letter after an apostrophe ("Trader Joe'S").
            return " ".join(word.capitalize() for word in match.group(0).split())

    return "Unknown Store"


def extract_date(text: str) -> str:
    """Extract purchase date from receipt.

    Only the first 20 lines are searched. Every candidate of each pattern in
    DATE_PATTERNS is tried in order until one forms a valid calendar date.
    Two-digit years below 50 are read as 20xx, the rest as 19xx.

    Args:
        text: Receipt text

    Returns:
        Date in ISO format (YYYY-MM-DD) or current date if not found
    """
    # Look for date in first 20 lines
    header = "\n".join(text.split("\n")[:20])

    for pattern in DATE_PATTERNS:
        # Digit boundaries stop the M/D/Y pattern from matching inside a
        # longer number, e.g. reading "2012-11-10" as "12-11-10".
        bounded = rf"(?<!\d){pattern}(?!\d)"
        for match in re.finditer(bounded, header):
            first, second, third = match.groups()
            if len(first) == 4:
                # YYYY-MM-DD format
                year, month, day = first, second, third
            else:
                # MM-DD-YYYY format
                month, day, year = first, second, third

            # Handle 2-digit years
            year = int(year)
            if year < 100:
                year += 2000 if year < 50 else 1900

            try:
                # Building the datetime validates month/day ranges.
                return datetime(year, int(month), int(day)).strftime("%Y-%m-%d")
            except ValueError:
                # Not a real date (e.g. month 24); try the next candidate.
                continue

    # Default to current date if not found
    return datetime.now().strftime("%Y-%m-%d")


def extract_totals(text: str) -> dict[str, Optional[float]]:
    """Extract subtotal, tax, and total from receipt.

    Only the last 30 lines are searched, case-insensitively. For each label
    the first occurrence followed by a price is used.

    Args:
        text: Receipt text

    Returns:
        Dictionary with subtotal, tax, total (may be None)
    """
    result: dict[str, Optional[float]] = {
        "subtotal": None,
        "tax": None,
        "total": None,
    }

    # Look for these in the last 30 lines (where totals usually are)
    footer = "\n".join(text.split("\n")[-30:]).lower()

    # Extract total, skipping matches that are really "sub total" lines;
    # otherwise the subtotal amount would be reported as the total.
    for match in _TOTAL_RE.finditer(footer):
        if not match.group(1):
            result["total"] = float(match.group(2))
            break

    # Extract subtotal
    subtotal_match = _SUBTOTAL_RE.search(footer)
    if subtotal_match:
        result["subtotal"] = float(subtotal_match.group(1))

    # Extract tax
    tax_match = _TAX_RE.search(footer)
    if tax_match:
        result["tax"] = float(tax_match.group(1))

    return result


def extract_line_items(text: str) -> list[dict]:
    """Extract line items from receipt.

    Each line is stripped and matched against the item patterns in order.
    Lines shorter than 5 characters and lines containing a summary/payment
    keyword are skipped. The keyword check is a plain substring test, so an
    item whose name contains e.g. "card" is skipped as well.

    Args:
        text: Receipt text

    Returns:
        List of dictionaries with item_name, quantity, price; quantity is
        1.0 when the line has no leading quantity
    """
    items = []

    for raw_line in text.split("\n"):
        line = raw_line.strip()

        # Skip empty/very short lines and total/payment lines
        if len(line) < 5:
            continue
        lowered = line.lower()
        if any(keyword in lowered for keyword in _NON_ITEM_KEYWORDS):
            continue

        for pattern in _ITEM_PATTERNS:
            match = pattern.match(line)
            if not match:
                continue

            groups = match.groups()
            if len(groups) == 3:
                # qty, name, price
                qty, name, price = groups
                quantity = float(qty)
            else:
                # name, price (no quantity)
                name, price = groups
                quantity = 1.0

            items.append(
                {
                    "item_name": name.strip(),
                    "quantity": quantity,
                    "price": float(price),
                }
            )
            break  # Found a match, move to next line

    return items


def parse_receipt(text: str) -> tuple[Receipt, list[dict]]:
    """Parse receipt text into structured data.

    Args:
        text: Raw receipt text

    Returns:
        Tuple of (Receipt object, list of item dicts)

    Raises:
        ValueError: If total is not found or is not greater than zero
    """
    # Extract metadata
    store_name = identify_store(text)
    purchase_date = extract_date(text)
    totals = extract_totals(text)

    if totals["total"] is None or totals["total"] <= 0:
        raise ValueError("Could not extract valid total from receipt")

    # Create receipt object
    receipt = Receipt(
        store_name=store_name,
        purchase_date=purchase_date,
        subtotal=totals["subtotal"],
        tax=totals["tax"],
        total=totals["total"],
    )

    # Extract line items
    items = extract_line_items(text)

    return receipt, items


def parse_pdf_receipt(pdf_path: Path) -> tuple[Receipt, list[dict]]:
    """Parse a PDF receipt file.

    Args:
        pdf_path: Path to PDF file (a plain string is accepted as well)

    Returns:
        Tuple of (Receipt object, list of item dicts)

    Raises:
        FileNotFoundError: If PDF doesn't exist
        ValueError: If parsing fails
    """
    # Convert string to Path if needed
    if isinstance(pdf_path, str):
        pdf_path = Path(pdf_path)

    # Extract text
    text = extract_text_from_pdf(pdf_path)

    # Parse receipt
    return parse_receipt(text)

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Sharan0402/expense-tracker-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.