"""Item categorization logic with static rules and LLM fallback."""
import logging
import re
from typing import Optional
# Static table: item type -> phrases that identify it in an item name.
# Insertion order is significant: deterministic_categorize iterates this
# dict in order when matching.
ITEM_TYPE_MAPPINGS = {
    # ---- Dairy ----
    "milk": [
        "milk", "2% milk", "whole milk", "skim milk",
        "1% milk", "dairy milk", "vitamin d milk",
    ],
    "oatmilk": ["oat milk", "oatly", "oat beverage", "oat drink", "oatmilk"],
    "eggs": [
        "eggs", "egg", "large eggs", "dozen eggs",
        "extra large eggs", "organic eggs",
    ],
    "cheese": [
        "cheese", "cheddar", "mozzarella", "parmesan", "swiss", "gouda",
    ],
    "yogurt": ["yogurt", "yoghurt", "greek yogurt", "yoghourt"],
    "butter": ["butter", "margarine", "spread"],

    # ---- Grains ----
    "bread": [
        "bread", "loaf", "baguette", "sourdough", "wheat bread", "white bread",
    ],
    "rice": [
        "rice", "basmati", "jasmine rice", "brown rice", "white rice",
        "long grain",
    ],
    "lentils": ["lentils", "lentil", "dal", "red lentils", "green lentils"],
    "pasta": [
        "pasta", "spaghetti", "penne", "macaroni", "noodles", "fusilli",
    ],
    "cereal": ["cereal", "granola", "oatmeal", "oats", "corn flakes"],

    # ---- Produce ----
    "veggies": [
        "vegetables", "veggie", "broccoli",
        "carrot", "carrots", "spinach", "lettuce", "cucumber",
        "tomato", "tomatoes", "onion", "onions",
        "pepper", "peppers", "bell pepper",
        "celery", "kale", "cabbage",
    ],
    "fruits": [
        "fruit",
        "apple", "apples", "banana", "bananas", "orange", "oranges",
        "berries", "strawberries", "blueberries", "raspberries", "grapes",
        "pear", "pears", "mango", "mangoes", "peach", "peaches",
    ],
    "potatoes": ["potato", "potatoes", "russet", "sweet potato", "yam"],

    # ---- Proteins ----
    "meat": [
        "chicken", "beef", "pork", "turkey", "lamb",
        "steak", "ground beef", "sausage", "bacon",
    ],
    "fish": ["fish", "salmon", "tuna", "tilapia", "cod", "shrimp", "seafood"],

    # ---- Snacks & Beverages ----
    "snacks": [
        "chips", "crackers", "pretzels", "popcorn", "cookies",
        "candy", "chocolate", "nuts", "trail mix",
    ],
    "beverages": [
        "soda", "juice", "water", "coffee", "tea",
        "sports drink", "energy drink", "cola", "sprite",
    ],

    # ---- Pantry ----
    "oil": ["oil", "olive oil", "vegetable oil", "canola oil", "cooking oil"],
    # NOTE: "pepper" is also listed under "veggies", which is scanned first.
    "spices": [
        "spices", "spice", "salt", "pepper", "cumin", "turmeric", "paprika",
    ],
    "sauce": [
        "sauce", "ketchup", "mustard", "mayo", "mayonnaise", "salsa",
        "soy sauce",
    ],

    # ---- Household (non-food) ----
    "cleaning": [
        "cleaner", "detergent", "soap", "dish soap",
        "laundry", "bleach", "wipes",
    ],
    "paper": ["paper towel", "toilet paper", "tissue", "napkins"],
}
def normalize_text(text: str) -> str:
    """Return a lowercase, noise-free version of *text* for pattern matching.

    The text is lowercased, runs of whitespace are collapsed to a single
    space, and a fixed set of brand words ("kirkland", "great value",
    "organic"/"organics") and unit tokens (oz, lb, lbs, kg, g, ml, l) is
    removed wherever they stand as whole words.

    Args:
        text: Raw text, e.g. an item name or a mapping pattern.

    Returns:
        The cleaned text with single spaces and no leading/trailing space.
    """
    # Applied in this order, each to the output of the previous one.
    noise_patterns = (
        r"\bkirkland\b",
        r"\bgreat value\b",
        r"\borganics?\b",
        r"\b(oz|lb|lbs|kg|g|ml|l)\b",
    )

    # Collapse whitespace up front so the two-word brand "great value"
    # is separated by exactly one space when its regex runs.
    cleaned = " ".join(text.lower().split())

    for noise in noise_patterns:
        cleaned = re.sub(noise, "", cleaned)

    # Removing tokens leaves gaps behind; split/join squeezes them out and
    # also drops any leading/trailing whitespace.
    return " ".join(cleaned.split())
def _contains_phrase(phrase: str, text: str) -> bool:
    """Return True if *phrase* occurs in *text* as whole words.

    A trailing "s" or "es" after the phrase is tolerated so that simple
    plurals ("cookie" vs. "cookies", "tomato" vs. "tomatoes") still match.
    """
    # Lookarounds are used instead of \b so phrases that start or end with a
    # non-word character (e.g. "2%") are still anchored correctly.
    return (
        re.search(rf"(?<!\w){re.escape(phrase)}(?:e?s)?(?!\w)", text)
        is not None
    )


def deterministic_categorize(item_name: str) -> Optional[str]:
    """Try to categorize an item using the static ITEM_TYPE_MAPPINGS rules.

    Matching is done on whole words (with simple plural tolerance) rather
    than raw substrings, so "veggie" is not mistaken for "egg", "tea" is not
    mistaken for "steak", and an empty name matches nothing. Compound words
    such as "eggplant" or "pineapple" are deliberately NOT matched against
    "egg"/"apple"; they fall through to the caller's fallback.

    Three passes are tried in order:

    1. A pattern appears inside the item name. When several patterns match,
       the one with the most words wins (so "oat milk" beats "milk"); ties
       are resolved by ITEM_TYPE_MAPPINGS order.
    2. The item name appears inside a pattern (e.g. "jasmine" inside
       "jasmine rice"). First match in mapping order wins.
    3. Every word of a pattern appears somewhere in the item name, in any
       order (e.g. "towel paper"). First match in mapping order wins.

    Args:
        item_name: Raw item name.

    Returns:
        The item type if a rule matched, otherwise None.
    """
    normalized = normalize_text(item_name)
    if not normalized:
        # Nothing left after normalization (empty or brand-only name).
        # Without this guard the "name inside pattern" pass would match
        # every pattern.
        return None

    # Normalize every pattern once per call; skip any that normalize to the
    # empty string because an empty pattern would match every item.
    candidates = [
        (item_type, pattern_normalized)
        for item_type, patterns in ITEM_TYPE_MAPPINGS.items()
        for pattern_normalized in map(normalize_text, patterns)
        if pattern_normalized
    ]

    # Pass 1: pattern inside item name, most specific pattern wins.
    best_type: Optional[str] = None
    best_word_count = 0
    for item_type, pattern in candidates:
        word_count = len(pattern.split())
        # Strict ">" keeps the earliest category on ties and lets us skip
        # the regex for patterns that cannot beat the current best.
        if word_count > best_word_count and _contains_phrase(pattern, normalized):
            best_type, best_word_count = item_type, word_count
    if best_type is not None:
        return best_type

    # Pass 2: item name inside a pattern.
    for item_type, pattern in candidates:
        if _contains_phrase(normalized, pattern):
            return item_type

    # Pass 3: all words of a pattern present in the item name, any order.
    words = set(normalized.split())
    for item_type, pattern in candidates:
        if set(pattern.split()).issubset(words):
            return item_type

    return None
async def llm_categorize(item_name: str, ctx) -> str:
    """Use LLM sampling to categorize an item the static rules did not match.

    This is best-effort: any sampling failure or unusable answer results in
    "other" rather than an exception.

    Args:
        item_name: Raw item name from receipt.
        ctx: FastMCP Context for LLM sampling; must provide an awaitable
            ``sample(messages=..., max_tokens=..., temperature=...)``.

    Returns:
        A key of ITEM_TYPE_MAPPINGS, or "other" when the model's answer is
        not a known category or sampling failed.
    """
    # Log instead of print(): print() writes to stdout, which can corrupt
    # the protocol stream if the server runs over the stdio transport.
    logger = logging.getLogger(__name__)

    # Get list of known categories
    categories = ", ".join(sorted(ITEM_TYPE_MAPPINGS))
    prompt = f"""Categorize this grocery/household item into ONE category.
Item: "{item_name}"
Available categories: {categories}, other
Rules:
- Return ONLY the category name (lowercase, no spaces between words)
- If the item clearly fits a category, use it
- If uncertain or it doesn't fit any category, return "other"
- Do not explain, just return the category name
Category:"""

    # Keep the try body down to the one call that talks to the client.
    try:
        # NOTE(review): FastMCP documents `messages` as a str or a list of
        # str/SamplingMessage; confirm that role/content dicts are accepted
        # by the installed version.
        response = await ctx.sample(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=20,
            temperature=0.3,  # Low temperature for consistent categorization
        )
    except Exception as exc:
        # Deliberate best-effort boundary: a failed sampling call must not
        # break categorization, so fall back to "other".
        logger.warning(
            "LLM categorization failed for '%s': %s", item_name, exc
        )
        return "other"

    # The sampling result may be a plain string or a content object that
    # carries the text in a `.text` attribute; accept both. Calling .strip()
    # directly on a content object would raise and force every answer to
    # "other".
    text = getattr(response, "text", response)
    if not isinstance(text, str):
        logger.warning(
            "LLM categorization for '%s' returned non-text content: %r",
            item_name,
            response,
        )
        return "other"

    # Drop whitespace/punctuation so answers like "Milk." or " milk\n" match.
    category = re.sub(r"[^\w]", "", text.strip().lower())

    # Only accept categories we know about; anything else becomes "other".
    if category in ITEM_TYPE_MAPPINGS or category == "other":
        return category
    return "other"
async def categorize_item(item_name: str, ctx=None) -> str:
    """Categorize an item: static rules first, then LLM, then "other".

    Args:
        item_name: Raw item name from receipt.
        ctx: Optional FastMCP Context; when truthy it is handed to
            llm_categorize if the static rules find nothing.

    Returns:
        The item type. A value is always returned; "other" is used when the
        static rules do not match and no context is available.
    """
    rule_based = deterministic_categorize(item_name)

    if not rule_based:
        # Static rules had no answer: ask the LLM when we can, otherwise
        # give up with the catch-all category.
        if not ctx:
            return "other"
        return await llm_categorize(item_name, ctx)

    return rule_based