"""
Smart category detection for expense tracker.
Goals:
- Suggest best-fit category for a free-text query or expense description.
- Return confidence score + top alternatives.
- Offer an 'auto_assign' boolean when confidence is high so modules can auto-fill.
- Simple, dependency-free (stdlib only) implementation suitable for offline use.
"""
from typing import List, Dict, Optional, Tuple
import re
import difflib
# Category knowledge base used for scoring. Extend this map as your domain grows.
# Shape: category name -> {"keywords": [...], "price_range": (min, max)}.
# - "keywords": lowercase terms. Entries containing a space (e.g. "mobile bill")
#   are multi-word keywords; _keyword_score matches those against the
#   space-joined token string instead of against individual tokens.
# - "price_range": inclusive (min, max) amount treated as typical for the
#   category by _amount_score. The currency/unit is not stated anywhere in
#   this module -- presumably the tracker's working currency; TODO confirm.
# NOTE: some keywords are listed under more than one category ("home" in
# "housing" and "home"; "furniture" and "appliance" in "shopping" and "home").
# Each listing counts towards its own category's keyword score, but the flat
# keyword -> category index built after this map keeps only one category per
# keyword.
KEYWORD_MAP = {
"food": {
"keywords": ["food", "lunch", "dinner", "breakfast", "restaurant", "cafe", "coffee",
"tea", "snack", "grocery", "vegetable", "fruit", "meal", "takeout",
"delivery", "bakery", "supermarket"],
"price_range": (10, 5000)
},
"transport": {
"keywords": ["fuel", "petrol", "diesel", "uber", "ola", "taxi", "bus", "train",
"metro", "auto", "rickshaw", "parking", "toll", "commute", "cab"],
"price_range": (20, 10000)
},
"shopping": {
"keywords": ["shopping", "clothes", "shirt", "pant", "dress", "shoe", "watch",
"electronics", "phone", "laptop", "gadget", "appliance", "furniture"],
"price_range": (200, 100000)
},
"entertainment": {
"keywords": ["movie", "cinema", "netflix", "prime", "spotify", "music",
"game", "concert", "party", "ticket", "show"],
"price_range": (50, 20000)
},
"utilities": {
"keywords": ["electricity", "water", "internet", "wifi", "mobile bill", "mobile",
"phone bill", "broadband", "gas", "cylinder", "cable", "tv"],
"price_range": (100, 50000)
},
"health": {
"keywords": ["medicine", "doctor", "hospital", "pharmacy", "medical", "clinic",
"checkup", "lab", "diagnostic", "health", "insurance"],
"price_range": (50, 100000)
},
"education": {
"keywords": ["book", "course", "tuition", "class", "training", "seminar",
"workshop", "school", "college", "university", "exam"],
"price_range": (100, 200000)
},
"housing": {
"keywords": ["rent", "mortgage", "emi", "home", "apartment", "flat", "lease"],
"price_range": (1000, 500000)
},
"personal_care": {
"keywords": ["salon", "barber", "spa", "gym", "fitness", "yoga", "haircut",
"beauty", "skincare", "cosmetic", "toiletries"],
"price_range": (50, 20000)
},
"subscriptions": {
"keywords": ["subscription", "membership", "renewal", "software", "app",
"monthly", "yearly", "plan", "premium"],
"price_range": (50, 5000)
},
"travel": {
"keywords": ["flight", "airline", "hotel", "booking", "vacation", "tour", "travel", "stay"],
"price_range": (500, 500000)
},
"home": {
"keywords": ["sofa", "mattress", "furniture", "home", "kitchen", "decor", "appliance"],
"price_range": (500, 200000)
},
"pet": {
"keywords": ["pet", "vet", "veterinary", "pet food", "dog food", "cat food", "grooming"],
"price_range": (50, 50000)
},
"gifts_donations": {
"keywords": ["gift", "donation", "charity", "present", "donate"],
"price_range": (50, 100000)
},
"finance_fees": {
"keywords": ["fee", "bank", "interest", "charges", "transaction fee", "atm fee"],
"price_range": (10, 10000)
},
# Catch-all bucket; also the category returned when the input has no usable tokens.
"misc": {
"keywords": ["misc", "miscellaneous", "other", "unknown"],
"price_range": (0, 100000)
}
}
# Flat keyword -> category index used for the fuzzy (typo-tolerant) fallback.
# Built with a comprehension so that no loop variables are left behind as
# public module-level globals.
# NOTE: a keyword listed under several categories (e.g. "home", "furniture",
# "appliance") maps to the LAST category that lists it, in KEYWORD_MAP order.
_FLAT_KEY_TO_CAT = {
    kw: cat
    for cat, data in KEYWORD_MAP.items()
    for kw in data["keywords"]
}
def _tokenize(text: str) -> List[str]:
text = text.lower()
# split on non-word characters, keep words of length >=2
tokens = [t for t in re.split(r'\W+', text) if len(t) >= 2]
return tokens
def _keyword_score(tokens: List[str], keywords: List[str]) -> float:
"""Return a simple normalized score = matched_keywords / total_keywords_considered."""
if not keywords:
return 0.0
match_count = 0
for kw in keywords:
# match multi-word keywords by checking substring of original text tokens joined
if ' ' in kw:
if kw in ' '.join(tokens):
match_count += 1
else:
if kw in tokens:
match_count += 1
return match_count / max(len(keywords), 1)
def _amount_score(amount: Optional[float], price_range: Tuple[int, int]) -> float:
"""Returns 1.0 if amount is inside price_range, 0.5 if near (within 50%), else 0."""
if amount is None:
return 0.5 # neutral if no amount given
low, high = price_range
if low <= amount <= high:
return 1.0
# near if within 50% outside range
if amount >= low and amount <= high * 1.5:
return 0.6
if amount >= low * 0.5 and amount <= high:
return 0.6
return 0.0
def detect_category(text: str, amount: Optional[float] = None, top_n: int = 3,
                    high_threshold: float = 0.65, low_threshold: float = 0.25) -> Dict:
    """
    Detect category for given text and optional amount.

    Every category in KEYWORD_MAP gets a base score of
    0.7 * keyword score + 0.3 * amount score. Afterwards each token that
    closely matches a known keyword (difflib cutoff 0.8; exact matches
    qualify too) adds 0.15 to that keyword's category, capped at 1.0.

    Args:
        text: free-text query or expense description.
        amount: optional expense amount; None scores as neutral.
        top_n: number of ranked categories to report, counting the winner,
            so at most top_n - 1 alternatives are returned. Values below 1
            are treated as 1; the winner never depends on top_n.
        high_threshold: minimum score for the "high" label and auto_assign.
        low_threshold: minimum score for the "medium" label.

    Returns a dict (values below are illustrative):
    {
    "detected_category": "food",
    "confidence": 0.82,
    "confidence_label": "high", # high | medium | low
    "alternatives": [
    {"category": "health", "score": 0.3}, ...
    ],
    "auto_assign": True|False,
    "extracted_keywords": ["delivery", "lunch"]
    }
    Text without any usable token yields category "misc" with confidence 0.0.
    """
    tokens = _tokenize(text)
    if not tokens:
        return {
            "detected_category": "misc",
            "confidence": 0.0,
            "confidence_label": "low",
            "alternatives": [],
            "auto_assign": False,
            "extracted_keywords": []
        }

    token_set = set(tokens)
    # Space-padded so multi-word keywords only match on whole-token boundaries
    # ("pet food" must not be found inside "carpet food").
    padded_text = f" {' '.join(tokens)} "

    scores = {}
    extracted_keywords = set()
    for cat, data in KEYWORD_MAP.items():
        kws = data.get("keywords", [])
        key_score = _keyword_score(tokens, kws)
        amt_score = _amount_score(amount, data.get("price_range", (0, 1_000_000)))
        # Weighted combination: keywords are primary (weight 0.7), amount is secondary (0.3)
        scores[cat] = 0.7 * key_score + 0.3 * amt_score
        # collect any matched keywords for explanation
        extracted_keywords.update(
            kw for kw in kws
            if (f" {kw} " in padded_text if " " in kw else kw in token_set)
        )

    # Also consider fuzzy matching of individual tokens against known keywords (typo tolerance).
    # The candidate list is loop-invariant, so build it once.
    known_keywords = list(_FLAT_KEY_TO_CAT)
    for tok in tokens:
        for ckw in difflib.get_close_matches(tok, known_keywords, n=2, cutoff=0.8):
            cat = _FLAT_KEY_TO_CAT[ckw]
            # boost that category slightly
            scores[cat] = min(1.0, scores.get(cat, 0) + 0.15)
            extracted_keywords.add(ckw)

    # Rank by score; sorted() is stable, so ties keep KEYWORD_MAP order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    # Take the winner from the full ranking so that top_n <= 0 cannot discard it.
    best_cat, best_score = ranked[0] if ranked else ("misc", 0.0)
    alternatives = [
        {"category": c, "score": round(s, 3)}
        for c, s in ranked[1:max(top_n, 1)]
    ]

    # Map numeric score to label
    if best_score >= high_threshold:
        label = "high"
    elif best_score >= low_threshold:
        label = "medium"
    else:
        label = "low"

    return {
        "detected_category": best_cat,
        "confidence": round(best_score, 3),
        "confidence_label": label,
        "alternatives": alternatives,
        "auto_assign": best_score >= high_threshold,
        "extracted_keywords": sorted(extracted_keywords)
    }
# Backwards-compatible alias: `suggest_category` is bound to the very same
# function object as `detect_category`, so both names take identical arguments
# and return the same result dict.
suggest_category = detect_category
if __name__ == "__main__":
    # Manual smoke run: print the detection result for a few sample
    # (description, amount) pairs when the module is executed as a script.
    samples = (
        ("2 coffees and a sandwich from cafe", 350),
        ("Uber ride to airport", 850),
        ("monthly netflix subscription", 199),
        ("Headphones sony wireless", 4500),
        ("emergency dentist visit", 7200),
        ("rent for apartment", 15000),
        ("dog vet appointment and vaccination", 1200),
    )
    for description, value in samples:
        result = detect_category(description, value)
        print(description, value, "->", result)