"""
Team Name Normalization Utility
This module provides functions to normalize team names for better matching
with API responses that may use different naming conventions.
"""
import re
import logging
from typing import Optional
# Module-level logger named after this module (not used by the code visible in this file).
logger = logging.getLogger(__name__)
# Mascot / nickname phrases (lowercase) mapped to the text that replaces them.
# Every replacement is currently "" (i.e. the phrase is removed).
#
# Order matters: normalize_team_name() walks this dict in insertion order and
# applies only the FIRST entry it finds, so a multi-word nickname must be
# listed before its shorter form ("golden gophers" before "gophers").
# Keys must be unique; a repeated key in a dict literal is silently collapsed.
TEAM_NORMALIZATIONS = {
    "crimson tide": "",
    "fighting irish": "",
    "fighting illini": "",
    "sooners": "",
    "buckeyes": "",
    "tigers": "",
    "wildcats": "",
    "eagles": "",
    "hawks": "",
    "bulldogs": "",
    "seminoles": "",
    "hurricanes": "",
    "trojans": "",
    "bruins": "",
    "cougars": "",
    "longhorns": "",
    "aggies": "",
    "wolverines": "",
    "spartans": "",
    "badgers": "",
    "cornhuskers": "",
    "hoosiers": "",
    "boilermakers": "",
    "golden gophers": "",
    "gophers": "",
    "nittany lions": "",
    "terrapins": "",
    "scarlet knights": "",
    "orange": "",
    "cavaliers": "",
    "hokies": "",
    "mountaineers": "",
}
# Regexes used to reduce a full (lowercased) name to its school name.
# normalize_team_name() tries them in order, stops at the first match and
# keeps capture group 1.
SCHOOL_PATTERNS = [
    # "<school> university|univ|college|technical institute ..." -> "<school>"
    # e.g. "ohio state university buckeyes" -> "ohio state"
    r"^(.+?)\s+(?:university|univ|college|technical institute)\b.*$",
    # "<school> state|tech ..." -> "<school> state|tech"
    # "state"/"tech" stay in the captured name because they distinguish schools:
    # dropping them would turn "michigan state" into "michigan" and
    # "virginia tech" into "virginia", colliding with different teams.
    r"^(.+?\s+(?:state|tech))\b.*$",
]
def normalize_team_name(team_name: str) -> str:
    """
    Normalize a team name to improve matching with API responses.

    Steps, in order:
    1. Lowercase and strip surrounding whitespace.
    2. Reduce the name with the first entry of ``SCHOOL_PATTERNS`` that
       matches (its first capture group becomes the name).
    3. Remove the first nickname from ``TEAM_NORMALIZATIONS`` that occurs as
       a whole word or phrase (e.g., "Crimson Tide", "Fighting Irish").
    4. Collapse runs of whitespace.

    Nicknames are matched on word boundaries, so "hawks" does not turn
    "Kansas Jayhawks" into "kansas jay". If removing the nickname would leave
    nothing (the input is only a nickname, e.g. "Tigers"), the nickname is
    kept: an empty key is a substring of every other name and would make
    unrelated teams look identical.

    Args:
        team_name: Original team name (e.g., "Alabama Crimson Tide",
            "Notre Dame Fighting Irish").

    Returns:
        Normalized lowercase team name (e.g., "alabama", "notre dame"), or ""
        when ``team_name`` is empty or contains only whitespace.
    """
    if not team_name:
        return ""

    normalized = team_name.lower().strip()

    # Try to extract the school name; only the first matching pattern applies.
    for pattern in SCHOOL_PATTERNS:
        match = re.match(pattern, normalized)
        if match:
            normalized = match.group(1).strip()
            break

    # Remove the first listed nickname that appears as a whole word/phrase.
    for suffix, replacement in TEAM_NORMALIZATIONS.items():
        if suffix not in normalized:
            continue  # cheap substring pre-check before running a regex
        # Lookarounds instead of \b so the check is symmetric for any phrase.
        whole_phrase = rf"(?<!\w){re.escape(suffix)}(?!\w)"
        if not re.search(whole_phrase, normalized):
            continue  # only embedded in a longer word, e.g. "hawks" in "jayhawks"
        # A callable keeps the replacement literal (no backslash templates).
        stripped = re.sub(
            whole_phrase, lambda _m, repl=replacement: repl, normalized
        ).strip()
        if stripped:
            normalized = stripped
        break

    # Clean up multiple spaces left behind by the removal.
    normalized = re.sub(r"\s+", " ", normalized).strip()
    return normalized
def fuzzy_match_team(team_name: str, candidates: list[str]) -> Optional[str]:
    """
    Find the best matching team name from a list of candidates.

    Matching stages are tried in order; within a stage the first candidate
    (in list order) that qualifies is returned:

    1. Exact match of the raw names, ignoring case and surrounding whitespace.
    2. Exact match of the normalized names.
    3. Normalized substring match (either name contained in the other).
    4. Raw, case-insensitive substring match (either direction).
    5. Word overlap: at least half of the normalized search words appear in
       the normalized candidate.

    Empty or whitespace-only names never match. Stages 2, 3 and 5 are skipped
    for names whose normalized form is empty, because an empty string is a
    substring of everything and would otherwise match any candidate.

    Args:
        team_name: Team name to search for.
        candidates: List of candidate team names to match against.

    Returns:
        The matching candidate exactly as it appears in ``candidates``, or
        None if no stage produces a match.
    """
    if not team_name or not candidates:
        return None

    raw_search = team_name.strip().lower()
    if not raw_search:
        return None
    normalized_search = normalize_team_name(team_name)

    # Normalize every usable candidate once instead of once per stage.
    prepared = [
        (candidate, candidate.strip().lower(), normalize_team_name(candidate))
        for candidate in candidates
        if candidate and candidate.strip()
    ]

    # Stage 1: exact raw match. Normalization is lossy, so a candidate that
    # literally equals the query must win over one that merely shares a key.
    for candidate, raw_candidate, _ in prepared:
        if raw_candidate == raw_search:
            return candidate

    if normalized_search:
        # Stage 2: exact normalized match.
        for candidate, _, normalized_candidate in prepared:
            if normalized_candidate and normalized_search == normalized_candidate:
                return candidate

        # Stage 3: normalized substring match (either direction).
        for candidate, _, normalized_candidate in prepared:
            if normalized_candidate and (
                normalized_search in normalized_candidate
                or normalized_candidate in normalized_search
            ):
                return candidate

    # Stage 4: raw substring match (case-insensitive, either direction).
    for candidate, raw_candidate, _ in prepared:
        if raw_search in raw_candidate or raw_candidate in raw_search:
            return candidate

    # Stage 5: word-based matching on the normalized names.
    search_words = set(normalized_search.split())
    if search_words:
        min_overlap = 0.5  # at least half of the search words must be present
        for candidate, _, normalized_candidate in prepared:
            shared = search_words.intersection(normalized_candidate.split())
            if len(shared) / len(search_words) >= min_overlap:
                return candidate

    return None