"""
Input Normalizer
Handles normalization of user input for consistent matching.
Implements a 4-stage normalization pipeline:
1. Trim whitespace
2. Convert to lowercase
3. Unicode normalization (remove diacritics)
4. Collapse multiple spaces
"""
import re
import unicodedata
import logging
from typing import Optional
logger = logging.getLogger(__name__)


class InputNormalizer:
    """
    Normalizes user input for consistent entity matching.

    The normalization process ensures that variations in user input
    (case, whitespace, diacritics) don't prevent successful matching.
    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def normalize(input_text: str) -> str:
        """
        Normalize input text through the 4-stage pipeline.

        Stages: trim, lowercase, strip diacritics (NFD decomposition with
        nonspacing marks removed), collapse whitespace runs to one space.

        Args:
            input_text: Raw user input

        Returns:
            Normalized text with no leading/trailing whitespace. May be an
            empty string when the input consists solely of combining marks
            (use ``is_valid_input`` to detect that case).

        Raises:
            ValueError: If input is None or empty after trimming
        """
        if input_text is None:
            raise ValueError("Input text cannot be None")

        # Stage 1: Trim whitespace
        normalized = input_text.strip()
        if not normalized:
            raise ValueError("Input text cannot be empty after trimming")

        # Stage 2: Convert to lowercase
        normalized = normalized.lower()

        # Stage 3: Unicode normalization (NFD decomposition + remove diacritics)
        # NFD splits accented characters into base + combining marks, which
        # are then dropped. Example: "café" → "cafe", "naïve" → "naive"
        decomposed = unicodedata.normalize('NFD', normalized)
        normalized = ''.join(
            char for char in decomposed
            if unicodedata.category(char) != 'Mn'  # Mn = Mark, Nonspacing
        )

        # Stage 4: Collapse whitespace runs to a single space.
        # Trim again afterwards: a combining mark at either end of the input
        # shields adjacent whitespace from the stage-1 strip, and removing the
        # mark in stage 3 would otherwise leave that whitespace on the edge.
        normalized = re.sub(r'\s+', ' ', normalized).strip()

        # Lazy %-args: the message is only formatted if DEBUG is enabled.
        logger.debug("Normalized '%s' → '%s'", input_text, normalized)
        return normalized

    @staticmethod
    def normalize_safe(input_text: Optional[str]) -> Optional[str]:
        """
        Safely normalize input, returning None if normalization fails.

        Args:
            input_text: Raw user input (can be None)

        Returns:
            Normalized text, or None if the input is None or ``normalize``
            rejects it with ValueError
        """
        # None is an expected "no input" value: return quietly, no warning.
        if input_text is None:
            return None

        try:
            return InputNormalizer.normalize(input_text)
        except ValueError as e:
            logger.warning("Normalization failed for '%s': %s", input_text, e)
            return None

    @staticmethod
    def is_valid_input(input_text: Optional[str]) -> bool:
        """
        Check if input is valid (non-empty after normalization).

        Args:
            input_text: Raw user input (None is treated as invalid)

        Returns:
            True if input normalizes to a non-empty string, False otherwise
        """
        try:
            # normalize() can yield "" (e.g. input made only of combining
            # marks), so the result itself must be checked as well.
            return bool(InputNormalizer.normalize(input_text))
        except ValueError:
            return False

    @staticmethod
    def remove_special_chars(input_text: str, keep_spaces: bool = True) -> str:
        """
        Remove special characters from input.

        Only ASCII letters and digits are kept. With ``keep_spaces`` enabled,
        every whitespace character matched by ``\\s`` (spaces, tabs, newlines)
        is kept as well.

        Args:
            input_text: Text to clean
            keep_spaces: If True, preserve whitespace; if False, remove it

        Returns:
            Text with special characters removed
        """
        if keep_spaces:
            # Keep alphanumeric and whitespace
            return re.sub(r'[^a-zA-Z0-9\s]', '', input_text)
        # Keep only alphanumeric
        return re.sub(r'[^a-zA-Z0-9]', '', input_text)

    @staticmethod
    def truncate(input_text: str, max_length: int = 200) -> str:
        """
        Truncate input to maximum length.

        Args:
            input_text: Text to truncate
            max_length: Maximum allowed length (must be >= 0)

        Returns:
            The input unchanged if it already fits, otherwise its first
            ``max_length`` characters

        Raises:
            ValueError: If max_length is negative
        """
        # A negative bound would make the slice below count from the end of
        # the string and return the wrong text, so reject it explicitly.
        if max_length < 0:
            raise ValueError("max_length cannot be negative")

        original_length = len(input_text)
        if original_length <= max_length:
            return input_text

        logger.warning(
            "Input truncated from %d to %d characters",
            original_length,
            max_length,
        )
        return input_text[:max_length]