Skip to main content
Glama
Dazlarus
by Dazlarus
fuzzy_match.py5.82 kB
""" Fuzzy string matching for OCR results. Handles common OCR errors: - Character substitutions (i/l, 0/O, etc.) - Missing/extra characters - Spacing issues """ from difflib import SequenceMatcher from typing import List, Optional, Tuple def similarity_ratio(s1: str, s2: str) -> float: """ Calculate similarity ratio between two strings (0.0 to 1.0). Uses SequenceMatcher which handles insertions, deletions, and substitutions. """ if not s1 or not s2: return 0.0 return SequenceMatcher(None, s1.lower(), s2.lower()).ratio() def fuzzy_match(target: str, candidates: List[str], threshold: float = 0.7) -> Optional[Tuple[str, float]]: """ Find the best fuzzy match for target in candidates. Args: target: String to search for candidates: List of strings to search in threshold: Minimum similarity ratio to consider a match (0.0-1.0) Returns: Tuple of (matched_string, similarity) or None if no match above threshold """ if not target or not candidates: return None best_match = None best_score = 0.0 target_lower = target.lower() for candidate in candidates: if not candidate: continue candidate_lower = candidate.lower() # Exact match if target_lower == candidate_lower: return (candidate, 1.0) # Check if target is contained in candidate or vice versa if target_lower in candidate_lower or candidate_lower in target_lower: score = len(min(target_lower, candidate_lower, key=len)) / len(max(target_lower, candidate_lower, key=len)) if score > best_score: best_score = score best_match = candidate continue # Fuzzy match using SequenceMatcher score = similarity_ratio(target, candidate) if score > best_score: best_score = score best_match = candidate if best_score >= threshold: return (best_match, best_score) return None def fuzzy_contains(target: str, text: str, threshold: float = 0.75) -> bool: """ Check if target is approximately contained in text. Handles cases like: - "Ensign Karl" in "Ensian Karl" (OCR error) - "Agent Expert" in "Agent Expert Help" - "New chat" in "New chat" """ if not target or not text: return False target_lower = target.lower() text_lower = text.lower() # Exact containment if target_lower in text_lower or text_lower in target_lower: return True # Check similarity of the whole strings if similarity_ratio(target, text) >= threshold: return True # Check if all words from target are approximately in text target_words = target_lower.split() text_words = text_lower.split() if not target_words: return False matched_words = 0 for tw in target_words: # Check each target word against all text words for txt_w in text_words: if tw == txt_w or similarity_ratio(tw, txt_w) >= 0.8: matched_words += 1 break # If most target words matched, consider it a match return matched_words >= len(target_words) * 0.7 def find_best_match_in_list(target: str, items: List[str], threshold: float = 0.7) -> Optional[int]: """ Find the index of the best matching item in a list. Args: target: String to search for items: List of strings to search in threshold: Minimum similarity to consider Returns: Index of best match, or None if no match above threshold """ result = fuzzy_match(target, items, threshold) if result: matched_text, _ = result try: return items.index(matched_text) except ValueError: # Find by lowercase comparison for i, item in enumerate(items): if item.lower() == matched_text.lower(): return i return None # Test if __name__ == '__main__': print("=" * 60) print("FUZZY MATCHING TEST") print("=" * 60) test_cases = [ # (target, candidate, expected_match) ("Ensign Karl", "Ensian Karl", True), # OCR error ("Ensign Karl", "Ensign Karl", True), # Exact ("New chat", "New chat", True), # Exact ("Agent Expert Help", "Agent Expert", True), # Partial ("Library", "Libary", True), # Typo ("Monday", "Mondey", True), # OCR error ("Codex", "Code", False), # Too different ("Search chats", "Search", False), # Partial but key word missing ("New project", "New projct", True), # Missing letter ] for target, candidate, expected in test_cases: score = similarity_ratio(target, candidate) matches = fuzzy_contains(target, candidate) status = "✓" if matches == expected else "✗" print(f" {status} '{target}' ~ '{candidate}': {score:.2f} (match={matches}, expected={expected})") print("\n" + "=" * 60) print("FIND IN LIST TEST") print("=" * 60) ocr_results = ["New chat", "Search chats", "Libary", "Codex", "Explore GPTs", "Mondey", "New projct", "Agent Expert Help", "Ensian Karl"] searches = ["Ensign Karl", "Library", "Monday", "New project", "Agent Expert Help"] for search in searches: idx = find_best_match_in_list(search, ocr_results) if idx is not None: print(f" ✓ '{search}' -> found at idx {idx}: '{ocr_results[idx]}'") else: print(f" ✗ '{search}' -> NOT FOUND")

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Dazlarus/chatgpt-escalation-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server