Keyboard Maestro MCP Server

image_recognition.py•28.9 KiB

"""Advanced image recognition and template matching engine for visual automation. This module implements sophisticated image recognition capabilities including template matching, feature detection, and visual element identification. Enables AI to locate and interact with UI elements through visual recognition. Security: Secure image processing with validation and resource protection. Performance: Optimized matching algorithms with multi-scale and rotation support. Accuracy: Advanced matching techniques with confidence scoring and noise tolerance. """ from __future__ import annotations import asyncio import time from dataclasses import dataclass, field from enum import Enum from typing import Any from src.core.contracts import ensure, require from src.core.either import Either from src.core.logging import get_logger from src.core.visual import ( ElementType, ImageData, ImageMatch, ProcessingError, ScreenRegion, TemplateId, VisualElement, VisualError, generate_template_id, normalize_confidence, validate_image_data, ) logger = get_logger(__name__) class MatchingMethod(Enum): """Image matching methods with different trade-offs.""" TEMPLATE_MATCHING = "template_matching" # Fast, exact matching FEATURE_MATCHING = "feature_matching" # Scale/rotation invariant EDGE_DETECTION = "edge_detection" # Shape-based matching COLOR_HISTOGRAM = "color_histogram" # Color-based matching HYBRID_MATCHING = "hybrid_matching" # Combined approaches SIFT_MATCHING = "sift_matching" # Scale-invariant features ORB_MATCHING = "orb_matching" # Fast feature matching DEEP_LEARNING = "deep_learning" # Neural network based class ImageScale(Enum): """Image scaling strategies for multi-scale matching.""" EXACT = "exact" # No scaling MULTI_SCALE = "multi_scale" # Multiple scale factors ADAPTIVE = "adaptive" # Dynamic scale selection PYRAMID = "pyramid" # Image pyramid approach @dataclass(frozen=True) class MatchingConfig: """Configuration for image matching operations.""" method: MatchingMethod = MatchingMethod.TEMPLATE_MATCHING confidence_threshold: float = 0.8 max_matches: int = 10 scale_tolerance: float = 0.2 # ±20% scale variation rotation_tolerance: float = 15.0 # ±15 degrees rotation enable_multi_scale: bool = True enable_rotation: bool = False noise_tolerance: float = 0.1 # Noise filtering level edge_threshold: float = 50.0 # Edge detection threshold blur_tolerance: bool = True # Handle blurred images partial_matching: bool = True # Allow partial matches def __post_init__(self): """Validate matching configuration.""" if not (0.0 <= self.confidence_threshold <= 1.0): raise ValueError("Confidence threshold must be between 0.0 and 1.0") if not (1 <= self.max_matches <= 100): raise ValueError("Max matches must be between 1 and 100") if not (0.0 <= self.scale_tolerance <= 1.0): raise ValueError("Scale tolerance must be between 0.0 and 1.0") if not (0.0 <= self.rotation_tolerance <= 180.0): raise ValueError("Rotation tolerance must be between 0.0 and 180.0") @dataclass(frozen=True) class ImageTemplate: """Image template for matching operations.""" template_id: TemplateId name: str image_data: ImageData reference_region: ScreenRegion | None = None tags: set[str] = field(default_factory=set) metadata: dict[str, Any] = field(default_factory=dict) created_at: float = field(default_factory=time.time) usage_count: int = 0 last_used: float = field(default_factory=time.time) def __post_init__(self): """Validate template data.""" if len(self.name.strip()) == 0: raise ValueError("Template name cannot be empty") if len(self.image_data) == 0: raise ValueError("Template image data cannot be empty") @property def age_seconds(self) -> float: """Get template age in seconds.""" return time.time() - self.created_at @property def time_since_last_use(self) -> float: """Get time since last use in seconds.""" return time.time() - self.last_used def with_updated_usage(self) -> ImageTemplate: """Create new template with updated usage statistics.""" return ImageTemplate( template_id=self.template_id, name=self.name, image_data=self.image_data, reference_region=self.reference_region, tags=self.tags, metadata=self.metadata, created_at=self.created_at, usage_count=self.usage_count + 1, last_used=time.time(), ) @dataclass(frozen=True) class FeaturePoint: """Detected feature point with descriptors.""" x: int y: int scale: float = 1.0 orientation: float = 0.0 response: float = 0.0 descriptor: list[float] | None = None def __post_init__(self): """Validate feature point.""" if not (self.x >= 0 and self.y >= 0): raise ValueError("Feature coordinates must be non-negative") if not (0.1 <= self.scale <= 10.0): raise ValueError("Feature scale must be between 0.1 and 10.0") @dataclass(frozen=True) class MatchResult: """Detailed matching result with analysis.""" match: ImageMatch method_used: MatchingMethod processing_time_ms: float feature_points: list[FeaturePoint] = field(default_factory=list) quality_metrics: dict[str, float] = field(default_factory=dict) alternative_matches: list[ImageMatch] = field(default_factory=list) @property def is_high_quality(self) -> bool: """Check if match is high quality.""" return ( float(self.match.confidence) >= 0.9 and self.quality_metrics.get("edge_alignment", 0.0) >= 0.8 ) class TemplateCache: """Intelligent caching system for image templates.""" def __init__(self, max_templates: int = 1000, max_age_hours: int = 24): self.templates: dict[TemplateId, ImageTemplate] = {} self.name_index: dict[str, TemplateId] = {} self.tag_index: dict[str, set[TemplateId]] = {} self.max_templates = max_templates self.max_age_seconds = max_age_hours * 3600 def add_template(self, template: ImageTemplate) -> None: """Add template to cache with indexing.""" # Remove old template if exists if template.template_id in self.templates: self._remove_template_indices(template.template_id) # Add new template self.templates[template.template_id] = template self.name_index[template.name] = template.template_id # Update tag index for tag in template.tags: if tag not in self.tag_index: self.tag_index[tag] = set() self.tag_index[tag].add(template.template_id) # Clean cache if needed self._cleanup_cache() logger.debug( f"Added template to cache: {template.name} ({template.template_id})", ) def get_template(self, template_id: TemplateId) -> ImageTemplate | None: """Get template by ID.""" return self.templates.get(template_id) def get_template_by_name(self, name: str) -> ImageTemplate | None: """Get template by name.""" template_id = self.name_index.get(name) if template_id: return self.templates.get(template_id) return None def get_templates_by_tag(self, tag: str) -> list[ImageTemplate]: """Get all templates with specific tag.""" template_ids = self.tag_index.get(tag, set()) return [self.templates[tid] for tid in template_ids if tid in self.templates] def remove_template(self, template_id: TemplateId) -> bool: """Remove template from cache.""" if template_id in self.templates: self._remove_template_indices(template_id) del self.templates[template_id] logger.debug(f"Removed template from cache: {template_id}") return True return False def _remove_template_indices(self, template_id: TemplateId) -> None: """Remove template from all indices.""" template = self.templates.get(template_id) if not template: return # Remove from name index if template.name in self.name_index: del self.name_index[template.name] # Remove from tag index for tag in template.tags: if tag in self.tag_index: self.tag_index[tag].discard(template_id) if not self.tag_index[tag]: del self.tag_index[tag] def _cleanup_cache(self) -> None: """Clean old and unused templates.""" time.time() # Remove expired templates expired_ids = [] for template_id, template in self.templates.items(): if template.age_seconds > self.max_age_seconds: expired_ids.append(template_id) for template_id in expired_ids: self.remove_template(template_id) # Remove least used templates if cache is full if len(self.templates) > self.max_templates: # Sort by usage frequency and recency sorted_templates = sorted( self.templates.items(), key=lambda x: (x[1].usage_count, -x[1].time_since_last_use), ) to_remove = len(self.templates) - self.max_templates for template_id, _ in sorted_templates[:to_remove]: self.remove_template(template_id) def get_cache_stats(self) -> dict[str, Any]: """Get cache statistics.""" return { "total_templates": len(self.templates), "max_templates": self.max_templates, "total_tags": len(self.tag_index), "average_usage": sum(t.usage_count for t in self.templates.values()) / max(len(self.templates), 1), "cache_utilization": len(self.templates) / self.max_templates, } class ImageRecognitionEngine: """Advanced image recognition engine with comprehensive matching capabilities. Provides sophisticated template matching, feature detection, and visual element identification with support for scale, rotation, and noise tolerance. """ def __init__(self, cache_enabled: bool = True, max_cache_size: int = 1000): self.template_cache = TemplateCache(max_cache_size) if cache_enabled else None self.processing_stats = { "total_matches": 0, "successful_matches": 0, "average_processing_time": 0.0, "cache_hits": 0, } logger.info( f"Image Recognition Engine initialized with cache={'enabled' if cache_enabled else 'disabled'}", ) @require(lambda screen_data: len(screen_data) > 0) @require(lambda template_data: len(template_data) > 0) @ensure( lambda result: result.is_right() or isinstance(result.get_left(), VisualError), ) async def find_template_matches( self, screen_data: ImageData, template_data: ImageData, search_region: ScreenRegion | None = None, config: MatchingConfig | None = None, ) -> Either[VisualError, list[MatchResult]]: """Find template matches in screen image with advanced matching techniques. Args: screen_data: Screen image data to search in template_data: Template image to find search_region: Optional region to limit search config: Matching configuration and parameters Returns: Either list of match results or processing error """ try: start_time = time.time() logger.info( f"Starting template matching: {len(screen_data)} bytes screen, {len(template_data)} bytes template", ) # Validate inputs screen_validation = validate_image_data(bytes(screen_data)) if screen_validation.is_left(): return Either.left(screen_validation.get_left()) template_validation = validate_image_data(bytes(template_data)) if template_validation.is_left(): return Either.left(template_validation.get_left()) # Use default config if not provided if config is None: config = MatchingConfig() # Perform matching based on method matches = await self._perform_template_matching( bytes(screen_data), bytes(template_data), search_region, config, ) if matches.is_left(): return matches results = matches.get_right() processing_time = (time.time() - start_time) * 1000 # Update statistics self.processing_stats["total_matches"] += 1 if results: self.processing_stats["successful_matches"] += 1 # Update average processing time old_avg = self.processing_stats["average_processing_time"] total = self.processing_stats["total_matches"] self.processing_stats["average_processing_time"] = ( old_avg * (total - 1) + processing_time ) / total logger.info( f"Template matching completed: {len(results)} matches found in {processing_time:.1f}ms", ) return Either.right(results) except Exception as e: logger.error(f"Template matching failed: {e!s}") return Either.left(ProcessingError(f"Template matching failed: {e!s}")) async def _perform_template_matching( self, _screen_data: bytes, _template_data: bytes, search_region: ScreenRegion | None, config: MatchingConfig, ) -> Either[VisualError, list[MatchResult]]: """Perform the actual template matching (simulation).""" try: # Simulate processing delay based on method processing_delays = { MatchingMethod.TEMPLATE_MATCHING: 0.05, MatchingMethod.FEATURE_MATCHING: 0.15, MatchingMethod.EDGE_DETECTION: 0.10, MatchingMethod.HYBRID_MATCHING: 0.20, MatchingMethod.DEEP_LEARNING: 0.30, } delay = processing_delays.get(config.method, 0.1) await asyncio.sleep(delay) # Simulate match results based on configuration matches = [] # Generate primary match if search_region: # Match found in search region match_x = search_region.x + search_region.width // 4 match_y = search_region.y + search_region.height // 4 match_width = min(100, search_region.width // 2) match_height = min(60, search_region.height // 2) else: # Default match location match_x, match_y = 200, 150 match_width, match_height = 100, 60 # Simulate confidence based on method and config base_confidence = { MatchingMethod.TEMPLATE_MATCHING: 0.85, MatchingMethod.FEATURE_MATCHING: 0.90, MatchingMethod.EDGE_DETECTION: 0.75, MatchingMethod.HYBRID_MATCHING: 0.92, MatchingMethod.DEEP_LEARNING: 0.95, }.get(config.method, 0.80) # Adjust confidence based on config confidence_adjustment = 0.0 if config.enable_multi_scale: confidence_adjustment += 0.05 if config.blur_tolerance: confidence_adjustment += 0.03 if config.partial_matching: confidence_adjustment -= 0.02 final_confidence = normalize_confidence( base_confidence + confidence_adjustment, ) # Only include matches above threshold if float(final_confidence) >= config.confidence_threshold: match_region = ScreenRegion(match_x, match_y, match_width, match_height) primary_match = ImageMatch( found=True, confidence=final_confidence, location=match_region, template_id=generate_template_id(), template_name="primary_match", scale_factor=1.0, rotation_angle=0.0, method=config.method.value, metadata={ "processing_method": config.method.value, "multi_scale_enabled": config.enable_multi_scale, "rotation_enabled": config.enable_rotation, "search_region": search_region.to_dict() if search_region else None, }, ) # Generate feature points for feature-based methods feature_points = [] if config.method in [ MatchingMethod.FEATURE_MATCHING, MatchingMethod.SIFT_MATCHING, ]: for i in range(5): # Simulate 5 feature points feature_points.append( FeaturePoint( x=match_x + i * 20, y=match_y + i * 10, scale=1.0 + i * 0.1, orientation=i * 15.0, response=0.8 + i * 0.04, ), ) # Quality metrics quality_metrics = { "edge_alignment": 0.85, "color_consistency": 0.90, "geometric_stability": 0.88, "noise_robustness": 0.82, } result = MatchResult( match=primary_match, method_used=config.method, processing_time_ms=delay * 1000, feature_points=feature_points, quality_metrics=quality_metrics, alternative_matches=[], ) matches.append(result) # Generate additional matches if multi-scale is enabled if config.enable_multi_scale and len(matches) < config.max_matches: for i in range(min(2, config.max_matches - 1)): alt_confidence = normalize_confidence( float(final_confidence) - 0.1 - i * 0.05, ) if alt_confidence >= config.confidence_threshold: alt_region = ScreenRegion( match_x + 50 + i * 30, match_y + 30 + i * 20, match_width, match_height, ) alt_match = ImageMatch( found=True, confidence=alt_confidence, location=alt_region, template_id=generate_template_id(), template_name=f"alt_match_{i}", scale_factor=1.0 + (i + 1) * 0.1, rotation_angle=0.0, method=config.method.value, ) alt_result = MatchResult( match=alt_match, method_used=config.method, processing_time_ms=delay * 1000, quality_metrics=quality_metrics, ) matches.append(alt_result) return Either.right(matches) except Exception as e: return Either.left( ProcessingError(f"Template matching processing failed: {e!s}"), ) async def detect_ui_elements( self, screen_data: ImageData, region: ScreenRegion, element_types: list[ElementType] | None = None, ) -> Either[VisualError, list[VisualElement]]: """Detect UI elements in screen region using computer vision. Args: screen_data: Screen image data region: Region to analyze element_types: Specific element types to detect Returns: Either list of detected elements or processing error """ try: logger.info(f"Starting UI element detection in region {region.to_dict()}") # Validate inputs screen_validation = validate_image_data(bytes(screen_data)) if screen_validation.is_left(): return Either.left(screen_validation.get_left()) # Default to all element types if not specified if element_types is None: element_types = list(ElementType) # Simulate element detection await asyncio.sleep(0.1) # Processing delay detected_elements = [] # Simulate detection of various UI elements element_configs = [ ( ElementType.BUTTON, 0.9, "Submit", {"clickable": True, "enabled": True}, ), ( ElementType.TEXT_FIELD, 0.85, "", {"editable": True, "placeholder": "Enter text"}, ), (ElementType.DROPDOWN, 0.8, "Select option", {"options_count": 5}), ( ElementType.CHECKBOX, 0.88, "Enable notifications", {"checked": False}, ), (ElementType.MENU_ITEM, 0.82, "File", {"submenu": True}), ] x_offset = region.x y_offset = region.y element_height = 30 element_spacing = 40 for i, (elem_type, confidence, text, props) in enumerate(element_configs): if elem_type in element_types and i < 3: # Limit to 3 elements element_region = ScreenRegion( x=x_offset + 10, y=y_offset + i * element_spacing, width=min(150, region.width - 20), height=element_height, ) element = VisualElement( element_type=elem_type, confidence=normalize_confidence(confidence), location=element_region, text_content=text if text else None, properties=props, accessibility_info={ "role": elem_type.value, "accessible": True, "label": text or f"{elem_type.value}_element", }, visual_state="normal", interactive=True, z_order=i, ) detected_elements.append(element) logger.info( f"UI element detection completed: {len(detected_elements)} elements found", ) return Either.right(detected_elements) except Exception as e: logger.error(f"UI element detection failed: {e!s}") return Either.left( ProcessingError(f"UI element detection failed: {e!s}"), ) def register_template( self, name: str, image_data: ImageData, tags: set[str] | None = None, metadata: dict[str, Any] | None = None, ) -> Either[VisualError, TemplateId]: """Register a new image template for matching.""" try: if not self.template_cache: return Either.left(ProcessingError("Template cache not enabled")) # Validate image data validation = validate_image_data(bytes(image_data)) if validation.is_left(): return Either.left(validation.get_left()) # Create template template_id = generate_template_id() template = ImageTemplate( template_id=template_id, name=name, image_data=image_data, tags=tags or set(), metadata=metadata or {}, ) # Add to cache self.template_cache.add_template(template) logger.info(f"Registered new template: {name} ({template_id})") return Either.right(template_id) except Exception as e: logger.error(f"Template registration failed: {e!s}") return Either.left( ProcessingError(f"Template registration failed: {e!s}"), ) def get_template(self, template_id: TemplateId) -> ImageTemplate | None: """Get registered template by ID.""" if self.template_cache: return self.template_cache.get_template(template_id) return None def find_template_by_name(self, name: str) -> ImageTemplate | None: """Find registered template by name.""" if self.template_cache: return self.template_cache.get_template_by_name(name) return None def get_templates_by_tag(self, tag: str) -> list[ImageTemplate]: """Get all templates with specific tag.""" if self.template_cache: return self.template_cache.get_templates_by_tag(tag) return [] def get_processing_stats(self) -> dict[str, Any]: """Get image recognition processing statistics.""" stats = self.processing_stats.copy() if self.template_cache: stats.update({"cache_stats": self.template_cache.get_cache_stats()}) return stats def clear_template_cache(self) -> None: """Clear all cached templates.""" if self.template_cache: self.template_cache = TemplateCache(self.template_cache.max_templates) logger.info("Template cache cleared") # Convenience functions for common image recognition operations async def find_button_by_text( screen_data: ImageData, button_text: str, search_region: ScreenRegion | None = None, ) -> Either[VisualError, list[VisualElement]]: """Find buttons containing specific text.""" engine = ImageRecognitionEngine() elements_result = await engine.detect_ui_elements( screen_data, search_region or ScreenRegion(0, 0, 1920, 1080), [ElementType.BUTTON], ) if elements_result.is_left(): return elements_result elements = elements_result.get_right() matching_buttons = [ elem for elem in elements if elem.text_content and button_text.lower() in elem.text_content.lower() ] return Either.right(matching_buttons) async def find_template_with_tolerance( screen_data: ImageData, template_data: ImageData, confidence_threshold: float = 0.7, search_region: ScreenRegion | None = None, ) -> Either[VisualError, list[ImageMatch]]: """Find template with configurable tolerance settings.""" config = MatchingConfig( confidence_threshold=confidence_threshold, enable_multi_scale=True, scale_tolerance=0.3, rotation_tolerance=10.0, blur_tolerance=True, partial_matching=True, ) engine = ImageRecognitionEngine() results = await engine.find_template_matches( screen_data, template_data, search_region, config, ) if results.is_left(): return Either.left(results.get_left()) matches = [result.match for result in results.get_right()] return Either.right(matches) def create_matching_config_for_ui(element_type: ElementType) -> MatchingConfig: """Create optimized matching configuration for specific UI element types.""" base_config = MatchingConfig() if element_type == ElementType.BUTTON: return MatchingConfig( method=MatchingMethod.HYBRID_MATCHING, confidence_threshold=0.8, enable_multi_scale=True, scale_tolerance=0.2, edge_threshold=30.0, blur_tolerance=True, ) if element_type == ElementType.TEXT_FIELD: return MatchingConfig( method=MatchingMethod.EDGE_DETECTION, confidence_threshold=0.75, enable_multi_scale=False, edge_threshold=20.0, partial_matching=True, ) if element_type == ElementType.ICON: return MatchingConfig( method=MatchingMethod.FEATURE_MATCHING, confidence_threshold=0.85, enable_multi_scale=True, enable_rotation=True, scale_tolerance=0.3, rotation_tolerance=15.0, ) return base_config

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Nexus-Digital-Automations/Keyboard-Maestro-MCP-2'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

image_recognition.py•28.9 KiB