OmniMCP

OmniMCP
omnimcp

types.py•12.4 kB

# omnimcp/types.py import time from dataclasses import dataclass, field from typing import List, Optional, Dict, Any, Tuple, Literal from loguru import logger from pydantic import BaseModel, Field, field_validator, ValidationInfo # Define Bounds (assuming normalized coordinates 0.0-1.0) Bounds = Tuple[float, float, float, float] # (x, y, width, height) # --- Core Data Structures (Using Dataclasses as provided) --- @dataclass class UIElement: """Represents a UI element detected in a single frame.""" # Per-frame ID assigned by parser/mapper id: int type: str # button, text_field, checkbox, link, text, etc. content: str # Text content or accessibility label bounds: Bounds # Normalized coordinates (x, y, width, height) confidence: float = 1.0 attributes: Dict[str, Any] = field(default_factory=dict) # e.g., {'checked': False} def to_dict(self) -> Dict[str, Any]: """Convert UIElement to a dictionary.""" return { "id": self.id, "type": self.type, "content": self.content, "bounds": self.bounds, "confidence": self.confidence, "attributes": self.attributes, } def to_prompt_repr(self) -> str: """Concise string representation suitable for LLM prompts.""" bound_str = ( f"({self.bounds[0]:.3f}, {self.bounds[1]:.3f}, " f"{self.bounds[2]:.3f}, {self.bounds[3]:.3f})" ) content_preview = ( (self.content[:30] + "...") if len(self.content) > 33 else self.content ) # Avoid newlines in prompt list content_preview = content_preview.replace("\n", " ") type_lower = self.type.lower() if isinstance(self.type, str) else "unknown" return ( f"ID: {self.id}, Type: {type_lower}, " f"Content: '{content_preview}', Bounds: {bound_str}" ) def short_repr(self) -> str: """Provides a short representation using the per-frame ID.""" content_preview = self.content[:25].replace("\n", " ") if len(self.content) > 25: content_preview += "..." type_lower = self.type.lower() if isinstance(self.type, str) else "unknown" return f"ID {self.id} ({type_lower} '{content_preview}')" @dataclass class ScreenState: """Represents the raw state of the screen at a point in time.""" elements: List[UIElement] dimensions: Tuple[int, int] # Actual pixel dimensions timestamp: float # --- Action / Interaction Results (Using Dataclasses) --- @dataclass class ActionVerification: """Optional verification data for an action's effect.""" success: bool before_state: bytes # Screenshot bytes after_state: bytes # Screenshot bytes changes_detected: List[Bounds] # Regions where changes occurred confidence: float @dataclass class InteractionResult: """Generic result of an interaction attempt.""" success: bool element: Optional[UIElement] # The element interacted with, if applicable error: Optional[str] = None context: Dict[str, Any] = field(default_factory=dict) verification: Optional[ActionVerification] = None @dataclass class ScrollResult(InteractionResult): """Result specific to a scroll action.""" scroll_amount: float = 0.0 @dataclass class TypeResult(InteractionResult): """Result specific to typing text.""" text_entered: str = "" # --- Error / Debug Context (Using Dataclasses) --- @dataclass class ToolError: """Rich error information, potentially for MCP tools or agent errors.""" message: str visual_context: Optional[bytes] # Screenshot bytes attempted_action: str element_description: str # Description or ID of intended target recovery_suggestions: List[str] @dataclass class DebugContext: """Context for debugging a specific operation or tool call.""" tool_name: str inputs: Dict[str, Any] result: Any duration: float visual_state: Optional[ScreenState] # Raw screen state at the time error: Optional[Dict] = None # e.g., ToolError as dict def save_snapshot(self, path: str) -> None: """Save debug snapshot for analysis.""" # Implementation would involve serializing state/context to a file logger.warning("DebugContext.save_snapshot not yet implemented.") # --- LLM Plan / Action Structures (Using Pydantic for Validation) --- class LLMActionPlan(BaseModel): """ Defines the structured output expected from the LLM for basic action planning. This might be superseded by ActionDecision but serves as the current target. """ reasoning: str = Field( ..., description="Step-by-step thinking process leading to the chosen action." ) action: Literal["click", "type", "scroll", "press_key"] = Field( ..., description="The single next action to perform." ) is_goal_complete: bool = Field( ..., description="Set to true if the user's overall goal is fully achieved by the current state, false otherwise.", ) element_id: Optional[int] = Field( default=None, description="The per-frame ID of the target UI element IF the action is 'click' or 'type' and goal is not complete. Must be null otherwise.", ) text_to_type: Optional[str] = Field( default=None, description="Text to type IF action is 'type' and goal is not complete. Must be null otherwise.", ) key_info: Optional[str] = Field( default=None, description="Key or shortcut to press IF action is 'press_key' and goal is not complete (e.g., 'Enter', 'Cmd+Space'). Must be null otherwise.", ) @field_validator("element_id") @classmethod def check_element_id(cls, v: Optional[int], info: ValidationInfo) -> Optional[int]: # Skip validation if goal is already complete if info.data.get("is_goal_complete", False): return v action = info.data.get("action") if action == "click" and v is None: raise ValueError( "element_id is required for action 'click' when goal is not complete" ) if action in ["scroll", "press_key"] and v is not None: raise ValueError( f"element_id must be null for action '{action}' when goal is not complete" ) return v @field_validator("text_to_type") @classmethod def check_text_to_type( cls, v: Optional[str], info: ValidationInfo ) -> Optional[str]: if info.data.get("is_goal_complete", False): return v action = info.data.get("action") if action == "type" and v is None: raise ValueError( "text_to_type (even empty string) is required for action 'type' when goal is not complete" ) if action != "type" and v is not None: raise ValueError( "text_to_type must be null for actions other than 'type' when goal is not complete" ) return v @field_validator("key_info") @classmethod def check_key_info(cls, v: Optional[str], info: ValidationInfo) -> Optional[str]: if info.data.get("is_goal_complete", False): return v action = info.data.get("action") if action == "press_key" and v is None: raise ValueError( "key_info is required for action 'press_key' when goal is not complete" ) if action != "press_key" and v is not None: raise ValueError( "key_info must be null for actions other than 'press_key' when goal is not complete" ) return v # --- Models for Tracking and Advanced Planning (Issue #8) --- class ElementTrack(BaseModel): """Tracking information for a UI element across frames, managed by SimpleElementTracker.""" track_id: str = Field( description="Persistent tracking ID assigned by the tracker (e.g., 'track_0')" ) # Storing Optional[UIElement] (dataclass) directly in Pydantic model works latest_element: Optional[UIElement] = Field( None, description="The UIElement dataclass instance detected in the current frame, if any.", ) consecutive_misses: int = Field( 0, description="Number of consecutive frames this element track was not detected.", ) last_seen_frame: int = Field( 0, description="The frame number when this track was last successfully detected.", ) def short_repr(self) -> str: """Short representation for prompt, using persistent track_id.""" status = ( "VISIBLE" if self.latest_element else f"MISSING({self.consecutive_misses})" ) if self.latest_element: # Use the short_repr from the underlying UIElement dataclass element_repr = self.latest_element.short_repr() # Gets ID, type, content return f"TrackID {self.track_id} [{element_repr}] - Status: {status}, LastSeen: f{self.last_seen_frame}" else: # If missing, we don't know the type/content from this object alone return f"TrackID {self.track_id} (Type Unknown) - Status: {status}, LastSeen: f{self.last_seen_frame}" class ScreenAnalysis(BaseModel): """LLM's analysis of the current UI state with tracking information.""" reasoning: str = Field( description="Detailed reasoning about the UI state, changes, and tracked elements relevant to the goal." ) disappeared_elements: List[str] = Field( default_factory=list, description="List of track_ids considered permanently gone.", ) temporarily_missing_elements: List[str] = Field( default_factory=list, description="List of track_ids considered temporarily missing but likely to reappear.", ) new_elements: List[str] = Field( default_factory=list, description="List of track_ids for newly appeared elements.", ) critical_elements_status: Dict[str, str] = Field( default_factory=dict, description="Status (e.g., 'Visible', 'Missing', 'Gone') of track_ids deemed critical for the current goal/step.", ) class ActionDecision(BaseModel): """LLM's decision on the next action based on its analysis.""" analysis_reasoning: str = Field( description="Reference or summary of the reasoning from ScreenAnalysis leading to this action." ) action_type: str = Field( description="The type of action to perform (e.g., 'click', 'type', 'press_key', 'wait', 'finish')." ) target_element_id: Optional[int] = Field( None, description="The CURRENT per-frame 'id' of the target UIElement, if applicable and visible.", ) parameters: Dict[str, Any] = Field( default_factory=dict, description="Action parameters, e.g., {'text_to_type': 'hello', 'key_info': 'Enter'}", ) is_goal_complete: bool = Field( False, description="Set to true if the overall user goal is now complete." ) # --- Model for Structured Step Logging --- class LoggedStep(BaseModel): """Structure for logging data for a single agent step.""" step_index: int timestamp: float = Field(default_factory=time.time) goal: str screenshot_path: Optional[str] = None # Relative path within run dir # Inputs to Planner input_elements_count: int # Store list of dicts for JSON serialization compatibility tracking_context: Optional[List[Dict]] = Field( None, description="Snapshot of ElementTrack data (as dicts) provided to LLM" ) action_history_at_step: List[str] # Planner Outputs (Store as dicts) llm_analysis: Optional[Dict] = Field( None, description="ScreenAnalysis output from LLM" ) llm_decision: Optional[Dict] = Field( None, description="ActionDecision output from LLM" ) raw_llm_action_plan: Optional[Dict] = Field( None, description="LLMActionPlan if ActionDecision not yet implemented" ) # Execution executed_action: str executed_target_element_id: Optional[int] = None executed_parameters: Dict[str, Any] action_success: bool # Metrics perception_time_s: float planning_time_s: float execution_time_s: float step_time_s: float

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/OpenAdaptAI/OmniMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server