Ubuntu Desktop Control MCP

Overview Schema Related Servers Score Discussions

ubuntu-desktop-control-mcp
ubuntu_desktop_control

server.py•106 KiB

#!/usr/bin/env python3 """ Ubuntu Desktop Control MCP Server Provides tools for LLMs to control Ubuntu desktop via screenshots and mouse clicks. Designed for X11-based GNOME desktop environments. Requirements: - Python 3.9+ - X11 display server - System packages: python3-xlib, scrot """ import os import shutil import textwrap import time from datetime import datetime from typing import Any, Dict, List, Optional, Tuple # pyautogui tries to connect to the display at import time. On headless systems # or when permissions are restricted, that import can raise before the MCP # server finishes initializing. We lazily import it inside the tools instead so # the server can start and report a clear error to the caller. _pyautogui = None _pyautogui_error: Optional[str] = None # Default folder inside workspace to save artifacts clients can read _CAPTURE_DIR_NAME = "captures" # Element cache (invalidated by TTL or focus change) _ELEMENT_CACHE: Optional[Dict[str, Any]] = None _ELEMENT_CACHE_TTL = 5.0 def _default_output_dir() -> str: """Return a workspace-local capture directory, creating it if missing.""" base = os.getcwd() path = os.path.join(base, _CAPTURE_DIR_NAME) os.makedirs(path, exist_ok=True) return path def _get_pyautogui(): """Load pyautogui lazily and capture any import/display errors.""" global _pyautogui, _pyautogui_error if _pyautogui or _pyautogui_error: return _pyautogui try: import pyautogui as _pyautogui_module _pyautogui = _pyautogui_module return _pyautogui except Exception as exc: # noqa: BLE001 - surface any startup issue _pyautogui_error = ( "PyAutoGUI unavailable. Ensure an X11 display is accessible: " f"{exc}" ) return None from mcp.server.fastmcp import FastMCP from pydantic import BaseModel, Field # Initialize FastMCP server mcp = FastMCP("ubuntu-desktop-control") def _safe_display_server() -> str: """Return the display server name with a sane default.""" return os.environ.get("XDG_SESSION_TYPE", "unknown").lower() # Non-fatal warnings about the runtime environment def _collect_env_warnings() -> List[str]: """Collect non-fatal environment warnings to return with results.""" warnings: List[str] = [] display_server = _safe_display_server() if display_server != "x11": warnings.append( f"Display server is '{display_server}'. PyAutoGUI works best on X11; Wayland may block screenshots/clicks." ) if not shutil.which("gnome-screenshot") and not shutil.which("scrot"): warnings.append( "Neither gnome-screenshot nor scrot found; PyAutoGUI may fail to capture screenshots." ) return warnings def _prompt_text(template: str) -> str: """Dedent and strip prompt template text for registration.""" return textwrap.dedent(template).strip() # Cache for scaling factor detection (factor, logical size, actual size) _scaling_factor_cache: Optional[Tuple[float, Tuple[int, int], Tuple[int, int]]] = None # Short-term caches to avoid hammering diagnostics/screen info when clients poll aggressively _DIAG_CACHE: Optional[Tuple[float, Any]] = None # (timestamp, DiagnosticInfo) _SCREEN_INFO_CACHE: Optional[Tuple[float, Any]] = None # (timestamp, ScreenInfo) def _clamp(value: float, min_value: float, max_value: float) -> float: return max(min_value, min(max_value, value)) def _safe_percent(numerator: float, denominator: float) -> float: if denominator <= 0: return 0.0 return _clamp(numerator / denominator, 0.0, 1.0) def _prefer_atspi_actions() -> bool: """Return True when AT-SPI action clicks should be attempted.""" raw = os.environ.get("UDC_FORCE_COORDS", "").strip().lower() return raw not in {"1", "true", "yes"} def _get_active_app_name() -> Optional[str]: """Best-effort active application name for cache invalidation.""" try: import pyatspi desktop = pyatspi.Registry.getDesktop(0) for i in range(desktop.childCount): try: app = desktop.getChildAtIndex(i) state = app.getState() if state.contains(pyatspi.STATE_ACTIVE) or state.contains(pyatspi.STATE_FOCUSED): name = getattr(app, "name", None) return name or "active-app" except Exception: continue except Exception: return None return None def _resolve_atspi_path(path: List[int]): """Resolve an AT-SPI element from a desktop path.""" import pyatspi desktop = pyatspi.Registry.getDesktop(0) node = desktop for idx in path: node = node.getChildAtIndex(idx) return node def _attempt_atspi_action( element_data: Dict[str, Any], clicks: int, ) -> Tuple[bool, Optional[str]]: """Attempt to click via AT-SPI action interface. Returns success, warning.""" if not _prefer_atspi_actions(): return False, "AT-SPI action disabled via UDC_FORCE_COORDS" path = element_data.get("atspi_path") if not path: return False, "No AT-SPI path available for element" try: import pyatspi except Exception as exc: return False, f"AT-SPI unavailable: {exc}" try: node = _resolve_atspi_path(path) except Exception as exc: return False, f"AT-SPI path no longer valid: {exc}" try: state = node.getState() if state.contains(pyatspi.STATE_DEFUNCT): return False, "AT-SPI element defunct" except Exception: pass action = getattr(node, "action", None) if action: try: action_count = action.nActions action_index = None preferred = {"click", "press", "activate", "open"} for i in range(action_count): name = action.getName(i).lower() if name in preferred: action_index = i break if action_index is None and action_count > 0: action_index = 0 if action_index is not None: for _ in range(max(1, clicks)): action.doAction(action_index) return True, f"AT-SPI action '{action.getName(action_index)}' executed" except Exception as exc: return False, f"AT-SPI action failed: {exc}" component = getattr(node, "component", None) if component: try: component.grabFocus() return True, "AT-SPI focus set (no action interface)" except Exception as exc: return False, f"AT-SPI focus failed: {exc}" return False, "AT-SPI action interface not available" def _get_mss_monitors() -> Tuple[Optional[List[Dict[str, int]]], Optional[str]]: """Return mss monitor list and optional warning.""" try: import mss # type: ignore with mss.mss() as sct: return list(sct.monitors), None except Exception as exc: # noqa: BLE001 return None, f"mss monitor query failed: {exc}" def _get_monitor_geometry( monitor_index: Optional[int], pyautogui_module=None, ) -> Tuple[Optional[Dict[str, int]], Optional[str]]: """Return logical monitor geometry for the given 0-based index.""" if monitor_index is None: return None, None monitors, warning = _get_mss_monitors() if not monitors: return None, warning or "Monitor info unavailable" mss_index = monitor_index + 1 if mss_index >= len(monitors): return None, f"Monitor index {monitor_index} out of range" bounds = dict(monitors[mss_index]) if pyautogui_module is not None: try: logical_w, logical_h = pyautogui_module.size() physical_full = _get_fullscreen_physical_size() if physical_full: scaling_factor, scaling_warning = _detect_scaling_factor( pyautogui_module, logical_size=(logical_w, logical_h), actual_size=physical_full, ) if scaling_warning: warning = f"{warning}; {scaling_warning}" if warning else scaling_warning if scaling_factor not in (None, 0, 1.0): bounds = { "left": int(bounds["left"] / scaling_factor), "top": int(bounds["top"] / scaling_factor), "width": int(bounds["width"] / scaling_factor), "height": int(bounds["height"] / scaling_factor), } except Exception: pass return bounds, warning def _get_fullscreen_physical_size() -> Optional[Tuple[int, int]]: monitors, _ = _get_mss_monitors() if monitors: full = monitors[0] return full["width"], full["height"] return None def _element_cache_to_xy( *, elem: Dict[str, Any], screen_width: int, screen_height: int, monitor_bounds: Optional[Dict[str, int]] = None, ) -> Tuple[int, int]: """Resolve a cached element to logical (x, y) click coordinates. Supports both legacy cache entries (x/y) and new cache entries (x_percent/y_percent). """ if "x_percent" in elem and "y_percent" in elem: x_percent = float(elem["x_percent"]) y_percent = float(elem["y_percent"]) x_percent = _clamp(x_percent, 0.0, 1.0) y_percent = _clamp(y_percent, 0.0, 1.0) if monitor_bounds: left = monitor_bounds["left"] top = monitor_bounds["top"] width = monitor_bounds["width"] height = monitor_bounds["height"] return int(left + width * x_percent), int(top + height * y_percent) return int(screen_width * x_percent), int(screen_height * y_percent) return int(elem.get("x", 0)), int(elem.get("y", 0)) def _get_cached_elements(pyautogui_module): """Return element cache if screen size matches; otherwise invalidate.""" global _ELEMENT_CACHE cache = _ELEMENT_CACHE if not cache: return None, None captured_at = cache.get("captured_at") if captured_at is None or time.monotonic() - captured_at > _ELEMENT_CACHE_TTL: _ELEMENT_CACHE = None return None, None try: logical_w, logical_h = pyautogui_module.size() except Exception: logical_w, logical_h = None, None # New-style cache has meta and elements if isinstance(cache, dict) and "elements" in cache and "meta" in cache: meta = cache.get("meta", {}) if logical_w and logical_h: if meta.get("logical_width") != logical_w or meta.get("logical_height") != logical_h: _ELEMENT_CACHE = None return None, None active_app = _get_active_app_name() cached_app = meta.get("active_app_name") if active_app and cached_app and active_app != cached_app: _ELEMENT_CACHE = None return None, None return cache.get("elements", {}), meta # Legacy cache: element_id -> data if isinstance(cache, dict): return cache, None return None, None def _build_preferred_targets( *, elements: List["AccessibleElement"], element_map: Dict[int, Dict[str, Any]], logical_width: int, logical_height: int, limit: int = 5, ) -> List[Dict[str, Any]]: """Rank likely targets (e.g., Firefox) to enable one-shot clicks.""" keywords = ["firefox", "browser", "mozilla", "chrome", "web"] preferred_roles = {"icon", "push button", "menu item", "list item"} ranked: List[Tuple[float, Dict[str, Any]]] = [] for elem in elements: cached = element_map.get(elem.id, {}) name = (cached.get("name") or elem.name or "").strip() role = (cached.get("role") or elem.role or "").strip().lower() lname = name.lower() score = 0.0 for kw in keywords: if kw in lname: score += 10.0 if lname.startswith(kw): score += 2.0 if role in preferred_roles: score += 3.0 if role and role.startswith("app"): score += 1.0 x_percent = cached.get("x_percent") y_percent = cached.get("y_percent") if x_percent is None: x_percent = _safe_percent(elem.center_x, logical_width) if y_percent is None: y_percent = _safe_percent(elem.center_y, logical_height) # Slight preference for mid-sized targets area = max(1, cached.get("width") or elem.width) * max(1, cached.get("height") or elem.height) score += min(area / 20000.0, 5.0) ranked.append( ( score, { "id": elem.id, "name": name, "role": role, "x_percent": x_percent, "y_percent": y_percent, }, ) ) ranked.sort(key=lambda t: (-t[0], t[1]["id"])) return [item for _, item in ranked[:limit] if _clamp(item["x_percent"], 0.0, 1.0) >= 0.0] def _detect_scaling_factor( pyautogui_module, logical_size: Optional[Tuple[int, int]] = None, actual_size: Optional[Tuple[int, int]] = None, force_refresh: bool = False, ) -> Tuple[float, Optional[str]]: """ Detect display scaling factor by comparing logical vs physical screenshot size. Returns a tuple of (factor, warning). Warning is None when no issues. """ global _scaling_factor_cache if _scaling_factor_cache and not force_refresh and logical_size is None and actual_size is None: return _scaling_factor_cache[0], None try: logical_w, logical_h = logical_size or pyautogui_module.size() # Take a quick screenshot if caller didn't provide one if actual_size is None: actual_w, actual_h = pyautogui_module.screenshot().size else: actual_w, actual_h = actual_size if logical_w <= 0 or logical_h <= 0: return 1.0, "Logical screen size reported as zero; assuming 1.0x scaling" factor_w = actual_w / logical_w factor_h = actual_h / logical_h factor = round((factor_w + factor_h) / 2.0, 4) # Cache result with sizes to avoid repeated screenshots _scaling_factor_cache = (factor, (logical_w, logical_h), (actual_w, actual_h)) warning = None if abs(factor - 1.0) > 0.01: warning = ( f"Display scaling detected ({factor:.2f}x). Logical: " f"{logical_w}x{logical_h}, Screenshot: {actual_w}x{actual_h}." ) return factor, warning except Exception as exc: # noqa: BLE001 - propagate as warning only return 1.0, f"Scaling detection failed; assuming 1.0x: {exc}" def _get_screenshot_with_backend(pyautogui_module, monitor_index: Optional[int] = None): """ Capture a screenshot using mss if available (faster), else fall back to PyAutoGUI. Returns (image, width, height, warning, origin) """ try: import mss # type: ignore from PIL import Image with mss.mss() as sct: if monitor_index is not None: mss_index = monitor_index + 1 if mss_index >= len(sct.monitors): return None, 0, 0, f"Monitor index {monitor_index} out of range", None monitor = sct.monitors[mss_index] else: monitor = sct.monitors[0] raw = sct.grab(monitor) img = Image.frombytes("RGB", raw.size, raw.bgra, "raw", "BGRX") origin = {"left": monitor["left"], "top": monitor["top"]} return img, raw.width, raw.height, None, origin except Exception as exc: # noqa: BLE001 warning = None if "mss" in str(exc).lower(): warning = f"mss capture failed, using PyAutoGUI screenshot: {exc}" image = pyautogui_module.screenshot() width, height = image.size return image, width, height, warning, {"left": 0, "top": 0} class ScreenshotResult(BaseModel): """Result of a screenshot operation""" success: bool file_path: Optional[str] = Field(None, description="Absolute path to screenshot file") grid_overlay_path: Optional[str] = Field(None, description="Path to screenshot with coordinate grid overlay") full_screenshot_path: Optional[str] = Field(None, description="Path to full screenshot (when quadrants generated)") top_left_path: Optional[str] = Field(None, description="Path to top-left quadrant") top_right_path: Optional[str] = Field(None, description="Path to top-right quadrant") bottom_left_path: Optional[str] = Field(None, description="Path to bottom-left quadrant") bottom_right_path: Optional[str] = Field(None, description="Path to bottom-right quadrant") width: Optional[int] = Field(None, description="Screen width in logical pixels (reported by OS)") height: Optional[int] = Field(None, description="Screen height in logical pixels (reported by OS)") actual_width: Optional[int] = Field(None, description="Actual screenshot width in physical pixels") actual_height: Optional[int] = Field(None, description="Actual screenshot height in physical pixels") scaling_factor: Optional[float] = Field(None, description="Detected display scaling factor (1.0 = no scaling)") scaling_warning: Optional[str] = Field(None, description="Warning if scaling mismatch detected") quadrant_info: Optional[str] = Field(None, description="Explanation of quadrant coordinate system") warnings: Optional[List[str]] = Field(None, description="Non-fatal warnings about environment or scaling") error: Optional[str] = Field(None, description="Error message if operation failed") class MouseClickResult(BaseModel): """Result of a mouse click operation""" success: bool x: int = Field(description="X coordinate where click was performed (logical pixels)") y: int = Field(description="Y coordinate where click was performed (logical pixels)") button: str = Field(description="Mouse button that was clicked") clicks: int = Field(description="Number of clicks performed") applied_scaling: Optional[float] = Field(None, description="Scaling factor auto-applied to incoming coordinates") scaling_warning: Optional[str] = Field(None, description="Warning if display scaling may affect accuracy") warnings: Optional[List[str]] = Field(None, description="Non-fatal warnings about environment or scaling") error: Optional[str] = Field(None, description="Error message if operation failed") class ScreenInfo(BaseModel): """Information about the screen/display""" success: bool width: Optional[int] = Field(None, description="Screen width in pixels") height: Optional[int] = Field(None, description="Screen height in pixels") display_server: str = Field(description="Display server type (X11 or Wayland)") scaling_factor: Optional[float] = Field(None, description="Detected display scaling factor (1.0 = no scaling)") monitors: Optional[List[Dict[str, int]]] = Field(None, description="Monitor list from mss (index 0 is virtual screen)") warnings: Optional[List[str]] = Field(None, description="Non-fatal warnings about environment or scaling") error: Optional[str] = Field(None, description="Error message if operation failed") class DiagnosticInfo(BaseModel): """Detailed diagnostic information for troubleshooting coordinate issues""" success: bool logical_width: Optional[int] = Field(None, description="Screen width in logical pixels (what PyAutoGUI reports)") logical_height: Optional[int] = Field(None, description="Screen height in logical pixels (what PyAutoGUI reports)") actual_screenshot_width: Optional[int] = Field(None, description="Actual screenshot width in physical pixels") actual_screenshot_height: Optional[int] = Field(None, description="Actual screenshot height in physical pixels") scaling_factor: Optional[float] = Field(None, description="Detected scaling factor (actual/logical)") has_scaling_mismatch: bool = Field(description="True if scaling != 1.0") display_server: str = Field(description="Display server type (x11, wayland, unknown)") recommendation: Optional[str] = Field(None, description="Recommendation for fixing coordinate issues") warnings: Optional[List[str]] = Field(None, description="Non-fatal warnings about environment or scaling") error: Optional[str] = Field(None, description="Error message if operation failed") class QuadrantScreenshotResult(BaseModel): """Result of a quadrant screenshot operation""" success: bool full_screenshot_path: Optional[str] = Field(None, description="Path to full screenshot") top_left_path: Optional[str] = Field(None, description="Path to top-left quadrant (x: 0 to mid_x, y: 0 to mid_y)") top_right_path: Optional[str] = Field(None, description="Path to top-right quadrant (x: mid_x to max_x, y: 0 to mid_y)") bottom_left_path: Optional[str] = Field(None, description="Path to bottom-left quadrant (x: 0 to mid_x, y: mid_y to max_y)") bottom_right_path: Optional[str] = Field(None, description="Path to bottom-right quadrant (x: mid_x to max_x, y: mid_y to max_y)") scaling_factor: Optional[float] = Field(None, description="Display scaling factor") quadrant_info: Optional[str] = Field(None, description="Explanation of quadrant coordinate system") warnings: Optional[List[str]] = Field(None, description="Non-fatal warnings about environment or scaling") error: Optional[str] = Field(None, description="Error message if operation failed") class CoordinateConversionResult(BaseModel): """Result of converting screenshot coordinates to logical click coordinates""" success: bool screenshot_x: int = Field(description="X coordinate in screenshot pixels") screenshot_y: int = Field(description="Y coordinate in screenshot pixels") logical_x: Optional[int] = Field(None, description="Converted X coordinate for click_screen") logical_y: Optional[int] = Field(None, description="Converted Y coordinate for click_screen") scaling_factor: Optional[float] = Field(None, description="Scaling factor used for conversion") warnings: Optional[List[str]] = Field(None, description="Non-fatal warnings about environment or scaling") error: Optional[str] = Field(None, description="Error message if conversion failed") class GUIElement(BaseModel): """Represents a detected UI element with its bounding box and center.""" id: int x: int y: int width: int height: int center_x: int center_y: int area: int class GUIElementMapResult(BaseModel): """Result of the GUI element mapping operation.""" success: bool elements: List[GUIElement] = Field(default_factory=list, description="List of detected UI elements") count: int = Field(description="Total number of elements detected") debug_image_path: Optional[str] = Field(None, description="Path to the debug image with drawn contours") screenshot_path: Optional[str] = Field(None, description="Path to the screenshot used for detection") scaling_factor: Optional[float] = Field(None, description="Detected display scaling factor") coordinates: Optional[str] = Field(None, description="Coordinate space for x/y (logical or physical)") warnings: Optional[List[str]] = Field(None, description="Non-fatal warnings") error: Optional[str] = Field(None, description="Error message if operation failed") class AccessibleElement(BaseModel): """Represents a UI element from accessibility API.""" id: int name: str role: str x: int y: int width: int height: int center_x: int center_y: int is_clickable: bool children_count: int = 0 class AnnotatedScreenshot(BaseModel): """Screenshot with detected elements and annotations.""" success: bool screenshot_path: str = Field(description="Path to annotated screenshot with numbered elements") original_path: Optional[str] = Field(None, description="Path to original full-resolution screenshot") elements: List[AccessibleElement] = Field(default_factory=list, description="Detected accessible elements") element_map: Dict[int, Dict] = Field(default_factory=dict, description="Map of element ID to coordinates and metadata") display_width: int = Field(description="Width used for display (downsampled)") display_height: int = Field(description="Height used for display (downsampled)") actual_width: int = Field(description="Actual screen width") actual_height: int = Field(description="Actual screen height") monitor_index: Optional[int] = Field(None, description="Monitor index used for capture (0-based)") monitor_origin: Optional[Dict[str, int]] = Field(None, description="Physical origin for monitor capture") scaling_info: str = Field(description="Explanation of coordinate scaling") suggested_targets: Optional[List[Dict[str, Any]]] = Field( None, description="Top recommended elements to click (id, name, role, x_percent, y_percent)", ) warnings: Optional[List[str]] = Field(None, description="Non-fatal warnings") error: Optional[str] = Field(None, description="Error message if operation failed") class WorkflowAction(BaseModel): """Single action in a workflow.""" action: str = Field(description="Action type: screenshot, click, move, type, wait") element_id: Optional[int] = Field(None, description="Element ID to interact with") x_percent: Optional[float] = Field(None, description="X coordinate as percentage (0.0-1.0)") y_percent: Optional[float] = Field(None, description="Y coordinate as percentage (0.0-1.0)") text: Optional[str] = Field(None, description="Text to type") duration: Optional[float] = Field(None, description="Wait duration or animation duration") button: Optional[str] = Field(None, description="Mouse button: left, right, middle") monitor_index: Optional[int] = Field(None, description="Monitor index (0-based) for clicks/moves/screenshot") class WorkflowResult(BaseModel): """Result of workflow execution.""" success: bool actions_completed: int total_actions: int results: List[Dict] = Field(default_factory=list, description="Results of each action") final_screenshot: Optional[str] = Field(None, description="Path to final screenshot") error: Optional[str] = Field(None, description="Error message if workflow failed") class PromptTemplateSummary(BaseModel): """Metadata for an MCP prompt template.""" name: str = Field(description="Prompt identifier") title: str = Field(description="Prompt title") description: str = Field(description="Short description of what the prompt is for") class PromptTemplateResult(PromptTemplateSummary): """Rendered MCP prompt text for clients without prompt support.""" prompt: str = Field(description="Rendered prompt text") class PromptTemplatesResult(BaseModel): """Collection of available prompt templates exposed as tools.""" templates: List[PromptTemplateSummary] = Field( description="Available prompt templates" ) @mcp.tool() def take_screenshot( detect_elements: bool = True, output_dir: Optional[str] = None, monitor_index: Optional[int] = None, ) -> AnnotatedScreenshot: """ Take an annotated screenshot with automatic element detection and downsampling. OPTIMIZED FOR LLM COMPUTER USE: - Automatically downsamples to 1280x720 for faster processing - Detects UI elements using accessibility API (AT-SPI) - Overlays numbered markers on each clickable element - Returns element map for direct interaction ("click element #5") This replaces the old multi-tool approach with a single optimized workflow. Args: detect_elements: Whether to detect and annotate UI elements (default: True). If False, returns plain downsampled screenshot. output_dir: Directory for output files. Defaults to a workspace-local 'captures' folder. Returns: AnnotatedScreenshot with: - screenshot_path: Annotated image with numbered elements (1280x720) - original_path: Full-resolution original screenshot - elements: List of detected elements with names, roles, and coordinates - element_map: Direct coordinate lookup {element_id: {x, y, width, height}} - scaling_info: How to interpret coordinates Usage: result = take_screenshot() # LLM sees numbered elements in image # LLM responds: "Click element #3" click_element(element_id=3) """ pyautogui = _get_pyautogui() if pyautogui is None: return AnnotatedScreenshot( success=False, screenshot_path="", display_width=0, display_height=0, actual_width=0, actual_height=0, scaling_info="", error=_pyautogui_error ) warnings = _collect_env_warnings() try: # Logical screen dimensions (what PyAutoGUI uses for input coordinates) logical_width, logical_height = pyautogui.size() # Set output directory inside workspace so clients can read captures if output_dir is None: output_dir = _default_output_dir() else: os.makedirs(output_dir, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") monitor_bounds, monitor_warning = _get_monitor_geometry(monitor_index, pyautogui_module=pyautogui) if monitor_warning: warnings.append(monitor_warning) # Capture full-resolution screenshot (physical pixels) screenshot, img_width, img_height, backend_warning, origin = _get_screenshot_with_backend( pyautogui, monitor_index=monitor_index, ) if screenshot is None: raise RuntimeError("Screenshot capture failed") if backend_warning: warnings.append(backend_warning) if monitor_index is None: scaling_factor, scaling_warning = _detect_scaling_factor( pyautogui, logical_size=(logical_width, logical_height), actual_size=(img_width, img_height), ) else: logical_full = (logical_width, logical_height) physical_full = _get_fullscreen_physical_size() if physical_full: scaling_factor, scaling_warning = _detect_scaling_factor( pyautogui, logical_size=logical_full, actual_size=physical_full, ) else: scaling_factor, scaling_warning = 1.0, "Scaling detection skipped for per-monitor capture" if scaling_warning: warnings.append(scaling_warning) # Save original full-resolution version original_path = os.path.join(output_dir, f"screenshot_original_{timestamp}.png") screenshot.save(original_path) # Downsample to 1280x720 for LLM processing (huge speed improvement) from PIL import Image target_width = 1280 target_height = 720 # Calculate scaling to fit within target dimensions while maintaining aspect ratio width_ratio = target_width / img_width height_ratio = target_height / img_height # Avoid upscaling small screens (adds blur without adding detail) scale_ratio = min(width_ratio, height_ratio, 1.0) display_width = int(img_width * scale_ratio) display_height = int(img_height * scale_ratio) downsampled = screenshot.resize((display_width, display_height), Image.Resampling.LANCZOS) scaling_info = ( f"Screenshot downsampled from {img_width}x{img_height} to {display_width}x{display_height}. " f"Click by element ID with click_screen(element_id=N) or by percentage with " f"click_screen(x_percent=0.5, y_percent=0.3). " f"Logical input size: {logical_width}x{logical_height}. " f"Detected scaling factor: {scaling_factor:.2f}x." ) # Detect elements using AT-SPI if requested elements = [] element_map = {} if detect_elements: try: # Try AT-SPI first (most accurate) import pyatspi def run_at_spi_scan() -> None: nonlocal element_id desktop = pyatspi.Registry.getDesktop(0) for i in range(desktop.childCount): try: app = desktop.getChildAtIndex(i) extract_elements(app, depth=0, path=[i]) except Exception: continue element_id = 1 def extract_elements(node, depth: int = 0, max_depth: int = 10, path: Optional[List[int]] = None): nonlocal element_id if depth > max_depth or element_id > 50: # Limit to 50 elements return if path is None: path = [] # Get element info defensively (do not break recursion if one property fails) try: name = node.name or "" except Exception: name = "" try: role = node.getRoleName() except Exception: role = "" interactive_roles = [ "push button", "toggle button", "check box", "radio button", "menu item", "list item", "link", "entry", "text", "icon", ] if role in interactive_roles: try: comp = getattr(node, "component", None) if comp: ext = comp.getExtents(pyatspi.DESKTOP_COORDS) if ext.width > 0 and ext.height > 0: # Filter out giant containers; they're rarely "clickable" targets if ext.width * ext.height <= int(img_width * img_height * 0.5): raw_center_x = ext.x + ext.width // 2 raw_center_y = ext.y + ext.height // 2 # Determine coordinate space heuristically if 0 <= raw_center_x <= logical_width and 0 <= raw_center_y <= logical_height: logical_center_x = int(raw_center_x) logical_center_y = int(raw_center_y) logical_x = int(ext.x) logical_y = int(ext.y) logical_w = int(ext.width) logical_h = int(ext.height) elif ( 0 <= raw_center_x <= img_width and 0 <= raw_center_y <= img_height and scaling_factor ): logical_center_x = int(raw_center_x / scaling_factor) logical_center_y = int(raw_center_y / scaling_factor) logical_x = int(ext.x / scaling_factor) logical_y = int(ext.y / scaling_factor) logical_w = int(ext.width / scaling_factor) logical_h = int(ext.height / scaling_factor) else: logical_center_x = -1 logical_center_y = -1 logical_x = 0 logical_y = 0 logical_w = 0 logical_h = 0 if monitor_bounds: left = monitor_bounds["left"] top = monitor_bounds["top"] width = monitor_bounds["width"] height = monitor_bounds["height"] if not (left <= ext.x <= left + width and top <= ext.y <= top + height): logical_center_x = -1 logical_center_y = -1 if 0 <= logical_center_x <= logical_width and 0 <= logical_center_y <= logical_height: x_percent = _safe_percent(logical_center_x, logical_width) y_percent = _safe_percent(logical_center_y, logical_height) children_count = 0 try: children_count = int(getattr(node, "childCount", 0)) except Exception: children_count = 0 element = AccessibleElement( id=element_id, name=name or f"{role}", role=role, x=logical_x, y=logical_y, width=max(1, logical_w), height=max(1, logical_h), center_x=logical_center_x, center_y=logical_center_y, is_clickable=True, children_count=children_count, ) elements.append(element) element_map[element_id] = { "x": logical_center_x, "y": logical_center_y, "width": max(1, logical_w), "height": max(1, logical_h), "name": name, "role": role, "x_percent": x_percent, "y_percent": y_percent, "atspi_path": path + [], } element_id += 1 except Exception: pass # Recurse to children regardless of extraction errors above try: child_count = int(getattr(node, "childCount", 0)) except Exception: child_count = 0 for i in range(child_count): try: child = node.getChildAtIndex(i) extract_elements(child, depth + 1, path + [i]) except Exception: continue # Start extraction from all applications run_at_spi_scan() # AT-SPI can race during app startup; retry once if empty. if not elements: import time time.sleep(0.1) element_id = 1 elements.clear() element_map.clear() run_at_spi_scan() if not elements: warnings.append("AT-SPI found no elements, falling back to CV detection") # Fall back to CV-based detection elements, element_map = _fallback_cv_detection( screenshot, img_width, img_height, logical_width, logical_height, ) except ImportError: warnings.append("pyatspi not available, using CV-based detection") elements, element_map = _fallback_cv_detection( screenshot, img_width, img_height, logical_width, logical_height, ) except Exception as e: warnings.append(f"AT-SPI detection failed: {e}, using CV fallback") elements, element_map = _fallback_cv_detection( screenshot, img_width, img_height, logical_width, logical_height, ) if isinstance(element_map, dict) and element_map.get("_warning"): warnings.append(element_map.pop("_warning")) # Create annotated version with numbered markers annotated = downsampled.copy() if elements: from PIL import ImageDraw, ImageFont draw = ImageDraw.Draw(annotated) # Try to load a font try: font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 20) except Exception: font = ImageFont.load_default() for elem in elements: # Draw using percentage coordinates when available for best alignment cached = element_map.get(elem.id, {}) x_percent = cached.get("x_percent") y_percent = cached.get("y_percent") if x_percent is not None and y_percent is not None: x = int(display_width * float(x_percent)) y = int(display_height * float(y_percent)) else: # Fallback: treat element coordinates as logical and scale to downsampled x = int(_safe_percent(elem.center_x, logical_width) * display_width) y = int(_safe_percent(elem.center_y, logical_height) * display_height) # Draw circle marker radius = 12 draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=(255, 0, 0, 200), outline=(255, 255, 255)) # Draw element ID text = str(elem.id) bbox = draw.textbbox((x, y), text, font=font) text_width = bbox[2] - bbox[0] text_height = bbox[3] - bbox[1] draw.text((x - text_width // 2, y - text_height // 2), text, fill=(255, 255, 255), font=font) # Build suggested targets for one-shot selection (e.g., Firefox) suggested_targets = None if elements: suggested_targets = _build_preferred_targets( elements=elements, element_map=element_map, logical_width=logical_width, logical_height=logical_height, ) # Save annotated screenshot annotated_path = os.path.join(output_dir, f"screenshot_annotated_{timestamp}.png") annotated.save(annotated_path) # Cache element map for click_screen to use, with metadata for invalidation if element_map: global _ELEMENT_CACHE _ELEMENT_CACHE = { "meta": { "logical_width": logical_width, "logical_height": logical_height, "scaling_factor": scaling_factor, "active_app_name": _get_active_app_name(), "monitor_index": monitor_index, "monitor_origin": origin, }, "elements": element_map, "captured_at": time.monotonic(), } return AnnotatedScreenshot( success=True, screenshot_path=annotated_path, original_path=original_path, elements=elements, element_map=element_map, display_width=display_width, display_height=display_height, actual_width=img_width, actual_height=img_height, monitor_index=monitor_index, monitor_origin=origin, scaling_info=scaling_info, suggested_targets=suggested_targets, warnings=warnings or None ) except Exception as e: return AnnotatedScreenshot( success=False, screenshot_path="", display_width=0, display_height=0, actual_width=0, actual_height=0, monitor_index=monitor_index, monitor_origin=None, scaling_info="", error=f"Screenshot failed: {str(e)}", warnings=warnings or None ) def _fallback_cv_detection( screenshot, img_width: int, img_height: int, logical_width: Optional[int] = None, logical_height: Optional[int] = None, ) -> Tuple[List[AccessibleElement], Dict[int, Dict[str, Any]]]: """Fallback CV-based element detection when AT-SPI fails.""" elements = [] element_map = {} # Backward-compatible defaults: if logical size isn't supplied, assume 1:1 logical_width = int(logical_width or img_width) logical_height = int(logical_height or img_height) try: import cv2 import numpy as np # Convert PIL to OpenCV img_array = np.array(screenshot) gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) # Use Canny edge detection (fast, good for UI boundaries) edges = cv2.Canny(gray, 50, 150) # Connect gaps so buttons/inputs become single contours try: kernel = np.ones((3, 3), np.uint8) edges = cv2.dilate(edges, kernel, iterations=1) except Exception: pass # Find contours (pass 1) contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # If Canny finds nothing, fall back to threshold-based segmentation if not contours: try: thresh = cv2.adaptiveThreshold( gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 5, ) kernel = np.ones((5, 5), np.uint8) closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=1) contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) except Exception: contours = [] # Filter and sort by area MIN_WIDTH, MIN_HEIGHT = 20, 20 valid_contours = [] for cnt in contours: x, y, w, h = cv2.boundingRect(cnt) area = w * h if area <= 100: continue if w < MIN_WIDTH or h < MIN_HEIGHT: continue if area >= int(img_width * img_height * 0.5): continue valid_contours.append((area, x, y, w, h)) # Sort by area and take top 30 valid_contours.sort(reverse=True) valid_contours = valid_contours[:30] element_id = 1 for _, x, y, w, h in valid_contours: physical_center_x = x + w // 2 physical_center_y = y + h // 2 # Convert to percent of the screen, then to logical coords (robust to scaling) x_percent = _safe_percent(physical_center_x, img_width) y_percent = _safe_percent(physical_center_y, img_height) center_x = int(logical_width * x_percent) center_y = int(logical_height * y_percent) element = AccessibleElement( id=element_id, name=f"Element {element_id}", role="detected", x=int(logical_width * _safe_percent(x, img_width)), y=int(logical_height * _safe_percent(y, img_height)), width=max(1, int(logical_width * _safe_percent(w, img_width))), height=max(1, int(logical_height * _safe_percent(h, img_height))), center_x=center_x, center_y=center_y, is_clickable=True, children_count=0 ) elements.append(element) element_map[element_id] = { "x": center_x, "y": center_y, "width": element.width, "height": element.height, "name": f"Element {element_id}", "role": "detected", "x_percent": x_percent, "y_percent": y_percent, } element_id += 1 except Exception as exc: element_map["_warning"] = f"CV fallback failed: {exc}" return elements, element_map # ============================================================================ # OLD/DEPRECATED IMPLEMENTATIONS - For reference only, do not use # ============================================================================ # The functions below (take_screenshot_old, click_screen_old) represent the # old non-optimized approach before AT-SPI integration and percentage coords. # They are kept for reference but should not be called. # ============================================================================ # @mcp.tool() # Commented out - replaced by optimized take_screenshot() def click_screen_old( x: int, y: int, button: str = "left", clicks: int = 1, interval: float = 0.0, auto_scale: bool = False, ) -> MouseClickResult: """DEPRECATED: Use click_element() or click_screen() with percentage coordinates.""" pass @mcp.tool() def click_screen( x_percent: Optional[float] = None, y_percent: Optional[float] = None, element_id: Optional[int] = None, button: str = "left", clicks: int = 1, monitor_index: Optional[int] = None, ) -> MouseClickResult: """ Click at screen location using PERCENTAGE coordinates or element ID. Preferred order: 1) element_id from take_screenshot (best accuracy) 2) x_percent/y_percent (resolution-agnostic fallback) Args: x_percent: X coordinate as percentage (0.0 = left edge, 1.0 = right edge) y_percent: Y coordinate as percentage (0.0 = top edge, 1.0 = bottom edge) element_id: ID of element from take_screenshot (overrides x_percent/y_percent) button: Mouse button: "left", "right", or "middle" clicks: Number of clicks (1 for single, 2 for double) Returns: MouseClickResult with success status Examples: - click_screen(element_id=3) - Click element #3 from screenshot - click_screen(x_percent=0.5, y_percent=0.5) - Click center of screen - click_screen(x_percent=0.02, y_percent=0.3, button="right") - Right click """ pyautogui = _get_pyautogui() if pyautogui is None: return MouseClickResult( success=False, x=0, y=0, button=button, clicks=clicks, error=_pyautogui_error, ) warnings = _collect_env_warnings() cached_elements, cache_meta = _get_cached_elements(pyautogui) cache_warning = None if cache_meta is None and _ELEMENT_CACHE and not cached_elements: cache_warning = "Element cache missing metadata; refresh with take_screenshot(detect_elements=True)." try: # Validate button valid_buttons = ["left", "right", "middle"] if button not in valid_buttons: return MouseClickResult( success=False, x=0, y=0, button=button, clicks=clicks, error=f"Invalid button. Must be one of: {valid_buttons}", ) # Get screen dimensions screen_width, screen_height = pyautogui.size() monitor_bounds, monitor_warning = _get_monitor_geometry(monitor_index, pyautogui_module=pyautogui) if monitor_warning: warnings.append(monitor_warning) x = 0 y = 0 action_warning = None # Determine click coordinates (logical) if element_id is not None: if not cached_elements: return MouseClickResult( success=False, x=0, y=0, button=button, clicks=clicks, warnings=warnings or None, error="No element cache available. Run take_screenshot(detect_elements=True) first.", ) if element_id not in cached_elements: return MouseClickResult( success=False, x=0, y=0, button=button, clicks=clicks, warnings=warnings or None, error=f"Element {element_id} not found. Valid IDs: {list(cached_elements.keys())}", ) elem = cached_elements[element_id] x, y = _element_cache_to_xy( elem=elem, screen_width=screen_width, screen_height=screen_height, monitor_bounds=monitor_bounds, ) action_success, action_warning = _attempt_atspi_action(elem, clicks=clicks) if action_success: if action_warning: warnings.append(action_warning) if cache_warning: warnings.append(cache_warning) return MouseClickResult( success=True, x=x, y=y, button=button, clicks=clicks, applied_scaling=None, scaling_warning=None, warnings=warnings or None, ) elif x_percent is not None and y_percent is not None: # Use percentage coordinates if not (0.0 <= x_percent <= 1.0 and 0.0 <= y_percent <= 1.0): return MouseClickResult( success=False, x=0, y=0, button=button, clicks=clicks, error="Percentage coordinates must be between 0.0 and 1.0", ) if monitor_bounds: x = int(monitor_bounds["left"] + monitor_bounds["width"] * x_percent) y = int(monitor_bounds["top"] + monitor_bounds["height"] * y_percent) else: x = int(screen_width * x_percent) y = int(screen_height * y_percent) else: return MouseClickResult( success=False, x=0, y=0, button=button, clicks=clicks, error="Must provide either element_id or both x_percent and y_percent", ) # Perform click via coordinates pyautogui.click(x=x, y=y, clicks=clicks, button=button) if action_warning: warnings.append(action_warning) if cache_warning: warnings.append(cache_warning) return MouseClickResult( success=True, x=x, y=y, button=button, clicks=clicks, applied_scaling=None, scaling_warning=None, warnings=warnings or None, ) except Exception as e: return MouseClickResult( success=False, x=0, y=0, button=button, clicks=clicks, error=f"Click failed: {str(e)}", warnings=warnings or None, ) @mcp.tool() def click_first_match( name_substring: str, role_hint: Optional[str] = None, ) -> MouseClickResult: """ Find the best-matching cached element by name/role and click it (uses element IDs). Intended for quick launches like "Firefox" without manual element_id lookup. If no cache exists, it will call take_screenshot(detect_elements=True) once. """ pyautogui = _get_pyautogui() if pyautogui is None: return MouseClickResult( success=False, x=0, y=0, button="left", clicks=1, error=_pyautogui_error, ) if not name_substring: return MouseClickResult( success=False, x=0, y=0, button="left", clicks=1, error="name_substring is required", ) warnings = _collect_env_warnings() cached_elements, _ = _get_cached_elements(pyautogui) if not cached_elements: shot = take_screenshot(detect_elements=True) if not shot.success: return MouseClickResult( success=False, x=0, y=0, button="left", clicks=1, warnings=warnings or shot.warnings, error=shot.error or "Failed to detect elements", ) cached_elements, _ = _get_cached_elements(pyautogui) if not cached_elements: return MouseClickResult( success=False, x=0, y=0, button="left", clicks=1, warnings=warnings or None, error="No elements available after screenshot; cannot match.", ) target = name_substring.lower() role_hint_lower = role_hint.lower() if role_hint else None best_id = None best_score = -1.0 for elem_id, data in cached_elements.items(): name = str(data.get("name") or "").lower() role = str(data.get("role") or "").lower() score = 0.0 if target in name: score += 10.0 if name.startswith(target): score += 2.0 if role_hint_lower: if role == role_hint_lower: score += 3.0 elif role_hint_lower in role: score += 2.0 if score <= 0: continue score += 0.1 # small bias to break ties if score > best_score: best_score = score best_id = elem_id if best_id is None: return MouseClickResult( success=False, x=0, y=0, button="left", clicks=1, warnings=warnings or None, error=f"No element matched '{name_substring}'.", ) result = click_screen(element_id=best_id, button="left", clicks=1) if warnings: if result.warnings: result.warnings = list(result.warnings) + warnings else: result.warnings = warnings return result @mcp.tool() def get_screen_info() -> ScreenInfo: """ Get information about the screen/display. Returns screen dimensions and display server type. Use this before taking screenshots or clicking to understand the coordinate system. Returns: ScreenInfo with width, height, and display server type Example: - get_screen_info() - Returns screen dimensions and X11/Wayland info """ global _SCREEN_INFO_CACHE # Serve cached success for a short window to avoid client polling spam if _SCREEN_INFO_CACHE: ts, cached = _SCREEN_INFO_CACHE if time.monotonic() - ts < 2.0 and getattr(cached, "success", False): return cached pyautogui = _get_pyautogui() if pyautogui is None: return ScreenInfo( success=False, display_server=_safe_display_server(), error=_pyautogui_error or "PyAutoGUI not installed", ) warnings = _collect_env_warnings() try: screen_width, screen_height = pyautogui.size() scaling_factor, scaling_warning = _detect_scaling_factor(pyautogui) if scaling_warning: warnings.append(scaling_warning) monitors, monitor_warning = _get_mss_monitors() if monitor_warning: warnings.append(monitor_warning) result = ScreenInfo( success=True, width=screen_width, height=screen_height, display_server=_safe_display_server(), scaling_factor=scaling_factor, monitors=monitors, warnings=warnings or None, ) _SCREEN_INFO_CACHE = (time.monotonic(), result) return result except Exception as e: # noqa: BLE001 return ScreenInfo( success=False, display_server=_safe_display_server(), error=f"Failed to get screen info: {str(e)}", warnings=warnings or None, ) @mcp.tool() def move_mouse( x_percent: Optional[float] = None, y_percent: Optional[float] = None, element_id: Optional[int] = None, duration: float = 0.0, monitor_index: Optional[int] = None, ) -> MouseClickResult: """ Move mouse cursor to specified screen coordinates without clicking. Supports both percentage coordinates and element IDs (like click_screen). Args: x_percent: X coordinate as percentage (0.0 = left edge, 1.0 = right edge) y_percent: Y coordinate as percentage (0.0 = top edge, 1.0 = bottom edge) element_id: Element ID from take_screenshot (overrides percentages) duration: Time in seconds to animate the movement (0 for instant) Returns: MouseClickResult with success status (clicks will be 0) Examples: - move_mouse(x_percent=0.5, y_percent=0.3) - Move to center-top - move_mouse(element_id=5, duration=0.5) - Smoothly move to element #5 """ pyautogui = _get_pyautogui() if pyautogui is None: return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error=_pyautogui_error, ) warnings = _collect_env_warnings() cached_elements, cache_meta = _get_cached_elements(pyautogui) cache_warning = None if cache_meta is None and _ELEMENT_CACHE and not cached_elements: cache_warning = "Element cache missing metadata; refresh with take_screenshot(detect_elements=True)." x = 0 y = 0 try: screen_width, screen_height = pyautogui.size() monitor_bounds, monitor_warning = _get_monitor_geometry(monitor_index, pyautogui_module=pyautogui) if monitor_warning: warnings.append(monitor_warning) # Determine coordinates if element_id is not None: if not cached_elements: return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error="No element cache. Run take_screenshot(detect_elements=True) first.", ) if element_id not in cached_elements: return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error=f"Element {element_id} not found.", ) elem = cached_elements[element_id] x, y = _element_cache_to_xy( elem=elem, screen_width=screen_width, screen_height=screen_height, monitor_bounds=monitor_bounds, ) elif x_percent is not None and y_percent is not None: if not (0.0 <= x_percent <= 1.0 and 0.0 <= y_percent <= 1.0): return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error="Percentage coordinates must be between 0.0 and 1.0" ) if monitor_bounds: x = int(monitor_bounds["left"] + monitor_bounds["width"] * x_percent) y = int(monitor_bounds["top"] + monitor_bounds["height"] * y_percent) else: x = int(screen_width * x_percent) y = int(screen_height * y_percent) else: return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error="Must provide either element_id or both x_percent and y_percent" ) # Move mouse pyautogui.moveTo(x, y, duration=duration) if cache_warning: warnings = (warnings or []) + [cache_warning] return MouseClickResult( success=True, x=x, y=y, button="none", clicks=0, warnings=warnings or None, ) except Exception as e: # noqa: BLE001 return MouseClickResult( success=False, x=x, y=y, button="none", clicks=0, error=f"Mouse move failed: {str(e)}", warnings=warnings or None, ) @mcp.tool() def drag_mouse( x: int, y: int, button: str = "left", duration: float = 0.5 ) -> MouseClickResult: """ Drag the mouse cursor to specific coordinates while holding a button. This tool performs a "click and drag" operation. It starts from the CURRENT mouse position and drags to the specified (x, y) coordinates. Useful for selecting text, moving windows, or drag-and-drop operations. Args: x: Target X coordinate in logical pixels y: Target Y coordinate in logical pixels button: Mouse button to hold down (default "left") duration: Time in seconds to perform the drag (default 0.5) Returns: MouseClickResult with success status Examples: - drag_mouse(x=500, y=300) - Drag from current pos to (500, 300) """ pyautogui = _get_pyautogui() if pyautogui is None: return MouseClickResult( success=False, x=0, y=0, button=button, clicks=0, error=_pyautogui_error or "PyAutoGUI not available" ) warnings = _collect_env_warnings() try: pyautogui.dragTo(x, y, duration=duration, button=button) return MouseClickResult( success=True, x=x, y=y, button=button, clicks=0, warnings=warnings or None, ) except Exception as e: # noqa: BLE001 return MouseClickResult( success=False, x=x, y=y, button=button, clicks=0, error=f"Mouse drag failed: {str(e)}", warnings=warnings or None, ) @mcp.tool() def type_text(text: str, interval: float = 0.0) -> MouseClickResult: """ Type text using the keyboard. This tool types the specified text string. It can be used to enter data into forms, search bars, or terminal windows. Args: text: The string of text to type. interval: Seconds to wait between each key press (default 0.0). Returns: MouseClickResult with success status (reused model for simplicity) Examples: - type_text(text="Hello World") - Types "Hello World" instantly - type_text(text="ls -la", interval=0.1) - Types slowly """ pyautogui = _get_pyautogui() if pyautogui is None: return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error=_pyautogui_error, ) warnings = _collect_env_warnings() try: pyautogui.write(text, interval=interval) return MouseClickResult( success=True, x=0, y=0, button="none", clicks=0, warnings=warnings or None, ) except Exception as e: # noqa: BLE001 return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error=f"Type text failed: {str(e)}", warnings=warnings or None, ) @mcp.tool() def press_key(key: str) -> MouseClickResult: """ Press a specific key on the keyboard. This tool presses a single key. Useful for navigation (arrows, enter, esc) or shortcuts. Args: key: The name of the key to press. Examples: 'enter', 'esc', 'left', 'f1', 'a'. See pyautogui documentation for full list of valid key names. Returns: MouseClickResult with success status Examples: - press_key(key="enter") - press_key(key="esc") """ pyautogui = _get_pyautogui() if pyautogui is None: return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error=_pyautogui_error, ) warnings = _collect_env_warnings() try: pyautogui.press(key) return MouseClickResult( success=True, x=0, y=0, button="none", clicks=0, warnings=warnings or None, ) except Exception as e: # noqa: BLE001 return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error=f"Press key failed: {str(e)}", warnings=warnings or None, ) @mcp.tool() def press_hotkey(keys: List[str]) -> MouseClickResult: """ Press a combination of keys simultaneously (e.g., Ctrl+Shift+C). This tool performs a hotkey combination. It presses the keys in order, holds them down, then releases them in reverse order. Args: keys: List of keys to press. Example: ["ctrl", "shift", "c"] or ["alt", "tab"] Returns: MouseClickResult with success status Examples: - press_hotkey(keys=["ctrl", "c"]) - Copy - press_hotkey(keys=["ctrl", "shift", "c"]) - Terminal Copy - press_hotkey(keys=["alt", "f4"]) - Close window """ pyautogui = _get_pyautogui() if pyautogui is None: return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error=_pyautogui_error, ) warnings = _collect_env_warnings() try: pyautogui.hotkey(*keys) return MouseClickResult( success=True, x=0, y=0, button="none", clicks=0, warnings=warnings or None, ) except Exception as e: # noqa: BLE001 return MouseClickResult( success=False, x=0, y=0, button="none", clicks=0, error=f"Hotkey failed: {str(e)}", warnings=warnings or None, ) @mcp.tool() def execute_workflow( actions: List[Dict], take_final_screenshot: bool = True ) -> WorkflowResult: """ Execute multiple actions in sequence for efficient multi-step workflows. OPTIMIZED FOR COMPLEX TASKS: This tool batches multiple actions (screenshot, click, type, wait) into one MCP call, reducing round-trip latency. Use for multi-step workflows like: - Opening an application and clicking through menus - Filling out multi-field forms - Navigating through a series of UI steps Args: actions: List of actions to execute. Each action is a dict with: - action: "screenshot" | "click" | "move" | "type" | "wait" - element_id: (optional) Element ID to interact with - x_percent: (optional) X coordinate as percentage - y_percent: (optional) Y coordinate as percentage - text: (optional) Text to type - duration: (optional) Wait duration in seconds - button: (optional) Mouse button ("left", "right", "middle") take_final_screenshot: Whether to take a screenshot after completion (default: True) Returns: WorkflowResult with: - success: Whether all actions completed successfully - actions_completed: Number of actions executed - results: List of individual action results - final_screenshot: Path to final screenshot (if enabled) Example: execute_workflow(actions=[ {"action": "screenshot"}, {"action": "click", "element_id": 3}, {"action": "wait", "duration": 0.5}, {"action": "type", "text": "Hello"}, {"action": "click", "x_percent": 0.5, "y_percent": 0.8} ]) """ pyautogui = _get_pyautogui() if pyautogui is None: return WorkflowResult( success=False, actions_completed=0, total_actions=len(actions), error=_pyautogui_error ) results = [] completed = 0 try: for i, action_dict in enumerate(actions): action_type = action_dict.get("action", "").lower() monitor_index = action_dict.get("monitor_index") try: # Validate action shape try: WorkflowAction(**action_dict) except Exception as exc: results.append({ "action": action_type or "unknown", "success": False, "error": f"Invalid action payload: {exc}", }) break if action_type == "screenshot": # Take screenshot with element detection result = take_screenshot( detect_elements=action_dict.get("detect_elements", True), output_dir=action_dict.get("output_dir"), monitor_index=monitor_index, ) results.append({ "action": "screenshot", "success": result.success, "screenshot_path": result.screenshot_path if result.success else None, "elements_detected": len(result.elements) if result.success else 0 }) elif action_type == "click": # Click using element ID or percentage coordinates result = click_screen( x_percent=action_dict.get("x_percent"), y_percent=action_dict.get("y_percent"), element_id=action_dict.get("element_id"), button=action_dict.get("button", "left"), clicks=action_dict.get("clicks", 1), monitor_index=monitor_index, ) results.append({ "action": "click", "success": result.success, "x": result.x if result.success else None, "y": result.y if result.success else None }) elif action_type == "move": # Move mouse result = move_mouse( x_percent=action_dict.get("x_percent"), y_percent=action_dict.get("y_percent"), element_id=action_dict.get("element_id"), duration=action_dict.get("duration", 0.0), monitor_index=monitor_index, ) results.append({ "action": "move", "success": result.success, "x": result.x if result.success else None, "y": result.y if result.success else None }) elif action_type == "type": # Type text text = action_dict.get("text", "") interval = action_dict.get("interval", 0.0) result = type_text(text=text, interval=interval) results.append({ "action": "type", "success": result.success, "text_length": len(text) }) elif action_type == "wait": # Wait/sleep import time duration = action_dict.get("duration", 0.5) time.sleep(duration) results.append({ "action": "wait", "success": True, "duration": duration }) else: results.append({ "action": action_type, "success": False, "error": f"Unknown action type: {action_type}" }) continue # Check if action succeeded if results[-1]["success"]: completed += 1 else: # Stop on failure break except Exception as e: results.append({ "action": action_type, "success": False, "error": str(e) }) break # Take final screenshot if requested and all actions succeeded final_screenshot_path = None if take_final_screenshot and completed == len(actions): final_result = take_screenshot() if final_result.success: final_screenshot_path = final_result.screenshot_path return WorkflowResult( success=(completed == len(actions)), actions_completed=completed, total_actions=len(actions), results=results, final_screenshot=final_screenshot_path ) except Exception as e: return WorkflowResult( success=False, actions_completed=completed, total_actions=len(actions), results=results, error=f"Workflow execution failed: {str(e)}" ) @mcp.tool() def get_display_diagnostics() -> DiagnosticInfo: """ Get detailed diagnostic information about display scaling and coordinate systems. This tool helps troubleshoot coordinate mismatch issues by comparing logical screen dimensions (what the OS reports) with actual screenshot dimensions. Use this tool when: - Clicks are landing in the wrong position - Screenshot coordinates don't match screen coordinates - You suspect display scaling is causing issues Returns: DiagnosticInfo with scaling factor, dimensions, and recommendations """ global _DIAG_CACHE # Serve cached success briefly to prevent repeated polling from clients if _DIAG_CACHE: ts, cached = _DIAG_CACHE if time.monotonic() - ts < 2.0 and getattr(cached, "success", False): return cached pyautogui = _get_pyautogui() if pyautogui is None: return DiagnosticInfo( success=False, has_scaling_mismatch=False, display_server=_safe_display_server(), error=_pyautogui_error or "PyAutoGUI not installed", ) warnings = _collect_env_warnings() try: # Get logical screen size logical_width, logical_height = pyautogui.size() # Take test screenshot to get actual dimensions (fastest backend) screenshot, actual_width, actual_height, backend_warning, _ = _get_screenshot_with_backend(pyautogui) if backend_warning: warnings.append(backend_warning) # Detect scaling factor scaling_factor, scaling_warning = _detect_scaling_factor( pyautogui, logical_size=(logical_width, logical_height), actual_size=(actual_width, actual_height), force_refresh=True, ) has_scaling_mismatch = abs(scaling_factor - 1.0) > 0.01 if scaling_factor else False display_server = _safe_display_server() if scaling_warning: warnings.append(scaling_warning) # Generate recommendation if has_scaling_mismatch: recommendation = ( f"Display scaling is active ({scaling_factor:.2f}x). " f"Divide screenshot coordinates by {scaling_factor:.2f} before clicking, " f"or use `convert_screenshot_coordinates` to convert screenshot pixels to logical click coordinates. " f"For example: screenshot (1000, 500) -> click ({int(1000/scaling_factor)}, {int(500/scaling_factor)}). " f"Alternatively, use take_screenshot(detect_elements=True) to click by element_id or by x_percent/y_percent." ) else: recommendation = ( "No display scaling detected. Screenshot coordinates should match " "screen coordinates directly." ) result = DiagnosticInfo( success=True, logical_width=logical_width, logical_height=logical_height, actual_screenshot_width=actual_width, actual_screenshot_height=actual_height, scaling_factor=scaling_factor, has_scaling_mismatch=has_scaling_mismatch, display_server=display_server, recommendation=recommendation, warnings=warnings or None, ) _DIAG_CACHE = (time.monotonic(), result) return result except Exception as e: # noqa: BLE001 return DiagnosticInfo( success=False, has_scaling_mismatch=False, display_server=_safe_display_server(), error=f"Failed to get diagnostics: {str(e)}", warnings=warnings or None, ) @mcp.tool() def convert_screenshot_coordinates( screenshot_x: int, screenshot_y: int, ) -> CoordinateConversionResult: """ Convert screenshot pixel coordinates to logical coordinates suitable for click_screen(). Use this when you have coordinates measured on the saved screenshot and want the exact logical coordinates to click (accounting for display scaling). """ pyautogui = _get_pyautogui() if pyautogui is None: return CoordinateConversionResult( success=False, screenshot_x=screenshot_x, screenshot_y=screenshot_y, error=_pyautogui_error or "PyAutoGUI not installed", ) warnings = _collect_env_warnings() try: logical_w, logical_h = pyautogui.size() screenshot_img, actual_w, actual_h, backend_warning, _ = _get_screenshot_with_backend(pyautogui) if backend_warning: warnings.append(backend_warning) # Avoid unused variable warning _ = screenshot_img scaling_factor, scaling_warning = _detect_scaling_factor( pyautogui, logical_size=(logical_w, logical_h), actual_size=(actual_w, actual_h), ) if scaling_warning: warnings.append(scaling_warning) if scaling_factor in (None, 0): return CoordinateConversionResult( success=False, screenshot_x=screenshot_x, screenshot_y=screenshot_y, error="Could not detect scaling factor", warnings=warnings or None, ) logical_x = int(screenshot_x / scaling_factor) logical_y = int(screenshot_y / scaling_factor) return CoordinateConversionResult( success=True, screenshot_x=screenshot_x, screenshot_y=screenshot_y, logical_x=logical_x, logical_y=logical_y, scaling_factor=scaling_factor, warnings=warnings or None, ) except Exception as exc: # noqa: BLE001 return CoordinateConversionResult( success=False, screenshot_x=screenshot_x, screenshot_y=screenshot_y, error=f"Conversion failed: {exc}", warnings=warnings or None, ) @mcp.tool() def map_GUI_elements_location( screenshot_path: Optional[str] = None, min_area: int = 100, max_area: Optional[int] = None, debug_output_path: Optional[str] = None ) -> GUIElementMapResult: """ Analyze a screenshot to detect and map GUI elements (buttons, inputs, etc.) using Computer Vision. This tool uses edge detection (Canny) and contour analysis to identify potential UI elements. It returns a list of "hit boxes" (x, y, width, height) and their center coordinates. This is useful for finding clickable elements when you don't know their exact coordinates. Args: screenshot_path: Path to an existing screenshot. If None, takes a new screenshot. min_area: Minimum area (width * height) for a contour to be considered a UI element. Default 100. max_area: Optional maximum area to filter out large containers or full screen. debug_output_path: Path to save a debug image with drawn contours. Returns: GUIElementMapResult containing the list of detected elements. """ pyautogui = _get_pyautogui() warnings = _collect_env_warnings() captured_path: Optional[str] = None scaling_factor = 1.0 coordinate_space = "physical" try: import cv2 import numpy as np except ImportError: return GUIElementMapResult( success=False, elements=[], count=0, error="OpenCV (cv2) or NumPy not installed. Please install 'opencv-python' and 'numpy'." ) try: # 1. Get Screenshot if screenshot_path: if not os.path.exists(screenshot_path): return GUIElementMapResult( success=False, elements=[], count=0, error=f"File not found: {screenshot_path}", screenshot_path=os.path.abspath(screenshot_path), ) image = cv2.imread(screenshot_path) if image is None: return GUIElementMapResult( success=False, elements=[], count=0, error=f"Failed to load image: {screenshot_path}", screenshot_path=os.path.abspath(screenshot_path), ) captured_path = os.path.abspath(screenshot_path) # Detect scaling if possible (requires pyautogui) if pyautogui: screen_width, screen_height = pyautogui.size() h, w = image.shape[:2] scaling_factor, scaling_warning = _detect_scaling_factor( pyautogui, logical_size=(screen_width, screen_height), actual_size=(w, h) ) if scaling_warning: warnings.append(scaling_warning) coordinate_space = "logical" else: if pyautogui is None: return GUIElementMapResult( success=False, elements=[], count=0, error=_pyautogui_error, ) # Take new screenshot pil_image, actual_w, actual_h, backend_warning, _ = _get_screenshot_with_backend(pyautogui) if backend_warning: warnings.append(backend_warning) # Persist the captured screenshot so clients can inspect it capture_dir = _default_output_dir() timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") captured_path = os.path.join(capture_dir, f"screenshot_elements_{timestamp}.png") try: pil_image.save(captured_path) except Exception as save_exc: # noqa: BLE001 - best-effort persistence warnings.append(f"Failed to save screenshot to {captured_path}: {save_exc}") captured_path = None # Convert PIL to OpenCV (RGB -> BGR) image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR) screen_width, screen_height = pyautogui.size() scaling_factor, scaling_warning = _detect_scaling_factor( pyautogui, logical_size=(screen_width, screen_height), actual_size=(actual_w, actual_h) ) if scaling_warning: warnings.append(scaling_warning) coordinate_space = "logical" # 2. Process Image (Edge Detection) gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) # Apply Canny edge detection edges = cv2.Canny(gray, 50, 150) # Dilate edges to connect gaps kernel = np.ones((3, 3), np.uint8) dilated = cv2.dilate(edges, kernel, iterations=1) # Find contours contours, hierarchy = cv2.findContours(dilated, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) detected_elements = [] element_id = 1 # Prepare debug image if requested debug_image = None if debug_output_path: debug_image = image.copy() for contour in contours: # Get bounding box x, y, w, h = cv2.boundingRect(contour) area = w * h # Filter by area if area < min_area: continue if max_area and area > max_area: continue if coordinate_space == "logical" and scaling_factor not in (None, 0, 1.0): x = int(x / scaling_factor) y = int(y / scaling_factor) w = max(1, int(w / scaling_factor)) h = max(1, int(h / scaling_factor)) # Calculate center center_x = x + w // 2 center_y = y + h // 2 # Add to list element = GUIElement( id=element_id, x=x, y=y, width=w, height=h, center_x=center_x, center_y=center_y, area=area ) detected_elements.append(element) # Draw on debug image if debug_image is not None: cv2.rectangle(debug_image, (x, y), (x + w, y + h), (0, 255, 0), 2) cv2.circle(debug_image, (center_x, center_y), 2, (0, 0, 255), -1) cv2.putText(debug_image, str(element_id), (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1) element_id += 1 # Save debug image saved_debug_path = None if debug_output_path and debug_image is not None: # Ensure directory exists os.makedirs(os.path.dirname(os.path.abspath(debug_output_path)), exist_ok=True) cv2.imwrite(debug_output_path, debug_image) saved_debug_path = debug_output_path return GUIElementMapResult( success=True, elements=detected_elements, count=len(detected_elements), debug_image_path=saved_debug_path, screenshot_path=captured_path, scaling_factor=scaling_factor, coordinates=coordinate_space, warnings=warnings or None ) except Exception as e: return GUIElementMapResult( success=False, elements=[], count=0, error=f"Detection failed: {str(e)}", screenshot_path=captured_path, coordinates=coordinate_space, warnings=warnings or None ) # MCP prompt templates @mcp.prompt( name="baseline_display_check", title="Baseline display and scaling check", description="Collect display server, sizing, and scaling warnings before interacting with the desktop.", ) def prompt_baseline_display_check(task_hint: Optional[str] = None) -> str: """Prompt template guiding baseline diagnostics.""" target = task_hint or "the requested task" return _prompt_text( f""" Before working on {target}, call `get_screen_info` and `get_display_diagnostics`. Report display_server, logical size (width/height), scaling_factor, and any warnings. Prefer clicking by element ID from `take_screenshot` or by percentage with `click_screen(x_percent=..., y_percent=...)`. """ ) @mcp.prompt( name="capture_full_desktop", title="Capture full desktop screenshot", description="Capture a full-screen screenshot and summarize what is visible before acting.", ) def prompt_capture_full_desktop( goal: str, output_path: Optional[str] = None, ) -> str: """Prompt template for full desktop capture.""" path_hint = f"output_dir={output_path}" if output_path else "the default output directory" return _prompt_text( f""" Goal: {goal}. Call `take_screenshot(detect_elements=True)` (use {path_hint}). Return screenshot_path, original_path, display_width/height, actual_width/height, and any warnings. List 2-3 concise observations relevant to the goal and suggest a next step (click by element_id or click by percentage). """ ) @mcp.prompt( name="capture_region_for_task", title="Capture region screenshot", description="Grab a focused region screenshot to inspect a specific UI area.", ) def prompt_capture_region_for_task( region: str, goal: Optional[str] = None, output_path: Optional[str] = None, ) -> str: """Prompt template for region capture with validation guidance.""" goal_text = goal or "this task" path_hint = f"output_dir={output_path}" if output_path else "the default output directory" return _prompt_text( f""" Goal: {goal_text}. Call `take_screenshot(detect_elements=True)` ({path_hint}). Focus your analysis on region '{region}' (descriptive guidance; `take_screenshot` currently captures the full screen). Return screenshot_path, any warnings, and a short summary of what is visible in that region. """ ) @mcp.prompt( name="convert_screenshot_coordinates", title="Convert screenshot pixels to logical click coords", description="Turn coordinates measured on a screenshot into logical positions for click_screen.", ) def prompt_convert_screenshot_coordinates( screenshot_x: int, screenshot_y: int, note: Optional[str] = None, ) -> str: """Prompt template for converting screenshot coordinates.""" target_note = note or "the target" return _prompt_text( f""" You measured a screenshot coordinate at ({screenshot_x}, {screenshot_y}) for {target_note}. Call `convert_screenshot_coordinates` and return logical_x/logical_y, scaling_factor, and warnings. Provide the exact `click_screen` call to run next (use x_percent/y_percent derived from logical coords if you want resolution independence). """ ) @mcp.prompt( name="safe_click", title="Safely perform a click", description="Perform a guarded click with scaling awareness and post-click confirmation guidance.", ) def prompt_safe_click( x: int, y: int, coordinate_source: str, button: str = "left", clicks: int = 1, reason: Optional[str] = None, ) -> str: """Prompt template for scaling-aware click execution.""" rationale = reason or "the requested action" return _prompt_text( f""" Reason: {rationale}. Prepare to click at ({x}, {y}) using button={button} and clicks={clicks}. If coordinate_source is 'screenshot', first convert with `convert_screenshot_coordinates` to get logical_x/logical_y. Call `click_screen` accordingly and return success, applied_scaling, and any warnings. Suggest taking a follow-up screenshot if verification is needed. """ ) @mcp.prompt( name="hover_and_capture", title="Hover to reveal UI then capture", description="Move the mouse to a target location to reveal hover state, then capture evidence.", ) def prompt_hover_and_capture( x: int, y: int, duration: float = 0.0, goal: Optional[str] = None, output_path: Optional[str] = None, ) -> str: """Prompt template for hover then capture workflows.""" goal_text = goal or "the target UI" path_hint = f"output_dir={output_path}" if output_path else "the default output directory" return _prompt_text( f""" Goal: {goal_text}. Move the cursor with `move_mouse` to ({x}, {y}) using duration={duration}. Avoid clicking. After the hover settles, call `take_screenshot` ({path_hint}) so the hover state is captured. Return the screenshot path, any warnings, and a brief note on whether the target UI reacted. """ ) @mcp.prompt( name="coordinate_mismatch_recovery", title="Recover from coordinate mismatch", description="Diagnose why a click landed wrong and propose corrected coordinates.", ) def prompt_coordinate_mismatch_recovery( target_description: str, last_click_x: Optional[int] = None, last_click_y: Optional[int] = None, observed_offset: Optional[str] = None, ) -> str: """Prompt template for troubleshooting misaligned clicks.""" click_text = ( f"({last_click_x}, {last_click_y})" if last_click_x is not None and last_click_y is not None else "unknown coordinates" ) offset_text = observed_offset or "unspecified offset" return _prompt_text( f""" You attempted to click {target_description} at {click_text} and it missed (offset: {offset_text}). Run `get_display_diagnostics` to confirm scaling, then call `take_screenshot(detect_elements=True)` and work with element IDs or x_percent/y_percent. If the intended spot is visible, propose corrected percentage coords and the exact `click_screen` call. Keep safety first and suggest reconfirming with a quick screenshot after the next attempt. """ ) @mcp.prompt( name="end_to_end_capture_and_act", title="End-to-end capture and action", description="Plan a safe workflow: inspect, pick coordinates, click, and verify the result.", ) def prompt_end_to_end_capture_and_act( goal: str, target_hint: Optional[str] = None, output_dir: Optional[str] = None, ) -> str: """Prompt template for full capture-plan-act-verify loops.""" hint_text = target_hint or "the target control" dir_hint = f"output_dir={output_dir}" if output_dir else "the default directory" return _prompt_text( f""" Goal: {goal}. 1) Capture context with `take_screenshot(detect_elements=True)` ({dir_hint}). 2) Identify the control described by {hint_text}, selecting precise percentage coordinates (use `convert_screenshot_coordinates` if starting from screenshot pixels). 3) Provide the exact `click_screen` call (prefer element_id or percentage coordinates) and flag any risks. 4) Recommend a quick verification capture after the action. """ ) _PROMPT_TEMPLATE_META: Dict[str, Dict[str, str]] = { "baseline_display_check": { "title": "Baseline display and scaling check", "description": "Collect display server, sizing, and scaling warnings before interacting with the desktop.", }, "capture_full_desktop": { "title": "Capture full desktop screenshot", "description": "Capture a full-screen screenshot and summarize what is visible before acting.", }, "capture_region_for_task": { "title": "Capture region screenshot", "description": "Grab a focused region screenshot to inspect a specific UI area.", }, "convert_screenshot_coordinates": { "title": "Convert screenshot pixels to logical click coords", "description": "Turn coordinates measured on a screenshot into logical positions for click_screen.", }, "safe_click": { "title": "Safely perform a click", "description": "Perform a guarded click with scaling awareness and post-click confirmation guidance.", }, "hover_and_capture": { "title": "Hover to reveal UI then capture", "description": "Move the mouse to a target location to reveal hover state, then capture evidence.", }, "coordinate_mismatch_recovery": { "title": "Recover from coordinate mismatch", "description": "Diagnose why a click landed wrong and propose corrected coordinates.", }, "end_to_end_capture_and_act": { "title": "End-to-end capture and action", "description": "Plan a safe workflow: inspect, pick coordinates, click, and verify the result.", }, } @mcp.tool() def list_prompt_templates() -> PromptTemplatesResult: """List available prompt templates for clients without prompt support.""" templates = [ PromptTemplateSummary( name=name, title=meta["title"], description=meta["description"], ) for name, meta in _PROMPT_TEMPLATE_META.items() ] return PromptTemplatesResult(templates=templates) @mcp.tool() def render_prompt_baseline_display_check( task_hint: Optional[str] = None, ) -> PromptTemplateResult: """Expose baseline display check prompt for Codex CLI as a tool.""" meta = _PROMPT_TEMPLATE_META["baseline_display_check"] return PromptTemplateResult( name="baseline_display_check", title=meta["title"], description=meta["description"], prompt=prompt_baseline_display_check(task_hint), ) @mcp.tool() def render_prompt_capture_full_desktop( goal: str, output_path: Optional[str] = None, ) -> PromptTemplateResult: """Expose full desktop capture prompt as a tool.""" meta = _PROMPT_TEMPLATE_META["capture_full_desktop"] return PromptTemplateResult( name="capture_full_desktop", title=meta["title"], description=meta["description"], prompt=prompt_capture_full_desktop(goal=goal, output_path=output_path), ) @mcp.tool() def render_prompt_capture_region_for_task( region: str, goal: Optional[str] = None, output_path: Optional[str] = None, ) -> PromptTemplateResult: """Expose region capture prompt as a tool.""" meta = _PROMPT_TEMPLATE_META["capture_region_for_task"] return PromptTemplateResult( name="capture_region_for_task", title=meta["title"], description=meta["description"], prompt=prompt_capture_region_for_task( region=region, goal=goal, output_path=output_path, ), ) @mcp.tool() def render_prompt_convert_screenshot_coordinates( screenshot_x: int, screenshot_y: int, note: Optional[str] = None, ) -> PromptTemplateResult: """Expose coordinate conversion prompt as a tool.""" meta = _PROMPT_TEMPLATE_META["convert_screenshot_coordinates"] return PromptTemplateResult( name="convert_screenshot_coordinates", title=meta["title"], description=meta["description"], prompt=prompt_convert_screenshot_coordinates( screenshot_x=screenshot_x, screenshot_y=screenshot_y, note=note, ), ) @mcp.tool() def render_prompt_safe_click( x: int, y: int, coordinate_source: str, button: str = "left", clicks: int = 1, reason: Optional[str] = None, ) -> PromptTemplateResult: """Expose safe click prompt as a tool.""" meta = _PROMPT_TEMPLATE_META["safe_click"] return PromptTemplateResult( name="safe_click", title=meta["title"], description=meta["description"], prompt=prompt_safe_click( x=x, y=y, coordinate_source=coordinate_source, button=button, clicks=clicks, reason=reason, ), ) @mcp.tool() def render_prompt_hover_and_capture( x: int, y: int, duration: float = 0.0, goal: Optional[str] = None, output_path: Optional[str] = None, ) -> PromptTemplateResult: """Expose hover and capture prompt as a tool.""" meta = _PROMPT_TEMPLATE_META["hover_and_capture"] return PromptTemplateResult( name="hover_and_capture", title=meta["title"], description=meta["description"], prompt=prompt_hover_and_capture( x=x, y=y, duration=duration, goal=goal, output_path=output_path, ), ) @mcp.tool() def render_prompt_coordinate_mismatch_recovery( target_description: str, last_click_x: Optional[int] = None, last_click_y: Optional[int] = None, observed_offset: Optional[str] = None, ) -> PromptTemplateResult: """Expose coordinate mismatch recovery prompt as a tool.""" meta = _PROMPT_TEMPLATE_META["coordinate_mismatch_recovery"] return PromptTemplateResult( name="coordinate_mismatch_recovery", title=meta["title"], description=meta["description"], prompt=prompt_coordinate_mismatch_recovery( target_description=target_description, last_click_x=last_click_x, last_click_y=last_click_y, observed_offset=observed_offset, ), ) @mcp.tool() def render_prompt_end_to_end_capture_and_act( goal: str, target_hint: Optional[str] = None, output_dir: Optional[str] = None, ) -> PromptTemplateResult: """Expose end-to-end capture and action prompt as a tool.""" meta = _PROMPT_TEMPLATE_META["end_to_end_capture_and_act"] return PromptTemplateResult( name="end_to_end_capture_and_act", title=meta["title"], description=meta["description"], prompt=prompt_end_to_end_capture_and_act( goal=goal, target_hint=target_hint, output_dir=output_dir, ), ) def main(): """Entry point for the MCP server.""" mcp.run() if __name__ == "__main__": main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/charettep/ubuntu-desktop-control-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

server.py•106 KiB