AutoMac MCP

automac_mcp.py•21.3 KiB

#!/usr/bin/env python3

import subprocess
import json
import time
from typing import Any, Dict, List, Optional
import pyautogui
import easyocr
import numpy as np
from mcp.server.fastmcp import FastMCP
try:
    from Cocoa import NSWorkspace
    from Quartz import CGWindowListCopyWindowInfo, kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGEventCreateScrollWheelEvent, CGEventPost, kCGScrollEventUnitPixel, kCGHIDEventTap
    from ApplicationServices import AXUIElementCreateApplication, AXUIElementCopyAttributeValue, kAXWindowsAttribute, kAXTitleAttribute, kAXPositionAttribute, kAXSizeAttribute, kAXRoleAttribute
    ACCESSIBILITY_AVAILABLE = True
except ImportError:
    ACCESSIBILITY_AVAILABLE = False

# Create the FastMCP server instance; the @mcp.tool() decorators in this file
# register their functions on it.
mcp = FastMCP("AutoMac MCP - macOS UI Automation")

# Build the shared easyocr reader for English at import time (its readtext()
# is used by the OCR screen reader), and set pyautogui's FAILSAFE flag.
# NOTE(review): presumably FAILSAFE aborts automation when the pointer is moved
# to a screen corner -- confirm against the PyAutoGUI documentation.
reader = easyocr.Reader(['en'])
pyautogui.FAILSAFE = True


def _scale_coordinates_for_display(x: int, y: int) -> tuple[int, int]:
    """Scale coordinates for retina/high-DPI displays."""
    try:
        # Get the actual screen size from pyautogui
        screen_width, screen_height = pyautogui.size()
        
        # Take a screenshot to get the logical size
        screenshot = pyautogui.screenshot()
        logical_width, logical_height = screenshot.size
        
        # Calculate scaling factors
        scale_x = screen_width / logical_width
        scale_y = screen_height / logical_height
        
        # Scale the coordinates
        scaled_x = int(x * scale_x)
        scaled_y = int(y * scale_y)
        
        return scaled_x, scaled_y
    except Exception:
        # If scaling fails, return original coordinates
        return x, y

@mcp.tool()
def get_screen_size() -> Dict[str, Any]:
    """Report the screen size returned by ``pyautogui.size()``.

    Returns:
        A dict with ``success``, a human-readable ``message``, and the numeric
        ``width`` and ``height`` so callers do not have to parse the message.
    """
    screen_width, screen_height = pyautogui.size()
    return {
        "success": True,
        "message": f"Screen size = ({screen_width}, {screen_height})",
        "width": screen_width,
        "height": screen_height,
    }


@mcp.tool()
def mouse_move(x: int, y: int) -> Dict[str, Any]:
    """Move the mouse pointer to the specified screen coordinates without clicking.

    Args:
        x: Horizontal coordinate; passed through ``_scale_coordinates_for_display``.
        y: Vertical coordinate; passed through ``_scale_coordinates_for_display``.

    Returns:
        A dict with ``success`` and a ``message`` echoing the requested coordinates.

    Raises:
        ValueError: If ``x`` or ``y`` is ``None``.
    """
    if x is None or y is None:
        raise ValueError("x and y coordinates are required")

    # Scale coordinates for retina displays
    scaled_x, scaled_y = _scale_coordinates_for_display(x, y)
    # moveTo() only repositions the pointer; it accepts no ``clicks`` keyword
    # (that parameter belongs to click()), so none is passed here.
    pyautogui.moveTo(x=scaled_x, y=scaled_y)
    return {"success": True, "message": f"Moved mouse pointer to ({x}, {y})"}


@mcp.tool()
def mouse_single_click(x: int, y: int) -> Dict[str, Any]:
    """Issue a one-click ``pyautogui.click`` at the given coordinates.

    The coordinates are first passed through ``_scale_coordinates_for_display``.

    Raises:
        ValueError: If ``x`` or ``y`` is ``None``.
    """
    coordinates_missing = x is None or y is None
    if coordinates_missing:
        raise ValueError("x and y coordinates are required")

    target_x, target_y = _scale_coordinates_for_display(x, y)
    pyautogui.click(x=target_x, y=target_y, clicks=1)

    outcome = {"success": True, "message": f"Single clicked at ({x}, {y})"}
    return outcome


@mcp.tool()
def mouse_double_click(x: int, y: int) -> Dict[str, Any]:
    """Issue a two-click ``pyautogui.click`` at the given coordinates.

    The coordinates are first passed through ``_scale_coordinates_for_display``.

    Raises:
        ValueError: If ``x`` or ``y`` is ``None``.
    """
    coordinates_missing = x is None or y is None
    if coordinates_missing:
        raise ValueError("x and y coordinates are required")

    target_x, target_y = _scale_coordinates_for_display(x, y)
    pyautogui.click(x=target_x, y=target_y, clicks=2)

    outcome = {"success": True, "message": f"Double clicked at ({x}, {y})"}
    return outcome


@mcp.tool()
def type_text(text: str) -> Dict[str, Any]:
    """Send ``text`` to ``pyautogui.write``.

    Raises:
        ValueError: If ``text`` is empty or otherwise falsy.
    """
    if text:
        pyautogui.write(text)
        return {"success": True, "message": f"Typed: {text}"}

    raise ValueError("text is required")


@mcp.tool()
def scroll(dx: int = 0, dy: int = 0) -> Dict[str, Any]:
    """Scroll vertically and/or horizontally.

    Args:
        dx: Horizontal amount, forwarded to ``pyautogui.hscroll`` as its
            ``clicks`` argument (positive = right, negative = left).
        dy: Vertical pixel delta (positive = down, negative = up), posted as a
            Quartz scroll-wheel event.

    Returns:
        A dict with ``success`` and ``message``. ``success`` is False, and
        nothing is scrolled, when ``dy`` is non-zero but the Quartz bindings
        failed to import.
    """
    if dy != 0 and not ACCESSIBILITY_AVAILABLE:
        # The Quartz names used below only exist when the optional pyobjc
        # import succeeded; report that instead of failing with NameError.
        return {
            "success": False,
            "message": "Vertical scrolling requires the Quartz framework (install pyobjc-framework-Quartz)",
        }

    if dy != 0:
        # Negated so that a positive dy scrolls down, as documented above.
        wheel_event = CGEventCreateScrollWheelEvent(None, kCGScrollEventUnitPixel, 1, -dy)
        CGEventPost(kCGHIDEventTap, wheel_event)
    if dx != 0:
        pyautogui.hscroll(clicks=dx)
    return {"success": True, "message": f"Scrolled dx={dx}, dy={dy}"}


@mcp.tool()
def play_sound_for_user_prompt() -> Dict[str, Any]:
    """Run ``osascript -e beep`` to get the user's attention.

    Returns:
        A dict with ``success`` and ``message``; on a non-zero exit status the
        message carries the stripped stderr of ``osascript``.
    """
    beep = subprocess.run(["osascript", "-e", "beep"], capture_output=True, text=True)

    if beep.returncode == 0:
        return {"success": True, "message": "System bell played"}

    failure_text = f"Failed to play system bell: {beep.stderr.strip()}"
    return {"success": False, "message": failure_text}


def _execute_applescript_keystroke(keystroke_command: str, description: str) -> Dict[str, Any]:
    """Run one System Events command through ``osascript``.

    Args:
        keystroke_command: AppleScript statement placed inside a
            ``tell application "System Events"`` block.
        description: Label echoed back in the success message.

    Returns:
        ``{"success": True, "message": "Executed: <description>"}``.

    Raises:
        RuntimeError: If ``osascript`` exits with a non-zero status.
    """
    applescript = (
        '\n'
        '    tell application "System Events"\n'
        f'        {keystroke_command}\n'
        '    end tell\n'
        '    '
    )

    completed = subprocess.run(["osascript", "-e", applescript], capture_output=True, text=True)

    if completed.returncode == 0:
        return {"success": True, "message": f"Executed: {description}"}

    raise RuntimeError(f"AppleScript error: {completed.stderr}")


@mcp.tool()
def keyboard_shortcut_return_key() -> Dict[str, Any]:
    """Send a Return (Enter) keystroke."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke return',
        description="Return key",
    )


@mcp.tool()
def keyboard_shortcut_escape_key() -> Dict[str, Any]:
    """Send the Escape key (AppleScript key code 53)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 53',
        description="Escape key",
    )


@mcp.tool()
def keyboard_shortcut_tab_key() -> Dict[str, Any]:
    """Send a Tab keystroke."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke tab',
        description="Tab key",
    )


@mcp.tool()
def keyboard_shortcut_space_key() -> Dict[str, Any]:
    """Send a single Space keystroke."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke " "',
        description="Space key",
    )


@mcp.tool()
def keyboard_shortcut_delete_key() -> Dict[str, Any]:
    """Send the Delete (backspace) key (AppleScript key code 51)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 51',
        description="Delete key",
    )


@mcp.tool()
def keyboard_shortcut_forward_delete_key() -> Dict[str, Any]:
    """Send the Forward Delete key (AppleScript key code 117)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 117',
        description="Forward Delete key",
    )


@mcp.tool()
def keyboard_shortcut_arrow_up() -> Dict[str, Any]:
    """Send the Up Arrow key (AppleScript key code 126)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 126',
        description="Up Arrow key",
    )


@mcp.tool()
def keyboard_shortcut_arrow_down() -> Dict[str, Any]:
    """Send the Down Arrow key (AppleScript key code 125)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 125',
        description="Down Arrow key",
    )


@mcp.tool()
def keyboard_shortcut_arrow_left() -> Dict[str, Any]:
    """Send the Left Arrow key (AppleScript key code 123)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 123',
        description="Left Arrow key",
    )


@mcp.tool()
def keyboard_shortcut_arrow_right() -> Dict[str, Any]:
    """Send the Right Arrow key (AppleScript key code 124)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 124',
        description="Right Arrow key",
    )


@mcp.tool()
def keyboard_shortcut_select_all() -> Dict[str, Any]:
    """Send the Cmd+A (select all) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "a" using {command down}',
        description="Select All (Cmd+A)",
    )


@mcp.tool()
def keyboard_shortcut_copy() -> Dict[str, Any]:
    """Send the Cmd+C (copy) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "c" using {command down}',
        description="Copy (Cmd+C)",
    )


@mcp.tool()
def keyboard_shortcut_paste() -> Dict[str, Any]:
    """Send the Cmd+V (paste) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "v" using {command down}',
        description="Paste (Cmd+V)",
    )


@mcp.tool()
def keyboard_shortcut_cut() -> Dict[str, Any]:
    """Send the Cmd+X (cut) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "x" using {command down}',
        description="Cut (Cmd+X)",
    )


@mcp.tool()
def keyboard_shortcut_undo() -> Dict[str, Any]:
    """Send the Cmd+Z (undo) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "z" using {command down}',
        description="Undo (Cmd+Z)",
    )


@mcp.tool()
def keyboard_shortcut_redo() -> Dict[str, Any]:
    """Send the Cmd+Shift+Z (redo) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "z" using {command down, shift down}',
        description="Redo (Cmd+Shift+Z)",
    )


@mcp.tool()
def keyboard_shortcut_save() -> Dict[str, Any]:
    """Send the Cmd+S (save) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "s" using {command down}',
        description="Save (Cmd+S)",
    )


@mcp.tool()
def keyboard_shortcut_new() -> Dict[str, Any]:
    """Send the Cmd+N (new document) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "n" using {command down}',
        description="New (Cmd+N)",
    )


@mcp.tool()
def keyboard_shortcut_open() -> Dict[str, Any]:
    """Send the Cmd+O (open document) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "o" using {command down}',
        description="Open (Cmd+O)",
    )


@mcp.tool()
def keyboard_shortcut_find() -> Dict[str, Any]:
    """Send the Cmd+F (find) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "f" using {command down}',
        description="Find (Cmd+F)",
    )


@mcp.tool()
def keyboard_shortcut_close_window() -> Dict[str, Any]:
    """Send the Cmd+W (close window) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "w" using {command down}',
        description="Close Window (Cmd+W)",
    )


@mcp.tool()
def keyboard_shortcut_quit_app() -> Dict[str, Any]:
    """Send the Cmd+Q (quit application) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "q" using {command down}',
        description="Quit App (Cmd+Q)",
    )


@mcp.tool()
def keyboard_shortcut_minimize_window() -> Dict[str, Any]:
    """Send the Cmd+M (minimize window) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "m" using {command down}',
        description="Minimize Window (Cmd+M)",
    )


@mcp.tool()
def keyboard_shortcut_hide_app() -> Dict[str, Any]:
    """Send the Cmd+H (hide application) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "h" using {command down}',
        description="Hide App (Cmd+H)",
    )


@mcp.tool()
def keyboard_shortcut_switch_app_forward() -> Dict[str, Any]:
    """Send the Cmd+Tab (next application) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke tab using {command down}',
        description="Switch App Forward (Cmd+Tab)",
    )


@mcp.tool()
def keyboard_shortcut_switch_app_backward() -> Dict[str, Any]:
    """Send the Cmd+Shift+Tab (previous application) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke tab using {command down, shift down}',
        description="Switch App Backward (Cmd+Shift+Tab)",
    )


@mcp.tool()
def keyboard_shortcut_spotlight_search() -> Dict[str, Any]:
    """Send the Cmd+Space (Spotlight search) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke " " using {command down}',
        description="Spotlight Search (Cmd+Space)",
    )


@mcp.tool()
def keyboard_shortcut_force_quit() -> Dict[str, Any]:
    """Send the Cmd+Option+Esc (Force Quit dialog) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 53 using {command down, option down}',
        description="Force Quit (Cmd+Option+Esc)",
    )


@mcp.tool()
def keyboard_shortcut_refresh() -> Dict[str, Any]:
    """Send the Cmd+R (refresh/reload) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "r" using {command down}',
        description="Refresh (Cmd+R)",
    )


@mcp.tool()
def focus_app(app_name: str, timeout: int = 30) -> Dict[str, Any]:
    """Bring the specified application to the foreground and wait for it to become active.

    Args:
        app_name: Name of the application to focus.
        timeout: Maximum time in seconds to wait for the app to become active
            (default: 30).

    Returns:
        A dict with ``success`` and ``message``. On success it also carries
        ``elapsed_time`` and ``active_app``; on timeout it carries ``timeout``,
        ``last_active_app``, ``elapsed_time`` and ``last_error`` (the text of
        the most recent exception raised while polling, or None).

    Raises:
        ValueError: If ``app_name`` is empty or ``timeout`` is not positive.
    """
    if not app_name:
        raise ValueError("app_name is required")

    if timeout <= 0:
        raise ValueError("timeout must be positive")

    # app_name is caller-supplied: escape backslashes and double quotes so it
    # cannot close the AppleScript string literal and inject extra statements.
    escaped_name = app_name.replace("\\", "\\\\").replace('"', '\\"')
    result = subprocess.run(
        ["osascript", "-e", f'tell application "{escaped_name}" to activate'],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        return {
            "success": False,
            "message": f"Failed to activate app '{app_name}': {result.stderr.strip()}",
        }

    wanted = app_name.lower()
    # Monotonic clock: the wait must not be distorted by wall-clock changes.
    start_time = time.monotonic()
    last_active_app = None
    last_error = None

    while time.monotonic() - start_time < timeout:
        try:
            current_name = None
            details = None

            if ACCESSIBILITY_AVAILABLE:
                # Use Cocoa NSWorkspace to check the active app
                active_app = NSWorkspace.sharedWorkspace().activeApplication()
                if active_app:
                    current_name = active_app.get("NSApplicationName", "")
                    details = {
                        "name": current_name,
                        "bundle_id": active_app.get("NSApplicationBundleIdentifier", "Unknown"),
                        "pid": active_app.get("NSApplicationProcessIdentifier", -1),
                    }
            else:
                # Fallback: ask System Events for the frontmost process name
                check_script = 'tell application "System Events" to get name of first application process whose frontmost is true'
                check_result = subprocess.run(
                    ["osascript", "-e", check_script],
                    capture_output=True,
                    text=True,
                )
                if check_result.returncode == 0:
                    current_name = check_result.stdout.strip()
                    details = {"name": current_name}

            if current_name is not None:
                if current_name.lower() == wanted:
                    elapsed_time = round(time.monotonic() - start_time, 2)
                    return {
                        "success": True,
                        "message": f"Successfully focused '{app_name}' (took {elapsed_time}s)",
                        "elapsed_time": elapsed_time,
                        "active_app": details,
                    }
                last_active_app = current_name

        except Exception as exc:
            # Keep polling even if a check fails, but remember the reason so a
            # timeout response can explain it instead of hiding it.
            last_error = str(exc)

        # Wait a bit before checking again
        time.sleep(0.5)

    # Timeout reached
    return {
        "success": False,
        "message": f"Timeout waiting for '{app_name}' to become active after {timeout}s",
        "timeout": timeout,
        "last_active_app": last_active_app,
        "elapsed_time": timeout,
        "last_error": last_error,
    }


@mcp.tool()
def get_screen_layout() -> str:
    """Return a JSON report of the visible windows and the active application.

    Delegates to ``_get_screen_content_accessibility``.
    """
    layout_json = _get_screen_content_accessibility()
    return layout_json


@mcp.tool()
def get_screen_text() -> str:
    """Return a JSON report of the text found on screen by OCR.

    Delegates to ``_get_screen_content_ocr``.
    """
    text_json = _get_screen_content_ocr()
    return text_json


def _get_screen_content_accessibility() -> str:
    """Describe the active app and on-screen windows as a JSON string.

    Uses NSWorkspace for the active application, the Quartz on-screen window
    list for windows, and a pyautogui screenshot for the screen size. Each of
    those three steps records its own ``*_error`` entry on failure instead of
    aborting the report.

    Returns:
        JSON text with ``success``, ``message`` and, on success,
        ``screen_info``; on failure, ``error``.
    """
    if not ACCESSIBILITY_AVAILABLE:
        unavailable = {
            "success": False,
            "error": "macOS accessibility frameworks not available",
            "message": "Install pyobjc-framework-Cocoa and pyobjc-framework-Quartz"
        }
        return json.dumps(unavailable, indent=2)

    try:
        date_output = subprocess.run(["date"], capture_output=True, text=True).stdout
        visible_windows = []
        report = {
            "mode": "accessibility",
            "timestamp": str(date_output.strip()),
            "windows": visible_windows,
            "active_app": None
        }

        # Active application, via NSWorkspace.
        try:
            frontmost = NSWorkspace.sharedWorkspace().activeApplication()
            if frontmost:
                report["active_app"] = {
                    "name": frontmost.get("NSApplicationName", "Unknown"),
                    "bundle_id": frontmost.get("NSApplicationBundleIdentifier", "Unknown"),
                    "pid": frontmost.get("NSApplicationProcessIdentifier", -1)
                }
        except Exception as exc:
            report["active_app_error"] = str(exc)

        # On-screen windows, via the Quartz window list.
        try:
            for entry in CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID):
                title = entry.get('kCGWindowName', '')
                bounds = entry.get('kCGWindowBounds', {})

                # Ignore untitled windows and anything not larger than 50x50.
                if not title or bounds.get('Width', 0) <= 50 or bounds.get('Height', 0) <= 50:
                    continue

                visible_windows.append({
                    "title": title,
                    "app": entry.get('kCGWindowOwnerName', 'Unknown'),
                    "bounds": {
                        "x": int(bounds.get('X', 0)),
                        "y": int(bounds.get('Y', 0)),
                        "width": int(bounds.get('Width', 0)),
                        "height": int(bounds.get('Height', 0))
                    },
                    "layer": entry.get('kCGWindowLayer', 0),
                    "pid": entry.get('kCGWindowOwnerPID', -1)
                })
        except Exception as exc:
            report["windows_error"] = str(exc)

        # Order by the window layer number, ascending.
        visible_windows.sort(key=lambda win: win.get("layer", 0))

        # Screen size, taken from the dimensions of a screenshot.
        try:
            shot = pyautogui.screenshot()
            report["screen_size"] = {
                "width": shot.width,
                "height": shot.height
            }
        except Exception as exc:
            report["screen_size_error"] = str(exc)

        payload = {
            "success": True,
            "screen_info": report,
            "message": f"Found {len(visible_windows)} visible windows"
        }
        return json.dumps(payload, indent=2)

    except Exception as exc:
        failure = {
            "success": False,
            "error": str(exc),
            "message": "Failed to get screen content using accessibility"
        }
        return json.dumps(failure, indent=2)


def _get_screen_content_ocr() -> str:
    """Get screen content using OCR to read all text on screen."""
    try:
        # Take screenshot
        screenshot = pyautogui.screenshot()
        
        # Convert PIL Image to numpy array for easyocr
        screenshot_array = np.array(screenshot)
        
        # Use OCR to extract all text
        results = reader.readtext(screenshot_array)
        
        screen_info = {
            "mode": "ocr",
            "timestamp": str(subprocess.run(["date"], capture_output=True, text=True).stdout.strip()),
            "screen_size": {
                "width": screenshot.width,
                "height": screenshot.height
            },
            "text_elements": [],
            "full_text": ""
        }
        
        all_text_lines = []
        
        for (bbox, detected_text, confidence) in results:
            if confidence > 0.3:  # Lower threshold for general screen reading
                x1, y1 = bbox[0]
                x2, y2 = bbox[2]
                center_x = int((x1 + x2) / 2)
                center_y = int((y1 + y2) / 2)
                
                text_element = {
                    "text": detected_text.strip(),
                    "confidence": round(confidence, 3),
                    "position": {
                        "center_x": center_x,
                        "center_y": center_y,
                        "bbox": [[int(point[0]), int(point[1])] for point in bbox]
                    }
                }
                
                screen_info["text_elements"].append(text_element)
                all_text_lines.append(detected_text.strip())
        
        # Sort text elements by vertical position (top to bottom, then left to right)
        screen_info["text_elements"].sort(key=lambda x: (x["position"]["center_y"], x["position"]["center_x"]))
        
        # Create full text representation
        screen_info["full_text"] = "\n".join([elem["text"] for elem in screen_info["text_elements"]])
        
        return json.dumps({
            "success": True,
            "screen_info": screen_info,
            "message": f"Found {len(screen_info['text_elements'])} text elements on screen"
        }, indent=2)
        
    except Exception as e:
        return json.dumps({
            "success": False,
            "error": str(e),
            "message": "Failed to get screen content using OCR"
        }, indent=2)



@mcp.tool()
def get_available_apps() -> str:
    """List the names of processes that are not background-only, as JSON.

    The names come from a System Events AppleScript query run via ``osascript``.

    Raises:
        RuntimeError: If ``osascript`` exits with a non-zero status.
    """
    script = '''
    tell application "System Events"
        get name of (processes where background only is false)
    end tell
    '''

    listing = subprocess.run(["osascript", "-e", script], capture_output=True, text=True)

    if listing.returncode == 0:
        # osascript prints the list as one comma-separated line.
        names = list(map(str.strip, listing.stdout.split(", ")))
        return json.dumps({"success": True, "apps": names}, indent=2)

    raise RuntimeError(f"Failed to get apps: {listing.stderr}")


def main():
    """Start the MCP server by calling ``run()`` on the module-level ``mcp``."""
    server = mcp
    server.run()


# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/digithree/automac-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

automac_mcp.py•21.3 KiB

#!/usr/bin/env python3

import subprocess
import json
import time
from typing import Any, Dict, List, Optional
import pyautogui
import easyocr
import numpy as np
from mcp.server.fastmcp import FastMCP
try:
    from Cocoa import NSWorkspace
    from Quartz import CGWindowListCopyWindowInfo, kCGWindowListOptionOnScreenOnly, kCGNullWindowID, CGEventCreateScrollWheelEvent, CGEventPost, kCGScrollEventUnitPixel, kCGHIDEventTap
    from ApplicationServices import AXUIElementCreateApplication, AXUIElementCopyAttributeValue, kAXWindowsAttribute, kAXTitleAttribute, kAXPositionAttribute, kAXSizeAttribute, kAXRoleAttribute
    ACCESSIBILITY_AVAILABLE = True
except ImportError:
    ACCESSIBILITY_AVAILABLE = False

# Create the FastMCP server instance; the @mcp.tool() decorators in this file
# register their functions on it.
mcp = FastMCP("AutoMac MCP - macOS UI Automation")

# Build the shared easyocr reader for English at import time, and set
# pyautogui's FAILSAFE flag.
# NOTE(review): presumably FAILSAFE aborts automation when the pointer is moved
# to a screen corner -- confirm against the PyAutoGUI documentation.
reader = easyocr.Reader(['en'])
pyautogui.FAILSAFE = True


def _scale_coordinates_for_display(x: int, y: int) -> tuple[int, int]:
    """Scale coordinates for retina/high-DPI displays."""
    try:
        # Get the actual screen size from pyautogui
        screen_width, screen_height = pyautogui.size()
        
        # Take a screenshot to get the logical size
        screenshot = pyautogui.screenshot()
        logical_width, logical_height = screenshot.size
        
        # Calculate scaling factors
        scale_x = screen_width / logical_width
        scale_y = screen_height / logical_height
        
        # Scale the coordinates
        scaled_x = int(x * scale_x)
        scaled_y = int(y * scale_y)
        
        return scaled_x, scaled_y
    except Exception:
        # If scaling fails, return original coordinates
        return x, y

@mcp.tool()
def get_screen_size() -> Dict[str, Any]:
    """Report the screen size returned by ``pyautogui.size()``.

    Returns:
        A dict with ``success``, a human-readable ``message``, and the numeric
        ``width`` and ``height`` so callers do not have to parse the message.
    """
    screen_width, screen_height = pyautogui.size()
    return {
        "success": True,
        "message": f"Screen size = ({screen_width}, {screen_height})",
        "width": screen_width,
        "height": screen_height,
    }


@mcp.tool()
def mouse_move(x: int, y: int) -> Dict[str, Any]:
    """Move the mouse pointer to the specified screen coordinates without clicking.

    Args:
        x: Horizontal coordinate; passed through ``_scale_coordinates_for_display``.
        y: Vertical coordinate; passed through ``_scale_coordinates_for_display``.

    Returns:
        A dict with ``success`` and a ``message`` echoing the requested coordinates.

    Raises:
        ValueError: If ``x`` or ``y`` is ``None``.
    """
    if x is None or y is None:
        raise ValueError("x and y coordinates are required")

    # Scale coordinates for retina displays
    scaled_x, scaled_y = _scale_coordinates_for_display(x, y)
    # moveTo() only repositions the pointer; it accepts no ``clicks`` keyword
    # (that parameter belongs to click()), so none is passed here.
    pyautogui.moveTo(x=scaled_x, y=scaled_y)
    return {"success": True, "message": f"Moved mouse pointer to ({x}, {y})"}


@mcp.tool()
def mouse_single_click(x: int, y: int) -> Dict[str, Any]:
    """Issue a one-click ``pyautogui.click`` at the given coordinates.

    The coordinates are first passed through ``_scale_coordinates_for_display``.

    Raises:
        ValueError: If ``x`` or ``y`` is ``None``.
    """
    coordinates_missing = x is None or y is None
    if coordinates_missing:
        raise ValueError("x and y coordinates are required")

    target_x, target_y = _scale_coordinates_for_display(x, y)
    pyautogui.click(x=target_x, y=target_y, clicks=1)

    outcome = {"success": True, "message": f"Single clicked at ({x}, {y})"}
    return outcome


@mcp.tool()
def mouse_double_click(x: int, y: int) -> Dict[str, Any]:
    """Issue a two-click ``pyautogui.click`` at the given coordinates.

    The coordinates are first passed through ``_scale_coordinates_for_display``.

    Raises:
        ValueError: If ``x`` or ``y`` is ``None``.
    """
    coordinates_missing = x is None or y is None
    if coordinates_missing:
        raise ValueError("x and y coordinates are required")

    target_x, target_y = _scale_coordinates_for_display(x, y)
    pyautogui.click(x=target_x, y=target_y, clicks=2)

    outcome = {"success": True, "message": f"Double clicked at ({x}, {y})"}
    return outcome


@mcp.tool()
def type_text(text: str) -> Dict[str, Any]:
    """Send ``text`` to ``pyautogui.write``.

    Raises:
        ValueError: If ``text`` is empty or otherwise falsy.
    """
    if text:
        pyautogui.write(text)
        return {"success": True, "message": f"Typed: {text}"}

    raise ValueError("text is required")


@mcp.tool()
def scroll(dx: int = 0, dy: int = 0) -> Dict[str, Any]:
    """Scroll vertically and/or horizontally.

    Args:
        dx: Horizontal amount, forwarded to ``pyautogui.hscroll`` as its
            ``clicks`` argument (positive = right, negative = left).
        dy: Vertical pixel delta (positive = down, negative = up), posted as a
            Quartz scroll-wheel event.

    Returns:
        A dict with ``success`` and ``message``. ``success`` is False, and
        nothing is scrolled, when ``dy`` is non-zero but the Quartz bindings
        failed to import.
    """
    if dy != 0 and not ACCESSIBILITY_AVAILABLE:
        # The Quartz names used below only exist when the optional pyobjc
        # import succeeded; report that instead of failing with NameError.
        return {
            "success": False,
            "message": "Vertical scrolling requires the Quartz framework (install pyobjc-framework-Quartz)",
        }

    if dy != 0:
        # Negated so that a positive dy scrolls down, as documented above.
        wheel_event = CGEventCreateScrollWheelEvent(None, kCGScrollEventUnitPixel, 1, -dy)
        CGEventPost(kCGHIDEventTap, wheel_event)
    if dx != 0:
        pyautogui.hscroll(clicks=dx)
    return {"success": True, "message": f"Scrolled dx={dx}, dy={dy}"}


@mcp.tool()
def play_sound_for_user_prompt() -> Dict[str, Any]:
    """Run ``osascript -e beep`` to get the user's attention.

    Returns:
        A dict with ``success`` and ``message``; on a non-zero exit status the
        message carries the stripped stderr of ``osascript``.
    """
    beep = subprocess.run(["osascript", "-e", "beep"], capture_output=True, text=True)

    if beep.returncode == 0:
        return {"success": True, "message": "System bell played"}

    failure_text = f"Failed to play system bell: {beep.stderr.strip()}"
    return {"success": False, "message": failure_text}


def _execute_applescript_keystroke(keystroke_command: str, description: str) -> Dict[str, Any]:
    """Run one System Events command through ``osascript``.

    Args:
        keystroke_command: AppleScript statement placed inside a
            ``tell application "System Events"`` block.
        description: Label echoed back in the success message.

    Returns:
        ``{"success": True, "message": "Executed: <description>"}``.

    Raises:
        RuntimeError: If ``osascript`` exits with a non-zero status.
    """
    applescript = (
        '\n'
        '    tell application "System Events"\n'
        f'        {keystroke_command}\n'
        '    end tell\n'
        '    '
    )

    completed = subprocess.run(["osascript", "-e", applescript], capture_output=True, text=True)

    if completed.returncode == 0:
        return {"success": True, "message": f"Executed: {description}"}

    raise RuntimeError(f"AppleScript error: {completed.stderr}")


@mcp.tool()
def keyboard_shortcut_return_key() -> Dict[str, Any]:
    """Send a Return (Enter) keystroke."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke return',
        description="Return key",
    )


@mcp.tool()
def keyboard_shortcut_escape_key() -> Dict[str, Any]:
    """Send the Escape key (AppleScript key code 53)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 53',
        description="Escape key",
    )


@mcp.tool()
def keyboard_shortcut_tab_key() -> Dict[str, Any]:
    """Send a Tab keystroke."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke tab',
        description="Tab key",
    )


@mcp.tool()
def keyboard_shortcut_space_key() -> Dict[str, Any]:
    """Send a single Space keystroke."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke " "',
        description="Space key",
    )


@mcp.tool()
def keyboard_shortcut_delete_key() -> Dict[str, Any]:
    """Send the Delete (backspace) key (AppleScript key code 51)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 51',
        description="Delete key",
    )


@mcp.tool()
def keyboard_shortcut_forward_delete_key() -> Dict[str, Any]:
    """Send the Forward Delete key (AppleScript key code 117)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 117',
        description="Forward Delete key",
    )


@mcp.tool()
def keyboard_shortcut_arrow_up() -> Dict[str, Any]:
    """Send the Up Arrow key (AppleScript key code 126)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 126',
        description="Up Arrow key",
    )


@mcp.tool()
def keyboard_shortcut_arrow_down() -> Dict[str, Any]:
    """Send the Down Arrow key (AppleScript key code 125)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 125',
        description="Down Arrow key",
    )


@mcp.tool()
def keyboard_shortcut_arrow_left() -> Dict[str, Any]:
    """Send the Left Arrow key (AppleScript key code 123)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 123',
        description="Left Arrow key",
    )


@mcp.tool()
def keyboard_shortcut_arrow_right() -> Dict[str, Any]:
    """Send the Right Arrow key (AppleScript key code 124)."""
    return _execute_applescript_keystroke(
        keystroke_command='key code 124',
        description="Right Arrow key",
    )


@mcp.tool()
def keyboard_shortcut_select_all() -> Dict[str, Any]:
    """Send the Cmd+A (select all) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "a" using {command down}',
        description="Select All (Cmd+A)",
    )


@mcp.tool()
def keyboard_shortcut_copy() -> Dict[str, Any]:
    """Send the Cmd+C (copy) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "c" using {command down}',
        description="Copy (Cmd+C)",
    )


@mcp.tool()
def keyboard_shortcut_paste() -> Dict[str, Any]:
    """Send the Cmd+V (paste) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "v" using {command down}',
        description="Paste (Cmd+V)",
    )


@mcp.tool()
def keyboard_shortcut_cut() -> Dict[str, Any]:
    """Send the Cmd+X (cut) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "x" using {command down}',
        description="Cut (Cmd+X)",
    )


@mcp.tool()
def keyboard_shortcut_undo() -> Dict[str, Any]:
    """Send the Cmd+Z (undo) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "z" using {command down}',
        description="Undo (Cmd+Z)",
    )


@mcp.tool()
def keyboard_shortcut_redo() -> Dict[str, Any]:
    """Send the Cmd+Shift+Z (redo) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "z" using {command down, shift down}',
        description="Redo (Cmd+Shift+Z)",
    )


@mcp.tool()
def keyboard_shortcut_save() -> Dict[str, Any]:
    """Send the Cmd+S (save) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "s" using {command down}',
        description="Save (Cmd+S)",
    )


@mcp.tool()
def keyboard_shortcut_new() -> Dict[str, Any]:
    """Send the Cmd+N (new document) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "n" using {command down}',
        description="New (Cmd+N)",
    )


@mcp.tool()
def keyboard_shortcut_open() -> Dict[str, Any]:
    """Send the Cmd+O (open document) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "o" using {command down}',
        description="Open (Cmd+O)",
    )


@mcp.tool()
def keyboard_shortcut_find() -> Dict[str, Any]:
    """Send the Cmd+F (find) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "f" using {command down}',
        description="Find (Cmd+F)",
    )


@mcp.tool()
def keyboard_shortcut_close_window() -> Dict[str, Any]:
    """Send the Cmd+W (close window) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "w" using {command down}',
        description="Close Window (Cmd+W)",
    )


@mcp.tool()
def keyboard_shortcut_quit_app() -> Dict[str, Any]:
    """Send the Cmd+Q (quit application) shortcut."""
    return _execute_applescript_keystroke(
        keystroke_command='keystroke "q" using {command down}',
        description="Quit App (Cmd+Q)",
    )


@mcp.tool()
def keyboard_shortcut_minimize_window() -> Dict[str, Any]:
    """Minimize current window (Cmd+M).

    Returns:
        The result dict produced by ``_execute_applescript_keystroke``.
    """
    key_command = 'keystroke "m" using {command down}'
    action_label = "Minimize Window (Cmd+M)"
    return _execute_applescript_keystroke(key_command, action_label)


@mcp.tool()
def keyboard_shortcut_hide_app() -> Dict[str, Any]:
    """Hide current application (Cmd+H).

    Returns:
        The result dict produced by ``_execute_applescript_keystroke``.
    """
    key_command = 'keystroke "h" using {command down}'
    action_label = "Hide App (Cmd+H)"
    return _execute_applescript_keystroke(key_command, action_label)


@mcp.tool()
def keyboard_shortcut_switch_app_forward() -> Dict[str, Any]:
    """Switch to next application (Cmd+Tab).

    Returns:
        The result dict produced by ``_execute_applescript_keystroke``.
    """
    key_command = 'keystroke tab using {command down}'
    action_label = "Switch App Forward (Cmd+Tab)"
    return _execute_applescript_keystroke(key_command, action_label)


@mcp.tool()
def keyboard_shortcut_switch_app_backward() -> Dict[str, Any]:
    """Switch to previous application (Cmd+Shift+Tab).

    Returns:
        The result dict produced by ``_execute_applescript_keystroke``.
    """
    key_command = 'keystroke tab using {command down, shift down}'
    action_label = "Switch App Backward (Cmd+Shift+Tab)"
    return _execute_applescript_keystroke(key_command, action_label)


@mcp.tool()
def keyboard_shortcut_spotlight_search() -> Dict[str, Any]:
    """Open Spotlight search (Cmd+Space).

    Returns:
        The result dict produced by ``_execute_applescript_keystroke``.
    """
    key_command = 'keystroke " " using {command down}'
    action_label = "Spotlight Search (Cmd+Space)"
    return _execute_applescript_keystroke(key_command, action_label)


@mcp.tool()
def keyboard_shortcut_force_quit() -> Dict[str, Any]:
    """Open Force Quit dialog (Cmd+Option+Esc).

    The key is sent by key code (53) rather than as a typed character.

    Returns:
        The result dict produced by ``_execute_applescript_keystroke``.
    """
    key_command = 'key code 53 using {command down, option down}'
    action_label = "Force Quit (Cmd+Option+Esc)"
    return _execute_applescript_keystroke(key_command, action_label)


@mcp.tool()
def keyboard_shortcut_refresh() -> Dict[str, Any]:
    """Refresh/Reload (Cmd+R).

    Returns:
        The result dict produced by ``_execute_applescript_keystroke``.
    """
    key_command = 'keystroke "r" using {command down}'
    action_label = "Refresh (Cmd+R)"
    return _execute_applescript_keystroke(key_command, action_label)


@mcp.tool()
def focus_app(app_name: str, timeout: int = 30) -> Dict[str, Any]:
    """Bring the specified application to the foreground and wait for it to become active.

    The app is activated through ``osascript``; the frontmost application is
    then polled every 0.5 seconds until its name matches ``app_name``
    (case-insensitively) or ``timeout`` seconds have passed.

    Args:
        app_name: Name of the application to focus
        timeout: Maximum time to wait for app to become active (default: 30 seconds)

    Returns:
        A dict with ``success`` and ``message``. On success it also carries
        ``elapsed_time`` and ``active_app``; on timeout it carries ``timeout``,
        ``last_active_app``, ``elapsed_time`` and, when a polling attempt
        raised, ``last_error``.

    Raises:
        ValueError: If ``app_name`` is empty or ``timeout`` is not positive.
    """
    if not app_name:
        raise ValueError("app_name is required")

    if timeout <= 0:
        raise ValueError("timeout must be positive")

    # Escape backslashes and double quotes so the name cannot terminate the
    # AppleScript string literal and inject additional script text.
    escaped_name = app_name.replace("\\", "\\\\").replace('"', '\\"')
    script = f'tell application "{escaped_name}" to activate'

    result = subprocess.run(
        ["osascript", "-e", script],
        capture_output=True,
        text=True
    )

    if result.returncode != 0:
        return {
            "success": False,
            "message": f"Failed to activate app '{app_name}': {result.stderr.strip()}"
        }

    # Wait for the app to become the active application. A monotonic clock
    # keeps the timeout correct even if the system clock is adjusted.
    target_name = app_name.lower()
    start_time = time.monotonic()
    last_active_app = None
    last_error = None

    while time.monotonic() - start_time < timeout:
        try:
            active_info = None
            if ACCESSIBILITY_AVAILABLE:
                # Use Cocoa NSWorkspace to check active app
                active_app = NSWorkspace.sharedWorkspace().activeApplication()
                if active_app:
                    active_info = {
                        "name": active_app.get("NSApplicationName", ""),
                        "bundle_id": active_app.get("NSApplicationBundleIdentifier", "Unknown"),
                        "pid": active_app.get("NSApplicationProcessIdentifier", -1)
                    }
            else:
                # Fallback: use AppleScript to check frontmost app
                check_script = 'tell application "System Events" to get name of first application process whose frontmost is true'
                check_result = subprocess.run(
                    ["osascript", "-e", check_script],
                    capture_output=True,
                    text=True
                )
                if check_result.returncode == 0:
                    active_info = {"name": check_result.stdout.strip()}

            if active_info is not None:
                if active_info["name"].lower() == target_name:
                    elapsed_time = round(time.monotonic() - start_time, 2)
                    return {
                        "success": True,
                        "message": f"Successfully focused '{app_name}' (took {elapsed_time}s)",
                        "elapsed_time": elapsed_time,
                        "active_app": active_info
                    }
                last_active_app = active_info["name"]

        except Exception as e:
            # Best effort: keep polling, but remember the failure so a timeout
            # response can explain why the active app could not be read.
            last_error = str(e)

        # Wait a bit before checking again
        time.sleep(0.5)

    # Timeout reached
    response = {
        "success": False,
        "message": f"Timeout waiting for '{app_name}' to become active after {timeout}s",
        "timeout": timeout,
        "last_active_app": last_active_app,
        "elapsed_time": timeout
    }
    if last_error is not None:
        response["last_error"] = last_error
    return response


@mcp.tool()
def get_screen_layout() -> str:
    """Get information about windows and applications currently visible on the screen.

    Returns:
        The JSON string produced by ``_get_screen_content_accessibility``.
    """
    layout_json = _get_screen_content_accessibility()
    return layout_json


@mcp.tool()
def get_screen_text() -> str:
    """Get all text currently visible on the screen using OCR.

    Returns:
        The JSON string produced by ``_get_screen_content_ocr``.
    """
    ocr_json = _get_screen_content_ocr()
    return ocr_json


def _get_screen_content_accessibility() -> str:
    """Describe the active app and on-screen windows as a JSON string.

    Each of the three lookups (active application, window list, screen size)
    is attempted independently; a failure in one is recorded under a
    ``*_error`` key of ``screen_info`` instead of aborting the others.

    Returns:
        A JSON document (indent 2). With ``success`` true it holds
        ``screen_info`` and a summary ``message``; with ``success`` false it
        holds ``error`` and ``message``.
    """
    if not ACCESSIBILITY_AVAILABLE:
        unavailable = {
            "success": False,
            "error": "macOS accessibility frameworks not available",
            "message": "Install pyobjc-framework-Cocoa and pyobjc-framework-Quartz"
        }
        return json.dumps(unavailable, indent=2)

    try:
        # The timestamp is whatever the system `date` command prints.
        date_proc = subprocess.run(["date"], capture_output=True, text=True)
        windows = []
        screen_info = {
            "mode": "accessibility",
            "timestamp": str(date_proc.stdout.strip()),
            "windows": windows,
            "active_app": None
        }

        # Active application, via NSWorkspace
        try:
            frontmost = NSWorkspace.sharedWorkspace().activeApplication()
            if frontmost:
                screen_info["active_app"] = {
                    "name": frontmost.get("NSApplicationName", "Unknown"),
                    "bundle_id": frontmost.get("NSApplicationBundleIdentifier", "Unknown"),
                    "pid": frontmost.get("NSApplicationProcessIdentifier", -1)
                }
        except Exception as exc:
            screen_info["active_app_error"] = str(exc)

        # On-screen windows, via Quartz. Entries are appended one at a time so
        # that windows collected before a failure are still reported.
        try:
            for entry in CGWindowListCopyWindowInfo(kCGWindowListOptionOnScreenOnly, kCGNullWindowID):
                title = entry.get('kCGWindowName', '')
                rect = entry.get('kCGWindowBounds', {})

                # Untitled windows are ignored
                if not title:
                    continue

                # So are windows not larger than 50 in both dimensions
                big_enough = rect.get('Width', 0) > 50 and rect.get('Height', 0) > 50
                if not big_enough:
                    continue

                windows.append({
                    "title": title,
                    "app": entry.get('kCGWindowOwnerName', 'Unknown'),
                    "bounds": {
                        "x": int(rect.get('X', 0)),
                        "y": int(rect.get('Y', 0)),
                        "width": int(rect.get('Width', 0)),
                        "height": int(rect.get('Height', 0))
                    },
                    "layer": entry.get('kCGWindowLayer', 0),
                    "pid": entry.get('kCGWindowOwnerPID', -1)
                })
        except Exception as exc:
            screen_info["windows_error"] = str(exc)

        # Order the collected windows by ascending layer value
        windows.sort(key=lambda info: info.get("layer", 0))

        # Screen size is taken from the dimensions of a fresh screenshot
        try:
            shot = pyautogui.screenshot()
            screen_info["screen_size"] = {
                "width": shot.width,
                "height": shot.height
            }
        except Exception as exc:
            screen_info["screen_size_error"] = str(exc)

        payload = {
            "success": True,
            "screen_info": screen_info,
            "message": f"Found {len(screen_info['windows'])} visible windows"
        }
        return json.dumps(payload, indent=2)

    except Exception as exc:
        failure = {
            "success": False,
            "error": str(exc),
            "message": "Failed to get screen content using accessibility"
        }
        return json.dumps(failure, indent=2)


def _get_screen_content_ocr() -> str:
    """Get screen content using OCR to read all text on screen.

    Takes a screenshot, runs the module-level ``reader`` over it and keeps
    every detection whose confidence is above 0.3. Kept elements are sorted by
    centre position (y first, then x) and their text is joined with newlines
    into ``full_text``.

    Returns:
        A JSON document (indent 2). With ``success`` true it holds
        ``screen_info`` (mode, timestamp, screen_size, text_elements,
        full_text) and a summary ``message``; with ``success`` false it holds
        ``error`` and ``message``.
    """
    try:
        # Take screenshot
        screenshot = pyautogui.screenshot()

        # Convert PIL Image to numpy array for easyocr
        screenshot_array = np.array(screenshot)

        # Use OCR to extract all text
        results = reader.readtext(screenshot_array)

        screen_info = {
            "mode": "ocr",
            "timestamp": str(subprocess.run(["date"], capture_output=True, text=True).stdout.strip()),
            "screen_size": {
                "width": screenshot.width,
                "height": screenshot.height
            },
            "text_elements": [],
            "full_text": ""
        }

        for (bbox, detected_text, confidence) in results:
            if confidence > 0.3:  # Lower threshold for general screen reading
                # Points 0 and 2 of the box are used as opposite corners
                # (presumably top-left and bottom-right, per EasyOCR's
                # corner ordering).
                x1, y1 = bbox[0]
                x2, y2 = bbox[2]
                center_x = int((x1 + x2) / 2)
                center_y = int((y1 + y2) / 2)

                text_element = {
                    "text": detected_text.strip(),
                    # Coerce to a built-in float: the OCR library may hand back
                    # a numpy scalar that json.dumps cannot serialize.
                    "confidence": round(float(confidence), 3),
                    "position": {
                        "center_x": center_x,
                        "center_y": center_y,
                        "bbox": [[int(point[0]), int(point[1])] for point in bbox]
                    }
                }

                screen_info["text_elements"].append(text_element)

        # Sort text elements by vertical position (top to bottom, then left to right)
        screen_info["text_elements"].sort(key=lambda x: (x["position"]["center_y"], x["position"]["center_x"]))

        # Create full text representation
        screen_info["full_text"] = "\n".join([elem["text"] for elem in screen_info["text_elements"]])

        return json.dumps({
            "success": True,
            "screen_info": screen_info,
            "message": f"Found {len(screen_info['text_elements'])} text elements on screen"
        }, indent=2)

    except Exception as e:
        return json.dumps({
            "success": False,
            "error": str(e),
            "message": "Failed to get screen content using OCR"
        }, indent=2)



@mcp.tool()
def get_available_apps() -> str:
    """Get a list of all running applications.

    Asks System Events for the names of all processes that are not
    background-only.

    Returns:
        A JSON document (indent 2) with ``success`` and ``apps``, the list of
        process names.

    Raises:
        RuntimeError: If ``osascript`` exits with a non-zero status.
    """
    script = '''
    tell application "System Events"
        get name of (processes where background only is false)
    end tell
    '''

    result = subprocess.run(
        ["osascript", "-e", script],
        capture_output=True,
        text=True
    )

    if result.returncode != 0:
        raise RuntimeError(f"Failed to get apps: {result.stderr}")

    # osascript prints the list as "A, B, C\n". Drop blank entries so that
    # empty output yields [] rather than [""].
    stripped_names = (part.strip() for part in result.stdout.split(", "))
    apps = [name for name in stripped_names if name]
    return json.dumps({"success": True, "apps": apps}, indent=2)


def main():
    """Start the MCP server; used as the module's entry point."""
    server = mcp
    server.run()


# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    main()