# IACR MCP Server
#
# MIT License
# Overview InspectNew Schema Related Servers Reviews Score
import asyncio
import base64
import math
import os
import platform
import shlex
import shutil
import tempfile
import time
from enum import StrEnum
from pathlib import Path
from typing import Literal, TypedDict
from uuid import uuid4

# Add import for PyAutoGUI
import pyautogui
from anthropic.types.beta import BetaToolComputerUse20241022Param

from .base import BaseAnthropicTool, ToolError, ToolResult
from .run import run

# Directory path for tool outputs. Nothing in this part of the file reads it —
# TODO confirm whether it is still used elsewhere.
OUTPUT_DIR = "/tmp/outputs"

# Per-character delay for the "type" action, in milliseconds; it is divided by
# 1000 before being passed to pyautogui.write() as the interval in seconds.
TYPING_DELAY_MS = 12
# Group size for chunked typing. Nothing in this part of the file reads it —
# presumably meant to be used together with chunks(); verify against callers.
TYPING_GROUP_SIZE = 50

# Closed set of action names accepted by ComputerTool.__call__; any other value
# makes __call__ raise ToolError("Invalid action: ...").
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "screenshot",
    "cursor_position",
]


class Resolution(TypedDict):
    """Width/height pair describing a screen resolution (used for scaling targets)."""

    width: int
    height: int


# sizes above XGA/WXGA are not recommended (see README.md)
# scale down to one of these targets if ComputerTool._scaling_enabled is set
# ComputerTool.scale_coordinates walks these in insertion order and stops at the
# first entry whose aspect ratio is within 0.02 of the real screen's ratio.
MAX_SCALING_TARGETS: dict[str, Resolution] = {
    "XGA": Resolution(width=1024, height=768),  # 4:3
    "WXGA": Resolution(width=1280, height=800),  # 16:10
    "FWXGA": Resolution(width=1366, height=768),  # ~16:9
}


class ScalingSource(StrEnum):
    """Which coordinate space a value passed to ComputerTool.scale_coordinates comes from."""

    # Value is in real screen pixels; scale_coordinates scales it down.
    COMPUTER = "computer"
    # Value came from the API (scaled space); scale_coordinates scales it up.
    API = "api"


class ComputerToolOptions(TypedDict):
    """Display options returned by ComputerTool.options and merged into the tool params."""

    display_height_px: int
    display_width_px: int
    display_number: int | None


def chunks(s: str, chunk_size: int) -> list[str]:
    """Split *s* into consecutive pieces of at most *chunk_size* characters.

    Only the last piece may be shorter than *chunk_size*; an empty string
    produces an empty list.
    """
    pieces: list[str] = []
    for start in range(0, len(s), chunk_size):
        stop = start + chunk_size
        pieces.append(s[start:stop])
    return pieces


def smooth_move_to(x, y, duration=1.2):
    """Glide the mouse pointer from its current position to ``(x, y)``.

    The pointer follows a straight line whose progress is shaped by an
    ease-in-out sine curve, so it starts and ends slowly.

    Args:
        x: Target X coordinate, in screen pixels.
        y: Target Y coordinate, in screen pixels.
        duration: Length of the animation in seconds. A value of zero or less
            skips the animation and jumps straight to the target.
    """
    start_x, start_y = pyautogui.position()
    dx = x - start_x
    dy = y - start_y

    # Guard against duration == 0, which would otherwise divide by zero when
    # the first elapsed-time sample is exactly 0.0.
    if duration > 0:
        # monotonic() cannot jump backwards/forwards on system clock changes.
        start_time = time.monotonic()

        while True:
            elapsed_time = time.monotonic() - start_time
            if elapsed_time > duration:
                break

            t = elapsed_time / duration
            eased_t = (1 - math.cos(t * math.pi)) / 2  # easeInOutSine function

            pyautogui.moveTo(start_x + dx * eased_t, start_y + dy * eased_t)

    # Ensure the mouse ends up exactly at the target (x, y)
    pyautogui.moveTo(x, y)


class ComputerTool(BaseAnthropicTool):
    """
    A tool that allows the agent to interact with the primary monitor's screen, keyboard, and mouse.
    The tool parameters are defined by Anthropic and are not editable.

    The screen size is read from PyAutoGUI when the tool is constructed.
    Coordinates exchanged with the API are expressed in a scaled-down space
    whenever ``_scaling_enabled`` is set and a matching entry exists in
    ``MAX_SCALING_TARGETS`` (see ``scale_coordinates``).
    """

    name: Literal["computer"] = "computer"
    api_type: Literal["computer_20241022"] = "computer_20241022"
    width: int
    height: int
    display_num: None  # Simplified to always be None since we're only using primary display

    _screenshot_delay = 2.0
    _scaling_enabled = True
    # Seconds a left_click_drag takes; an instantaneous drag is easy for
    # applications to miss and gives no visual feedback.
    _drag_duration = 0.5

    # Key spellings accepted from the API -> canonical PyAutoGUI key names.
    _KEY_ALIASES = {
        "pagedown": "pgdn",
        "pageup": "pgup",
        "return": "enter",
    }

    # Modifier spellings -> AppleScript modifier names (used as "<name> down").
    _APPLESCRIPT_MODIFIERS = {
        "command": "command",
        "cmd": "command",
        "super": "command",
        "control": "control",
        "ctrl": "control",
        "option": "option",
        "alt": "option",
        "shift": "shift",
    }

    @property
    def options(self) -> ComputerToolOptions:
        """Display options advertised to the API, in scaled coordinates."""
        width, height = self.scale_coordinates(
            ScalingSource.COMPUTER, self.width, self.height
        )
        return {
            "display_width_px": width,
            "display_height_px": height,
            "display_number": self.display_num,
        }

    def to_params(self) -> BetaToolComputerUse20241022Param:
        """Return the tool definition (name, type and display options) for the API."""
        return {"name": self.name, "type": self.api_type, **self.options}

    def __init__(self):
        super().__init__()
        self.width, self.height = pyautogui.size()
        self.display_num = None

    async def __call__(
        self,
        *,
        action: Action,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """Perform ``action`` and report the result.

        Args:
            action: One of the ``Action`` literals.
            text: Key combination (``"key"``) or literal text (``"type"``).
            coordinate: Target ``(x, y)`` in API (scaled) coordinates; required
                for ``"mouse_move"`` and ``"left_click_drag"``.
            **kwargs: Accepted and ignored.

        Returns:
            A ``ToolResult`` with the pointer position for
            ``"cursor_position"``; for every other action, the image block
            produced by ``screenshot()`` taken after the action.

        Raises:
            ToolError: If a required argument is missing, a coordinate is out
                of bounds, or ``action`` is not recognised.
        """
        print("action", action)
        print("text", text)
        print("coordinate", coordinate)

        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            x, y = self.scale_coordinates(
                ScalingSource.API, coordinate[0], coordinate[1]
            )

            if action == "mouse_move":
                smooth_move_to(x, y)
            else:
                # Drag from the *current* pointer position. Moving to the
                # target first (as was done previously) turns the drag into a
                # zero-length no-op.
                pyautogui.dragTo(x, y, duration=self._drag_duration, button="left")

        elif action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")

            if action == "key":
                self._press_keys(text)
            else:
                pyautogui.write(text, interval=TYPING_DELAY_MS / 1000)

        elif action in ("left_click", "right_click", "double_click", "middle_click"):
            # Yield to the event loop instead of blocking it while pausing.
            await asyncio.sleep(0.1)
            if action == "double_click":
                pyautogui.click()
                await asyncio.sleep(0.1)
                pyautogui.click()
            else:
                button = {
                    "left_click": "left",
                    "right_click": "right",
                    "middle_click": "middle",
                }[action]
                pyautogui.click(button=button)

        elif action == "screenshot":
            return self.screenshot()

        elif action == "cursor_position":
            x, y = pyautogui.position()
            x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
            return ToolResult(output=f"X={x},Y={y}")

        else:
            raise ToolError(f"Invalid action: {action}")

        # Every action that reaches this point is followed by a screenshot
        # (cursor_position and screenshot have already returned above).
        return self.screenshot()

    @classmethod
    def _normalize_key(cls, key: str) -> str:
        """Lower-case ``key``, drop underscores and map known aliases to PyAutoGUI names."""
        key = key.lower().replace("_", "")
        return cls._KEY_ALIASES.get(key, key)

    def _press_keys(self, text: str) -> None:
        """Press the key or ``+``-separated key combination described by ``text``."""
        is_macos = platform.system() == "Darwin"
        if is_macos:
            text = text.replace("super+", "command+")

        keys = [self._normalize_key(k) for k in text.split("+")]

        if len(keys) == 1:
            pyautogui.press(keys[0])
            return

        # On macOS hotkeys go through AppleScript when the combination can be
        # expressed there; anything else falls back to PyAutoGUI.
        if is_macos and self._applescript_hotkey(keys):
            return
        pyautogui.hotkey(*keys)

    def _applescript_hotkey(self, keys: list[str]) -> bool:
        """Send a hotkey through macOS System Events.

        Returns ``False`` without sending anything when the combination cannot
        be expressed as ``keystroke <char> using {<modifiers>}``: an unknown
        modifier, or a final key that is a named key such as ``tab`` (which
        ``keystroke`` would type out letter by letter).
        """
        import subprocess

        *modifiers, keystroke = keys
        try:
            using = ", ".join(
                f"{self._APPLESCRIPT_MODIFIERS[m]} down" for m in modifiers
            )
        except KeyError:
            return False

        if keystroke == "enter":
            keystroke_expr = "return"  # AppleScript's built-in return constant
        else:
            if keystroke == "space":
                keystroke = " "
            if len(keystroke) != 1:
                return False
            escaped = keystroke.replace("\\", "\\\\").replace('"', '\\"')
            keystroke_expr = f'"{escaped}"'

        script = (
            'tell application "System Events" to '
            f"keystroke {keystroke_expr} using {{{using}}}"
        )
        # Argument list (no shell) so quotes in the script cannot break or
        # inject into a command line. Failures are ignored, as before.
        subprocess.run(["osascript", "-e", script], check=False)
        return True

    def screenshot(self):
        """Take a screenshot of the current screen and return the base64 encoded image.

        Returns:
            An image content block:
            ``{"type": "image", "source": {"type": "base64",
            "media_type": "image/png", "data": <base64 str>}}``.

        Raises:
            ToolError: If no screenshot file was produced.
        """
        path = Path(tempfile.gettempdir()) / f"screenshot_{uuid4().hex}.png"

        try:
            image = pyautogui.screenshot()

            # print current file size before resizing
            image.save(str(path))
            print(f"Original file size: {os.path.getsize(path)} bytes")

            if self._scaling_enabled:
                from PIL import Image

                target_size = self.scale_coordinates(
                    ScalingSource.COMPUTER, self.width, self.height
                )
                # Resize with high-quality downsampling and overwrite the file.
                image.resize(target_size, Image.Resampling.LANCZOS).save(str(path))

            if not path.exists():
                raise ToolError("Failed to take screenshot")

            print(f"Optimized file size: {os.path.getsize(path)} bytes")
            base64_image = base64.b64encode(path.read_bytes()).decode()
        finally:
            # Remove the temporary file even when capture or encoding fails.
            path.unlink(missing_ok=True)

        return {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": base64_image,
            },
        }

    async def shell(self, command: str, take_screenshot=True) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot."""
        _, stdout, stderr = await run(command)
        base64_image = None

        if take_screenshot:
            # delay to let things settle before taking a screenshot
            await asyncio.sleep(self._screenshot_delay)
            # screenshot() is synchronous and returns an image block; pull the
            # base64 payload out of it.
            base64_image = self.screenshot()["source"]["data"]

        return ToolResult(output=stdout, error=stderr, base64_image=base64_image)

    def scale_coordinates(self, source: ScalingSource, x: int, y: int) -> tuple[int, int]:
        """Scale coordinates to a target maximum resolution.

        Args:
            source: ``ScalingSource.API`` to convert API coordinates up to
                real screen pixels; ``ScalingSource.COMPUTER`` to convert real
                screen pixels down to API coordinates.
            x: Horizontal coordinate (or width) to convert.
            y: Vertical coordinate (or height) to convert.

        Returns:
            The converted ``(x, y)``. The input is returned unchanged when
            scaling is disabled or no scaling target applies.

        Raises:
            ToolError: If an API coordinate exceeds the screen dimensions.
        """
        if not self._scaling_enabled:
            return x, y
        ratio = self.width / self.height
        target_dimension = None
        for dimension in MAX_SCALING_TARGETS.values():
            # allow some error in the aspect ratio - not all ratios are exactly 16:9
            if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
                # Only scale down, never up; either way stop at the first
                # target with a matching aspect ratio.
                if dimension["width"] < self.width:
                    target_dimension = dimension
                break
        if target_dimension is None:
            return x, y
        # should be less than 1
        x_scaling_factor = target_dimension["width"] / self.width
        y_scaling_factor = target_dimension["height"] / self.height
        if source == ScalingSource.API:
            if x > self.width or y > self.height:
                raise ToolError(f"Coordinates {x}, {y} are out of bounds")
            # scale up
            return round(x / x_scaling_factor), round(y / y_scaling_factor)
        # scale down
        return round(x * x_scaling_factor), round(y * y_scaling_factor)



async def ComputerUse__type(text: str):
    """Type ``text`` through a freshly created ComputerTool and return its result."""
    return await ComputerTool()(action="type", text=text)

async def ComputerUse__key(text: str):
    """Press the key or key combination ``text`` through a freshly created ComputerTool."""
    return await ComputerTool()(action="key", text=text)

async def ComputerUse__mouse_move(coordinate: tuple[int, int]):
    """Move the mouse to ``coordinate`` through a freshly created ComputerTool."""
    return await ComputerTool()(action="mouse_move", coordinate=coordinate)

async def ComputerUse__left_click():
    """Perform a left click through a freshly created ComputerTool."""
    return await ComputerTool()(action="left_click")

async def ComputerUse__right_click():
    """Perform a right click through a freshly created ComputerTool."""
    return await ComputerTool()(action="right_click")

async def ComputerUse__middle_click():
    """Perform a middle click through a freshly created ComputerTool."""
    return await ComputerTool()(action="middle_click")

async def ComputerUse__double_click():
    """Perform a double click through a freshly created ComputerTool."""
    return await ComputerTool()(action="double_click")

async def ComputerUse__left_click_drag(coordinate: tuple[int, int]):
    """Run the left-click-drag action towards ``coordinate`` through a freshly created ComputerTool."""
    return await ComputerTool()(action="left_click_drag", coordinate=coordinate)

async def ComputerUse__screenshot():
    """Run the screenshot action through a freshly created ComputerTool."""
    return await ComputerTool()(action="screenshot")


def ComputerUse_screenshot_tool():
    """Synchronously capture the screen via ComputerTool.screenshot() and return its result."""
    return ComputerTool().screenshot()

async def ComputerUse__cursor_position():
    """Report the current cursor position through a freshly created ComputerTool."""
    return await ComputerTool()(action="cursor_position")

# List of all computer use tools
# Contains the async wrappers, one per ComputerTool action. The synchronous
# ComputerUse_screenshot_tool helper is not part of this list.
ComputerUse_tools = [
    ComputerUse__type,
    ComputerUse__key,
    ComputerUse__mouse_move,
    ComputerUse__left_click,
    ComputerUse__right_click,
    ComputerUse__middle_click,
    ComputerUse__double_click,
    ComputerUse__left_click_drag,
    ComputerUse__screenshot,
    ComputerUse__cursor_position
]