CUA MCP Server

execute.ts•22.9 KiB

/**
 * Agent Execution
 *
 * Main agent execution module that orchestrates the computer use loop.
 * Uses the action handler registry for simplified action dispatch.
 */

import Anthropic from "@anthropic-ai/sdk";
import { put } from "@vercel/blob";
import { CuaComputerClient, CommandResult } from "../cua-client.js";
import type {
  AgentStep,
  TaskResult,
  TaskProgress,
  ActionInput,
} from "./types.js";
import {
  DEFAULT_DISPLAY_WIDTH,
  DEFAULT_DISPLAY_HEIGHT,
  RECOMMENDED_MAX_WIDTH,
  RECOMMENDED_MAX_HEIGHT,
  UI_SETTLE_DELAY_MS,
  HEARTBEAT_INTERVAL_MS,
  DEFAULT_MAX_STEPS,
  DEFAULT_TIMEOUT_SECONDS,
  ANTHROPIC_MAX_RETRIES,
  MAX_MESSAGE_HISTORY,
  getModelConfig,
} from "./config.js";
import {
  updateProgress,
  initializeProgress,
  finalizeTask,
  summarizeAction,
} from "./progress.js";
import { sleep, generateTaskId, getSandboxHost } from "./utils.js";
import { ACTION_HANDLERS, OBSERVATION_ACTIONS } from "./actions/index.js";

// Actions that should not trigger auto-release of held keys
const NO_AUTO_RELEASE_ACTIONS = new Set([
  "screenshot",
  "zoom",
  "hold_key",
  "wait",
]);

// ============================================
// Message History Management
// ============================================

/**
 * Trim message history to prevent context bloat from accumulated screenshots.
 * Keeps the initial user message (task) plus the last N exchanges.
 * Each exchange = assistant response + user tool results.
 *
 * After trimming, any user message left directly behind the initial task
 * message is dropped as well: it can only be a tool-result message whose
 * matching assistant tool_use was just trimmed away, and sending it would
 * make the next API request invalid.
 *
 * @param messages - The messages array to trim (mutated in place)
 */
function trimMessageHistory(
  messages: Anthropic.Beta.BetaMessageParam[]
): void {
  // Each exchange is 2 messages (assistant + user), plus initial user message
  const maxMessages = MAX_MESSAGE_HISTORY * 2 + 1;
  let trimmed = false;

  while (messages.length > maxMessages) {
    // Remove the oldest exchange (indices 1 and 2), preserving the initial
    // task message at index 0
    messages.splice(1, 2);
    trimmed = true;
  }

  if (!trimmed) return;

  // An assistant turn without tool results shifts the pairing, so the splice
  // above may have cut an exchange in half. Drop orphaned tool results.
  while (messages.length > 1 && messages[1]?.role === "user") {
    messages.splice(1, 1);
  }
}

// ============================================
// Screen Dimensions Helper
// ============================================

/**
 * Validate a width/height pair coming from an untyped source.
 * Accepts numbers and numeric strings; returns undefined unless both values
 * are finite and strictly positive.
 */
function toDimensions(
  width: unknown,
  height: unknown
): { width: number; height: number } | undefined {
  const toNumber = (value: unknown): number => {
    if (typeof value === "number") return value;
    if (typeof value === "string" && value.trim() !== "") return Number(value);
    return NaN;
  };
  const w = toNumber(width);
  const h = toNumber(height);
  if (Number.isFinite(w) && Number.isFinite(h) && w > 0 && h > 0) {
    return { width: w, height: h };
  }
  return undefined;
}

/**
 * Get actual screen dimensions from the sandbox.
 * Falls back to DEFAULT_DISPLAY_WIDTH x DEFAULT_DISPLAY_HEIGHT if detection
 * fails or the reported values are not usable.
 *
 * @param computer - Client used to query the sandbox
 * @returns The detected (or default) width and height
 */
async function getScreenDimensions(
  computer: CuaComputerClient
): Promise<{ width: number; height: number }> {
  try {
    const result = (await computer.getScreenSize()) as CommandResult & {
      size?: { width: number; height: number };
    };
    console.log("[Agent] get_screen_size result:", JSON.stringify(result));

    if (result.success) {
      // CUA SDK returns { success: true, size: { width, height } }
      const fromSize = toDimensions(result.size?.width, result.size?.height);
      if (fromSize) {
        console.log("[Agent] Screen dimensions from size field:", fromSize);
        return fromSize;
      }

      // Fallback: parse from content string (legacy/alternate format)
      if (result.content) {
        // Try regex for "1920x1080" format
        const match = result.content.match(/(\d+)\s*[x\u00d7,]\s*(\d+)/i);
        if (match) {
          const fromString = toDimensions(
            parseInt(match[1] ?? "", 10),
            parseInt(match[2] ?? "", 10)
          );
          if (fromString) {
            console.log(
              "[Agent] Parsed screen dimensions from string:",
              fromString
            );
            return fromString;
          }
        }

        // Try parsing content as JSON
        try {
          const parsed = JSON.parse(result.content) as {
            size?: { width?: unknown; height?: unknown };
            width?: unknown;
            height?: unknown;
          } | null;

          const fromJsonSize = toDimensions(
            parsed?.size?.width,
            parsed?.size?.height
          );
          if (fromJsonSize) {
            console.log(
              "[Agent] Parsed screen dimensions from JSON (size):",
              fromJsonSize
            );
            return fromJsonSize;
          }

          const fromJsonDirect = toDimensions(parsed?.width, parsed?.height);
          if (fromJsonDirect) {
            console.log(
              "[Agent] Parsed screen dimensions from JSON (direct):",
              fromJsonDirect
            );
            return fromJsonDirect;
          }
        } catch {
          // Ignore JSON parse errors
        }
      }
    }
    console.log("[Agent] Failed to parse screen size, using defaults");
  } catch (err) {
    console.log("[Agent] Error getting screen size:", err);
  }
  return { width: DEFAULT_DISPLAY_WIDTH, height: DEFAULT_DISPLAY_HEIGHT };
}

// ============================================
// System Prompt
// ============================================

const SYSTEM_PROMPT = `You are a computer use agent. Complete the user's task by interacting with the desktop. IMPORTANT RULES: 1. After each action, take a screenshot to verify the result 2. When you click a button, verify in the next screenshot that the click worked 3. If a dialog disappears after clicking, the action succeeded 4. Be precise with coordinates - click the center of buttons 5. Do NOT scroll unless necessary - most UI elements are already visible ENHANCED ACTIONS AVAILABLE: - triple_click: Triple click to select entire paragraph or line of text - hold_key: Press and hold a modifier key (e.g., shift, ctrl, alt). Keys are automatically released after the next action. - left_mouse_down: Press and hold left mouse button at coordinates - left_mouse_up: Release left mouse button at coordinates - middle_click: Click middle mouse button (opens links in new tabs) When using hold_key for modifier+click combinations: Example sequence: hold_key("shift") -> left_click (shift auto-releases after click) VISUAL VERIFICATION (Critical): After every significant action, take a screenshot and carefully evaluate: - Did the action produce the expected result? - Is the UI in the expected state? - If not, try an alternative approach before giving up. Look for visual confirmation: dialogs appearing/disappearing, text changing, selections highlighting. 
When the task is complete, you MUST output exactly: TASK_COMPLETE: <brief summary> If you cannot complete the task, output exactly: TASK_FAILED: <reason> Be efficient and direct. Verify your actions worked before moving on.`;

// ============================================
// Background Execution Wrapper
// ============================================

/**
 * Execute a task in the background (for non-blocking mode).
 * Wraps executeTask with error handling and result storage.
 *
 * On success the result is written to `tasks/<taskId>.json`. If anything
 * throws, a failed progress record is written to `progress/<taskId>.json`.
 * This function never rejects: a failure to store the error record is logged
 * instead, so a fire-and-forget caller cannot trigger an unhandled rejection.
 *
 * @param taskId - Pre-generated task ID
 * @param progressUrl - Pre-initialized progress URL
 * @param sandboxName - Name of the CUA sandbox
 * @param host - Host URL for the sandbox
 * @param cuaApiKey - API key for CUA
 * @param anthropicApiKey - API key for Anthropic
 * @param task - Task description to complete
 * @param maxSteps - Maximum meaningful actions
 * @param timeoutSeconds - Timeout in seconds
 */
export async function executeTaskInBackground(
  taskId: string,
  progressUrl: string,
  sandboxName: string,
  host: string,
  cuaApiKey: string,
  anthropicApiKey: string,
  task: string,
  maxSteps: number,
  timeoutSeconds: number
): Promise<void> {
  // Remember when the run actually started so the error record is truthful
  const startedAt = Date.now();

  try {
    const result = await executeTask(
      sandboxName,
      host,
      cuaApiKey,
      anthropicApiKey,
      task,
      maxSteps,
      timeoutSeconds,
      taskId,
      progressUrl
    );

    // Store final result in Blob
    await put(`tasks/${taskId}.json`, JSON.stringify(result), {
      access: "public",
      addRandomSuffix: false,
    });
  } catch (err) {
    console.error(`[Agent] Background task ${taskId} failed:`, err);

    const errorMessage = err instanceof Error ? err.message : String(err);
    const now = Date.now();
    const elapsedMs = now - startedAt;

    // Update progress with error
    const errorProgress: TaskProgress = {
      task_id: taskId,
      sandbox_name: sandboxName,
      task,
      status: "failed",
      current_step: 0,
      max_steps: maxSteps,
      started_at: startedAt,
      updated_at: now,
      elapsed_ms: elapsedMs,
      timeout_seconds: timeoutSeconds,
      steps_summary: [],
      final_result: {
        success: false,
        summary: `Background execution failed: ${errorMessage}`,
        total_steps: 0,
        duration_ms: elapsedMs,
        error: errorMessage,
      },
    };

    try {
      await put(`progress/${taskId}.json`, JSON.stringify(errorProgress), {
        access: "public",
        addRandomSuffix: false,
      });
    } catch (storeErr) {
      // Nothing else can be done from a background task; log and give up.
      console.error(
        `[Agent] Failed to store error progress for task ${taskId}:`,
        storeErr
      );
    }
  }
}

// ============================================
// Main Task Execution
// ============================================

/**
 * Release every key in `heldKeys` via `computer.keyUp` and clear the set.
 * Individual release failures are logged and do not abort the remaining keys.
 */
async function releaseHeldKeys(
  computer: CuaComputerClient,
  heldKeys: Set<string>,
  reason: string
): Promise<void> {
  if (heldKeys.size === 0) return;

  console.log(
    `[Agent] Auto-releasing ${heldKeys.size} held keys after ${reason}`
  );
  for (const key of heldKeys) {
    try {
      await computer.keyUp(key);
      console.log(`[Agent] Auto-released key: ${key}`);
    } catch (err) {
      console.warn(`[Agent] Failed to auto-release key ${key}:`, err);
    }
  }
  heldKeys.clear();
}

/**
 * Execute a task autonomously using computer use agent loop.
 *
 * The loop ends when the model emits TASK_COMPLETE / TASK_FAILED, stops
 * without requesting a tool, the timeout elapses, an error is thrown, or the
 * step / iteration limits are reached. The timeout is only checked at the
 * start of each iteration, so a run can overshoot it by one iteration.
 * Any modifier keys still held when the function exits are released.
 *
 * @param sandboxName - Name of the CUA sandbox
 * @param host - Host URL for the sandbox
 * @param cuaApiKey - API key for CUA
 * @param anthropicApiKey - API key for Anthropic
 * @param task - Task description to complete
 * @param maxSteps - Maximum meaningful actions (defaults to DEFAULT_MAX_STEPS)
 * @param timeoutSeconds - Timeout in seconds (defaults to DEFAULT_TIMEOUT_SECONDS)
 * @param existingTaskId - Pre-generated task ID (for non-blocking mode)
 * @param existingProgressUrl - Pre-initialized progress URL
 * @returns Task result with progress URL
 */
export async function executeTask(
  sandboxName: string,
  host: string,
  cuaApiKey: string,
  anthropicApiKey: string,
  task: string,
  maxSteps: number = DEFAULT_MAX_STEPS,
  timeoutSeconds: number = DEFAULT_TIMEOUT_SECONDS,
  existingTaskId?: string,
  existingProgressUrl?: string
): Promise<TaskResult & { progress_url?: string }> {
  const taskId = existingTaskId || generateTaskId();
  const startTime = Date.now();
  const steps: AgentStep[] = [];
  let progressUrl: string | undefined = existingProgressUrl;
  let lastReasoning: string | undefined;

  // Initialize progress tracking (only if not pre-initialized)
  const progress: TaskProgress = {
    task_id: taskId,
    sandbox_name: sandboxName,
    task,
    status: "running",
    current_step: 0,
    max_steps: maxSteps,
    started_at: startTime,
    updated_at: startTime,
    elapsed_ms: 0,
    timeout_seconds: timeoutSeconds,
    steps_summary: [],
  };

  // Store initial progress (only if not pre-initialized)
  if (!existingProgressUrl) {
    progressUrl = await updateProgress(taskId, progress);
  }

  const anthropic = new Anthropic({
    apiKey: anthropicApiKey,
    maxRetries: ANTHROPIC_MAX_RETRIES, // Default is 2, increase for long-running tasks
  });
  const computer = new CuaComputerClient(sandboxName, host, cuaApiKey);

  // Get actual screen dimensions from sandbox
  const screenSize = await getScreenDimensions(computer);
  const displayWidth = screenSize.width;
  const displayHeight = screenSize.height;

  // Warn if resolution exceeds Anthropic recommendations for computer use
  if (
    displayWidth > RECOMMENDED_MAX_WIDTH ||
    displayHeight > RECOMMENDED_MAX_HEIGHT
  ) {
    console.warn(
      `[Agent] Screen resolution ${displayWidth}x${displayHeight} exceeds ` +
        `Anthropic's recommended maximum of ${RECOMMENDED_MAX_WIDTH}x${RECOMMENDED_MAX_HEIGHT}. ` +
        `Coordinate accuracy may be reduced for computer use tasks.`
    );
  }

  // Get model configuration
  const modelConfig = getModelConfig();

  // Build computer tool based on model config (invariant across iterations)
  const computerTool = modelConfig.supportsZoom
    ? {
        type: modelConfig.toolType,
        name: "computer" as const,
        display_width_px: displayWidth,
        display_height_px: displayHeight,
        display_number: 1,
        enable_zoom: true,
      }
    : {
        type: modelConfig.toolType,
        name: "computer" as const,
        display_width_px: displayWidth,
        display_height_px: displayHeight,
        display_number: 1,
      };

  const messages: Anthropic.Beta.BetaMessageParam[] = [
    {
      role: "user",
      content: `Task: ${task}\n\nPlease complete this task. Start by taking a screenshot to see the current state.`,
    },
  ];

  // Track meaningful actions (excludes screenshot/zoom which are just observations)
  let meaningfulSteps = 0;
  // Safety limit: total iterations including screenshots (prevents infinite loops)
  const maxTotalIterations = maxSteps * 3;
  let totalIterations = 0;

  // Track held modifier keys for auto-release after actions
  // This works around Anthropic's computer use tool schema not having release_key
  const heldKeys = new Set<string>();

  try {
    while (meaningfulSteps < maxSteps && totalIterations < maxTotalIterations) {
      totalIterations++;

      // Check timeout
      const elapsed = Date.now() - startTime;
      if (elapsed > timeoutSeconds * 1000) {
        await finalizeTask(taskId, progress, "timeout", {
          success: false,
          summary: "Task timed out",
          steps: meaningfulSteps,
          durationMs: elapsed,
          error: `Timeout after ${timeoutSeconds}s`,
        });
        return {
          task_id: taskId,
          success: false,
          summary: "Task timed out",
          steps,
          steps_taken: meaningfulSteps,
          duration_ms: elapsed,
          screen_size: { width: displayWidth, height: displayHeight },
          error: `Timeout after ${timeoutSeconds}s`,
          progress_url: progressUrl,
        };
      }

      try {
        // Use a heartbeat to update progress while waiting for API response
        let heartbeatInterval: ReturnType<typeof setInterval> | undefined;
        const startHeartbeat = () => {
          heartbeatInterval = setInterval(async () => {
            // Create snapshot to avoid race condition with main loop
            const snapshot = {
              ...progress,
              updated_at: Date.now(),
              elapsed_ms: Date.now() - startTime,
            };
            try {
              await updateProgress(taskId, snapshot);
            } catch (err) {
              // A failed heartbeat must not surface as an unhandled rejection
              console.warn("[Agent] Heartbeat progress update failed:", err);
            }
          }, HEARTBEAT_INTERVAL_MS);
        };
        const stopHeartbeat = () => {
          if (heartbeatInterval) {
            clearInterval(heartbeatInterval);
            heartbeatInterval = undefined;
          }
        };

        startHeartbeat();
        let response: Anthropic.Beta.BetaMessage;
        try {
          response = await anthropic.beta.messages.create({
            model: modelConfig.model,
            max_tokens: 4096,
            system: SYSTEM_PROMPT,
            tools: [computerTool],
            messages,
            betas: [modelConfig.betaFlag],
          });
        } finally {
          stopHeartbeat();
        }

        // Process response
        const toolResults: Anthropic.Beta.BetaToolResultBlockParam[] = [];

        for (const block of response.content) {
          if (block.type === "text") {
            // Capture reasoning from Claude's text (before any completion markers)
            if (
              !block.text.includes("TASK_COMPLETE:") &&
              !block.text.includes("TASK_FAILED:")
            ) {
              lastReasoning = block.text.trim();
            }

            // Check for task completion
            if (block.text.includes("TASK_COMPLETE:")) {
              const summary = (
                block.text.split("TASK_COMPLETE:")[1] ?? ""
              ).trim();
              const durationMs = Date.now() - startTime;
              await finalizeTask(taskId, progress, "completed", {
                success: true,
                summary,
                steps: meaningfulSteps,
                durationMs,
              });
              return {
                task_id: taskId,
                success: true,
                summary,
                steps,
                steps_taken: meaningfulSteps,
                duration_ms: durationMs,
                screen_size: { width: displayWidth, height: displayHeight },
                progress_url: progressUrl,
              };
            }

            if (block.text.includes("TASK_FAILED:")) {
              const reason = (block.text.split("TASK_FAILED:")[1] ?? "").trim();
              const durationMs = Date.now() - startTime;
              await finalizeTask(taskId, progress, "failed", {
                success: false,
                summary: reason,
                steps: meaningfulSteps,
                durationMs,
                error: "Task failed",
              });
              return {
                task_id: taskId,
                success: false,
                summary: reason,
                steps,
                steps_taken: meaningfulSteps,
                duration_ms: durationMs,
                // Reported here too, matching every other return path
                screen_size: { width: displayWidth, height: displayHeight },
                error: "Task failed",
                progress_url: progressUrl,
              };
            }
          }

          if (block.type === "tool_use") {
            const input = block.input as ActionInput;
            const stepRecord: AgentStep = {
              step: steps.length + 1,
              action: input.action,
              coordinates: input.coordinate,
              success: true,
            };

            // Look up handler in registry
            const handler = ACTION_HANDLERS[input.action];
            if (!handler) {
              stepRecord.success = false;
              stepRecord.error = `Unknown action: ${input.action}`;
              steps.push(stepRecord);
              toolResults.push({
                type: "tool_result",
                tool_use_id: block.id,
                content: `Unknown action: ${input.action}`,
                is_error: true,
              });
              continue;
            }

            try {
              // Execute the action handler
              const result = await handler(input, computer, {
                displayWidth,
                displayHeight,
              });

              stepRecord.success = result.success;
              if (result.error) stepRecord.error = result.error;
              if (result.result) stepRecord.result = result.result;
              // A string content takes precedence over result.result
              if (typeof result.content === "string")
                stepRecord.result = result.content;
              if (lastReasoning) stepRecord.reasoning = lastReasoning;
              steps.push(stepRecord);

              // Track held keys for auto-release
              if (input.action === "hold_key" && result.success) {
                const keyToHold = input.key || input.text;
                if (keyToHold) {
                  heldKeys.add(keyToHold.toLowerCase());
                  console.log(`[Agent] Key held: ${keyToHold} (${heldKeys.size} keys held)`);
                }
              }

              // Auto-release held keys after meaningful actions
              // This simulates the expected modifier key behavior: hold_key → action → release
              if (!NO_AUTO_RELEASE_ACTIONS.has(input.action)) {
                await releaseHeldKeys(computer, heldKeys, input.action);
              }

              // Update progress for meaningful actions (not screenshots/zoom)
              if (!OBSERVATION_ACTIONS.has(input.action)) {
                meaningfulSteps++;
                const now = Date.now();
                progress.current_step = meaningfulSteps;
                progress.updated_at = now;
                progress.elapsed_ms = now - startTime;
                progress.last_action = {
                  action: input.action,
                  reasoning: lastReasoning,
                  result:
                    typeof result.content === "string"
                      ? result.content
                      : undefined,
                  success: stepRecord.success,
                  coordinates: input.coordinate,
                };
                progress.last_reasoning = lastReasoning;

                // Maintain rolling summary (last 5 actions)
                const summary = summarizeAction(input.action, input.coordinate);
                progress.steps_summary.push(summary);
                if (progress.steps_summary.length > 5) {
                  progress.steps_summary.shift();
                }

                // Update progress (await to ensure it completes before next action)
                await updateProgress(taskId, progress);
              }

              toolResults.push({
                type: "tool_result",
                tool_use_id: block.id,
                content: result.content,
                is_error: !result.success,
              });

              // Delay for UI to settle after meaningful actions
              if (!OBSERVATION_ACTIONS.has(input.action)) {
                await sleep(UI_SETTLE_DELAY_MS);
              }
            } catch (err) {
              stepRecord.success = false;
              stepRecord.error =
                err instanceof Error ? err.message : String(err);
              steps.push(stepRecord);

              // Log error for debugging
              console.error(
                `[Agent] Action failed (${input.action}):`,
                stepRecord.error
              );

              toolResults.push({
                type: "tool_result",
                tool_use_id: block.id,
                content: `Error: ${stepRecord.error}`,
                is_error: true,
              });
            }
          }
        }

        // Add assistant response and tool results to message history
        messages.push({
          role: "assistant",
          content: response.content,
        });

        if (toolResults.length > 0) {
          messages.push({
            role: "user",
            content: toolResults,
          });
        }

        // Trim message history to prevent context bloat from accumulated screenshots
        trimMessageHistory(messages);

        // If the model stopped without tool use and without completion markers
        if (response.stop_reason === "end_turn" && toolResults.length === 0) {
          // Check final text for any summary
          const lastText = response.content.find((b) => b.type === "text");
          const summaryText =
            lastText?.type === "text" ? lastText.text : "Task completed";

          const durationMs = Date.now() - startTime;
          await finalizeTask(taskId, progress, "completed", {
            success: true,
            summary: summaryText,
            steps: meaningfulSteps,
            durationMs,
          });
          return {
            task_id: taskId,
            success: true,
            summary: summaryText,
            steps,
            steps_taken: meaningfulSteps,
            duration_ms: durationMs,
            screen_size: { width: displayWidth, height: displayHeight },
            progress_url: progressUrl,
          };
        }
      } catch (err) {
        const errorMsg = err instanceof Error ? err.message : String(err);
        const durationMs = Date.now() - startTime;
        await finalizeTask(taskId, progress, "failed", {
          success: false,
          summary: `Agent error: ${errorMsg}`,
          steps: meaningfulSteps,
          durationMs,
          error: errorMsg,
        });
        return {
          task_id: taskId,
          success: false,
          summary: `Agent error: ${errorMsg}`,
          steps,
          steps_taken: meaningfulSteps,
          duration_ms: durationMs,
          screen_size: { width: displayWidth, height: displayHeight },
          error: errorMsg,
          progress_url: progressUrl,
        };
      }
    }

    // Max steps exceeded
    const durationMs = Date.now() - startTime;
    const errorMsg =
      meaningfulSteps >= maxSteps
        ? `Reached ${maxSteps} action limit (${meaningfulSteps} actions taken)`
        : `Safety limit reached (${totalIterations} total iterations)`;
    await finalizeTask(taskId, progress, "failed", {
      success: false,
      summary: "Max steps exceeded without completing task",
      steps: meaningfulSteps,
      durationMs,
      error: errorMsg,
    });
    return {
      task_id: taskId,
      success: false,
      summary: "Max steps exceeded without completing task",
      steps,
      steps_taken: meaningfulSteps,
      duration_ms: durationMs,
      screen_size: { width: displayWidth, height: displayHeight },
      error: errorMsg,
      progress_url: progressUrl,
    };
  } finally {
    // Never leave modifier keys pressed in the sandbox once the task is over
    // (e.g. hold_key followed by a screenshot and then TASK_COMPLETE).
    await releaseHeldKeys(computer, heldKeys, "task exit");
  }
}

// ============================================
// Re-exports for Convenience
// ============================================

export { generateTaskId, getSandboxHost } from "./utils.js";
export { initializeProgress } from "./progress.js";

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/taskcrew/cua-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

execute.ts•22.9 KiB