Skip to main content
Glama
interactionTools.ts27.3 kB
import { z } from "zod"; import { ToolRegistry, ProgressCallback } from "./toolRegistry"; import { TapOnElement } from "../features/action/TapOnElement"; import { InputText } from "../features/action/InputText"; import { ClearText } from "../features/action/ClearText"; import { SelectAllText } from "../features/action/SelectAllText"; import { PressButton } from "../features/action/PressButton"; import { SwipeOnElement } from "../features/action/SwipeOnElement"; import { SwipeOnScreen } from "../features/action/SwipeOnScreen"; import { Shake } from "../features/action/Shake"; import { ImeAction } from "../features/action/ImeAction"; import { RecentApps } from "../features/action/RecentApps"; import { HomeScreen } from "../features/action/HomeScreen"; import { Rotate } from "../features/action/Rotate"; import { ElementUtils } from "../features/utility/ElementUtils"; import { ObserveScreen } from "../features/observe/ObserveScreen"; import { OpenURL } from "../features/action/OpenURL"; import { ActionableError, BootedDevice } from "../models"; import { createJSONToolResponse } from "../utils/toolUtils"; import { logger } from "../utils/logger"; import { Platform } from "../models"; // Type definitions for better TypeScript support export interface ClearTextArgs { platform: Platform; } export interface SelectAllTextArgs { platform: Platform; } export interface PressButtonArgs { button: "home" | "back" | "menu" | "power" | "volume_up" | "volume_down" | "recent"; platform: Platform; } export interface OpenSystemTrayArgs { platform: Platform; } export interface PressKeyArgs { key: "home" | "back" | "menu" | "power" | "volume_up" | "volume_down" | "recent"; platform: Platform; } export interface InputTextArgs { text: string; imeAction?: "done" | "next" | "search" | "send" | "go" | "previous"; platform: Platform; } export interface OpenLinkArgs { url: string; platform: Platform; } export interface TapOnArgs { containerElementId?: string; text?: string; id?: string; action: "tap" | "doubleTap" | "longPress" | "focus"; platform: Platform; } export interface SwipeOnScreenArgs { direction: "up" | "down" | "left" | "right"; includeSystemInsets: boolean; duration: number; platform: Platform; } export interface SwipeOnElementArgs { containerElementId?: string; elementId: string; direction: "up" | "down" | "left" | "right"; duration: number; platform: Platform; } export interface ScrollArgs { containerElementId: string; direction: "up" | "down" | "left" | "right"; lookFor?: ScrollLookForArgs; speed?: "slow" | "normal" | "fast"; platform: Platform; } export interface ScrollLookForArgs { elementId?: string; text?: string; maxTime?: number; } export interface ShakeArgs { duration?: number; intensity?: number; platform: Platform; } export interface ImeActionArgs { action: "done" | "next" | "search" | "send" | "go" | "previous"; platform: Platform; } export interface RecentAppsArgs { platform: Platform; } export interface RotateArgs { orientation: "portrait" | "landscape"; platform: Platform; } // Schema definitions for tool arguments export const shakeSchema = z.object({ duration: z.number().optional().describe("Duration of the shake in milliseconds (default: 1000)"), intensity: z.number().optional().describe("Intensity of the shake acceleration (default: 100)"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const tapOnSchema = z.object({ containerElementId: z.string().optional().describe("Container element ID to restrict the search within"), action: z.enum(["tap", "doubleTap", "longPress", "focus"]).describe("Action to perform on the element"), text: z.string().optional().describe("Text to tap on"), id: z.string().optional().describe("Element ID to tap on"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const swipeOnElementSchema = z.object({ containerElementId: z.string().optional().describe("Container element ID to restrict the search within"), elementId: z.string().describe("ID of the element to swipe on"), direction: z.enum(["up", "down", "left", "right"]).describe("Direction to swipe"), duration: z.number().describe("Duration of the swipe in milliseconds"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const swipeOnScreenSchema = z.object({ direction: z.enum(["up", "down", "left", "right"]).describe("Direction to swipe"), includeSystemInsets: z.boolean().describe("Whether to include system inset areas in the swipe"), duration: z.number().describe("Duration of the swipe in milliseconds"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const dragAndDropSchema = z.object({ from: z.object({ index: z.number().describe("Index of the source element to drag"), text: z.string().optional().describe("Optional text for validation and debugging") }).describe("Source element to drag from"), to: z.object({ index: z.number().describe("Index of the destination element to drop to"), text: z.string().optional().describe("Optional text for validation and debugging") }).describe("Destination element to drop to"), duration: z.number().optional().describe("Duration of the drag in milliseconds (default: 500)"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const scrollSchema = z.object({ containerElementId: z.string().describe("Element ID to scroll until visible"), direction: z.enum(["up", "down", "left", "right"]).describe("Scroll direction"), lookFor: z.object({ elementId: z.string().optional().describe("ID of the element to look for while scrolling"), text: z.string().optional().describe("Optional text to look for while scrolling"), maxTime: z.number().optional().describe("Maximum amount of time to spend scrolling, (default 10 seconds)") }).optional().describe("What we're searching for while scrolling"), speed: z.enum(["slow", "normal", "fast"]).optional().describe("Scroll speed"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const clearTextSchema = z.object({ platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const selectAllTextSchema = z.object({ platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const pressButtonSchema = z.object({ button: z.enum(["home", "back", "menu", "power", "volume_up", "volume_down", "recent"]) .describe("The button to press"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const openSystemTraySchema = z.object({ platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const pressKeySchema = z.object({ key: z.enum(["home", "back", "menu", "power", "volume_up", "volume_down", "recent"]) .describe("The key to press"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const stopAppSchema = z.object({ appId: z.string().describe("App package ID to stop"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const clearStateSchema = z.object({ appId: z.string().describe("App package ID to clear state for"), clearKeychain: z.boolean().optional().describe("Also clear iOS keychain (iOS only)"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const inputTextSchema = z.object({ text: z.string().describe("Text to input to the device"), imeAction: z.enum(["done", "next", "search", "send", "go", "previous"]).optional() .describe("Optional IME action to perform after text input"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const openLinkSchema = z.object({ url: z.string().describe("URL to open in the default browser"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const imeActionSchema = z.object({ action: z.enum(["done", "next", "search", "send", "go", "previous"]).describe("IME action to perform"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const recentAppsSchema = z.object({ platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const homeScreenSchema = z.object({ platform: z.enum(["android", "ios"]).describe("Platform of the device") }); export const rotateSchema = z.object({ orientation: z.enum(["portrait", "landscape"]).describe("The orientation to set"), platform: z.enum(["android", "ios"]).describe("Platform of the device") }); // Register tools export function registerInteractionTools() { const elementUtils = new ElementUtils(); // Tap on handler const tapOnHandler = async (device: BootedDevice, args: TapOnArgs, progress?: ProgressCallback) => { const tapOnTextCommand = new TapOnElement(device); const result = await tapOnTextCommand.execute({ containerElementId: args.containerElementId, text: args.text, elementId: args.id, action: args.action, }, progress); return createJSONToolResponse({ message: `Tapped on element`, observation: result.observation, ...result }); }; // Clear text handler const clearTextHandler = async (device: BootedDevice, args: ClearTextArgs, progress?: ProgressCallback) => { try { const clearText = new ClearText(device); const result = await clearText.execute(progress); return createJSONToolResponse({ message: "Cleared text from input field", observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to clear text: ${error}`); } }; // Select all text handler const selectAllTextHandler = async (device: BootedDevice, args: SelectAllTextArgs, progress?: ProgressCallback) => { try { const selectAllText = new SelectAllText(device); const result = await selectAllText.execute(progress); return createJSONToolResponse({ message: "Selected all text in focused input field", observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to select all text: ${error}`); } }; // Press button handler const pressButtonHandler = async (device: BootedDevice, args: PressButtonArgs, progress?: ProgressCallback) => { try { const pressButton = new PressButton(device); const result = await pressButton.execute(args.button, progress); // observe = true return createJSONToolResponse({ message: `Pressed button ${args.button}`, observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to press button: ${error}`); } }; // Swipe on element handler const swipeOnElementHandler = async (device: BootedDevice, args: SwipeOnElementArgs, progress?: ProgressCallback) => { try { // Validate element ID format const elementId = args.elementId; // Check if the elementId looks like coordinate bounds instead of a resource ID const boundsPattern = /^\[\d+,\d+\]\[\d+,\d+\]$/; if (boundsPattern.test(elementId)) { throw new ActionableError( `Invalid element ID: "${elementId}" appears to be coordinate bounds. ` + `Please provide a proper Android resource ID (e.g., "com.example.app:id/button") ` + `or use swipeOnScreen with coordinates instead.` ); } // Check if elementId is empty or just whitespace if (!elementId || elementId.trim().length === 0) { throw new ActionableError("Element ID cannot be empty. Please provide a valid Android resource ID."); } const observeScreen = new ObserveScreen(device); const swipeOnElement = new SwipeOnElement(device); // First observe to find the element const observeResult = await observeScreen.execute(); if (!observeResult.viewHierarchy) { throw new ActionableError("Could not get view hierarchy to find element for swipe."); } const element = elementUtils.findElementByResourceId( observeResult.viewHierarchy, elementId, args.containerElementId, // Search within the specific container true // partial match ); if (!element) { // Provide helpful suggestions const allResourceIds: string[] = []; const rootNodes = elementUtils.extractRootNodes(observeResult.viewHierarchy); for (const rootNode of rootNodes) { elementUtils.traverseNode(rootNode, (node: any) => { const nodeProperties = elementUtils.extractNodeProperties(node); if (nodeProperties["resource-id"] && nodeProperties["resource-id"].trim()) { allResourceIds.push(nodeProperties["resource-id"]); } }); } const uniqueResourceIds = [...new Set(allResourceIds)].slice(0, 5); // Show first 5 unique IDs const suggestion = uniqueResourceIds.length > 0 ? ` Available resource IDs include: ${uniqueResourceIds.join(", ")}` : " No elements with resource IDs found on current screen."; throw new ActionableError( `Element not found with ID "${elementId}".${suggestion} ` + `Use the 'observe' command to see the current view hierarchy and find valid element IDs.` ); } const result = await swipeOnElement.execute( element, args.direction, { duration: args.duration ?? 100 }, progress ); return createJSONToolResponse({ message: `Swiped ${args.direction} on element with ID "${elementId}"`, observation: result.observation }); } catch (error) { throw new ActionableError(`Failed to swipe on element: ${error}`); } }; // Swipe on screen handler const swipeOnScreenHandler = async (device: BootedDevice, args: SwipeOnScreenArgs, progress?: ProgressCallback) => { try { const swipeOnScreen = new SwipeOnScreen(device); const result = await swipeOnScreen.execute( args.direction, { duration: args.duration ?? 100, includeSystemInsets: args.includeSystemInsets }, progress ); return createJSONToolResponse({ message: `Swiped ${args.direction} on screen${args.includeSystemInsets ? " including navigation areas" : ""}`, observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to swipe on screen: ${error}`); } }; // Open system tray handler const openSystemTrayHandler = async (device: BootedDevice, args: OpenSystemTrayArgs, progress?: ProgressCallback) => { try { const swipeOnScreen = new SwipeOnScreen(device); const result = await swipeOnScreen.execute( "down", { duration: 100, includeSystemInsets: true // to access status bar area }, progress ); return createJSONToolResponse({ message: "Opened system tray by swiping down from the status bar", observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to open system tray: ${error}`); } }; // Scroll handler const scrollHandler = async (device: BootedDevice, args: ScrollArgs, progress?: ProgressCallback) => { // Element-specific scrolling const observeScreen = new ObserveScreen(device); const swipe = new SwipeOnElement(device); const observeResult = await observeScreen.execute(); if (!observeResult.viewHierarchy) { throw new ActionableError("Could not get view hierarchy for element scrolling"); } // Find the element by resource ID const element = elementUtils.findElementByResourceId( observeResult.viewHierarchy, args.containerElementId, args.containerElementId, true // partial match ); if (!element) { throw new ActionableError(`Container element not found with ID: ${args.containerElementId}`); } const containerElement = element; if (!args.lookFor) { const duration = elementUtils.getSwipeDurationFromSpeed(args.speed); const result = await swipe.execute( containerElement, elementUtils.getSwipeDirectionForScroll(args.direction), { duration: duration, easing: "accelerateDecelerate", fingers: 1, randomize: false, lift: true, pressure: 1 }, progress ); return createJSONToolResponse({ message: `Scrolled ${args.direction} within element ${args.containerElementId}`, observation: result.observation }); } else if (!args.lookFor.text && !args.lookFor.elementId) { throw new ActionableError("Either text or element id must be specified to look for something in a scrollable list."); } else { let lastObservation = await observeScreen.execute(); if (!lastObservation.viewHierarchy || !lastObservation.screenSize) { throw new Error("Failed to get initial observation for scrolling until visible."); } const direction = args.direction; const maxTime = 120000; // args.lookFor.maxTime ?? 120000; const startTime = Date.now(); let foundElement = null; while (Date.now() - startTime < maxTime) { // Re-observe the screen to get current state lastObservation = await observeScreen.execute(); if (!lastObservation.viewHierarchy) { throw new Error("Lost observation during scroll until visible."); } // Check if target element is now visible if (args.lookFor.text) { foundElement = elementUtils.findElementByText( lastObservation.viewHierarchy, args.lookFor.text, args.containerElementId, // Search within the specific container true, // fuzzy match false // case-sensitive ); } else if (args.lookFor.elementId) { foundElement = elementUtils.findElementByResourceId( lastObservation.viewHierarchy, args.lookFor.elementId, args.containerElementId, // Search within the specific container true // partial match ); } if (foundElement) { logger.info(`Found element after scrolling for ${Date.now() - startTime}ms.`); break; } // Use the specific container element to swipe, not any scrollable element const result = await swipe.execute( containerElement, elementUtils.getSwipeDirectionForScroll(direction), { duration: 600 }, progress ); // Update observation from swipe result if (result.observation && result.observation.viewHierarchy) { lastObservation = result.observation; } else { throw new Error("Lost observation after swipe during scroll until visible."); } } if (!foundElement) { const target = args.lookFor.text ? `text "${args.lookFor.text}"` : `element with id "${args.lookFor.elementId}"`; throw new ActionableError(`${target} not found after scrolling for ${maxTime}ms.`); } const target = args.lookFor.text ? `text "${args.lookFor.text}"` : `element with id "${args.lookFor.elementId}"`; return createJSONToolResponse({ message: `Scrolled until ${target} became visible`, found: !!foundElement, observation: lastObservation }); } }; // Press key handler const pressKeyHandler = async (device: BootedDevice, args: PressKeyArgs, progress?: ProgressCallback) => { const pressButton = new PressButton(device); const result = await pressButton.execute(args.key, progress); return createJSONToolResponse({ message: `Pressed key ${args.key}`, observation: result.observation, ...result }); }; // Input text handler const inputTextHandler = async (device: BootedDevice, args: InputTextArgs) => { const inputText = new InputText(device); const result = await inputText.execute(args.text, args.imeAction); return createJSONToolResponse({ message: `Input text`, observation: result.observation, ...result }); }; // Open link handler const openLinkHandler = async (device: BootedDevice, args: OpenLinkArgs) => { const openUrl = new OpenURL(device); const result = await openUrl.execute(args.url); return createJSONToolResponse({ message: `Opened link ${args.url}`, observation: result.observation, ...result }); }; // Shake handler const shakeHandler = async (device: BootedDevice, args: ShakeArgs, progress?: ProgressCallback) => { try { const shake = new Shake(device); const result = await shake.execute({ duration: args.duration ?? 1000, intensity: args.intensity ?? 100 }, progress); return createJSONToolResponse({ message: `Shook device for ${args.duration ?? 1000}ms with intensity ${args.intensity ?? 100}`, observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to shake device: ${error}`); } }; // IME action handler const imeActionHandler = async (device: BootedDevice, args: ImeActionArgs, progress?: ProgressCallback) => { try { const imeAction = new ImeAction(device); const result = await imeAction.execute(args.action, progress); return createJSONToolResponse({ message: `Executed IME action "${args.action}"`, observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to execute IME action: ${error}`); } }; // Recent Apps handler const recentAppsHandler = async (device: BootedDevice, args: RecentAppsArgs, progress?: ProgressCallback) => { try { const recentApps = new RecentApps(device); const result = await recentApps.execute(progress); return createJSONToolResponse({ message: "Opened recent apps", observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to open recent apps: ${error}`); } }; // Home Screen handler const homeScreenHandler = async (device: BootedDevice, args: any, progress?: ProgressCallback) => { try { const homeScreen = new HomeScreen(device); const result = await homeScreen.execute(progress); return createJSONToolResponse({ message: "Pressed home button to return to the home screen", observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to go to home screen: ${error}`); } }; // Rotate handler const rotateHandler = async (device: BootedDevice, args: RotateArgs, progress?: ProgressCallback) => { try { const rotate = new Rotate(device); const result = await rotate.execute(args.orientation, progress); return createJSONToolResponse({ message: `Rotated device to ${args.orientation} orientation`, observation: result.observation, ...result }); } catch (error) { throw new ActionableError(`Failed to rotate device: ${error}`); } }; // Register with the tool registry ToolRegistry.registerDeviceAware( "clearText", "Clear text from the currently focused input field", clearTextSchema, clearTextHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "selectAllText", "Select all text in the currently focused input field using long press + tap on 'Select All'", selectAllTextSchema, selectAllTextHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "pressButton", "Press a hardware button on the device", pressButtonSchema, pressButtonHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "swipeOnElement", "Swipe on a specific element", swipeOnElementSchema, swipeOnElementHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "swipeOnScreen", "Swipe on screen in a specific direction", swipeOnScreenSchema, swipeOnScreenHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "openSystemTray", "Open the system notification tray by swiping down from the status bar", openSystemTraySchema, openSystemTrayHandler, true // Supports progress notifications ); // Phase 1: Core Command Renames ToolRegistry.registerDeviceAware( "pressKey", "Press a hardware key on the device (Maestro equivalent of pressButton)", pressKeySchema, pressKeyHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "inputText", "Input text to the device", inputTextSchema, inputTextHandler, false // Does not support progress notifications ); ToolRegistry.registerDeviceAware( "openLink", "Open a URL in the default browser", openLinkSchema, openLinkHandler, false // Does not support progress notifications ); ToolRegistry.registerDeviceAware( "tapOn", "Tap supporting text or resourceId", tapOnSchema, tapOnHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "scroll", "Scroll in a direction on a scrollable container, optionally to find an element (supports text and selectors)", scrollSchema, scrollHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "swipe", "Unified scroll command supporting direction and speed (no index support due to reliability)", scrollSchema, scrollHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "shake", "Shake the device", shakeSchema, shakeHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "imeAction", "Perform an IME action (e.g., done, next, search)", imeActionSchema, imeActionHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "recentApps", "Open the recent apps list", recentAppsSchema, recentAppsHandler, true // Supports progress notifications ); ToolRegistry.registerDeviceAware( "homeScreen", "Return to the home screen by pressing the home button", homeScreenSchema, homeScreenHandler, true // Supports progress notifications ); // Register the new rotate tool ToolRegistry.registerDeviceAware( "rotate", "Rotate the device to a specific orientation", rotateSchema, rotateHandler, true // Supports progress notifications ); }

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/zillow/auto-mobile'

If you have feedback or need assistance with the MCP directory API, please join our Discord server