replicant-mcp

ui.ts•11.5 KiB

import { z } from "zod";
import { ServerContext } from "../server.js";
import { CACHE_TTLS, UiConfig, ReplicantError, ErrorCode } from "../types/index.js";
import { AccessibilityNode, flattenTree } from "../parsers/ui-dump.js";
import { handleFind, getElementCenter } from "./ui-find.js";

/**
 * Runtime validation schema for the `ui` tool input.
 *
 * Only `operation` is required; every other field is optional and is read by
 * the handler for the operation it belongs to. `debug`, `gridCell` and
 * `gridPosition` are not read by any handler in this file — presumably they are
 * consumed by `handleFind` in `./ui-find.js` (TODO confirm).
 */
export const uiInputSchema = z.object({
  operation: z.enum([
    "dump",
    "find",
    "tap",
    "input",
    "scroll",
    "screenshot",
    "accessibility-check",
    "visual-snapshot",
  ]),
  selector: z
    .object({
      resourceId: z.string().optional(),
      text: z.string().optional(),
      textContains: z.string().optional(),
      className: z.string().optional(),
      nearestTo: z.string().optional(),
    })
    .optional(),
  x: z.number().optional(),
  y: z.number().optional(),
  elementIndex: z.number().optional(),
  text: z.string().optional(),
  localPath: z.string().optional(),
  inline: z.boolean().optional(),
  debug: z.boolean().optional(),
  gridCell: z.number().min(1).max(24).optional(),
  gridPosition: z.number().min(1).max(5).optional(),
  deviceSpace: z.boolean().optional(),
  maxDimension: z.number().optional(),
  raw: z.boolean().optional(),
  compact: z.boolean().optional(),
  direction: z.enum(["up", "down", "left", "right"]).optional(),
  amount: z.number().min(0).max(1).optional(),
  limit: z.number().min(1).max(100).optional(),
  offset: z.number().min(0).optional(),
});

/** Input type inferred from {@link uiInputSchema}. */
export type UiInput = z.infer<typeof uiInputSchema>;

/** Configuration used by {@link handleUiTool} when the caller supplies none. */
const DEFAULT_CONFIG: UiConfig = {
  visualModePackages: [],
  autoFallbackScreenshot: true,
  includeBase64: false,
  maxImageDimension: 800,
};

/** Common signature shared by every per-operation handler in the dispatch table. */
type OperationHandler = (
  input: UiInput,
  context: ServerContext,
  config: UiConfig,
  deviceId: string,
) => Promise<Record<string, unknown>>;

/**
 * Dispatch table mapping an operation name to its handler.
 * The handlers are function declarations, so referencing them here before
 * their definitions is safe (hoisting).
 */
const uiOperations: Record<string, OperationHandler> = {
  dump: handleDump,
  find: handleFind,
  tap: handleTap,
  input: handleInput,
  scroll: handleScroll,
  screenshot: handleScreenshot,
  "accessibility-check": handleAccessibilityCheck,
  "visual-snapshot": handleVisualSnapshot,
};

/**
 * Entry point for the `ui` tool: resolves the target device, then dispatches
 * to the handler registered for `input.operation`.
 *
 * @param input - Tool input; `operation` selects the handler.
 * @param context - Server context providing `deviceState`, `adb`, `ui`, `cache`.
 * @param uiConfig - Optional configuration; `DEFAULT_CONFIG` is used when omitted.
 * @returns The handler's result object.
 * @throws ReplicantError with `ErrorCode.INVALID_OPERATION` when no handler is
 *   registered for the operation.
 */
export async function handleUiTool(
  input: UiInput,
  context: ServerContext,
  uiConfig?: UiConfig,
): Promise<Record<string, unknown>> {
  const device = await context.deviceState.ensureDevice(context.adb);
  const config = uiConfig ?? DEFAULT_CONFIG;

  // Own-property check: a bare `uiOperations[name]` lookup would also resolve
  // inherited members such as "constructor" or "toString" when the input was
  // not validated upstream, and those would slip past the guard below.
  const handler = Object.prototype.hasOwnProperty.call(uiOperations, input.operation)
    ? uiOperations[input.operation]
    : undefined;

  if (!handler) {
    throw new ReplicantError(
      ErrorCode.INVALID_OPERATION,
      `Unknown operation: ${input.operation}`,
      "Valid operations: dump, find, tap, input, scroll, screenshot, accessibility-check, visual-snapshot",
    );
  }

  return handler(input, context, config, device.id);
}

/**
 * Dumps the accessibility tree of the device, stores it in the cache under a
 * freshly generated id, and returns either the compact or the full rendering
 * depending on `input.compact`.
 */
async function handleDump(
  input: UiInput,
  context: ServerContext,
  _config: UiConfig,
  deviceId: string,
): Promise<Record<string, unknown>> {
  const tree = await context.ui.dump(deviceId);
  const dumpId = context.cache.generateId("ui-dump");
  context.cache.set(dumpId, { tree, deviceId }, "ui-dump", CACHE_TTLS.UI_TREE);

  // Surfaced in the response whenever the dump produced no root nodes.
  const emptyWarning =
    tree.length === 0
      ? "No accessibility nodes found. Possible causes: (1) UI still loading - wait and retry, (2) App uses custom rendering (Flutter, games, video players) - use screenshot instead, (3) App blocks accessibility services."
      : undefined;

  if (input.compact) {
    return handleCompactDump(tree, input, dumpId, deviceId, emptyWarning);
  }
  return handleFullDump(tree, dumpId, deviceId, emptyWarning);
}

/**
 * Builds the compact dump: a paginated flat list of nodes that are clickable
 * or focusable.
 *
 * `input.limit` defaults to 20 and `input.offset` to 0. `hasMore` is true when
 * elements remain past the returned page, in which case `hint` tells the
 * caller which offset to request next.
 */
function handleCompactDump(
  tree: AccessibilityNode[],
  input: UiInput,
  dumpId: string,
  deviceId: string,
  emptyWarning: string | undefined,
): Record<string, unknown> {
  const flat = flattenTree(tree);
  const interactive = flat.filter((n) => n.clickable || n.focusable);

  const limit = input.limit ?? 20;
  const offset = input.offset ?? 0;
  const totalCount = interactive.length;
  const paginated = interactive.slice(offset, offset + limit);
  const hasMore = offset + limit < totalCount;

  const elements = paginated.map((n) => ({
    // Falls back to the content description when the node has no text.
    text: n.text || n.contentDesc || undefined,
    // Last dot-separated segment of the class name.
    type: n.className.split(".").pop(),
    x: n.centerX,
    y: n.centerY,
    // Part of the resource id after the last "/".
    resourceId: n.resourceId ? n.resourceId.split("/").pop() : undefined,
  }));

  const noInteractiveWarning =
    tree.length > 0 && totalCount === 0
      ? "Accessibility tree exists but no interactive elements found. Try 'ui find' with a text selector, or use screenshot for visual targeting."
      : undefined;

  const hint = hasMore
    ? `${elements.length} of ${totalCount} elements shown. Use 'ui find' for specific elements, or add offset: ${offset + limit} for more.`
    : undefined;

  return {
    dumpId,
    elements,
    count: elements.length,
    totalCount,
    hasMore,
    offset,
    limit,
    deviceId,
    hint,
    warning: emptyWarning || noInteractiveWarning,
  };
}

/**
 * Builds the full dump: the complete tree with each node reduced to a small
 * set of fields (short class name, text, short resource id, bounds string,
 * clickable flag, children).
 */
function handleFullDump(
  tree: AccessibilityNode[],
  dumpId: string,
  deviceId: string,
  emptyWarning: string | undefined,
): Record<string, unknown> {
  // Recursive: children are simplified with the same projection.
  const simplifyNode = (node: AccessibilityNode): Record<string, unknown> => ({
    className: node.className.split(".").pop(),
    text: node.text || undefined,
    resourceId: node.resourceId ? node.resourceId.split("/").pop() : undefined,
    bounds: `[${node.bounds.left},${node.bounds.top}][${node.bounds.right},${node.bounds.bottom}]`,
    // `false` collapses to `undefined` so only clickable nodes carry the flag.
    clickable: node.clickable || undefined,
    children: node.children?.map((c) => simplifyNode(c)),
  });

  return {
    dumpId,
    tree: tree.map((n) => simplifyNode(n)),
    deviceId,
    warning: emptyWarning,
  };
}

/**
 * Taps the screen, either at the centre of an element from the last `find`
 * (`input.elementIndex`) or at explicit `input.x` / `input.y` coordinates.
 * `elementIndex` takes precedence when both are given.
 *
 * @throws ReplicantError with `ErrorCode.ELEMENT_NOT_FOUND` when the index has
 *   no entry in `context.lastFindResults`.
 * @throws ReplicantError with `ErrorCode.INPUT_VALIDATION_FAILED` when neither
 *   an index nor a full x/y pair is supplied.
 */
async function handleTap(
  input: UiInput,
  context: ServerContext,
  _config: UiConfig,
  deviceId: string,
): Promise<Record<string, unknown>> {
  let x: number;
  let y: number;

  if (input.elementIndex !== undefined) {
    // Single lookup, then guard, so the element is narrowed before use.
    const element = context.lastFindResults[input.elementIndex];
    if (!element) {
      throw new ReplicantError(
        ErrorCode.ELEMENT_NOT_FOUND,
        `Element at index ${input.elementIndex} not found. Run 'find' first.`,
        "Use 'ui find' to locate elements, then reference them by index",
      );
    }
    const center = getElementCenter(element);
    x = center.x;
    y = center.y;
  } else if (input.x !== undefined && input.y !== undefined) {
    x = input.x;
    y = input.y;
  } else {
    throw new ReplicantError(
      ErrorCode.INPUT_VALIDATION_FAILED,
      "Either x/y coordinates or elementIndex is required for tap",
      "Provide x and y coordinates, or use elementIndex from a previous 'ui find' result",
    );
  }

  await context.ui.tap(deviceId, x, y, input.deviceSpace);
  return { tapped: { x, y, deviceSpace: input.deviceSpace ?? false }, deviceId };
}

/**
 * Sends `input.text` to the device.
 *
 * @throws ReplicantError with `ErrorCode.INPUT_VALIDATION_FAILED` when `text`
 *   is missing or empty.
 */
async function handleInput(
  input: UiInput,
  context: ServerContext,
  _config: UiConfig,
  deviceId: string,
): Promise<Record<string, unknown>> {
  if (!input.text) {
    throw new ReplicantError(
      ErrorCode.INPUT_VALIDATION_FAILED,
      "text is required for input operation",
      "Provide the text string to input",
    );
  }

  await context.ui.input(deviceId, input.text);
  return { input: input.text, deviceId };
}

/**
 * Scrolls in `input.direction` by `input.amount` (defaults to 0.5).
 *
 * @throws ReplicantError with `ErrorCode.INPUT_VALIDATION_FAILED` when
 *   `direction` is missing.
 */
async function handleScroll(
  input: UiInput,
  context: ServerContext,
  _config: UiConfig,
  deviceId: string,
): Promise<Record<string, unknown>> {
  if (!input.direction) {
    throw new ReplicantError(
      ErrorCode.INPUT_VALIDATION_FAILED,
      "direction is required for scroll operation",
      "Provide a direction: up, down, left, or right",
    );
  }

  const amount = input.amount ?? 0.5;
  await context.ui.scroll(deviceId, input.direction, amount);
  return { scrolled: { direction: input.direction, amount }, deviceId };
}

/**
 * Captures a screenshot. `inline` defaults to true and `maxDimension` falls
 * back to `config.maxImageDimension`; the capture result is returned with
 * `deviceId` added.
 */
async function handleScreenshot(
  input: UiInput,
  context: ServerContext,
  config: UiConfig,
  deviceId: string,
): Promise<Record<string, unknown>> {
  const result = await context.ui.screenshot(deviceId, {
    localPath: input.localPath,
    inline: input.inline ?? true,
    maxDimension: input.maxDimension ?? config.maxImageDimension,
    raw: input.raw,
  });
  return { ...result, deviceId };
}

/** Runs the accessibility check on the device and returns its result with `deviceId` added. */
async function handleAccessibilityCheck(
  _input: UiInput,
  context: ServerContext,
  _config: UiConfig,
  deviceId: string,
): Promise<Record<string, unknown>> {
  const result = await context.ui.accessibilityCheck(deviceId);
  return { ...result, deviceId };
}

/**
 * Takes a visual snapshot. Base64 data is included when `input.inline` is set,
 * otherwise according to `config.includeBase64`.
 */
async function handleVisualSnapshot(
  input: UiInput,
  context: ServerContext,
  config: UiConfig,
  deviceId: string,
): Promise<Record<string, unknown>> {
  const snapshot = await context.ui.visualSnapshot(deviceId, {
    includeBase64: input.inline ?? config.includeBase64,
  });
  return { ...snapshot, deviceId };
}

/**
 * Tool definition advertised to clients: name, description and a JSON Schema
 * listing the same fields as {@link uiInputSchema}.
 *
 * NOTE(review): the `maxDimension` description states "default: 1000" while
 * `DEFAULT_CONFIG.maxImageDimension` in this file is 800 — confirm which value
 * the server actually applies and align the text.
 */
export const uiToolDefinition = {
  name: "ui",
  description:
    "Interact with app UI via accessibility tree. Auto-selects device if only one connected. Operations: dump, find, tap, input, scroll, screenshot, accessibility-check, visual-snapshot.",
  inputSchema: {
    type: "object",
    properties: {
      operation: {
        type: "string",
        enum: [
          "dump",
          "find",
          "tap",
          "input",
          "scroll",
          "screenshot",
          "accessibility-check",
          "visual-snapshot",
        ],
      },
      selector: {
        type: "object",
        properties: {
          resourceId: { type: "string" },
          text: { type: "string" },
          textContains: { type: "string" },
          className: { type: "string" },
          nearestTo: {
            type: "string",
            description: "Find elements nearest to this text (spatial proximity)",
          },
        },
        description: "Element selector (for find)",
      },
      x: { type: "number", description: "X coordinate (for tap)" },
      y: { type: "number", description: "Y coordinate (for tap)" },
      elementIndex: { type: "number", description: "Element index from last find (for tap)" },
      text: { type: "string", description: "Text to input" },
      localPath: {
        type: "string",
        description:
          "Local path for screenshot (default: .replicant/screenshots/screenshot-{timestamp}.png)",
      },
      inline: {
        type: "boolean",
        description:
          "Return base64 image data (default: true). Set to false to save to file instead.",
      },
      debug: {
        type: "boolean",
        description: "Include source (accessibility/ocr) and confidence in response",
      },
      gridCell: {
        type: "number",
        minimum: 1,
        maximum: 24,
        description: "Grid cell number (1-24) for Tier 5 refinement",
      },
      gridPosition: {
        type: "number",
        minimum: 1,
        maximum: 5,
        description: "Position within cell (1=TL, 2=TR, 3=Center, 4=BL, 5=BR)",
      },
      deviceSpace: {
        type: "boolean",
        description:
          "For tap: treat x/y as device coordinates (skip image→device scaling). Use when coordinates come from adb shell input tap testing.",
      },
      maxDimension: {
        type: "number",
        description:
          "Max image dimension in pixels (default: 1000). Higher = better quality, more tokens.",
      },
      raw: {
        type: "boolean",
        description: "Skip scaling, return full device resolution. Warning: may exceed API limits.",
      },
      compact: {
        type: "boolean",
        description:
          "For dump: return paginated flat list of interactive elements (default: 20, use limit/offset for more).",
      },
      direction: {
        type: "string",
        enum: ["up", "down", "left", "right"],
        description: "Scroll direction (for scroll operation)",
      },
      amount: {
        type: "number",
        minimum: 0,
        maximum: 1,
        description: "Scroll amount as fraction of screen (0-1, default: 0.5)",
      },
      limit: {
        type: "number",
        minimum: 1,
        maximum: 100,
        description: "For dump with compact: max elements to return (default: 20).",
      },
      offset: {
        type: "number",
        minimum: 0,
        description: "For dump with compact: skip first N elements for pagination.",
      },
    },
    required: ["operation"],
  },
};

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/thecombatwombat/replicant-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

ui.ts•11.5 KiB