Skip to main content
Glama
webview-interactions.ts (14.8 kB)
import { z } from 'zod';
import { writeFile } from 'node:fs/promises';
import { resolve } from 'node:path';
import {
   executeInWebview,
   executeInWebviewWithContext,
   captureScreenshot,
   getConsoleLogs as getConsoleLogsFromCapture,
   ScreenshotResult,
} from './webview-executor.js';
import { SCRIPTS, buildScript, buildTypeScript, buildKeyEventScript } from './scripts/index.js';

// ============================================================================
// Base Schema for Window Targeting
// ============================================================================

/**
 * Base schema mixin for tools that can target a specific window and app.
 * All webview tools extend this to support multi-window and multi-app scenarios.
 */
export const WindowTargetSchema = z.object({
   windowId: z.string().optional().describe('Window label to target (defaults to "main")'),
   appIdentifier: z.union([ z.string(), z.number() ]).optional().describe(
      'App port or bundle ID to target. Defaults to the only connected app or the default app if multiple are connected.'
   ),
});

// ============================================================================
// Schemas
// ============================================================================

/** Input schema for pointer-style interactions (click, scroll, swipe, focus, ...). */
export const InteractSchema = WindowTargetSchema.extend({
   action: z.enum([ 'click', 'double-click', 'long-press', 'scroll', 'swipe', 'focus' ])
      .describe('Type of interaction to perform'),
   selector: z.string().optional().describe('CSS selector for the element to interact with'),
   x: z.number().optional().describe('X coordinate for direct coordinate interaction'),
   y: z.number().optional().describe('Y coordinate for direct coordinate interaction'),
   duration: z.number().optional()
      .describe('Duration in ms for long-press or swipe (default: 500ms for long-press, 300ms for swipe)'),
   scrollX: z.number().optional().describe('Horizontal scroll amount in pixels (positive = right)'),
   scrollY: z.number().optional().describe('Vertical scroll amount in pixels (positive = down)'),
   fromX: z.number().optional().describe('Starting X coordinate for swipe'),
   fromY: z.number().optional().describe('Starting Y coordinate for swipe'),
   toX: z.number().optional().describe('Ending X coordinate for swipe'),
   toY: z.number().optional().describe('Ending Y coordinate for swipe'),
});

/** Input schema for capturing a screenshot, optionally saving it to disk. */
export const ScreenshotSchema = WindowTargetSchema.extend({
   format: z.enum([ 'png', 'jpeg' ]).optional().default('png').describe('Image format'),
   quality: z.number().min(0).max(100).optional().describe('JPEG quality (0-100, only for jpeg format)'),
   filePath: z.string().optional().describe('File path to save the screenshot to instead of returning as base64'),
});

/** Input schema for typing text or dispatching key events. */
export const KeyboardSchema = WindowTargetSchema.extend({
   action: z.enum([ 'type', 'press', 'down', 'up' ])
      .describe('Keyboard action type: "type" for typing text into an element, "press/down/up" for key events'),
   selector: z.string().optional().describe('CSS selector for element to type into (required for "type" action)'),
   text: z.string().optional().describe('Text to type (required for "type" action)'),
   key: z.string().optional().describe('Key to press (required for "press/down/up" actions, e.g., "Enter", "a", "Escape")'),
   modifiers: z.array(z.enum([ 'Control', 'Alt', 'Shift', 'Meta' ])).optional().describe('Modifier keys to hold'),
});

/** Input schema for waiting on a selector, text content, or IPC event. */
export const WaitForSchema = WindowTargetSchema.extend({
   type: z.enum([ 'selector', 'text', 'ipc-event' ]).describe('What to wait for'),
   value: z.string().describe('Selector, text content, or IPC event name to wait for'),
   timeout: z.number().optional().default(5000).describe('Timeout in milliseconds (default: 5000ms)'),
});

/** Input schema for reading computed styles of one or more elements. */
export const GetStylesSchema = WindowTargetSchema.extend({
   selector: z.string().describe('CSS selector for element(s) to get styles from'),
   properties: z.array(z.string()).optional().describe('Specific CSS properties to retrieve. If omitted, returns all computed styles'),
   multiple: z.boolean().optional().default(false)
      .describe('Whether to get styles for all matching elements (true) or just the first (false)'),
});

/** Input schema for running arbitrary JavaScript in the webview. */
export const ExecuteJavaScriptSchema = WindowTargetSchema.extend({
   script: z.string().describe(
      'JavaScript code to execute in the webview context. ' +
      'If returning a value, it must be JSON-serializable. ' +
      'For functions that return values, use IIFE syntax: "(() => { return value; })()" not "() => { return value; }"'
   ),
   args: z.array(z.unknown()).optional().describe('Arguments to pass to the script'),
});

/** Input schema for focusing a single element. */
export const FocusElementSchema = WindowTargetSchema.extend({
   selector: z.string().describe('CSS selector for element to focus'),
});

/** Input schema for locating an element by CSS, XPath, or text. */
export const FindElementSchema = WindowTargetSchema.extend({
   selector: z.string(),
   strategy: z.enum([ 'css', 'xpath', 'text' ]).default('css'),
});

/** Input schema for retrieving captured console logs. */
export const GetConsoleLogsSchema = WindowTargetSchema.extend({
   filter: z.string().optional().describe('Regex or keyword to filter logs'),
   since: z.string().optional().describe('ISO timestamp to filter logs since'),
});

// ============================================================================
// Internal Helpers
// ============================================================================

/** Extracts a human-readable message from an arbitrary thrown value. */
function errorMessage(error: unknown): string {
   return error instanceof Error ? error.message : String(error);
}

/**
 * Runs `script` via `executeInWebview` and, on failure, rethrows an `Error`
 * whose message is `"<failurePrefix>: <original message>"`.
 *
 * @param script - The script text to execute.
 * @param failurePrefix - Text placed before the underlying error message.
 * @param windowId - Optional window label forwarded to the executor.
 * @param appIdentifier - Optional app port / bundle ID forwarded to the executor.
 * @returns Whatever `executeInWebview` resolves with.
 * @throws Error wrapping any failure raised by `executeInWebview`.
 */
async function runScript(
   script: string,
   failurePrefix: string,
   windowId?: string,
   appIdentifier?: string | number
): Promise<string> {
   try {
      return await executeInWebview(script, windowId, appIdentifier);
   } catch(error: unknown) {
      throw new Error(`${failurePrefix}: ${errorMessage(error)}`);
   }
}

// ============================================================================
// Implementation Functions
// ============================================================================

/**
 * Performs a pointer-style interaction in the webview.
 *
 * `swipe` is delegated to `performSwipe` and `focus` to `focusElement`; every
 * other action is executed through the `SCRIPTS.interact` script.
 *
 * @param options - Action name plus the selector/coordinates/scroll/swipe values it needs.
 * @returns The result produced by the executed script.
 * @throws Error if `focus` is requested without a selector, if a swipe is
 *    missing coordinates, or if script execution fails.
 */
export async function interact(options: {
   action: string;
   selector?: string;
   x?: number;
   y?: number;
   duration?: number;
   scrollX?: number;
   scrollY?: number;
   fromX?: number;
   fromY?: number;
   toX?: number;
   toY?: number;
   windowId?: string;
   appIdentifier?: string | number;
}): Promise<string> {
   const { action, selector, x, y, duration, scrollX, scrollY, fromX, fromY, toX, toY, windowId, appIdentifier } = options;

   // Swipe has its own script and its own default duration.
   if (action === 'swipe') {
      return performSwipe({ fromX, fromY, toX, toY, duration, windowId, appIdentifier });
   }

   if (action === 'focus') {
      if (!selector) {
         throw new Error('Focus action requires a selector');
      }
      return focusElement({ selector, windowId, appIdentifier });
   }

   // Missing optional values are passed as explicit null / 0 / 500 defaults.
   const script = buildScript(SCRIPTS.interact, {
      action,
      selector: selector ?? null,
      x: x ?? null,
      y: y ?? null,
      duration: duration ?? 500,
      scrollX: scrollX ?? 0,
      scrollY: scrollY ?? 0,
   });

   return runScript(script, 'Interaction failed', windowId, appIdentifier);
}

interface SwipeOptions {
   fromX?: number;
   fromY?: number;
   toX?: number;
   toY?: number;
   duration?: number;
   windowId?: string;
   appIdentifier?: string | number;
}

/**
 * Executes the `SCRIPTS.swipe` script between two points.
 *
 * @param options - Start/end coordinates (all four required) and optional
 *    duration (defaults to 300).
 * @throws Error if any coordinate is undefined or if script execution fails.
 */
async function performSwipe(options: SwipeOptions): Promise<string> {
   const { fromX, fromY, toX, toY, duration = 300, windowId, appIdentifier } = options;

   if (fromX === undefined || fromY === undefined || toX === undefined || toY === undefined) {
      throw new Error('Swipe action requires fromX, fromY, toX, and toY coordinates');
   }

   const script = buildScript(SCRIPTS.swipe, { fromX, fromY, toX, toY, duration });

   return runScript(script, 'Swipe failed', windowId, appIdentifier);
}

export interface ScreenshotOptions {
   quality?: number;
   format?: 'png' | 'jpeg';
   windowId?: string;
   filePath?: string;
   appIdentifier?: string | number;
}

export interface ScreenshotFileResult {
   filePath: string;
   format: 'png' | 'jpeg';
}

/**
 * Captures a screenshot via `captureScreenshot`.
 *
 * When `filePath` is given, the base64 image data from the capture result is
 * decoded and written to that path (resolved to an absolute path), and the
 * resolved path and format are returned instead of the capture result.
 *
 * @param options - Format (default `'png'`), quality, target window/app, and optional output path.
 * @returns The raw capture result, or `{ filePath, format }` when saved to disk.
 * @throws Error if `filePath` is given but the capture result has no image content.
 */
export async function screenshot(options: ScreenshotOptions = {}): Promise<ScreenshotResult | ScreenshotFileResult> {
   const { quality, format = 'png', windowId, filePath, appIdentifier } = options;

   const result = await captureScreenshot({ format, quality, windowId, appIdentifier });

   if (!filePath) {
      return result;
   }

   const imageContent = result.content.find((c) => { return c.type === 'image'; });

   // The second check narrows the content union so `.data` is accessible.
   if (!imageContent || imageContent.type !== 'image') {
      throw new Error('Screenshot capture failed: no image data');
   }

   const resolvedPath = resolve(filePath);

   await writeFile(resolvedPath, Buffer.from(imageContent.data, 'base64'));

   return { filePath: resolvedPath, format };
}

export interface KeyboardOptions {
   action: string;
   selectorOrKey?: string;
   textOrModifiers?: string | string[];
   modifiers?: string[];
   windowId?: string;
   appIdentifier?: string | number;
}

/**
 * Types text into an element or dispatches a key event.
 *
 * For `action === 'type'`, `selectorOrKey` is the target selector and
 * `textOrModifiers` must be the (non-empty) text. For any other action,
 * `selectorOrKey` is the key, and the modifiers are taken from
 * `textOrModifiers` when it is an array, otherwise from `modifiers`.
 *
 * @throws Error if required parameters are missing or script execution fails.
 */
export async function keyboard(options: KeyboardOptions): Promise<string> {
   const { action, selectorOrKey, textOrModifiers, modifiers, windowId, appIdentifier } = options;

   if (action === 'type') {
      const selector = selectorOrKey;
      // Narrow instead of asserting: an array here is not valid text.
      const text = typeof textOrModifiers === 'string' ? textOrModifiers : undefined;

      if (!selector || !text) {
         throw new Error('Type action requires both selector and text parameters');
      }

      return runScript(buildTypeScript(selector, text), 'Type action failed', windowId, appIdentifier);
   }

   // press/down/up: key is required, modifiers are optional.
   const key = selectorOrKey;
   const mods = Array.isArray(textOrModifiers) ? textOrModifiers : modifiers;

   if (!key) {
      throw new Error(`${action} action requires a key parameter`);
   }

   return runScript(buildKeyEventScript(action, key, mods ?? []), 'Keyboard action failed', windowId, appIdentifier);
}

export interface WaitForOptions {
   type: string;
   value: string;
   timeout?: number;
   windowId?: string;
   appIdentifier?: string | number;
}

/**
 * Executes the `SCRIPTS.waitFor` script with the given type, value, and
 * timeout (default 5000).
 *
 * @throws Error prefixed with "Wait failed" if script execution fails.
 */
export async function waitFor(options: WaitForOptions): Promise<string> {
   const { type, value, timeout = 5000, windowId, appIdentifier } = options;

   const script = buildScript(SCRIPTS.waitFor, { type, value, timeout });

   return runScript(script, 'Wait failed', windowId, appIdentifier);
}

export interface GetStylesOptions {
   selector: string;
   properties?: string[];
   multiple?: boolean;
   windowId?: string;
   appIdentifier?: string | number;
}

/**
 * Executes the `SCRIPTS.getStyles` script for `selector`.
 *
 * `properties` defaults to an empty array and `multiple` to `false`.
 *
 * @throws Error prefixed with "Get styles failed" if script execution fails.
 */
export async function getStyles(options: GetStylesOptions): Promise<string> {
   const { selector, properties, multiple = false, windowId, appIdentifier } = options;

   const script = buildScript(SCRIPTS.getStyles, {
      selector,
      properties: properties ?? [],
      multiple,
   });

   return runScript(script, 'Get styles failed', windowId, appIdentifier);
}

export interface ExecuteJavaScriptOptions {
   script: string;
   args?: unknown[];
   windowId?: string;
   appIdentifier?: string | number;
}

/**
 * Executes caller-supplied JavaScript in the webview.
 *
 * When `args` is non-empty, `script` is treated as a function expression: it
 * is wrapped in an IIFE that invokes it with the JSON-serialized `args`.
 * Otherwise `script` is executed as-is.
 *
 * @returns The execution result, prefixed with any warning reported by the
 *    executor and followed by a footer naming the window it ran in.
 * @throws Error prefixed with "JavaScript execution failed" on failure.
 */
export async function executeJavaScript(options: ExecuteJavaScriptOptions): Promise<string> {
   const { script, args, windowId, appIdentifier } = options;

   // The newline after ${script} keeps a trailing `//` comment in the user's
   // script from commenting out the closing parenthesis.
   const wrappedScript = args && args.length > 0
      ? `
      (function() {
         const args = ${JSON.stringify(args)};
         return (${script}
         ).apply(null, args);
      })();
      `
      : script;

   try {
      const { result, windowLabel, warning } = await executeInWebviewWithContext(wrappedScript, windowId, appIdentifier);

      let response = result;

      if (warning) {
         response = `⚠️ ${warning}\n\n${response}`;
      }

      // Footer tells the caller which window the script actually ran in.
      response += `\n\n[Executed in window: ${windowLabel}]`;

      return response;
   } catch(error: unknown) {
      throw new Error(`JavaScript execution failed: ${errorMessage(error)}`);
   }
}

export interface FocusElementOptions {
   selector: string;
   windowId?: string;
   appIdentifier?: string | number;
}

/**
 * Executes the `SCRIPTS.focus` script for `selector`.
 *
 * @throws Error prefixed with "Focus failed" if script execution fails.
 */
export async function focusElement(options: FocusElementOptions): Promise<string> {
   const { selector, windowId, appIdentifier } = options;

   const script = buildScript(SCRIPTS.focus, { selector });

   return runScript(script, 'Focus failed', windowId, appIdentifier);
}

export interface FindElementOptions {
   selector: string;
   strategy: string;
   windowId?: string;
   appIdentifier?: string | number;
}

/**
 * Find an element using various selector strategies.
 *
 * @throws Error prefixed with "Find element failed" if script execution fails.
 */
export async function findElement(options: FindElementOptions): Promise<string> {
   const { selector, strategy, windowId, appIdentifier } = options;

   const script = buildScript(SCRIPTS.findElement, { selector, strategy });

   return runScript(script, 'Find element failed', windowId, appIdentifier);
}

export interface GetConsoleLogsOptions {
   filter?: string;
   since?: string;
   windowId?: string;
   appIdentifier?: string | number;
}

/**
 * Get console logs from the webview.
 *
 * Forwards `filter`, `since`, and the window/app target to the log capture
 * function imported from `./webview-executor.js`.
 *
 * @throws Error prefixed with "Failed to get console logs" on failure.
 */
export async function getConsoleLogs(options: GetConsoleLogsOptions = {}): Promise<string> {
   const { filter, since, windowId, appIdentifier } = options;

   try {
      return await getConsoleLogsFromCapture(filter, since, windowId, appIdentifier);
   } catch(error: unknown) {
      throw new Error(`Failed to get console logs: ${errorMessage(error)}`);
   }
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/hypothesi/mcp-server-tauri'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.