/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import {logger} from '../logger.js';
import type {McpContext, TextSnapshotNode} from '../McpContext.js';
import {zod} from '../third_party/index.js';
import type {ElementHandle} from '../third_party/index.js';
import {parseKey} from '../utils/keyboard.js';
import {ToolCategory} from './categories.js';
import {defineTool} from './ToolDefinition.js';
const dblClickSchema = zod
.boolean()
.optional()
.describe('Set to true for double clicks. Default is false.');
const includeSnapshotSchema = zod
.boolean()
.optional()
.describe('Whether to include a snapshot in the response. Default is false.');
function handleActionError(error: unknown, uid: string) {
logger('failed to act using a locator', error);
throw new Error(
`Failed to interact with the element with uid ${uid}. The element did not become interactive within the configured timeout.`,
{
cause: error,
},
);
}
export const click = defineTool({
name: 'click',
description: `Clicks on the provided element`,
annotations: {
category: ToolCategory.INPUT,
readOnlyHint: false,
},
schema: {
uid: zod
.string()
.describe(
'The uid of an element on the page from the page content snapshot',
),
dblClick: dblClickSchema,
includeSnapshot: includeSnapshotSchema,
},
handler: async (request, response, context) => {
const uid = request.params.uid;
const handle = await context.getElementByUid(uid);
try {
await context.waitForEventsAfterAction(async () => {
await handle.asLocator().click({
count: request.params.dblClick ? 2 : 1,
});
});
response.appendResponseLine(
request.params.dblClick
? `Successfully double clicked on the element`
: `Successfully clicked on the element`,
);
if (request.params.includeSnapshot) {
response.includeSnapshot();
}
} catch (error) {
handleActionError(error, uid);
} finally {
void handle.dispose();
}
},
});
export const clickAt = defineTool({
name: 'click_at',
description: `Clicks at the provided coordinates`,
annotations: {
category: ToolCategory.INPUT,
readOnlyHint: false,
conditions: ['computerVision'],
},
schema: {
x: zod.number().describe('The x coordinate'),
y: zod.number().describe('The y coordinate'),
dblClick: dblClickSchema,
includeSnapshot: includeSnapshotSchema,
},
handler: async (request, response, context) => {
const page = context.getSelectedPage();
await context.waitForEventsAfterAction(async () => {
await page.mouse.click(request.params.x, request.params.y, {
clickCount: request.params.dblClick ? 2 : 1,
});
});
response.appendResponseLine(
request.params.dblClick
? `Successfully double clicked at the coordinates`
: `Successfully clicked at the coordinates`,
);
if (request.params.includeSnapshot) {
response.includeSnapshot();
}
},
});
export const hover = defineTool({
name: 'hover',
description: `Hover over the provided element`,
annotations: {
category: ToolCategory.INPUT,
readOnlyHint: false,
},
schema: {
uid: zod
.string()
.describe(
'The uid of an element on the page from the page content snapshot',
),
includeSnapshot: includeSnapshotSchema,
},
handler: async (request, response, context) => {
const uid = request.params.uid;
const handle = await context.getElementByUid(uid);
try {
await context.waitForEventsAfterAction(async () => {
await handle.asLocator().hover();
});
response.appendResponseLine(`Successfully hovered over the element`);
if (request.params.includeSnapshot) {
response.includeSnapshot();
}
} catch (error) {
handleActionError(error, uid);
} finally {
void handle.dispose();
}
},
});
// The AXNode for an option doesn't contain its `value`. We set text content of the option as value.
// If the form is a combobox, we need to find the correct option by its text value.
// To do that, loop through the children while checking which child's text matches the requested value (requested value is actually the text content).
// When the correct option is found, use the element handle to get the real value.
async function selectOption(
handle: ElementHandle,
aXNode: TextSnapshotNode,
value: string,
) {
let optionFound = false;
for (const child of aXNode.children) {
if (child.role === 'option' && child.name === value && child.value) {
optionFound = true;
const childHandle = await child.elementHandle();
if (childHandle) {
try {
const childValueHandle = await childHandle.getProperty('value');
try {
const childValue = await childValueHandle.jsonValue();
if (childValue) {
await handle.asLocator().fill(childValue.toString());
}
} finally {
void childValueHandle.dispose();
}
break;
} finally {
void childHandle.dispose();
}
}
}
}
if (!optionFound) {
throw new Error(`Could not find option with text "${value}"`);
}
}
async function fillFormElement(
uid: string,
value: string,
context: McpContext,
) {
const handle = await context.getElementByUid(uid);
try {
const aXNode = context.getAXNodeByUid(uid);
if (aXNode && aXNode.role === 'combobox') {
await selectOption(handle, aXNode, value);
} else {
// Increase timeout for longer input values.
const timeoutPerChar = 10; // ms
const fillTimeout =
context.getSelectedPage().getDefaultTimeout() +
value.length * timeoutPerChar;
await handle.asLocator().setTimeout(fillTimeout).fill(value);
}
} catch (error) {
handleActionError(error, uid);
} finally {
void handle.dispose();
}
}
export const fill = defineTool({
name: 'fill',
description: `Type text into a input, text area or select an option from a <select> element.`,
annotations: {
category: ToolCategory.INPUT,
readOnlyHint: false,
},
schema: {
uid: zod
.string()
.describe(
'The uid of an element on the page from the page content snapshot',
),
value: zod.string().describe('The value to fill in'),
includeSnapshot: includeSnapshotSchema,
},
handler: async (request, response, context) => {
await context.waitForEventsAfterAction(async () => {
await context.getSelectedPage().keyboard.type(request.params.value);
await fillFormElement(
request.params.uid,
request.params.value,
context as McpContext,
);
});
response.appendResponseLine(`Successfully filled out the element`);
if (request.params.includeSnapshot) {
response.includeSnapshot();
}
},
});
export const drag = defineTool({
name: 'drag',
description: `Drag an element onto another element`,
annotations: {
category: ToolCategory.INPUT,
readOnlyHint: false,
},
schema: {
from_uid: zod.string().describe('The uid of the element to drag'),
to_uid: zod.string().describe('The uid of the element to drop into'),
includeSnapshot: includeSnapshotSchema,
},
handler: async (request, response, context) => {
const fromHandle = await context.getElementByUid(request.params.from_uid);
const toHandle = await context.getElementByUid(request.params.to_uid);
try {
await context.waitForEventsAfterAction(async () => {
await fromHandle.drag(toHandle);
await new Promise(resolve => setTimeout(resolve, 50));
await toHandle.drop(fromHandle);
});
response.appendResponseLine(`Successfully dragged an element`);
if (request.params.includeSnapshot) {
response.includeSnapshot();
}
} finally {
void fromHandle.dispose();
void toHandle.dispose();
}
},
});
export const fillForm = defineTool({
name: 'fill_form',
description: `Fill out multiple form elements at once`,
annotations: {
category: ToolCategory.INPUT,
readOnlyHint: false,
},
schema: {
elements: zod
.array(
zod.object({
uid: zod.string().describe('The uid of the element to fill out'),
value: zod.string().describe('Value for the element'),
}),
)
.describe('Elements from snapshot to fill out.'),
includeSnapshot: includeSnapshotSchema,
},
handler: async (request, response, context) => {
for (const element of request.params.elements) {
await context.waitForEventsAfterAction(async () => {
await fillFormElement(
element.uid,
element.value,
context as McpContext,
);
});
}
response.appendResponseLine(`Successfully filled out the form`);
if (request.params.includeSnapshot) {
response.includeSnapshot();
}
},
});
export const uploadFile = defineTool({
name: 'upload_file',
description: 'Upload a file through a provided element.',
annotations: {
category: ToolCategory.INPUT,
readOnlyHint: false,
},
schema: {
uid: zod
.string()
.describe(
'The uid of the file input element or an element that will open file chooser on the page from the page content snapshot',
),
filePath: zod.string().describe('The local path of the file to upload'),
includeSnapshot: includeSnapshotSchema,
},
handler: async (request, response, context) => {
const {uid, filePath} = request.params;
const handle = (await context.getElementByUid(
uid,
)) as ElementHandle<HTMLInputElement>;
try {
try {
await handle.uploadFile(filePath);
} catch {
// Some sites use a proxy element to trigger file upload instead of
// a type=file element. In this case, we want to default to
// Page.waitForFileChooser() and upload the file this way.
try {
const page = context.getSelectedPage();
const [fileChooser] = await Promise.all([
page.waitForFileChooser({timeout: 3000}),
handle.asLocator().click(),
]);
await fileChooser.accept([filePath]);
} catch {
throw new Error(
`Failed to upload file. The element could not accept the file directly, and clicking it did not trigger a file chooser.`,
);
}
}
if (request.params.includeSnapshot) {
response.includeSnapshot();
}
response.appendResponseLine(`File uploaded from ${filePath}.`);
} finally {
void handle.dispose();
}
},
});
export const pressKey = defineTool({
name: 'press_key',
description: `Press a key or key combination. Use this when other input methods like fill() cannot be used (e.g., keyboard shortcuts, navigation keys, or special key combinations).`,
annotations: {
category: ToolCategory.INPUT,
readOnlyHint: false,
},
schema: {
key: zod
.string()
.describe(
'A key or a combination (e.g., "Enter", "Control+A", "Control++", "Control+Shift+R"). Modifiers: Control, Shift, Alt, Meta',
),
includeSnapshot: includeSnapshotSchema,
},
handler: async (request, response, context) => {
const page = context.getSelectedPage();
const tokens = parseKey(request.params.key);
const [key, ...modifiers] = tokens;
await context.waitForEventsAfterAction(async () => {
for (const modifier of modifiers) {
await page.keyboard.down(modifier);
}
await page.keyboard.press(key);
for (const modifier of modifiers.toReversed()) {
await page.keyboard.up(modifier);
}
});
response.appendResponseLine(
`Successfully pressed key: ${request.params.key}`,
);
if (request.params.includeSnapshot) {
response.includeSnapshot();
}
},
});