// desktop-tools.ts
/**
* ByteBot Desktop API MCP Tools
*
* MCP tool definitions for ByteBot computer control operations
*/
import { DesktopClient } from '../clients/desktop-client.js';
import { formatErrorForMCP } from '../utils/error-handler.js';
/**
 * Tool definitions for Desktop API
 *
 * Builds the list of MCP tool descriptors (name, description and JSON-schema
 * style `inputSchema`) for the mouse, keyboard, screen, file and system
 * operations. Every call returns a freshly built array and fresh nested
 * objects.
 *
 * @returns The thirteen `bytebot_*` tool descriptors, in a fixed order.
 */
export function getDesktopTools() {
  const OBJECT = 'object' as const;

  // Small factories for the recurring property descriptors; each call yields
  // a new object so no descriptor instance is shared between tools.
  const numberProp = (description: string) => ({ type: 'number', description });
  const stringProp = (description: string) => ({ type: 'string', description });
  const pixelX = () => numberProp('X coordinate (horizontal position in pixels)');
  const pixelY = () => numberProp('Y coordinate (vertical position in pixels)');

  return [
    // ----- Mouse -----
    {
      name: 'bytebot_move_mouse',
      description:
        'Move the mouse cursor to specific screen coordinates. Use this to position the cursor before clicking or dragging.',
      inputSchema: {
        type: OBJECT,
        properties: { x: pixelX(), y: pixelY() },
        required: ['x', 'y'],
      },
    },
    {
      name: 'bytebot_click',
      description:
        'Click at specific screen coordinates. Supports left, right, and middle mouse buttons, as well as double-clicks.',
      inputSchema: {
        type: OBJECT,
        properties: {
          x: pixelX(),
          y: pixelY(),
          button: {
            type: 'string',
            enum: ['left', 'right', 'middle'],
            description: 'Mouse button to click. Default: left',
            default: 'left',
          },
          count: {
            ...numberProp('Number of clicks (1 = single click, 2 = double click). Default: 1'),
            default: 1,
          },
        },
        required: ['x', 'y'],
      },
    },
    {
      name: 'bytebot_drag',
      description:
        'Drag the mouse from one position to another. Useful for moving windows, selecting text, or drag-and-drop operations.',
      inputSchema: {
        type: OBJECT,
        properties: {
          from_x: numberProp('Starting X coordinate'),
          from_y: numberProp('Starting Y coordinate'),
          to_x: numberProp('Ending X coordinate'),
          to_y: numberProp('Ending Y coordinate'),
        },
        required: ['from_x', 'from_y', 'to_x', 'to_y'],
      },
    },
    {
      name: 'bytebot_scroll',
      description:
        'Scroll the screen in a specific direction. Use this to navigate through content.',
      inputSchema: {
        type: OBJECT,
        properties: {
          direction: {
            type: 'string',
            enum: ['up', 'down', 'left', 'right'],
            description: 'Direction to scroll',
          },
          count: {
            ...numberProp('Number of scroll increments. Default: 1'),
            default: 1,
          },
        },
        required: ['direction'],
      },
    },

    // ----- Keyboard -----
    {
      name: 'bytebot_type_text',
      description:
        'Type text string as if typing on a keyboard. Use this for entering text in forms, search boxes, etc.',
      inputSchema: {
        type: OBJECT,
        properties: {
          text: stringProp('Text to type'),
          delay: numberProp('Optional delay between keystrokes in milliseconds'),
        },
        required: ['text'],
      },
    },
    {
      name: 'bytebot_paste_text',
      description:
        'Paste text directly (without typing). Faster than type_text and works with special characters/emojis.',
      inputSchema: {
        type: OBJECT,
        properties: { text: stringProp('Text to paste') },
        required: ['text'],
      },
    },
    {
      name: 'bytebot_press_keys',
      description:
        'Press keyboard keys including modifiers (Ctrl, Shift, Alt, etc.). Use this for keyboard shortcuts like Ctrl+C, Ctrl+V, Alt+Tab.',
      inputSchema: {
        type: OBJECT,
        properties: {
          keys: {
            type: 'array',
            items: { type: 'string' },
            description:
              'Array of keys to press together. Examples: ["ctrl", "c"] for copy, ["alt", "tab"] for app switching',
          },
        },
        required: ['keys'],
      },
    },

    // ----- Screen (no arguments, hence no `required` list) -----
    {
      name: 'bytebot_screenshot',
      description:
        'Capture a screenshot of the entire screen. Returns base64-encoded PNG image data.',
      inputSchema: { type: OBJECT, properties: {} },
    },
    {
      name: 'bytebot_cursor_position',
      description:
        'Get the current mouse cursor position. Returns {x, y} coordinates.',
      inputSchema: { type: OBJECT, properties: {} },
    },

    // ----- File I/O -----
    {
      name: 'bytebot_read_file',
      description:
        'Read a file from the filesystem. Returns base64-encoded file content.',
      inputSchema: {
        type: OBJECT,
        properties: {
          path: stringProp('Absolute or relative path to the file to read'),
        },
        required: ['path'],
      },
    },
    {
      name: 'bytebot_write_file',
      description:
        'Write content to a file on the filesystem. Content must be base64-encoded.',
      inputSchema: {
        type: OBJECT,
        properties: {
          path: stringProp('Absolute or relative path to the file to write'),
          content: stringProp('Base64-encoded file content'),
        },
        required: ['path', 'content'],
      },
    },

    // ----- System -----
    {
      name: 'bytebot_switch_application',
      description:
        'Switch to a specific application window. Use this to bring an app to the foreground.',
      inputSchema: {
        type: OBJECT,
        properties: {
          name: stringProp(
            'Application name (e.g., "firefox", "terminal", "vscode", "chrome", "safari")'
          ),
        },
        required: ['name'],
      },
    },
    {
      name: 'bytebot_wait',
      description:
        'Wait for a specified duration. Use this to add delays between actions or wait for UI updates.',
      inputSchema: {
        type: OBJECT,
        properties: {
          duration: numberProp('Duration to wait in milliseconds'),
        },
        required: ['duration'],
      },
    },
  ];
}
/**
 * Tool handlers for Desktop API
 *
 * Routes an MCP tool call to the matching `DesktopClient` method and wraps the
 * outcome in an MCP tool result.
 *
 * Arguments arrive from the caller untyped at runtime, so each one is checked
 * against the shape advertised in the tool's input schema before it is
 * forwarded. A missing or mistyped argument is reported as a tool error
 * instead of being passed through to the desktop client.
 *
 * @param toolName - Name of the tool to run (one of the `bytebot_*` names).
 * @param args - Raw tool arguments supplied by the caller.
 * @param desktopClient - Client used to perform the desktop action.
 * @returns A result with a single text content item. When the tool is unknown,
 *   an argument is invalid, or the client throws, the text is the formatted
 *   error and `isError` is `true`.
 */
export async function handleDesktopTool(
  toolName: string,
  args: Record<string, unknown>,
  desktopClient: DesktopClient
) {
  try {
    // Tools without parameters may be invoked with no arguments object at all.
    const input: Record<string, unknown> = args ?? {};
    let result;
    switch (toolName) {
      case 'bytebot_move_mouse':
        result = await desktopClient.moveMouse(
          requireNumberArg(input, 'x'),
          requireNumberArg(input, 'y')
        );
        break;
      case 'bytebot_click':
        result = await desktopClient.clickMouse(
          requireNumberArg(input, 'x'),
          requireNumberArg(input, 'y'),
          // Value is validated at runtime; the cast stays because the client's
          // parameter type is declared outside this file.
          (optionalChoiceArg(input, 'button', ['left', 'right', 'middle']) ?? 'left') as any,
          // `||` rather than `??` on purpose: a count of 0 falls back to 1.
          optionalNumberArg(input, 'count') || 1
        );
        break;
      case 'bytebot_drag':
        result = await desktopClient.dragMouse(
          requireNumberArg(input, 'from_x'),
          requireNumberArg(input, 'from_y'),
          requireNumberArg(input, 'to_x'),
          requireNumberArg(input, 'to_y')
        );
        break;
      case 'bytebot_scroll':
        result = await desktopClient.scroll(
          // Validated at runtime; see the note on `button` above.
          requireChoiceArg(input, 'direction', ['up', 'down', 'left', 'right']) as any,
          // `||` rather than `??` on purpose: a count of 0 falls back to 1.
          optionalNumberArg(input, 'count') || 1
        );
        break;
      case 'bytebot_type_text':
        result = await desktopClient.typeText(
          requireStringArg(input, 'text'),
          optionalNumberArg(input, 'delay')
        );
        break;
      case 'bytebot_paste_text':
        result = await desktopClient.pasteText(requireStringArg(input, 'text'));
        break;
      case 'bytebot_press_keys':
        result = await desktopClient.pressKeys(requireStringArrayArg(input, 'keys'));
        break;
      case 'bytebot_screenshot':
        result = await desktopClient.screenshot();
        // Validate screenshot size
        if (result.screenshot) {
          desktopClient.validateScreenshotSize(result.screenshot);
        }
        break;
      case 'bytebot_cursor_position':
        result = await desktopClient.getCursorPosition();
        break;
      case 'bytebot_read_file':
        result = await desktopClient.readFile(requireStringArg(input, 'path'));
        break;
      case 'bytebot_write_file':
        result = await desktopClient.writeFile(
          requireStringArg(input, 'path'),
          requireStringArg(input, 'content')
        );
        break;
      case 'bytebot_switch_application':
        result = await desktopClient.switchApplication(requireStringArg(input, 'name'));
        break;
      case 'bytebot_wait':
        result = await desktopClient.wait(requireNumberArg(input, 'duration'));
        break;
      default:
        throw new Error(`Unknown tool: ${toolName}`);
    }
    // Format response
    const responseText = formatDesktopResponse(result, toolName);
    return {
      content: [
        {
          type: 'text',
          text: responseText,
        },
      ],
    };
  } catch (error) {
    const errorInfo = formatErrorForMCP(error);
    return {
      content: [
        {
          type: 'text',
          text: `Error: ${errorInfo.error}${errorInfo.details ? '\n\nDetails:\n' + errorInfo.details : ''}`,
        },
      ],
      isError: true,
    };
  }
}

/** Names the runtime type of an argument value for use in validation errors. */
function describeArgType(value: unknown): string {
  if (value === null) return 'null';
  if (Array.isArray(value)) return 'array';
  return typeof value;
}

/** Returns `args[key]` as a finite number, or throws a descriptive error. */
function requireNumberArg(args: Record<string, unknown>, key: string): number {
  const value = args[key];
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    throw new Error(
      `Invalid argument "${key}": expected a finite number, received ${describeArgType(value)}`
    );
  }
  return value;
}

/** Like `requireNumberArg`, but yields `undefined` when the argument is absent or null. */
function optionalNumberArg(
  args: Record<string, unknown>,
  key: string
): number | undefined {
  const value = args[key];
  if (value === undefined || value === null) return undefined;
  return requireNumberArg(args, key);
}

/** Returns `args[key]` as a string (empty strings allowed), or throws a descriptive error. */
function requireStringArg(args: Record<string, unknown>, key: string): string {
  const value = args[key];
  if (typeof value !== 'string') {
    throw new Error(
      `Invalid argument "${key}": expected a string, received ${describeArgType(value)}`
    );
  }
  return value;
}

/** Returns `args[key]` as an array of strings, or throws a descriptive error. */
function requireStringArrayArg(args: Record<string, unknown>, key: string): string[] {
  const value = args[key];
  if (!Array.isArray(value) || !value.every((item) => typeof item === 'string')) {
    throw new Error(
      `Invalid argument "${key}": expected an array of strings, received ${describeArgType(value)}`
    );
  }
  return value as string[];
}

/** Returns `args[key]` when it is one of `allowed`, or throws a descriptive error. */
function requireChoiceArg(
  args: Record<string, unknown>,
  key: string,
  allowed: readonly string[]
): string {
  const value = args[key];
  if (typeof value !== 'string' || !allowed.includes(value)) {
    throw new Error(
      `Invalid argument "${key}": expected one of ${allowed.join(', ')}, received ${
        typeof value === 'string' ? `"${value}"` : describeArgType(value)
      }`
    );
  }
  return value;
}

/** Like `requireChoiceArg`, but yields `undefined` when the argument is absent or null. */
function optionalChoiceArg(
  args: Record<string, unknown>,
  key: string,
  allowed: readonly string[]
): string | undefined {
  const value = args[key];
  if (value === undefined || value === null) return undefined;
  return requireChoiceArg(args, key, allowed);
}
/**
 * Format desktop action response for display
 *
 * Produces a newline-separated, human-readable summary of a desktop action:
 * a success line, then optional lines for the duration, screenshot, cursor
 * position, file content and message, depending on which fields the result
 * carries.
 *
 * Base64 payloads (`screenshot`, `content`) are summarised by their decoded
 * size plus a preview of at most 100 characters; the preview ends in `...`
 * only when the data was actually cut.
 *
 * @param result - Value returned by the desktop client. A null, undefined or
 *   non-object result yields just the success line instead of throwing.
 * @param toolName - Name of the tool that was run, echoed in the first line.
 * @returns The formatted multi-line summary.
 */
function formatDesktopResponse(result: unknown, toolName: string): string {
  const lines: string[] = [`✓ ${toolName} completed successfully`];

  // A successful action without a payload must not be turned into an error
  // by a property access on null/undefined.
  if (typeof result !== 'object' || result === null) {
    return lines.join('\n');
  }
  const data = result as Record<string, unknown>;

  // Only report a duration when one was measured (avoids "undefinedms").
  if (data.duration !== undefined && data.duration !== null) {
    lines.push(`Duration: ${String(data.duration)}ms`);
  }

  // Add specific result data based on tool type
  if (typeof data.screenshot === 'string' && data.screenshot.length > 0) {
    lines.push(`Screenshot captured: ${base64SizeKB(data.screenshot)} KB`);
    lines.push(`Base64 data: ${previewBase64(data.screenshot)}`);
  }

  const position = data.position;
  if (typeof position === 'object' && position !== null) {
    const { x, y } = position as { x?: unknown; y?: unknown };
    lines.push(`Cursor position: (${String(x)}, ${String(y)})`);
  }

  if (typeof data.content === 'string' && data.content.length > 0) {
    lines.push(`File content: ${base64SizeKB(data.content)} KB`);
    lines.push(`Base64 data: ${previewBase64(data.content)}`);
  }

  if (data.message) {
    lines.push(`Message: ${String(data.message)}`);
  }

  return lines.join('\n');
}

/** Decoded size of a base64 string in KB (two decimals), excluding `=` padding. */
function base64SizeKB(base64: string): string {
  const padding = base64.endsWith('==') ? 2 : base64.endsWith('=') ? 1 : 0;
  const bytes = Math.max(0, Math.floor((base64.length * 3) / 4) - padding);
  return (bytes / 1024).toFixed(2);
}

/** First 100 characters of a base64 string, with `...` appended only if truncated. */
function previewBase64(base64: string): string {
  const previewLength = 100;
  return base64.length > previewLength
    ? `${base64.substring(0, previewLength)}...`
    : base64;
}