mcp-browser-use

# ruff: noqa: E402
import asyncio
import logging
import sys
from typing import Optional
from urllib.parse import quote_plus

# Logging is configured before the third-party imports below (hence the E402
# suppression above). NOTE(review): presumably this is so the configuration is
# in place before those packages set up their own loggers - confirm.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
logging.getLogger("browser_use").setLevel(logging.CRITICAL)
logging.getLogger("playwright").setLevel(logging.CRITICAL)

import json

import markdownify
from browser_use.agent.message_manager.service import MessageManager
from browser_use.agent.prompts import AgentMessagePrompt, SystemPrompt
from browser_use.browser.browser import Browser, BrowserConfig
from browser_use.browser.context import BrowserContext
from mcp.server.fastmcp import FastMCP

from .utils import check_playwright_installation

mcp = FastMCP("browser_use")

# Module-level browser state shared by every tool. `browser` and
# `browser_context` stay None until initialize_browser() has been called and
# are reset to None by close_browser().
browser: Optional[Browser] = None
browser_context: Optional[BrowserContext] = None
message_manager: Optional[MessageManager] = None


@mcp.tool()
async def initialize_browser(headless: bool = False, task: str = "") -> str:
    """Initialize a new browser instance.

    Any browser that is already open is closed first, so calling this tool
    repeatedly always leaves exactly one browser/context pair.

    Args:
        headless: Whether to run browser in headless mode
        task: The task to be performed

    Returns:
        The system prompt text (including the list of available actions)
        followed by the task statement.
    """
    global browser, browser_context

    if browser:
        await close_browser()

    config = BrowserConfig(headless=headless)
    browser = Browser(config=config)
    browser_context = BrowserContext(browser=browser)

    system_prompt = SystemPrompt(
        action_description=(
            "Available actions: initialize_browser, close_browser, search_google, go_to_url, go_back, wait, click_element, input_text, "
            "switch_tab, open_tab, inspect_page, scroll_down, scroll_up, send_keys, scroll_to_text, "
            "get_dropdown_options, select_dropdown_option, validate_page, done"
        )
    ).get_system_message()

    browser_system_prompt = f"""
    {system_prompt.text()}
    Your ultimate task is: {task}.
    If you achieved your ultimate task, stop everything and use the done tool to complete the task.
    If not, continue as usual.
    """
    return browser_system_prompt


@mcp.tool()
async def close_browser() -> str:
    """Close the current browser instance.

    Closes the browser context first and then the browser itself, resetting
    both module-level references to None. Safe to call when nothing is open.

    Returns:
        Status message
    """
    global browser, browser_context

    if browser_context:
        await browser_context.close()
        browser_context = None

    if browser:
        await browser.close()
        browser = None

    return "Browser closed successfully"


@mcp.tool()
async def search_google(query: str) -> str:
    """
    Search the query in Google in the current tab.

    Requires initialize_browser() to have been called first.

    Args:
        query (str): The search query to use in Google

    Returns:
        str: A message confirming the search was performed
    """
    page = await browser_context.get_current_page()
    # URL-encode the query so characters such as '&', '#', '+' or spaces are
    # sent as part of the search term instead of altering the URL structure.
    await page.goto(f"https://www.google.com/search?q={quote_plus(query)}&udm=14")
    await page.wait_for_load_state()
    return f'🔍 Searched for "{query}" in Google'


@mcp.tool()
async def go_to_url(url: str) -> str:
    """
    Navigate to URL in the current tab.

    Requires initialize_browser() to have been called first.

    Args:
        url (str): The URL to navigate to

    Returns:
        str: A message confirming navigation
    """
    page = await browser_context.get_current_page()
    await page.goto(url)
    await page.wait_for_load_state()
    return f"🔗 Navigated to {url}"


@mcp.tool()
async def go_back() -> str:
    """
    Go back to the previous page.

    Requires initialize_browser() to have been called first.

    Returns:
        str: A message confirming navigation back
    """
    await browser_context.go_back()
    return "🔙 Navigated back"


@mcp.tool()
async def wait(seconds: int = 3) -> str:
    """
    Wait for the specified number of seconds.

    Args:
        seconds (int, optional): Number of seconds to wait. Defaults to 3.

    Returns:
        str: A message confirming the wait
    """
    await asyncio.sleep(seconds)
    return f"🕒 Waiting for {seconds} seconds"


async def _click_and_report(element_node, index: int, session, initial_pages: int) -> str:
    """Click *element_node* once and build the status message for the outcome.

    Reports a download path when the click produced one, otherwise the text of
    the clicked element. If the number of open pages grew past
    *initial_pages*, the newest tab is selected and the message says so.
    """
    download_path = await browser_context._click_element_node(element_node)
    if download_path:
        msg = f"💾 Downloaded file to {download_path}"
    else:
        msg = f"🖱️ Clicked button with index {index}: {element_node.get_all_text_till_next_clickable_element(max_depth=2)}"

    # Handle new tab opening
    if len(session.context.pages) > initial_pages:
        msg += " - New tab opened - switching to it"
        await browser_context.switch_to_tab(-1)
    return msg


@mcp.tool()
async def click_element(index: int) -> str:
    """
    Click the element with the specified index.

    Requires initialize_browser() to have been called first. A click that
    fails with "Element not found" or "Failed to click element" is retried
    once after a one-second pause; any other failure is returned as an error
    message rather than raised.

    Args:
        index (int): The index of the element to click

    Returns:
        str: A message describing the result of the click action

    Raises:
        Exception: If the index is not in the current selector map, or if the
            retried click fails as well.
    """
    if index not in await browser_context.get_selector_map():
        raise Exception(
            f"Element with index {index} does not exist - retry or use alternative actions"
        )

    element_node = await browser_context.get_dom_element_by_index(index)
    session = await browser_context.get_session()
    # Remember the tab count so a tab opened by the click can be detected.
    initial_pages = len(session.context.pages)

    # Check if element is a file uploader
    if await browser_context.is_file_uploader(element_node):
        return f"Index {index} - has an element which opens file upload dialog. Use a dedicated function for file uploads"

    try:
        return await _click_and_report(element_node, index, session, initial_pages)
    except Exception as e:
        retryable = "Element not found" in str(e) or "Failed to click element" in str(e)
        if not retryable:
            return f"Error clicking element with index {index}: {str(e)}. Call inspect_page() and try finding the element again."

        # Wait a moment and try again
        await asyncio.sleep(1)
        try:
            return await _click_and_report(element_node, index, session, initial_pages)
        except Exception as retry_error:
            # The message reports the first failure; the retry failure is kept
            # as the chained cause so neither is lost.
            raise Exception(
                f"Failed to click element with index {index} even after waiting: {str(e)}"
            ) from retry_error
@mcp.tool()
async def input_text(index: int, text: str, has_sensitive_data: bool = False) -> str:
    """
    Input text into an interactive element at the specified index.

    Args:
        index (int): The index of the element to input text into
        text (str): The text to input
        has_sensitive_data (bool, optional): Whether the text is sensitive data.
            When True the text is left out of the returned message.
            Defaults to False.

    Returns:
        str: A message confirming the text input

    Raises:
        Exception: If the index is not in the current selector map.
    """
    if index not in await browser_context.get_selector_map():
        raise Exception(
            f"Element index {index} does not exist - retry or use alternative actions"
        )

    element_node = await browser_context.get_dom_element_by_index(index)
    await browser_context._input_text_element_node(element_node, text)

    if has_sensitive_data:
        return f"⌨️ Input sensitive data into index {index}"
    return f"⌨️ Input {text} into index {index}"


@mcp.tool()
async def switch_tab(page_id: int) -> str:
    """
    Switch to the tab with the specified page ID.

    Args:
        page_id (int): The ID of the page to switch to

    Returns:
        str: A message confirming the tab switch
    """
    await browser_context.switch_to_tab(page_id)
    page = await browser_context.get_current_page()
    await page.wait_for_load_state()
    return f"🔄 Switched to tab {page_id}"


@mcp.tool()
async def open_tab(url: str) -> str:
    """
    Open a URL in a new tab.

    Args:
        url (str): The URL to open in the new tab

    Returns:
        str: A message confirming the new tab was opened
    """
    await browser_context.create_new_tab(url)
    return f"🔗 Opened new tab with {url}"


@mcp.tool()
async def inspect_page() -> str:
    """
    Lists interactive elements and extracts content from the current page.

    Returns:
        str: A formatted string that lists all interactive elements (if any) along with the content.
    """
    # Get the current state to inspect interactive elements
    state = await browser_context.get_state()
    prompt_message = AgentMessagePrompt(
        state,
        include_attributes=["type", "role", "placeholder", "aria-label", "title"],
    ).get_user_message(use_vision=False)
    return prompt_message.content


@mcp.tool()
async def scroll_down(amount: Optional[int] = None) -> str:
    """
    Scroll down the page by the specified amount.

    Args:
        amount (int, optional): Pixels to scroll down. If None, scrolls one page.

    Returns:
        str: A message confirming the scroll action
    """
    page = await browser_context.get_current_page()
    if amount is not None:
        # Pass the amount as an argument rather than splicing it into the
        # script text, so the value is never parsed as JavaScript source.
        await page.evaluate("(dy) => window.scrollBy(0, dy)", amount)
    else:
        await page.evaluate("window.scrollBy(0, window.innerHeight);")

    amount_str = f"{amount} pixels" if amount is not None else "one page"
    return f"🔍 Scrolled down the page by {amount_str}"


@mcp.tool()
async def scroll_up(amount: Optional[int] = None) -> str:
    """
    Scroll up the page by the specified amount.

    Args:
        amount (int, optional): Pixels to scroll up. If None, scrolls one page.

    Returns:
        str: A message confirming the scroll action
    """
    page = await browser_context.get_current_page()
    if amount is not None:
        # Negate in Python and pass the result as an argument. Building
        # "-{amount}" into the script produced "--N" for a negative amount,
        # which is a JavaScript syntax error.
        await page.evaluate("(dy) => window.scrollBy(0, dy)", -amount)
    else:
        await page.evaluate("window.scrollBy(0, -window.innerHeight);")

    amount_str = f"{amount} pixels" if amount is not None else "one page"
    return f"🔍 Scrolled up the page by {amount_str}"


@mcp.tool()
async def send_keys(keys: str) -> str:
    """
    Send keyboard keys or shortcuts to the current page.

    If the whole string is rejected as an unknown key, each character is
    pressed individually instead.

    Args:
        keys (str): Keys to send, e.g. "Escape", "Enter", "Control+o"

    Returns:
        str: A message confirming the keys were sent
    """
    page = await browser_context.get_current_page()
    try:
        await page.keyboard.press(keys)
    except Exception as e:
        if "Unknown key" not in str(e):
            raise
        # Not a named key or shortcut: fall back to pressing it key by key.
        for key in keys:
            await page.keyboard.press(key)
    return f"⌨️ Sent keys: {keys}"


def _xpath_string_literal(value: str) -> str:
    """Return *value* as an XPath 1.0 string literal.

    XPath 1.0 has no escape character, so a value containing both quote kinds
    is assembled with concat() from single-quoted pieces.
    """
    if "'" not in value:
        return f"'{value}'"
    if '"' not in value:
        return f'"{value}"'
    pieces = (f"'{piece}'" for piece in value.split("'"))
    return "concat(" + ", \"'\", ".join(pieces) + ")"


@mcp.tool()
async def scroll_to_text(text: str) -> str:
    """
    Scroll to an element containing the specified text.

    Three locator strategies are tried in order; the first one that matches a
    visible element wins.

    Args:
        text (str): The text to find and scroll to.

    Returns:
        str: A message confirming the scroll action or indicating failure.
    """
    page = await browser_context.get_current_page()
    locators = [
        page.get_by_text(text, exact=False),
        page.locator(f"text={text}"),
        # Quote the text as a proper XPath literal so text containing an
        # apostrophe does not produce an invalid expression.
        page.locator(f"//*[contains(text(), {_xpath_string_literal(text)})]"),
    ]

    for locator in locators:
        try:
            if await locator.count() > 0 and await locator.first.is_visible():
                await locator.first.scroll_into_view_if_needed()
                await asyncio.sleep(0.5)
                return f"🔍 Scrolled to text: {text}"
        except Exception as locator_error:
            # Best effort: a failing strategy just means the next one is tried.
            logger.debug("scroll_to_text strategy failed: %s", locator_error)
            continue

    return f"Text '{text}' not found or not visible on page"


@mcp.tool()
async def get_dropdown_options(index: int) -> str:
    """
    Get all options from a dropdown element.

    The element's XPath is evaluated in every frame of the current page and
    the options found in each frame are collected.

    Args:
        index (int): The index of the dropdown element.

    Returns:
        str: A formatted string listing all dropdown options.

    Raises:
        Exception: If the index is not in the current selector map.
    """
    page = await browser_context.get_current_page()
    selector_map = await browser_context.get_selector_map()
    if index not in selector_map:
        raise Exception(
            f"Element with index {index} does not exist - retry or use alternative actions"
        )
    dom_element = selector_map[index]

    all_options = []
    for frame in page.frames:
        try:
            options = await frame.evaluate(
                """
                (xpath) => {
                    const select = document.evaluate(xpath, document, null,
                        XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                    if (!select) return null;
                    return {
                        options: Array.from(select.options).map(opt => ({
                            text: opt.text,
                            value: opt.value,
                            index: opt.index
                        })),
                        id: select.id,
                        name: select.name
                    };
                }
                """,
                dom_element.xpath,
            )
            if options:
                # json.dumps keeps whitespace and special characters in the
                # option text visible and unambiguous.
                all_options.extend(
                    f'{opt["index"]}: text={json.dumps(opt["text"])}'
                    for opt in options["options"]
                )
        except Exception as frame_error:
            # Best effort: a frame that cannot be evaluated is skipped.
            logger.debug("get_dropdown_options skipped a frame: %s", frame_error)

    if all_options:
        msg = "\n".join(all_options)
        msg += "\nUse the exact text string in select_dropdown_option"
        return msg
    return "No options found in any frame for dropdown"


@mcp.tool()
async def select_dropdown_option(index: int, text: str) -> str:
    """
    Select an option from a dropdown by its text.

    Args:
        index (int): The index of the dropdown element.
        text (str): The exact text of the option to select.

    Returns:
        str: A message confirming the option was selected.

    Raises:
        Exception: If the index is not in the current selector map.
    """
    page = await browser_context.get_current_page()
    selector_map = await browser_context.get_selector_map()
    if index not in selector_map:
        raise Exception(
            f"Element with index {index} does not exist - retry or use alternative actions"
        )
    dom_element = selector_map[index]

    if dom_element.tag_name != "select":
        return f"Cannot select option: Element with index {index} is a {dom_element.tag_name}, not a select"

    find_dropdown_js = """
        (xpath) => {
            try {
                const select = document.evaluate(xpath, document, null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                if (!select) return null;
                if (select.tagName.toLowerCase() !== 'select') {
                    return {
                        error: `Found element but it's a ${select.tagName}, not a SELECT`,
                        found: false
                    };
                }
                return {
                    id: select.id,
                    name: select.name,
                    found: true,
                    tagName: select.tagName,
                    optionCount: select.options.length,
                    currentValue: select.value,
                    availableOptions: Array.from(select.options).map(o => o.text.trim())
                };
            } catch (e) {
                return { error: e.toString(), found: false };
            }
        }
    """

    for frame in page.frames:
        try:
            dropdown_info = await frame.evaluate(find_dropdown_js, dom_element.xpath)
            if dropdown_info and dropdown_info.get("found"):
                selected_option_values = (
                    await frame.locator("//" + dom_element.xpath)
                    .nth(0)
                    .select_option(label=text, timeout=1000)
                )
                return f"Selected option {text} with value {selected_option_values}"
        except Exception as frame_error:
            # Best effort: move on to the next frame if this one fails.
            logger.debug("select_dropdown_option skipped a frame: %s", frame_error)

    return f"Could not select option '{text}' in any frame"


@mcp.tool()
async def validate_page(expected_text: str = "") -> str:
    """
    Validate the current page state by extracting content and optionally checking for expected text.

    The page HTML is converted to Markdown and the check is case-insensitive.

    Args:
        expected_text (str): Optional text expected to be present on the page.

    Returns:
        str: A message indicating whether the expected text was found or showing an extracted snippet.
    """
    page = await browser_context.get_current_page()
    content = markdownify.markdownify(await page.content())

    if not expected_text:
        return f"Page content extracted:\n{content[:500]}..."
    if expected_text.lower() in content.lower():
        return (
            f"✅ Validation successful: Expected text '{expected_text}' found on page."
        )
    return f"⚠ Validation warning: Expected text '{expected_text}' not found. Extracted snippet: {content[:200]}..."


@mcp.tool()
async def done(success: bool = True, text: str = "") -> dict:
    """
    Complete the task with a success flag and optional text.

    Args:
        success (bool, optional): Whether the task succeeded. Defaults to True.
        text (str, optional): Content to report with the completion.

    Returns:
        dict: A dictionary indicating completion status.
    """
    return {"is_done": True, "success": success, "extracted_content": text}


def main():
    """Run the MCP server"""
    # Exit with status 1 when the Playwright installation check fails.
    if not check_playwright_installation():
        logger.error("Playwright is not properly installed. Exiting.")
        sys.exit(1)

    logger.info("Starting MCP server for browser-use")
    mcp.run(transport="stdio")


if __name__ == "__main__":
    main()