browse_web
Automate web browsing tasks using AI to navigate websites, click buttons, fill forms, and extract information through natural language commands with real-time progress tracking.
Instructions
Browse the web to complete a task using AI-powered browser automation.
The AI agent can navigate websites, click buttons, fill forms, search for information,
and interact with web pages just like a human user. This runs synchronously and returns
when the task is complete.
Args:
task: What you want to accomplish (e.g., "Find the top 3 gaming laptops on Amazon")
url: Starting webpage (defaults to Google)
Returns:
Dictionary containing:
- ok: Boolean indicating success
- data: Task completion message with results
- screenshot_dir: Path to saved screenshots
- session_id: Unique session identifier
- progress: List of actions taken during browsing
- error: Error message (if task failed)
Examples:
- "Search for Python tutorials and summarize the top result"
- "Go to example.com and click the login button"
- "Find product reviews for iPhone 15 Pro"
Note: For long-running tasks, consider using start_web_task instead.
Input Schema
TableJSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| task | Yes | What you want to accomplish (e.g., "Find the top 3 gaming laptops on Amazon") | |
| url | No | Starting webpage | https://www.google.com |
Implementation Reference
- server.py:43-43 (registration) Registers 'browse_web' as an MCP tool using the FastMCP @mcp.tool() decorator. `@mcp.tool()`
# server.py:44-88 (handler): The main handler function for the 'browse_web' tool.
# It creates a GeminiBrowserAgent, runs the blocking task in the default executor,
# and returns the result.
async def browse_web(task: str, url: str = "https://www.google.com") -> dict[str, Any]:
    """
    Browse the web to complete a task using AI-powered browser automation.

    The AI agent can navigate websites, click buttons, fill forms, search for information,
    and interact with web pages just like a human user. This runs synchronously and returns
    when the task is complete.

    Args:
        task: What you want to accomplish (e.g., "Find the top 3 gaming laptops on Amazon")
        url: Starting webpage (defaults to Google)

    Returns:
        Dictionary containing:
        - ok: Boolean indicating success
        - data: Task completion message with results
        - screenshot_dir: Path to saved screenshots
        - session_id: Unique session identifier
        - progress: List of actions taken during browsing
        - error: Error message (if task failed)

    Examples:
        - "Search for Python tutorials and summarize the top result"
        - "Go to example.com and click the login button"
        - "Find product reviews for iPhone 15 Pro"

    Note: For long-running tasks, consider using start_web_task instead.
    """
    logger.info(f"Received web browsing request: {task}")

    # One agent per request; its browser is released explicitly in ``finally``.
    agent = GeminiBrowserAgent(logger=logger)

    try:
        # execute_task blocks, so hand it to the default executor to keep the
        # event loop responsive. get_running_loop() is the supported way to
        # obtain the loop from inside a coroutine.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, agent.execute_task, task, url)
        logger.info(f"Task completed with status: {result.get('ok')}")
        return result
    finally:
        # NOTE(review): cleanup runs on the event-loop thread while the browser
        # was driven from an executor thread - confirm cleanup_browser() is
        # safe to call across threads.
        agent.cleanup_browser()
- server.py:45-71 (schema) Docstring defining the input parameters (task, url) and output format for the browse_web tool, used by FastMCP for schema inference. """ Browse the web to complete a task using AI-powered browser automation. The AI agent can navigate websites, click buttons, fill forms, search for information, and interact with web pages just like a human user. This runs synchronously and returns when the task is complete. Args: task: What you want to accomplish (e.g., "Find the top 3 gaming laptops on Amazon") url: Starting webpage (defaults to Google) Returns: Dictionary containing: - ok: Boolean indicating success - data: Task completion message with results - screenshot_dir: Path to saved screenshots - session_id: Unique session identifier - progress: List of actions taken during browsing - error: Error message (if task failed) Examples: - "Search for Python tutorials and summarize the top result" - "Go to example.com and click the login button" - "Find product reviews for iPhone 15 Pro" Note: For long-running tasks, consider using start_web_task instead. """
# browser_agent.py:106-157 (helper): GeminiBrowserAgent.execute_task method called by
# the handler; sets up the browser, navigates to the URL, runs the automation loop,
# handles screenshots and returns results.
def execute_task(
    self, task: str, url: Optional[str] = "https://www.google.com"
) -> Dict[str, Any]:
    """
    Execute a browser automation task.

    Args:
        task: Description of the browsing task to perform
        url: Optional starting URL (defaults to Google); a falsy value also
            falls back to Google

    Returns:
        On success: ``ok`` (True), ``data``, ``screenshot_dir``, ``session_id``
        and ``progress``. On failure: ``ok`` (False), ``error``, plus
        ``session_id`` and ``progress`` so a failed run can still be traced.
        This method never raises; every exception is logged and converted
        into the failure dictionary.
    """
    default_url = "https://www.google.com"
    nav_timeout_ms = 10000

    try:
        self.logger.info(f"Task: {task}")
        self.logger.info(f"Starting URL: {url}")
        self.logger.info(f"Session ID: {self.session_id}")

        # Setup browser if not already done
        if not self.page:
            self.setup_browser()

        # Navigate to the requested page, or start from a search engine
        self.page.goto(
            url or default_url, wait_until="domcontentloaded", timeout=nav_timeout_ms
        )
        if url:
            self.logger.info(f"Navigated to: {url}")
        else:
            self.logger.info("Starting from Google")

        # Run the browser automation loop
        result = self._run_browser_automation_loop(task)

        self.logger.info(
            f"Task completed! Screenshots saved to: {self.screenshot_dir}"
        )
        return {
            "ok": True,
            "data": result,
            "screenshot_dir": str(self.screenshot_dir),
            "session_id": self.session_id,
            "progress": self.progress_updates,
        }

    except Exception as exc:
        self.logger.exception("Browser automation failed")
        # getattr: the failure may have happened before these attributes were
        # set, and the error path itself must not raise.
        return {
            "ok": False,
            "error": str(exc),
            "session_id": getattr(self, "session_id", None),
            "progress": getattr(self, "progress_updates", []),
        }
# browser_agent.py:158-278 (helper): Core automation loop in GeminiBrowserAgent;
# configures the Gemini Computer Use tool, runs iterative agent turns with
# screenshots and function calls until task completion.
def _run_browser_automation_loop(self, task: str, max_turns: int = 30) -> str:
    """
    Run the Gemini Computer Use agent loop to complete the task.

    Each turn sends the conversation so far to Gemini. If the reply contains
    function calls they are executed against the page and their responses are
    appended to the conversation; otherwise the reply's text is returned as
    the final answer.

    Args:
        task: The browsing task to complete
        max_turns: Maximum number of agent turns

    Returns:
        The final result as a string, or a "maximum turns" notice if the model
        was still issuing actions after ``max_turns`` turns.

    Raises:
        RuntimeError: If Gemini returns no candidates or a candidate without
            content.
        Exception: Any other error raised during a turn is logged and re-raised.
    """

    def claim_screenshot_path(label: str = ""):
        # Builds the next numbered screenshot path and advances the counter.
        stamp = datetime.now().strftime("%H%M%S")
        infix = f"{label}_" if label else ""
        path = (
            self.screenshot_dir
            / f"step_{self.screenshot_counter:02d}_{infix}{stamp}.png"
        )
        self.screenshot_counter += 1
        return path

    # Configure Gemini with Computer Use
    config = types.GenerateContentConfig(
        tools=[
            types.Tool(
                computer_use=types.ComputerUse(
                    environment=types.Environment.ENVIRONMENT_BROWSER
                )
            )
        ],
    )

    # Initial screenshot - captured once as bytes so the same image is both
    # saved to disk and sent to the model.
    initial_screenshot = self.page.screenshot(type="png")
    screenshot_path = claim_screenshot_path("initial")
    with open(screenshot_path, "wb") as f:
        f.write(initial_screenshot)
    self.logger.info(f"Saved initial screenshot: {screenshot_path}")

    # Build initial contents
    contents = [
        Content(
            role="user",
            parts=[
                Part(text=task),
                Part.from_bytes(data=initial_screenshot, mime_type="image/png"),
            ],
        )
    ]

    self.logger.info(f"Starting browser automation loop for task: {task}")
    self._add_progress("Started browser automation", "info")

    # Agent loop
    for turn in range(max_turns):
        self.logger.info(f"Turn {turn + 1}/{max_turns}")
        self._add_progress(f"Turn {turn + 1}/{max_turns}", "turn")

        try:
            # Get response from Gemini
            response = self.gemini_client.models.generate_content(
                model=GEMINI_MODEL,
                contents=contents,
                config=config,
            )

            # Fail with a clear message instead of an IndexError/TypeError
            # when the model returns nothing usable.
            if not response.candidates:
                raise RuntimeError(
                    "Gemini returned no candidates (the request may have been blocked)"
                )
            candidate = response.candidates[0]
            if candidate.content is None:
                raise RuntimeError("Gemini returned a candidate with no content")
            contents.append(candidate.content)
            parts = candidate.content.parts or []

            # Check if there are function calls
            has_function_calls = any(part.function_call for part in parts)

            if not has_function_calls:
                # No more actions - extract final text response
                text_response = " ".join(part.text for part in parts if part.text)
                self.logger.info(f"Agent finished: {text_response}")

                # Save final screenshot
                screenshot_path = claim_screenshot_path("final")
                self.page.screenshot(path=str(screenshot_path))
                self.logger.info(f"Saved final screenshot: {screenshot_path}")

                return text_response

            # Execute function calls
            self.logger.info("Executing browser actions...")
            self._add_progress("Executing browser actions", "action")
            results = self._execute_gemini_function_calls(candidate)

            # Get function responses with new screenshot
            function_responses = self._get_gemini_function_responses(results)

            # Save screenshot after actions
            screenshot_path = claim_screenshot_path()
            self.page.screenshot(path=str(screenshot_path))
            self.logger.info(f"Saved screenshot: {screenshot_path}")

            # Add function responses to contents
            contents.append(
                Content(
                    role="user",
                    parts=[Part(function_response=fr) for fr in function_responses],
                )
            )

        except Exception as e:
            self.logger.error(f"Error in browser automation loop: {e}")
            raise

    # If we hit max turns, return what we have
    return f"Task reached maximum turns ({max_turns}). Please check browser state."
- browser_agent.py:279-391 (helper)Executes specific browser actions (navigate, click, type, scroll, etc.) dispatched from Gemini's function calls using Playwright.def _execute_gemini_function_calls(self, candidate) -> list: """Execute Gemini Computer Use function calls using Playwright.""" results = [] function_calls = [ part.function_call for part in candidate.content.parts if part.function_call ] for function_call in function_calls: fname = function_call.name args = function_call.args self.logger.info(f"Executing Gemini action: {fname}") self._add_progress(f"Action: {fname}", "function_call") action_result = {} try: if fname == "open_web_browser": pass # Already open elif fname == "wait_5_seconds": time.sleep(5) elif fname == "go_back": self.page.go_back() elif fname == "go_forward": self.page.go_forward() elif fname == "search": self.page.goto("https://www.google.com") elif fname == "navigate": self.page.goto(args["url"], wait_until="domcontentloaded", timeout=10000) elif fname == "click_at": actual_x = self._denormalize_x(args["x"]) actual_y = self._denormalize_y(args["y"]) self.page.mouse.click(actual_x, actual_y) elif fname == "hover_at": actual_x = self._denormalize_x(args["x"]) actual_y = self._denormalize_y(args["y"]) self.page.mouse.move(actual_x, actual_y) elif fname == "type_text_at": actual_x = self._denormalize_x(args["x"]) actual_y = self._denormalize_y(args["y"]) text = args["text"] press_enter = args.get("press_enter", True) clear_before = args.get("clear_before_typing", True) self.page.mouse.click(actual_x, actual_y) if clear_before: self.page.keyboard.press("Meta+A") self.page.keyboard.press("Backspace") self.page.keyboard.type(text) if press_enter: self.page.keyboard.press("Enter") elif fname == "key_combination": keys = args["keys"] self.page.keyboard.press(keys) elif fname == "scroll_document": direction = args["direction"] if direction == "down": self.page.keyboard.press("PageDown") elif direction == "up": self.page.keyboard.press("PageUp") 
elif direction == "left": self.page.keyboard.press("ArrowLeft") elif direction == "right": self.page.keyboard.press("ArrowRight") elif fname == "scroll_at": actual_x = self._denormalize_x(args["x"]) actual_y = self._denormalize_y(args["y"]) direction = args["direction"] magnitude = args.get("magnitude", 800) # Scroll by moving to position and using wheel self.page.mouse.move(actual_x, actual_y) scroll_amount = int(magnitude * SCREEN_HEIGHT / 1000) if direction == "down": self.page.mouse.wheel(0, scroll_amount) elif direction == "up": self.page.mouse.wheel(0, -scroll_amount) elif direction == "left": self.page.mouse.wheel(-scroll_amount, 0) elif direction == "right": self.page.mouse.wheel(scroll_amount, 0) elif fname == "drag_and_drop": x = self._denormalize_x(args["x"]) y = self._denormalize_y(args["y"]) dest_x = self._denormalize_x(args["destination_x"]) dest_y = self._denormalize_y(args["destination_y"]) self.page.mouse.move(x, y) self.page.mouse.down() self.page.mouse.move(dest_x, dest_y) self.page.mouse.up() else: self.logger.warning(f"Unimplemented action: {fname}") # Quick stability check - only wait if navigation occurred if fname in ["navigate", "go_back", "go_forward", "search"]: self.page.wait_for_load_state("domcontentloaded", timeout=3000) else: time.sleep(0.3) # Brief pause for UI updates except Exception as e: self.logger.error(f"Error executing {fname}: {e}") action_result = {"error": str(e)} # Get safety decision from the function call if present safety_decision = None if hasattr(function_call, 'safety_decision'): safety_decision = function_call.safety_decision self.logger.info(f"Safety decision present for {fname}: {safety_decision}") results.append((fname, action_result, safety_decision)) return results