# fetch
Load and render web pages with JavaScript, bypassing anti-bot measures. Extract clean text, HTML, or markdown automatically.
## Instructions
Fetch and render a web page using a real Chrome browser. Handles JavaScript-heavy sites, anti-bot protection, and dynamic content. Auto-detects when content has loaded by monitoring DOM changes and network activity.
## Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | The URL to fetch (must be a valid HTTP/HTTPS URL) | |
| format | No | Output format: 'html' for raw HTML, 'text' for cleaned text content, 'markdown' for structured markdown | text |
| wait_for | No | CSS selector to wait for before extracting content. Usually not needed - the tool auto-detects content stabilization. Use this only when auto-detection fails and you know the specific element to wait for. Examples: '[class*="product"]' for e-commerce, '.job-card' for job boards, '[data-testid="results"]' for search results. | |
| timeout | No | Timeout in milliseconds (max: 120000). Increase to 90000+ for slow-loading e-commerce or search result pages. | 60000 |
| human_mode | No | Enable human-mode scrolling and delays for more natural browsing behavior | true |
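
For orientation, here is a minimal usage sketch of the exported `fetch()` wrapper documented below, passing the parameters from this table. Only the function signature is taken from src/tools/fetch.ts; the import path and call site are illustrative assumptions:

```typescript
// Hypothetical call site; the relative import path is an assumption.
import { fetch } from "./src/tools/fetch";

const result = await fetch("https://example.com/jobs", {
  format: "markdown",    // "html" | "text" | "markdown" (default "text")
  wait_for: ".job-card", // only needed when auto-detection fails
  timeout: 90000,        // ms; raise toward the 120000 max for slow pages
  human_mode: true,      // natural scrolling and delays (the default)
});
console.log(result);
```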
## Implementation Reference
- src/tools/fetch.ts:474-490 (handler) - Main export function for the 'fetch' tool, a convenience wrapper that calls fetchPage with inline options.
```typescript
export async function fetch(
  url: string,
  options: {
    format?: ContentFormat;
    wait_for?: string;
    timeout?: number;
    human_mode?: boolean;
  } = {}
): Promise<FetchResponse> {
  return fetchPage({
    url,
    format: options.format ?? "text",
    wait_for: options.wait_for,
    timeout: options.timeout ?? config.timeouts.navigation,
    human_mode: options.human_mode,
  });
}
```
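
A caller-side sketch of consuming the response follows. The exact `FetchResponse` type is not excerpted; the fields below assume it mirrors the success/error shape the Python fetcher returns (`success`, `content` or `error`, `url`, `title`, `status`):

```typescript
// Assumed response shape, mirroring the Python fetcher's JSON output.
const res = await fetch("https://example.com");
if (res.success) {
  console.log(res.title, res.content.slice(0, 200));
} else {
  console.error(`fetch failed: ${res.error.code}: ${res.error.message}`);
}
```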
- src/tools/fetch.ts:405-468 (handler) - Core implementation of the fetch tool; orchestrates rate limiting, Python fetcher calls with retry, and response creation.

```typescript
export async function fetchPage(options: FetchOptions): Promise<FetchResponse> {
  const startTime = Date.now();
  const { url, format, wait_for, timeout, human_mode } = options;
  const domain = extractDomain(url);

  logger.info("fetch_start", { url, format, domain });

  try {
    // Step 2: Acquire rate limit token for domain
    logger.info("step_rate_limit", { domain, elapsed: Date.now() - startTime });
    await rateLimiter.acquire(domain);
    logger.info("step_rate_limit_done", { domain, elapsed: Date.now() - startTime });

    // Step 3: Call Python fetcher with retry logic
    logger.info("step_python_fetch", { elapsed: Date.now() - startTime });
    const rawContent = await fetchWithRetry(url, format as ContentFormat, timeout, wait_for, human_mode);
    logger.info("step_python_fetch_done", { elapsed: Date.now() - startTime });

    // Check for errors with no content
    if (rawContent.error && rawContent.html === "") {
      logger.error("fetch_failed", {
        url,
        event: rawContent.error,
        status: rawContent.status,
      });
      const errorCode = getErrorCode(rawContent.error);
      return createErrorResponse(
        rawContent.url,
        errorCode,
        rawContent.error,
        rawContent.status || undefined
      );
    }

    // Step 4: Python already handled content extraction, just return the result
    const duration = Date.now() - startTime;
    logger.info("fetch_complete", {
      url,
      duration_ms: duration,
      content_length: rawContent.html.length,
      status: rawContent.status,
    });

    // Step 5: Return FetchResult
    return createSuccessResponse(
      rawContent.html, // Python already extracted content in the requested format
      rawContent.url,
      rawContent.title,
      rawContent.status
    );
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    logger.error("fetch_error", {
      url,
      event: errorMessage,
    });
    const errorCode = getErrorCode(errorMessage);
    return createErrorResponse(url, errorCode, errorMessage);
  }
}
```
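
`fetchWithRetry` is referenced above but not excerpted. A minimal sketch of what such a helper could look like, with a generic attempt callback and exponential backoff; the name, signature, and retry policy here are illustrative assumptions, not the project's actual code:

```typescript
// Illustrative retry helper; the real fetchWithRetry in src/tools/fetch.ts
// is not shown in this reference, so names and policy are hypothetical.
type RawContent = { html: string; url: string; title: string; status: number | null; error?: string };

async function fetchWithRetrySketch(
  attemptFetch: () => Promise<RawContent>, // e.g. one spawn of python/fetcher.py
  maxAttempts = 3
): Promise<RawContent> {
  let lastError: unknown;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await attemptFetch();
    } catch (error) {
      lastError = error;
      if (attempt < maxAttempts) {
        // Exponential backoff between attempts: 1s, 2s, 4s, ...
        await new Promise((resolve) => setTimeout(resolve, 1000 * 2 ** (attempt - 1)));
      }
    }
  }
  throw lastError;
}
```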
- src/index.ts:92-107 (registration) - Creates the tool handlers object that maps the 'fetch' tool name to the fetchPage implementation, registered in the MCP server.

```typescript
function createToolHandlers(): ToolHandlers {
  return {
    fetch: async (options) => {
      log("debug", "Fetch handler called", { url: options.url });
      return fetchPage(options);
    },
    fetchBatch: async (options) => {
      log("debug", "Fetch batch handler called", { urlCount: options.urls.length });
      return fetchBatch(options);
    },
  };
}
```
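
How these handlers are attached to the server is not excerpted. A plausible wiring sketch using the standard `@modelcontextprotocol/sdk` request handlers and the `FETCH_TOOL` definition shown next; the server metadata and dispatch details are assumptions, not the project's actual src/server.ts:

```typescript
// Plausible MCP wiring sketch; metadata and dispatch are assumptions.
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";

const server = new Server(
  { name: "turbowebfetch", version: "0.0.0" }, // hypothetical metadata
  { capabilities: { tools: {} } }
);
const handlers = createToolHandlers();

server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: [FETCH_TOOL],
}));

server.setRequestHandler(CallToolRequestSchema, async (request) => {
  if (request.params.name === "fetch") {
    const result = await handlers.fetch(request.params.arguments as any);
    return { content: [{ type: "text", text: JSON.stringify(result) }] };
  }
  throw new Error(`Unknown tool: ${request.params.name}`);
});
```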
- src/server.ts:32-69 (schema) - MCP tool definition (name, description, inputSchema) for the 'fetch' tool, used when listing available tools.

```typescript
const FETCH_TOOL: Tool = {
  name: "fetch",
  description:
    "Fetch and render a web page using a real Chrome browser. Handles JavaScript-heavy sites, anti-bot protection, and dynamic content. Auto-detects when content has loaded by monitoring DOM changes and network activity.",
  inputSchema: {
    type: "object",
    properties: {
      url: {
        type: "string",
        description: "The URL to fetch (must be a valid HTTP/HTTPS URL)",
      },
      format: {
        type: "string",
        enum: ["html", "text", "markdown"],
        default: "text",
        description:
          "Output format: 'html' for raw HTML, 'text' for cleaned text content, 'markdown' for structured markdown",
      },
      wait_for: {
        type: "string",
        description:
          "CSS selector to wait for before extracting content. Usually not needed - the tool auto-detects content stabilization. Use this only when auto-detection fails and you know the specific element to wait for. Examples: '[class*=\"product\"]' for e-commerce, '.job-card' for job boards, '[data-testid=\"results\"]' for search results.",
      },
      timeout: {
        type: "number",
        default: 60000,
        description:
          "Timeout in milliseconds (default: 60000, max: 120000). Increase to 90000+ for slow-loading e-commerce or search result pages.",
      },
      human_mode: {
        type: "boolean",
        description:
          "Enable human-mode scrolling and delays for more natural browsing behavior (default: true)",
      },
    },
    required: ["url"],
  },
};
```
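
Note that the schema documents a 120000 ms ceiling for `timeout` but does not encode a `maximum` constraint; enforcement, if any, happens elsewhere. A small illustrative clamp of the kind a caller might apply (not shown in the excerpts):

```typescript
// Illustrative normalization; the excerpts do not show where (or whether)
// the documented 120000 ms maximum is actually enforced.
const DEFAULT_TIMEOUT_MS = 60000;
const MAX_TIMEOUT_MS = 120000;

function normalizeTimeout(timeout?: number): number {
  if (timeout === undefined || !Number.isFinite(timeout)) return DEFAULT_TIMEOUT_MS;
  return Math.min(Math.max(Math.trunc(timeout), 1), MAX_TIMEOUT_MS);
}
```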
- python/fetcher.py:1067-1565 (handler) - Python implementation of the actual page fetching using Nodriver (Chrome automation); handles browser launch, anti-bot bypass, and content extraction.

```python
async def fetch_page(
    url: str,
    format: str = "text",
    timeout: int = 30000,
    wait_for: Optional[str] = None,
    headless: bool = True,
    human_mode: bool = True,
) -> Dict[str, Any]:
    """
    Fetch a web page using Nodriver.

    Args:
        url: URL to fetch
        format: Output format - "text", "markdown", or "html"
        timeout: Timeout in milliseconds
        wait_for: Optional CSS selector to wait for
        headless: Run browser in headless mode
        human_mode: Enable human-like behavior (delays, mouse movements, scrolling)

    Returns:
        Dict with success, content, url, title, status
    """
    browser = None
    chrome_pid = None  # For macOS background mode cleanup (actual Chrome PID)
    debug_port = None  # For macOS background mode
    user_data_dir = None  # Temp directory for browser profile (cleaned up in finally)
    start_time = time.time()

    try:
        # Validate URL
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            raise FetchError("INVALID_URL", f"Invalid URL format: {url}")

        log_info("fetch_start", url=url, format=format, headless=headless, human_mode=human_mode)

        # Get Chrome path
        chrome_path = get_chrome_path()
        if chrome_path:
            log_info("chrome_found", path=chrome_path)
        else:
            log_info("chrome_not_found", message="Using nodriver auto-detection")

        # Launch browser
        # sandbox=False required on macOS, otherwise Chrome fails to start
        # Use unique user_data_dir to avoid conflicts with parallel browser instances
        # (each Chrome instance needs its own profile directory)
        import tempfile
        user_data_dir = tempfile.mkdtemp(prefix='turbowebfetch_')  # Cleaned up in finally
        log_info("browser_user_data_dir", user_data_dir=user_data_dir, headless=headless)

        # Build browser args
        browser_args = []
        if not headless:
            browser_args.append('--window-position=-2400,-2400')
            log_info("headed_offscreen_mode", window_position="-2400,-2400")

        browser = await asyncio.wait_for(
            uc.start(
                headless=headless,
                browser_executable_path=chrome_path,
                sandbox=False,
                browser_args=browser_args,
                user_data_dir=user_data_dir,
            ),
            timeout=NAVIGATE_TIMEOUT
        )

        page = await safe_navigate(browser, url)

        # Initialize human behavior wrapper (after browser starts, we can get viewport)
        human: Optional[HumanBehavior] = None
        if human_mode:
            try:
                # Nodriver returns lists from evaluate, so get width/height separately
                viewport_width = await safe_evaluate(page, "window.innerWidth", timeout=5, default=1920) or 1920
                viewport_height = await safe_evaluate(page, "window.innerHeight", timeout=5, default=1080) or 1080
                human = HumanBehavior(
                    enabled=True,
                    viewport_width=int(viewport_width),
                    viewport_height=int(viewport_height)
                )
                log_info("human_mode_enabled", viewport_width=viewport_width,
                         viewport_height=viewport_height, modules_available=HUMAN_MODULES_AVAILABLE)
            except Exception as e:
                log_info("human_mode_init_failed", error=str(e))
                human = HumanBehavior(enabled=False)

        # Detect and wait for Cloudflare JS challenge to auto-pass
        is_cloudflare = await detect_cloudflare(page)
        cf_retry_needed = False
        if is_cloudflare:
            log_info("cloudflare_detected", url=url)
            # Wait for Cloudflare JS challenge to complete (up to 10 seconds)
            max_cf_wait = 10
            cf_check_interval = 2
            cf_waited = 0
            while cf_waited < max_cf_wait:
                await asyncio.sleep(cf_check_interval)
                cf_waited += cf_check_interval
                # Check if still on Cloudflare challenge
                still_cloudflare = await detect_cloudflare(page)
                if not still_cloudflare:
                    log_info("cloudflare_passed", waited_seconds=cf_waited)
                    break
                log_info("cloudflare_waiting", waited_seconds=cf_waited, max_wait=max_cf_wait)

            # If still on Cloudflare after waiting, need headed retry with cf_verify
            still_cf = await detect_cloudflare(page)
            log_info("cloudflare_check_after_wait", cf_waited=cf_waited, max_cf_wait=max_cf_wait,
                     still_cloudflare=still_cf, headless=headless)
            if cf_waited >= max_cf_wait and still_cf:
                if headless:
                    cf_retry_needed = True
                    log_info("cloudflare_retry_needed", reason="JS challenge didn't pass, will retry headed with cf_verify")
                else:
                    log_info("cloudflare_already_headed", reason="Already in headed mode, cannot retry")

        # Retry with headed mode + cf_verify() if needed
        if cf_retry_needed:
            log_info("cloudflare_headed_retry_start", url=url)
            # Close headless browser
            try:
                browser.stop()
            except Exception:
                pass
            browser = None

            # Relaunch in headed mode (background on macOS, off-screen on others)
            browser, page, chrome_pid, debug_port = await start_headed_browser(
                chrome_path=chrome_path,
                url=url,
            )

            # Wait for page to load
            await asyncio.sleep(2)

            # Check if still Cloudflare (it should be)
            if await detect_cloudflare(page):
                log_info("cloudflare_cf_verify_attempt", url=url)
                try:
                    # Use nodriver's built-in Cloudflare bypass (clicks the checkbox)
                    # Add timeout to prevent hanging on cf_verify
                    await asyncio.wait_for(page.verify_cf(), timeout=30)
                    log_info("cloudflare_cf_verify_success", url=url)
                    # Wait for redirect after verification
                    await asyncio.sleep(3)
                    # Verify we passed
                    if await detect_cloudflare(page):
                        log_error("cloudflare_cf_verify_failed", message="Still on challenge after cf_verify")
                        # Return error - don't continue extracting challenge page content
                        raise FetchError("BLOCKED", "Cloudflare challenge not bypassed after cf_verify")
                    else:
                        log_info("cloudflare_bypassed", url=url)
                except asyncio.TimeoutError:
                    log_error("cloudflare_cf_verify_timeout", url=url)
                    raise FetchError("TIMEOUT", "Cloudflare verification timed out after 30s")
                except FetchError:
                    raise  # Re-raise our own errors
                except Exception as cf_err:
                    log_error("cloudflare_cf_verify_error", error=str(cf_err))
                    raise FetchError("BLOCKED", f"Cloudflare bypass failed: {cf_err}")

            # Re-initialize human behavior for new browser
            if human_mode:
                try:
                    viewport_width = await safe_evaluate(page, "window.innerWidth", timeout=5, default=1920) or 1920
                    viewport_height = await safe_evaluate(page, "window.innerHeight", timeout=5, default=1080) or 1080
                    human = HumanBehavior(
                        enabled=True,
                        viewport_width=int(viewport_width),
                        viewport_height=int(viewport_height)
                    )
                except Exception:
                    human = HumanBehavior(enabled=False)

        # Detect DataDome anti-bot challenge (only if not already retried for Cloudflare)
        # DataDome is used by sites like Indeed and blocks headless browsers
        if not cf_retry_needed:
            is_datadome = await detect_datadome(page)
            datadome_retry_needed = False
            if is_datadome:
                log_info("datadome_detected", url=url)
                if headless:
                    datadome_retry_needed = True
                    log_info("datadome_retry_needed", reason="DataDome blocks headless, will retry in headed mode")
                else:
                    # Already in headed mode, DataDome might still block but we can't do more
                    log_info("datadome_already_headed", reason="Already in headed mode, cannot retry")

            # Retry with headed mode for DataDome (no cf_verify needed, just human behavior)
            if datadome_retry_needed:
                log_info("datadome_headed_retry_start", url=url)
                # Close headless browser
                try:
                    browser.stop()
                except Exception:
                    pass
                browser = None

                # Relaunch in headed mode (background on macOS, off-screen on others)
                browser, page, chrome_pid, debug_port = await start_headed_browser(
                    chrome_path=chrome_path,
                    url=url,
                )

                # Wait for page to load with human-like delay
                await asyncio.sleep(3)

                # Re-initialize human behavior for new browser
                if human_mode:
                    try:
                        viewport_width = await safe_evaluate(page, "window.innerWidth", timeout=5, default=1920) or 1920
                        viewport_height = await safe_evaluate(page, "window.innerHeight", timeout=5, default=1080) or 1080
                        human = HumanBehavior(
                            enabled=True,
                            viewport_width=int(viewport_width),
                            viewport_height=int(viewport_height)
                        )
                    except Exception:
                        human = HumanBehavior(enabled=False)

                # Check if DataDome is still blocking
                still_datadome = await detect_datadome(page)
                if still_datadome:
                    log_error("datadome_headed_retry_failed", message="Still blocked after headed retry")
                    raise FetchError("BLOCKED", "DataDome challenge not bypassed in headed mode")
                else:
                    log_info("datadome_bypassed", url=url)

        # Wait for specific selector if requested, otherwise auto-stabilize
        if wait_for:
            try:
                await page.find(wait_for, timeout=timeout / 1000)
                log_info("selector_found", selector=wait_for)
            except Exception as e:
                log_error("selector_wait_timeout", selector=wait_for, error=str(e))
        else:
            # Fixed wait for JS-heavy pages to load content
            # Auto-stabilization was unreliable due to intermittent evaluate() failures
            # A simple fixed wait is more reliable for modern JS frameworks
            fixed_wait = 5.0  # 5 seconds handles most JS rendering
            log_info("fixed_wait_start", seconds=fixed_wait)
            await asyncio.sleep(fixed_wait)

        # Add reading delay after navigation (human takes time to see page)
        if human:
            reading_time = human.get_reading_delay()
            log_info("reading_delay", seconds=round(reading_time, 2))
            await asyncio.sleep(reading_time)

        # Add thinking delay before taking actions
        if human:
            think_time = human.get_thinking_delay(complexity="simple")
            await asyncio.sleep(think_time)

        # Dismiss overlays (with human behavior)
        await dismiss_overlays(page, human=human)

        # Lazy load content (with human behavior)
        await lazy_load_content(page, human=human)

        # Get final URL
        final_url = page.url

        # Get page title (with timeout)
        title = await safe_evaluate(page, "document.title", timeout=5, default="") or ""

        # Get page HTML (with timeout to prevent hangs on never-ending pages)
        html = await safe_get_content(page, timeout=CONTENT_TIMEOUT)
        if not html:
            log_error("content_extraction_failed", error="get_content returned empty")
            html = await safe_evaluate(page, "document.documentElement.outerHTML", timeout=10, default="") or ""

        # Get innerText as fallback for JS-heavy pages where Readability fails
        inner_text_raw = await safe_evaluate(page, "document.body.innerText", timeout=10, default=None)
        # Validate innerText is actually a string (nodriver can return error objects)
        inner_text = inner_text_raw if isinstance(inner_text_raw, str) else None

        # Log extraction inputs for debugging
        log_info("content_extraction_inputs",
                 html_len=len(html) if html else 0,
                 title=title[:50] if title else "None",
                 innertext_len=len(inner_text) if inner_text else 0,
                 innertext_type=type(inner_text_raw).__name__)

        # Extract content based on format
        if format == "html":
            content = html
        elif format == "markdown":
            content = extract_markdown_content(html, title, inner_text)
        else:  # text
            content = extract_text_content(html, title, inner_text)

        duration_ms = int((time.time() - start_time) * 1000)
        log_info("fetch_success", url=url, final_url=final_url, duration_ms=duration_ms)

        return {
            "success": True,
            "content": content,
            "url": final_url,
            "title": title,
            "status": 200,  # Nodriver doesn't expose HTTP status easily
        }

    except FetchError as e:
        duration_ms = int((time.time() - start_time) * 1000)
        log_error("fetch_failed", url=url, code=e.code, message=e.message, duration_ms=duration_ms)
        return {
            "success": False,
            "error": {"code": e.code, "message": e.message},
            "url": url,
        }
    except asyncio.TimeoutError:
        duration_ms = int((time.time() - start_time) * 1000)
        log_error("fetch_timeout", url=url, duration_ms=duration_ms)
        return {
            "success": False,
            "error": {"code": "TIMEOUT", "message": f"Fetch timeout after {timeout}ms"},
            "url": url,
        }
    except Exception as e:
        duration_ms = int((time.time() - start_time) * 1000)
        log_error("fetch_error", url=url, error=str(e), duration_ms=duration_ms)
        return {
            "success": False,
            "error": {"code": "UNKNOWN_ERROR", "message": str(e)},
            "url": url,
        }
    finally:
        # Clean up browser
        if browser:
            try:
                browser.stop()  # Note: stop() is not async
            except Exception as e:
                log_error("browser_cleanup_failed", error=str(e))

        # browser.stop() only kills the main Chrome process, not helper processes
        # Kill ALL remaining Chrome processes with our user-data-dir (renderer, GPU, etc.)
        if user_data_dir:
            try:
                subprocess.run(
                    ['pkill', '-KILL', '-f', user_data_dir],
                    timeout=3,
                    check=False,  # Don't raise if no processes found
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )
                log_info("chrome_helpers_cleanup", user_data_dir=user_data_dir)
            except subprocess.TimeoutExpired:
                log_error("chrome_helpers_cleanup_timeout", user_data_dir=user_data_dir)
            except Exception as e:
                log_error("chrome_helpers_cleanup_failed", user_data_dir=user_data_dir, error=str(e))

        # Clean up Chrome process if launched in background mode (macOS)
        # Must kill all Chrome processes with our user-data-dir (main + helpers)
        if chrome_pid and debug_port:
            import signal
            log_info("chrome_cleanup_starting", pid=chrome_pid, port=debug_port)
            # Use pkill with SIGKILL to immediately kill ALL Chrome processes with our port
            # This is more reliable than SIGTERM which Chrome may ignore
            # Note: macOS pkill uses "-KILL" not "-9"
            try:
                subprocess.run(
                    ['pkill', '-KILL', '-f', f'remote-debugging-port={debug_port}'],
                    timeout=3
                )
                log_info("chrome_background_cleanup", pid=chrome_pid, port=debug_port)
            except subprocess.TimeoutExpired:
                log_error("chrome_pkill_timeout", port=debug_port)
            except Exception as e:
                # Fallback: try direct kill on main process
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except Exception:
                    pass
                log_error("chrome_background_cleanup_failed", pid=chrome_pid, error=str(e))

        # Clean up temp user data directory
        if user_data_dir:
            try:
                import shutil
                shutil.rmtree(user_data_dir, ignore_errors=True)
                log_info("user_data_dir_cleanup", path=user_data_dir)
            except Exception as e:
                log_error("user_data_dir_cleanup_failed", path=user_data_dir, error=str(e))


async def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Fetch web pages using Nodriver")
    parser.add_argument("--url", required=True, help="URL to fetch")
    parser.add_argument("--format", choices=["html", "text", "markdown"], default="text", help="Output format")
    parser.add_argument("--timeout", type=int, default=30000, help="Timeout in milliseconds")
    parser.add_argument("--wait-for", help="CSS selector to wait for")
    parser.add_argument("--headless", type=str, default="true", help="Run headless (true/false)")
    parser.add_argument("--human-mode", type=str, default="true", help="Enable human-like behavior (true/false)")
    args = parser.parse_args()

    # Convert string args to bool
    headless = args.headless.lower() in ("true", "1", "yes")
    human_mode = args.human_mode.lower() in ("true", "1", "yes")

    try:
        # Run fetch with timeout
        result = await asyncio.wait_for(
            fetch_page(
                url=args.url,
                format=args.format,
                timeout=args.timeout,
                wait_for=args.wait_for,
                headless=headless,
                human_mode=human_mode,
            ),
            timeout=args.timeout / 1000 + 10  # Add 10s buffer for human delays
        )
        output_result(result)
    except asyncio.TimeoutError:
        output_result({
            "success": False,
            "error": {"code": "TIMEOUT", "message": f"Overall timeout after {args.timeout}ms"},
            "url": args.url,
        })
    except Exception as e:
        output_result({
            "success": False,
            "error": {"code": "FATAL_ERROR", "message": str(e)},
            "url": args.url,
        })


if __name__ == "__main__":
    import signal

    def _sigterm_handler(signum, frame):
        """Handle SIGTERM by outputting JSON before exiting, so Node.js never sees empty stdout."""
        output_result({
            "success": False,
            "error": {"code": "KILLED", "message": "Process terminated by SIGTERM"},
            "url": "",
        })
        sys.exit(1)

    signal.signal(signal.SIGTERM, _sigterm_handler)

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        log_info("interrupted")
        output_result({
            "success": False,
            "error": {"code": "INTERRUPTED", "message": "Process interrupted"},
            "url": "",
        })
        sys.exit(1)
    except Exception as e:
        log_error("fatal_error", error=str(e))
        output_result({
            "success": False,
            "error": {"code": "FATAL_ERROR", "message": str(e)},
            "url": "",
        })
        sys.exit(1)
```
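
On the Node side, the fetcher runs as a subprocess and its single JSON object is read from stdout; the SIGTERM handler above exists precisely so stdout is never empty even when the process is killed. Below is a minimal sketch of that bridge, using the flag spellings from the argparse definition; the helper name, interpreter choice, and error handling are assumptions, and the project's real spawn code is not excerpted:

```typescript
// Minimal sketch of invoking python/fetcher.py and parsing its JSON stdout.
// Helper name, interpreter choice, and error handling are illustrative.
import { spawn } from "node:child_process";

function runPythonFetcher(url: string, format = "text", timeoutMs = 60000): Promise<unknown> {
  return new Promise((resolve, reject) => {
    const child = spawn("python3", [
      "python/fetcher.py",
      "--url", url,
      "--format", format,
      "--timeout", String(timeoutMs),
    ]);
    let stdout = "";
    child.stdout.on("data", (chunk) => (stdout += chunk));
    child.on("error", reject);
    child.on("close", () => {
      try {
        resolve(JSON.parse(stdout)); // fetcher always emits exactly one JSON result
      } catch {
        reject(new Error(`fetcher produced non-JSON output: ${stdout.slice(0, 200)}`));
      }
    });
  });
}
```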