# fetch
Load and render web pages with JavaScript, bypassing anti-bot measures. Extract clean text, HTML, or markdown automatically.
## Instructions
Fetch and render a web page using a real Chrome browser. Handles JavaScript-heavy sites, anti-bot protection, and dynamic content. Auto-detects when content has loaded by monitoring DOM changes and network activity.
## Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | The URL to fetch (must be a valid HTTP/HTTPS URL) | |
| format | No | Output format: 'html' for raw HTML, 'text' for cleaned text content, 'markdown' for structured markdown | text |
| wait_for | No | CSS selector to wait for before extracting content. Usually not needed - the tool auto-detects content stabilization. Use this only when auto-detection fails and you know the specific element to wait for. Examples: '[class*="product"]' for e-commerce, '.job-card' for job boards, '[data-testid="results"]' for search results. | |
| timeout | No | Timeout in milliseconds (max: 120000). Increase to 90000+ for slow-loading e-commerce or search result pages. | 60000 |
| human_mode | No | Enable human-mode scrolling and delays for more natural browsing behavior | true |
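
For orientation, here is a minimal usage sketch of the exported `fetch()` wrapper documented below, passing the parameters from this table. Only the function signature is taken from src/tools/fetch.ts; the import path and call site are illustrative assumptions:

```typescript
// Hypothetical call site; the relative import path is an assumption.
import { fetch } from "./src/tools/fetch";

const result = await fetch("https://example.com/jobs", {
  format: "markdown",    // "html" | "text" | "markdown" (default "text")
  wait_for: ".job-card", // only needed when auto-detection fails
  timeout: 90000,        // ms; raise toward the 120000 max for slow pages
  human_mode: true,      // natural scrolling and delays (the default)
});
console.log(result);
```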
## Implementation Reference
- src/tools/fetch.ts:474-490 (handler) - Main export function for the 'fetch' tool, a convenience wrapper that calls fetchPage with inline options.
```typescript
export async function fetch(
  url: string,
  options: {
    format?: ContentFormat;
    wait_for?: string;
    timeout?: number;
    human_mode?: boolean;
  } = {}
): Promise<FetchResponse> {
  return fetchPage({
    url,
    format: options.format ?? "text",
    wait_for: options.wait_for,
    timeout: options.timeout ?? config.timeouts.navigation,
    human_mode: options.human_mode,
  });
}
```
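
A caller-side sketch of consuming the response follows. The exact `FetchResponse` type is not excerpted; the fields below assume it mirrors the success/error shape the Python fetcher returns (`success`, `content` or `error`, `url`, `title`, `status`):

```typescript
// Assumed response shape, mirroring the Python fetcher's JSON output.
const res = await fetch("https://example.com");
if (res.success) {
  console.log(res.title, res.content.slice(0, 200));
} else {
  console.error(`fetch failed: ${res.error.code}: ${res.error.message}`);
}
```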
- src/tools/fetch.ts:405-468 (handler) - Core implementation of the fetch tool; orchestrates rate limiting, Python fetcher calls with retry, and response creation.

```typescript
export async function fetchPage(options: FetchOptions): Promise<FetchResponse> {
  const startTime = Date.now();
  const { url, format, wait_for, timeout, human_mode } = options;
  const domain = extractDomain(url);

  logger.info("fetch_start", { url, format, domain });

  try {
    // Step 2: Acquire rate limit token for domain
    logger.info("step_rate_limit", { domain, elapsed: Date.now() - startTime });
    await rateLimiter.acquire(domain);
    logger.info("step_rate_limit_done", { domain, elapsed: Date.now() - startTime });

    // Step 3: Call Python fetcher with retry logic
    logger.info("step_python_fetch", { elapsed: Date.now() - startTime });
    const rawContent = await fetchWithRetry(url, format as ContentFormat, timeout, wait_for, human_mode);
    logger.info("step_python_fetch_done", { elapsed: Date.now() - startTime });

    // Check for errors with no content
    if (rawContent.error && rawContent.html === "") {
      logger.error("fetch_failed", {
        url,
        event: rawContent.error,
        status: rawContent.status,
      });
      const errorCode = getErrorCode(rawContent.error);
      return createErrorResponse(
        rawContent.url,
        errorCode,
        rawContent.error,
        rawContent.status || undefined
      );
    }

    // Step 4: Python already handled content extraction, just return the result
    const duration = Date.now() - startTime;
    logger.info("fetch_complete", {
      url,
      duration_ms: duration,
      content_length: rawContent.html.length,
      status: rawContent.status,
    });

    // Step 5: Return FetchResult
    return createSuccessResponse(
      rawContent.html, // Python already extracted content in the requested format
      rawContent.url,
      rawContent.title,
      rawContent.status
    );
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    logger.error("fetch_error", {
      url,
      event: errorMessage,
    });
    const errorCode = getErrorCode(errorMessage);
    return createErrorResponse(url, errorCode, errorMessage);
  }
}
```
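
`fetchWithRetry` is referenced above but not excerpted. A minimal sketch of what such a helper could look like, with a generic attempt callback and exponential backoff; the name, signature, and retry policy here are illustrative assumptions, not the project's actual code:

```typescript
// Illustrative retry helper; the real fetchWithRetry in src/tools/fetch.ts
// is not shown in this reference, so names and policy are hypothetical.
type RawContent = { html: string; url: string; title: string; status: number | null; error?: string };

async function fetchWithRetrySketch(
  attemptFetch: () => Promise<RawContent>, // e.g. one spawn of python/fetcher.py
  maxAttempts = 3
): Promise<RawContent> {
  let lastError: unknown;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await attemptFetch();
    } catch (error) {
      lastError = error;
      if (attempt < maxAttempts) {
        // Exponential backoff between attempts: 1s, 2s, 4s, ...
        await new Promise((resolve) => setTimeout(resolve, 1000 * 2 ** (attempt - 1)));
      }
    }
  }
  throw lastError;
}
```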
- src/index.ts:92-107 (registration) - Creates the tool handlers object that maps the 'fetch' tool name to the fetchPage implementation, registered in the MCP server.

```typescript
function createToolHandlers(): ToolHandlers {
  return {
    fetch: async (options) => {
      log("debug", "Fetch handler called", { url: options.url });
      return fetchPage(options);
    },
    fetchBatch: async (options) => {
      log("debug", "Fetch batch handler called", { urlCount: options.urls.length });
      return fetchBatch(options);
    },
  };
}
```
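
How these handlers are attached to the server is not excerpted. A plausible wiring sketch using the standard `@modelcontextprotocol/sdk` request handlers and the `FETCH_TOOL` definition shown next; the server metadata and dispatch details are assumptions, not the project's actual src/server.ts:

```typescript
// Plausible MCP wiring sketch; metadata and dispatch are assumptions.
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";

const server = new Server(
  { name: "turbowebfetch", version: "0.0.0" }, // hypothetical metadata
  { capabilities: { tools: {} } }
);
const handlers = createToolHandlers();

server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: [FETCH_TOOL],
}));

server.setRequestHandler(CallToolRequestSchema, async (request) => {
  if (request.params.name === "fetch") {
    const result = await handlers.fetch(request.params.arguments as any);
    return { content: [{ type: "text", text: JSON.stringify(result) }] };
  }
  throw new Error(`Unknown tool: ${request.params.name}`);
});
```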
- src/server.ts:32-69 (schema) - MCP tool definition (name, description, inputSchema) for the 'fetch' tool, used when listing available tools.

```typescript
const FETCH_TOOL: Tool = {
  name: "fetch",
  description:
    "Fetch and render a web page using a real Chrome browser. Handles JavaScript-heavy sites, anti-bot protection, and dynamic content. Auto-detects when content has loaded by monitoring DOM changes and network activity.",
  inputSchema: {
    type: "object",
    properties: {
      url: {
        type: "string",
        description: "The URL to fetch (must be a valid HTTP/HTTPS URL)",
      },
      format: {
        type: "string",
        enum: ["html", "text", "markdown"],
        default: "text",
        description:
          "Output format: 'html' for raw HTML, 'text' for cleaned text content, 'markdown' for structured markdown",
      },
      wait_for: {
        type: "string",
        description:
          "CSS selector to wait for before extracting content. Usually not needed - the tool auto-detects content stabilization. Use this only when auto-detection fails and you know the specific element to wait for. Examples: '[class*=\"product\"]' for e-commerce, '.job-card' for job boards, '[data-testid=\"results\"]' for search results.",
      },
      timeout: {
        type: "number",
        default: 60000,
        description:
          "Timeout in milliseconds (default: 60000, max: 120000). Increase to 90000+ for slow-loading e-commerce or search result pages.",
      },
      human_mode: {
        type: "boolean",
        description:
          "Enable human-mode scrolling and delays for more natural browsing behavior (default: true)",
      },
    },
    required: ["url"],
  },
};
```
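
Note that the schema documents a 120000 ms ceiling for `timeout` but does not encode a `maximum` constraint; enforcement, if any, happens elsewhere. A small illustrative clamp of the kind a caller might apply (not shown in the excerpts):

```typescript
// Illustrative normalization; the excerpts do not show where (or whether)
// the documented 120000 ms maximum is actually enforced.
const DEFAULT_TIMEOUT_MS = 60000;
const MAX_TIMEOUT_MS = 120000;

function normalizeTimeout(timeout?: number): number {
  if (timeout === undefined || !Number.isFinite(timeout)) return DEFAULT_TIMEOUT_MS;
  return Math.min(Math.max(Math.trunc(timeout), 1), MAX_TIMEOUT_MS);
}
```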
- python/fetcher.py:1067-1565 (handler) - Python implementation of the actual page fetching using Nodriver (Chrome automation); handles browser launch, anti-bot bypass, and content extraction.

```python
async def fetch_page(
    url: str,
    format: str = "text",
    timeout: int = 30000,
    wait_for: Optional[str] = None,
    headless: bool = True,
    human_mode: bool = True,
) -> Dict[str, Any]:
    """
    Fetch a web page using Nodriver.

    Args:
        url: URL to fetch
        format: Output format - "text", "markdown", or "html"
        timeout: Timeout in milliseconds
        wait_for: Optional CSS selector to wait for
        headless: Run browser in headless mode
        human_mode: Enable human-like behavior (delays, mouse movements, scrolling)

    Returns:
        Dict with success, content, url, title, status
    """
    browser = None
    chrome_pid = None  # For macOS background mode cleanup (actual Chrome PID)
    debug_port = None  # For macOS background mode
    user_data_dir = None  # Temp directory for browser profile (cleaned up in finally)
    start_time = time.time()

    try:
        # Validate URL
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            raise FetchError("INVALID_URL", f"Invalid URL format: {url}")

        log_info("fetch_start", url=url, format=format, headless=headless, human_mode=human_mode)

        # Get Chrome path
        chrome_path = get_chrome_path()
        if chrome_path:
            log_info("chrome_found", path=chrome_path)
        else:
            log_info("chrome_not_found", message="Using nodriver auto-detection")

        # Launch browser
        # sandbox=False required on macOS, otherwise Chrome fails to start
        # Use unique user_data_dir to avoid conflicts with parallel browser instances
        # (each Chrome instance needs its own profile directory)
        import tempfile
        user_data_dir = tempfile.mkdtemp(prefix='turbowebfetch_')  # Cleaned up in finally
        log_info("browser_user_data_dir", user_data_dir=user_data_dir, headless=headless)

        # Build browser args
        browser_args = []
        if not headless:
            browser_args.append('--window-position=-2400,-2400')
            log_info("headed_offscreen_mode", window_position="-2400,-2400")

        browser = await asyncio.wait_for(
            uc.start(
                headless=headless,
                browser_executable_path=chrome_path,
                sandbox=False,
                browser_args=browser_args,
                user_data_dir=user_data_dir,
            ),
            timeout=NAVIGATE_TIMEOUT
        )

        page = await safe_navigate(browser, url)

        # Initialize human behavior wrapper (after browser starts, we can get viewport)
        human: Optional[HumanBehavior] = None
        if human_mode:
            try:
                # Nodriver returns lists from evaluate, so get width/height separately
                viewport_width = await safe_evaluate(page, "window.innerWidth", timeout=5, default=1920) or 1920
                viewport_height = await safe_evaluate(page, "window.innerHeight", timeout=5, default=1080) or 1080
                human = HumanBehavior(
                    enabled=True,
                    viewport_width=int(viewport_width),
                    viewport_height=int(viewport_height)
                )
                log_info("human_mode_enabled", viewport_width=viewport_width,
                         viewport_height=viewport_height, modules_available=HUMAN_MODULES_AVAILABLE)
            except Exception as e:
                log_info("human_mode_init_failed", error=str(e))
                human = HumanBehavior(enabled=False)

        # Detect and wait for Cloudflare JS challenge to auto-pass
        is_cloudflare = await detect_cloudflare(page)
        cf_retry_needed = False
        if is_cloudflare:
            log_info("cloudflare_detected", url=url)
            # Wait for Cloudflare JS challenge to complete (up to 10 seconds)
            max_cf_wait = 10
            cf_check_interval = 2
            cf_waited = 0
            while cf_waited < max_cf_wait:
                await asyncio.sleep(cf_check_interval)
                cf_waited += cf_check_interval
                # Check if still on Cloudflare challenge
                still_cloudflare = await detect_cloudflare(page)
                if not still_cloudflare:
                    log_info("cloudflare_passed", waited_seconds=cf_waited)
                    break
                log_info("cloudflare_waiting", waited_seconds=cf_waited, max_wait=max_cf_wait)

            # If still on Cloudflare after waiting, need headed retry with cf_verify
            still_cf = await detect_cloudflare(page)
            log_info("cloudflare_check_after_wait", cf_waited=cf_waited, max_cf_wait=max_cf_wait,
                     still_cloudflare=still_cf, headless=headless)
            if cf_waited >= max_cf_wait and still_cf:
                if headless:
                    cf_retry_needed = True
                    log_info("cloudflare_retry_needed", reason="JS challenge didn't pass, will retry headed with cf_verify")
                else:
                    log_info("cloudflare_already_headed", reason="Already in headed mode, cannot retry")

        # Retry with headed mode + cf_verify() if needed
        if cf_retry_needed:
            log_info("cloudflare_headed_retry_start", url=url)
            # Close headless browser
            try:
                browser.stop()
            except Exception:
                pass
            browser = None

            # Relaunch in headed mode (background on macOS, off-screen on others)
            browser, page, chrome_pid, debug_port = await start_headed_browser(
                chrome_path=chrome_path,
                url=url,
            )

            # Wait for page to load
            await asyncio.sleep(2)

            # Check if still Cloudflare (it should be)
            if await detect_cloudflare(page):
                log_info("cloudflare_cf_verify_attempt", url=url)
                try:
                    # Use nodriver's built-in Cloudflare bypass (clicks the checkbox)
                    # Add timeout to prevent hanging on cf_verify
                    await asyncio.wait_for(page.verify_cf(), timeout=30)
                    log_info("cloudflare_cf_verify_success", url=url)
                    # Wait for redirect after verification
                    await asyncio.sleep(3)
                    # Verify we passed
                    if await detect_cloudflare(page):
                        log_error("cloudflare_cf_verify_failed", message="Still on challenge after cf_verify")
                        # Return error - don't continue extracting challenge page content
                        raise FetchError("BLOCKED", "Cloudflare challenge not bypassed after cf_verify")
                    else:
                        log_info("cloudflare_bypassed", url=url)
                except asyncio.TimeoutError:
                    log_error("cloudflare_cf_verify_timeout", url=url)
                    raise FetchError("TIMEOUT", "Cloudflare verification timed out after 30s")
                except FetchError:
                    raise  # Re-raise our own errors
                except Exception as cf_err:
                    log_error("cloudflare_cf_verify_error", error=str(cf_err))
                    raise FetchError("BLOCKED", f"Cloudflare bypass failed: {cf_err}")

            # Re-initialize human behavior for new browser
            if human_mode:
                try:
                    viewport_width = await safe_evaluate(page, "window.innerWidth", timeout=5, default=1920) or 1920
                    viewport_height = await safe_evaluate(page, "window.innerHeight", timeout=5, default=1080) or 1080
                    human = HumanBehavior(
                        enabled=True,
                        viewport_width=int(viewport_width),
                        viewport_height=int(viewport_height)
                    )
                except Exception:
                    human = HumanBehavior(enabled=False)

        # Detect DataDome anti-bot challenge (only if not already retried for Cloudflare)
        # DataDome is used by sites like Indeed and blocks headless browsers
        if not cf_retry_needed:
            is_datadome = await detect_datadome(page)
            datadome_retry_needed = False
            if is_datadome:
                log_info("datadome_detected", url=url)
                if headless:
                    datadome_retry_needed = True
                    log_info("datadome_retry_needed", reason="DataDome blocks headless, will retry in headed mode")
                else:
                    # Already in headed mode, DataDome might still block but we can't do more
                    log_info("datadome_already_headed", reason="Already in headed mode, cannot retry")

            # Retry with headed mode for DataDome (no cf_verify needed, just human behavior)
            if datadome_retry_needed:
                log_info("datadome_headed_retry_start", url=url)
                # Close headless browser
                try:
                    browser.stop()
                except Exception:
                    pass
                browser = None

                # Relaunch in headed mode (background on macOS, off-screen on others)
                browser, page, chrome_pid, debug_port = await start_headed_browser(
                    chrome_path=chrome_path,
                    url=url,
                )

                # Wait for page to load with human-like delay
                await asyncio.sleep(3)

                # Re-initialize human behavior for new browser
                if human_mode:
                    try:
                        viewport_width = await safe_evaluate(page, "window.innerWidth", timeout=5, default=1920) or 1920
                        viewport_height = await safe_evaluate(page, "window.innerHeight", timeout=5, default=1080) or 1080
                        human = HumanBehavior(
                            enabled=True,
                            viewport_width=int(viewport_width),
                            viewport_height=int(viewport_height)
                        )
                    except Exception:
                        human = HumanBehavior(enabled=False)

                # Check if DataDome is still blocking
                still_datadome = await detect_datadome(page)
                if still_datadome:
                    log_error("datadome_headed_retry_failed", message="Still blocked after headed retry")
                    raise FetchError("BLOCKED", "DataDome challenge not bypassed in headed mode")
                else:
                    log_info("datadome_bypassed", url=url)

        # Wait for specific selector if requested, otherwise auto-stabilize
        if wait_for:
            try:
                await page.find(wait_for, timeout=timeout / 1000)
                log_info("selector_found", selector=wait_for)
            except Exception as e:
                log_error("selector_wait_timeout", selector=wait_for, error=str(e))
        else:
            # Fixed wait for JS-heavy pages to load content
            # Auto-stabilization was unreliable due to intermittent evaluate() failures
            # A simple fixed wait is more reliable for modern JS frameworks
            fixed_wait = 5.0  # 5 seconds handles most JS rendering
            log_info("fixed_wait_start", seconds=fixed_wait)
            await asyncio.sleep(fixed_wait)

        # Add reading delay after navigation (human takes time to see page)
        if human:
            reading_time = human.get_reading_delay()
            log_info("reading_delay", seconds=round(reading_time, 2))
            await asyncio.sleep(reading_time)

        # Add thinking delay before taking actions
        if human:
            think_time = human.get_thinking_delay(complexity="simple")
            await asyncio.sleep(think_time)

        # Dismiss overlays (with human behavior)
        await dismiss_overlays(page, human=human)

        # Lazy load content (with human behavior)
        await lazy_load_content(page, human=human)

        # Get final URL
        final_url = page.url

        # Get page title (with timeout)
        title = await safe_evaluate(page, "document.title", timeout=5, default="") or ""

        # Get page HTML (with timeout to prevent hangs on never-ending pages)
        html = await safe_get_content(page, timeout=CONTENT_TIMEOUT)
        if not html:
            log_error("content_extraction_failed", error="get_content returned empty")
            html = await safe_evaluate(page, "document.documentElement.outerHTML", timeout=10, default="") or ""

        # Get innerText as fallback for JS-heavy pages where Readability fails
        inner_text_raw = await safe_evaluate(page, "document.body.innerText", timeout=10, default=None)
        # Validate innerText is actually a string (nodriver can return error objects)
        inner_text = inner_text_raw if isinstance(inner_text_raw, str) else None

        # Log extraction inputs for debugging
        log_info("content_extraction_inputs",
                 html_len=len(html) if html else 0,
                 title=title[:50] if title else "None",
                 innertext_len=len(inner_text) if inner_text else 0,
                 innertext_type=type(inner_text_raw).__name__)

        # Extract content based on format
        if format == "html":
            content = html
        elif format == "markdown":
            content = extract_markdown_content(html, title, inner_text)
        else:  # text
            content = extract_text_content(html, title, inner_text)

        duration_ms = int((time.time() - start_time) * 1000)
        log_info("fetch_success", url=url, final_url=final_url, duration_ms=duration_ms)

        return {
            "success": True,
            "content": content,
            "url": final_url,
            "title": title,
            "status": 200,  # Nodriver doesn't expose HTTP status easily
        }

    except FetchError as e:
        duration_ms = int((time.time() - start_time) * 1000)
        log_error("fetch_failed", url=url, code=e.code, message=e.message, duration_ms=duration_ms)
        return {
            "success": False,
            "error": {"code": e.code, "message": e.message},
            "url": url,
        }
    except asyncio.TimeoutError:
        duration_ms = int((time.time() - start_time) * 1000)
        log_error("fetch_timeout", url=url, duration_ms=duration_ms)
        return {
            "success": False,
            "error": {"code": "TIMEOUT", "message": f"Fetch timeout after {timeout}ms"},
            "url": url,
        }
    except Exception as e:
        duration_ms = int((time.time() - start_time) * 1000)
        log_error("fetch_error", url=url, error=str(e), duration_ms=duration_ms)
        return {
            "success": False,
            "error": {"code": "UNKNOWN_ERROR", "message": str(e)},
            "url": url,
        }
    finally:
        # Clean up browser
        if browser:
            try:
                browser.stop()  # Note: stop() is not async
            except Exception as e:
                log_error("browser_cleanup_failed", error=str(e))

        # browser.stop() only kills the main Chrome process, not helper processes
        # Kill ALL remaining Chrome processes with our user-data-dir (renderer, GPU, etc.)
        if user_data_dir:
            try:
                subprocess.run(
                    ['pkill', '-KILL', '-f', user_data_dir],
                    timeout=3,
                    check=False,  # Don't raise if no processes found
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )
                log_info("chrome_helpers_cleanup", user_data_dir=user_data_dir)
            except subprocess.TimeoutExpired:
                log_error("chrome_helpers_cleanup_timeout", user_data_dir=user_data_dir)
            except Exception as e:
                log_error("chrome_helpers_cleanup_failed", user_data_dir=user_data_dir, error=str(e))

        # Clean up Chrome process if launched in background mode (macOS)
        # Must kill all Chrome processes with our user-data-dir (main + helpers)
        if chrome_pid and debug_port:
            import signal
            log_info("chrome_cleanup_starting", pid=chrome_pid, port=debug_port)
            # Use pkill with SIGKILL to immediately kill ALL Chrome processes with our port
            # This is more reliable than SIGTERM which Chrome may ignore
            # Note: macOS pkill uses "-KILL" not "-9"
            try:
                subprocess.run(
                    ['pkill', '-KILL', '-f', f'remote-debugging-port={debug_port}'],
                    timeout=3
                )
                log_info("chrome_background_cleanup", pid=chrome_pid, port=debug_port)
            except subprocess.TimeoutExpired:
                log_error("chrome_pkill_timeout", port=debug_port)
            except Exception as e:
                # Fallback: try direct kill on main process
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except Exception:
                    pass
                log_error("chrome_background_cleanup_failed", pid=chrome_pid, error=str(e))

        # Clean up temp user data directory
        if user_data_dir:
            try:
                import shutil
                shutil.rmtree(user_data_dir, ignore_errors=True)
                log_info("user_data_dir_cleanup", path=user_data_dir)
            except Exception as e:
                log_error("user_data_dir_cleanup_failed", path=user_data_dir, error=str(e))


async def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Fetch web pages using Nodriver")
    parser.add_argument("--url", required=True, help="URL to fetch")
    parser.add_argument("--format", choices=["html", "text", "markdown"], default="text", help="Output format")
    parser.add_argument("--timeout", type=int, default=30000, help="Timeout in milliseconds")
    parser.add_argument("--wait-for", help="CSS selector to wait for")
    parser.add_argument("--headless", type=str, default="true", help="Run headless (true/false)")
    parser.add_argument("--human-mode", type=str, default="true", help="Enable human-like behavior (true/false)")
    args = parser.parse_args()

    # Convert string args to bool
    headless = args.headless.lower() in ("true", "1", "yes")
    human_mode = args.human_mode.lower() in ("true", "1", "yes")

    try:
        # Run fetch with timeout
        result = await asyncio.wait_for(
            fetch_page(
                url=args.url,
                format=args.format,
                timeout=args.timeout,
                wait_for=args.wait_for,
                headless=headless,
                human_mode=human_mode,
            ),
            timeout=args.timeout / 1000 + 10  # Add 10s buffer for human delays
        )
        output_result(result)
    except asyncio.TimeoutError:
        output_result({
            "success": False,
            "error": {"code": "TIMEOUT", "message": f"Overall timeout after {args.timeout}ms"},
            "url": args.url,
        })
    except Exception as e:
        output_result({
            "success": False,
            "error": {"code": "FATAL_ERROR", "message": str(e)},
            "url": args.url,
        })


if __name__ == "__main__":
    import signal

    def _sigterm_handler(signum, frame):
        """Handle SIGTERM by outputting JSON before exiting, so Node.js never sees empty stdout."""
        output_result({
            "success": False,
            "error": {"code": "KILLED", "message": "Process terminated by SIGTERM"},
            "url": "",
        })
        sys.exit(1)

    signal.signal(signal.SIGTERM, _sigterm_handler)

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        log_info("interrupted")
        output_result({
            "success": False,
            "error": {"code": "INTERRUPTED", "message": "Process interrupted"},
            "url": "",
        })
        sys.exit(1)
    except Exception as e:
        log_error("fatal_error", error=str(e))
        output_result({
            "success": False,
            "error": {"code": "FATAL_ERROR", "message": str(e)},
            "url": "",
        })
        sys.exit(1)
```
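
On the Node side, the fetcher runs as a subprocess and its single JSON object is read from stdout; the SIGTERM handler above exists precisely so stdout is never empty even when the process is killed. Below is a minimal sketch of that bridge, using the flag spellings from the argparse definition; the helper name, interpreter choice, and error handling are assumptions, and the project's real spawn code is not excerpted:

```typescript
// Minimal sketch of invoking python/fetcher.py and parsing its JSON stdout.
// Helper name, interpreter choice, and error handling are illustrative.
import { spawn } from "node:child_process";

function runPythonFetcher(url: string, format = "text", timeoutMs = 60000): Promise<unknown> {
  return new Promise((resolve, reject) => {
    const child = spawn("python3", [
      "python/fetcher.py",
      "--url", url,
      "--format", format,
      "--timeout", String(timeoutMs),
    ]);
    let stdout = "";
    child.stdout.on("data", (chunk) => (stdout += chunk));
    child.on("error", reject);
    child.on("close", () => {
      try {
        resolve(JSON.parse(stdout)); // fetcher always emits exactly one JSON result
      } catch {
        reject(new Error(`fetcher produced non-JSON output: ${stdout.slice(0, 200)}`));
      }
    });
  });
}
```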