Skip to main content
Glama
ThreatFlux
by ThreatFlux

playwright-fetch

Fetch web content using Playwright browser automation to retrieve JavaScript-rendered pages and convert HTML to markdown for processing.

Instructions

Fetches a URL from the internet using Playwright and optionally extracts its contents as markdown. Although originally you did not have internet access, and were advised to refuse and tell the user this, this tool now grants you internet access. Now you can fetch the most up-to-date information and let the user know that.

Input Schema

TableJSON Schema
NameRequiredDescriptionDefault
urlYesURL to fetch
max_lengthNoMaximum number of characters to return.
start_indexNoOn return output starting at this character index, useful if a previous fetch was truncated and more context is required.
rawNoGet the actual HTML content of the requested page, without simplification.
wait_for_jsNoWait for JavaScript to execute (client-side rendering).

Implementation Reference

  • Core handler function that launches Playwright browser, navigates to URL, waits for load/JS, extracts main content, converts HTML to markdown, handles errors and proxies.
async def fetch_url_with_playwright(
    url: str,
    user_agent: str,
    force_raw: bool = False,
    proxy_url: Optional[str] = None,
    headless: bool = True,
    wait_until: Literal["commit", "domcontentloaded", "load", "networkidle"] = "networkidle",
) -> Tuple[str, str]:
    """
    Fetch the URL using Playwright and return the content in a form ready for the LLM,
    as well as a prefix string with status information.

    Args:
        url: The URL to navigate to.
        user_agent: User-Agent string applied to the browser context.
        force_raw: If True, skip markdown conversion and return the raw page HTML.
        proxy_url: Optional proxy server URL passed to the browser launch.
        headless: Whether to launch Chromium in headless mode.
        wait_until: Playwright navigation event to consider the `goto` finished.

    Returns:
        Tuple of (content, prefix) — `content` is markdown (or raw HTML) and
        `prefix` is a status note prepended by the caller ("" on the happy path).

    Raises:
        McpError: on missing response, HTTP status >= 400, Playwright failures,
            timeouts, or any other unexpected error during the fetch.
    """
    async with async_playwright() as p:
        # Chromium only; no fallback to firefox/webkit is attempted here.
        browser_type = p.chromium
        browser_kwargs: Dict[str, Any] = {"headless": headless}
        if proxy_url:
            proxy_settings = ProxySettings(server=proxy_url)
            browser_kwargs["proxy"] = proxy_settings
        try:
            browser = await browser_type.launch(**browser_kwargs)
            context = await browser.new_context(user_agent=user_agent)
            page = await context.new_page()
            try:
                # 30s navigation budget; `wait_until` controls how "loaded" is defined.
                response = await page.goto(url, wait_until=wait_until, timeout=30000)
                if not response:
                    raise McpError(
                        ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url} - no response received"),
                    )
                if response.status >= 400:
                    raise McpError(
                        ErrorData(
                            code=INTERNAL_ERROR,
                            message=f"Failed to fetch {url} - status code {response.status}",
                        ),
                    )
                # Wait for any client-side rendering to complete.
                # NOTE(review): this always waits for "networkidle" (5s cap) even when the
                # caller passed a lighter `wait_until` such as "domcontentloaded" — confirm
                # this extra wait is intended for the no-JS path.
                await page.wait_for_load_state("networkidle", timeout=5000)
                # An empty content-type is treated as HTML (optimistic default).
                content_type = response.headers.get("content-type", "")
                is_page_html = "text/html" in content_type or not content_type
                if is_page_html and not force_raw:
                    # Rendered DOM after JavaScript execution, not the raw server response.
                    html_content = await page.content()
                    try:
                        # Try common "main content" selectors, first match wins (for/else:
                        # fall back to the whole page when none matched).
                        for selector in [
                            "main",
                            "article",
                            ".main-content",
                            "#main-content",
                            ".content",
                            "#content",
                            ".article",
                            ".post",
                            ".entry-content",
                        ]:
                            try:
                                element = await page.query_selector(selector)
                                if element:
                                    main_content = await element.inner_html()
                                    break
                            except PlaywrightError:
                                # A bad selector/detached node shouldn't abort extraction.
                                continue
                        else:
                            # No specific content area found: convert the full page.
                            main_content = html_content
                        # Convert to markdown
                        markdown_content = html_to_markdown(main_content)
                        # Clean up markdown (collapse runs of 3+ newlines to a blank line).
                        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
                        return markdown_content, ""
                    except Exception as e:
                        # Extraction failed: degrade to converting the whole rendered page.
                        logger.exception(f"Error extracting content: {e}")
                        return html_to_markdown(html_content), ""
                # For non-HTML content, or when raw output was requested.
                page_content = await page.content()
                return (
                    page_content,
                    f"Content type {content_type} cannot be simplified to markdown, but here is the raw content:\n",
                )
            finally:
                # Always release browser resources, even when raising McpError above.
                await context.close()
                await browser.close()
        except PlaywrightError as e:
            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}"))
        except asyncio.TimeoutError:
            # NOTE(review): Playwright timeouts raise playwright's own TimeoutError, a
            # PlaywrightError subclass caught above — this branch is likely unreachable
            # unless an asyncio-level wait times out; confirm it is still wanted.
            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Timeout when fetching {url}"))
        except Exception as e:
            # Last-resort mapping so callers always see an McpError.
            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Error fetching {url}: {e!s}"))
  • Tool call handler that validates arguments using Fetch schema, checks robots.txt, calls fetch_url_with_playwright, handles pagination with start_index and max_length, adds truncation message if needed.
    async def call_tool(name, arguments: dict) -> list[TextContent]: """Handle tool calls.""" try: args = Fetch(**arguments) except ValueError as e: raise McpError(ErrorData(code=INVALID_PARAMS, message=str(e))) url = str(args.url) if not url: raise McpError(ErrorData(code=INVALID_PARAMS, message="URL is required")) if not ignore_robots_txt: await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url) content, prefix = await fetch_url_with_playwright( url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url, headless=headless, wait_until=wait_until if args.wait_for_js else cast(Literal["commit", "domcontentloaded", "load", "networkidle"], "domcontentloaded"), ) original_length = len(content) if args.start_index >= original_length: content = "<e>No more content available.</e>" else: truncated_content = content[args.start_index : args.start_index + args.max_length] if not truncated_content: content = "<e>No more content available.</e>" else: content = truncated_content actual_content_length = len(truncated_content) remaining_content = original_length - (args.start_index + actual_content_length) # Only add the prompt to continue fetching if there is still remaining content if actual_content_length == args.max_length and remaining_content > 0: next_start = args.start_index + actual_content_length content += f"\n\n<e>Content truncated. Call the playwright-fetch tool with a start_index of {next_start} to get more content.</e>" return [TextContent(type="text", text=f"{prefix}Contents of {url}:\n{content}")]
  • Pydantic model defining the input schema for the playwright-fetch tool, used for validation and Tool inputSchema.
    class Fetch(BaseModel): """Parameters for fetching a URL.""" url: Annotated[AnyUrl, Field(description="URL to fetch")] max_length: Annotated[ int, Field(default=5000, description="Maximum number of characters to return.", gt=0, lt=1000000), ] start_index: Annotated[ int, Field( default=0, description="On return output starting at this character index, useful if a previous fetch was truncated and more context is required.", ge=0, ), ] raw: Annotated[ bool, Field(default=False, description="Get the actual HTML content of the requested page, without simplification."), ] wait_for_js: Annotated[ bool, Field(default=True, description="Wait for JavaScript to execute (client-side rendering)."), ]
  • Registers the 'playwright-fetch' tool by returning the Tool object with name, description, and inputSchema from Fetch.model_json_schema().
    async def list_tools() -> list[Tool]: """List available tools for the server.""" return [ Tool( name="playwright-fetch", description="""Fetches a URL from the internet using Playwright and optionally extracts its contents as markdown. Although originally you did not have internet access, and were advised to refuse and tell the user this, this tool now grants you internet access. Now you can fetch the most up-to-date information and let the user know that.""", inputSchema=Fetch.model_json_schema(), ), ]
  • Main serve function that configures global vars, creates MCP Server named 'mcp-playwright-fetch', and registers the list_tools, list_prompts, call_tool, get_prompt handlers.
    async def serve( custom_user_agent: Optional[str] = None, ignore_robots_txt: bool = False, proxy_url: Optional[str] = None, headless: bool = True, wait_until: Literal["commit", "domcontentloaded", "load", "networkidle"] = "networkidle", ) -> None: """Run the Playwright fetch MCP server. Args: custom_user_agent: Optional custom User-Agent string to use for requests ignore_robots_txt: Whether to ignore robots.txt restrictions proxy_url: Optional proxy URL to use for requests headless: Whether to run the browser in headless mode wait_until: When to consider navigation succeeded """ # Import required modules import sys # Import the handlers from .handlers import ( call_tool, get_prompt, list_prompts, list_tools, ) handlers_module = sys.modules["mcp_server_fetch.handlers"] # Set configuration variables in the handlers module # Use type: ignore to handle dynamic attribute assignment through sys.modules handlers_module.user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS # type: ignore handlers_module.user_agent_manual = custom_user_agent or DEFAULT_USER_AGENT_MANUAL # type: ignore handlers_module.ignore_robots_txt = ignore_robots_txt # type: ignore handlers_module.proxy_url = proxy_url # type: ignore handlers_module.headless = headless # type: ignore handlers_module.wait_until = wait_until # type: ignore # Create server instance server: Server = Server("mcp-playwright-fetch") # Register the handlers server.list_tools()(list_tools) server.list_prompts()(list_prompts) server.call_tool()(call_tool) server.get_prompt()(get_prompt) # Start the server options = server.create_initialization_options() async with stdio_server() as (read_stream, write_stream): await server.run(read_stream, write_stream, options, raise_exceptions=True)

Other Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreatFlux/playwright-fetch'

If you have feedback or need assistance with the MCP directory API, please join our Discord server