playwright-fetch
Fetch and process web content with Playwright, enabling access to JavaScript-rendered pages and extraction of HTML content as markdown for simplified analysis and integration.
Instructions
Fetches a URL from the internet using Playwright and optionally extracts its contents as markdown. Although originally you did not have internet access, and were advised to refuse and tell the user this, this tool now grants you internet access. Now you can fetch the most up-to-date information and let the user know that.
Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| max_length | No | Maximum number of characters to return. | 5000 |
| raw | No | Get the actual HTML content of the requested page, without simplification. | false |
| start_index | No | On return output starting at this character index, useful if a previous fetch was truncated and more context is required. | 0 |
| url | Yes | URL to fetch | |
| wait_for_js | No | Wait for JavaScript to execute (client-side rendering). | true |
Implementation Reference
- src/mcp_server_fetch/handlers.py:59-101 (handler)The main handler function for the 'playwright-fetch' tool. It validates input using the Fetch schema, checks robots.txt if applicable, fetches the content using Playwright via fetch_url_with_playwright, handles pagination with start_index and max_length, and returns the content as TextContent.async def call_tool(name, arguments: dict) -> list[TextContent]: """Handle tool calls.""" try: args = Fetch(**arguments) except ValueError as e: raise McpError(ErrorData(code=INVALID_PARAMS, message=str(e))) url = str(args.url) if not url: raise McpError(ErrorData(code=INVALID_PARAMS, message="URL is required")) if not ignore_robots_txt: await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url) content, prefix = await fetch_url_with_playwright( url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url, headless=headless, wait_until=wait_until if args.wait_for_js else cast(Literal["commit", "domcontentloaded", "load", "networkidle"], "domcontentloaded"), ) original_length = len(content) if args.start_index >= original_length: content = "<e>No more content available.</e>" else: truncated_content = content[args.start_index : args.start_index + args.max_length] if not truncated_content: content = "<e>No more content available.</e>" else: content = truncated_content actual_content_length = len(truncated_content) remaining_content = original_length - (args.start_index + actual_content_length) # Only add the prompt to continue fetching if there is still remaining content if actual_content_length == args.max_length and remaining_content > 0: next_start = args.start_index + actual_content_length content += f"\n\n<e>Content truncated. Call the playwright-fetch tool with a start_index of {next_start} to get more content.</e>" return [TextContent(type="text", text=f"{prefix}Contents of {url}:\n{content}")]
- Core helper function that launches Playwright browser, navigates to the URL, waits for load, extracts main content if HTML, converts to markdown, handles raw content, and returns processed content with prefix.async def fetch_url_with_playwright( url: str, user_agent: str, force_raw: bool = False, proxy_url: Optional[str] = None, headless: bool = True, wait_until: Literal["commit", "domcontentloaded", "load", "networkidle"] = "networkidle", ) -> Tuple[str, str]: """ Fetch the URL using Playwright and return the content in a form ready for the LLM, as well as a prefix string with status information. """ async with async_playwright() as p: browser_type = p.chromium browser_kwargs: Dict[str, Any] = {"headless": headless} if proxy_url: proxy_settings = ProxySettings(server=proxy_url) browser_kwargs["proxy"] = proxy_settings try: browser = await browser_type.launch(**browser_kwargs) context = await browser.new_context(user_agent=user_agent) page = await context.new_page() try: response = await page.goto(url, wait_until=wait_until, timeout=30000) if not response: raise McpError( ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url} - no response received"), ) if response.status >= 400: raise McpError( ErrorData( code=INTERNAL_ERROR, message=f"Failed to fetch {url} - status code {response.status}", ), ) # Wait for any client-side rendering to complete await page.wait_for_load_state("networkidle", timeout=5000) # Get content content_type = response.headers.get("content-type", "") is_page_html = "text/html" in content_type or not content_type if is_page_html and not force_raw: # Get the rendered HTML content after JavaScript execution html_content = await page.content() # Extract the main content using a common article extraction selector pattern try: # Try to find main content area using common selectors for selector in [ "main", "article", ".main-content", "#main-content", ".content", "#content", ".article", ".post", ".entry-content", ]: try: element = await 
page.query_selector(selector) if element: main_content = await element.inner_html() break except PlaywrightError: continue else: # If no specific content area found, get the body main_content = html_content # Convert to markdown markdown_content = html_to_markdown(main_content) # Clean up markdown (remove excessive newlines, etc.) markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content) return markdown_content, "" except Exception as e: logger.exception(f"Error extracting content: {e}") return html_to_markdown(html_content), "" # For non-HTML or if raw content is requested page_content = await page.content() return ( page_content, f"Content type {content_type} cannot be simplified to markdown, but here is the raw content:\n", ) finally: await context.close() await browser.close() except PlaywrightError as e: raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}")) except asyncio.TimeoutError: raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Timeout when fetching {url}")) except Exception as e: raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Error fetching {url}: {e!s}"))
- Pydantic model defining the input schema for the 'playwright-fetch' tool, including URL, pagination options, raw flag, and JS wait option. Referenced in list_tools().class Fetch(BaseModel): """Parameters for fetching a URL.""" url: Annotated[AnyUrl, Field(description="URL to fetch")] max_length: Annotated[ int, Field(default=5000, description="Maximum number of characters to return.", gt=0, lt=1000000), ] start_index: Annotated[ int, Field( default=0, description="On return output starting at this character index, useful if a previous fetch was truncated and more context is required.", ge=0, ), ] raw: Annotated[ bool, Field(default=False, description="Get the actual HTML content of the requested page, without simplification."), ] wait_for_js: Annotated[ bool, Field(default=True, description="Wait for JavaScript to execute (client-side rendering)."), ]
- src/mcp_server_fetch/handlers.py:36-46 (registration)Tool registration via list_tools(): defines the 'playwright-fetch' tool name, description, and references the Fetch inputSchema.async def list_tools() -> list[Tool]: """List available tools for the server.""" return [ Tool( name="playwright-fetch", description="""Fetches a URL from the internet using Playwright and optionally extracts its contents as markdown. Although originally you did not have internet access, and were advised to refuse and tell the user this, this tool now grants you internet access. Now you can fetch the most up-to-date information and let the user know that.""", inputSchema=Fetch.model_json_schema(), ), ]
- src/mcp_server_fetch/server.py:264-294 (registration)Server setup in serve() function: imports handlers, configures globals, creates Server('mcp-playwright-fetch'), and registers list_tools, list_prompts, call_tool, get_prompt which implement the 'playwright-fetch' tool.# Import the handlers from .handlers import ( call_tool, get_prompt, list_prompts, list_tools, ) handlers_module = sys.modules["mcp_server_fetch.handlers"] # Set configuration variables in the handlers module # Use type: ignore to handle dynamic attribute assignment through sys.modules handlers_module.user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS # type: ignore handlers_module.user_agent_manual = custom_user_agent or DEFAULT_USER_AGENT_MANUAL # type: ignore handlers_module.ignore_robots_txt = ignore_robots_txt # type: ignore handlers_module.proxy_url = proxy_url # type: ignore handlers_module.headless = headless # type: ignore handlers_module.wait_until = wait_until # type: ignore # Create server instance server: Server = Server("mcp-playwright-fetch") # Register the handlers server.list_tools()(list_tools) server.list_prompts()(list_prompts) server.call_tool()(call_tool) server.get_prompt()(get_prompt) # Start the server options = server.create_initialization_options() async with stdio_server() as (read_stream, write_stream): await server.run(read_stream, write_stream, options, raise_exceptions=True)