universal_scraper
Fetch any webpage's content with options for browser rendering, geo-targeting, and multiple output formats like HTML, markdown, or links.
Instructions
Get the content of any webpage.
Supports browser rendering, parsing of certain webpages and different output formats.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | Website url to scrape. | |
| render | No | Whether a headless browser should be used to render the page. For example: - 'html' when browser is required to render the page. | |
| user_agent_type | No | Device type and browser that will be used to determine User-Agent header value. | |
| geo_location | No | The geographical location that the result should be adapted for. Use ISO-3166 country codes. Examples: - 'California, United States' - 'Mexico' - 'US' for United States - 'DE' for Germany - 'FR' for France | |
| output_format | No | The format of the output. Works only when parse parameter is false. - links - Most efficient when the goal is navigation or finding specific URLs. Use this first when you need to locate a specific page within a website. - md - Best for extracting and reading visible content once you've found the right page. Use this to get structured content that's easy to read and process. - html - Should be used sparingly only when you need the raw HTML structure, JavaScript code, or styling information. | |
Output Schema
| Name | Required | Description | Default |
|---|---|---|---|
| result | Yes | The scraped page content, returned as a string in the requested output format. | |
Implementation Reference
- src/oxylabs_mcp/tools/scraper.py:24-52 (handler)The main handler function for the 'universal_scraper' tool. It's decorated with @mcp.tool, accepts URL, render, user_agent_type, geo_location, and output_format parameters. It builds a payload, calls the Oxylabs client to scrape the URL, and returns content in the requested format (markdown, HTML, or links).
# Excerpt: src/oxylabs_mcp/tools/scraper.py:24-52 — the tool handler.
# readOnlyHint=True advertises to MCP clients that this tool has no side effects.
@mcp.tool(annotations=ToolAnnotations(readOnlyHint=True))
async def universal_scraper(
    url: url_params.URL_PARAM,
    render: url_params.RENDER_PARAM = None,
    user_agent_type: url_params.USER_AGENT_TYPE_PARAM = None,
    geo_location: url_params.GEO_LOCATION_PARAM = None,
    output_format: url_params.OUTPUT_FORMAT_PARAM = None,
) -> str:
    """Get a content of any webpage.

    Supports browser rendering, parsing of certain webpages and different output formats.
    """
    try:
        async with oxylabs_client() as client:
            # Build the scrape-job payload; optional fields are included
            # only when the caller supplied a truthy value.
            payload: dict[str, Any] = {"url": url}
            if render:
                payload["render"] = render
            if user_agent_type:
                payload["user_agent_type"] = user_agent_type
            if geo_location:
                payload["geo_location"] = geo_location
            response_json = await client.scrape(payload)
            # get_content converts the raw API response into the requested
            # representation (html / links / markdown).
            return get_content(response_json, output_format=output_format)
    except MCPServerError as e:
        # Known server errors are rendered to a user-facing string instead
        # of propagating out of the tool.
        return await e.process()
- src/oxylabs_mcp/tools/scraper.py:14-21 (registration)SCRAPER_TOOLS list includes 'universal_scraper' as one of the available scraper tools. The tool is registered via the @mcp.tool decorator on the FastMCP instance named 'scraper', which is mounted in __init__.py (line 36).
# Excerpt: src/oxylabs_mcp/tools/scraper.py:14-21 — tool registration.
# Names of the scraper tools exposed by this FastMCP sub-server.
SCRAPER_TOOLS = [
    "universal_scraper",
    "google_search_scraper",
    "amazon_search_scraper",
    "amazon_product_scraper",
]

# Sub-server instance the @mcp.tool decorators attach to; mounted by the
# package's __init__.py.
mcp = FastMCP("scraper")
- src/oxylabs_mcp/url_params.py:6-60 (schema)Input schema definitions used by the universal_scraper handler. Defines URL_PARAM (str with Field description), RENDER_PARAM (Literal['html']), USER_AGENT_TYPE_PARAM (desktop/mobile/tablet variants), GEO_LOCATION_PARAM (ISO-3166 country codes), and OUTPUT_FORMAT_PARAM (links, md, html).
# Excerpt: src/oxylabs_mcp/url_params.py:6-60 — Annotated parameter aliases
# whose Field descriptions become the tool's input-schema documentation.
# NOTE(review): triple-quoted descriptions below were reconstructed from a
# whitespace-collapsed excerpt; confirm exact line breaks against upstream.
URL_PARAM = Annotated[str, Field(description="Website url to scrape.")]

PARSE_PARAM = Annotated[
    bool,
    Field(
        description="Should result be parsed. If the result is not parsed, the output_format parameter is applied.",
    ),
]

RENDER_PARAM = Annotated[
    Literal["html"] | None,
    Field(
        description="""
        Whether a headless browser should be used to render the page.
        For example:
        - 'html' when browser is required to render the page.
        """,
        examples=["html"],
    ),
]

OUTPUT_FORMAT_PARAM = Annotated[
    Literal[
        "links",
        "md",
        "html",
    ]
    | None,
    Field(
        description="""
        The format of the output. Works only when parse parameter is false.
        - links - Most efficient when the goal is navigation or finding specific URLs. Use this first when you need to locate a specific page within a website.
        - md - Best for extracting and reading visible content once you've found the right page. Use this to get structured content that's easy to read and process.
        - html - Should be used sparingly only when you need the raw HTML structure, JavaScript code, or styling information.
        """
    ),
]

GOOGLE_QUERY_PARAM = Annotated[str, Field(description="URL-encoded keyword to search for.")]

AMAZON_SEARCH_QUERY_PARAM = Annotated[str, Field(description="Keyword to search for.")]

USER_AGENT_TYPE_PARAM = Annotated[
    Literal[
        "desktop",
        "desktop_chrome",
        "desktop_firefox",
        "desktop_safari",
        "desktop_edge",
        "desktop_opera",
        "mobile",
        "mobile_ios",
        "mobile_android",
        "tablet",
    ]
    | None,
    Field(
        description="Device type and browser that will be used to "
        "determine User-Agent header value."
    ),
]
- src/oxylabs_mcp/utils.py:288-305 (helper)get_content() helper function called by universal_scraper to process the API response. Depending on output_format, it returns raw HTML, extracted links, or markdown-converted content. Also uses clean_html() and markdownify.
# Excerpt: src/oxylabs_mcp/utils.py:288-305 — response post-processing.
def get_content(
    response_json: dict[str, typing.Any],
    *,
    output_format: str | None,
    parse: bool = False,
) -> str:
    """Extract content from response and convert to a proper format."""
    # The Oxylabs API wraps results in a list; only the first result is used.
    content = response_json["results"][0]["content"]
    # A parsed response is structured data — serialize it as JSON and return.
    if parse and isinstance(content, dict):
        return json.dumps(content)
    if output_format == "html":
        return str(content)
    if output_format == "links":
        links = extract_links_with_text(str(content))
        return "\n".join(links)
    # Fallback (covers 'md' and unset output_format): strip the HTML down
    # and convert it to markdown.
    stripped_html = clean_html(str(content))
    return markdownify(stripped_html)
- src/oxylabs_mcp/utils.py:144-228 (helper)_OxylabsClientWrapper class (specifically the scrape method) and oxylabs_client async context manager. The scrape method posts the payload to the Oxylabs API endpoint and returns the JSON response, which is the core network call used by universal_scraper.
# Excerpt: src/oxylabs_mcp/utils.py:144-228 — HTTP client wrapper, credential
# helpers, and the async context manager used by every scraper tool.
class _OxylabsClientWrapper:
    def __init__(
        self,
        client: AsyncClient,
    ) -> None:
        # Pre-configured httpx client (auth, headers, timeout set by
        # oxylabs_client below).
        self._client = client
        # MCP request context, used for progress/info logging.
        self._ctx = get_context()

    async def scrape(self, payload: dict[str, typing.Any]) -> dict[str, typing.Any]:
        # POST the job payload to the Oxylabs scraper endpoint and return the
        # parsed JSON body. Raises HTTPStatusError on non-2xx responses.
        await self._ctx.info(f"Create job with params: {json.dumps(payload)}")
        response = await self._client.post(settings.OXYLABS_SCRAPER_URL, json=payload)
        response_json: dict[str, typing.Any] = response.json()
        if response.status_code == status.HTTP_201_CREATED:
            # Log job metadata only when a job was actually created.
            await self._ctx.info(
                f"Job info: "
                f"job_id={response_json['job']['id']} "
                f"job_status={response_json['job']['status']}"
            )
        response.raise_for_status()
        return response_json


def get_oxylabs_auth() -> tuple[str | None, str | None]:
    """Extract the Oxylabs credentials."""
    if settings.MCP_TRANSPORT == "streamable-http":
        # HTTP transport: credentials come per-request, headers first,
        # query parameters as fallback.
        request_headers = dict(get_context().request_context.request.headers)  # type: ignore[union-attr]
        username = request_headers.get(USERNAME_HEADER.lower())
        password = request_headers.get(PASSWORD_HEADER.lower())
        if not username or not password:
            query_params = get_context().request_context.request.query_params  # type: ignore[union-attr]
            username = query_params.get(USERNAME_QUERY_PARAM)
            password = query_params.get(PASSWORD_QUERY_PARAM)
    else:
        # stdio (or other) transport: credentials come from the environment.
        username = os.environ.get(USERNAME_ENV)
        password = os.environ.get(PASSWORD_ENV)
    return username, password


def get_oxylabs_ai_studio_api_key() -> str | None:
    """Extract the Oxylabs AI Studio API key."""
    if settings.MCP_TRANSPORT == "streamable-http":
        # Same header-then-query-param lookup pattern as get_oxylabs_auth.
        request_headers = dict(get_context().request_context.request.headers)  # type: ignore[union-attr]
        ai_studio_api_key = request_headers.get(AI_STUDIO_API_KEY_HEADER.lower())
        if not ai_studio_api_key:
            query_params = get_context().request_context.request.query_params  # type: ignore[union-attr]
            ai_studio_api_key = query_params.get(AI_STUDIO_API_KEY_QUERY_PARAM)
    else:
        ai_studio_api_key = os.getenv(AI_STUDIO_API_KEY_ENV)
    return ai_studio_api_key


@asynccontextmanager
async def oxylabs_client() -> AsyncIterator[_OxylabsClientWrapper]:
    """Async context manager for Oxylabs client that is used in MCP tools."""
    headers = _get_default_headers()
    username, password = get_oxylabs_auth()
    if not username or not password:
        raise ValueError("Oxylabs username and password must be set.")
    auth = BasicAuth(username=username, password=password)
    async with AsyncClient(
        timeout=Timeout(settings.OXYLABS_REQUEST_TIMEOUT_S),
        verify=True,
        headers=headers,
        auth=auth,
    ) as client:
        try:
            yield _OxylabsClientWrapper(client)
        except HTTPStatusError as e:
            # Normalize all transport/HTTP failures into MCPServerError so the
            # tool handlers can catch a single exception type; `from None`
            # suppresses the noisy httpx traceback chain.
            raise MCPServerError(
                f"HTTP error during POST request: {e.response.status_code} - {e.response.text}"
            ) from None
        except RequestError as e:
            raise MCPServerError(f"Request error during POST request: {e}") from None
        except Exception as e:
            raise MCPServerError(f"Error: {str(e) or repr(e)}") from None