scrape
Extract web content and images from URLs, handling dynamic pages, geo-restrictions, and bot detection with configurable output formats.
Instructions
Execute a web scrape using ScraperAPI with the specified parameters.
Supports both text and image URLs. When the target URL points to an image,
the image content is returned directly.
Parameters:
params: Scrape model containing:
url: Target URL to scrape (required)
render: Enable JavaScript rendering only when needed for dynamic content (default: False)
Set to True ONLY if the content you need is missing from the initial HTML response and is loaded dynamically by JavaScript.
For most websites, including many modern ones, the main content is available without JavaScript rendering.
country_code: Two-letter country code for geo-specific scraping
premium: Use premium residential/mobile proxies for higher success rate (costs more, incompatible with ultra_premium)
ultra_premium: Activate advanced bypass mechanisms (costs more, incompatible with premium)
device_type: 'mobile' or 'desktop' for device-specific user agents
output_format: 'text', 'markdown', 'csv' or 'json' for the output format (default: 'markdown')
autoparse: boolean to enable automatic parsing of the content for select websites (default: False).
Set to true if the output_format is 'csv' or 'json'. Only available for certain websites.
Returns:
Scraped content as text, csv or json

Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| params | Yes | Scrape model containing the scraping parameters (url, render, country_code, premium, ultra_premium, device_type, output_format, autoparse) | — |
Implementation Reference
- The MCP tool handler function for "scrape". It validates the API key, checks rate limits, and invokes basic_scrape.
async def scrape(params: Scrape) -> str: """ Execute a web scrape using ScraperAPI with the specified parameters. Supports both text and image URLs. When the target URL points to an image, the image content is returned directly. Parameters: params: Scrape model containing: url: Target URL to scrape (required) render: Enable JavaScript rendering only when needed for dynamic content (default: False) Set to True ONLY if the content you need is missing from the initial HTML response and is loaded dynamically by JavaScript. For most websites, including many modern ones, the main content is available without JavaScript rendering. country_code: Two-letter country code for geo-specific scraping premium: Use premium residential/mobile proxies for higher success rate (costs more, incompatible with ultra_premium) ultra_premium: Activate advanced bypass mechanisms (costs more, incompatible with premium) device_type: 'mobile' or 'desktop' for device-specific user agents output_format: 'text', 'markdown', 'csv' or 'json' for the output format (default: 'markdown') autoparse: boolean to enable automatic parsing of the content for select websites (default: False). Set to true if the output_format is 'csv' or 'json'. Only available for certain websites. 
Returns: Scraped content as text, csv or json """ logging.info(f"Invoking scrape tool with params: {params}") try: settings.validate_api_key() except ApiKeyEnvVarNotSetError as e: raise ToolError(str(e)) from e try: _rate_limiter.acquire() except RateLimitExceededError as e: raise ToolError(str(e)) from e try: result = await basic_scrape( url=str(params.url), render=params.render, country_code=params.country_code, premium=params.premium, ultra_premium=params.ultra_premium, device_type=params.device_type, output_format=params.output_format, autoparse=params.autoparse, ) logging.info(f"Scrape tool completed for URL: {params.url}") if result.is_image: logging.info( f"Returning image content ({result.mime_type}) for URL: {params.url}" ) # Image() expects short format name (e.g. "jpeg"), not full MIME type image_format = result.mime_type.removeprefix("image/") return Image(data=result.image_data, format=image_format) return result.text except ScrapeError as e: raise ToolError(str(e)) from e - The core business logic that calls the ScraperAPI service.
async def basic_scrape( url: str, render: bool = None, country_code: str = None, premium: bool = None, ultra_premium: bool = None, device_type: str = None, output_format: str = "markdown", autoparse: bool = False, ) -> ScrapeResult: logging.info(f"Starting scrape for URL: {url}") payload = { "api_key": settings.API_KEY, "url": url, "scraper_sdk": "mcp-server", } optional_params = { "render": (render, lambda v: str(v).lower()), "country_code": (country_code, str), "premium": (premium, lambda v: str(v).lower()), "ultra_premium": (ultra_premium, lambda v: str(v).lower()), "device_type": ( device_type, lambda v: v.value if hasattr(v, "value") else str(v), ), "output_format": ( output_format, lambda v: v.value if hasattr(v, "value") else str(v), ), "autoparse": (autoparse, lambda v: str(v).lower()), } for key, (value, formatter) in optional_params.items(): if value is not None: payload[key] = formatter(value) logging.debug(f"Added optional param: {key}={payload[key]}") try: logging.info(f"Sending request to {settings.API_URL}") async with httpx.AsyncClient(follow_redirects=True) as client: response = await client.get( settings.API_URL, params=payload, timeout=settings.API_TIMEOUT_SECONDS, ) response.raise_for_status() logging.info(f"Scrape successful for URL: {url}") content_type = _get_content_type(response) content_size = len(response.content) size_limit = settings.IMAGE_SIZE_LIMIT_BYTES image_mime = detect_image_mime(content_type, response.content) if image_mime: content_type = image_mime logging.info( f"Image response detected: {content_type}, " f"size: {_format_file_size(content_size)}" ) if content_size > size_limit: logging.warning( f"Image too large ({_format_file_size(content_size)}), " f"limit is {_format_file_size(size_limit)}" ) return ScrapeResult( text=( f"Image found at {url}\n" f"Type: {content_type}\n" f"Size: {_format_file_size(content_size)}\n\n" f"The image exceeds the {_format_file_size(size_limit)} " f"size limit for inline content and cannot be 
returned directly." ) ) return ScrapeResult(image_data=response.content, mime_type=content_type) return ScrapeResult(text=response.text) except httpx.HTTPStatusError as e: status_code = e.response.status_code param_summary = " ".join( f"{k}={v}" for k, v in payload.items() if k != "api_key" ) error_message = f"HTTP error {status_code} when scraping '{url}'. Parameters used: {param_summary}" logging.error(f"basic_scrape: {error_message}", exc_info=True) raise ScrapeError(error_message) from e except httpx.RequestError as e: error_message = f"Connection error when scraping '{url}': {e}" logging.error(f"basic_scrape: {error_message}", exc_info=True) raise ScrapeError(error_message) from e except Exception as e: error_message = f"Unexpected error when scraping '{url}': {e}" logging.error(f"basic_scrape: {error_message}", exc_info=True) raise ScrapeError(error_message) from e - src/scraperapi_mcp_server/server.py:19-27 (registration)Tool registration for the "scrape" tool using the FastMCP decorator.
# Register the handler as the MCP tool "scrape". The annotations advertise
# tool behavior to MCP clients: read-only, non-destructive, idempotent, and
# open-world (it reaches out to arbitrary external URLs).
@mcp.tool(
    name="scrape",
    annotations=ToolAnnotations(
        readOnlyHint=True,
        destructiveHint=False,
        idempotentHint=True,
        openWorldHint=True,
    ),
)