scrape
Extract web content and images from URLs, handling dynamic pages, geo-restrictions, and bot detection with configurable output formats.
Instructions
Execute a web scrape using ScraperAPI with the specified parameters.
Supports both text and image URLs. When the target URL points to an image,
the image content is returned directly.
Parameters:
params: Scrape model containing:
url: Target URL to scrape (required)
render: Enable JavaScript rendering only when needed for dynamic content (default: False)
Set to True ONLY if the content you need is missing from the initial HTML response and is loaded dynamically by JavaScript.
For most websites, including many modern ones, the main content is available without JavaScript rendering.
country_code: Two-letter country code for geo-specific scraping
premium: Use premium residential/mobile proxies for higher success rate (costs more, incompatible with ultra_premium)
ultra_premium: Activate advanced bypass mechanisms (costs more, incompatible with premium)
device_type: 'mobile' or 'desktop' for device-specific user agents
output_format: 'text', 'markdown', 'csv' or 'json' for the output format (default: 'markdown')
autoparse: boolean to enable automatic parsing of the content for select websites (default: False).
Set to true if the output_format is 'csv' or 'json'. Only available for certain websites.
Returns:
Scraped content as text, csv or json

Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| params | Yes | Scrape model containing the scraping parameters (url, render, country_code, premium, ultra_premium, device_type, output_format, autoparse) | — |
Implementation Reference
- The MCP tool handler function for "scrape". It validates the API key, checks rate limits, and invokes basic_scrape.
async def scrape(params: Scrape) -> str: """ Execute a web scrape using ScraperAPI with the specified parameters. Supports both text and image URLs. When the target URL points to an image, the image content is returned directly. Parameters: params: Scrape model containing: url: Target URL to scrape (required) render: Enable JavaScript rendering only when needed for dynamic content (default: False) Set to True ONLY if the content you need is missing from the initial HTML response and is loaded dynamically by JavaScript. For most websites, including many modern ones, the main content is available without JavaScript rendering. country_code: Two-letter country code for geo-specific scraping premium: Use premium residential/mobile proxies for higher success rate (costs more, incompatible with ultra_premium) ultra_premium: Activate advanced bypass mechanisms (costs more, incompatible with premium) device_type: 'mobile' or 'desktop' for device-specific user agents output_format: 'text', 'markdown', 'csv' or 'json' for the output format (default: 'markdown') autoparse: boolean to enable automatic parsing of the content for select websites (default: False). Set to true if the output_format is 'csv' or 'json'. Only available for certain websites. 
Returns: Scraped content as text, csv or json """ logging.info(f"Invoking scrape tool with params: {params}") try: settings.validate_api_key() except ApiKeyEnvVarNotSetError as e: raise ToolError(str(e)) from e try: _rate_limiter.acquire() except RateLimitExceededError as e: raise ToolError(str(e)) from e try: result = await basic_scrape( url=str(params.url), render=params.render, country_code=params.country_code, premium=params.premium, ultra_premium=params.ultra_premium, device_type=params.device_type, output_format=params.output_format, autoparse=params.autoparse, ) logging.info(f"Scrape tool completed for URL: {params.url}") if result.is_image: logging.info( f"Returning image content ({result.mime_type}) for URL: {params.url}" ) # Image() expects short format name (e.g. "jpeg"), not full MIME type image_format = result.mime_type.removeprefix("image/") return Image(data=result.image_data, format=image_format) return result.text except ScrapeError as e: raise ToolError(str(e)) from e - The core business logic that calls the ScraperAPI service.
async def basic_scrape( url: str, render: bool = None, country_code: str = None, premium: bool = None, ultra_premium: bool = None, device_type: str = None, output_format: str = "markdown", autoparse: bool = False, ) -> ScrapeResult: logging.info(f"Starting scrape for URL: {url}") payload = { "api_key": settings.API_KEY, "url": url, "scraper_sdk": "mcp-server", } optional_params = { "render": (render, lambda v: str(v).lower()), "country_code": (country_code, str), "premium": (premium, lambda v: str(v).lower()), "ultra_premium": (ultra_premium, lambda v: str(v).lower()), "device_type": ( device_type, lambda v: v.value if hasattr(v, "value") else str(v), ), "output_format": ( output_format, lambda v: v.value if hasattr(v, "value") else str(v), ), "autoparse": (autoparse, lambda v: str(v).lower()), } for key, (value, formatter) in optional_params.items(): if value is not None: payload[key] = formatter(value) logging.debug(f"Added optional param: {key}={payload[key]}") try: logging.info(f"Sending request to {settings.API_URL}") async with httpx.AsyncClient(follow_redirects=True) as client: response = await client.get( settings.API_URL, params=payload, timeout=settings.API_TIMEOUT_SECONDS, ) response.raise_for_status() logging.info(f"Scrape successful for URL: {url}") content_type = _get_content_type(response) content_size = len(response.content) size_limit = settings.IMAGE_SIZE_LIMIT_BYTES image_mime = detect_image_mime(content_type, response.content) if image_mime: content_type = image_mime logging.info( f"Image response detected: {content_type}, " f"size: {_format_file_size(content_size)}" ) if content_size > size_limit: logging.warning( f"Image too large ({_format_file_size(content_size)}), " f"limit is {_format_file_size(size_limit)}" ) return ScrapeResult( text=( f"Image found at {url}\n" f"Type: {content_type}\n" f"Size: {_format_file_size(content_size)}\n\n" f"The image exceeds the {_format_file_size(size_limit)} " f"size limit for inline content and cannot be 
returned directly." ) ) return ScrapeResult(image_data=response.content, mime_type=content_type) return ScrapeResult(text=response.text) except httpx.HTTPStatusError as e: status_code = e.response.status_code param_summary = " ".join( f"{k}={v}" for k, v in payload.items() if k != "api_key" ) error_message = f"HTTP error {status_code} when scraping '{url}'. Parameters used: {param_summary}" logging.error(f"basic_scrape: {error_message}", exc_info=True) raise ScrapeError(error_message) from e except httpx.RequestError as e: error_message = f"Connection error when scraping '{url}': {e}" logging.error(f"basic_scrape: {error_message}", exc_info=True) raise ScrapeError(error_message) from e except Exception as e: error_message = f"Unexpected error when scraping '{url}': {e}" logging.error(f"basic_scrape: {error_message}", exc_info=True) raise ScrapeError(error_message) from e - src/scraperapi_mcp_server/server.py:19-27 (registration)Tool registration for the "scrape" tool using the FastMCP decorator.
# Register the handler as the MCP tool "scrape". The annotations advertise
# tool behavior to MCP clients: read-only, non-destructive, idempotent, and
# open-world (it reaches out to arbitrary external URLs).
@mcp.tool(
    name="scrape",
    annotations=ToolAnnotations(
        readOnlyHint=True,
        destructiveHint=False,
        idempotentHint=True,
        openWorldHint=True,
    ),
)