# We provide all the information about MCP servers via our MCP API:
#   curl -X GET 'https://glama.ai/api/mcp/v1/servers/joedank/mcpcrawl4ai'
# If you have feedback or need assistance with the MCP directory API, please join our Discord server.
#!/usr/bin/env python3
"""
Custom Crawl4AI MCP Server
Wraps the Crawl4AI Docker API endpoints with FastMCP for reliable stdio transport.
"""
import os
import httpx
from fastmcp import FastMCP
from typing import Optional, List, Dict, Any
# Initialize the FastMCP server instance; tools are registered via @mcp.tool().
mcp = FastMCP("Crawl4AI")
# Base URL of the Crawl4AI Docker API, from environment or default (Pi instance).
CRAWL4AI_BASE_URL = os.getenv("CRAWL4AI_BASE_URL", "http://192.168.0.87:11235")
# Bearer token for authenticating against the Crawl4AI instance.
# NOTE(review): default "test_api_code" is a placeholder — confirm whether the
# target instance actually enforces token auth.
CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN", "test_api_code")
@mcp.tool()
async def scrape_markdown(
    url: str,
    filter_type: str = "fit",
    query: Optional[str] = None,
    cache_bust: str = "0",
    stealth: bool = True  # Default to stealth mode for anti-bot detection
) -> dict:
    """Extract clean markdown content from a webpage.

    Args:
        url: The URL to scrape (must be http/https)
        filter_type: Content filter strategy - 'fit' (default), 'raw', 'bm25', or 'llm'
        query: Optional query string for BM25/LLM filters
        cache_bust: Cache-bust revision counter (default: '0')
        stealth: Enable stealth mode to bypass bot detection (default: True)

    Returns:
        Dictionary with 'markdown' content and 'success' status
    """
    # NOTE(review): filter_type, query, and cache_bust are accepted for
    # interface stability but are not forwarded in the request payload —
    # confirm the server-side parameter names before wiring them through.
    #
    # Fix: attach the configured API token as a Bearer header. It was
    # previously read from the environment but never sent with any request.
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=120.0, headers=headers) as client:
        try:
            # Use /crawl endpoint with optional stealth browser profile
            payload = {"urls": [url]}
            if stealth:
                payload["browser_config"] = {
                    "headless": True,
                    "stealth": True
                }
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json=payload
            )
            response.raise_for_status()
            data = response.json()
            # Extract markdown from the first crawl result
            results = data.get("results", [])
            if results:
                result = results[0]
                # The API may return markdown as a plain string or as a dict
                # of variants (raw, fit, with citations, references).
                md = result.get("markdown", "")
                if isinstance(md, dict):
                    return {
                        "success": result.get("success", True),
                        "url": url,
                        "markdown": md.get("raw_markdown", ""),
                        "raw_markdown": md.get("raw_markdown", ""),
                        "markdown_with_citations": md.get("markdown_with_citations", ""),
                        "references_markdown": md.get("references_markdown", ""),
                        "fit_markdown": md.get("fit_markdown", ""),
                        "stealth_mode": stealth
                    }
                return {
                    "success": result.get("success", True),
                    "url": url,
                    "markdown": md,
                    "raw_markdown": md,
                    "stealth_mode": stealth
                }
            return {
                "success": False,
                "error": "No results returned",
                "url": url
            }
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def extract_html(url: str) -> dict:
    """Extract preprocessed HTML from a webpage.

    Crawls the URL and returns sanitized HTML structures useful for
    schema extraction or further processing.

    Args:
        url: The URL to extract HTML from

    Returns:
        Dictionary with HTML content and processing status
    """
    # Fix: send the configured API token as a Bearer header (it was
    # previously read from the environment but never used in any request).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json={"urls": [url], "browser_config": {"headless": True, "stealth": True}}
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            if results:
                result = results[0]
                return {
                    "success": result.get("success", True),
                    "url": url,
                    "html": result.get("html", ""),
                    "cleaned_html": result.get("cleaned_html", "")
                }
            return {"success": False, "error": "No results", "url": url}
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def capture_screenshot(
    url: str,
    screenshot_wait_for: float = 2.0,
    output_path: Optional[str] = None
) -> dict:
    """Capture a full-page PNG screenshot of a webpage.

    Args:
        url: The URL to screenshot
        screenshot_wait_for: Seconds to wait before capture (default: 2.0)
        output_path: Optional path to save the screenshot file

    Returns:
        Dictionary with screenshot data (base64 if no output_path, or file path)
    """
    import base64
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=90.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json={
                    "urls": [url],
                    "crawler_config": {
                        "screenshot": True,
                        "screenshot_wait_for": screenshot_wait_for
                    },
                    "browser_config": {"headless": True, "stealth": True}
                }
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            if results:
                result = results[0]
                # Screenshot arrives base64-encoded in the JSON result.
                screenshot_b64 = result.get("screenshot")
                if screenshot_b64:
                    if output_path:
                        # Decode and persist to disk instead of returning the
                        # (potentially large) base64 payload to the client.
                        screenshot_bytes = base64.b64decode(screenshot_b64)
                        with open(output_path, 'wb') as f:
                            f.write(screenshot_bytes)
                        return {
                            "success": True,
                            "url": url,
                            "path": output_path,
                            "size": len(screenshot_bytes)
                        }
                    return {
                        "success": True,
                        "url": url,
                        "screenshot": screenshot_b64
                    }
            return {"success": False, "error": "No screenshot data", "url": url}
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def generate_pdf(
    url: str,
    output_path: Optional[str] = None
) -> dict:
    """Generate a PDF document from a webpage.

    Args:
        url: The URL to convert to PDF
        output_path: Optional path to save the PDF file

    Returns:
        Dictionary with PDF data (base64 if no output_path, or file path)
    """
    import base64
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=90.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json={
                    "urls": [url],
                    "crawler_config": {"pdf": True},
                    "browser_config": {"headless": True, "stealth": True}
                }
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            if results:
                result = results[0]
                pdf_b64 = result.get("pdf")
                # Fix: honor output_path — it was documented but previously
                # ignored, so the "save to file" path never worked.
                # NOTE(review): assumes the API returns the PDF base64-encoded
                # in JSON, mirroring the screenshot endpoint — confirm.
                if pdf_b64 and output_path:
                    pdf_bytes = base64.b64decode(pdf_b64)
                    with open(output_path, 'wb') as f:
                        f.write(pdf_bytes)
                    return {
                        "success": True,
                        "url": url,
                        "path": output_path,
                        "size": len(pdf_bytes)
                    }
                return {
                    "success": result.get("success", True),
                    "url": url,
                    "pdf": pdf_b64,
                    "html": result.get("html", "")[:500]
                }
            return {"success": False, "error": "No results", "url": url}
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def execute_javascript(
    url: str,
    scripts: List[str]
) -> dict:
    """Execute JavaScript snippets on a webpage.

    Runs JavaScript in the browser context and returns results.
    Each script should be an expression that returns a value (IIFE or async function).

    Args:
        url: The URL to execute scripts on
        scripts: List of JavaScript code snippets to execute in order

    Returns:
        Full CrawlResult with execution results, markdown, links, etc.
    """
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=120.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json={
                    "urls": [url],
                    "js_code": scripts,
                    "browser_config": {"headless": True, "stealth": True}
                }
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            if results:
                # Return the full first result so callers can inspect
                # js_execution_result, markdown, links, etc.
                return {
                    "success": True,
                    "url": url,
                    "result": results[0]
                }
            return {"success": False, "error": "No results", "url": url}
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def crawl_urls(
    urls: List[str],
    browser_config: Optional[Dict[str, Any]] = None,
    crawler_config: Optional[Dict[str, Any]] = None,
    hooks: Optional[Dict[str, Any]] = None
) -> dict:
    """Crawl multiple URLs and return results.

    Args:
        urls: List of URLs to crawl (1-100 URLs)
        browser_config: Optional browser configuration overrides (default: stealth mode)
        crawler_config: Optional crawler configuration overrides
        hooks: Optional user-provided hook functions for customization

    Returns:
        Dictionary with crawl results for all URLs
    """
    # Validate batch size before touching the network.
    if not urls:
        return {
            "success": False,
            "error": "No URLs provided"
        }
    if len(urls) > 100:
        return {
            "success": False,
            "error": f"Too many URLs ({len(urls)}). Maximum is 100."
        }
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=300.0, headers=headers) as client:
        try:
            payload = {"urls": urls}
            # Default to stealth mode if no browser_config provided
            payload["browser_config"] = browser_config or {"headless": True, "stealth": True}
            if crawler_config:
                payload["crawler_config"] = crawler_config
            if hooks:
                payload["hooks"] = hooks
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json=payload
            )
            response.raise_for_status()
            return response.json()
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "urls": urls
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "urls": urls
            }
@mcp.tool()
async def ask_crawl4ai(
    query: str,
    context_type: str = "all",
    score_ratio: float = 0.5,
    max_results: int = 20
) -> dict:
    """Query the Crawl4AI library documentation and context.

    Note: This endpoint may not be available on all crawl4ai instances.

    Args:
        query: Search query to filter relevant content (RECOMMENDED)
        context_type: Type of context - 'code', 'doc', or 'all' (default)
        score_ratio: Minimum score as fraction of max score (default: 0.5)
        max_results: Maximum results to return (default: 20)

    Returns:
        Dictionary with filtered documentation/code context
    """
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/ask",
                json={
                    "query": query,
                    "context_type": context_type,
                    "score_ratio": score_ratio,
                    "max_results": max_results
                }
            )
            response.raise_for_status()
            return response.json()
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)} - /ask endpoint may not be available",
                "query": query
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "query": query
            }
if __name__ == "__main__":
    # Run the MCP server over stdio transport, the framing MCP clients
    # (e.g. Claude Desktop) expect when spawning this script as a subprocess.
    mcp.run(transport="stdio")