#!/usr/bin/env python3
"""
Custom Crawl4AI MCP Server
Wraps the Crawl4AI Docker API endpoints with FastMCP for reliable stdio transport.
"""
import os
import httpx
from fastmcp import FastMCP
from typing import Optional, List, Dict, Any
# Initialize MCP server
mcp = FastMCP("Crawl4AI")
# Get base URL from environment or use default
CRAWL4AI_BASE_URL = os.getenv("CRAWL4AI_BASE_URL", "http://localhost:11235")
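# Example (hypothetical host): point the server at a remote Crawl4AI container
# by exporting the variable before launching; 11235 is the Docker image's
# default port, the hostname below is a placeholder.
#   CRAWL4AI_BASE_URL=http://crawl4ai.internal:11235 python crawl4ai_mcp.py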
@mcp.tool()
async def scrape_markdown(
url: str,
filter_type: str = "fit",
query: Optional[str] = None,
cache_bust: str = "0"
) -> dict:
"""Extract clean markdown content from a webpage.
Args:
url: The URL to scrape (must be http/https)
filter_type: Content filter strategy - 'fit' (default), 'raw', 'bm25', or 'llm'
        query: Optional query string; used by the 'bm25' and 'llm' filters
        cache_bust: Cache-bust revision counter; change the value to bypass
            a previously cached result (default: '0')
Returns:
Dictionary with 'markdown' content and 'success' status
"""
async with httpx.AsyncClient(timeout=60.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/md",
                json={
                    "url": url,
                    "f": filter_type,  # content filter strategy
                    "q": query,        # query for the bm25/llm filters
                    "c": cache_bust    # cache-bust revision counter
                }
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
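# Example (sketch): invoking this tool with a BM25 filter. The arguments are
# illustrative; the URL and query are placeholders.
#   result = await scrape_markdown(
#       url="https://example.com/blog/post",
#       filter_type="bm25",
#       query="pricing tiers",  # required for the 'bm25' and 'llm' filters
#   )
# With filter_type='fit' (the default), query can be omitted.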
@mcp.tool()
async def extract_html(url: str) -> dict:
"""Extract preprocessed HTML from a webpage.
    Crawls the URL and returns sanitized HTML suitable for schema
    extraction or further processing.
Args:
url: The URL to extract HTML from
Returns:
Dictionary with HTML content and processing status
"""
async with httpx.AsyncClient(timeout=60.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/html",
json={"url": url}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
@mcp.tool()
async def capture_screenshot(
url: str,
screenshot_wait_for: float = 2.0,
output_path: Optional[str] = None
) -> dict:
"""Capture a full-page PNG screenshot of a webpage.
Args:
url: The URL to screenshot
screenshot_wait_for: Seconds to wait before capture (default: 2.0)
output_path: Optional path to save the screenshot file
Returns:
        Dictionary with the screenshot: base64-encoded PNG when output_path
        is omitted, otherwise the path of the saved file
"""
async with httpx.AsyncClient(timeout=90.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/screenshot",
json={
"url": url,
"screenshot_wait_for": screenshot_wait_for,
"output_path": output_path
}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
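# Example (sketch, assuming the /screenshot endpoint returns the base64 PNG
# under a "screenshot" key when output_path is omitted -- the key name is an
# assumption, not a confirmed schema): decoding the payload to a local file.
#   import base64
#   result = await capture_screenshot("https://example.com")
#   if result.get("success") and "screenshot" in result:
#       with open("page.png", "wb") as f:
#           f.write(base64.b64decode(result["screenshot"]))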
@mcp.tool()
async def generate_pdf(
url: str,
output_path: Optional[str] = None
) -> dict:
"""Generate a PDF document from a webpage.
Args:
url: The URL to convert to PDF
output_path: Optional path to save the PDF file
Returns:
        Dictionary with the PDF: base64-encoded data when output_path is
        omitted, otherwise the path of the saved file
"""
async with httpx.AsyncClient(timeout=90.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/pdf",
json={
"url": url,
"output_path": output_path
}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
@mcp.tool()
async def execute_javascript(
url: str,
scripts: List[str]
) -> dict:
"""Execute JavaScript snippets on a webpage.
Runs JavaScript in the browser context and returns results.
    Each script should be a self-contained expression that evaluates to a
    value, e.g. an IIFE or an async IIFE.
Args:
url: The URL to execute scripts on
scripts: List of JavaScript code snippets to execute in order
Returns:
Full CrawlResult with execution results, markdown, links, etc.
"""
async with httpx.AsyncClient(timeout=120.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/execute_js",
json={
"url": url,
"scripts": scripts
}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
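# Example (sketch): scripts that evaluate to a value, as the docstring
# requires. Both snippets below are plain JavaScript expressions; the URL is
# a placeholder.
#   scripts = [
#       "(() => document.title)()",  # sync IIFE
#       "(async () => { await new Promise(r => setTimeout(r, 300)); return document.links.length; })()",  # async IIFE
#   ]
#   result = await execute_javascript("https://example.com", scripts)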
@mcp.tool()
async def crawl_urls(
urls: List[str],
browser_config: Optional[Dict[str, Any]] = None,
crawler_config: Optional[Dict[str, Any]] = None,
hooks: Optional[Dict[str, Any]] = None
) -> dict:
"""Crawl multiple URLs and return results.
Args:
urls: List of URLs to crawl (1-100 URLs)
browser_config: Optional browser configuration overrides
crawler_config: Optional crawler configuration overrides
hooks: Optional user-provided hook functions for customization
Returns:
Dictionary with crawl results for all URLs
"""
    if not urls:
return {
"success": False,
"error": "No URLs provided"
}
if len(urls) > 100:
return {
"success": False,
"error": f"Too many URLs ({len(urls)}). Maximum is 100."
}
async with httpx.AsyncClient(timeout=300.0) as client:
try:
payload = {"urls": urls}
if browser_config:
payload["browser_config"] = browser_config
if crawler_config:
payload["crawler_config"] = crawler_config
if hooks:
payload["hooks"] = hooks
response = await client.post(
f"{CRAWL4AI_BASE_URL}/crawl",
json=payload
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"urls": urls
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"urls": urls
}
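# Example (hypothetical config shape): overriding crawler behavior for a
# batch. The keys accepted by /crawl depend on the Crawl4AI server version;
# "cache_mode" here is an illustrative placeholder, not a confirmed schema.
#   result = await crawl_urls(
#       urls=["https://example.com", "https://example.org"],
#       crawler_config={"cache_mode": "bypass"},
#   )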
@mcp.tool()
async def ask_crawl4ai(
query: str,
context_type: str = "all",
score_ratio: float = 0.5,
max_results: int = 20
) -> dict:
"""Query the Crawl4AI library documentation and context.
Use this to get information about Crawl4AI features, code examples,
and best practices directly from the library documentation.
Args:
        query: Search query used to filter relevant content
context_type: Type of context - 'code', 'doc', or 'all' (default)
score_ratio: Minimum score as fraction of max score (default: 0.5)
max_results: Maximum results to return (default: 20)
Returns:
Dictionary with filtered documentation/code context
"""
async with httpx.AsyncClient(timeout=30.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/ask",
json={
"query": query,
"context_type": context_type,
"score_ratio": score_ratio,
"max_results": max_results
}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"query": query
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"query": query
}
if __name__ == "__main__":
# Run the MCP server with stdio transport
mcp.run(transport="stdio")
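# Example (sketch): registering this server with an MCP client that supports
# stdio transport, using the common "mcpServers" JSON convention. The file
# path is a placeholder; adjust the env block to match your deployment.
#   {
#     "mcpServers": {
#       "crawl4ai": {
#         "command": "python",
#         "args": ["/path/to/crawl4ai_mcp.py"],
#         "env": {"CRAWL4AI_BASE_URL": "http://localhost:11235"}
#       }
#     }
#   }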