#!/usr/bin/env python3
"""
Custom Crawl4AI MCP Server
Wraps the Crawl4AI Docker API endpoints with FastMCP for reliable stdio transport.
"""
import os
import httpx
from fastmcp import FastMCP
from typing import Optional, List, Dict, Any
# Initialize MCP server
mcp = FastMCP("Crawl4AI")
# Get base URL from environment or use default
CRAWL4AI_BASE_URL = os.getenv("CRAWL4AI_BASE_URL", "http://localhost:11235")
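# Example (hypothetical host): point the server at a remote Crawl4AI container
# by exporting the variable before launching; 11235 is the Docker image's
# default port, the hostname below is a placeholder.
#   CRAWL4AI_BASE_URL=http://crawl4ai.internal:11235 python crawl4ai_mcp.py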
@mcp.tool()
async def scrape_markdown(
url: str,
filter_type: str = "fit",
query: Optional[str] = None,
cache_bust: str = "0"
) -> dict:
"""Extract clean markdown content from a webpage.
Args:
url: The URL to scrape (must be http/https)
filter_type: Content filter strategy - 'fit' (default), 'raw', 'bm25', or 'llm'
        query: Optional query string; used by the 'bm25' and 'llm' filters
        cache_bust: Cache-bust revision counter; change the value to bypass
            a previously cached result (default: '0')
Returns:
Dictionary with 'markdown' content and 'success' status
"""
async with httpx.AsyncClient(timeout=60.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/md",
                json={
                    "url": url,
                    "f": filter_type,  # content filter strategy
                    "q": query,        # query for the bm25/llm filters
                    "c": cache_bust    # cache-bust revision counter
                }
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
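# Example (sketch): invoking this tool with a BM25 filter. The arguments are
# illustrative; the URL and query are placeholders.
#   result = await scrape_markdown(
#       url="https://example.com/blog/post",
#       filter_type="bm25",
#       query="pricing tiers",  # required for the 'bm25' and 'llm' filters
#   )
# With filter_type='fit' (the default), query can be omitted.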
@mcp.tool()
async def extract_html(url: str) -> dict:
"""Extract preprocessed HTML from a webpage.
    Crawls the URL and returns sanitized HTML suitable for schema
    extraction or further processing.
Args:
url: The URL to extract HTML from
Returns:
Dictionary with HTML content and processing status
"""
async with httpx.AsyncClient(timeout=60.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/html",
json={"url": url}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
@mcp.tool()
async def capture_screenshot(
url: str,
screenshot_wait_for: float = 2.0,
output_path: Optional[str] = None
) -> dict:
"""Capture a full-page PNG screenshot of a webpage.
Args:
url: The URL to screenshot
screenshot_wait_for: Seconds to wait before capture (default: 2.0)
output_path: Optional path to save the screenshot file
Returns:
        Dictionary with the screenshot: base64-encoded PNG when output_path
        is omitted, otherwise the path of the saved file
"""
async with httpx.AsyncClient(timeout=90.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/screenshot",
json={
"url": url,
"screenshot_wait_for": screenshot_wait_for,
"output_path": output_path
}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
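# Example (sketch, assuming the /screenshot endpoint returns the base64 PNG
# under a "screenshot" key when output_path is omitted -- the key name is an
# assumption, not a confirmed schema): decoding the payload to a local file.
#   import base64
#   result = await capture_screenshot("https://example.com")
#   if result.get("success") and "screenshot" in result:
#       with open("page.png", "wb") as f:
#           f.write(base64.b64decode(result["screenshot"]))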
@mcp.tool()
async def generate_pdf(
url: str,
output_path: Optional[str] = None
) -> dict:
"""Generate a PDF document from a webpage.
Args:
url: The URL to convert to PDF
output_path: Optional path to save the PDF file
Returns:
        Dictionary with the PDF: base64-encoded data when output_path is
        omitted, otherwise the path of the saved file
"""
async with httpx.AsyncClient(timeout=90.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/pdf",
json={
"url": url,
"output_path": output_path
}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
@mcp.tool()
async def execute_javascript(
url: str,
scripts: List[str]
) -> dict:
"""Execute JavaScript snippets on a webpage.
Runs JavaScript in the browser context and returns results.
    Each script should be a self-contained expression that evaluates to a
    value, e.g. an IIFE or an async IIFE.
Args:
url: The URL to execute scripts on
scripts: List of JavaScript code snippets to execute in order
Returns:
Full CrawlResult with execution results, markdown, links, etc.
"""
async with httpx.AsyncClient(timeout=120.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/execute_js",
json={
"url": url,
"scripts": scripts
}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"url": url
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"url": url
}
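# Example (sketch): scripts that evaluate to a value, as the docstring
# requires. Both snippets below are plain JavaScript expressions; the URL is
# a placeholder.
#   scripts = [
#       "(() => document.title)()",  # sync IIFE
#       "(async () => { await new Promise(r => setTimeout(r, 300)); return document.links.length; })()",  # async IIFE
#   ]
#   result = await execute_javascript("https://example.com", scripts)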
@mcp.tool()
async def crawl_urls(
urls: List[str],
browser_config: Optional[Dict[str, Any]] = None,
crawler_config: Optional[Dict[str, Any]] = None,
hooks: Optional[Dict[str, Any]] = None
) -> dict:
"""Crawl multiple URLs and return results.
Args:
urls: List of URLs to crawl (1-100 URLs)
browser_config: Optional browser configuration overrides
crawler_config: Optional crawler configuration overrides
hooks: Optional user-provided hook functions for customization
Returns:
Dictionary with crawl results for all URLs
"""
    if not urls:
return {
"success": False,
"error": "No URLs provided"
}
if len(urls) > 100:
return {
"success": False,
"error": f"Too many URLs ({len(urls)}). Maximum is 100."
}
async with httpx.AsyncClient(timeout=300.0) as client:
try:
payload = {"urls": urls}
if browser_config:
payload["browser_config"] = browser_config
if crawler_config:
payload["crawler_config"] = crawler_config
if hooks:
payload["hooks"] = hooks
response = await client.post(
f"{CRAWL4AI_BASE_URL}/crawl",
json=payload
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"urls": urls
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"urls": urls
}
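# Example (hypothetical config shape): overriding crawler behavior for a
# batch. The keys accepted by /crawl depend on the Crawl4AI server version;
# "cache_mode" here is an illustrative placeholder, not a confirmed schema.
#   result = await crawl_urls(
#       urls=["https://example.com", "https://example.org"],
#       crawler_config={"cache_mode": "bypass"},
#   )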
@mcp.tool()
async def ask_crawl4ai(
query: str,
context_type: str = "all",
score_ratio: float = 0.5,
max_results: int = 20
) -> dict:
"""Query the Crawl4AI library documentation and context.
Use this to get information about Crawl4AI features, code examples,
and best practices directly from the library documentation.
Args:
        query: Search query used to filter relevant content
context_type: Type of context - 'code', 'doc', or 'all' (default)
score_ratio: Minimum score as fraction of max score (default: 0.5)
max_results: Maximum results to return (default: 20)
Returns:
Dictionary with filtered documentation/code context
"""
async with httpx.AsyncClient(timeout=30.0) as client:
try:
response = await client.post(
f"{CRAWL4AI_BASE_URL}/ask",
json={
"query": query,
"context_type": context_type,
"score_ratio": score_ratio,
"max_results": max_results
}
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
return {
"success": False,
"error": f"HTTP error: {str(e)}",
"query": query
}
except Exception as e:
return {
"success": False,
"error": f"Request failed: {str(e)}",
"query": query
}
if __name__ == "__main__":
# Run the MCP server with stdio transport
mcp.run(transport="stdio")
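# Example (sketch): registering this server with an MCP client that supports
# stdio transport, using the common "mcpServers" JSON convention. The file
# path is a placeholder; adjust the env block to match your deployment.
#   {
#     "mcpServers": {
#       "crawl4ai": {
#         "command": "python",
#         "args": ["/path/to/crawl4ai_mcp.py"],
#         "env": {"CRAWL4AI_BASE_URL": "http://localhost:11235"}
#       }
#     }
#   }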