# We provide all the information about MCP servers via our MCP API:
#   curl -X GET 'https://glama.ai/api/mcp/v1/servers/joedank/mcpcrawl4ai'
# If you have feedback or need assistance with the MCP directory API, please join our Discord server.
#!/usr/bin/env python3
"""
Custom Crawl4AI MCP Server
Wraps the Crawl4AI Docker API endpoints with FastMCP for reliable stdio transport.
"""
import os
import httpx
from fastmcp import FastMCP
from typing import Optional, List, Dict, Any
# Initialize the FastMCP server instance; tools are registered via @mcp.tool().
mcp = FastMCP("Crawl4AI")
# Base URL of the Crawl4AI Docker API, from environment or default (Pi instance).
CRAWL4AI_BASE_URL = os.getenv("CRAWL4AI_BASE_URL", "http://192.168.0.87:11235")
# Bearer token for authenticating against the Crawl4AI instance.
# NOTE(review): default "test_api_code" is a placeholder — confirm whether the
# target instance actually enforces token auth.
CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN", "test_api_code")
@mcp.tool()
async def scrape_markdown(
    url: str,
    filter_type: str = "fit",
    query: Optional[str] = None,
    cache_bust: str = "0",
    stealth: bool = True  # Default to stealth mode for anti-bot detection
) -> dict:
    """Extract clean markdown content from a webpage.

    Args:
        url: The URL to scrape (must be http/https)
        filter_type: Content filter strategy - 'fit' (default), 'raw', 'bm25', or 'llm'
        query: Optional query string for BM25/LLM filters
        cache_bust: Cache-bust revision counter (default: '0')
        stealth: Enable stealth mode to bypass bot detection (default: True)

    Returns:
        Dictionary with 'markdown' content and 'success' status
    """
    # NOTE(review): filter_type, query, and cache_bust are accepted for
    # interface stability but are not forwarded in the request payload —
    # confirm the server-side parameter names before wiring them through.
    #
    # Fix: attach the configured API token as a Bearer header. It was
    # previously read from the environment but never sent with any request.
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=120.0, headers=headers) as client:
        try:
            # Use /crawl endpoint with optional stealth browser profile
            payload = {"urls": [url]}
            if stealth:
                payload["browser_config"] = {
                    "headless": True,
                    "stealth": True
                }
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json=payload
            )
            response.raise_for_status()
            data = response.json()
            # Extract markdown from the first crawl result
            results = data.get("results", [])
            if results:
                result = results[0]
                # The API may return markdown as a plain string or as a dict
                # of variants (raw, fit, with citations, references).
                md = result.get("markdown", "")
                if isinstance(md, dict):
                    return {
                        "success": result.get("success", True),
                        "url": url,
                        "markdown": md.get("raw_markdown", ""),
                        "raw_markdown": md.get("raw_markdown", ""),
                        "markdown_with_citations": md.get("markdown_with_citations", ""),
                        "references_markdown": md.get("references_markdown", ""),
                        "fit_markdown": md.get("fit_markdown", ""),
                        "stealth_mode": stealth
                    }
                return {
                    "success": result.get("success", True),
                    "url": url,
                    "markdown": md,
                    "raw_markdown": md,
                    "stealth_mode": stealth
                }
            return {
                "success": False,
                "error": "No results returned",
                "url": url
            }
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def extract_html(url: str) -> dict:
    """Extract preprocessed HTML from a webpage.

    Crawls the URL and returns sanitized HTML structures useful for
    schema extraction or further processing.

    Args:
        url: The URL to extract HTML from

    Returns:
        Dictionary with HTML content and processing status
    """
    # Fix: send the configured API token as a Bearer header (it was
    # previously read from the environment but never used in any request).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=60.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json={"urls": [url], "browser_config": {"headless": True, "stealth": True}}
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            if results:
                result = results[0]
                return {
                    "success": result.get("success", True),
                    "url": url,
                    "html": result.get("html", ""),
                    "cleaned_html": result.get("cleaned_html", "")
                }
            return {"success": False, "error": "No results", "url": url}
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def capture_screenshot(
    url: str,
    screenshot_wait_for: float = 2.0,
    output_path: Optional[str] = None
) -> dict:
    """Capture a full-page PNG screenshot of a webpage.

    Args:
        url: The URL to screenshot
        screenshot_wait_for: Seconds to wait before capture (default: 2.0)
        output_path: Optional path to save the screenshot file

    Returns:
        Dictionary with screenshot data (base64 if no output_path, or file path)
    """
    import base64
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=90.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json={
                    "urls": [url],
                    "crawler_config": {
                        "screenshot": True,
                        "screenshot_wait_for": screenshot_wait_for
                    },
                    "browser_config": {"headless": True, "stealth": True}
                }
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            if results:
                result = results[0]
                # Screenshot arrives base64-encoded in the JSON result.
                screenshot_b64 = result.get("screenshot")
                if screenshot_b64:
                    if output_path:
                        # Decode and persist to disk instead of returning the
                        # (potentially large) base64 payload to the client.
                        screenshot_bytes = base64.b64decode(screenshot_b64)
                        with open(output_path, 'wb') as f:
                            f.write(screenshot_bytes)
                        return {
                            "success": True,
                            "url": url,
                            "path": output_path,
                            "size": len(screenshot_bytes)
                        }
                    return {
                        "success": True,
                        "url": url,
                        "screenshot": screenshot_b64
                    }
            return {"success": False, "error": "No screenshot data", "url": url}
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def generate_pdf(
    url: str,
    output_path: Optional[str] = None
) -> dict:
    """Generate a PDF document from a webpage.

    Args:
        url: The URL to convert to PDF
        output_path: Optional path to save the PDF file

    Returns:
        Dictionary with PDF data (base64 if no output_path, or file path)
    """
    import base64
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=90.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json={
                    "urls": [url],
                    "crawler_config": {"pdf": True},
                    "browser_config": {"headless": True, "stealth": True}
                }
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            if results:
                result = results[0]
                pdf_b64 = result.get("pdf")
                # Fix: honor output_path — it was documented but previously
                # ignored, so the "save to file" path never worked.
                # NOTE(review): assumes the API returns the PDF base64-encoded
                # in JSON, mirroring the screenshot endpoint — confirm.
                if pdf_b64 and output_path:
                    pdf_bytes = base64.b64decode(pdf_b64)
                    with open(output_path, 'wb') as f:
                        f.write(pdf_bytes)
                    return {
                        "success": True,
                        "url": url,
                        "path": output_path,
                        "size": len(pdf_bytes)
                    }
                return {
                    "success": result.get("success", True),
                    "url": url,
                    "pdf": pdf_b64,
                    "html": result.get("html", "")[:500]
                }
            return {"success": False, "error": "No results", "url": url}
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def execute_javascript(
    url: str,
    scripts: List[str]
) -> dict:
    """Execute JavaScript snippets on a webpage.

    Runs JavaScript in the browser context and returns results.
    Each script should be an expression that returns a value (IIFE or async function).

    Args:
        url: The URL to execute scripts on
        scripts: List of JavaScript code snippets to execute in order

    Returns:
        Full CrawlResult with execution results, markdown, links, etc.
    """
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=120.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json={
                    "urls": [url],
                    "js_code": scripts,
                    "browser_config": {"headless": True, "stealth": True}
                }
            )
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])
            if results:
                # Return the full first result so callers can inspect
                # js_execution_result, markdown, links, etc.
                return {
                    "success": True,
                    "url": url,
                    "result": results[0]
                }
            return {"success": False, "error": "No results", "url": url}
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "url": url
            }
@mcp.tool()
async def crawl_urls(
    urls: List[str],
    browser_config: Optional[Dict[str, Any]] = None,
    crawler_config: Optional[Dict[str, Any]] = None,
    hooks: Optional[Dict[str, Any]] = None
) -> dict:
    """Crawl multiple URLs and return results.

    Args:
        urls: List of URLs to crawl (1-100 URLs)
        browser_config: Optional browser configuration overrides (default: stealth mode)
        crawler_config: Optional crawler configuration overrides
        hooks: Optional user-provided hook functions for customization

    Returns:
        Dictionary with crawl results for all URLs
    """
    # Validate batch size before touching the network.
    if not urls:
        return {
            "success": False,
            "error": "No URLs provided"
        }
    if len(urls) > 100:
        return {
            "success": False,
            "error": f"Too many URLs ({len(urls)}). Maximum is 100."
        }
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=300.0, headers=headers) as client:
        try:
            payload = {"urls": urls}
            # Default to stealth mode if no browser_config provided
            payload["browser_config"] = browser_config or {"headless": True, "stealth": True}
            if crawler_config:
                payload["crawler_config"] = crawler_config
            if hooks:
                payload["hooks"] = hooks
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/crawl",
                json=payload
            )
            response.raise_for_status()
            return response.json()
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)}",
                "urls": urls
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "urls": urls
            }
@mcp.tool()
async def ask_crawl4ai(
    query: str,
    context_type: str = "all",
    score_ratio: float = 0.5,
    max_results: int = 20
) -> dict:
    """Query the Crawl4AI library documentation and context.

    Note: This endpoint may not be available on all crawl4ai instances.

    Args:
        query: Search query to filter relevant content (RECOMMENDED)
        context_type: Type of context - 'code', 'doc', or 'all' (default)
        score_ratio: Minimum score as fraction of max score (default: 0.5)
        max_results: Maximum results to return (default: 20)

    Returns:
        Dictionary with filtered documentation/code context
    """
    # Fix: authenticate with the configured API token (previously unused).
    headers = {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"}
    async with httpx.AsyncClient(timeout=30.0, headers=headers) as client:
        try:
            response = await client.post(
                f"{CRAWL4AI_BASE_URL}/ask",
                json={
                    "query": query,
                    "context_type": context_type,
                    "score_ratio": score_ratio,
                    "max_results": max_results
                }
            )
            response.raise_for_status()
            return response.json()
        except httpx.HTTPError as e:
            return {
                "success": False,
                "error": f"HTTP error: {str(e)} - /ask endpoint may not be available",
                "query": query
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Request failed: {str(e)}",
                "query": query
            }
if __name__ == "__main__":
    # Run the MCP server over stdio transport, the framing MCP clients
    # (e.g. Claude Desktop) expect when spawning this script as a subprocess.
    mcp.run(transport="stdio")