#!/usr/bin/env python3
"""
Crawl4AI Backend Script
Handles web crawling requests from the Next.js frontend
"""
import asyncio
import json
import sys
import os
import logging
from io import StringIO
from contextlib import redirect_stdout, redirect_stderr
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai import BFSDeepCrawlStrategy, DFSDeepCrawlStrategy
from urllib.parse import urljoin, urlparse
from collections import deque
# Suppress Crawl4AI logging output to keep JSON clean
os.environ['CRAWL4AI_QUIET'] = '1'
# Configure logging to go to stderr
logging.basicConfig(
level=logging.ERROR,
stream=sys.stderr,
format='%(levelname)s: %(message)s'
)
async def crawl(url, extraction_type="markdown", js_code=None, css_selector=None, llm_prompt=None, headless=True,
deep_crawl=False, crawl_strategy="bfs", max_pages=10, stream_progress=False, session_id=None):
"""
Main crawl function that processes different extraction strategies
Args:
url: Target URL to crawl
extraction_type: Type of extraction (markdown, css, llm)
js_code: Optional JavaScript code to execute
css_selector: CSS selector for CSS extraction
llm_prompt: Prompt for LLM extraction
headless: Run browser in headless mode
deep_crawl: Whether to crawl multiple pages (entire site)
crawl_strategy: Strategy for deep crawling (bfs or dfs)
        max_pages: Maximum number of pages to crawl (limit for safety; 0 means unlimited)
stream_progress: Whether to stream progress updates to stderr
session_id: Session ID for progress tracking via file
"""
def write_progress(crawled, total, current_url="", status="crawling"):
"""Write progress to file for polling"""
if session_id:
progress_file = f"/tmp/crawl_progress_{session_id}.json"
progress_data = {
"crawled": crawled,
"total": total if total != float('inf') else "unlimited",
"currentUrl": current_url,
"status": status
}
try:
with open(progress_file, 'w') as f:
json.dump(progress_data, f)
            except Exception:
                # Don't fail the crawl if progress writing fails
                pass
def emit_progress(crawled, total, current_url=""):
"""Emit progress to stderr for streaming API"""
if stream_progress:
progress_data = json.dumps({
"crawled": crawled,
"total": total if total != float('inf') else "unlimited",
"currentUrl": current_url,
"status": "crawling"
})
print(f"PROGRESS:{progress_data}", file=sys.stderr, flush=True)
# Always write to file if session_id exists
write_progress(crawled, total, current_url)
try:
# Configure browser - disable verbose logging
browser_config = BrowserConfig(
headless=headless,
verbose=False
)
# Configure crawler run
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=1
)
# Add JavaScript code if provided
if js_code:
run_config.js_code = [js_code]
        # Configure deep crawling if enabled. Note: the strategy objects are
        # currently only used as a flag; the manual loop below implements
        # BFS/DFS itself so that per-page progress can be reported.
        crawl_strategy_obj = None
        if deep_crawl:
            # Use infinity for unlimited crawling (max_pages = 0 means unlimited)
            actual_max_pages = float('inf') if max_pages == 0 else max_pages
            # Choose strategy based on user preference
            if crawl_strategy == "dfs":
                crawl_strategy_obj = DFSDeepCrawlStrategy(
                    max_depth=10,
                    max_pages=actual_max_pages
                )
            else:  # bfs (default)
                crawl_strategy_obj = BFSDeepCrawlStrategy(
                    max_depth=10,
                    max_pages=actual_max_pages
                )
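        # Frontier semantics for the manual loop below: pages are always taken
        # with popleft(), so append() gives FIFO order (breadth-first) while
        # appendleft() gives LIFO order (depth-first).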
        # Redirect stdout to suppress Crawl4AI's own output and keep the JSON
        # on stdout clean; leave stderr untouched when streaming progress so
        # the PROGRESS: lines can reach the parent process.
        stdout_buffer = StringIO()
        stderr_buffer = StringIO()
        stderr_target = sys.stderr if stream_progress else stderr_buffer
        crawled_results = None
        with redirect_stdout(stdout_buffer), redirect_stderr(stderr_target):
            async with AsyncWebCrawler(config=browser_config) as crawler:
                if deep_crawl and crawl_strategy_obj:
                    write_progress(0, actual_max_pages, url, "starting")
                    # Custom deep crawl with progress tracking
                    visited = set()
                    to_visit = deque([url])
                    crawled_results = []
                    base_domain = urlparse(url).netloc
                    while to_visit and len(crawled_results) < actual_max_pages:
                        current_url = to_visit.popleft()
                        # Skip if already visited
                        if current_url in visited:
                            continue
                        visited.add(current_url)
                        try:
                            # Crawl the current page and report progress
                            emit_progress(len(crawled_results), actual_max_pages, current_url)
                            result = await crawler.arun(url=current_url, config=run_config)
                            crawled_results.append(result)
                            # Extract links from the page (same domain only)
                            if hasattr(result, 'links') and result.links:
                                for link_data in result.links.get('internal', []):
                                    link_url = link_data.get('href', '')
                                    if not link_url:
                                        continue
                                    # Make the link absolute
                                    absolute_url = urljoin(current_url, link_url)
                                    parsed = urlparse(absolute_url)
                                    # Only add if same domain and not visited
                                    if parsed.netloc == base_domain and absolute_url not in visited:
                                        if crawl_strategy == "bfs":
                                            to_visit.append(absolute_url)      # BFS: add to end
                                        else:
                                            to_visit.appendleft(absolute_url)  # DFS: add to front
                        except Exception as e:
                            # Skip failed pages (visible on stderr only when
                            # progress streaming leaves stderr unredirected)
                            print(f"Failed to crawl {current_url}: {e}", file=sys.stderr)
                            continue
                    write_progress(len(crawled_results), actual_max_pages, "Processing results...", "processing")
                else:
                    # Single-page crawl
                    result = await crawler.arun(url=url, config=run_config)
        # Process result based on extraction type (outside the redirect block)
        if extraction_type == "markdown":
            # Handle deep crawl results
            if deep_crawl:
                if crawled_results:
pages = []
total_words = 0
# Process each crawled page
for page_result in crawled_results:
content = page_result.markdown if hasattr(page_result, 'markdown') else ""
word_count = len(content.split()) if content else 0
total_words += word_count
# Extract title from metadata or URL
title = ""
if hasattr(page_result, 'metadata') and page_result.metadata:
title = page_result.metadata.get('title', '')
if not title and hasattr(page_result, 'url'):
# Use last part of URL as title
title = page_result.url.split('/')[-1] or page_result.url
pages.append({
"url": page_result.url if hasattr(page_result, 'url') else "Unknown",
"content": content,
"wordCount": word_count,
"title": title or f"Page {len(pages) + 1}"
})
# Write final progress before returning
write_progress(len(pages), actual_max_pages, "Completed", "completed")
return {
"success": True,
"deepCrawl": True,
"pages": pages,
"totalPages": len(pages),
"totalWords": total_words,
"status": "completed"
}
else:
return {
"success": False,
"error": "Deep crawl found no pages. Try adjusting max_pages or check URL."
}
else:
# Single page result
content = result.markdown
word_count = len(content.split()) if content else 0
return {
"success": True,
"content": content,
"wordCount": word_count,
"status": "completed"
}
elif extraction_type == "css":
# CSS extraction only works with single page for now
if deep_crawl:
return {
"success": False,
"error": "CSS extraction not yet supported with deep crawl"
}
content = result.markdown
return {
"success": True,
"content": content,
"status": "completed",
"note": "CSS extraction - showing markdown. Implement CSS selector logic as needed."
}
elif extraction_type == "llm":
# LLM extraction only works with single page for now
if deep_crawl:
return {
"success": False,
"error": "LLM extraction not yet supported with deep crawl"
}
content = result.markdown
return {
"success": True,
"content": content,
"status": "completed",
"note": "LLM extraction requires API keys. Configure LLMExtractionStrategy."
}
else:
return {
"success": False,
"error": f"Unknown extraction type: {extraction_type}"
}
except Exception as e:
return {
"success": False,
"error": str(e)
}
def main():
"""
Main entry point - reads JSON from stdin and returns JSON to stdout
"""
try:
# Read input from stdin
input_data = sys.stdin.read()
params = json.loads(input_data)
# Extract parameters
url = params.get("url")
extraction_type = params.get("extractionType", "markdown")
js_code = params.get("jsCode")
css_selector = params.get("cssSelector")
llm_prompt = params.get("llmPrompt")
headless = params.get("headless", True)
deep_crawl = params.get("deepCrawl", False)
crawl_strategy = params.get("crawlStrategy", "bfs")
max_pages = params.get("maxPages", 10)
stream_progress = params.get("streamProgress", False)
session_id = params.get("sessionId")
# Validate URL
if not url:
print(json.dumps({
"success": False,
"error": "URL is required"
}))
sys.exit(1)
        # Write initial progress if a session ID exists
        if session_id and deep_crawl:
            progress_file = f"/tmp/crawl_progress_{session_id}.json"
            initial_progress = {
                "crawled": 0,
                "total": "unlimited" if max_pages == 0 else max_pages,
                "currentUrl": url,
                "status": "initializing"
            }
            try:
                with open(progress_file, 'w') as f:
                    json.dump(initial_progress, f)
            except Exception:
                # Non-fatal; the crawl proceeds without the progress file
                pass
# Run the crawl
result = asyncio.run(crawl(
url=url,
extraction_type=extraction_type,
js_code=js_code,
css_selector=css_selector,
llm_prompt=llm_prompt,
headless=headless,
deep_crawl=deep_crawl,
crawl_strategy=crawl_strategy,
max_pages=max_pages,
stream_progress=stream_progress,
session_id=session_id
))
        # Clean up the progress file
        if session_id:
            progress_file = f"/tmp/crawl_progress_{session_id}.json"
            try:
                if os.path.exists(progress_file):
                    os.remove(progress_file)
            except OSError:
                pass
# Output result as JSON
print(json.dumps(result))
sys.exit(0)
except Exception as e:
print(json.dumps({
"success": False,
"error": f"Backend error: {str(e)}"
}))
sys.exit(1)
if __name__ == "__main__":
main()
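
For reference, here is a minimal sketch of how a caller might drive this script from Python, mirroring what the Next.js frontend does over stdin/stdout. The filename crawl4ai_backend.py and the target URL are assumptions for illustration; the source does not name the file.

import json
import subprocess

# Build a request using the parameter names the script parses from stdin.
request = {
    "url": "https://example.com",        # assumed example URL
    "extractionType": "markdown",
    "deepCrawl": True,
    "crawlStrategy": "bfs",
    "maxPages": 5,
    "sessionId": "demo-session",         # enables /tmp progress-file polling
}

# Pipe the JSON request into the script and parse the JSON reply from stdout.
proc = subprocess.run(
    ["python3", "crawl4ai_backend.py"],  # assumed filename
    input=json.dumps(request),
    capture_output=True,
    text=True,
)
result = json.loads(proc.stdout)
if result.get("success"):
    print(result.get("totalPages", 1), "page(s) crawled")
else:
    print("Error:", result["error"])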