Crawl4AI MCP Server
by weidwonder
#!/usr/bin/env python
import asyncio
import json
from mcp.server.fastmcp import FastMCP
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from search import SearchManager
mcp = FastMCP("Crawl4AI")
crawler = None
search_manager = None

async def initialize_search_manager():
    """Lazily create the shared SearchManager on first use."""
    global search_manager
    if search_manager is None:
        search_manager = SearchManager()
        print("Search manager initialized with engines:",
              [type(e).__name__ for e in search_manager.engines])

async def initialize_crawler():
    """Create the shared AsyncWebCrawler and open its browser session.

    The per-request CrawlerRunConfig is built inside read_url, so only the
    browser-level configuration is needed here.
    """
    global crawler
    browser_config = BrowserConfig(headless=True)
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.__aenter__()

async def close_crawler():
    """Tear down the shared crawler, if one was started."""
    global crawler
    if crawler:
        await crawler.__aexit__(None, None, None)
        crawler = None
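
# Note: the explicit __aenter__/__aexit__ pair above keeps one browser session
# alive across tool calls. For one-off crawls, the same lifecycle can be written
# as a context manager (illustrative sketch, not used by this server):
#
#   async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as c:
#       result = await c.arun(url="https://example.com")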

@mcp.tool()
async def read_url(url: str, format: str = "markdown_with_citations") -> str:
    """Crawl a webpage and return its content in the requested format.

    Args:
        url: The URL to crawl.
        format: The format of the content to return. Options:
            - raw_markdown: The basic HTML→Markdown conversion
            - markdown_with_citations: Markdown with inline citations that reference links collected at the end
            - references_markdown: The references/citations themselves (populated when citations=True)
            - fit_markdown: The filtered "fit" markdown produced by the content filter
            - fit_html: The filtered HTML that generated fit_markdown
            Any other value falls back to markdown_with_citations.
    """
    global crawler
    if not crawler:
        await initialize_crawler()

    # Per-request crawl configuration. PruningContentFilter populates the
    # fit_markdown / fit_html outputs.
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=10,
        excluded_tags=["nav", "footer", "header"],
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(),
            options={"citations": True},
        ),
    )
    try:
        result = await crawler.arun(url=url, config=run_config)
        # Map the requested format to the matching attribute on the markdown
        # result; unknown values fall back to markdown_with_citations.
        valid_formats = {
            "raw_markdown",
            "markdown_with_citations",
            "references_markdown",
            "fit_markdown",
            "fit_html",
        }
        attr = format if format in valid_formats else "markdown_with_citations"
        content = getattr(result.markdown_v2, attr)
        # Ensure the content is a valid UTF-8 string.
        if content is not None:
            if isinstance(content, str):
                # Round-trip through UTF-8 (encode, then decode) so invalid
                # byte sequences are replaced; this works around encoding
                # issues seen on Windows.
                try:
                    content = content.encode('utf-8', errors='replace').decode('utf-8')
                except Exception as e:
                    print(f"Warning: Error handling content encoding: {str(e)}")
            else:
                # Not a string: try to convert it, then normalize to UTF-8.
                try:
                    content = str(content)
                    content = content.encode('utf-8', errors='replace').decode('utf-8')
                except Exception as e:
                    print(f"Warning: Error converting content to string: {str(e)}")
                    content = f"Error: Could not convert content to string: {str(e)}"
        return content
    except Exception as e:
        error_msg = f"Error crawling URL: {str(e)}"
        print(error_msg)
        return json.dumps({"error": error_msg}, ensure_ascii=False)
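
# Illustrative direct call (MCP clients normally invoke this as a tool; the
# URL is a placeholder):
#
#   text = await read_url("https://example.com", format="fit_markdown")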

@mcp.tool()
async def search(query: str, num_results: int = 10, engine: str = "duckduckgo") -> str:
    """Perform a web search and return the results.

    Args:
        query: The search query string.
        num_results: Number of results to return (default: 10).
        engine: The search engine to use. Options:
            - "duckduckgo": Search with DuckDuckGo (default)
            - "google": Search with Google (requires an API key to be configured)
    """
    global search_manager
    try:
        await initialize_search_manager()
        if not search_manager or not search_manager.engines:
            return json.dumps({"error": "No search engines available"}, ensure_ascii=False)
        results = await search_manager.search(query, num_results, engine)
        print(f"Search results: {results}")  # Debug logging
        # Ensure the JSON string is valid UTF-8 (works around Windows
        # encoding issues).
        try:
            json_str = json.dumps(results, ensure_ascii=False, indent=2)
            json_str = json_str.encode('utf-8', errors='replace').decode('utf-8')
            return json_str
        except Exception as e:
            error_msg = f"Error encoding search results: {str(e)}"
            print(error_msg)
            return json.dumps({"error": error_msg}, ensure_ascii=False)
    except Exception as e:
        error_msg = f"Search error: {str(e)}"
        print(error_msg)  # Error logging
        try:
            return json.dumps({"error": error_msg}, ensure_ascii=False)
        except Exception as json_e:
            # If JSON serialization fails, fall back to a plain error message.
            return f"Error: {error_msg}. JSON encoding failed: {str(json_e)}"

async def cleanup():
    await close_crawler()


if __name__ == "__main__":
    try:
        mcp.run()
    finally:
        # mcp.run() blocks until the server exits; close the browser afterwards.
        asyncio.run(cleanup())
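
# FastMCP's run() serves over stdio by default, so an MCP client can launch
# this script directly. A minimal client entry might look like the following
# (the file name "server.py" is an assumption):
#
#   {"mcpServers": {"crawl4ai": {"command": "python", "args": ["server.py"]}}}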