crawl_url
Fetch webpage text with crawl4ai, extracting a URL's content for quoting or analysis.
Instructions
Fetch a URL with crawl4ai when you need the actual page text for quoting or analysis.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | HTTP(S) URL (ideally from web_search output) | |
| reasoning | Yes | Why you're crawling this URL (required for analytics) | |
| max_chars | No | Trim the textual result to this many characters | CRAWL_MAX_CHARS |
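The sketch below shows one way a client could call the tool over MCP stdio using the Python MCP SDK. The launch command, example URL, and reasoning string are assumptions made for illustration; only the argument names come from the schema above.

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Assumption: the server is started as a module; adjust to your actual setup.
server_params = StdioServerParameters(command="python", args=["-m", "searxng_mcp.server"])


async def main() -> None:
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # url and reasoning are required; max_chars is optional and
            # defaults to CRAWL_MAX_CHARS on the server.
            result = await session.call_tool(
                "crawl_url",
                arguments={
                    "url": "https://example.com/article",
                    "reasoning": "Need the page text to quote a definition",
                    "max_chars": 4000,
                },
            )
            print(result.content)


asyncio.run(main())
```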
Implementation Reference
- `src/searxng_mcp/server.py:102-135` (handler): Primary handler and registration for the `crawl_url` MCP tool. Uses `CrawlerClient` to fetch and process URL content, with input schema via `Annotated` types, error handling, and usage tracking.

  ```python
  @mcp.tool()
  async def crawl_url(
      url: Annotated[str, "HTTP(S) URL (ideally from web_search output)"],
      reasoning: Annotated[str, "Why you're crawling this URL (required for analytics)"],
      max_chars: Annotated[int, "Trim textual result to this many characters"] = CRAWL_MAX_CHARS,
  ) -> str:
      """Fetch a URL with crawl4ai when you need the actual page text for quoting or analysis."""
      start_time = time.time()
      success = False
      error_msg = None
      result = ""
      try:
          text = await crawler_client.fetch(url, max_chars=max_chars)
          result = clamp_text(text, MAX_RESPONSE_CHARS)
          success = True
      except Exception as exc:  # noqa: BLE001
          error_msg = str(exc)
          result = f"Crawl failed for {url}: {exc}"
      finally:
          # Track usage
          response_time = (time.time() - start_time) * 1000
          tracker.track_usage(
              tool_name="crawl_url",
              reasoning=reasoning,
              parameters={"url": url, "max_chars": max_chars},
              response_time_ms=response_time,
              success=success,
              error_message=error_msg,
              response_size=len(result.encode("utf-8")),
          )
      return result
  ```
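  Note that the handler never raises: crawl failures come back to the caller as a `Crawl failed for ...` string and are still recorded by the tracker. The sketch below is a hypothetical pytest check of that contract (it assumes pytest-asyncio and is not a test from the repository):

  ```python
  import pytest

  from searxng_mcp import server


  class _NoopTracker:
      """Stub that swallows the usage-tracking call the handler always makes."""

      def track_usage(self, **kwargs) -> None:
          pass


  @pytest.mark.asyncio
  async def test_crawl_url_returns_error_string(monkeypatch):
      async def failing_fetch(url, *, max_chars=None):
          raise RuntimeError("boom")

      # Replace the module-level crawler client's fetch and the tracker with stubs.
      monkeypatch.setattr(server.crawler_client, "fetch", failing_fetch)
      monkeypatch.setattr(server, "tracker", _NoopTracker())

      result = await server.crawl_url(url="https://example.com", reasoning="unit test")
      assert result.startswith("Crawl failed for https://example.com")
  ```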
- `src/searxng_mcp/crawler.py:14-37` (helper): Core crawling logic in `CrawlerClient.fetch()`, called by the tool handler. Uses crawl4ai's `AsyncWebCrawler` to fetch the page, extract markdown/content, clean, and trim the text.

  ```python
  async def fetch(self, url: str, *, max_chars: int | None = None) -> str:
      """Fetch *url* and return cleaned markdown, trimmed to *max_chars*."""
      run_config = CrawlerRunConfig(cache_mode=self.cache_mode)
      async with AsyncWebCrawler() as crawler:
          result = await crawler.arun(url=url, config=run_config)
      if getattr(result, "error", None):
          raise RuntimeError(str(result.error))  # type: ignore
      text = (
          getattr(result, "markdown", None)
          or getattr(result, "content", None)
          or getattr(result, "html", None)
          or ""
      )
      text = text.strip()
      if not text:
          raise RuntimeError("Crawl completed but returned no readable content.")
      limit = max_chars or CRAWL_MAX_CHARS
      return clamp_text(text, limit)
  ```
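  For reference, a minimal sketch of driving `fetch()` directly outside the MCP server (the URL and character limit are arbitrary):

  ```python
  import asyncio

  from searxng_mcp.crawler import CrawlerClient


  async def main() -> None:
      client = CrawlerClient()
      # Raises RuntimeError on crawl errors or empty content; otherwise returns
      # cleaned markdown trimmed to max_chars.
      text = await client.fetch("https://example.com", max_chars=2000)
      print(text)


  asyncio.run(main())
  ```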
- `src/searxng_mcp/crawler.py:8-11` (helper): `CrawlerClient` class initialization and configuration, instantiated globally as `crawler_client` in `server.py`.

  ```python
  class CrawlerClient:
      """Lightweight wrapper around crawl4ai's async crawler."""

      def __init__(self, *, cache_mode: CacheMode = CacheMode.BYPASS) -> None:
  ```
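  The default `CacheMode.BYPASS` means every fetch hits the network. A different crawl4ai cache mode can be passed at construction time; a brief sketch, assuming crawl4ai's `CacheMode.ENABLED` value:

  ```python
  from crawl4ai import CacheMode

  from searxng_mcp.crawler import CrawlerClient

  # Serve repeat fetches from crawl4ai's local cache instead of re-crawling.
  cached_client = CrawlerClient(cache_mode=CacheMode.ENABLED)
  ```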
- `src/searxng_mcp/server.py:21-102` (registration): Import and global instantiation of `CrawlerClient` used by the `crawl_url` tool.

  ```python
  from .crawler import CrawlerClient
  from .errors import ErrorParser
  from .extractor import DataExtractor
  from .github import GitHubClient, RepoInfo
  from .images import PixabayClient
  from .registry import PackageInfo, PackageRegistryClient
  from .search import SearxSearcher
  from .service_health import ServiceHealthChecker
  from .tracking import get_tracker

  mcp = FastMCP("web-research-assistant")

  searcher = SearxSearcher()
  crawler_client = CrawlerClient()
  registry_client = PackageRegistryClient()
  github_client = GitHubClient()
  pixabay_client = PixabayClient()
  error_parser = ErrorParser()
  api_docs_detector = APIDocsDetector()
  api_docs_extractor = APIDocsExtractor()
  data_extractor = DataExtractor()
  tech_comparator = TechComparator(searcher, github_client, registry_client)
  changelog_fetcher = ChangelogFetcher(github_client, registry_client)
  service_health_checker = ServiceHealthChecker(crawler_client)
  tracker = get_tracker()


  def _format_search_hits(hits):
      lines = []
      for idx, hit in enumerate(hits, 1):
          snippet = f"\n{hit.snippet}" if hit.snippet else ""
          lines.append(f"{idx}. {hit.title} — {hit.url}{snippet}")
      body = "\n\n".join(lines)
      return clamp_text(body, MAX_RESPONSE_CHARS)


  @mcp.tool()
  async def web_search(
      query: Annotated[str, "Natural-language web query"],
      reasoning: Annotated[str, "Why you're using this tool (required for analytics)"],
      category: Annotated[
          str, "Optional SearXNG category (general, images, news, it, science, etc.)"
      ] = DEFAULT_CATEGORY,
      max_results: Annotated[int, "How many ranked hits to return (1-10)"] = DEFAULT_MAX_RESULTS,
  ) -> str:
      """Use this first to gather fresh web search results via the local SearXNG instance."""
      start_time = time.time()
      success = False
      error_msg = None
      result = ""
      try:
          hits = await searcher.search(query, category=category, max_results=max_results)
          if not hits:
              result = f"No results for '{query}' in category '{category}'."
          else:
              result = _format_search_hits(hits)
          success = True
      except Exception as exc:  # noqa: BLE001
          error_msg = str(exc)
          result = f"Search failed: {exc}"
      finally:
          # Track usage
          response_time = (time.time() - start_time) * 1000  # Convert to ms
          tracker.track_usage(
              tool_name="web_search",
              reasoning=reasoning,
              parameters={
                  "query": query,
                  "category": category,
                  "max_results": max_results,
              },
              response_time_ms=response_time,
              success=success,
              error_message=error_msg,
              response_size=len(result.encode("utf-8")),
          )
      return result


  @mcp.tool()
  ```
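The excerpt ends at the `@mcp.tool()` decorator on `server.py:102`, which registers the `crawl_url` handler shown above. How the FastMCP server itself is started is not part of this excerpt; a typical entry point would look like the following sketch (an assumption, not the repository's actual startup code):

```python
if __name__ == "__main__":
    # Serve the registered tools (web_search, crawl_url, ...) over stdio.
    mcp.run()
```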