Web Research Assistant

by elad12390

crawl_url

Fetches webpage text for quoting or analysis, using crawl4ai to extract content from a URL.

Instructions

Fetch a URL with crawl4ai when you need the actual page text for quoting or analysis.

Input Schema

Name        Required  Description                                             Default
url         Yes       HTTP(S) URL (ideally from web_search output)            —
reasoning   Yes       Why you're crawling this URL (required for analytics)   —
max_chars   No        Trim textual result to this many characters             CRAWL_MAX_CHARS
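
For illustration, an MCP client invokes this tool through a tools/call request whose arguments match the schema above. A minimal sketch of those arguments (the URL, reasoning, and max_chars values are made up, not from the source):

    # Hypothetical arguments for a crawl_url tools/call request; values are illustrative only.
    arguments = {
        "url": "https://example.com/article",                     # required
        "reasoning": "Quote benchmark numbers from the article",  # required, logged for analytics
        "max_chars": 4000,                                        # optional, defaults to CRAWL_MAX_CHARS
    }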

Implementation Reference

  • Primary handler and registration for the 'crawl_url' MCP tool. Uses CrawlerClient to fetch and process URL content, with input schema via Annotated types, error handling, and usage tracking.
    @mcp.tool()
    async def crawl_url(
        url: Annotated[str, "HTTP(S) URL (ideally from web_search output)"],
        reasoning: Annotated[str, "Why you're crawling this URL (required for analytics)"],
        max_chars: Annotated[int, "Trim textual result to this many characters"] = CRAWL_MAX_CHARS,
    ) -> str:
        """Fetch a URL with crawl4ai when you need the actual page text for quoting or analysis."""
    
        start_time = time.time()
        success = False
        error_msg = None
        result = ""
    
        try:
            text = await crawler_client.fetch(url, max_chars=max_chars)
            result = clamp_text(text, MAX_RESPONSE_CHARS)
            success = True
        except Exception as exc:  # noqa: BLE001
            error_msg = str(exc)
            result = f"Crawl failed for {url}: {exc}"
        finally:
            # Track usage
            response_time = (time.time() - start_time) * 1000
            tracker.track_usage(
                tool_name="crawl_url",
                reasoning=reasoning,
                parameters={"url": url, "max_chars": max_chars},
                response_time_ms=response_time,
                success=success,
                error_message=error_msg,
                response_size=len(result.encode("utf-8")),
            )
    
        return result
  • Core crawling logic in CrawlerClient.fetch(), called by the tool handler. Uses crawl4ai's AsyncWebCrawler to fetch the page, extract markdown (falling back to content or HTML), clean it, and trim it with clamp_text (a sketch of that helper follows this reference list).
    async def fetch(self, url: str, *, max_chars: int | None = None) -> str:
        """Fetch *url* and return cleaned markdown, trimmed to *max_chars*."""
    
        run_config = CrawlerRunConfig(cache_mode=self.cache_mode)
    
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url, config=run_config)
    
        if getattr(result, "error", None):
            raise RuntimeError(str(result.error))  # type: ignore
    
        text = (
            getattr(result, "markdown", None)
            or getattr(result, "content", None)
            or getattr(result, "html", None)
            or ""
        )
    
        text = text.strip()
        if not text:
            raise RuntimeError("Crawl completed but returned no readable content.")
    
        limit = max_chars or CRAWL_MAX_CHARS
        return clamp_text(text, limit)
  • CrawlerClient class initialization and configuration, instantiated globally as crawler_client in server.py. A standalone usage sketch follows this reference list.
    class CrawlerClient:
        """Lightweight wrapper around crawl4ai's async crawler."""
    
        def __init__(self, *, cache_mode: CacheMode = CacheMode.BYPASS) -> None:
            # Inferred: fetch() reads self.cache_mode, so the constructor stores the cache mode.
            self.cache_mode = cache_mode
  • Import and global instantiation of CrawlerClient used by the crawl_url tool; the same excerpt also shows the sibling web_search tool and its result formatting (illustrative call arguments follow this reference list).
    from .crawler import CrawlerClient
    from .errors import ErrorParser
    from .extractor import DataExtractor
    from .github import GitHubClient, RepoInfo
    from .images import PixabayClient
    from .registry import PackageInfo, PackageRegistryClient
    from .search import SearxSearcher
    from .service_health import ServiceHealthChecker
    from .tracking import get_tracker
    
    mcp = FastMCP("web-research-assistant")
    searcher = SearxSearcher()
    crawler_client = CrawlerClient()
    registry_client = PackageRegistryClient()
    github_client = GitHubClient()
    pixabay_client = PixabayClient()
    error_parser = ErrorParser()
    api_docs_detector = APIDocsDetector()
    api_docs_extractor = APIDocsExtractor()
    data_extractor = DataExtractor()
    tech_comparator = TechComparator(searcher, github_client, registry_client)
    changelog_fetcher = ChangelogFetcher(github_client, registry_client)
    service_health_checker = ServiceHealthChecker(crawler_client)
    tracker = get_tracker()
    
    
    def _format_search_hits(hits):
        lines = []
        for idx, hit in enumerate(hits, 1):
            snippet = f"\n{hit.snippet}" if hit.snippet else ""
            lines.append(f"{idx}. {hit.title} — {hit.url}{snippet}")
        body = "\n\n".join(lines)
        return clamp_text(body, MAX_RESPONSE_CHARS)
    
    
    @mcp.tool()
    async def web_search(
        query: Annotated[str, "Natural-language web query"],
        reasoning: Annotated[str, "Why you're using this tool (required for analytics)"],
        category: Annotated[
            str, "Optional SearXNG category (general, images, news, it, science, etc.)"
        ] = DEFAULT_CATEGORY,
        max_results: Annotated[int, "How many ranked hits to return (1-10)"] = DEFAULT_MAX_RESULTS,
    ) -> str:
        """Use this first to gather fresh web search results via the local SearXNG instance."""
    
        start_time = time.time()
        success = False
        error_msg = None
        result = ""
    
        try:
            hits = await searcher.search(query, category=category, max_results=max_results)
            if not hits:
                result = f"No results for '{query}' in category '{category}'."
            else:
                result = _format_search_hits(hits)
            success = True
        except Exception as exc:  # noqa: BLE001
            error_msg = str(exc)
            result = f"Search failed: {exc}"
        finally:
            # Track usage
            response_time = (time.time() - start_time) * 1000  # Convert to ms
            tracker.track_usage(
                tool_name="web_search",
                reasoning=reasoning,
                parameters={
                    "query": query,
                    "category": category,
                    "max_results": max_results,
                },
                response_time_ms=response_time,
                success=success,
                error_message=error_msg,
                response_size=len(result.encode("utf-8")),
            )
    
        return result
    
    
    @mcp.tool()
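
The clamp_text helper used throughout these excerpts is not shown. A minimal sketch of what it might look like, assuming it simply truncates to the character limit and marks the cut (the marker string is an assumption, not from the source):

    def clamp_text(text: str, limit: int) -> str:
        """Hypothetical sketch of the helper used above: cap *text* at *limit* characters."""
        if len(text) <= limit:
            return text
        # Assumed behavior: hard truncate and append a visible marker.
        return text[:limit].rstrip() + "\n\n[truncated]"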
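
CrawlerClient can also be exercised on its own, outside the MCP server. A usage sketch based only on the fetch() signature shown above (the URL is a placeholder):

    import asyncio

    # CrawlerClient is imported as in the excerpt above ("from .crawler import CrawlerClient").
    async def main() -> None:
        client = CrawlerClient()  # cache_mode defaults to CacheMode.BYPASS
        text = await client.fetch("https://example.com", max_chars=2000)
        print(text[:500])

    asyncio.run(main())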
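
For completeness, a hypothetical set of arguments for the sibling web_search tool shown in the same excerpt (query and reasoning are illustrative; category and max_results fall back to DEFAULT_CATEGORY and DEFAULT_MAX_RESULTS when omitted):

    # Illustrative web_search arguments; values are made up, not from the source.
    arguments = {
        "query": "crawl4ai AsyncWebCrawler documentation",
        "reasoning": "Find pages worth crawling for implementation details",
        "category": "it",        # optional SearXNG category
        "max_results": 5,        # optional, 1-10
    }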
