Web Research Assistant

by elad12390

crawl_url

Fetches webpage text for quoting or analysis, using crawl4ai to extract content from a URL.

Instructions

Fetch a URL with crawl4ai when you need the actual page text for quoting or analysis.

Input Schema

Name        Required  Description                                             Default
url         Yes       HTTP(S) URL (ideally from web_search output)            —
reasoning   Yes       Why you're crawling this URL (required for analytics)   —
max_chars   No        Trim textual result to this many characters             CRAWL_MAX_CHARS
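
For illustration, an MCP client invokes this tool through a tools/call request whose arguments match the schema above. A minimal sketch of those arguments (the URL, reasoning, and max_chars values are made up, not from the source):

    # Hypothetical arguments for a crawl_url tools/call request; values are illustrative only.
    arguments = {
        "url": "https://example.com/article",                     # required
        "reasoning": "Quote benchmark numbers from the article",  # required, logged for analytics
        "max_chars": 4000,                                        # optional, defaults to CRAWL_MAX_CHARS
    }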

Implementation Reference

  • Primary handler and registration for the 'crawl_url' MCP tool. Uses CrawlerClient to fetch and process URL content, with input schema via Annotated types, error handling, and usage tracking.
    @mcp.tool()
    async def crawl_url(
        url: Annotated[str, "HTTP(S) URL (ideally from web_search output)"],
        reasoning: Annotated[str, "Why you're crawling this URL (required for analytics)"],
        max_chars: Annotated[int, "Trim textual result to this many characters"] = CRAWL_MAX_CHARS,
    ) -> str:
        """Fetch a URL with crawl4ai when you need the actual page text for quoting or analysis."""
    
        start_time = time.time()
        success = False
        error_msg = None
        result = ""
    
        try:
            text = await crawler_client.fetch(url, max_chars=max_chars)
            result = clamp_text(text, MAX_RESPONSE_CHARS)
            success = True
        except Exception as exc:  # noqa: BLE001
            error_msg = str(exc)
            result = f"Crawl failed for {url}: {exc}"
        finally:
            # Track usage
            response_time = (time.time() - start_time) * 1000
            tracker.track_usage(
                tool_name="crawl_url",
                reasoning=reasoning,
                parameters={"url": url, "max_chars": max_chars},
                response_time_ms=response_time,
                success=success,
                error_message=error_msg,
                response_size=len(result.encode("utf-8")),
            )
    
        return result
  • Core crawling logic in CrawlerClient.fetch(), called by the tool handler. Uses crawl4ai's AsyncWebCrawler to fetch the page, extract markdown (falling back to content or HTML), clean it, and trim it with clamp_text (a sketch of that helper follows this reference list).
    async def fetch(self, url: str, *, max_chars: int | None = None) -> str:
        """Fetch *url* and return cleaned markdown, trimmed to *max_chars*."""
    
        run_config = CrawlerRunConfig(cache_mode=self.cache_mode)
    
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url, config=run_config)
    
        if getattr(result, "error", None):
            raise RuntimeError(str(result.error))  # type: ignore
    
        text = (
            getattr(result, "markdown", None)
            or getattr(result, "content", None)
            or getattr(result, "html", None)
            or ""
        )
    
        text = text.strip()
        if not text:
            raise RuntimeError("Crawl completed but returned no readable content.")
    
        limit = max_chars or CRAWL_MAX_CHARS
        return clamp_text(text, limit)
  • CrawlerClient class initialization and configuration, instantiated globally as crawler_client in server.py. A standalone usage sketch follows this reference list.
    class CrawlerClient:
        """Lightweight wrapper around crawl4ai's async crawler."""
    
        def __init__(self, *, cache_mode: CacheMode = CacheMode.BYPASS) -> None:
            # Inferred: fetch() reads self.cache_mode, so the constructor stores the cache mode.
            self.cache_mode = cache_mode
  • Import and global instantiation of CrawlerClient used by the crawl_url tool; the same excerpt also shows the sibling web_search tool and its result formatting (illustrative call arguments follow this reference list).
    from .crawler import CrawlerClient
    from .errors import ErrorParser
    from .extractor import DataExtractor
    from .github import GitHubClient, RepoInfo
    from .images import PixabayClient
    from .registry import PackageInfo, PackageRegistryClient
    from .search import SearxSearcher
    from .service_health import ServiceHealthChecker
    from .tracking import get_tracker
    
    mcp = FastMCP("web-research-assistant")
    searcher = SearxSearcher()
    crawler_client = CrawlerClient()
    registry_client = PackageRegistryClient()
    github_client = GitHubClient()
    pixabay_client = PixabayClient()
    error_parser = ErrorParser()
    api_docs_detector = APIDocsDetector()
    api_docs_extractor = APIDocsExtractor()
    data_extractor = DataExtractor()
    tech_comparator = TechComparator(searcher, github_client, registry_client)
    changelog_fetcher = ChangelogFetcher(github_client, registry_client)
    service_health_checker = ServiceHealthChecker(crawler_client)
    tracker = get_tracker()
    
    
    def _format_search_hits(hits):
        lines = []
        for idx, hit in enumerate(hits, 1):
            snippet = f"\n{hit.snippet}" if hit.snippet else ""
            lines.append(f"{idx}. {hit.title} — {hit.url}{snippet}")
        body = "\n\n".join(lines)
        return clamp_text(body, MAX_RESPONSE_CHARS)
    
    
    @mcp.tool()
    async def web_search(
        query: Annotated[str, "Natural-language web query"],
        reasoning: Annotated[str, "Why you're using this tool (required for analytics)"],
        category: Annotated[
            str, "Optional SearXNG category (general, images, news, it, science, etc.)"
        ] = DEFAULT_CATEGORY,
        max_results: Annotated[int, "How many ranked hits to return (1-10)"] = DEFAULT_MAX_RESULTS,
    ) -> str:
        """Use this first to gather fresh web search results via the local SearXNG instance."""
    
        start_time = time.time()
        success = False
        error_msg = None
        result = ""
    
        try:
            hits = await searcher.search(query, category=category, max_results=max_results)
            if not hits:
                result = f"No results for '{query}' in category '{category}'."
            else:
                result = _format_search_hits(hits)
            success = True
        except Exception as exc:  # noqa: BLE001
            error_msg = str(exc)
            result = f"Search failed: {exc}"
        finally:
            # Track usage
            response_time = (time.time() - start_time) * 1000  # Convert to ms
            tracker.track_usage(
                tool_name="web_search",
                reasoning=reasoning,
                parameters={
                    "query": query,
                    "category": category,
                    "max_results": max_results,
                },
                response_time_ms=response_time,
                success=success,
                error_message=error_msg,
                response_size=len(result.encode("utf-8")),
            )
    
        return result
    
    
    @mcp.tool()
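
The clamp_text helper used throughout these excerpts is not shown. A minimal sketch of what it might look like, assuming it simply truncates to the character limit and marks the cut (the marker string is an assumption, not from the source):

    def clamp_text(text: str, limit: int) -> str:
        """Hypothetical sketch of the helper used above: cap *text* at *limit* characters."""
        if len(text) <= limit:
            return text
        # Assumed behavior: hard truncate and append a visible marker.
        return text[:limit].rstrip() + "\n\n[truncated]"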
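
CrawlerClient can also be exercised on its own, outside the MCP server. A usage sketch based only on the fetch() signature shown above (the URL is a placeholder):

    import asyncio

    # CrawlerClient is imported as in the excerpt above ("from .crawler import CrawlerClient").
    async def main() -> None:
        client = CrawlerClient()  # cache_mode defaults to CacheMode.BYPASS
        text = await client.fetch("https://example.com", max_chars=2000)
        print(text[:500])

    asyncio.run(main())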
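
For completeness, a hypothetical set of arguments for the sibling web_search tool shown in the same excerpt (query and reasoning are illustrative; category and max_results fall back to DEFAULT_CATEGORY and DEFAULT_MAX_RESULTS when omitted):

    # Illustrative web_search arguments; values are made up, not from the source.
    arguments = {
        "query": "crawl4ai AsyncWebCrawler documentation",
        "reasoning": "Find pages worth crawling for implementation details",
        "category": "it",        # optional SearXNG category
        "max_results": 5,        # optional, 1-10
    }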
