"""Fetch tool - uses Crawl4AI for advanced web scraping with LLM extraction support."""
import asyncio
import json
import logging
import os
import sys
from typing import Any, Dict, Literal, Optional
from ..utils.pdf_parsing import is_pdf_url
# Fix Windows console encoding for Crawl4AI/rich. This runs after the local
# package imports above, but before the crawl4ai import below, so rich reads
# this environment at import time.
if sys.platform == "win32":
    for _env_name, _env_value in (
        # Force UTF-8 mode with replacement-on-error for console I/O.
        ("PYTHONUTF8", "1"),
        ("PYTHONIOENCODING", "utf-8:replace"),
        # Keep rich from emitting colors/emoji the console cannot render.
        ("NO_COLOR", "1"),
        ("TERM", "dumb"),
        ("RICH_EMOJI", "0"),
        # Pin a fixed console geometry for rich.
        ("COLUMNS", "120"),
        ("LINES", "40"),
    ):
        os.environ[_env_name] = _env_value
    # Re-point stdout/stderr at UTF-8 with replacement errors when supported.
    for _stream in (sys.stdout, sys.stderr):
        if hasattr(_stream, "reconfigure"):
            _stream.reconfigure(encoding="utf-8", errors="replace")
from ..constants import (
DEEPSEEK_API_BASE,
DEEPSEEK_DEFAULT_MODEL,
DEEPSEEK_DEFAULT_PROMPT,
PROXY_URL,
)
# Import Crawl4AI
try:
    from crawl4ai import AsyncWebCrawler, LLMConfig
    from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
    from crawl4ai import LLMExtractionStrategy
    from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
    CRAWL4AI_AVAILABLE = True
except ImportError:
    # crawl4ai is an optional dependency; define every imported name as None
    # so the rest of the module can reference them without NameError.
    # BUG FIX: the PDF strategies and AsyncPlaywrightCrawlerStrategy were
    # previously left undefined on this path.
    CRAWL4AI_AVAILABLE = False
    AsyncWebCrawler = None  # type: ignore
    BrowserConfig = None  # type: ignore
    CrawlerRunConfig = None  # type: ignore
    LLMConfig = None  # type: ignore
    LLMExtractionStrategy = None  # type: ignore
    PDFCrawlerStrategy = None  # type: ignore
    PDFContentScrapingStrategy = None  # type: ignore
    AsyncPlaywrightCrawlerStrategy = None  # type: ignore
async def fetch_with_crawl4ai(
    url: str,
    js_render: bool = True,
    use_proxy: bool = True,
    timeout: int = 30,
    extraction_strategy: Optional[Any] = None,
) -> Dict[str, Any]:
    """Fetch a webpage (or PDF) using Crawl4AI.

    Args:
        url: URL to fetch
        js_render: Whether to enable JavaScript rendering in the browser
        use_proxy: Whether to route the request through PROXY_URL
        timeout: Request timeout in seconds
        extraction_strategy: Optional extraction strategy (e.g., LLMExtractionStrategy)

    Returns:
        Dictionary with fetch results (success, url, html, markdown,
        fit_markdown, title, status_code, error, extracted_content)

    Raises:
        ImportError: If the crawl4ai package is not installed.
    """
    if not CRAWL4AI_AVAILABLE:
        raise ImportError(
            "crawl4ai package is required for fetch. "
            "Install with: pip install crawl4ai"
        )
    logging.info(f"Fetching URL with Crawl4AI: {url}, js_render={js_render}")
    if is_pdf_url(url=url):
        # PDFCrawlerStrategy fetches the document; it is typically used in
        # conjunction with PDFContentScrapingStrategy, which handles the
        # actual PDF content extraction.
        crawler_strategy = PDFCrawlerStrategy()
        scraping_strategy = PDFContentScrapingStrategy()
    else:
        scraping_strategy = None
        crawler_strategy = None
    # Configure browser.
    # BUG FIX: js_render and use_proxy were previously accepted but never
    # applied; wire them into the browser configuration.
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,  # Disable verbose logging to reduce console output
        java_script_enabled=js_render,
        proxy=PROXY_URL if (use_proxy and PROXY_URL) else None,
    )
    # Configure run parameters.
    # BUG FIX: timeout was previously ignored; Crawl4AI expects page_timeout
    # in milliseconds.
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        exclude_external_links=False,
        remove_overlay_elements=True,
        scraping_strategy=scraping_strategy,
        page_timeout=timeout * 1000,
    )
    # Add extraction strategy if provided
    if extraction_strategy:
        run_config.extraction_strategy = extraction_strategy
    # Suppress Crawl4AI logging while crawling to prevent console encoding
    # errors from its rich output; restored in the finally block.
    crawl4ai_logger = logging.getLogger("crawl4ai")
    old_logger_level = crawl4ai_logger.level
    crawl4ai_logger.setLevel(logging.CRITICAL)
    try:
        # Execute fetch
        async with AsyncWebCrawler(
            config=browser_config, crawler_strategy=crawler_strategy
        ) as crawler:
            result = await crawler.arun(url=url, config=run_config)
            response = {
                "success": result.success,
                "url": result.url,
                "html": result.html if result.html else None,
                "markdown": result.markdown.raw_markdown if result.markdown else None,
                "fit_markdown": result.markdown.fit_markdown if result.markdown else None,
                "title": result.metadata.get("title") if result.metadata else None,
                "status_code": result.status_code,
                "error": result.error_message if not result.success else None,
                "extracted_content": result.extracted_content if extraction_strategy else None,
            }
            logging.info(
                f"Fetch completed: success={result.success}, "
                f"status={result.status_code}, "
                f"title={response.get('title')}"
            )
            return response
    except Exception as e:
        logging.exception("Crawl4AI fetch failed")
        # Return a failure record with the same shape as the success path so
        # callers can handle both uniformly.
        return {
            "success": False,
            "url": url,
            "html": None,
            "markdown": None,
            "fit_markdown": None,
            "title": None,
            "status_code": None,
            "error": f"Crawl4AI fetch failed: {str(e)}",
            "extracted_content": None,
        }
    finally:
        # Restore logger level
        crawl4ai_logger.setLevel(old_logger_level)
async def fetch(
    url: str,
    mode: Literal["simple", "llm"] = "simple",
    prompt: Optional[str] = None,
    model: str = DEEPSEEK_DEFAULT_MODEL,
    api_key: Optional[str] = None,
    max_length: int = 10000,
    use_proxy: bool = True,
    js_render: bool = True,
    timeout: int = 30,
) -> Dict[str, Any]:
    """Fetch webpage content using Crawl4AI with optional LLM extraction.

    This is the v2 fetch tool that uses Crawl4AI for advanced web scraping:
    1. Uses real browser for JavaScript rendering
    2. Generates clean Markdown content
    3. Supports LLM-based structured extraction (optional)

    Args:
        url: The URL to fetch content from
        mode: Fetch mode - "simple" for basic fetch, "llm" for LLM extraction
        prompt: Custom extraction prompt (only used when mode="llm")
        model: Model name (supports OpenAI-compatible APIs via LiteLLM)
        api_key: API key for the LLM provider (required for mode="llm")
        max_length: Maximum content length to return (affects chunking)
        use_proxy: Whether to use proxy (default: True)
        js_render: Whether to enable JavaScript rendering (default: True)
        timeout: Request timeout in seconds (default: 30)

    Returns:
        Dictionary containing:
        - success: Whether the fetch was successful
        - url: The fetched URL
        - markdown: The markdown content (if successful)
        - fit_markdown: The most relevant content in markdown (if available)
        - title: Page title (if available)
        - extracted: Structured extraction result (only when mode="llm")
        - model: LLM model used (only when mode="llm")
        - error: Error message (if failed)

    Raises:
        ValueError: If mode is invalid, or mode="llm" without an api_key.
        ImportError: If crawl4ai is not installed and mode="llm".
    """
    logging.info(
        f"fetch called - URL: {url}, mode={mode}, js_render={js_render}"
    )
    # Validate mode
    if mode not in ("simple", "llm"):
        raise ValueError(f"Invalid mode: {mode}. Must be 'simple' or 'llm'")
    # Validate LLM mode requirements
    if mode == "llm" and not api_key:
        raise ValueError("api_key is required when mode='llm'")
    # Prepare extraction strategy for LLM mode
    extraction_strategy = None
    if mode == "llm" and api_key:
        if not CRAWL4AI_AVAILABLE:
            raise ImportError("crawl4ai package is required for LLM extraction")
        # Set default prompt if not provided
        extraction_prompt = prompt or DEEPSEEK_DEFAULT_PROMPT
        # Configure LLM for Crawl4AI.
        # DeepSeek uses OpenAI-compatible API, so we use the "openai/" prefix
        # and pass the base_url through extra_args.
        # BUG FIX: the provider previously hardcoded "openai/deepseek-chat",
        # silently ignoring the `model` parameter.
        llm_config = LLMConfig(
            provider=f"openai/{model}",  # OpenAI-compatible format via LiteLLM
            api_token=api_key,
        )
        # Create LLM extraction strategy
        extraction_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction=extraction_prompt,
            extraction_type="block",  # Use "block" for freeform extraction
            chunk_token_threshold=max_length,  # Control chunk size
            overlap_rate=0.1,  # 10% overlap between chunks
            apply_chunking=True,  # Enable chunking for long content
            input_format="fit_markdown",  # Use fit_markdown for most relevant content
            extra_args={
                "base_url": DEEPSEEK_API_BASE,  # Set DeepSeek base URL
                "temperature": 0.1,
                "max_tokens": 2000,
            },
            verbose=False,  # Disable verbose to avoid encoding issues
        )
    # Step 1: Fetch content using Crawl4AI
    fetch_result = await fetch_with_crawl4ai(
        url=url,
        js_render=js_render,
        use_proxy=use_proxy,
        timeout=timeout,
        extraction_strategy=extraction_strategy,
    )
    if not fetch_result["success"]:
        # Propagate the failure with the documented response shape.
        return {
            "success": False,
            "url": url,
            "markdown": None,
            "fit_markdown": None,
            "title": None,
            "extracted": None,
            "model": None,
            "error": fetch_result.get("error", "Unknown fetch error"),
        }
    # Prepare base result
    result: Dict[str, Any] = {
        "success": True,
        "url": fetch_result["url"],
        "markdown": fetch_result["markdown"],
        "fit_markdown": fetch_result.get("fit_markdown"),
        "title": fetch_result.get("title"),
    }
    # Step 2: Process LLM extraction result (if mode is "llm")
    if mode == "llm" and fetch_result.get("extracted_content"):
        try:
            # Parse extracted content as JSON if possible
            extracted_data = json.loads(fetch_result["extracted_content"])
            result["extracted"] = extracted_data
            result["model"] = model
        except json.JSONDecodeError:
            # If not valid JSON, return as-is wrapped in a dict
            result["extracted"] = {"result": fetch_result["extracted_content"]}
            result["model"] = model
    elif mode == "llm":
        # LLM mode but no extraction result
        result["extracted"] = {"error": "No content extracted by LLM"}
        result["model"] = model
    return result