"""Fetch tool - uses Crawl4AI for advanced web scraping with LLM extraction support."""
import asyncio
import json
import logging
import os
import sys
from typing import Any, Dict, Literal, Optional
from ..utils.pdf_parsing import is_pdf_url
# Fix Windows console encoding for Crawl4AI/rich. This runs after the local
# package imports above, but before the crawl4ai import below, so rich reads
# this environment at import time.
if sys.platform == "win32":
    for _env_name, _env_value in (
        # Force UTF-8 mode with replacement-on-error for console I/O.
        ("PYTHONUTF8", "1"),
        ("PYTHONIOENCODING", "utf-8:replace"),
        # Keep rich from emitting colors/emoji the console cannot render.
        ("NO_COLOR", "1"),
        ("TERM", "dumb"),
        ("RICH_EMOJI", "0"),
        # Pin a fixed console geometry for rich.
        ("COLUMNS", "120"),
        ("LINES", "40"),
    ):
        os.environ[_env_name] = _env_value
    # Re-point stdout/stderr at UTF-8 with replacement errors when supported.
    for _stream in (sys.stdout, sys.stderr):
        if hasattr(_stream, "reconfigure"):
            _stream.reconfigure(encoding="utf-8", errors="replace")
from ..constants import (
DEEPSEEK_API_BASE,
DEEPSEEK_DEFAULT_MODEL,
DEEPSEEK_DEFAULT_PROMPT,
PROXY_URL,
)
# Import Crawl4AI
try:
    from crawl4ai import AsyncWebCrawler, LLMConfig
    from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
    from crawl4ai import LLMExtractionStrategy
    from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
    CRAWL4AI_AVAILABLE = True
except ImportError:
    # crawl4ai is an optional dependency; define every imported name as None
    # so the rest of the module can reference them without NameError.
    # BUG FIX: the PDF strategies and AsyncPlaywrightCrawlerStrategy were
    # previously left undefined on this path.
    CRAWL4AI_AVAILABLE = False
    AsyncWebCrawler = None  # type: ignore
    BrowserConfig = None  # type: ignore
    CrawlerRunConfig = None  # type: ignore
    LLMConfig = None  # type: ignore
    LLMExtractionStrategy = None  # type: ignore
    PDFCrawlerStrategy = None  # type: ignore
    PDFContentScrapingStrategy = None  # type: ignore
    AsyncPlaywrightCrawlerStrategy = None  # type: ignore
async def fetch_with_crawl4ai(
    url: str,
    js_render: bool = True,
    use_proxy: bool = True,
    timeout: int = 30,
    extraction_strategy: Optional[Any] = None,
) -> Dict[str, Any]:
    """Fetch a webpage (or PDF) using Crawl4AI.

    Args:
        url: URL to fetch
        js_render: Whether to enable JavaScript rendering in the browser
        use_proxy: Whether to route the request through PROXY_URL
        timeout: Request timeout in seconds
        extraction_strategy: Optional extraction strategy (e.g., LLMExtractionStrategy)

    Returns:
        Dictionary with fetch results (success, url, html, markdown,
        fit_markdown, title, status_code, error, extracted_content)

    Raises:
        ImportError: If the crawl4ai package is not installed.
    """
    if not CRAWL4AI_AVAILABLE:
        raise ImportError(
            "crawl4ai package is required for fetch. "
            "Install with: pip install crawl4ai"
        )
    logging.info(f"Fetching URL with Crawl4AI: {url}, js_render={js_render}")
    if is_pdf_url(url=url):
        # PDFCrawlerStrategy fetches the document; it is typically used in
        # conjunction with PDFContentScrapingStrategy, which handles the
        # actual PDF content extraction.
        crawler_strategy = PDFCrawlerStrategy()
        scraping_strategy = PDFContentScrapingStrategy()
    else:
        scraping_strategy = None
        crawler_strategy = None
    # Configure browser.
    # BUG FIX: js_render and use_proxy were previously accepted but never
    # applied; wire them into the browser configuration.
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,  # Disable verbose logging to reduce console output
        java_script_enabled=js_render,
        proxy=PROXY_URL if (use_proxy and PROXY_URL) else None,
    )
    # Configure run parameters.
    # BUG FIX: timeout was previously ignored; Crawl4AI expects page_timeout
    # in milliseconds.
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        exclude_external_links=False,
        remove_overlay_elements=True,
        scraping_strategy=scraping_strategy,
        page_timeout=timeout * 1000,
    )
    # Add extraction strategy if provided
    if extraction_strategy:
        run_config.extraction_strategy = extraction_strategy
    # Suppress Crawl4AI logging while crawling to prevent console encoding
    # errors from its rich output; restored in the finally block.
    crawl4ai_logger = logging.getLogger("crawl4ai")
    old_logger_level = crawl4ai_logger.level
    crawl4ai_logger.setLevel(logging.CRITICAL)
    try:
        # Execute fetch
        async with AsyncWebCrawler(
            config=browser_config, crawler_strategy=crawler_strategy
        ) as crawler:
            result = await crawler.arun(url=url, config=run_config)
            response = {
                "success": result.success,
                "url": result.url,
                "html": result.html if result.html else None,
                "markdown": result.markdown.raw_markdown if result.markdown else None,
                "fit_markdown": result.markdown.fit_markdown if result.markdown else None,
                "title": result.metadata.get("title") if result.metadata else None,
                "status_code": result.status_code,
                "error": result.error_message if not result.success else None,
                "extracted_content": result.extracted_content if extraction_strategy else None,
            }
            logging.info(
                f"Fetch completed: success={result.success}, "
                f"status={result.status_code}, "
                f"title={response.get('title')}"
            )
            return response
    except Exception as e:
        logging.exception("Crawl4AI fetch failed")
        # Return a failure record with the same shape as the success path so
        # callers can handle both uniformly.
        return {
            "success": False,
            "url": url,
            "html": None,
            "markdown": None,
            "fit_markdown": None,
            "title": None,
            "status_code": None,
            "error": f"Crawl4AI fetch failed: {str(e)}",
            "extracted_content": None,
        }
    finally:
        # Restore logger level
        crawl4ai_logger.setLevel(old_logger_level)
async def fetch(
    url: str,
    mode: Literal["simple", "llm"] = "simple",
    prompt: Optional[str] = None,
    model: str = DEEPSEEK_DEFAULT_MODEL,
    api_key: Optional[str] = None,
    max_length: int = 10000,
    use_proxy: bool = True,
    js_render: bool = True,
    timeout: int = 30,
) -> Dict[str, Any]:
    """Fetch webpage content using Crawl4AI with optional LLM extraction.

    This is the v2 fetch tool that uses Crawl4AI for advanced web scraping:
    1. Uses real browser for JavaScript rendering
    2. Generates clean Markdown content
    3. Supports LLM-based structured extraction (optional)

    Args:
        url: The URL to fetch content from
        mode: Fetch mode - "simple" for basic fetch, "llm" for LLM extraction
        prompt: Custom extraction prompt (only used when mode="llm")
        model: Model name (supports OpenAI-compatible APIs via LiteLLM)
        api_key: API key for the LLM provider (required for mode="llm")
        max_length: Maximum content length to return (affects chunking)
        use_proxy: Whether to use proxy (default: True)
        js_render: Whether to enable JavaScript rendering (default: True)
        timeout: Request timeout in seconds (default: 30)

    Returns:
        Dictionary containing:
        - success: Whether the fetch was successful
        - url: The fetched URL
        - markdown: The markdown content (if successful)
        - fit_markdown: The most relevant content in markdown (if available)
        - title: Page title (if available)
        - extracted: Structured extraction result (only when mode="llm")
        - model: LLM model used (only when mode="llm")
        - error: Error message (if failed)

    Raises:
        ValueError: If mode is invalid, or mode="llm" without an api_key.
        ImportError: If crawl4ai is not installed and mode="llm".
    """
    logging.info(
        f"fetch called - URL: {url}, mode={mode}, js_render={js_render}"
    )
    # Validate mode
    if mode not in ("simple", "llm"):
        raise ValueError(f"Invalid mode: {mode}. Must be 'simple' or 'llm'")
    # Validate LLM mode requirements
    if mode == "llm" and not api_key:
        raise ValueError("api_key is required when mode='llm'")
    # Prepare extraction strategy for LLM mode
    extraction_strategy = None
    if mode == "llm" and api_key:
        if not CRAWL4AI_AVAILABLE:
            raise ImportError("crawl4ai package is required for LLM extraction")
        # Set default prompt if not provided
        extraction_prompt = prompt or DEEPSEEK_DEFAULT_PROMPT
        # Configure LLM for Crawl4AI.
        # DeepSeek uses OpenAI-compatible API, so we use the "openai/" prefix
        # and pass the base_url through extra_args.
        # BUG FIX: the provider previously hardcoded "openai/deepseek-chat",
        # silently ignoring the `model` parameter.
        llm_config = LLMConfig(
            provider=f"openai/{model}",  # OpenAI-compatible format via LiteLLM
            api_token=api_key,
        )
        # Create LLM extraction strategy
        extraction_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction=extraction_prompt,
            extraction_type="block",  # Use "block" for freeform extraction
            chunk_token_threshold=max_length,  # Control chunk size
            overlap_rate=0.1,  # 10% overlap between chunks
            apply_chunking=True,  # Enable chunking for long content
            input_format="fit_markdown",  # Use fit_markdown for most relevant content
            extra_args={
                "base_url": DEEPSEEK_API_BASE,  # Set DeepSeek base URL
                "temperature": 0.1,
                "max_tokens": 2000,
            },
            verbose=False,  # Disable verbose to avoid encoding issues
        )
    # Step 1: Fetch content using Crawl4AI
    fetch_result = await fetch_with_crawl4ai(
        url=url,
        js_render=js_render,
        use_proxy=use_proxy,
        timeout=timeout,
        extraction_strategy=extraction_strategy,
    )
    if not fetch_result["success"]:
        # Propagate the failure with the documented response shape.
        return {
            "success": False,
            "url": url,
            "markdown": None,
            "fit_markdown": None,
            "title": None,
            "extracted": None,
            "model": None,
            "error": fetch_result.get("error", "Unknown fetch error"),
        }
    # Prepare base result
    result: Dict[str, Any] = {
        "success": True,
        "url": fetch_result["url"],
        "markdown": fetch_result["markdown"],
        "fit_markdown": fetch_result.get("fit_markdown"),
        "title": fetch_result.get("title"),
    }
    # Step 2: Process LLM extraction result (if mode is "llm")
    if mode == "llm" and fetch_result.get("extracted_content"):
        try:
            # Parse extracted content as JSON if possible
            extracted_data = json.loads(fetch_result["extracted_content"])
            result["extracted"] = extracted_data
            result["model"] = model
        except json.JSONDecodeError:
            # If not valid JSON, return as-is wrapped in a dict
            result["extracted"] = {"result": fetch_result["extracted_content"]}
            result["model"] = model
    elif mode == "llm":
        # LLM mode but no extraction result
        result["extracted"] = {"error": "No content extracted by LLM"}
        result["model"] = model
    return result