extract
Extract clean content from web pages by providing URLs, enabling users to gather text and data from websites for analysis or processing.
Instructions
Extract content from web pages, crawl sites, or map site structure.
extract: Get clean content from URLs (requires urls)
crawl: Deep crawl from root URLs (requires urls)
map: Discover site structure without content (requires urls)

Use the `help` tool for full documentation.
Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| action | Yes | Operation to perform: `extract`, `crawl`, or `map`. | |
| urls | No | URLs to operate on (required by every action). | |
| depth | No | Maximum crawl depth for `crawl`/`map` (capped at 5). | 2 |
| max_pages | No | Maximum pages to fetch for `crawl`/`map` (capped at 100). | 20 |
| format | No | Output format (`markdown`, `text`, `html`). | markdown |
| stealth | No | Enable stealth browser mode. | false |
Implementation Reference
- src/wet_mcp/server.py:611-709 (registration) — The 'extract' tool is registered with the @mcp.tool() decorator and defines the handler function that dispatches to three actions: 'extract', 'crawl', and 'map'. The handler validates inputs, checks the cache, and calls the underlying implementation functions (_extract, _crawl, _sitemap).
@mcp.tool( annotations=ToolAnnotations( readOnlyHint=True, openWorldHint=True, ), ) @_wrap_tool("extract") async def extract( action: str, urls: list[str] | None = None, depth: int = 2, max_pages: int = 20, format: str = "markdown", stealth: bool = False, ) -> str: """Extract content from web pages, crawl sites, or map site structure. - extract: Get clean content from URLs (requires urls) - crawl: Deep crawl from root URLs (requires urls) - map: Discover site structure without content (requires urls) Use `help` tool for full documentation. """ # Security: enforce hard limits to prevent resource exhaustion _MAX_EXTRACT_URLS = 20 _MAX_CRAWL_PAGES = 100 _MAX_DEPTH = 5 max_pages = min(max_pages, _MAX_CRAWL_PAGES) depth = min(depth, _MAX_DEPTH) match action: case "extract": if not urls: return "Error: urls is required for extract action" urls = urls[:_MAX_EXTRACT_URLS] cache_params = {"urls": sorted(urls), "format": format, "stealth": stealth} if _web_cache: cached = _web_cache.get("extract", cache_params) if cached: return cached result = await _with_timeout( _extract(urls=urls, format=format, stealth=stealth), "extract", ) if _web_cache and not result.startswith("Error"): _web_cache.set("extract", cache_params, result) return result case "crawl": if not urls: return "Error: urls is required for crawl action" urls = urls[:_MAX_EXTRACT_URLS] cache_params = { "urls": sorted(urls), "depth": depth, "max_pages": max_pages, } if _web_cache: cached = _web_cache.get("crawl", cache_params) if cached: return cached result = await _with_timeout( _crawl( urls=urls, depth=depth, max_pages=max_pages, format=format, stealth=stealth, ), "crawl", ) if _web_cache and not result.startswith("Error"): _web_cache.set("crawl", cache_params, result) return result case "map": if not urls: return "Error: urls is required for map action" urls = urls[:_MAX_EXTRACT_URLS] cache_params = { "urls": sorted(urls), "depth": depth, "max_pages": max_pages, } if _web_cache: cached = _web_cache.get("map", 
cache_params) if cached: return cached result = await _with_timeout( _sitemap(urls=urls, depth=depth, max_pages=max_pages), "map", ) if _web_cache and not result.startswith("Error"): _web_cache.set("map", cache_params, result) return result case _: return ( f"Error: Unknown action '{action}'. Valid actions: extract, crawl, map" ) - src/wet_mcp/sources/crawler.py:226-310 (handler)Core implementation of the 'extract' action. This async function uses Crawl4AI's AsyncWebCrawler to fetch and extract content from URLs. It handles document URLs (PDF, DOCX, etc.) via markitdown, applies security checks via is_safe_url(), and returns JSON results with title, content, and links.
async def extract(
    urls: list[str],
    format: str = "markdown",
    stealth: bool = True,
    scan_full_page: bool = False,
    delay_before_return_html: float = 0.0,
    page_timeout: int = 60000,
) -> str:
    """Extract content from URLs.

    Fetches all URLs concurrently (bounded by the shared semaphore) through
    the shared Crawl4AI browser; document URLs are detoured to markitdown.

    Args:
        urls: List of URLs to extract
        format: Output format (markdown, text, html)
        stealth: Enable stealth mode
        scan_full_page: Auto-scroll to trigger lazy-loaded content
        delay_before_return_html: Seconds to wait after page load before capture
        page_timeout: Page loading timeout in milliseconds

    Returns:
        JSON string with extracted content (one object per URL; failed URLs
        carry an "error" key instead of content)
    """
    logger.info(f"Extracting content from {len(urls)} URLs")
    crawler = await _get_crawler(stealth)
    sem = _get_semaphore()

    # Build CrawlerRunConfig with optional SPA-friendly settings.
    # Only non-default options are passed so Crawl4AI's own defaults apply.
    run_config_kwargs: dict = {"verbose": False}
    if scan_full_page:
        run_config_kwargs["scan_full_page"] = True
        run_config_kwargs["scroll_delay"] = 0.3
    if delay_before_return_html > 0:
        run_config_kwargs["delay_before_return_html"] = delay_before_return_html
    if page_timeout != 60000:
        run_config_kwargs["page_timeout"] = page_timeout
    run_config = CrawlerRunConfig(**run_config_kwargs)

    async def process_url(url: str):
        # Per-URL worker: never raises, always returns a result dict.
        async with sem:
            # SSRF guard: unsafe URLs are reported, not fetched.
            if not is_safe_url(url):
                logger.warning(f"Skipping unsafe URL: {url}")
                return {"url": url, "error": "Security Alert: Unsafe URL blocked"}
            # Route document URLs (PDF, DOCX, etc.) through markitdown
            if _is_document_url(url):
                logger.info(f"Document URL detected, using markitdown: {url}")
                return await _extract_with_markitdown(url)
            try:
                result = await crawler.arun(
                    url,  # type: ignore[invalid-argument-type]
                    config=run_config,
                )  # type: ignore[missing-argument]
                if result.success:
                    # NOTE(review): any format other than "markdown" (including
                    # "text") yields cleaned_html — confirm this is intended.
                    content = (
                        result.markdown if format == "markdown" else result.cleaned_html
                    )
                    return {
                        "url": url,
                        "title": result.metadata.get("title", ""),
                        "content": content,
                        # Cap link lists to keep the JSON payload bounded.
                        "links": {
                            "internal": result.links.get("internal", [])[:20],
                            "external": result.links.get("external", [])[:20],
                        },
                    }
                else:
                    return {
                        "url": url,
                        "error": result.error_message or "Failed to extract",
                    }
            except Exception as e:
                logger.error(f"Error extracting {url}: {e}")
                return {
                    "url": url,
                    "error": str(e),
                }

    tasks = [process_url(url) for url in urls]
    results = await asyncio.gather(*tasks)
    logger.info(f"Extracted {len(results)} pages")
    return json.dumps(results, ensure_ascii=False, indent=2)
- The _get_crawler() function manages a singleton browser pool using AsyncWebCrawler. It handles browser instance creation, recycling (when stealth mode changes), and retry logic with fresh browser data directory on failure.
async def _get_crawler(stealth: bool = False) -> AsyncWebCrawler: """Return a shared AsyncWebCrawler, creating one if necessary. If the requested *stealth* mode differs from the current instance the old browser is shut down and a new one is started. This should rarely happen in practice since most calls use the same stealth setting. On failure (e.g. Playwright connection corrupted after browser recycle), retries once with a fresh browser data directory. """ global _crawler_instance, _crawler_stealth async with _pool_lock: # Reuse existing instance if stealth matches if _crawler_instance is not None and _crawler_stealth == stealth: return _crawler_instance # Tear down existing instance with different stealth mode if _crawler_instance is not None: logger.debug(f"Recycling browser (stealth {_crawler_stealth} -> {stealth})") try: await _crawler_instance.__aexit__(None, None, None) except Exception as exc: logger.debug(f"Error closing old crawler: {exc}") _crawler_instance = None # Start a fresh browser (retry once on failure) for attempt in range(2): logger.info(f"Starting shared browser (stealth={stealth})...") crawler = AsyncWebCrawler( verbose=False, config=_browser_config(stealth), ) try: await crawler.__aenter__() _crawler_instance = crawler _crawler_stealth = stealth logger.info("Shared browser started") return _crawler_instance except Exception: if attempt == 0: logger.warning( "Browser start failed, retrying with fresh data dir..." ) _cleanup_browser_data_dir() else: logger.error("Failed to start shared browser after retry") raise raise RuntimeError("Failed to start shared browser") - The _extract_with_markitdown() function handles document conversion for PDF, DOCX, PPTX, and other document formats. Downloads the document via httpx and converts to Markdown using the markitdown library.
async def _extract_with_markitdown(url: str) -> dict: """Download document and convert to Markdown via markitdown.""" try: from markitdown import MarkItDown except ImportError: return { "url": url, "error": "markitdown not installed. Install with: pip install 'markitdown[pdf,docx,pptx]'", } try: async with _safe_httpx_client(timeout=60, follow_redirects=True) as client: resp = await client.get(url) resp.raise_for_status() # Write to temp file (markitdown needs file path with extension) import io ext = Path(urlparse(url).path).suffix.lower() or ".pdf" md = MarkItDown() result = md.convert_stream(io.BytesIO(resp.content), file_extension=ext) return { "url": url, "title": Path(urlparse(url).path).stem, "content": result.text_content, "converter": "markitdown", } except Exception as e: logger.error(f"markitdown failed for {url}: {e}") return {"url": url, "error": f"Document conversion failed: {e}"} - src/wet_mcp/sources/crawler.py:52-67 (helper)The _browser_config() function creates BrowserConfig for Crawl4AI with per-process isolated data directory and optional stealth mode, handling Docker/CI environments with --no-sandbox flag.
def _browser_config(stealth: bool = False) -> BrowserConfig: """Create BrowserConfig with per-process isolated data directory.""" extra_args: list[str] = [] # Docker/CI environments need --no-sandbox (Chromium cannot use # the SUID sandbox inside unprivileged containers). if os.path.exists("/.dockerenv") or os.environ.get("container"): extra_args += ["--no-sandbox", "--disable-dev-shm-usage"] return BrowserConfig( headless=True, enable_stealth=stealth, verbose=False, user_data_dir=_BROWSER_DATA_DIR, extra_args=extra_args, )