Skip to main content
Glama
n24q02m

WET - Web Extended Toolkit

extract

Extract clean content from web pages by providing URLs, enabling users to gather text and data from websites for analysis or processing.

Instructions

Extract content from web pages, crawl sites, or map site structure.

  • extract: Get clean content from URLs (requires urls)

  • crawl: Deep crawl from root URLs (requires urls)

  • map: Discover site structure without content (requires urls)

Use the `help` tool for full documentation.

Input Schema

Table / JSON Schema

| Name      | Required | Description | Default  |
|-----------|----------|-------------|----------|
| action    | Yes      |             |          |
| urls      | No       |             |          |
| depth     | No       |             |          |
| max_pages | No       |             |          |
| format    | No       |             | markdown |
| stealth   | No       |             |          |

Implementation Reference

  • The 'extract' tool is registered with @mcp.tool() decorator and defines the handler function that dispatches to three actions: 'extract', 'crawl', and 'map'. The handler validates inputs, checks cache, and calls the underlying implementation functions (_extract, _crawl, _sitemap).
    @mcp.tool(
        annotations=ToolAnnotations(
            readOnlyHint=True,
            openWorldHint=True,
        ),
    )
    @_wrap_tool("extract")
    async def extract(
        action: str,
        urls: list[str] | None = None,
        depth: int = 2,
        max_pages: int = 20,
        format: str = "markdown",
        stealth: bool = False,
    ) -> str:
        """Extract content from web pages, crawl sites, or map site structure.
        - extract: Get clean content from URLs (requires urls)
        - crawl: Deep crawl from root URLs (requires urls)
        - map: Discover site structure without content (requires urls)
        Use `help` tool for full documentation.
        """
        # Security: enforce hard limits to prevent resource exhaustion
        _MAX_EXTRACT_URLS = 20
        _MAX_CRAWL_PAGES = 100
        _MAX_DEPTH = 5
    
        max_pages = min(max_pages, _MAX_CRAWL_PAGES)
        depth = min(depth, _MAX_DEPTH)
    
        match action:
            case "extract":
                if not urls:
                    return "Error: urls is required for extract action"
                urls = urls[:_MAX_EXTRACT_URLS]
                cache_params = {"urls": sorted(urls), "format": format, "stealth": stealth}
                if _web_cache:
                    cached = _web_cache.get("extract", cache_params)
                    if cached:
                        return cached
                result = await _with_timeout(
                    _extract(urls=urls, format=format, stealth=stealth),
                    "extract",
                )
                if _web_cache and not result.startswith("Error"):
                    _web_cache.set("extract", cache_params, result)
                return result
    
            case "crawl":
                if not urls:
                    return "Error: urls is required for crawl action"
                urls = urls[:_MAX_EXTRACT_URLS]
                cache_params = {
                    "urls": sorted(urls),
                    "depth": depth,
                    "max_pages": max_pages,
                }
                if _web_cache:
                    cached = _web_cache.get("crawl", cache_params)
                    if cached:
                        return cached
                result = await _with_timeout(
                    _crawl(
                        urls=urls,
                        depth=depth,
                        max_pages=max_pages,
                        format=format,
                        stealth=stealth,
                    ),
                    "crawl",
                )
                if _web_cache and not result.startswith("Error"):
                    _web_cache.set("crawl", cache_params, result)
                return result
    
            case "map":
                if not urls:
                    return "Error: urls is required for map action"
                urls = urls[:_MAX_EXTRACT_URLS]
                cache_params = {
                    "urls": sorted(urls),
                    "depth": depth,
                    "max_pages": max_pages,
                }
                if _web_cache:
                    cached = _web_cache.get("map", cache_params)
                    if cached:
                        return cached
                result = await _with_timeout(
                    _sitemap(urls=urls, depth=depth, max_pages=max_pages),
                    "map",
                )
                if _web_cache and not result.startswith("Error"):
                    _web_cache.set("map", cache_params, result)
                return result
    
            case _:
                return (
                    f"Error: Unknown action '{action}'. Valid actions: extract, crawl, map"
                )
  • Core implementation of the 'extract' action. This async function uses Crawl4AI's AsyncWebCrawler to fetch and extract content from URLs. It handles document URLs (PDF, DOCX, etc.) via markitdown, applies security checks via is_safe_url(), and returns JSON results with title, content, and links.
    async def extract(
        urls: list[str],
        format: str = "markdown",
        stealth: bool = True,
        scan_full_page: bool = False,
        delay_before_return_html: float = 0.0,
        page_timeout: int = 60000,
    ) -> str:
        """Fetch each URL concurrently and return extracted content as JSON.

        Args:
            urls: List of URLs to extract.
            format: Output selector — "markdown" returns Markdown; any other
                value returns the page's cleaned HTML.
            stealth: Enable stealth mode for the shared browser.
            scan_full_page: Auto-scroll to trigger lazy-loaded content.
            delay_before_return_html: Seconds to wait after page load before capture.
            page_timeout: Page loading timeout in milliseconds.

        Returns:
            JSON string: one entry per URL with title/content/links, or an
            "error" field for URLs that were blocked or failed.
        """
        logger.info(f"Extracting content from {len(urls)} URLs")

        browser = await _get_crawler(stealth)
        limiter = _get_semaphore()

        # Assemble CrawlerRunConfig, adding SPA-friendly knobs only when they
        # differ from the library defaults.
        options: dict = {"verbose": False}
        if scan_full_page:
            options["scan_full_page"] = True
            options["scroll_delay"] = 0.3
        if delay_before_return_html > 0:
            options["delay_before_return_html"] = delay_before_return_html
        if page_timeout != 60000:
            options["page_timeout"] = page_timeout
        config = CrawlerRunConfig(**options)

        async def fetch_one(target: str):
            async with limiter:
                # SSRF guard: refuse URLs that fail the safety check.
                if not is_safe_url(target):
                    logger.warning(f"Skipping unsafe URL: {target}")
                    return {"url": target, "error": "Security Alert: Unsafe URL blocked"}

                # Documents (PDF, DOCX, ...) bypass the browser entirely.
                if _is_document_url(target):
                    logger.info(f"Document URL detected, using markitdown: {target}")
                    return await _extract_with_markitdown(target)

                try:
                    page = await browser.arun(
                        target,  # type: ignore[invalid-argument-type]
                        config=config,
                    )  # type: ignore[missing-argument]

                    if not page.success:
                        return {
                            "url": target,
                            "error": page.error_message or "Failed to extract",
                        }

                    body = page.markdown if format == "markdown" else page.cleaned_html
                    return {
                        "url": target,
                        "title": page.metadata.get("title", ""),
                        "content": body,
                        "links": {
                            "internal": page.links.get("internal", [])[:20],
                            "external": page.links.get("external", [])[:20],
                        },
                    }

                except Exception as exc:
                    logger.error(f"Error extracting {target}: {exc}")
                    return {
                        "url": target,
                        "error": str(exc),
                    }

        results = await asyncio.gather(*(fetch_one(u) for u in urls))

        logger.info(f"Extracted {len(results)} pages")
        return json.dumps(results, ensure_ascii=False, indent=2)
  • The _get_crawler() function manages a singleton browser pool using AsyncWebCrawler. It handles browser instance creation, recycling (when stealth mode changes), and retry logic with fresh browser data directory on failure.
    async def _get_crawler(stealth: bool = False) -> AsyncWebCrawler:
        """Return a shared AsyncWebCrawler, creating one if necessary.

        If the requested *stealth* mode differs from the cached instance's,
        the old browser is torn down and a fresh one is launched.  Most calls
        use the same stealth setting, so this is rare in practice.

        On failure (e.g. Playwright connection corrupted after a recycle),
        retries exactly once with a fresh browser data directory.
        """
        global _crawler_instance, _crawler_stealth

        async with _pool_lock:
            if _crawler_instance is not None:
                if _crawler_stealth == stealth:
                    # Fast path: the cached browser already matches.
                    return _crawler_instance

                # Stealth mode changed: dispose of the old browser first.
                logger.debug(f"Recycling browser (stealth {_crawler_stealth} -> {stealth})")
                try:
                    await _crawler_instance.__aexit__(None, None, None)
                except Exception as exc:
                    logger.debug(f"Error closing old crawler: {exc}")
                _crawler_instance = None

            # Launch a fresh browser; one retry with a clean data dir.
            for is_retry in (False, True):
                logger.info(f"Starting shared browser (stealth={stealth})...")
                fresh = AsyncWebCrawler(
                    verbose=False,
                    config=_browser_config(stealth),
                )
                try:
                    await fresh.__aenter__()
                    _crawler_instance = fresh
                    _crawler_stealth = stealth
                    logger.info("Shared browser started")
                    return _crawler_instance
                except Exception:
                    if is_retry:
                        logger.error("Failed to start shared browser after retry")
                        raise
                    logger.warning(
                        "Browser start failed, retrying with fresh data dir..."
                    )
                    _cleanup_browser_data_dir()

            # Unreachable safety net: the loop either returns or re-raises.
            raise RuntimeError("Failed to start shared browser")
  • The _extract_with_markitdown() function handles document conversion for PDF, DOCX, PPTX, and other document formats. Downloads the document via httpx and converts to Markdown using the markitdown library.
    async def _extract_with_markitdown(url: str) -> dict:
        """Download a document and convert it to Markdown via markitdown."""
        try:
            from markitdown import MarkItDown
        except ImportError:
            return {
                "url": url,
                "error": "markitdown not installed. Install with: pip install 'markitdown[pdf,docx,pptx]'",
            }

        try:
            async with _safe_httpx_client(timeout=60, follow_redirects=True) as client:
                resp = await client.get(url)
                resp.raise_for_status()

            # Convert in memory: markitdown dispatches on the file extension,
            # falling back to PDF when the URL path has none.
            import io

            doc_path = Path(urlparse(url).path)
            suffix = doc_path.suffix.lower() or ".pdf"
            converted = MarkItDown().convert_stream(
                io.BytesIO(resp.content), file_extension=suffix
            )

            return {
                "url": url,
                "title": doc_path.stem,
                "content": converted.text_content,
                "converter": "markitdown",
            }
        except Exception as e:
            logger.error(f"markitdown failed for {url}: {e}")
            return {"url": url, "error": f"Document conversion failed: {e}"}
  • The _browser_config() function creates BrowserConfig for Crawl4AI with per-process isolated data directory and optional stealth mode, handling Docker/CI environments with --no-sandbox flag.
    def _browser_config(stealth: bool = False) -> BrowserConfig:
        """Create BrowserConfig with per-process isolated data directory."""
        # Chromium cannot use its SUID sandbox inside unprivileged containers,
        # so Docker/CI environments require --no-sandbox.
        in_container = os.path.exists("/.dockerenv") or os.environ.get("container")
        launch_args: list[str] = (
            ["--no-sandbox", "--disable-dev-shm-usage"] if in_container else []
        )

        return BrowserConfig(
            headless=True,
            enable_stealth=stealth,
            verbose=False,
            user_data_dir=_BROWSER_DATA_DIR,
            extra_args=launch_args,
        )
Install Server

Other Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/n24q02m/wet-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server