Skip to main content
Glama

scrape_with_stealth

Scrape websites with strong anti-bot protection using advanced stealth techniques like undetected browser automation, randomized behavior patterns, and human-like interactions to avoid detection.

Instructions

Scrape a webpage using advanced stealth techniques to avoid detection.

This tool uses sophisticated anti-detection methods including:

  • Undetected browser automation

  • Randomized behavior patterns

  • Human-like interactions

  • Advanced evasion techniques

Use this for websites with strong anti-bot protection.

Input Schema

Table / JSON Schema

| Name    | Required | Description | Default |
|---------|----------|-------------|---------|
| request | Yes      |             |         |

Implementation Reference

  • Primary MCP tool handler and registration for 'scrape_with_stealth'. Includes input schema via Annotated Fields, output ScrapeResponse, caching, retry, and delegation to core scraper instance.
    @app.tool()
    @timing_decorator
    async def scrape_with_stealth(
        url: Annotated[
            str,
            Field(
                ...,
                description="目标网页 URL,必须包含协议前缀(http:// 或 https://),使用反检测技术抓取复杂的反爬虫网站",
            ),
        ],
        method: Annotated[
            str,
            Field(
                default="selenium",
                description="""隐身方法选择,可选值:
                    "selenium"(使用 undetected-chromedriver 反检测技术)、
                    "playwright"(使用 Playwright 隐身模式)""",
            ),
        ],
        extract_config: Annotated[
            Optional[Dict[str, Any]],
            Field(
                default=None,
                description="""数据提取配置字典,支持 CSS 选择器和属性提取。
                    示例:{"title": "h1", "content": ".article-body", "links": {"selector": "a", "attr": "href", "multiple": True}}""",
            ),
        ],
        wait_for_element: Annotated[
            Optional[str],
            Field(
                default=None,
                description="""等待加载的元素 CSS 选择器,确保动态内容完全加载。
                    示例:".content"、"#main-article\"""",
            ),
        ],
        scroll_page: Annotated[
            bool,
            Field(
                default=False,
                description="是否滚动页面以加载动态内容,适用于无限滚动或懒加载的页面",
            ),
        ],
    ) -> ScrapeResponse:
        """
        Scrape a webpage using advanced stealth techniques to avoid detection.

        This tool uses sophisticated anti-detection methods including:
        - Undetected browser automation
        - Randomized behavior patterns
        - Human-like interactions
        - Advanced evasion techniques

        Use this for websites with strong anti-bot protection.

        Returns:
            ScrapeResponse object containing success status, scraped data, stealth method used, and performance metrics.
            Designed for bypassing sophisticated bot detection systems.
        """
        try:
            from .utils import URLValidator

            # Validate inputs before doing any network work.
            if not URLValidator.is_valid_url(url):
                return ScrapeResponse(
                    success=False,
                    url=url,
                    method=method,
                    error="Invalid URL format",
                )

            if method not in ["selenium", "playwright"]:
                return ScrapeResponse(
                    success=False,
                    url=url,
                    method=method,
                    error="Method must be one of: selenium, playwright",
                )

            start_time = time.time()
            logger.info(f"Stealth scraping: {url} with method: {method}")

            # Apply rate limiting
            await rate_limiter.wait()

            # Normalize URL
            normalized_url = URLValidator.normalize_url(url)

            # Check cache first; the key includes every option that affects output.
            cache_key_data = {
                "extract_config": extract_config,
                "wait_for_element": wait_for_element,
                "scroll_page": scroll_page,
            }
            cached_result = cache_manager.get(
                normalized_url, f"stealth_{method}", cache_key_data
            )
            if cached_result:
                logger.info(f"Returning cached result for {normalized_url}")
                # Fix: previously this returned the raw cached dict (after mutating
                # it with a "from_cache" key), violating the declared ScrapeResponse
                # return type. Wrap it exactly like the live-scrape path does.
                return ScrapeResponse(
                    success=True,
                    url=url,
                    method=f"stealth_{method}",
                    data=cached_result,
                    duration_ms=int((time.time() - start_time) * 1000),
                    from_cache=True,
                )

            # Validate and normalize extract config
            if extract_config:
                extract_config = ConfigValidator.validate_extract_config(extract_config)

            # Perform stealth scraping with retry
            result = await retry_manager.retry_async(
                anti_detection_scraper.scrape_with_stealth,
                url=normalized_url,
                method=method,
                extract_config=extract_config,
                wait_for_element=wait_for_element,
                scroll_page=scroll_page,
            )

            duration_ms = int((time.time() - start_time) * 1000)
            # The scraper signals failure by embedding an "error" key in the dict.
            success = "error" not in result

            if success:
                # Clean text content if present
                if "content" in result and "text" in result["content"]:
                    result["content"]["text"] = TextCleaner.clean_text(
                        result["content"]["text"]
                    )

                # Cache successful result (failures are never cached, so transient
                # errors can be retried on the next call).
                cache_manager.set(
                    normalized_url, f"stealth_{method}", result, cache_key_data
                )

                metrics_collector.record_request(
                    normalized_url, True, duration_ms, f"stealth_{method}"
                )

                return ScrapeResponse(
                    success=True,
                    url=url,
                    method=f"stealth_{method}",
                    data=result,
                    duration_ms=duration_ms,
                    from_cache=False,
                )
            else:
                error_response = ErrorHandler.handle_scraping_error(
                    Exception(result.get("error", "Unknown error")),
                    normalized_url,
                    f"stealth_{method}",
                )
                metrics_collector.record_request(
                    normalized_url,
                    False,
                    duration_ms,
                    f"stealth_{method}",
                    error_response["error"]["category"],
                )
                return ScrapeResponse(
                    success=False,
                    url=url,
                    method=f"stealth_{method}",
                    error=error_response["error"]["message"],
                )

        except Exception as e:
            # start_time is unset if the failure happened before timing began,
            # hence the locals() guard.
            duration_ms = (
                int((time.time() - start_time) * 1000) if "start_time" in locals() else 0
            )
            error_response = ErrorHandler.handle_scraping_error(e, url, f"stealth_{method}")
            metrics_collector.record_request(
                url,
                False,
                duration_ms,
                f"stealth_{method}",
                error_response["error"]["category"],
            )
            return ScrapeResponse(
                success=False,
                url=url,
                method=f"stealth_{method}",
                error=error_response["error"]["message"],
            )
  • Core implementation of stealth scraping logic in AntiDetectionScraper class. Dispatches to Selenium or Playwright based stealth methods with human-like behaviors.
    async def scrape_with_stealth(
        self,
        url: str,
        method: str = "selenium",
        extract_config: Optional[Dict[str, Any]] = None,
        wait_for_element: Optional[str] = None,
        scroll_page: bool = False,
    ) -> Dict[str, Any]:
        """
        Dispatch a stealth scrape to the selected backend.

        Args:
            url: URL to scrape
            method: "selenium" or "playwright"
            extract_config: Data extraction configuration
            wait_for_element: Element to wait for
            scroll_page: Whether to scroll the page to load dynamic content

        Returns:
            The backend's result dict, or ``{"error": ..., "url": ...}`` when
            the backend raises or the method name is unknown. Browser resources
            are cleaned up in all cases.
        """
        backends = {
            "selenium": self._scrape_with_selenium_stealth,
            "playwright": self._scrape_with_playwright_stealth,
        }
        try:
            backend = backends.get(method)
            if backend is None:
                raise ValueError(f"Unknown stealth method: {method}")
            return await backend(url, extract_config, wait_for_element, scroll_page)

        except Exception as e:
            # Errors are reported in-band rather than re-raised, matching the
            # tool handler's "error" key convention.
            logger.error(f"Stealth scraping failed for {url}: {str(e)}")
            return {"error": str(e), "url": url}

        finally:
            await self.cleanup()
  • Pydantic output schema model used by the scrape_with_stealth tool.
    class ScrapeResponse(BaseModel):
        """Response model for scraping operations.

        Returned by every scraping tool handler; on failure ``success`` is False
        and ``error`` carries a human-readable message.
        """

        success: bool = Field(..., description="操作是否成功")
        url: str = Field(..., description="被抓取的URL")
        method: str = Field(..., description="使用的抓取方法")
        data: Optional[Dict[str, Any]] = Field(default=None, description="抓取到的数据")
        metadata: Optional[Dict[str, Any]] = Field(default=None, description="页面元数据")
        error: Optional[str] = Field(default=None, description="错误信息(如果有)")
        timestamp: datetime = Field(default_factory=datetime.now, description="抓取时间戳")
        # Fix: the scrape_with_stealth handler passes these two keywords; without
        # declared fields pydantic silently dropped them from the response.
        duration_ms: Optional[int] = Field(default=None, description="请求耗时(毫秒)")
        from_cache: bool = Field(default=False, description="结果是否来自缓存")
  • Global instance of AntiDetectionScraper created for use by the scrape_with_stealth tool.
    anti_detection_scraper = AntiDetectionScraper()
  • Helper method for Selenium-based stealth scraping, including undetected driver setup and human-like behavior simulation.
    async def _scrape_with_selenium_stealth(
        self,
        url: str,
        extract_config: Optional[Dict[str, Any]],
        wait_for_element: Optional[str],
        scroll_page: bool,
    ) -> Dict[str, Any]:
        """Scrape using Selenium with stealth techniques.

        Args:
            url: Page to load.
            extract_config: Extraction configuration forwarded to
                ``_extract_data_selenium`` (None extracts default content).
            wait_for_element: Optional CSS selector to wait for before
                extraction; a timeout is logged but not fatal.
            scroll_page: When True, scrolls the page first so lazily-loaded
                content is present before extraction.

        Returns:
            The extraction result dict with ``"url"`` set to the final
            (post-redirect) URL reported by the driver.
        """
        # NOTE(review): stores the driver on self — presumably so cleanup()
        # can close it later; this makes the instance non-reentrant. Confirm.
        self.driver = await self._get_undetected_chrome_driver()

        # Random delay before navigation (jitter makes timing look human;
        # random is fine here — not security-sensitive, hence nosec).
        await asyncio.sleep(random.uniform(1, 3))  # nosec B311

        self.driver.get(url)

        # Wait for page load (fixed random settle time, not a readiness check)
        await asyncio.sleep(random.uniform(2, 4))  # nosec B311

        # Wait for specific element if specified
        if wait_for_element:
            try:
                WebDriverWait(self.driver, settings.browser_timeout).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_element))
                )
            except TimeoutException:
                # Best-effort: proceed with whatever has loaded so far.
                logger.warning(f"Timeout waiting for element: {wait_for_element}")

        # Scroll page to load dynamic content
        if scroll_page:
            await self._scroll_page_selenium()

        # Human-like mouse movements
        await self._simulate_human_behavior_selenium()

        # Extract data
        result = await self._extract_data_selenium(extract_config)
        # current_url reflects any redirects that happened during navigation.
        result["url"] = self.driver.current_url

        return result
Install Server

Other Tools

Related Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server