scrape_with_stealth
Scrape websites that have strong anti-bot protection, using advanced stealth techniques such as undetected browser automation, randomized behavior patterns, and human-like interactions to avoid detection.
Instructions
Scrape a webpage using advanced stealth techniques to avoid detection.
This tool uses sophisticated anti-detection methods including:
- Undetected browser automation
- Randomized behavior patterns
- Human-like interactions
- Advanced evasion techniques
Use this for websites with strong anti-bot protection.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| request | Yes | Stealth scrape request parameters (see `StealthScrapeRequest` in the JSON Schema below) | |
Input Schema (JSON Schema)
{
"$defs": {
"StealthScrapeRequest": {
"description": "Request model for stealth scraping operations.",
"properties": {
"extract_config": {
"anyOf": [
{
"additionalProperties": true,
"type": "object"
},
{
"type": "null"
}
],
"default": null,
"description": "Configuration for data extraction",
"title": "Extract Config"
},
"method": {
"default": "selenium",
"description": "Stealth method: selenium or playwright",
"title": "Method",
"type": "string"
},
"scroll_page": {
"default": false,
"description": "Whether to scroll page to load dynamic content",
"title": "Scroll Page",
"type": "boolean"
},
"url": {
"description": "URL to scrape using stealth techniques",
"title": "Url",
"type": "string"
},
"wait_for_element": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "CSS selector to wait for",
"title": "Wait For Element"
}
},
"required": [
"url"
],
"title": "StealthScrapeRequest",
"type": "object"
}
},
"properties": {
"request": {
"$ref": "#/$defs/StealthScrapeRequest",
"title": "Request"
}
},
"required": [
"request"
],
"type": "object"
}
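For reference, a `request` payload that validates against this schema might look like the following. The URL and selectors are illustrative values, not part of the tool's contract:

```python
# Illustrative StealthScrapeRequest payload; all values are example data.
request = {
    "url": "https://example.com/articles",  # required; must include http:// or https://
    "method": "playwright",                 # "selenium" (default) or "playwright"
    "wait_for_element": ".article-body",    # optional CSS selector to wait for
    "scroll_page": True,                    # scroll to trigger lazy-loaded content
    "extract_config": {                     # optional extraction rules
        "title": "h1",
        "links": {"selector": "a", "attr": "href", "multiple": True},
    },
}
```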
Implementation Reference
- extractor/server.py:688-868 (handler) — Primary MCP tool handler and registration for `scrape_with_stealth`. Declares the input schema via `Annotated` Fields, returns a `ScrapeResponse`, and layers caching, retry, and delegation to the core scraper instance around the call.

```python
@app.tool()
@timing_decorator
async def scrape_with_stealth(
    url: Annotated[
        str,
        Field(
            ...,
            description=(
                "Target page URL; must include a protocol prefix (http:// or "
                "https://). Scraped with anti-detection techniques for sites "
                "with strong anti-bot protection."
            ),
        ),
    ],
    method: Annotated[
        str,
        Field(
            default="selenium",
            description=(
                'Stealth method. One of: "selenium" (undetected-chromedriver '
                'anti-detection) or "playwright" (Playwright stealth mode).'
            ),
        ),
    ],
    extract_config: Annotated[
        Optional[Dict[str, Any]],
        Field(
            default=None,
            description=(
                "Extraction config dict supporting CSS selectors and attribute "
                'extraction. Example: {"title": "h1", "content": ".article-body", '
                '"links": {"selector": "a", "attr": "href", "multiple": True}}'
            ),
        ),
    ],
    wait_for_element: Annotated[
        Optional[str],
        Field(
            default=None,
            description=(
                "CSS selector to wait for, ensuring dynamic content is fully "
                'loaded. Examples: ".content", "#main-article"'
            ),
        ),
    ],
    scroll_page: Annotated[
        bool,
        Field(
            default=False,
            description=(
                "Whether to scroll the page to load dynamic content; useful for "
                "infinite-scroll or lazy-loaded pages."
            ),
        ),
    ],
) -> ScrapeResponse:
    """
    Scrape a webpage using advanced stealth techniques to avoid detection.

    This tool uses sophisticated anti-detection methods including:
    - Undetected browser automation
    - Randomized behavior patterns
    - Human-like interactions
    - Advanced evasion techniques

    Use this for websites with strong anti-bot protection.

    Returns:
        ScrapeResponse object containing success status, scraped data,
        stealth method used, and performance metrics.
        Designed for bypassing sophisticated bot detection systems.
    """
    try:
        from .utils import URLValidator

        # Validate inputs
        if not URLValidator.is_valid_url(url):
            return ScrapeResponse(
                success=False,
                url=url,
                method=method,
                error="Invalid URL format",
            )

        if method not in ["selenium", "playwright"]:
            return ScrapeResponse(
                success=False,
                url=url,
                method=method,
                error="Method must be one of: selenium, playwright",
            )

        start_time = time.time()
        logger.info(f"Stealth scraping: {url} with method: {method}")

        # Apply rate limiting
        await rate_limiter.wait()

        # Normalize URL
        normalized_url = URLValidator.normalize_url(url)

        # Check cache first
        cache_key_data = {
            "extract_config": extract_config,
            "wait_for_element": wait_for_element,
            "scroll_page": scroll_page,
        }
        cached_result = cache_manager.get(
            normalized_url, f"stealth_{method}", cache_key_data
        )
        if cached_result:
            logger.info(f"Returning cached result for {normalized_url}")
            cached_result["from_cache"] = True
            return cached_result

        # Validate and normalize extract config
        if extract_config:
            extract_config = ConfigValidator.validate_extract_config(extract_config)

        # Perform stealth scraping with retry
        result = await retry_manager.retry_async(
            anti_detection_scraper.scrape_with_stealth,
            url=normalized_url,
            method=method,
            extract_config=extract_config,
            wait_for_element=wait_for_element,
            scroll_page=scroll_page,
        )

        duration_ms = int((time.time() - start_time) * 1000)
        success = "error" not in result

        if success:
            # Clean text content if present
            if "content" in result and "text" in result["content"]:
                result["content"]["text"] = TextCleaner.clean_text(
                    result["content"]["text"]
                )

            # Cache successful result
            cache_manager.set(
                normalized_url, f"stealth_{method}", result, cache_key_data
            )

            metrics_collector.record_request(
                normalized_url, True, duration_ms, f"stealth_{method}"
            )

            return ScrapeResponse(
                success=True,
                url=url,
                method=f"stealth_{method}",
                data=result,
                duration_ms=duration_ms,
                from_cache=False,
            )
        else:
            error_response = ErrorHandler.handle_scraping_error(
                Exception(result.get("error", "Unknown error")),
                normalized_url,
                f"stealth_{method}",
            )
            metrics_collector.record_request(
                normalized_url,
                False,
                duration_ms,
                f"stealth_{method}",
                error_response["error"]["category"],
            )
            return ScrapeResponse(
                success=False,
                url=url,
                method=f"stealth_{method}",
                error=error_response["error"]["message"],
            )

    except Exception as e:
        duration_ms = (
            int((time.time() - start_time) * 1000) if "start_time" in locals() else 0
        )
        error_response = ErrorHandler.handle_scraping_error(e, url, f"stealth_{method}")
        metrics_collector.record_request(
            url,
            False,
            duration_ms,
            f"stealth_{method}",
            error_response["error"]["category"],
        )
        return ScrapeResponse(
            success=False,
            url=url,
            method=f"stealth_{method}",
            error=error_response["error"]["message"],
        )
```
- extractor/advanced_features.py:130-166 (handler) — Core implementation of the stealth scraping logic in the `AntiDetectionScraper` class. Dispatches to the Selenium- or Playwright-based stealth method, both of which simulate human-like behavior.

```python
async def scrape_with_stealth(
    self,
    url: str,
    method: str = "selenium",
    extract_config: Optional[Dict[str, Any]] = None,
    wait_for_element: Optional[str] = None,
    scroll_page: bool = False,
) -> Dict[str, Any]:
    """
    Scrape using stealth techniques to avoid detection.

    Args:
        url: URL to scrape
        method: "selenium" or "playwright"
        extract_config: Data extraction configuration
        wait_for_element: Element to wait for
        scroll_page: Whether to scroll the page to load dynamic content
    """
    try:
        if method == "selenium":
            return await self._scrape_with_selenium_stealth(
                url, extract_config, wait_for_element, scroll_page
            )
        elif method == "playwright":
            return await self._scrape_with_playwright_stealth(
                url, extract_config, wait_for_element, scroll_page
            )
        else:
            raise ValueError(f"Unknown stealth method: {method}")
    except Exception as e:
        logger.error(f"Stealth scraping failed for {url}: {str(e)}")
        return {"error": str(e), "url": url}
    finally:
        await self.cleanup()
```
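For testing outside the MCP layer, this core method can be invoked directly. A minimal sketch, assuming `AntiDetectionScraper` is importable from `extractor.advanced_features` (the import path is inferred from the reference above):

```python
import asyncio

from extractor.advanced_features import AntiDetectionScraper


async def main() -> None:
    scraper = AntiDetectionScraper()
    # Returns a dict; on failure it contains an "error" key (see the except branch above).
    result = await scraper.scrape_with_stealth(
        "https://example.com",
        method="selenium",
        wait_for_element="#main",
        scroll_page=True,
    )
    if "error" in result:
        print("failed:", result["error"])
    else:
        print("scraped:", result.get("url"))


asyncio.run(main())
```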
- extractor/server.py:62-72 (schema) — Pydantic output model used by the `scrape_with_stealth` tool.

```python
class ScrapeResponse(BaseModel):
    """Response model for scraping operations."""

    success: bool = Field(..., description="Whether the operation succeeded")
    url: str = Field(..., description="The URL that was scraped")
    method: str = Field(..., description="Scraping method used")
    data: Optional[Dict[str, Any]] = Field(default=None, description="Scraped data")
    metadata: Optional[Dict[str, Any]] = Field(default=None, description="Page metadata")
    error: Optional[str] = Field(default=None, description="Error message, if any")
    timestamp: datetime = Field(default_factory=datetime.now, description="Scrape timestamp")
```
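Constructing and serializing this model is standard Pydantic usage. A short sketch, assuming Pydantic v2 (for `model_dump_json`) and that the class is importable from `extractor.server`:

```python
from extractor.server import ScrapeResponse

resp = ScrapeResponse(
    success=True,
    url="https://example.com",
    method="stealth_selenium",
    data={"content": {"text": "..."}},
)
# error and metadata stay None unless set; timestamp defaults to datetime.now().
print(resp.model_dump_json(indent=2))
```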
- extractor/server.py:37-37 (registration) — Global `AntiDetectionScraper` instance created for use by the `scrape_with_stealth` tool.

```python
anti_detection_scraper = AntiDetectionScraper()
```
- Helper method for Selenium-based stealth scraping, covering undetected-driver setup and human-like behavior simulation.

```python
async def _scrape_with_selenium_stealth(
    self,
    url: str,
    extract_config: Optional[Dict[str, Any]],
    wait_for_element: Optional[str],
    scroll_page: bool,
) -> Dict[str, Any]:
    """Scrape using Selenium with stealth techniques."""
    self.driver = await self._get_undetected_chrome_driver()

    # Random delay before navigation
    await asyncio.sleep(random.uniform(1, 3))  # nosec B311

    self.driver.get(url)

    # Wait for page load
    await asyncio.sleep(random.uniform(2, 4))  # nosec B311

    # Wait for specific element if specified
    if wait_for_element:
        try:
            WebDriverWait(self.driver, settings.browser_timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_element))
            )
        except TimeoutException:
            logger.warning(f"Timeout waiting for element: {wait_for_element}")

    # Scroll page to load dynamic content
    if scroll_page:
        await self._scroll_page_selenium()

    # Human-like mouse movements
    await self._simulate_human_behavior_selenium()

    # Extract data
    result = await self._extract_data_selenium(extract_config)
    result["url"] = self.driver.current_url

    return result
```
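The `_get_undetected_chrome_driver` helper is referenced above but not shown. A plausible minimal sketch of such a helper, built on the real `undetected-chromedriver` package; this is an assumption about the implementation, not the project's actual code:

```python
import asyncio

import undetected_chromedriver as uc  # pip install undetected-chromedriver


async def get_undetected_chrome_driver() -> uc.Chrome:
    """Hypothetical stand-in for the helper referenced above."""

    def _build() -> uc.Chrome:
        options = uc.ChromeOptions()
        options.add_argument("--headless=new")  # optional; headful is often stealthier
        options.add_argument("--disable-blink-features=AutomationControlled")
        # uc.Chrome patches chromedriver to evade common webdriver checks.
        return uc.Chrome(options=options)

    # Driver construction is blocking, so run it off the event loop.
    return await asyncio.to_thread(_build)
```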