scrape_with_stealth
Scrape websites that have strong anti-bot protection, using advanced stealth techniques such as undetected browser automation, randomized behavior patterns, and human-like interactions to avoid detection.
Instructions
Scrape a webpage using advanced stealth techniques to avoid detection.
This tool uses sophisticated anti-detection methods including:
- Undetected browser automation
- Randomized behavior patterns
- Human-like interactions
- Advanced evasion techniques
Use this for websites with strong anti-bot protection.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| request | Yes | Stealth scrape request parameters (see `StealthScrapeRequest` in the JSON Schema below) | |
Input Schema (JSON Schema)
{
"$defs": {
"StealthScrapeRequest": {
"description": "Request model for stealth scraping operations.",
"properties": {
"extract_config": {
"anyOf": [
{
"additionalProperties": true,
"type": "object"
},
{
"type": "null"
}
],
"default": null,
"description": "Configuration for data extraction",
"title": "Extract Config"
},
"method": {
"default": "selenium",
"description": "Stealth method: selenium or playwright",
"title": "Method",
"type": "string"
},
"scroll_page": {
"default": false,
"description": "Whether to scroll page to load dynamic content",
"title": "Scroll Page",
"type": "boolean"
},
"url": {
"description": "URL to scrape using stealth techniques",
"title": "Url",
"type": "string"
},
"wait_for_element": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "CSS selector to wait for",
"title": "Wait For Element"
}
},
"required": [
"url"
],
"title": "StealthScrapeRequest",
"type": "object"
}
},
"properties": {
"request": {
"$ref": "#/$defs/StealthScrapeRequest",
"title": "Request"
}
},
"required": [
"request"
],
"type": "object"
}
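For reference, a `request` payload that validates against this schema might look like the following. The URL and selectors are illustrative values, not part of the tool's contract:

```python
# Illustrative StealthScrapeRequest payload; all values are example data.
request = {
    "url": "https://example.com/articles",  # required; must include http:// or https://
    "method": "playwright",                 # "selenium" (default) or "playwright"
    "wait_for_element": ".article-body",    # optional CSS selector to wait for
    "scroll_page": True,                    # scroll to trigger lazy-loaded content
    "extract_config": {                     # optional extraction rules
        "title": "h1",
        "links": {"selector": "a", "attr": "href", "multiple": True},
    },
}
```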
Implementation Reference
- extractor/server.py:688-868 (handler) — Primary MCP tool handler and registration for `scrape_with_stealth`. Declares the input schema via `Annotated` Fields, returns a `ScrapeResponse`, and layers caching, retry, and delegation to the core scraper instance around the call.

```python
@app.tool()
@timing_decorator
async def scrape_with_stealth(
    url: Annotated[
        str,
        Field(
            ...,
            description=(
                "Target page URL; must include a protocol prefix (http:// or "
                "https://). Scraped with anti-detection techniques for sites "
                "with strong anti-bot protection."
            ),
        ),
    ],
    method: Annotated[
        str,
        Field(
            default="selenium",
            description=(
                'Stealth method. One of: "selenium" (undetected-chromedriver '
                'anti-detection) or "playwright" (Playwright stealth mode).'
            ),
        ),
    ],
    extract_config: Annotated[
        Optional[Dict[str, Any]],
        Field(
            default=None,
            description=(
                "Extraction config dict supporting CSS selectors and attribute "
                'extraction. Example: {"title": "h1", "content": ".article-body", '
                '"links": {"selector": "a", "attr": "href", "multiple": True}}'
            ),
        ),
    ],
    wait_for_element: Annotated[
        Optional[str],
        Field(
            default=None,
            description=(
                "CSS selector to wait for, ensuring dynamic content is fully "
                'loaded. Examples: ".content", "#main-article"'
            ),
        ),
    ],
    scroll_page: Annotated[
        bool,
        Field(
            default=False,
            description=(
                "Whether to scroll the page to load dynamic content; useful for "
                "infinite-scroll or lazy-loaded pages."
            ),
        ),
    ],
) -> ScrapeResponse:
    """
    Scrape a webpage using advanced stealth techniques to avoid detection.

    This tool uses sophisticated anti-detection methods including:
    - Undetected browser automation
    - Randomized behavior patterns
    - Human-like interactions
    - Advanced evasion techniques

    Use this for websites with strong anti-bot protection.

    Returns:
        ScrapeResponse object containing success status, scraped data,
        stealth method used, and performance metrics.
        Designed for bypassing sophisticated bot detection systems.
    """
    try:
        from .utils import URLValidator

        # Validate inputs
        if not URLValidator.is_valid_url(url):
            return ScrapeResponse(
                success=False,
                url=url,
                method=method,
                error="Invalid URL format",
            )

        if method not in ["selenium", "playwright"]:
            return ScrapeResponse(
                success=False,
                url=url,
                method=method,
                error="Method must be one of: selenium, playwright",
            )

        start_time = time.time()
        logger.info(f"Stealth scraping: {url} with method: {method}")

        # Apply rate limiting
        await rate_limiter.wait()

        # Normalize URL
        normalized_url = URLValidator.normalize_url(url)

        # Check cache first
        cache_key_data = {
            "extract_config": extract_config,
            "wait_for_element": wait_for_element,
            "scroll_page": scroll_page,
        }
        cached_result = cache_manager.get(
            normalized_url, f"stealth_{method}", cache_key_data
        )
        if cached_result:
            logger.info(f"Returning cached result for {normalized_url}")
            cached_result["from_cache"] = True
            return cached_result

        # Validate and normalize extract config
        if extract_config:
            extract_config = ConfigValidator.validate_extract_config(extract_config)

        # Perform stealth scraping with retry
        result = await retry_manager.retry_async(
            anti_detection_scraper.scrape_with_stealth,
            url=normalized_url,
            method=method,
            extract_config=extract_config,
            wait_for_element=wait_for_element,
            scroll_page=scroll_page,
        )

        duration_ms = int((time.time() - start_time) * 1000)
        success = "error" not in result

        if success:
            # Clean text content if present
            if "content" in result and "text" in result["content"]:
                result["content"]["text"] = TextCleaner.clean_text(
                    result["content"]["text"]
                )

            # Cache successful result
            cache_manager.set(
                normalized_url, f"stealth_{method}", result, cache_key_data
            )

            metrics_collector.record_request(
                normalized_url, True, duration_ms, f"stealth_{method}"
            )

            return ScrapeResponse(
                success=True,
                url=url,
                method=f"stealth_{method}",
                data=result,
                duration_ms=duration_ms,
                from_cache=False,
            )
        else:
            error_response = ErrorHandler.handle_scraping_error(
                Exception(result.get("error", "Unknown error")),
                normalized_url,
                f"stealth_{method}",
            )
            metrics_collector.record_request(
                normalized_url,
                False,
                duration_ms,
                f"stealth_{method}",
                error_response["error"]["category"],
            )
            return ScrapeResponse(
                success=False,
                url=url,
                method=f"stealth_{method}",
                error=error_response["error"]["message"],
            )

    except Exception as e:
        duration_ms = (
            int((time.time() - start_time) * 1000) if "start_time" in locals() else 0
        )
        error_response = ErrorHandler.handle_scraping_error(e, url, f"stealth_{method}")
        metrics_collector.record_request(
            url,
            False,
            duration_ms,
            f"stealth_{method}",
            error_response["error"]["category"],
        )
        return ScrapeResponse(
            success=False,
            url=url,
            method=f"stealth_{method}",
            error=error_response["error"]["message"],
        )
```
- extractor/advanced_features.py:130-166 (handler) — Core implementation of the stealth scraping logic in the `AntiDetectionScraper` class. Dispatches to the Selenium- or Playwright-based stealth method, both of which simulate human-like behavior.

```python
async def scrape_with_stealth(
    self,
    url: str,
    method: str = "selenium",
    extract_config: Optional[Dict[str, Any]] = None,
    wait_for_element: Optional[str] = None,
    scroll_page: bool = False,
) -> Dict[str, Any]:
    """
    Scrape using stealth techniques to avoid detection.

    Args:
        url: URL to scrape
        method: "selenium" or "playwright"
        extract_config: Data extraction configuration
        wait_for_element: Element to wait for
        scroll_page: Whether to scroll the page to load dynamic content
    """
    try:
        if method == "selenium":
            return await self._scrape_with_selenium_stealth(
                url, extract_config, wait_for_element, scroll_page
            )
        elif method == "playwright":
            return await self._scrape_with_playwright_stealth(
                url, extract_config, wait_for_element, scroll_page
            )
        else:
            raise ValueError(f"Unknown stealth method: {method}")
    except Exception as e:
        logger.error(f"Stealth scraping failed for {url}: {str(e)}")
        return {"error": str(e), "url": url}
    finally:
        await self.cleanup()
```
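For testing outside the MCP layer, this core method can be invoked directly. A minimal sketch, assuming `AntiDetectionScraper` is importable from `extractor.advanced_features` (the import path is inferred from the reference above):

```python
import asyncio

from extractor.advanced_features import AntiDetectionScraper


async def main() -> None:
    scraper = AntiDetectionScraper()
    # Returns a dict; on failure it contains an "error" key (see the except branch above).
    result = await scraper.scrape_with_stealth(
        "https://example.com",
        method="selenium",
        wait_for_element="#main",
        scroll_page=True,
    )
    if "error" in result:
        print("failed:", result["error"])
    else:
        print("scraped:", result.get("url"))


asyncio.run(main())
```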
- extractor/server.py:62-72 (schema) — Pydantic output model used by the `scrape_with_stealth` tool.

```python
class ScrapeResponse(BaseModel):
    """Response model for scraping operations."""

    success: bool = Field(..., description="Whether the operation succeeded")
    url: str = Field(..., description="The URL that was scraped")
    method: str = Field(..., description="Scraping method used")
    data: Optional[Dict[str, Any]] = Field(default=None, description="Scraped data")
    metadata: Optional[Dict[str, Any]] = Field(default=None, description="Page metadata")
    error: Optional[str] = Field(default=None, description="Error message, if any")
    timestamp: datetime = Field(default_factory=datetime.now, description="Scrape timestamp")
```
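Constructing and serializing this model is standard Pydantic usage. A short sketch, assuming Pydantic v2 (for `model_dump_json`) and that the class is importable from `extractor.server`:

```python
from extractor.server import ScrapeResponse

resp = ScrapeResponse(
    success=True,
    url="https://example.com",
    method="stealth_selenium",
    data={"content": {"text": "..."}},
)
# error and metadata stay None unless set; timestamp defaults to datetime.now().
print(resp.model_dump_json(indent=2))
```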
- extractor/server.py:37-37 (registration) — Global `AntiDetectionScraper` instance created for use by the `scrape_with_stealth` tool.

```python
anti_detection_scraper = AntiDetectionScraper()
```
- Helper method for Selenium-based stealth scraping, covering undetected-driver setup and human-like behavior simulation.

```python
async def _scrape_with_selenium_stealth(
    self,
    url: str,
    extract_config: Optional[Dict[str, Any]],
    wait_for_element: Optional[str],
    scroll_page: bool,
) -> Dict[str, Any]:
    """Scrape using Selenium with stealth techniques."""
    self.driver = await self._get_undetected_chrome_driver()

    # Random delay before navigation
    await asyncio.sleep(random.uniform(1, 3))  # nosec B311

    self.driver.get(url)

    # Wait for page load
    await asyncio.sleep(random.uniform(2, 4))  # nosec B311

    # Wait for specific element if specified
    if wait_for_element:
        try:
            WebDriverWait(self.driver, settings.browser_timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, wait_for_element))
            )
        except TimeoutException:
            logger.warning(f"Timeout waiting for element: {wait_for_element}")

    # Scroll page to load dynamic content
    if scroll_page:
        await self._scroll_page_selenium()

    # Human-like mouse movements
    await self._simulate_human_behavior_selenium()

    # Extract data
    result = await self._extract_data_selenium(extract_config)
    result["url"] = self.driver.current_url

    return result
```
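The `_get_undetected_chrome_driver` helper is referenced above but not shown. A plausible minimal sketch of such a helper, built on the real `undetected-chromedriver` package; this is an assumption about the implementation, not the project's actual code:

```python
import asyncio

import undetected_chromedriver as uc  # pip install undetected-chromedriver


async def get_undetected_chrome_driver() -> uc.Chrome:
    """Hypothetical stand-in for the helper referenced above."""

    def _build() -> uc.Chrome:
        options = uc.ChromeOptions()
        options.add_argument("--headless=new")  # optional; headful is often stealthier
        options.add_argument("--disable-blink-features=AutomationControlled")
        # uc.Chrome patches chromedriver to evade common webdriver checks.
        return uc.Chrome(options=options)

    # Driver construction is blocking, so run it off the event loop.
    return await asyncio.to_thread(_build)
```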