"""
This type stub file was generated by pyright.
"""
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Set
from ..types import AsyncWebCrawler, CrawlResult, CrawlerRunConfig, RunManyReturn
class DeepCrawlDecorator:
"""Decorator that adds deep crawling capability to arun method."""
deep_crawl_active = ...
def __init__(self, crawler: AsyncWebCrawler) -> None:
...
def __call__(self, original_arun): # -> _Wrapped[..., Any, ..., CoroutineType[Any, Any, AsyncGenerator[RunManyReturn, Any] | RunManyReturn]]:
...
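# Hedged usage sketch: how the decorator might be wired onto a crawler's arun
# method. The `deep_crawl_strategy` attribute on CrawlerRunConfig is an
# assumption based on this module's docs, not guaranteed by this stub.
#
#     crawler = AsyncWebCrawler()
#     crawler.arun = DeepCrawlDecorator(crawler)(crawler.arun)
#     # Subsequent crawler.arun(...) calls delegate to config.deep_crawl_strategy
#     # when one is set; otherwise they fall through to the original arun.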
class DeepCrawlStrategy(ABC):
"""
Abstract base class for deep crawling strategies.
Core functions:
- arun: Main entry point that returns an async generator of CrawlResults.
- shutdown: Clean up resources.
- can_process_url: Validate a URL and decide whether to process it.
- _process_links: Extract and process links from a CrawlResult.
"""
async def arun(self, start_url: str, crawler: AsyncWebCrawler, config: Optional[CrawlerRunConfig] = ...) -> RunManyReturn:
"""
Traverse the given URL using the specified crawler.
Args:
start_url (str): The URL from which to start crawling.
crawler (AsyncWebCrawler): The crawler instance to use.
crawler_run_config (Optional[CrawlerRunConfig]): Crawler configuration.
Returns:
Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
"""
...
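# Illustrative call pattern (a sketch; whether a list or an async generator is
# returned is assumed to follow the run config's streaming setting):
#
#     results = await strategy.arun(start_url, crawler, config)
#     if isinstance(results, list):      # batch mode
#         for result in results:
#             ...
#     else:                              # streaming mode
#         async for result in results:
#             ...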
def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig): # -> CoroutineType[Any, Any, RunManyReturn]:
...
@abstractmethod
async def shutdown(self) -> None:
"""
Clean up resources used by the deep crawl strategy.
"""
...
@abstractmethod
async def can_process_url(self, url: str, depth: int) -> bool:
"""
Validate the URL format and apply custom filtering logic.
Args:
url (str): The URL to validate.
depth (int): The current depth in the crawl.
Returns:
bool: True if the URL should be processed, False otherwise.
"""
...
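# Hedged implementation sketch for a concrete subclass. `max_depth` and
# `filter_chain` are hypothetical attributes, not part of this stub:
#
#     from urllib.parse import urlparse
#
#     async def can_process_url(self, url: str, depth: int) -> bool:
#         if depth > self.max_depth:
#             return False
#         parsed = urlparse(url)
#         if parsed.scheme not in ("http", "https") or not parsed.netloc:
#             return False
#         return self.filter_chain is None or await self.filter_chain.apply(url)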
@abstractmethod
async def link_discovery(self, result: CrawlResult, source_url: str, current_depth: int, visited: Set[str], next_level: List[tuple], depths: Dict[str, int]) -> None:
"""
Extract and process links from the given crawl result.
This method should:
- Validate each extracted URL using can_process_url.
- Optionally score URLs.
- Append valid URLs (and their parent references) to the next_level list.
- Update the depths dictionary with the new depth for each URL.
Args:
result (CrawlResult): The result from a crawl operation.
source_url (str): The URL from which this result was obtained.
current_depth (int): The depth at which the source URL was processed.
visited (Set[str]): Set of already visited URLs.
next_level (List[tuple]): List of tuples (url, parent_url) for the next BFS level.
depths (Dict[str, int]): Mapping of URLs to their current depth.
"""
...
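# Hedged sketch of a BFS-style link_discovery, following the steps listed in
# the docstring above. It assumes result.links is a mapping with an "internal"
# list of {"href": ...} entries; adapt to the actual CrawlResult shape:
#
#     async def link_discovery(self, result, source_url, current_depth,
#                              visited, next_level, depths) -> None:
#         new_depth = current_depth + 1
#         for link in result.links.get("internal", []):
#             url = link.get("href")
#             if not url or url in visited:
#                 continue
#             if not await self.can_process_url(url, new_depth):
#                 continue
#             depths[url] = new_depth
#             next_level.append((url, source_url))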