"""
This type stub file was generated by pyright.
"""
import logging
from typing import Dict, List, Optional, Set, Tuple
from .filters import FilterChain
from .scorers import URLScorer
from . import DeepCrawlStrategy
from ..types import AsyncWebCrawler, CrawlResult, CrawlerRunConfig, RunManyReturn
# Batch size used by the traversal loop — presumably the number of URLs
# dispatched to the crawler per iteration; the actual value is defined in the
# implementation module and elided in this stub. TODO confirm semantics there.
BATCH_SIZE: int = ...
class BestFirstCrawlingStrategy(DeepCrawlStrategy):
    """
    Best-First Crawling Strategy using a priority queue.

    This strategy prioritizes URLs based on their score, ensuring that
    higher-value pages are crawled first. It reimplements the core traversal
    loop to use a priority queue while keeping URL validation and link
    discovery consistent with our design.

    Core methods:
        - arun: Returns either a list (batch mode) or an async generator
          (stream mode).
        - _arun_best_first: Core generator that uses a priority queue to
          yield CrawlResults.
        - can_process_url: Validates URLs and applies filtering (inherited
          behavior).
        - link_discovery: Extracts and validates links from a CrawlResult.

    NOTE: This is a type stub (pyright-generated); all bodies are elided.
    Behavioral details below that are inferred from names are marked as
    assumptions to confirm against the implementation module.
    """
    def __init__(self, max_depth: int, filter_chain: FilterChain = ..., url_scorer: Optional[URLScorer] = ..., include_external: bool = ..., max_pages: int = ..., logger: Optional[logging.Logger] = ...) -> None:
        """
        Configure the strategy.

        Args:
            max_depth: Maximum traversal depth from the start URL.
            filter_chain: FilterChain applied when validating candidate URLs.
            url_scorer: Scorer used to assign priorities to discovered URLs;
                optional.
            include_external: Presumably whether links to external domains are
                followed — TODO confirm against the implementation.
            max_pages: Presumably an upper bound on total pages crawled —
                TODO confirm against the implementation.
            logger: Logger for diagnostics; the implementation presumably
                creates a default when omitted — verify.
        """
        ...
    async def can_process_url(self, url: str, depth: int) -> bool:
        """
        Validate the URL format and apply filtering.

        For the starting URL (depth 0), filtering is bypassed.

        Args:
            url: Candidate URL to validate.
            depth: Depth at which the URL was discovered (0 = start URL).

        Returns:
            True if the URL passes validation (and, for depth > 0, the
            filter chain); False otherwise.
        """
        ...
    async def link_discovery(self, result: CrawlResult, source_url: str, current_depth: int, visited: Set[str], next_links: List[Tuple[str, Optional[str]]], depths: Dict[str, int]) -> None:
        """
        Extract links from the crawl result, validate them, and append new
        URLs (with their parent references) to next_links.

        Also updates the depths dictionary.

        Args:
            result: CrawlResult whose links are to be extracted.
            source_url: URL of the page that produced `result` (recorded as
                the parent reference).
            current_depth: Depth of the source page; discovered links are
                presumably recorded at current_depth + 1 — confirm.
            visited: Set of already-seen URLs, used to skip duplicates.
            next_links: Output list of (url, parent_url) tuples; mutated
                in place.
            depths: Mapping of URL -> depth; mutated in place.
        """
        ...
    async def arun(self, start_url: str, crawler: AsyncWebCrawler, config: Optional[CrawlerRunConfig] = ...) -> RunManyReturn:
        """
        Main entry point for best-first crawling.

        Returns either a list (batch mode) or an async generator (stream
        mode) of CrawlResults — the mode is presumably selected via
        `config` (e.g. a stream flag); confirm against the implementation.

        Args:
            start_url: URL at which the crawl begins (depth 0).
            crawler: AsyncWebCrawler instance used to fetch pages.
            config: Optional CrawlerRunConfig controlling the run.

        Returns:
            RunManyReturn: list of CrawlResults (batch) or an async
            generator yielding them (stream).
        """
        ...
    async def shutdown(self) -> None:
        """
        Signal cancellation and clean up resources.

        Intended to stop an in-progress crawl; safe-to-call-multiple-times
        semantics are not visible from this stub — confirm in the
        implementation before relying on them.
        """
        ...