"""
This type stub file was generated by pyright.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from dataclasses import dataclass
from pathlib import Path
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import LLMConfig
from crawl4ai.models import CrawlResult, Link
"""
Adaptive Web Crawler for Crawl4AI
This module implements adaptive information foraging for efficient web crawling.
It determines when sufficient information has been gathered to answer a query,
avoiding unnecessary crawls while ensuring comprehensive coverage.
"""
@dataclass
class CrawlState:
    """Tracks the current state of adaptive crawling.

    Stub declaration: every field default is elided as ``...``; the concrete
    defaults are defined by the implementation module, not here.
    """

    # URLs, results and links accumulated by the crawl.
    crawled_urls: Set[str] = ...
    knowledge_base: List[CrawlResult] = ...
    pending_links: List[Link] = ...
    query: str = ...
    metrics: Dict[str, float] = ...

    # Term/document bookkeeping (presumably consumed by the statistical
    # strategy -- confirm against the implementation).
    term_frequencies: Dict[str, int] = ...
    document_frequencies: Dict[str, int] = ...
    documents_with_terms: Dict[str, Set[int]] = ...
    total_documents: int = ...
    new_terms_history: List[int] = ...
    crawl_order: List[str] = ...

    # Embedding-related fields; the array-like ones are typed ``Any`` in the
    # stub, so their concrete type is not visible here.
    kb_embeddings: Optional[Any] = ...
    query_embeddings: Optional[Any] = ...
    expanded_queries: List[str] = ...
    coverage_shape: Optional[Any] = ...
    semantic_gaps: List[Tuple[List[float], float]] = ...
    embedding_model: str = ...

    def save(self, path: Union[str, Path]) -> None:
        """Save state to disk for persistence.

        Args:
            path: Destination, given as a string or a ``pathlib.Path``.
        """
        ...

    @classmethod
    def load(cls, path: Union[str, Path]) -> CrawlState:
        """Load state from disk.

        Args:
            path: Source location, given as a string or a ``pathlib.Path``.

        Returns:
            The loaded ``CrawlState``.
        """
        ...
@dataclass
class AdaptiveConfig:
    """Configuration for adaptive crawling.

    Stub declaration: every field default is elided as ``...``; the concrete
    defaults are defined by the implementation module, not here.
    """

    # General crawl limits and strategy selection.
    confidence_threshold: float = ...
    max_depth: int = ...
    max_pages: int = ...
    top_k_links: int = ...
    min_gain_threshold: float = ...
    strategy: str = ...

    # Thresholds and weights for the individual scoring signals.
    saturation_threshold: float = ...
    consistency_threshold: float = ...
    coverage_weight: float = ...
    consistency_weight: float = ...
    saturation_weight: float = ...
    relevance_weight: float = ...
    novelty_weight: float = ...
    authority_weight: float = ...

    # State persistence.
    save_state: bool = ...
    state_path: Optional[str] = ...

    # Embedding-related settings.
    embedding_model: str = ...
    embedding_llm_config: Optional[Union[LLMConfig, Dict]] = ...
    n_query_variations: int = ...
    coverage_threshold: float = ...
    alpha_shape_alpha: float = ...
    embedding_min_confidence_threshold: float = ...
    embedding_coverage_radius: float = ...
    embedding_k_exp: float = ...
    embedding_nearest_weight: float = ...
    embedding_top_k_weight: float = ...
    embedding_overlap_threshold: float = ...
    embedding_min_relative_improvement: float = ...
    embedding_validation_min_score: float = ...
    embedding_quality_min_confidence: float = ...
    embedding_quality_max_confidence: float = ...
    embedding_quality_scale_factor: float = ...

    def validate(self) -> None:
        """Validate configuration parameters.

        How a violation is reported is not visible in this stub.
        """
        ...
class CrawlStrategy(ABC):
    """Abstract base class for crawling strategies.

    Subclasses must implement all four coroutines below.
    """

    @abstractmethod
    async def calculate_confidence(self, state: CrawlState) -> float:
        """Calculate overall confidence that we have sufficient information.

        Args:
            state: The crawl state to evaluate.

        Returns:
            The confidence value as a float.
        """
        ...

    @abstractmethod
    async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
        """Rank pending links by expected information gain.

        Args:
            state: The crawl state to evaluate.
            config: The adaptive-crawl configuration.

        Returns:
            A list of ``(Link, float)`` pairs; the float is presumably the
            ranking score (confirm against the implementation).
        """
        ...

    @abstractmethod
    async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool:
        """Determine if crawling should stop.

        Args:
            state: The crawl state to evaluate.
            config: The adaptive-crawl configuration.
        """
        ...

    @abstractmethod
    async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None:
        """Update state with new crawl results.

        Args:
            state: The crawl state to update.
            new_results: Crawl results to fold into ``state``.
        """
        ...
class StatisticalStrategy(CrawlStrategy):
    """Pure statistical approach - no LLM, no embeddings.

    Implements each abstract coroutine declared by ``CrawlStrategy``.
    """

    def __init__(self) -> None:
        ...

    async def calculate_confidence(self, state: CrawlState) -> float:
        """Calculate confidence using coverage, consistency, and saturation.

        Args:
            state: The crawl state to evaluate.
        """
        ...

    async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
        """Rank links by expected information gain.

        Args:
            state: The crawl state to evaluate.
            config: The adaptive-crawl configuration.

        Returns:
            A list of ``(Link, float)`` pairs.
        """
        ...

    async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool:
        """Determine if crawling should stop.

        Args:
            state: The crawl state to evaluate.
            config: The adaptive-crawl configuration.
        """
        ...

    async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None:
        """Update state with new crawl results.

        Args:
            state: The crawl state to update.
            new_results: Crawl results to fold into ``state``.
        """
        ...
class EmbeddingStrategy(CrawlStrategy):
    """Embedding-based adaptive crawling using semantic space coverage.

    Embedding arguments and results are typed ``Any`` in this stub, so their
    concrete array type is not visible here.
    """

    def __init__(self, embedding_model: str = ..., llm_config: Union[LLMConfig, Dict] = ...) -> None:
        ...

    async def map_query_semantic_space(self, query: str, n_synthetic: int = ...) -> Any:
        """Generate a point cloud representing the semantic neighborhood of the query.

        Args:
            query: The query text.
            n_synthetic: Presumably the number of synthetic query variations
                to generate (confirm against the implementation).
        """
        ...

    def compute_coverage_shape(self, query_points: Any, alpha: float = ...) -> Optional[Dict[str, Any]]:
        """Find the minimal shape that covers all query points using alpha shape.

        Args:
            query_points: The query points to cover.
            alpha: The alpha-shape parameter.

        Returns:
            A string-keyed dict, or ``None``. pyright inferred
            ``dict[str, Incomplete] | None``; the unknown value type is
            spelled ``Any`` here.
        """
        ...

    def find_coverage_gaps(self, kb_embeddings: Any, query_embeddings: Any) -> List[Tuple[Any, float]]:
        """Calculate gap distances for all query variations using vectorized operations.

        Args:
            kb_embeddings: Knowledge-base embeddings.
            query_embeddings: Query embeddings.

        Returns:
            A list of ``(Any, float)`` pairs.
        """
        ...

    async def select_links_for_expansion(self, candidate_links: List[Link], gaps: List[Tuple[Any, float]], kb_embeddings: Any) -> List[Tuple[Link, float]]:
        """Select links that most efficiently fill the gaps.

        Args:
            candidate_links: Links to choose from.
            gaps: Pairs shaped like the return value of ``find_coverage_gaps``.
            kb_embeddings: Knowledge-base embeddings.

        Returns:
            A list of ``(Link, float)`` pairs.
        """
        ...

    async def calculate_confidence(self, state: CrawlState) -> float:
        """Coverage-based learning score (0–1)."""
        ...

    async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
        """Main entry point for link ranking.

        Args:
            state: The crawl state to evaluate.
            config: The adaptive-crawl configuration.
        """
        ...

    async def validate_coverage(self, state: CrawlState) -> float:
        """Validate coverage using held-out queries with caching."""
        ...

    async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool:
        """Stop based on learning curve convergence."""
        ...

    def get_quality_confidence(self, state: CrawlState) -> float:
        """Calculate quality-based confidence score for display."""
        ...

    async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None:
        """Update embeddings and coverage metrics with deduplication.

        Args:
            state: The crawl state to update.
            new_results: Crawl results to fold into ``state``.
        """
        ...
class AdaptiveCrawler:
    """Main adaptive crawler that orchestrates the crawling process.

    Stub declaration: parameter defaults are elided as ``...``.
    """

    def __init__(self, crawler: Optional[AsyncWebCrawler] = ..., config: Optional[AdaptiveConfig] = ..., strategy: Optional[CrawlStrategy] = ...) -> None:
        ...

    async def digest(self, start_url: str, query: str, resume_from: Optional[str] = ...) -> CrawlState:
        """Main entry point for adaptive crawling.

        Args:
            start_url: URL the crawl starts from.
            query: The query the crawl should gather information for.
            resume_from: Optional string; presumably the path of a saved
                state to resume from (confirm against the implementation).

        Returns:
            The resulting ``CrawlState``.
        """
        ...

    @property
    def confidence(self) -> float:
        """Current confidence level"""
        ...

    @property
    def coverage_stats(self) -> Dict[str, Any]:
        """Detailed coverage statistics"""
        ...

    @property
    def is_sufficient(self) -> bool:
        """Check if current knowledge is sufficient"""
        ...

    def print_stats(self, detailed: bool = ...) -> None:
        """Print comprehensive statistics about the knowledge base

        Args:
            detailed: If True, show detailed statistics including top terms
        """
        ...

    def export_knowledge_base(self, filepath: Union[str, Path], format: str = ...) -> None:
        """Export the knowledge base to a file

        Args:
            filepath: Path to save the file
            format: Export format - currently supports 'jsonl'
        """
        ...

    def import_knowledge_base(self, filepath: Union[str, Path], format: str = ...) -> None:
        """Import a knowledge base from a file

        Args:
            filepath: Path to the file to import
            format: Import format - currently supports 'jsonl'
        """
        ...

    def get_relevant_content(self, top_k: int = ...) -> List[Dict[str, Any]]:
        """Get most relevant content for the query

        Args:
            top_k: Presumably the maximum number of entries to return
                (confirm against the implementation).
        """
        ...