Dev Tool MCP

Overview Schema Related Servers Score Discussions

adaptive_crawler.pyi•8.29 KiB

""" This type stub file was generated by pyright. """ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Set, Tuple, Union from dataclasses import dataclass from pathlib import Path from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.async_configs import LLMConfig from crawl4ai.models import CrawlResult, Link """ Adaptive Web Crawler for Crawl4AI This module implements adaptive information foraging for efficient web crawling. It determines when sufficient information has been gathered to answer a query, avoiding unnecessary crawls while ensuring comprehensive coverage. """ @dataclass class CrawlState: """Tracks the current state of adaptive crawling""" crawled_urls: Set[str] = ... knowledge_base: List[CrawlResult] = ... pending_links: List[Link] = ... query: str = ... metrics: Dict[str, float] = ... term_frequencies: Dict[str, int] = ... document_frequencies: Dict[str, int] = ... documents_with_terms: Dict[str, Set[int]] = ... total_documents: int = ... new_terms_history: List[int] = ... crawl_order: List[str] = ... kb_embeddings: Optional[Any] = ... query_embeddings: Optional[Any] = ... expanded_queries: List[str] = ... coverage_shape: Optional[Any] = ... semantic_gaps: List[Tuple[List[float], float]] = ... embedding_model: str = ... def save(self, path: Union[str, Path]): # -> None: """Save state to disk for persistence""" ... @classmethod def load(cls, path: Union[str, Path]) -> CrawlState: """Load state from disk""" ... @dataclass class AdaptiveConfig: """Configuration for adaptive crawling""" confidence_threshold: float = ... max_depth: int = ... max_pages: int = ... top_k_links: int = ... min_gain_threshold: float = ... strategy: str = ... saturation_threshold: float = ... consistency_threshold: float = ... coverage_weight: float = ... consistency_weight: float = ... saturation_weight: float = ... relevance_weight: float = ... novelty_weight: float = ... authority_weight: float = ... save_state: bool = ... 
state_path: Optional[str] = ... embedding_model: str = ... embedding_llm_config: Optional[Union[LLMConfig, Dict]] = ... n_query_variations: int = ... coverage_threshold: float = ... alpha_shape_alpha: float = ... embedding_min_confidence_threshold: float = ... embedding_coverage_radius: float = ... embedding_k_exp: float = ... embedding_nearest_weight: float = ... embedding_top_k_weight: float = ... embedding_overlap_threshold: float = ... embedding_min_relative_improvement: float = ... embedding_validation_min_score: float = ... embedding_quality_min_confidence: float = ... embedding_quality_max_confidence: float = ... embedding_quality_scale_factor: float = ... def validate(self): # -> None: """Validate configuration parameters""" ... class CrawlStrategy(ABC): """Abstract base class for crawling strategies""" @abstractmethod async def calculate_confidence(self, state: CrawlState) -> float: """Calculate overall confidence that we have sufficient information""" ... @abstractmethod async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]: """Rank pending links by expected information gain""" ... @abstractmethod async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool: """Determine if crawling should stop""" ... @abstractmethod async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None: """Update state with new crawl results""" ... class StatisticalStrategy(CrawlStrategy): """Pure statistical approach - no LLM, no embeddings""" def __init__(self) -> None: ... async def calculate_confidence(self, state: CrawlState) -> float: """Calculate confidence using coverage, consistency, and saturation""" ... async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]: """Rank links by expected information gain""" ... async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool: """Determine if crawling should stop""" ... 
async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None: """Update state with new crawl results""" ... class EmbeddingStrategy(CrawlStrategy): """Embedding-based adaptive crawling using semantic space coverage""" def __init__(self, embedding_model: str = ..., llm_config: Union[LLMConfig, Dict] = ...) -> None: ... async def map_query_semantic_space(self, query: str, n_synthetic: int = ...) -> Any: """Generate a point cloud representing the semantic neighborhood of the query""" ... def compute_coverage_shape(self, query_points: Any, alpha: float = ...): # -> dict[str, Incomplete] | None: """Find the minimal shape that covers all query points using alpha shape""" ... def find_coverage_gaps(self, kb_embeddings: Any, query_embeddings: Any) -> List[Tuple[Any, float]]: """Calculate gap distances for all query variations using vectorized operations""" ... async def select_links_for_expansion(self, candidate_links: List[Link], gaps: List[Tuple[Any, float]], kb_embeddings: Any) -> List[Tuple[Link, float]]: """Select links that most efficiently fill the gaps""" ... async def calculate_confidence(self, state: CrawlState) -> float: """Coverage-based learning score (0–1).""" ... async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]: """Main entry point for link ranking""" ... async def validate_coverage(self, state: CrawlState) -> float: """Validate coverage using held-out queries with caching""" ... async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool: """Stop based on learning curve convergence""" ... def get_quality_confidence(self, state: CrawlState) -> float: """Calculate quality-based confidence score for display""" ... async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None: """Update embeddings and coverage metrics with deduplication""" ... 
class AdaptiveCrawler:
    """Main adaptive crawler; orchestrates the crawling process."""

    def __init__(
        self,
        crawler: Optional[AsyncWebCrawler] = ...,
        config: Optional[AdaptiveConfig] = ...,
        strategy: Optional[CrawlStrategy] = ...,
    ) -> None: ...

    async def digest(
        self,
        start_url: str,
        query: str,
        resume_from: Optional[str] = ...,
    ) -> CrawlState:
        """Run adaptive crawling; this is the main entry point.

        Returns:
            A ``CrawlState`` instance.
        """
        ...

    @property
    def confidence(self) -> float:
        """The current confidence level."""
        ...

    @property
    def coverage_stats(self) -> Dict[str, Any]:
        """Detailed coverage statistics, as a string-keyed mapping."""
        ...

    @property
    def is_sufficient(self) -> bool:
        """Whether the current knowledge is sufficient."""
        ...

    def print_stats(self, detailed: bool = ...) -> None:
        """Print comprehensive statistics about the knowledge base.

        Args:
            detailed: If True, show detailed statistics including top terms.
        """
        ...

    def export_knowledge_base(
        self,
        filepath: Union[str, Path],
        format: str = ...,
    ) -> None:
        """Export the knowledge base to a file.

        Args:
            filepath: Path to save the file.
            format: Export format - currently supports 'jsonl'.
        """
        ...

    def import_knowledge_base(
        self,
        filepath: Union[str, Path],
        format: str = ...,
    ) -> None:
        """Import a knowledge base from a file.

        Args:
            filepath: Path to the file to import.
            format: Import format - currently supports 'jsonl'.
        """
        ...

    def get_relevant_content(self, top_k: int = ...) -> List[Dict[str, Any]]:
        """Get the most relevant content for the query.

        Returns:
            A list of string-keyed dictionaries.
        """
        ...

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/osins/dev-tool-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

adaptive_crawler.pyi•8.29 KiB