"""
This type stub file was generated by pyright.
"""
from typing import Any, Dict, List, Optional, Union
from contextlib import asynccontextmanager
from .models import CrawlResult, RunManyReturn
from .chunking_strategy import *
from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy
from .async_logger import AsyncLoggerBase
from .async_configs import BrowserConfig, CrawlerRunConfig, SeedingConfig
from .async_dispatcher import *
from .async_dispatcher import BaseDispatcher
class AsyncWebCrawler:
"""
Asynchronous web crawler with flexible caching capabilities.
There are two ways to use the crawler:
1. Using context manager (recommended for simple cases):
```python
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com")
```
2. Using explicit lifecycle management (recommended for long-running applications):
```python
crawler = AsyncWebCrawler()
await crawler.start()
# Use the crawler multiple times
result1 = await crawler.arun(url="https://example.com")
result2 = await crawler.arun(url="https://another.com")
await crawler.close()
```
Attributes:
browser_config (BrowserConfig): Configuration object for browser settings.
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
logger (AsyncLogger): Logger instance for recording events and errors.
crawl4ai_folder (str): Directory for storing cache.
base_directory (str): Base directory for storing cache.
ready (bool): Whether the crawler is ready for use.
Methods:
start(): Start the crawler explicitly without using context manager.
close(): Close the crawler explicitly without using context manager.
arun(): Run the crawler for a single source: a URL (web page, local file, or raw HTML).
awarmup(): Perform warmup sequence.
arun_many(): Run the crawler for multiple sources.
aprocess_html(): Process HTML content.
Typical Usage:
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com")
print(result.markdown)
Using configuration:
browser_config = BrowserConfig(browser_type="chromium", headless=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS
)
result = await crawler.arun(url="https://example.com", config=crawler_config)
print(result.markdown)
"""
_domain_last_hit = ...
def __init__(self, crawler_strategy: AsyncCrawlerStrategy = ..., config: BrowserConfig = ..., base_directory: str = ..., thread_safe: bool = ..., logger: AsyncLoggerBase = ..., **kwargs) -> None:
"""
Initialize the AsyncWebCrawler.
Args:
crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy
config: Configuration object for browser settings. Default BrowserConfig()
base_directory: Base directory for storing cache
thread_safe: Whether to use thread-safe operations
logger: Logger instance for recording events and errors
**kwargs: Additional arguments for backwards compatibility
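Example (illustrative sketch; the BrowserConfig fields mirror the configuration example in the class docstring):
browser_config = BrowserConfig(browser_type="chromium", headless=True)
crawler = AsyncWebCrawler(config=browser_config)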
"""
...
async def start(self): # -> Self:
"""
Start the crawler explicitly without using context manager.
This is equivalent to using 'async with' but gives more control over the lifecycle.
Returns:
AsyncWebCrawler: The initialized crawler instance
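Example (a sketch of the explicit lifecycle pattern from the class docstring, with try/finally added so close() runs even if a crawl raises):
crawler = AsyncWebCrawler()
await crawler.start()
try:
    result = await crawler.arun(url="https://example.com")
finally:
    await crawler.close()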
"""
...
async def close(self): # -> None:
"""
Close the crawler explicitly without using context manager.
This should be called when you're done with the crawler if you used start().
This method will:
1. Clean up browser resources
2. Close any open pages and contexts
"""
...
async def __aenter__(self): # -> Self:
...
async def __aexit__(self, exc_type, exc_val, exc_tb): # -> None:
...
@asynccontextmanager
async def nullcontext(self): # -> Generator[None, Any, None]:
"""异步空上下文管理器"""
...
async def arun(self, url: str, config: CrawlerRunConfig = ..., **kwargs) -> RunManyReturn:
"""
Runs the crawler for a single source: a URL (web page, local file, or raw HTML).
Migration Guide:
Old way (deprecated):
result = await crawler.arun(
url="https://example.com",
word_count_threshold=200,
screenshot=True,
...
)
New way (recommended):
config = CrawlerRunConfig(
word_count_threshold=200,
screenshot=True,
...
)
result = await crawler.arun(url="https://example.com", config=config)
Args:
url: The URL to crawl (http://, https://, file://, or raw:)
config: Configuration object controlling crawl behavior
[other parameters maintained for backwards compatibility]
Returns:
CrawlResult: The result of crawling and processing
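Example (illustrative sketch; the raw: prefix is one of the URL forms listed under Args, and CacheMode is the cache setting used in the class docstring):
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
result = await crawler.arun(url="raw:<html><body><h1>Hello</h1></body></html>", config=config)
print(result.markdown)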
"""
...
async def aprocess_html(self, url: str, html: str, extracted_content: str, config: CrawlerRunConfig, screenshot_data: str, pdf_data: str, verbose: bool, **kwargs) -> CrawlResult:
"""
Process HTML content using the provided configuration.
Args:
url: The URL being processed
html: Raw HTML content
extracted_content: Previously extracted content (if any)
config: Configuration object controlling processing behavior
screenshot_data: Screenshot data (if any)
pdf_data: PDF data (if any)
verbose: Whether to enable verbose logging
**kwargs: Additional parameters for backwards compatibility
Returns:
CrawlResult: Processed result containing extracted and formatted content
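Example (illustrative sketch of the call shape only; the empty strings are placeholders for data that is not available at call time):
result = await crawler.aprocess_html(
    url="https://example.com",
    html="<html><body><h1>Title</h1><p>Some text.</p></body></html>",
    extracted_content="",
    config=CrawlerRunConfig(),
    screenshot_data="",
    pdf_data="",
    verbose=False,
)
print(result.markdown)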
"""
...
async def arun_many(self, urls: List[str], config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = ..., dispatcher: Optional[BaseDispatcher] = ..., **kwargs) -> RunManyReturn:
"""
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
Args:
urls: List of URLs to crawl
config: Configuration object(s) controlling crawl behavior. Can be:
- Single CrawlerRunConfig: Used for all URLs
- List[CrawlerRunConfig]: Configs with url_matcher for URL-specific settings
dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
[other parameters maintained for backwards compatibility]
Returns:
Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
Either a list of all results or an async generator yielding results
Examples:
# Batch processing (default)
results = await crawler.arun_many(
urls=["https://example1.com", "https://example2.com"],
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
)
for result in results:
print(f"Processed {result.url}: {len(result.markdown)} chars")
# Streaming results
async for result in await crawler.arun_many(
urls=["https://example1.com", "https://example2.com"],
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True),
):
print(f"Processed {result.url}: {len(result.markdown)} chars")
"""
...
async def aseed_urls(self, domain_or_domains: Union[str, List[str]], config: Optional[SeedingConfig] = ..., **kwargs) -> Union[List[str], Dict[str, List[Union[str, Dict[str, Any]]]]]:
"""
Discovers, filters, and optionally validates URLs for one or more domains
using sitemaps and Common Crawl archives.
Args:
domain_or_domains: A single domain string (e.g., "iana.org") or a list of domains.
config: A SeedingConfig object to control the seeding process.
Parameters passed directly via kwargs will override those in 'config'.
**kwargs: Additional parameters (e.g., `source`, `live_check`, `extract_head`,
`pattern`, `concurrency`, `hits_per_sec`, `force_refresh`, `verbose`)
that will be used to construct or update the SeedingConfig.
Returns:
If `extract_head` is False:
- For a single domain: `List[str]` of discovered URLs.
- For multiple domains: `Dict[str, List[str]]` mapping each domain to its URLs.
If `extract_head` is True:
- For a single domain: `List[Dict[str, Any]]` where each dict contains 'url'
and 'head_data' (parsed <head> metadata).
- For multiple domains: `Dict[str, List[Dict[str, Any]]]` mapping each domain
to a list of URL data dictionaries.
Raises:
ValueError: If `domain_or_domains` is not a string or a list of strings.
Exception: Any underlying exceptions from AsyncUrlSeeder or network operations.
Example:
>>> # Discover URLs from sitemap with live check for 'example.com'
>>> result = await crawler.aseed_urls("example.com", source="sitemap", live_check=True, hits_per_sec=10)
>>> # Discover URLs from Common Crawl, extract head data for 'example.com' and 'python.org'
>>> multi_domain_result = await crawler.aseed_urls(
>>> ["example.com", "python.org"],
>>> source="cc", extract_head=True, concurrency=200, hits_per_sec=50
>>> )
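>>> # Equivalent call built around a SeedingConfig object (sketch; the field names are
>>> # the kwargs listed above, which the docstring says construct or update the config)
>>> cfg = SeedingConfig(source="sitemap", live_check=True, hits_per_sec=10)
>>> result = await crawler.aseed_urls("example.com", config=cfg)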
"""
...