"""
This type stub file was generated by pyright.
"""
import pathlib
import httpx
from datetime import timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, TYPE_CHECKING, Union
from .async_logger import AsyncLoggerBase
from .async_configs import SeedingConfig
"""
async_url_seeder.py
Fast async URL discovery for Crawl4AI
Features
--------
* Common-Crawl streaming via httpx.AsyncClient (HTTP/2, keep-alive)
* robots.txt → sitemap chain (.gz + nested indexes) via async httpx
* Per-domain CDX result cache on disk (~/.crawl4ai/<index>_<domain>_<hash>.jsonl)
* Optional HEAD-only liveness check
* Optional partial <head> download + meta parsing
* Global hits-per-second rate-limit via asyncio.Semaphore
* Concurrency in the thousands — fine on a single event-loop
"""
# Module-level values whose concrete definitions were elided by the stub
# generator ("..." placeholders); names are kept for type-checker visibility.

# Optional-dependency feature flags — presumably set by import probes for
# lxml / brotli / rank-bm25 in the implementation (TODO confirm).
LXML = ...
HAS_BROTLI = ...
HAS_BM25 = ...

if TYPE_CHECKING:
    # Type-checking-only imports were elided by the stub generator.
    ...

# Common Crawl collection-info endpoint and cache time-to-live — the stub
# elides the actual values (TTL is presumably a timedelta; verify against
# the implementation).
COLLINFO_URL = ...
TTL = ...

# The "_rx" suffix suggests pre-compiled regex patterns for lightweight
# <head> parsing (meta tags, charset, title, link tags) — values elided
# by the stub generator, so this cannot be confirmed from here.
_meta_rx = ...
_charset_rx = ...
_title_rx = ...
_link_rx = ...
class AsyncUrlSeeder:
    """
    Async version of UrlSeeder.

    Call pattern is await / async for / async with.

    Public coroutines
    -----------------
    await seed.urls(...)
        returns List[Dict[str, Any]] (url, status, head_data)
    await seed.many_urls(...)
        returns Dict[str, List[Dict[str, Any]]]
    await seed.close()
        closes the HTTP client if owned by seeder

    Usage examples
    --------------
    # Manual cleanup:
    seeder = AsyncUrlSeeder()
    try:
        urls = await seeder.urls("example.com", config)
    finally:
        await seeder.close()

    # Using async context manager (recommended):
    async with AsyncUrlSeeder() as seeder:
        urls = await seeder.urls("example.com", config)

    # Reusing existing client:
    client = httpx.AsyncClient()
    seeder = AsyncUrlSeeder(client=client)
    urls = await seeder.urls("example.com", config)
    # No need to close seeder, as it doesn't own the client
    """
    def __init__(self, ttl: timedelta = ..., client: Optional[httpx.AsyncClient] = ..., logger: Optional[AsyncLoggerBase] = ..., base_directory: Optional[Union[str, pathlib.Path]] = ..., cache_root: Optional[Union[str, Path]] = ...) -> None:
        ...

    async def urls(self, domain: str, config: SeedingConfig) -> List[Dict[str, Any]]:
        """
        Fetch URLs for a domain using configuration from SeedingConfig.

        Parameters
        ----------
        domain : str
            The domain to fetch URLs for (e.g., "example.com")
        config : SeedingConfig
            Configuration object containing all seeding parameters
        """
        ...

    async def many_urls(self, domains: Sequence[str], config: SeedingConfig) -> Dict[str, List[Dict[str, Any]]]:
        """
        Fetch URLs for many domains in parallel.

        Parameters
        ----------
        domains : Sequence[str]
            List of domains to fetch URLs for
        config : SeedingConfig
            Configuration object containing all seeding parameters

        Returns a {domain: urls-list} dict.
        """
        ...

    async def extract_head_for_urls(self, urls: List[str], config: Optional[SeedingConfig] = ..., concurrency: int = ..., timeout: int = ...) -> List[Dict[str, Any]]:
        """
        Extract head content for a custom list of URLs using URLSeeder's parallel processing.

        This method reuses URLSeeder's efficient parallel processing, caching, and head
        extraction logic to process a custom list of URLs rather than discovering URLs
        from sources.

        Parameters
        ----------
        urls : List[str]
            List of URLs to extract head content from
        config : SeedingConfig, optional
            Configuration object. If None, uses default settings for head extraction
        concurrency : int, default=10
            Number of concurrent requests
        timeout : int, default=5
            Timeout for each request in seconds

        Returns
        -------
        List[Dict[str, Any]]
            List of dictionaries containing url, status, head_data, and optional relevance_score
        """
        ...

    async def close(self) -> None:
        """Close the HTTP client if we own it."""
        ...

    # Return type inferred by pyright as Self; spelled as a forward reference
    # here because typing.Self is not imported by this stub.
    async def __aenter__(self) -> "AsyncUrlSeeder":
        """Async context manager entry."""
        ...

    # pyright inferred Literal[False] (exceptions are never suppressed);
    # widened to bool here because typing.Literal is not imported by this stub.
    async def __aexit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Async context manager exit."""
        ...