"""
This type stub file was generated by pyright.
"""
from typing import Any, Callable, Dict, List, Optional, Union
from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy
from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_scraping_strategy import ContentScrapingStrategy
from .deep_crawling import DeepCrawlStrategy
from .table_extraction import TableExtractionStrategy
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
from enum import Enum
# A URL matcher is either a string, a predicate that takes a URL string and
# returns a bool, or a list mixing both of those forms.
UrlMatcher = Union[
    str,
    Callable[[str], bool],
    List[Union[str, Callable[[str], bool]]],
]
class MatchMode(Enum):
    """Matching modes accepted by the ``match_mode`` parameter of CrawlerRunConfig.

    NOTE(review): presumably selects whether any (OR) or all (AND) entries of
    a ``UrlMatcher`` list must match a URL -- confirm against the implementation.
    """

    # The concrete member values are elided in this stub.
    OR = ...
    AND = ...
def to_serializable_dict(
    obj: Any,
    ignore_default_value: bool = ...,
) -> Dict:
    """Turn ``obj`` into a serializable dictionary, recursing into its contents.

    Complex objects are expressed with a ``{type, params}`` structure.
    """
    ...
def from_serializable_dict(data: Any) -> Any:
    """Rebuild an object instance from a serializable dictionary, recursively."""
    ...
def is_empty_value(value: Any) -> bool:
    """Report whether ``value`` is effectively empty or null."""
    ...
class GeolocationConfig:
    def __init__(
        self,
        latitude: float,
        longitude: float,
        accuracy: Optional[float] = ...,
    ) -> None:
        """Hold geolocation settings.

        Args:
            latitude: Latitude coordinate, e.g. 37.7749.
            longitude: Longitude coordinate, e.g. -122.4194.
            accuracy: Accuracy expressed in meters. Default: 0.0
        """
        ...

    @staticmethod
    def from_dict(geo_dict: Dict) -> GeolocationConfig:
        """Build a GeolocationConfig out of a dictionary."""
        ...

    def to_dict(self) -> Dict:
        """Return the dictionary representation of this configuration."""
        ...

    def clone(self, **kwargs) -> GeolocationConfig:
        """Copy this configuration, applying the given updates.

        Args:
            **kwargs: Configuration options to update, as key-value pairs.

        Returns:
            GeolocationConfig: A new instance carrying the specified updates.
        """
        ...
class ProxyConfig:
    def __init__(
        self,
        server: str,
        username: Optional[str] = ...,
        password: Optional[str] = ...,
        ip: Optional[str] = ...,
    ) -> None:
        """Describe a single proxy.

        Args:
            server: URL of the proxy server, e.g. "http://127.0.0.1:8080".
            username: Username for proxy authentication (optional).
            password: Password for proxy authentication (optional).
            ip: IP address used for verification purposes (optional).
        """
        ...

    @staticmethod
    def from_string(proxy_str: str) -> ProxyConfig:
        """Build a ProxyConfig out of a proxy string.

        Supported formats:
            - 'http://username:password@ip:port'
            - 'http://ip:port'
            - 'socks5://ip:port'
            - 'ip:port:username:password'
            - 'ip:port'
        """
        ...

    @staticmethod
    def from_dict(proxy_dict: Dict) -> ProxyConfig:
        """Build a ProxyConfig out of a dictionary."""
        ...

    @staticmethod
    def from_env(env_var: str = ...) -> List[ProxyConfig]:
        """Read proxies from an environment variable.

        Args:
            env_var: Name of the environment variable that holds
                comma-separated proxy strings.

        Returns:
            A list of ProxyConfig objects.
        """
        ...

    def to_dict(self) -> Dict:
        """Return the dictionary representation of this proxy."""
        ...

    def clone(self, **kwargs) -> ProxyConfig:
        """Copy this configuration, applying the given updates.

        Args:
            **kwargs: Configuration options to update, as key-value pairs.

        Returns:
            ProxyConfig: A new instance carrying the specified updates.
        """
        ...
class BrowserConfig:
    """
    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.

    This class centralizes all parameters that affect browser and context creation. Instead of passing
    scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
    code will then reference these settings to initialize the browser in a consistent, documented manner.

    Attributes:
        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
            Default: "chromium".
        headless (bool): Whether to run the browser in headless mode (no visible GUI).
            Default: True.
        browser_mode (str): Determines how the browser should be initialized:
            "builtin" - use the builtin CDP browser running in background
            "dedicated" - create a new dedicated browser instance each time
            "cdp" - use explicit CDP settings provided in cdp_url
            "docker" - run browser in Docker container with isolation
            Default: "dedicated"
        use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
            advanced manipulation. Default: False.
        cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint.
            Default: "ws://localhost:9222/devtools/browser/".
        debugging_port (int): Port for the browser debugging protocol. Default: 9222.
        use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
            Automatically sets use_managed_browser=True. Default: False.
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
            temporary directory may be used. Default: None.
        chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if
            browser_type is "chromium". Default: "chromium".
        channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if
            browser_type is "chromium". Default: "chromium".
        proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None,
            no proxy is used. Default: None.
        proxy_config (ProxyConfig or dict or None): Detailed proxy configuration,
            e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None.
        viewport_width (int): Default viewport width for pages. Default: 1080.
        viewport_height (int): Default viewport height for pages. Default: 600.
        viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and
            viewport_height. Default: None.
        verbose (bool): Enable verbose logging.
            Default: True.
        accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
            Default: False.
        downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads
            is True, a default path will be created. Default: None.
        storage_state (str or dict or None): An in-memory storage state (cookies, localStorage).
            Default: None.
        ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
        java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
        cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields
            like {"name": "...", "value": "...", "url": "..."}.
            Default: [].
        headers (dict): Extra HTTP headers to apply to all requests in this context.
            Default: {}.
        user_agent (str): Custom User-Agent string to use.
            Default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36".
        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use
            the provided user_agent as-is. Default: None.
        user_agent_generator_config (dict or None): Configuration for user agent generation if
            user_agent_mode is set. Default: None.
        text_mode (bool): If True, disables images and other rich content for potentially faster load
            times. Default: False.
        light_mode (bool): Disables certain background features for performance gains. Default: False.
        extra_args (list): Additional command-line arguments passed to the browser.
            Default: [].
        enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection.
            Cannot be used with use_undetected browser mode. Default: False.

    NOTE(review): the constructor also accepts ``sleep_on_close`` (bool) and ``host`` (str); neither is
    described in the generated stub -- consult the implementation for their meaning and defaults.
    """

    # Parameters whose documented type includes None are annotated Optional so
    # that passing None explicitly type-checks.
    def __init__(
        self,
        browser_type: str = ...,
        headless: bool = ...,
        browser_mode: str = ...,
        use_managed_browser: bool = ...,
        cdp_url: str = ...,
        use_persistent_context: bool = ...,
        user_data_dir: Optional[str] = ...,
        chrome_channel: str = ...,
        channel: str = ...,
        proxy: Optional[str] = ...,
        proxy_config: Union[ProxyConfig, dict, None] = ...,
        viewport_width: int = ...,
        viewport_height: int = ...,
        viewport: dict = ...,
        accept_downloads: bool = ...,
        downloads_path: Optional[str] = ...,
        storage_state: Union[str, dict, None] = ...,
        ignore_https_errors: bool = ...,
        java_script_enabled: bool = ...,
        sleep_on_close: bool = ...,
        verbose: bool = ...,
        cookies: list = ...,
        headers: dict = ...,
        user_agent: str = ...,
        user_agent_mode: Optional[str] = ...,
        user_agent_generator_config: Optional[dict] = ...,
        text_mode: bool = ...,
        light_mode: bool = ...,
        extra_args: list = ...,
        debugging_port: int = ...,
        host: str = ...,
        enable_stealth: bool = ...,
    ) -> None:
        ...

    @staticmethod
    def from_kwargs(kwargs: dict) -> BrowserConfig:
        """Build a BrowserConfig from a dictionary of keyword arguments."""
        ...

    def to_dict(self) -> Dict[str, Any]:
        """Return this configuration as a dictionary keyed by option name."""
        ...

    def clone(self, **kwargs) -> BrowserConfig:
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            BrowserConfig: A new instance with the specified updates
        """
        ...

    def dump(self) -> dict:
        """Return a dict form of this configuration.

        NOTE(review): presumably the serialized counterpart consumed by
        ``load`` -- confirm against the implementation.
        """
        ...

    @staticmethod
    def load(data: dict) -> BrowserConfig:
        """Build a BrowserConfig from a dict.

        NOTE(review): presumably accepts the output of ``dump`` -- confirm
        against the implementation.
        """
        ...

    def set_nstproxy(
        self,
        token: str,
        channel_id: str,
        country: str = ...,
        state: str = ...,
        city: str = ...,
        protocol: str = ...,
        session_duration: int = ...,
    ) -> None:
        """
        Fetch a proxy from NSTProxy API and automatically assign it to proxy_config.

        Get your NSTProxy token from: https://app.nstproxy.com/profile

        Args:
            token (str): NSTProxy API token.
            channel_id (str): NSTProxy channel ID.
            country (str, optional): Country code (default: "ANY").
            state (str, optional): State code (default: "").
            city (str, optional): City name (default: "").
            protocol (str, optional): Proxy protocol ("http" or "socks5"). Defaults to "http".
            session_duration (int, optional): Session duration in minutes (0 = rotate each request).
                Defaults to 10.

        Raises:
            ValueError: If the API response format is invalid.
            PermissionError: If the API returns an error message.
        """
        ...
class VirtualScrollConfig:
    """Settings for handling virtual scroll.

    Enables capturing content from pages that use virtualized scrolling
    (like Twitter, Instagram feeds), where DOM elements are recycled as the
    user scrolls.
    """

    def __init__(
        self,
        container_selector: str,
        scroll_count: int = ...,
        scroll_by: Union[str, int] = ...,
        wait_after_scroll: float = ...,
    ) -> None:
        """Set up the virtual scroll configuration.

        Args:
            container_selector: CSS selector identifying the scrollable container.
            scroll_count: Upper bound on the number of scrolls to perform.
            scroll_by: How far to scroll; one of:
                - "container_height": scroll by container's height
                - "page_height": scroll by viewport height
                - int: fixed pixel amount
            wait_after_scroll: Seconds to wait after every scroll so content can load.
        """
        ...

    def to_dict(self) -> dict:
        """Return a dictionary form suitable for serialization."""
        ...

    @classmethod
    def from_dict(cls, data: dict) -> VirtualScrollConfig:
        """Build an instance out of a dictionary."""
        ...
class LinkPreviewConfig:
    """Settings for link head extraction and scoring."""

    def __init__(
        self,
        include_internal: bool = ...,
        include_external: bool = ...,
        include_patterns: Optional[List[str]] = ...,
        exclude_patterns: Optional[List[str]] = ...,
        concurrency: int = ...,
        timeout: int = ...,
        max_links: int = ...,
        query: Optional[str] = ...,
        score_threshold: Optional[float] = ...,
        verbose: bool = ...,
    ) -> None:
        """Set up the link extraction configuration.

        Args:
            include_internal: Include same-domain links when True.
            include_external: Include different-domain links when True.
            include_patterns: Glob patterns to include (e.g., ["*/docs/*", "*/api/*"]).
            exclude_patterns: Glob patterns to exclude (e.g., ["*/login*", "*/admin*"]).
            concurrency: How many links are processed simultaneously.
            timeout: Per-link timeout, in seconds, for head extraction.
            max_links: Cap on the number of links processed (prevents overload).
            query: Optional query string for BM25 contextual scoring.
            score_threshold: Optional minimum relevance score (0.0-1.0) for a link to be included.
            verbose: Show detailed progress while extracting.
        """
        ...

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> LinkPreviewConfig:
        """Build a LinkPreviewConfig from a dictionary (kept for backward compatibility)."""
        ...

    def to_dict(self) -> Dict[str, Any]:
        """Return the configuration in dictionary format."""
        ...

    def clone(self, **kwargs) -> LinkPreviewConfig:
        """Return a copy with the given values updated."""
        ...
class HTTPCrawlerConfig:
    """HTTP-specific crawler configuration.

    The class-level annotations below mirror the ``__init__`` parameters
    one-to-one; default values are elided in this stub.
    """

    method: str = ...
    headers: Optional[Dict[str, str]] = ...
    data: Optional[Dict[str, Any]] = ...
    json: Optional[Dict[str, Any]] = ...
    follow_redirects: bool = ...
    verify_ssl: bool = ...

    def __init__(
        self,
        method: str = ...,
        headers: Optional[Dict[str, str]] = ...,
        data: Optional[Dict[str, Any]] = ...,
        json: Optional[Dict[str, Any]] = ...,
        follow_redirects: bool = ...,
        verify_ssl: bool = ...,
    ) -> None:
        ...

    @staticmethod
    def from_kwargs(kwargs: dict) -> HTTPCrawlerConfig:
        """Build an HTTPCrawlerConfig from a dictionary of keyword arguments."""
        ...

    def to_dict(self) -> Dict[str, Any]:
        """Return this configuration as a dictionary keyed by option name."""
        ...

    def clone(self, **kwargs) -> HTTPCrawlerConfig:
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            HTTPCrawlerConfig: A new instance with the specified updates
        """
        ...

    def dump(self) -> dict:
        """Return a dict form of this configuration.

        NOTE(review): presumably the serialized counterpart consumed by
        ``load`` -- confirm against the implementation.
        """
        ...

    @staticmethod
    def load(data: dict) -> HTTPCrawlerConfig:
        """Build an HTTPCrawlerConfig from a dict.

        NOTE(review): presumably accepts the output of ``dump`` -- confirm
        against the implementation.
        """
        ...
class CrawlerRunConfig:
    """
    Configuration class for controlling how the crawler runs each crawl operation.

    This includes parameters for content extraction, page manipulation, waiting conditions,
    caching, and other runtime behaviors.

    This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods.
    By using this class, you have a single place to understand and adjust the crawling options.

    Attributes:
        # Deep Crawl Parameters
        deep_crawl_strategy (DeepCrawlStrategy or None): Strategy to use for deep crawling.

        # Content Processing Parameters
        word_count_threshold (int): Minimum word count threshold before processing content.
            Default: MIN_WORD_THRESHOLD (typically 200).
        extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from
            crawled pages.
            Default: None (NoExtractionStrategy is used if None).
        chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
            Default: RegexChunking().
        markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
            Default: None.
        only_text (bool): If True, attempt to extract text-only content where applicable.
            Default: False.
        css_selector (str or None): CSS selector to extract a specific portion of the page.
            Default: None.
        target_elements (list of str or None): List of CSS selectors for specific elements for Markdown
            generation and structured data extraction. When you set this, only the contents
            of these elements are processed for extraction and Markdown generation.
            If you do not set any value, the entire page is processed.
            The difference between this and css_selector is that css_selector will shrink
            the initial raw HTML to the selected element, while target_elements will only
            affect the extraction and Markdown generation.
            Default: None
        excluded_tags (list of str or None): List of HTML tags to exclude from processing.
            Default: None.
        excluded_selector (str or None): CSS selector to exclude from processing.
            Default: None.
        keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted
            attributes.
            Default: False.
        keep_attrs (list of str): List of HTML attributes to keep during processing.
            Default: [].
        remove_forms (bool): If True, remove all `<form>` elements from the HTML.
            Default: False.
        prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
            Default: False.
        parser_type (str): Type of parser to use for HTML parsing.
            Default: "lxml".
        scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
            Default: LXMLWebScrapingStrategy.
        proxy_config (ProxyConfig or dict or None): Detailed proxy configuration,
            e.g. {"server": "...", "username": "..."}.
            If None, no additional proxy config. Default: None.

        # Browser Location and Identity Parameters
        locale (str or None): Locale to use for the browser context (e.g., "en-US").
            Default: None.
        timezone_id (str or None): Timezone identifier to use for the browser context
            (e.g., "America/New_York").
            Default: None.
        geolocation (GeolocationConfig or None): Geolocation configuration for the browser.
            Default: None.

        # SSL Parameters
        fetch_ssl_certificate (bool): Presumably whether to retrieve the site's SSL certificate
            during the crawl (not described in the generated stub; confirm against the
            implementation).
            Default: False.

        # Caching Parameters
        cache_mode (CacheMode or None): Defines how caching is handled.
            If None, defaults to CacheMode.ENABLED internally.
            Default: CacheMode.BYPASS.
        session_id (str or None): Optional session ID to persist the browser context and the created
            page instance. If the ID already exists, the crawler does not
            create a new page and uses the current page to preserve the state.
        bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS.
            Default: False.
        disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED.
            Default: False.
        no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY.
            Default: False.
        no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
            Default: False.
        shared_data (dict or None): Shared data to be passed between hooks.
            Default: None.

        # Page Navigation and Timing Parameters
        wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
            Default: "domcontentloaded".
        page_timeout (int): Timeout in ms for page operations like navigation.
            Default: 60000 (60 seconds).
        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
            Default: None.
        wait_for_timeout (int or None): Specific timeout in ms for the wait_for condition.
            If None, uses page_timeout instead.
            Default: None.
        wait_for_images (bool): If True, wait for images to load before extracting content.
            Default: False.
        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
            Default: 0.1.
        mean_delay (float): Mean base delay between requests when calling arun_many.
            Default: 0.1.
        max_range (float): Max random additional delay range for requests in arun_many.
            Default: 0.3.
        semaphore_count (int): Number of concurrent operations allowed.
            Default: 5.

        # Page Interaction Parameters
        js_code (str or list of str or None): JavaScript code/snippets to run on the page.
            Default: None.
        js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
            Default: False.
        ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding.
            Default: True.
        scan_full_page (bool): If True, scroll through the entire page to load all content.
            Default: False.
        scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
            Default: 0.2.
        max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform during full page
            scan. If None, scrolls until the entire page is loaded. Default: None.
        process_iframes (bool): If True, attempts to process and inline iframe content.
            Default: False.
        remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
            Default: False.
        simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot
            measures.
            Default: False.
        override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
            Default: False.
        magic (bool): If True, attempts automatic handling of overlays/popups.
            Default: False.
        adjust_viewport_to_content (bool): If True, adjust viewport according to the page content
            dimensions.
            Default: False.

        # Media Handling Parameters
        screenshot (bool): Whether to take a screenshot after crawling.
            Default: False.
        screenshot_wait_for (float or None): Additional wait time before taking a screenshot.
            Default: None.
        screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy.
            Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000).
        pdf (bool): Whether to generate a PDF of the page.
            Default: False.
        image_description_min_word_threshold (int): Minimum words for image description extraction.
            Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50).
        image_score_threshold (int): Minimum score threshold for processing an image.
            Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
        exclude_external_images (bool): If True, exclude all external images from processing.
            Default: False.
        table_score_threshold (int): Minimum score threshold for processing a table.
            Default: 7.
        table_extraction (TableExtractionStrategy): Strategy to use for table extraction.
            Default: DefaultTableExtraction with table_score_threshold.

        # Virtual Scroll Parameters
        virtual_scroll_config (VirtualScrollConfig or dict or None): Configuration for handling virtual
            scroll containers. Used for capturing content from pages with virtualized
            scrolling (e.g., Twitter, Instagram feeds).
            Default: None.

        # Link and Domain Handling Parameters
        exclude_social_media_domains (list of str): List of domains to exclude for social media links.
            Default: SOCIAL_MEDIA_DOMAINS (from config).
        exclude_external_links (bool): If True, exclude all external links from the results.
            Default: False.
        exclude_internal_links (bool): If True, exclude internal links from the results.
            Default: False.
        exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
            Default: False.
        exclude_domains (list of str): List of specific domains to exclude from results.
            Default: [].
        score_links (bool): If True, calculate intrinsic quality scores for all links using URL
            structure, text quality, and contextual relevance metrics. Separate from
            link_preview_config.
            Default: False.

        # Debugging and Logging Parameters
        verbose (bool): Enable verbose logging.
            Default: True.
        log_console (bool): If True, log console messages from the page.
            Default: False.

        # HTTP Crawler Strategy Parameters
        method (str): HTTP method to use for the request, when using AsyncHTTPCrawlerStrategy.
            Default: "GET".
        data (dict): Data to send in the request body, when using AsyncHTTPCrawlerStrategy.
            Default: None.
        json (dict): JSON data to send in the request body, when using AsyncHTTPCrawlerStrategy.

        # Connection Parameters
        stream (bool): If True, enables streaming of crawled URLs as they are processed when used with
            arun_many.
            Default: False.
        check_robots_txt (bool): Whether to check robots.txt rules before crawling.
            Default: False.
        user_agent (str): Custom User-Agent string to use.
            Default: None.
        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use
            the provided user_agent as-is.
            Default: None.
        user_agent_generator_config (dict or None): Configuration for user agent generation if
            user_agent_mode is set.
            Default: None.

        # Experimental Parameters
        experimental (dict): Dictionary containing experimental parameters that are in beta phase.
            This allows passing temporary features that are not yet fully integrated
            into the main parameter set.
            Default: None.

        url (str or None): This is not a compulsory parameter.
            Default: None.

    NOTE(review): `data` and `json` are described above but are not parameters of `__init__` in this
    stub. Conversely, several `__init__` parameters (for example `c4a_script`, `capture_mhtml`,
    `exclude_all_images`, `preserve_https_for_internal_links`, `capture_network_requests`,
    `capture_console_messages`, `proxy_rotation_strategy`, `link_preview_config`, `url_matcher`,
    `match_mode`) are not described here; consult the implementation for their meaning.
    """

    _UNWANTED_PROPS = ...

    # Parameters whose documented type includes None are annotated Optional so
    # that passing None explicitly type-checks; every other annotation is
    # unchanged from the generated stub.
    def __init__(
        self,
        word_count_threshold: int = ...,
        extraction_strategy: Optional[ExtractionStrategy] = ...,
        chunking_strategy: ChunkingStrategy = ...,
        markdown_generator: MarkdownGenerationStrategy = ...,
        only_text: bool = ...,
        css_selector: Optional[str] = ...,
        target_elements: Optional[List[str]] = ...,
        excluded_tags: Optional[list] = ...,
        excluded_selector: Optional[str] = ...,
        keep_data_attributes: bool = ...,
        keep_attrs: list = ...,
        remove_forms: bool = ...,
        prettiify: bool = ...,
        parser_type: str = ...,
        scraping_strategy: ContentScrapingStrategy = ...,
        proxy_config: Union[ProxyConfig, dict, None] = ...,
        proxy_rotation_strategy: Optional[ProxyRotationStrategy] = ...,
        locale: Optional[str] = ...,
        timezone_id: Optional[str] = ...,
        geolocation: Optional[GeolocationConfig] = ...,
        fetch_ssl_certificate: bool = ...,
        cache_mode: Optional[CacheMode] = ...,
        session_id: Optional[str] = ...,
        bypass_cache: bool = ...,
        disable_cache: bool = ...,
        no_cache_read: bool = ...,
        no_cache_write: bool = ...,
        shared_data: Optional[dict] = ...,
        wait_until: str = ...,
        page_timeout: int = ...,
        wait_for: Optional[str] = ...,
        wait_for_timeout: Optional[int] = ...,
        wait_for_images: bool = ...,
        delay_before_return_html: float = ...,
        mean_delay: float = ...,
        max_range: float = ...,
        semaphore_count: int = ...,
        js_code: Optional[Union[str, List[str]]] = ...,
        c4a_script: Union[str, List[str]] = ...,
        js_only: bool = ...,
        ignore_body_visibility: bool = ...,
        scan_full_page: bool = ...,
        scroll_delay: float = ...,
        max_scroll_steps: Optional[int] = ...,
        process_iframes: bool = ...,
        remove_overlay_elements: bool = ...,
        simulate_user: bool = ...,
        override_navigator: bool = ...,
        magic: bool = ...,
        adjust_viewport_to_content: bool = ...,
        screenshot: bool = ...,
        screenshot_wait_for: Optional[float] = ...,
        screenshot_height_threshold: int = ...,
        pdf: bool = ...,
        capture_mhtml: bool = ...,
        image_description_min_word_threshold: int = ...,
        image_score_threshold: int = ...,
        table_score_threshold: int = ...,
        table_extraction: TableExtractionStrategy = ...,
        exclude_external_images: bool = ...,
        exclude_all_images: bool = ...,
        exclude_social_media_domains: list = ...,
        exclude_external_links: bool = ...,
        exclude_social_media_links: bool = ...,
        exclude_domains: list = ...,
        exclude_internal_links: bool = ...,
        score_links: bool = ...,
        preserve_https_for_internal_links: bool = ...,
        verbose: bool = ...,
        log_console: bool = ...,
        capture_network_requests: bool = ...,
        capture_console_messages: bool = ...,
        method: str = ...,
        stream: bool = ...,
        url: Optional[str] = ...,
        check_robots_txt: bool = ...,
        user_agent: str = ...,
        user_agent_mode: Optional[str] = ...,
        user_agent_generator_config: Optional[dict] = ...,
        deep_crawl_strategy: Optional[DeepCrawlStrategy] = ...,
        link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = ...,
        virtual_scroll_config: Optional[Union[VirtualScrollConfig, Dict[str, Any]]] = ...,
        url_matcher: Optional[UrlMatcher] = ...,
        match_mode: MatchMode = ...,
        experimental: Dict[str, Any] = ...,
    ) -> None:
        ...

    def is_match(self, url: str) -> bool:
        """Check if this config matches the given URL.

        Args:
            url: The URL to check against this config's matcher

        Returns:
            bool: True if this config should be used for the URL or if no matcher is set.
        """
        ...

    def __getattr__(self, name: str) -> Any:
        """Handle attribute access."""
        ...

    def __setattr__(self, name: str, value: Any) -> None:
        """Handle attribute setting."""
        ...

    @staticmethod
    def from_kwargs(kwargs: dict) -> CrawlerRunConfig:
        """Build a CrawlerRunConfig from a dictionary of keyword arguments."""
        ...

    def dump(self) -> dict:
        """Return a dict form of this configuration.

        NOTE(review): presumably the serialized counterpart consumed by
        ``load`` -- confirm against the implementation.
        """
        ...

    @staticmethod
    def load(data: dict) -> CrawlerRunConfig:
        """Build a CrawlerRunConfig from a dict.

        NOTE(review): presumably accepts the output of ``dump`` -- confirm
        against the implementation.
        """
        ...

    def to_dict(self) -> Dict[str, Any]:
        """Return this configuration as a dictionary keyed by option name."""
        ...

    def clone(self, **kwargs) -> CrawlerRunConfig:
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            CrawlerRunConfig: A new instance with the specified updates

        Example:
            ```python
            # Create a new config with streaming enabled
            stream_config = config.clone(stream=True)

            # Create a new config with multiple updates
            new_config = config.clone(
                stream=True,
                cache_mode=CacheMode.BYPASS,
                verbose=True
            )
            ```
        """
        ...
class LLMConfig:
    def __init__(
        self,
        provider: str = ...,
        api_token: Optional[str] = ...,
        base_url: Optional[str] = ...,
        temperature: Optional[float] = ...,
        max_tokens: Optional[int] = ...,
        top_p: Optional[float] = ...,
        frequency_penalty: Optional[float] = ...,
        presence_penalty: Optional[float] = ...,
        stop: Optional[List[str]] = ...,
        n: Optional[int] = ...,
    ) -> None:
        """Configuration class for LLM provider and API token."""
        ...

    @staticmethod
    def from_kwargs(kwargs: dict) -> LLMConfig:
        """Build an LLMConfig from a dictionary of keyword arguments."""
        ...

    def to_dict(self) -> Dict[str, Any]:
        """Return this configuration as a dictionary keyed by option name."""
        ...

    def clone(self, **kwargs) -> LLMConfig:
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            LLMConfig: A new instance with the specified updates
        """
        ...
class SeedingConfig:
    """Settings for URL discovery and pre-validation via AsyncUrlSeeder."""

    def __init__(
        self,
        source: str = ...,
        pattern: Optional[str] = ...,
        live_check: bool = ...,
        extract_head: bool = ...,
        max_urls: int = ...,
        concurrency: int = ...,
        hits_per_sec: int = ...,
        force: bool = ...,
        base_directory: Optional[str] = ...,
        llm_config: Optional[LLMConfig] = ...,
        verbose: Optional[bool] = ...,
        query: Optional[str] = ...,
        score_threshold: Optional[float] = ...,
        scoring_method: str = ...,
        filter_nonsense_urls: bool = ...,
    ) -> None:
        """Set up the URL seeding configuration.

        Args:
            source: Discovery source(s) to use. Options: "sitemap", "cc" (Common Crawl),
                or "sitemap+cc" (both). Default: "sitemap+cc"
            pattern: URL pattern used to filter discovered URLs (e.g., "*example.com/blog/*").
                Glob-style wildcards are supported. Default: "*" (all URLs)
            live_check: Perform HEAD requests to verify URL liveness when True.
                Default: False
            extract_head: Fetch and parse the <head> section for metadata extraction when True.
                Required for BM25 relevance scoring. Default: False
            max_urls: Upper bound on the number of URLs to discover; -1 means no limit.
                Default: -1
            concurrency: Maximum concurrent requests for live checks/head extraction.
                Default: 1000
            hits_per_sec: Rate limit, in requests per second, to avoid overwhelming servers.
                Default: 5
            force: When True, bypasses the AsyncUrlSeeder's internal .jsonl cache and
                re-fetches URLs. Default: False
            base_directory: Base directory for UrlSeeder's cache files (.jsonl).
                When None, the default ~/.crawl4ai/ is used. Default: None
            llm_config: LLM configuration for future features (e.g., semantic scoring).
                Currently unused. Default: None
            verbose: Overrides the crawler's general verbose setting for seeding operations.
                Default: None (inherits from crawler)
            query: Search query for BM25 relevance scoring (e.g., "python tutorials").
                Requires extract_head=True. Default: None
            score_threshold: Minimum relevance score (0.0-1.0) for a URL to be included.
                Only applies when query is provided. Default: None
            scoring_method: Scoring algorithm to use. Only "bm25" is supported at present.
                Future: "semantic". Default: "bm25"
            filter_nonsense_urls: Filter out utility URLs such as robots.txt, sitemap.xml,
                ads.txt, favicon.ico, etc. Default: True
        """
        ...

    # The three methods below carry no docstring in the generated stub; only
    # their signatures are known here.
    def to_dict(self) -> Dict[str, Any]:
        ...

    @staticmethod
    def from_kwargs(kwargs: Dict[str, Any]) -> SeedingConfig:
        ...

    def clone(self, **kwargs: Any) -> SeedingConfig:
        ...