"""
This type stub file was generated by pyright.
"""
import numpy as np
from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Sequence, Tuple
from functools import lru_cache
# Reference to the original (pre-patch) ``applies_to`` callable — presumably
# saved so ``patched_applies_to`` below can wrap or restore it; TODO confirm.
original_applies_to = ...
def patched_applies_to(self, filename) -> bool:
    """
    Patched replacement for an ``applies_to(filename)`` method.

    Returns:
        bool: True if the rule applies to ``filename`` (per the inferred
        return type). NOTE(review): presumably monkey-patches the behaviour
        saved in ``original_applies_to`` — confirm against the implementation.
    """
    ...
def chunk_documents(documents: Iterable[str], chunk_token_threshold: int, overlap: int, word_token_rate: float = ..., tokenizer: Optional[Callable[[str], List[str]]] = ...) -> Generator[str, None, None]:
    """
    Efficiently chunks documents into token-limited sections with overlap between chunks.

    Args:
        documents: Iterable of document strings
        chunk_token_threshold: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks
        word_token_rate: Token estimate per word when not using a tokenizer
        tokenizer: Function that splits text into tokens (if available)

    Yields:
        Text chunks as strings
    """
    ...
def merge_chunks(docs: Sequence[str], target_size: int, overlap: int = ..., word_token_ratio: float = ..., splitter: Callable = ...) -> List[str]:
    """
    Merges a sequence of documents into chunks based on a target token count, with optional overlap.

    Each document is split into tokens using the provided splitter function
    (defaults to str.split). Tokens are distributed into chunks aiming for
    the specified target size, with optional overlapping tokens between
    consecutive chunks. Returns a list of non-empty merged chunks as strings.

    Args:
        docs: Sequence of input document strings to be merged.
        target_size: Target number of tokens per chunk.
        overlap: Number of tokens to overlap between consecutive chunks.
        word_token_ratio: Multiplier to estimate token count from word count.
        splitter: Callable used to split each document into tokens.

    Returns:
        List of merged document chunks as strings, each not exceeding the target token size.
    """
    ...
class VersionManager:
    """
    Tracks the library version recorded in the user's home directory and
    decides whether on-disk data needs updating after an upgrade.
    """

    def __init__(self) -> None:
        ...

    def get_installed_version(self): # -> Version | None:
        """Get the version recorded in home directory"""
        ...

    def update_version(self) -> None:
        """Update the version file to current library version"""
        ...

    def needs_update(self) -> bool:
        """Check if database needs update based on version"""
        ...
class RobotsParser:
    """
    Fetches, caches, and evaluates robots.txt rules for URLs.

    Cached entries expire after ``CACHE_TTL`` and can be purged with
    :meth:`clear_cache` (all entries) or :meth:`clear_expired` (stale only).
    """

    # Time-to-live for cached robots.txt entries (presumably seconds — TODO confirm).
    CACHE_TTL = ...

    def __init__(self, cache_dir=..., cache_ttl=...) -> None:
        ...

    async def can_fetch(self, url: str, user_agent: str = ...) -> bool:
        """
        Check if URL can be fetched according to robots.txt rules.

        Args:
            url: The URL to check
            user_agent: User agent string to check against (default: "*")

        Returns:
            bool: True if allowed, False if disallowed by robots.txt
        """
        ...

    def clear_cache(self) -> None:
        """Clear all cached robots.txt entries"""
        ...

    def clear_expired(self) -> None:
        """Remove only expired entries from cache"""
        ...
class InvalidCSSSelectorError(Exception):
    """Raised when a provided CSS selector does not match any elements."""
    ...
# Split tokens/patterns consumed by ``advanced_split`` — presumably; value
# defined in the implementation module (TODO confirm).
SPLITS = ...
# HTML character-code/entity data — presumably used during text splitting or
# sanitization (TODO confirm).
HTML_CODE_CHARS = ...
def advanced_split(text: str) -> list[str]:
    """
    Split ``text`` into a list of string tokens.

    NOTE(review): presumably uses the module-level ``SPLITS`` and
    ``HTML_CODE_CHARS`` data — confirm against the implementation.
    """
    ...
# NOTE(review): the ``type`` parameter shadows the builtin; name kept for API compatibility.
def create_box_message(message: str, type: str = ..., width: int = ..., add_newlines: bool = ..., double_line: bool = ...) -> str:
    """
    Create a styled message box with colored borders and formatted text.

    How it works:
    1. Determines box style and colors based on the message type (e.g., info, warning).
    2. Wraps text to fit within the specified width.
    3. Constructs a box using characters (single or double lines) with appropriate formatting.
    4. Adds optional newlines before and after the box.

    Args:
        message (str): The message to display inside the box.
        type (str): Type of the message (e.g., "info", "warning", "error", "success"). Defaults to "info".
        width (int): Width of the box. Defaults to 120.
        add_newlines (bool): Whether to add newlines before and after the box. Defaults to True.
        double_line (bool): Whether to use double lines for the box border. Defaults to False.

    Returns:
        str: A formatted string containing the styled message box.
    """
    ...
def calculate_semaphore_count() -> int:
    """
    Calculate the optimal semaphore count based on system resources.

    How it works:
    1. Determines the number of CPU cores and total system memory.
    2. Sets a base count as half of the available CPU cores.
    3. Limits the count based on memory, assuming 2GB per semaphore instance.
    4. Returns the minimum value between CPU and memory-based limits.

    Returns:
        int: The calculated semaphore count.
    """
    ...
def get_system_memory(): # -> int | Any | None:
    """
    Get the total system memory in bytes.

    How it works:
    1. Detects the operating system.
    2. Reads memory information from system-specific commands or files.
    3. Converts the memory to bytes for uniformity.

    Returns:
        int: The total system memory in bytes.
        NOTE(review): the inferred return type also includes ``None`` —
        callers should handle a missing value (TODO confirm which platforms
        return None).

    Raises:
        OSError: If the operating system is unsupported.
    """
    ...
def get_home_folder() -> str:
    """
    Get or create the home folder for Crawl4AI configuration and cache.

    How it works:
    1. Uses environment variables or defaults to the user's home directory.
    2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist.
    3. Returns the path to the home folder.

    Returns:
        str: The path to the Crawl4AI home folder.
    """
    ...
async def get_chromium_path(browser_type) -> str:
    """Returns the browser executable path using playwright's browser management.

    Uses playwright's built-in browser management to get the correct browser executable
    path regardless of platform. This ensures we're using the same browser version
    that playwright is tested with.

    Args:
        browser_type: Which playwright browser to resolve (e.g. "chromium" —
            presumably a playwright browser-type name or object; TODO confirm).

    Returns:
        str: Path to browser executable

    Raises:
        RuntimeError: If browser executable cannot be found
    """
    ...
def beautify_html(escaped_html) -> str:
    """
    Beautifies an escaped HTML string.

    Args:
        escaped_html (str): A string containing escaped HTML.

    Returns:
        str: A beautifully formatted HTML string.
    """
    ...
def split_and_parse_json_objects(json_string) -> tuple[list[Any], list[Any]]:
    """
    Splits a JSON string which is a list of objects and tries to parse each object.

    Args:
        json_string (str): A string representation of a list of JSON objects, e.g., '[{...}, {...}, ...]'.

    Returns:
        tuple: A tuple containing two lists:
            - First list contains all successfully parsed JSON objects.
            - Second list contains the string representations of all segments that couldn't be parsed.
    """
    ...
def sanitize_html(html) -> str:
    """
    Sanitize an HTML string by escaping quotes.

    How it works:
    1. Replaces all unwanted and special characters with an empty string.
    2. Escapes double and single quotes for safe usage.

    Args:
        html (str): The HTML string to sanitize.

    Returns:
        str: The sanitized HTML string.
    """
    ...
def sanitize_input_encode(text: str) -> str:
    """
    Sanitize input to handle potential encoding issues.

    Args:
        text: The input string to sanitize.

    Returns:
        str: The sanitized string, safe with respect to encoding.
    """
    ...
def escape_json_string(s) -> str:
    """
    Escapes characters in a string to be JSON safe.

    Args:
        s (str): The input string to be escaped.

    Returns:
        str: The escaped string, safe for JSON encoding.
    """
    ...
def replace_inline_tags(soup, tags, only_text=...):
    """
    Replace inline HTML tags with Markdown-style equivalents.

    How it works:
    1. Maps specific tags (e.g., <b>, <i>) to Markdown syntax.
    2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object.
    3. Optionally replaces tags with their text content only.

    Args:
        soup (BeautifulSoup): Parsed HTML content.
        tags (List[str]): List of tags to replace.
        only_text (bool): Whether to replace tags with plain text. Defaults to False.

    Returns:
        BeautifulSoup: Updated BeautifulSoup object with replaced tags.
    """
    ...
def get_content_of_website(url, html, word_count_threshold=..., css_selector=..., **kwargs):
    """
    Extract structured content, media, and links from website HTML.

    How it works:
    1. Parses the HTML content using BeautifulSoup.
    2. Extracts internal/external links and media (images, videos, audios).
    3. Cleans the content by removing unwanted tags and attributes.
    4. Converts cleaned HTML to Markdown.
    5. Collects metadata and returns the extracted information.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
        css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.
        **kwargs: Additional extraction options (supported keys not visible
            from this stub — TODO enumerate from the implementation).

    Returns:
        Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
    """
    ...
def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = ..., css_selector: str = ..., **kwargs) -> Dict[str, Any]:
    """
    Extracts and cleans content from website HTML, optimizing for useful media and contextual information.

    Parses the provided HTML to extract internal and external links, filters
    and scores images for usefulness, gathers contextual descriptions for
    media, removes unwanted or low-value elements, and converts the cleaned
    HTML to Markdown. Also extracts metadata and returns all structured
    content in a dictionary.

    Args:
        url: The URL of the website being processed.
        html: The raw HTML content to extract from.
        word_count_threshold: Minimum word count for elements to be retained.
        css_selector: Optional CSS selector to restrict extraction to specific elements.
        **kwargs: Additional extraction options (supported keys not visible
            from this stub — TODO enumerate from the implementation).

    Returns:
        A dictionary containing Markdown content, cleaned HTML, extraction success status, media and link lists, and metadata.

    Raises:
        InvalidCSSSelectorError: If a provided CSS selector does not match any elements.
    """
    ...
def extract_metadata_using_lxml(html, doc=...): # -> dict[Any, Any]:
    """
    Extract metadata from HTML using lxml for better performance.

    Args:
        html: The HTML string to parse.
        doc: Optional pre-parsed document to reuse instead of re-parsing —
            presumably an lxml tree; TODO confirm against the implementation.

    Returns:
        dict: Extracted metadata key/value pairs.
    """
    ...
def extract_metadata(html, soup=...): # -> dict[Any, Any]:
    """
    Extract metadata from website HTML.

    NOTE(review): the previous docstring was copy-pasted from
    ``get_content_of_website`` and documented parameters (``url``,
    ``word_count_threshold``, ``css_selector``, ``**kwargs``) that this
    function does not accept; it has been corrected to match the signature.

    Args:
        html: The HTML string to parse.
        soup: Optional pre-parsed BeautifulSoup object to reuse instead of
            re-parsing — presumably; TODO confirm against the implementation.

    Returns:
        dict: Extracted metadata key/value pairs.
    """
    ...
def extract_xml_tags(string) -> list[Any]:
    """
    Extracts XML tags from a string.

    Args:
        string (str): The input string containing XML tags.

    Returns:
        List[str]: A list of XML tags extracted from the input string.
    """
    ...
def extract_xml_data_legacy(tags, string): # -> dict[Any, Any]:
    """
    Extract data for specified XML tags from a string.

    How it works:
    1. Searches the string for each tag using regex.
    2. Extracts the content within the tags.
    3. Returns a dictionary of tag-content pairs.

    Args:
        tags (List[str]): The list of XML tags to extract.
        string (str): The input string containing XML data.

    Returns:
        Dict[str, str]: A dictionary with tag names as keys and extracted content as values.
    """
    ...
def extract_xml_data(tags, string): # -> dict[Any, Any]:
    """
    Extract data for specified XML tags from a string, returning the longest content for each tag.

    How it works:
    1. Finds all occurrences of each tag in the string using regex.
    2. For each tag, selects the occurrence with the longest content.
    3. Returns a dictionary of tag-content pairs.

    Args:
        tags (List[str]): The list of XML tags to extract.
        string (str): The input string containing XML data.

    Returns:
        Dict[str, str]: A dictionary with tag names as keys and longest extracted content as values.
    """
    ...
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response=..., base_url=..., **kwargs): # -> ModelResponse | CustomStreamWrapper | list[dict[str, int | list[str]]] | None:
    """
    Perform an API completion request with exponential backoff.

    How it works:
    1. Sends a completion request to the API.
    2. Retries on rate-limit errors with exponential delays.
    3. Returns the API response or an error after all retries.

    Args:
        provider (str): The name of the API provider.
        prompt_with_variables (str): The input prompt for the completion request.
        api_token (str): The API token for authentication.
        json_response (bool): Whether to request a JSON response. Defaults to False.
        base_url (Optional[str]): The base URL for the API. Defaults to None.
        **kwargs: Additional arguments for the API request.

    Returns:
        dict: The API response or an error message after all retries.
        NOTE(review): the inferred return type also includes ``None`` —
        callers should handle a missing response.
    """
    ...
async def aperform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response=..., base_url=..., **kwargs): # -> ModelResponse | CustomStreamWrapper | list[dict[str, int | list[str]]] | None:
    """
    Async version: Perform an API completion request with exponential backoff.

    How it works:
    1. Sends an async completion request to the API.
    2. Retries on rate-limit errors with exponential delays (async).
    3. Returns the API response or an error after all retries.

    Args:
        provider (str): The name of the API provider.
        prompt_with_variables (str): The input prompt for the completion request.
        api_token (str): The API token for authentication.
        json_response (bool): Whether to request a JSON response. Defaults to False.
        base_url (Optional[str]): The base URL for the API. Defaults to None.
        **kwargs: Additional arguments for the API request.

    Returns:
        dict: The API response or an error message after all retries.
        NOTE(review): the inferred return type also includes ``None`` —
        callers should handle a missing response.
    """
    ...
def extract_blocks(url, html, provider=..., api_token=..., base_url=...): # -> Any | list[Any]:
    """
    Extract content blocks from website HTML using an AI provider.

    How it works:
    1. Prepares a prompt by sanitizing and escaping HTML.
    2. Sends the prompt to an AI provider with optional retries.
    3. Parses the response to extract structured blocks or errors.

    Args:
        url (str): The website URL.
        html (str): The HTML content of the website.
        provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER.
        api_token (Optional[str]): The API token for authentication. Defaults to None.
        base_url (Optional[str]): The base URL for the API. Defaults to None.

    Returns:
        List[dict]: A list of extracted content blocks.
    """
    ...
def extract_blocks_batch(batch_data, provider=..., api_token=...): # -> list[Any]:
    """
    Extract content blocks from a batch of website HTMLs.

    How it works:
    1. Prepares prompts for each URL and HTML pair.
    2. Sends the prompts to the AI provider in a batch request.
    3. Parses the responses to extract structured blocks or errors.

    NOTE(review): unlike ``extract_blocks``, this function takes no
    ``base_url`` parameter — confirm whether that asymmetry is intentional.

    Args:
        batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs.
        provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192".
        api_token (Optional[str]): The API token for authentication. Defaults to None.

    Returns:
        List[dict]: A list of extracted content blocks from all batch items.
    """
    ...
def merge_chunks_based_on_token_threshold(chunks, token_threshold): # -> list[Any]:
    """
    Merges small chunks into larger ones based on the total token threshold.

    Args:
        chunks (List[str]): List of text chunks to be merged based on token count.
        token_threshold (int): Max number of tokens for each merged chunk.

    Returns:
        List[str]: List of merged text chunks.
    """
    ...
def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=...) -> list:
    """
    Process sections of HTML content sequentially or in parallel.

    How it works:
    1. Sequentially processes sections with delays for "groq/" providers.
    2. Uses ThreadPoolExecutor for parallel processing with other providers.
    3. Extracts content blocks for each section.

    Args:
        url (str): The website URL.
        sections (List[str]): The list of HTML sections to process.
        provider (str): The AI provider for content extraction.
        api_token (str): The API token for authentication.
        base_url (Optional[str]): The base URL for the API. Defaults to None.

    Returns:
        List[dict]: The list of extracted content blocks from all sections.
    """
    ...
def wrap_text(draw, text, font, max_width) -> str:
    """
    Wrap text to fit within a specified width for rendering.

    How it works:
    1. Splits the text into words.
    2. Constructs lines that fit within the maximum width using the provided font.
    3. Returns the wrapped text as a single string.

    Args:
        draw (ImageDraw.Draw): The drawing context for measuring text size.
        text (str): The text to wrap.
        font (ImageFont.FreeTypeFont): The font to use for measuring text size.
        max_width (int): The maximum width for each line.

    Returns:
        str: The wrapped text.
    """
    ...
def format_html(html_string) -> str:
    """
    Prettify an HTML string using BeautifulSoup.

    How it works:
    1. Parses the HTML string with BeautifulSoup.
    2. Formats the HTML with proper indentation.
    3. Returns the prettified HTML string.

    Args:
        html_string (str): The HTML string to format.

    Returns:
        str: The prettified HTML string.
    """
    ...
def fast_format_html(html_string) -> str:
    """
    A fast HTML formatter that uses string operations instead of parsing.

    Args:
        html_string (str): The HTML string to format

    Returns:
        str: The formatted HTML string
    """
    ...
def normalize_url(href, base_url):
    """Normalize URLs to ensure consistent format"""
    # NOTE(review): dead code — this definition is immediately shadowed by the
    # extended ``normalize_url`` redefinition that follows; only the later
    # definition is visible to importers. Consider removing one of the two
    # (pyright flags obscured declarations).
    ...
def normalize_url(href: str, base_url: str, *, drop_query_tracking=..., sort_query=..., keep_fragment=..., extra_drop_params=..., preserve_https=..., original_scheme=...): # -> str | None:
    """
    Extended URL normalizer.

    NOTE(review): this redefines (shadows) the simpler ``normalize_url``
    declared immediately above.

    Parameters
    ----------
    href : str
        The raw link extracted from a page.
    base_url : str
        The page’s canonical URL (used to resolve relative links).
    drop_query_tracking : bool (default True)
        Remove common tracking query parameters.
    sort_query : bool (default True)
        Alphabetically sort query keys for deterministic output.
    keep_fragment : bool (default False)
        Preserve the hash fragment (#section) if you need in-page links.
    extra_drop_params : Iterable[str] | None
        Additional query keys to strip (case-insensitive).
    preserve_https : bool (presumably)
        Keep HTTPS rather than downgrading the scheme — TODO confirm
        semantics against the implementation.
    original_scheme : str | None (presumably)
        Scheme of the originating page, used when resolving scheme-relative
        links — TODO confirm.

    Returns
    -------
    str | None
        A clean, canonical URL or None if href is empty/None.
    """
    ...
def normalize_url_for_deep_crawl(href, base_url, preserve_https=..., original_scheme=...): # -> str | None:
    """Normalize URLs to ensure consistent format.

    Returns None for links that cannot be normalized (per the inferred
    return type).
    """
    ...
@lru_cache(maxsize=10000)
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=..., original_scheme=...): # -> str | None:
    """Efficient URL normalization with proper parsing.

    Results are memoized with a bounded LRU cache (10,000 entries), so all
    arguments must be hashable. Returns None for links that cannot be
    normalized (per the inferred return type).
    """
    ...
def normalize_url_tmp(href, base_url) -> str:
    """Normalize URLs to ensure consistent format.

    NOTE(review): the ``_tmp`` suffix suggests a temporary/experimental
    variant of ``normalize_url`` — confirm whether it is still needed.
    """
    ...
def get_base_domain(url: str) -> str:
    """
    Extract the base domain from a given URL, handling common edge cases.

    How it works:
    1. Parses the URL to extract the domain.
    2. Removes the port number and 'www' prefix.
    3. Handles special domains (e.g., 'co.uk') to extract the correct base.

    Args:
        url (str): The URL to extract the base domain from.

    Returns:
        str: The extracted base domain or an empty string if parsing fails.
    """
    ...
def is_external_url(url: str, base_domain: str) -> bool:
    """
    Determine whether ``url`` points outside the given base domain.

    NOTE(review): the previous docstring was copy-pasted from
    ``get_base_domain`` and described extracting a domain rather than this
    function's actual contract; it has been corrected to match the signature.

    Args:
        url (str): The URL to test.
        base_domain (str): The base domain to compare against — presumably as
            produced by ``get_base_domain``; TODO confirm.

    Returns:
        bool: True if the URL is external to ``base_domain``, False otherwise.
    """
    ...
def clean_tokens(tokens: list[str]) -> list[str]:
    """
    Clean a list of tokens by removing noise, stop words, and short tokens.

    How it works:
    1. Defines a set of noise words and stop words.
    2. Filters tokens based on length and exclusion criteria.
    3. Excludes tokens starting with certain symbols (e.g., "↑", "▲").

    Args:
        tokens (list[str]): The list of tokens to clean.

    Returns:
        list[str]: The cleaned list of tokens.
    """
    ...
def profile_and_time(func): # -> _Wrapped[..., Any, ..., Any]:
    """
    Decorator to profile a function's execution time and performance.

    How it works:
    1. Records the start time before executing the function.
    2. Profiles the function's execution using `cProfile`.
    3. Prints the elapsed time and profiling statistics.

    Args:
        func (Callable): The function to decorate.

    Returns:
        Callable: The decorated function with profiling and timing enabled.
    """
    ...
def generate_content_hash(content: str) -> str:
    """
    Generate a unique hash for content.

    Args:
        content: The content string to hash.

    Returns:
        str: The hash of the content — presumably a hex digest; TODO confirm
        algorithm/format against the implementation.
    """
    ...
def ensure_content_dirs(base_path: str) -> Dict[str, str]:
    """
    Create content directories if they don't exist.

    Args:
        base_path: Root directory under which the content directories live.

    Returns:
        Dict[str, str]: Mapping of content-type name to directory path —
        presumably; TODO confirm the exact keys against the implementation.
    """
    ...
def configure_windows_event_loop() -> None:
    """
    Configure the Windows event loop to use ProactorEventLoop.

    This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses.
    This function should only be called on Windows systems and before any async operations.
    On non-Windows systems, this function does nothing.

    Example:
        ```python
        from crawl4ai.async_configs import configure_windows_event_loop

        # Call this before any async operations if you're on Windows
        configure_windows_event_loop()
        ```
    """
    ...
def get_error_context(exc_info, context_lines: int = ...): # -> dict[str, str | int | Any | None]:
    """
    Extract error context with more reliable line number tracking.

    Args:
        exc_info: The exception info from sys.exc_info()
        context_lines: Number of lines to show before and after the error

    Returns:
        dict: Error context information
    """
    ...
def truncate(value, threshold):
    """
    Truncate ``value`` when it exceeds ``threshold``.

    NOTE(review): semantics inferred from the name only — presumably shortens
    a string/collection to at most ``threshold`` units; confirm against the
    implementation.
    """
    ...
def optimize_html(html_str, threshold=...):
    """
    Optimize (shrink) an HTML string.

    Args:
        html_str: The HTML string to optimize.
        threshold: Size limit controlling the optimization — presumably a
            truncation threshold; TODO confirm against the implementation.
    """
    ...
class HeadPeekr:
    """
    Helpers to fetch and inspect a page's <head> section — presumably to
    obtain title/meta information without downloading the full document
    (TODO confirm).
    """

    @staticmethod
    async def fetch_head_section(url, timeout=...): # -> bytes | None:
        """Fetch the raw head section of ``url``; None per the inferred return
        type — presumably on failure/timeout (TODO confirm)."""
        ...

    @staticmethod
    async def peek_html(url, timeout=...): # -> str | None:
        """Fetch the head HTML of ``url`` as text; None per the inferred
        return type — presumably on failure/timeout (TODO confirm)."""
        ...

    @staticmethod
    def extract_meta_tags(head_content: str): # -> dict[Any, Any]:
        """Parse ``head_content`` and return meta-tag key/value pairs."""
        ...

    def get_title(head_content: str): # -> str | Any | None:
        """Extract the <title> text from ``head_content``.

        NOTE(review): declared without ``self`` and without ``@staticmethod``
        — likely a missing ``@staticmethod`` decorator; calling this via an
        instance would pass the instance as ``head_content``. Confirm against
        the implementation before relying on instance access.
        """
        ...
def preprocess_html_for_schema(html_content, text_threshold=..., attr_value_threshold=..., max_size=...):
    """
    Preprocess HTML to reduce size while preserving structure for schema generation.

    Args:
        html_content (str): Raw HTML content
        text_threshold (int): Maximum length for text nodes before truncation
        attr_value_threshold (int): Maximum length for attribute values before truncation
        max_size (int): Target maximum size for output HTML

    Returns:
        str: Preprocessed HTML content
    """
    ...
def start_colab_display_server() -> None:
    """
    Start virtual display server in Google Colab.

    Raises error if not running in Colab environment.
    """
    ...
def setup_colab_environment() -> None:
    """
    Alternative setup using IPython magic commands.

    Alternative to ``start_colab_display_server`` for preparing the Colab
    environment.
    """
    ...
def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict:
    """
    Extract page context for link scoring - called ONCE per page for performance.

    Parser-agnostic function that takes pre-extracted data.

    Args:
        page_title: Title of the page
        headlines_text: Combined text from h1, h2, h3 elements
        meta_description: Meta description content
        base_url: Base URL of the page

    Returns:
        Dictionary containing page context data for fast link scoring
    """
    ...
def calculate_link_intrinsic_score(link_text: str, url: str, title_attr: str, class_attr: str, rel_attr: str, page_context: dict) -> float:
    """
    Ultra-fast link quality scoring using only provided data (no DOM access needed).

    Parser-agnostic function.

    Args:
        link_text: Text content of the link
        url: Link URL
        title_attr: Title attribute of the link
        class_attr: Class attribute of the link
        rel_attr: Rel attribute of the link
        page_context: Pre-computed page context from extract_page_context()

    Returns:
        Quality score (0.0 - 10.0), higher is better
    """
    ...
def calculate_total_score(intrinsic_score: Optional[float] = ..., contextual_score: Optional[float] = ..., score_links_enabled: bool = ..., query_provided: bool = ...) -> float:
    """
    Calculate combined total score from intrinsic and contextual scores with smart fallbacks.

    Args:
        intrinsic_score: Quality score based on URL structure, text, and context (0-10)
        contextual_score: BM25 relevance score based on query and head content (0-1 typically)
        score_links_enabled: Whether link scoring is enabled
        query_provided: Whether a query was provided for contextual scoring

    Returns:
        Combined total score (0-10 scale)

    Scoring Logic:
        - No scoring: return 5.0 (neutral score)
        - Only intrinsic: return normalized intrinsic score
        - Only contextual: return contextual score scaled to 10
        - Both: weighted combination (70% intrinsic, 30% contextual scaled)
    """
    ...
async def get_text_embeddings(texts: List[str], llm_config: Optional[Dict] = ..., model_name: str = ..., batch_size: int = ...) -> np.ndarray:
    """
    Compute embeddings for a list of texts using specified model.

    Args:
        texts: List of texts to embed
        llm_config: Optional LLM configuration for API-based embeddings
        model_name: Model name (used when llm_config is None)
        batch_size: Batch size for processing

    Returns:
        numpy array of embeddings
    """
    ...
def get_text_embeddings_sync(texts: List[str], llm_config: Optional[Dict] = ..., model_name: str = ..., batch_size: int = ...) -> np.ndarray:
    """
    Synchronous wrapper for get_text_embeddings.

    Takes the same arguments as :func:`get_text_embeddings` and returns the
    numpy array of embeddings without requiring an event loop at the call site.
    """
    ...
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector (same dimensionality as ``vec1`` — presumably).

    Returns:
        float: The cosine similarity of the two vectors.
    """
    ...
def cosine_distance(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Calculate cosine distance (1 - similarity) between two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector (same dimensionality as ``vec1`` — presumably).

    Returns:
        float: ``1 - cosine_similarity(vec1, vec2)``.
    """
    ...
def get_true_available_memory_gb() -> float:
    """
    Get truly available memory including inactive pages (cross-platform).

    Returns:
        float: Available memory in gigabytes.
    """
    ...
def get_true_memory_usage_percent() -> float:
    """
    Get memory usage percentage that accounts for platform differences.

    Returns:
        float: Memory usage percentage (0-100)
    """
    ...
def get_memory_stats() -> Tuple[float, float, float]:
    """
    Get comprehensive memory statistics.

    Returns:
        Tuple[float, float, float]: (used_percent, available_gb, total_gb)
    """
    ...
def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
    """
    Convert hook function objects to string representations for Docker API.

    This utility simplifies the process of using hooks with the Docker API by converting
    Python function objects into the string format required by the API.

    Args:
        hooks: Dictionary mapping hook point names to Python function objects.
            Functions should be async and follow hook signature requirements.

    Returns:
        Dictionary mapping hook point names to string representations of the functions.

    Example:
        >>> async def my_hook(page, context, **kwargs):
        ...     await page.set_viewport_size({"width": 1920, "height": 1080})
        ...     return page
        >>>
        >>> hooks_dict = {"on_page_context_created": my_hook}
        >>> api_hooks = hooks_to_string(hooks_dict)
        >>> # api_hooks is now ready to use with Docker API

    Raises:
        ValueError: If a hook is not callable or source cannot be extracted
    """
    ...