"""
This type stub file was generated by pyright.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Tuple, Union
from enum import IntFlag
from .utils import *
from .models import *
from .model_loader import *
from .types import LLMConfig
class ExtractionStrategy(ABC):
    """
    Abstract base class for all extraction strategies.

    Only `extract` is declared abstract; `run` and `arun` are regular
    methods that subclasses may override.
    """

    def __init__(self, input_format: str = ..., **kwargs) -> None:
        """
        Initialize the extraction strategy.

        Args:
            input_format: Content format to use for extraction.
                Options: "markdown" (default), "html", "fit_markdown"
            **kwargs: Additional keyword arguments
        """
        ...

    @abstractmethod
    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract meaningful blocks or chunks from the given HTML.

        Args:
            url: The URL of the webpage.
            html: The HTML content of the webpage.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            A list of extracted blocks or chunks.
        """
        ...

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Process sections of text in parallel by default.

        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            A list of processed JSON blocks.
        """
        ...

    async def arun(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Async version: Process sections of text in parallel using asyncio.

        Default implementation runs the sync version in a thread pool.
        Subclasses can override this for true async processing.

        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            A list of processed JSON blocks.
        """
        ...
class NoExtractionStrategy(ExtractionStrategy):
    """
    A strategy that does not extract any meaningful content from the HTML.
    It simply returns the entire HTML as a single block.
    """

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Return the given HTML as a single block, without real extraction.

        Args:
            url: The URL of the webpage.
            html: The HTML content of the webpage.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            A list of blocks; per the class description it holds the entire
            HTML as one block. NOTE(review): the exact dict keys are not
            visible in this stub — confirm against the implementation.
        """
        ...

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Override of `ExtractionStrategy.run` for the no-op strategy.

        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            A list of blocks. NOTE(review): presumably one pass-through block
            per section — confirm against the implementation.
        """
        ...
class CosineStrategy(ExtractionStrategy):
    """
    Extract meaningful blocks or chunks from the given HTML using cosine similarity.

    How it works:
    1. Pre-filter documents using embeddings and semantic_filter.
    2. Perform clustering using cosine similarity.
    3. Organize texts by their cluster labels, retaining order.
    4. Filter clusters by word count.
    5. Extract meaningful blocks or chunks from the filtered clusters.

    Attributes:
        semantic_filter (str): A keyword filter for document filtering.
        word_count_threshold (int): Minimum number of words per cluster.
        max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
        linkage_method (str): The linkage method for hierarchical clustering.
        top_k (int): Number of top categories to extract.
        model_name (str): The name of the sentence-transformers model.
        sim_threshold (float): The similarity threshold for clustering.
    """

    def __init__(
        self,
        semantic_filter=...,
        word_count_threshold=...,
        max_dist=...,
        linkage_method=...,
        top_k=...,
        model_name=...,
        sim_threshold=...,
        **kwargs,
    ) -> None:
        """
        Initialize the strategy with clustering parameters.

        Args:
            semantic_filter (str): A keyword filter for document filtering.
            word_count_threshold (int): Minimum number of words per cluster.
            max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
            linkage_method (str): The linkage method for hierarchical clustering.
            top_k (int): Number of top categories to extract.
            model_name (str): The name of the sentence-transformers model.
            sim_threshold (float): The similarity threshold for clustering.
            **kwargs: Additional keyword arguments.
        """
        ...

    def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = ...) -> List[str]:
        """
        Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.

        Args:
            documents (List[str]): A list of document texts.
            semantic_filter (str): A keyword filter for document filtering.
            at_least_k (int): The minimum number of documents to return.

        Returns:
            List[str]: A list of filtered and sorted document texts.
        """
        ...

    def get_embeddings(self, sentences: List[str], batch_size=..., bypass_buffer=...):  # -> NDArray[Any]:
        """
        Get BERT embeddings for a list of sentences.

        Args:
            sentences (List[str]): A list of text chunks (sentences).
            batch_size: Not documented in the generated stub; presumably the
                number of sentences embedded per batch — TODO confirm.
            bypass_buffer: Not documented in the generated stub; presumably
                skips a cached-embeddings buffer — TODO confirm.

        Returns:
            NumPy array of embeddings.
        """
        ...

    def hierarchical_clustering(self, sentences: List[str], embeddings=...) -> Any:
        """
        Perform hierarchical clustering on sentences and return cluster labels.

        Args:
            sentences (List[str]): A list of text chunks (sentences).
            embeddings: Not documented in the generated stub; presumably
                precomputed embeddings for `sentences` — TODO confirm.

        Returns:
            NumPy array of cluster labels.
        """
        ...

    def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
        """
        Filter clusters to remove those with a word count below the threshold.

        Args:
            clusters (Dict[int, List[str]]): Dictionary of clusters.

        Returns:
            Dict[int, List[str]]: Filtered dictionary of clusters.
        """
        ...

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract clusters from HTML content using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            html (str): The HTML content of the webpage.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            List[Dict[str, Any]]: A list of processed JSON blocks.
        """
        ...

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Process sections using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): List of sections (strings) to process.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            List[Dict[str, Any]]: A list of processed blocks.
        """
        ...
class LLMExtractionStrategy(ExtractionStrategy):
    """
    A strategy that uses an LLM to extract meaningful content from the HTML.

    Attributes:
        llm_config: The LLM configuration object.
        instruction: The instruction to use for the LLM model.
        schema: Pydantic model schema for structured data.
        extraction_type: "block" or "schema".
        chunk_token_threshold: Maximum tokens per chunk.
        overlap_rate: Overlap between chunks.
        word_token_rate: Word to token conversion rate.
        apply_chunking: Whether to apply chunking.
        verbose: Whether to print verbose output.
        usages: List of individual token usages.
        total_usage: Accumulated token usage.

    Note:
        `extract`, `run` and `arun` do not accept the `*q` / `**kwargs`
        catch-alls declared on `ExtractionStrategy`, and `extract` takes an
        extra positional `ix` between `url` and `html`.
    """

    # Value elided by the stub generator.
    _UNWANTED_PROPS = ...

    def __init__(
        self,
        llm_config: LLMConfig = ...,
        instruction: str = ...,
        schema: Dict = ...,
        extraction_type=...,
        chunk_token_threshold=...,
        overlap_rate=...,
        word_token_rate=...,
        apply_chunking=...,
        input_format: str = ...,
        force_json_response=...,
        verbose=...,
        provider: str = ...,
        api_token: Optional[str] = ...,
        base_url: str = ...,
        api_base: str = ...,
        **kwargs,
    ) -> None:
        """
        Initialize the LLM extraction strategy.

        Args:
            llm_config: The LLM configuration object.
            instruction: The instruction to use for the LLM model.
            schema: Pydantic model schema for structured data.
            extraction_type: "block" or "schema".
            chunk_token_threshold: Maximum tokens per chunk.
            overlap_rate: Overlap between chunks.
            word_token_rate: Word to token conversion rate.
            apply_chunking: Whether to apply chunking.
            input_format: Content format to use for extraction.
                Options: "markdown" (default), "html", "fit_markdown"
            force_json_response: Whether to force a JSON response from the LLM.
            verbose: Whether to print verbose output.

            # Deprecated arguments, will be removed very soon
            provider: The provider to use for extraction. It follows the format
                <provider_name>/<model_name>, e.g., "ollama/llama3.3".
            api_token: The API token for the provider.
            base_url: The base URL for the API request.
            api_base: The base URL for the API request.

            **kwargs: Additional keyword arguments. `extra_args` (additional
                arguments for the API request, such as temperature,
                max_tokens, etc.) is not a named parameter; presumably it is
                read from here — TODO confirm.
        """
        ...

    def __setattr__(self, name: str, value: Any) -> None:
        """Handle attribute setting."""
        ...

    def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
        """
        Extract meaningful blocks or chunks from the given HTML using an LLM.

        How it works:
        1. Construct a prompt with variables.
        2. Make a request to the LLM using the prompt.
        3. Parse the response and extract blocks or chunks.

        Args:
            url: The URL of the webpage.
            ix: Index of the block.
            html: The HTML content of the webpage.

        Returns:
            A list of extracted blocks or chunks.
        """
        ...

    def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
        """
        Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.

        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.

        Returns:
            A list of extracted blocks or chunks.
        """
        ...

    async def aextract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
        """
        Async version: Extract meaningful blocks or chunks from the given HTML using an LLM.

        How it works:
        1. Construct a prompt with variables.
        2. Make an async request to the LLM using the prompt.
        3. Parse the response and extract blocks or chunks.

        Args:
            url: The URL of the webpage.
            ix: Index of the block.
            html: The HTML content of the webpage.

        Returns:
            A list of extracted blocks or chunks.
        """
        ...

    async def arun(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
        """
        Async version: Process sections with true parallelism using asyncio.gather.

        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.

        Returns:
            A list of extracted blocks or chunks.
        """
        ...

    def show_usage(self) -> None:
        """Print a detailed token usage report showing total and per-request usage."""
        ...
class JsonElementExtractionStrategy(ExtractionStrategy):
    """
    Abstract base class for extracting structured JSON from HTML content.

    How it works:
    1. Parses HTML content using the `_parse_html` method.
    2. Uses a schema to define base selectors, fields, and transformations.
    3. Extracts data hierarchically, supporting nested fields and lists.
    4. Handles computed fields with expressions or functions.

    Attributes:
        DEL (str): Delimiter used to combine HTML sections. Defaults to '\\n'.
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
        _extract_item(element, fields): Extracts fields from a single element.
        _extract_single_field(element, field): Extracts a single field based on its type.
        _apply_transform(value, transform): Applies a transformation to a value.
        _compute_field(item, field): Computes a field value using an expression or function.
        run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.

    Abstract Methods:
        _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
        _get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
        _get_elements(element, selector): Retrieves child elements using a selector.
        _get_element_text(element): Extracts text content from an element.
        _get_element_html(element): Extracts raw HTML from an element.
        _get_element_attribute(element, attribute): Extracts an attribute's value from an element.

    Note:
        The underscore-prefixed methods listed above are not declared in this
        generated stub.
    """

    # Delimiter used to join sections in `run`; value elided by the stub generator.
    DEL: str = ...

    def __init__(self, schema: Dict[str, Any], **kwargs) -> None:
        """
        Initialize the JSON element extraction strategy with a schema.

        Args:
            schema (Dict[str, Any]): The schema defining the extraction rules.
            **kwargs: Additional keyword arguments.
        """
        ...

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract structured data from HTML content.

        How it works:
        1. Parses the HTML content using the `_parse_html` method.
        2. Identifies base elements using the schema's base selector.
        3. Extracts fields from each base element using `_extract_item`.

        Args:
            url (str): The URL of the page being processed.
            html_content (str): The raw HTML content to parse and extract.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
        """
        ...

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Run the extraction strategy on a combined HTML content.

        How it works:
        1. Combines multiple HTML sections using the `DEL` delimiter.
        2. Calls the `extract` method with the combined HTML.

        Args:
            url (str): The URL of the page being processed.
            sections (List[str]): A list of HTML sections.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items.
        """
        ...

    # Value elided by the stub generator.
    _GENERATE_SCHEMA_UNWANTED_PROPS = ...

    @staticmethod
    def generate_schema(
        html: str,
        schema_type: str = ...,
        query: str = ...,
        target_json_example: str = ...,
        llm_config: LLMConfig = ...,
        provider: str = ...,
        api_token: str = ...,
        **kwargs,
    ) -> dict:
        """
        Generate extraction schema from HTML content and optional query.

        Args:
            html (str): The HTML content to analyze
            schema_type (str): Not documented in the generated stub;
                presumably selects the selector flavour of the generated
                schema (e.g. CSS vs XPath) — TODO confirm.
            query (str, optional): Natural language description of what data to extract
            target_json_example (str): Not documented in the generated stub;
                presumably an example of the desired JSON output — TODO confirm.
            llm_config (LLMConfig): LLM configuration object
            provider (str): Legacy Parameter. LLM provider to use
            api_token (str): Legacy Parameter. API token for LLM provider
            **kwargs: Additional args passed to LLM processor. A custom
                `prompt` template is mentioned by the original documentation
                but is not a named parameter; presumably it is accepted
                here — TODO confirm.

        Returns:
            dict: Generated schema following the JsonElementExtractionStrategy format
        """
        ...
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
    """
    Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.

    How it works:
    1. Parses HTML content with BeautifulSoup.
    2. Selects elements using CSS selectors defined in the schema.
    3. Extracts field data and applies transformations as defined.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        _parse_html(html_content): Parses HTML content into a BeautifulSoup object.
        _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
        _get_elements(element, selector): Selects child elements using a CSS selector.
        _get_element_text(element): Extracts text content from a BeautifulSoup element.
        _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
        _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs) -> None:
        """
        Initialize the CSS-selector based strategy with a schema.

        Args:
            schema (Dict[str, Any]): The schema defining the extraction rules.
            **kwargs: Additional keyword arguments.
        """
        ...
class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy):
    """
    `JsonElementExtractionStrategy` subclass.

    NOTE(review): the generated stub carries no documentation for this class;
    the name suggests an lxml-backed implementation — confirm against the
    implementation module.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs) -> None:
        """
        Initialize the strategy with a schema.

        Args:
            schema (Dict[str, Any]): The extraction schema.
            **kwargs: Additional keyword arguments.
        """
        ...
class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy):
    """
    `JsonElementExtractionStrategy` subclass.

    NOTE(review): the generated stub carries no documentation for this class;
    the name suggests a simpler ("naive") lxml-backed variant — confirm
    against the implementation module.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs) -> None:
        """
        Initialize the strategy with a schema.

        Args:
            schema (Dict[str, Any]): The extraction schema.
            **kwargs: Additional keyword arguments.
        """
        ...
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
    """
    Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.

    How it works:
    1. Parses HTML content into an lxml tree.
    2. Selects elements using XPath expressions.
    3. Converts CSS selectors to XPath when needed.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        _parse_html(html_content): Parses HTML content into an lxml tree.
        _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
        _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
        _get_elements(element, selector): Selects child elements using an XPath selector.
        _get_element_text(element): Extracts text content from an lxml element.
        _get_element_html(element): Extracts the raw HTML content of an lxml element.
        _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs) -> None:
        """
        Initialize the XPath-selector based strategy with a schema.

        Args:
            schema (Dict[str, Any]): The schema defining the extraction rules.
            **kwargs: Additional keyword arguments.
        """
        ...
# Module-level private constants; their values are elided ("...") by the stub
# generator, so their types and contents cannot be read from this file.
# NOTE(review): the names suggest helpers for sanitising regex patterns
# (control characters, word-boundary fix-ups, an escape check) — confirm
# against the implementation module.
_CTRL = ...
_WB_FIX = ...
_NEEDS_ESCAPE = ...
class RegexExtractionStrategy(ExtractionStrategy):
    """
    A lean strategy that finds e-mails, phones, URLs, dates, money, etc.,
    using nothing but pre-compiled regular expressions.

    Extraction returns::

        {
            "url": "<page-url>",
            "label": "<pattern-label>",
            "value": "<matched-string>",
            "span": [start, end]
        }

    Only `generate_pattern()` touches an LLM, extraction itself is pure Python.
    """

    class _B(IntFlag):
        """Bit flags naming the built-in patterns (member values elided by the stub generator)."""

        EMAIL = ...
        PHONE_INTL = ...
        PHONE_US = ...
        URL = ...
        IPV4 = ...
        IPV6 = ...
        UUID = ...
        CURRENCY = ...
        PERCENTAGE = ...
        NUMBER = ...
        DATE_ISO = ...
        DATE_US = ...
        TIME_24H = ...
        POSTAL_US = ...
        POSTAL_UK = ...
        HTML_COLOR_HEX = ...
        TWITTER_HANDLE = ...
        HASHTAG = ...
        MAC_ADDR = ...
        IBAN = ...
        CREDIT_CARD = ...
        NOTHING = ...
        ALL = ...

    # Class-level names mirroring the `_B` members above; values are elided in
    # the stub. NOTE(review): presumably public aliases of the corresponding
    # `_B` flags (e.g. Email for _B.EMAIL) — confirm against the implementation.
    Email = ...
    PhoneIntl = ...
    PhoneUS = ...
    Url = ...
    IPv4 = ...
    IPv6 = ...
    Uuid = ...
    Currency = ...
    Percentage = ...
    Number = ...
    DateIso = ...
    DateUS = ...
    Time24h = ...
    PostalUS = ...
    PostalUK = ...
    HexColor = ...
    TwitterHandle = ...
    Hashtag = ...
    MacAddr = ...
    Iban = ...
    CreditCard = ...
    All = ...
    Nothing = ...

    # Maps a pattern label to its regex source string (per the annotation).
    DEFAULT_PATTERNS: Dict[str, str] = ...
    # Values elided by the stub generator.
    _FLAGS = ...
    _UNWANTED_PROPS = ...

    def __init__(
        self,
        pattern: _B = ...,
        *,
        custom: Optional[Union[Dict[str, str], List[Tuple[str, str]]]] = ...,
        input_format: str = ...,
        **kwargs,
    ) -> None:
        """
        Initialize the regex extraction strategy.

        Args:
            pattern: `_B` flag value. NOTE(review): presumably selects which
                built-in patterns are enabled — confirm against the
                implementation.
            custom: Custom patterns overriding or extending defaults.
                Dict[label, regex] or list[tuple(label, regex)].
            input_format: "html", "markdown" or "text".
            **kwargs: Forwarded to ExtractionStrategy.
        """
        ...

    def extract(self, url: str, content: str, *q, **kw) -> List[Dict[str, Any]]:
        """
        Run the configured regular expressions over `content`.

        Args:
            url: The URL of the page being processed.
            content: The text to search.
            *q: Additional positional arguments.
            **kw: Additional keyword arguments.

        Returns:
            A list of match dictionaries; the class docstring above gives the
            shape of each item ("url", "label", "value", "span").
        """
        ...

    @staticmethod
    def generate_pattern(
        label: str,
        html: str,
        *,
        query: Optional[str] = ...,
        examples: Optional[List[str]] = ...,
        llm_config: Optional[LLMConfig] = ...,
        **kwargs,
    ) -> Dict[str, str]:
        """
        Ask an LLM for a single page-specific regex and return
        {label: pattern}, ready for RegexExtractionStrategy(custom=...).

        Args:
            label: Label used as the key of the returned mapping.
            html: Page content the pattern is generated for.
            query: NOTE(review): presumably a natural-language description of
                what to match — confirm against the implementation.
            examples: NOTE(review): presumably sample strings the pattern
                should match — confirm against the implementation.
            llm_config: LLM configuration object.
            **kwargs: Additional keyword arguments.

        Returns:
            Dict[str, str]: A single-entry mapping {label: pattern}.
        """
        ...