from .utils import *
from .models import *
from .model_loader import *
import abc
from .types import LLMConfig
from _typeshed import Incomplete
from abc import ABC, abstractmethod
from enum import IntFlag
from typing import Any
class ExtractionStrategy(ABC):
    """Abstract base class shared by every extraction strategy in this stub.

    ``extract`` is the only abstract method; ``run`` has a non-abstract
    declaration here and is re-declared by several subclasses.

    ``ABC`` already supplies ``abc.ABCMeta`` as the metaclass, so no explicit
    ``metaclass=`` keyword is needed on this class.
    """

    # Attribute types were not inferred by the stub generator (``Incomplete``).
    input_format: Incomplete
    DEL: str
    name: Incomplete
    verbose: Incomplete

    # ``input_format`` defaults to 'markdown'; any further keyword arguments
    # are accepted untyped.
    def __init__(self, input_format: str = 'markdown', **kwargs) -> None: ...

    # Takes a URL and an HTML string and returns a list of dicts.
    # Must be overridden by concrete subclasses.
    @abstractmethod
    def extract(
        self, url: str, html: str, *q, **kwargs
    ) -> list[dict[str, Any]]: ...

    # Takes a URL and a list of section strings and returns a list of dicts.
    def run(
        self, url: str, sections: list[str], *q, **kwargs
    ) -> list[dict[str, Any]]: ...
class NoExtractionStrategy(ExtractionStrategy):
    """Concrete ``ExtractionStrategy`` that re-declares ``extract`` and ``run``.

    Both methods keep the base-class signatures. The behaviour is not visible
    in this stub; the name suggests a pass-through -- TODO confirm against the
    runtime module.
    """

    def extract(
        self,
        url: str,
        html: str,
        *q,
        **kwargs,
    ) -> list[dict[str, Any]]: ...

    def run(
        self,
        url: str,
        sections: list[str],
        *q,
        **kwargs,
    ) -> list[dict[str, Any]]: ...
class CosineStrategy(ExtractionStrategy):
    """Extraction strategy exposing embedding, clustering and filtering helpers.

    Only signatures are visible here. Going by the method and parameter
    names, this presumably clusters text by embedding similarity -- confirm
    against the runtime module.
    """

    # Instance attributes; the stub generator could not infer most types.
    semantic_filter: Incomplete
    word_count_threshold: Incomplete
    max_dist: Incomplete
    linkage_method: Incomplete
    top_k: Incomplete
    sim_threshold: Incomplete
    timer: Incomplete
    verbose: Incomplete
    buffer_embeddings: Incomplete
    get_embedding_method: str
    device: Incomplete
    default_batch_size: Incomplete

    # All parameters are optional; extra keyword arguments are accepted.
    def __init__(
        self,
        semantic_filter=None,
        word_count_threshold: int = 10,
        max_dist: float = 0.2,
        linkage_method: str = 'ward',
        top_k: int = 3,
        model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
        sim_threshold: float = 0.3,
        **kwargs,
    ) -> None: ...

    # Returns a list of document strings; ``at_least_k`` defaults to 20.
    def filter_documents_embeddings(
        self,
        documents: list[str],
        semantic_filter: str,
        at_least_k: int = 20,
    ) -> list[str]: ...

    # Return type is not declared in the stub.
    def get_embeddings(
        self,
        sentences: list[str],
        batch_size=None,
        bypass_buffer: bool = False,
    ): ...

    # Return type is not declared in the stub; ``embeddings`` is optional.
    def hierarchical_clustering(
        self,
        sentences: list[str],
        embeddings=None,
    ): ...

    # Maps a cluster-id -> sentences dict to a dict of the same shape.
    def filter_clusters_by_word_count(
        self,
        clusters: dict[int, list[str]],
    ) -> dict[int, list[str]]: ...

    def extract(
        self,
        url: str,
        html: str,
        *q,
        **kwargs,
    ) -> list[dict[str, Any]]: ...

    def run(
        self,
        url: str,
        sections: list[str],
        *q,
        **kwargs,
    ) -> list[dict[str, Any]]: ...
class LLMExtractionStrategy(ExtractionStrategy):
    """Extraction strategy configured through an ``LLMConfig``.

    Parameters that default to ``None`` are annotated as ``X | None`` so that
    type checkers running without implicit-Optional accept an explicit
    ``None``. This matches the existing ``api_token: str | None`` annotation.
    """

    # Instance attributes; the stub generator could not infer their types.
    llm_config: Incomplete
    instruction: Incomplete
    # NOTE(review): the attribute is ``extract_type`` while the constructor
    # parameter is ``extraction_type`` -- confirm this is intentional.
    extract_type: Incomplete
    schema: Incomplete
    force_json_response: Incomplete
    chunk_token_threshold: Incomplete
    overlap_rate: Incomplete
    word_token_rate: Incomplete
    apply_chunking: Incomplete
    extra_args: Incomplete
    verbose: Incomplete
    usages: Incomplete
    total_usage: Incomplete
    provider: Incomplete
    api_token: Incomplete
    base_url: Incomplete
    api_base: Incomplete

    # Defaults written as ``...`` are elided by the stub; their runtime
    # values are not visible here.
    def __init__(
        self,
        llm_config: LLMConfig | None = None,
        instruction: str | None = None,
        schema: dict | None = None,
        extraction_type: str = 'block',
        chunk_token_threshold=...,
        overlap_rate=...,
        word_token_rate=...,
        apply_chunking: bool = True,
        input_format: str = 'markdown',
        force_json_response: bool = False,
        verbose: bool = False,
        provider: str = ...,
        api_token: str | None = None,
        base_url: str | None = None,
        api_base: str | None = None,
        **kwargs,
    ) -> None: ...

    # Attribute assignment is customised; the behaviour is not visible here.
    def __setattr__(self, name, value) -> None: ...

    # NOTE(review): this signature (url, ix, html) differs from the base
    # class's ``extract(url, html, *q, **kwargs)``. It is kept as declared.
    def extract(self, url: str, ix: int, html: str) -> list[dict[str, Any]]: ...

    # Unlike the base class, ``run`` accepts no extra positional or keyword
    # arguments.
    def run(self, url: str, sections: list[str]) -> list[dict[str, Any]]: ...

    def show_usage(self) -> None: ...
class JsonElementExtractionStrategy(ExtractionStrategy, metaclass=abc.ABCMeta):
    """Schema-driven base class for the ``Json*ExtractionStrategy`` variants.

    The class is declared with ``abc.ABCMeta`` although no abstract methods
    are visible in this stub; presumably the abstract hooks are private and
    were omitted by the stub generator -- TODO confirm.
    """

    DEL: str
    schema: Incomplete
    verbose: Incomplete

    # ``schema`` is required; extra keyword arguments are accepted.
    def __init__(self, schema: dict[str, Any], **kwargs) -> None: ...

    # Same shape as the base ``extract``, with the second parameter named
    # ``html_content`` instead of ``html``.
    def extract(
        self, url: str, html_content: str, *q, **kwargs
    ) -> list[dict[str, Any]]: ...

    def run(
        self, url: str, sections: list[str], *q, **kwargs
    ) -> list[dict[str, Any]]: ...

    # Static helper that returns a schema dict. Parameters defaulting to
    # ``None`` are annotated ``str | None`` so an explicit ``None`` type-checks.
    # The default of ``llm_config`` is elided by the stub.
    @staticmethod
    def generate_schema(
        html: str,
        schema_type: str = 'CSS',
        query: str | None = None,
        target_json_example: str | None = None,
        llm_config: LLMConfig = ...,
        provider: str | None = None,
        api_token: str | None = None,
        **kwargs,
    ) -> dict: ...
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
    """``JsonElementExtractionStrategy`` subclass that only re-declares ``__init__``.

    Presumably the CSS-selector variant, going by the name -- confirm against
    the runtime module.
    """

    def __init__(
        self,
        schema: dict[str, Any],
        **kwargs,
    ) -> None: ...
class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy):
    """``JsonElementExtractionStrategy`` subclass carrying extra instance attributes.

    The attribute names suggest cached, lxml-backed selection; their types
    and meaning are not visible in this stub -- confirm against the runtime
    module.
    """

    # Types were not inferred by the stub generator.
    use_caching: Incomplete
    optimize_common_patterns: Incomplete
    etree: Incomplete
    html_parser: Incomplete
    CSSSelector: Incomplete

    def __init__(
        self,
        schema: dict[str, Any],
        **kwargs,
    ) -> None: ...
class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy):
    """``JsonElementExtractionStrategy`` subclass that only re-declares ``__init__``.

    How it differs from ``JsonLxmlExtractionStrategy`` is not visible in this
    stub.
    """

    def __init__(
        self,
        schema: dict[str, Any],
        **kwargs,
    ) -> None: ...
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
    """``JsonElementExtractionStrategy`` subclass that only re-declares ``__init__``.

    Presumably the XPath variant, going by the name -- confirm against the
    runtime module.
    """

    def __init__(
        self,
        schema: dict[str, Any],
        **kwargs,
    ) -> None: ...
class RegexExtractionStrategy(ExtractionStrategy):
    """Extraction strategy selected by a bit-flag of built-in pattern kinds.

    Declares a private ``IntFlag`` (``_B``) of pattern selectors, a set of
    class-level names that mirror those selectors, a ``DEFAULT_PATTERNS``
    mapping, and a static ``generate_pattern`` helper.
    """

    # NOTE(review): the source carried no indentation. ``_B`` is taken to end
    # after ``ALL`` (the last ``name = ...`` member), with the annotated names
    # that follow belonging to the outer class -- confirm against the runtime
    # module.
    class _B(IntFlag):
        # Member values are elided by the stub.
        EMAIL = ...
        PHONE_INTL = ...
        PHONE_US = ...
        URL = ...
        IPV4 = ...
        IPV6 = ...
        UUID = ...
        CURRENCY = ...
        PERCENTAGE = ...
        NUMBER = ...
        DATE_ISO = ...
        DATE_US = ...
        TIME_24H = ...
        POSTAL_US = ...
        POSTAL_UK = ...
        HTML_COLOR_HEX = ...
        TWITTER_HANDLE = ...
        HASHTAG = ...
        MAC_ADDR = ...
        IBAN = ...
        CREDIT_CARD = ...
        NOTHING = ...
        # Union of every member above except NOTHING.
        ALL = (
            EMAIL
            | PHONE_INTL
            | PHONE_US
            | URL
            | IPV4
            | IPV6
            | UUID
            | CURRENCY
            | PERCENTAGE
            | NUMBER
            | DATE_ISO
            | DATE_US
            | TIME_24H
            | POSTAL_US
            | POSTAL_UK
            | HTML_COLOR_HEX
            | TWITTER_HANDLE
            | HASHTAG
            | MAC_ADDR
            | IBAN
            | CREDIT_CARD
        )

    # Presumably public aliases of the ``_B`` members; the stub leaves their
    # types as ``Incomplete`` -- TODO confirm.
    Email: Incomplete
    PhoneIntl: Incomplete
    PhoneUS: Incomplete
    Url: Incomplete
    IPv4: Incomplete
    IPv6: Incomplete
    Uuid: Incomplete
    Currency: Incomplete
    Percentage: Incomplete
    Number: Incomplete
    DateIso: Incomplete
    DateUS: Incomplete
    Time24h: Incomplete
    PostalUS: Incomplete
    PostalUK: Incomplete
    HexColor: Incomplete
    TwitterHandle: Incomplete
    Hashtag: Incomplete
    MacAddr: Incomplete
    Iban: Incomplete
    CreditCard: Incomplete
    All: Incomplete
    Nothing: Incomplete

    # String-to-string mapping; its contents are not visible in this stub.
    DEFAULT_PATTERNS: dict[str, str]

    # ``pattern`` is the only positional parameter and its default is elided.
    # ``custom`` and ``input_format`` are keyword-only.
    def __init__(
        self,
        pattern: _B = ...,
        *,
        custom: dict[str, str] | list[tuple[str, str]] | None = None,
        input_format: str = 'fit_html',
        **kwargs,
    ) -> None: ...

    # Same shape as the base ``extract``, with the second parameter named
    # ``content`` and the keyword catch-all named ``kw``.
    def extract(
        self,
        url: str,
        content: str,
        *q,
        **kw,
    ) -> list[dict[str, Any]]: ...

    # Static helper returning a string-to-string dict. Everything after
    # ``html`` is keyword-only.
    @staticmethod
    def generate_pattern(
        label: str,
        html: str,
        *,
        query: str | None = None,
        examples: list[str] | None = None,
        llm_config: LLMConfig | None = None,
        **kwargs,
    ) -> dict[str, str]: ...