import numpy as np
from _typeshed import Incomplete
from bs4 import Tag as Tag
from typing import Any, Callable, Generator, Iterable, Sequence
from urllib.parse import quote as quote, unquote as unquote
original_applies_to: Incomplete
def patched_applies_to(self, filename): ...
def chunk_documents(documents: Iterable[str], chunk_token_threshold: int, overlap: int, word_token_rate: float = 0.75, tokenizer: Callable[[str], list[str]] | None = None) -> Generator[str, None, None]: ...
def merge_chunks(docs: Sequence[str], target_size: int, overlap: int = 0, word_token_ratio: float = 1.0, splitter: Callable | None = None) -> list[str]: ...
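
# Usage sketch for the chunking helpers (values are illustrative; the import
# path `crawl4ai.utils` is an assumption about where this stub's module lives):
#
#     from crawl4ai.utils import chunk_documents, merge_chunks
#
#     docs = ["a very long document ...", "another long document ..."]
#     # With tokenizer=None, token counts are estimated from word counts via
#     # word_token_rate, so the threshold is approximate.
#     chunks = list(chunk_documents(docs, chunk_token_threshold=512, overlap=64))
#     # Re-pack small chunks toward a target size, optionally with overlap.
#     merged = merge_chunks(chunks, target_size=1024, overlap=0)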
class VersionManager:
    home_dir: Incomplete
    version_file: Incomplete
    def __init__(self) -> None: ...
    def get_installed_version(self): ...
    def update_version(self) -> None: ...
    def needs_update(self): ...
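
# Minimal sketch of the version-bookkeeping flow (method semantics are
# inferred from the names; treat them as an assumption):
#
#     vm = VersionManager()
#     if vm.needs_update():              # installed version differs from current
#         print(vm.get_installed_version())
#         vm.update_version()            # persist the current version to version_file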
class RobotsParser:
    CACHE_TTL: Incomplete
    cache_dir: Incomplete
    cache_ttl: Incomplete
    db_path: Incomplete
    def __init__(self, cache_dir=None, cache_ttl=None) -> None: ...
    async def can_fetch(self, url: str, user_agent: str = '*') -> bool: ...
    def clear_cache(self) -> None: ...
    def clear_expired(self) -> None: ...
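
# can_fetch() is async, so it needs an event loop; a usage sketch with asyncio
# (cache_dir/cache_ttl fall back to the class defaults when omitted):
#
#     import asyncio
#
#     async def main() -> None:
#         parser = RobotsParser()
#         allowed = await parser.can_fetch("https://example.com/page", user_agent="*")
#         if not allowed:
#             print("blocked by robots.txt")
#         parser.clear_expired()  # drop cached robots.txt entries past cache_ttl
#
#     asyncio.run(main())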
class InvalidCSSSelectorError(Exception): ...
SPLITS: Incomplete
HTML_CODE_CHARS: Incomplete
def advanced_split(text: str) -> list[str]: ...
def create_box_message(message: str, type: str = 'info', width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str: ...
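
# create_box_message wraps text in a drawn box for console output; a usage
# sketch (accepted `type` values beyond 'info', such as 'warning', are an
# assumption):
#
#     print(create_box_message("Crawl finished", type="info", width=80))
#     print(create_box_message("Low memory", type="warning", double_line=True))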
def calculate_semaphore_count(): ...
def get_system_memory(): ...
def get_home_folder(): ...
async def get_chromium_path(browser_type) -> str: ...
def beautify_html(escaped_html): ...
def split_and_parse_json_objects(json_string): ...
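
# Sketch for parsing concatenated JSON objects out of a single string; the
# (parsed, unparsed) return shape shown here is an assumption, since the stub
# leaves the return type unannotated:
#
#     raw = '{"a": 1}{"b": 2}'
#     parsed, unparsed = split_and_parse_json_objects(raw)
#     # parsed   -> [{"a": 1}, {"b": 2}]
#     # unparsed -> segments that failed to parse, if any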
def sanitize_html(html): ...
def sanitize_input_encode(text: str) -> str: ...
def escape_json_string(s): ...
def replace_inline_tags(soup, tags, only_text: bool = False): ...
def get_content_of_website(url, html, word_count_threshold=..., css_selector=None, **kwargs): ...
def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = ..., css_selector: str | None = None, **kwargs) -> dict[str, Any]: ...
def extract_metadata_using_lxml(html, doc=None): ...
def extract_metadata(html, soup=None): ...
def extract_xml_tags(string): ...
def extract_xml_data_legacy(tags, string): ...
def extract_xml_data(tags, string): ...
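
# Sketch for pulling tagged blocks out of LLM output; the dict-of-tag-contents
# return shape is an assumption (the stub leaves it unannotated):
#
#     response = "<blocks>[...]</blocks><index>0</index>"
#     data = extract_xml_data(["blocks", "index"], response)
#     # data -> {"blocks": "[...]", "index": "0"}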
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response: bool = False, base_url=None, **kwargs): ...
def extract_blocks(url, html, provider=..., api_token=None, base_url=None): ...
def extract_blocks_batch(batch_data, provider: str = 'groq/llama3-70b-8192', api_token=None): ...
def merge_chunks_based_on_token_threshold(chunks, token_threshold): ...
def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list: ...
def wrap_text(draw, text, font, max_width): ...
def format_html(html_string): ...
def fast_format_html(html_string): ...
def normalize_url(href, base_url): ...
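
# normalize_url resolves a (possibly relative) href against a base URL; a
# usage sketch, with the expected result following urljoin-style resolution
# (exact normalization rules, e.g. fragment handling, are an assumption):
#
#     absolute = normalize_url("../docs/page.html", "https://example.com/a/b/")
#     # expected: "https://example.com/a/docs/page.html"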
def normalize_url_for_deep_crawl(href, base_url, preserve_https: bool = False, original_scheme=None): ...
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https: bool = False, original_scheme=None): ...
def normalize_url_tmp(href, base_url): ...
def get_base_domain(url: str) -> str: ...
def is_external_url(url: str, base_domain: str) -> bool: ...
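
# Sketch combining the domain helpers (whether get_base_domain strips
# subdomains such as "www." is an assumption):
#
#     base = get_base_domain("https://example.com/start")
#     is_external_url("https://example.com/about", base)   # -> False
#     is_external_url("https://other.org/page", base)      # -> True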
def clean_tokens(tokens: list[str]) -> list[str]: ...
def profile_and_time(func): ...
def generate_content_hash(content: str) -> str: ...
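
# generate_content_hash yields a stable fingerprint for deduplicating crawled
# content; the underlying hash algorithm is not specified by the stub:
#
#     h1 = generate_content_hash("<html>...</html>")
#     h2 = generate_content_hash("<html>...</html>")
#     assert h1 == h2  # identical content hashes identically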
def ensure_content_dirs(base_path: str) -> dict[str, str]: ...
def configure_windows_event_loop() -> None: ...
def get_error_context(exc_info, context_lines: int = 5): ...
def truncate(value, threshold): ...
def optimize_html(html_str, threshold: int = 200): ...
class HeadPeekr:
    @staticmethod
    async def fetch_head_section(url, timeout: float = 0.3): ...
    @staticmethod
    async def peek_html(url, timeout: float = 0.3): ...
    @staticmethod
    def extract_meta_tags(head_content: str): ...
    @staticmethod
    def get_title(head_content: str): ...
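
# HeadPeekr fetches only a page's <head> for cheap metadata peeks; an asyncio
# sketch (timeout is in seconds, per the 0.3 default above):
#
#     import asyncio
#
#     async def peek(url: str) -> None:
#         head = await HeadPeekr.fetch_head_section(url, timeout=1.0)
#         if head:
#             print(HeadPeekr.get_title(head))
#             print(HeadPeekr.extract_meta_tags(head))
#
#     asyncio.run(peek("https://example.com"))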
def preprocess_html_for_schema(html_content, text_threshold: int = 100, attr_value_threshold: int = 200, max_size: int = 100000): ...
def start_colab_display_server() -> None: ...
def setup_colab_environment() -> None: ...
def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict: ...
def calculate_link_intrinsic_score(link_text: str, url: str, title_attr: str, class_attr: str, rel_attr: str, page_context: dict) -> float: ...
def calculate_total_score(intrinsic_score: float | None = None, contextual_score: float | None = None, score_links_enabled: bool = False, query_provided: bool = False) -> float: ...
async def get_text_embeddings(texts: list[str], llm_config: dict | None = None, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', batch_size: int = 32) -> np.ndarray: ...
def get_text_embeddings_sync(texts: list[str], llm_config: dict | None = None, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', batch_size: int = 32) -> np.ndarray: ...
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float: ...
def cosine_distance(vec1: np.ndarray, vec2: np.ndarray) -> float: ...
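
# Embedding + similarity sketch; the default model string suggests a
# sentence-transformers backend, but backend selection via llm_config is an
# assumption:
#
#     import asyncio
#
#     async def main() -> None:
#         vecs = await get_text_embeddings(["hello world", "hi there"])
#         print(cosine_similarity(vecs[0], vecs[1]))  # 1.0 = identical direction
#         print(cosine_distance(vecs[0], vecs[1]))    # typically 1 - similarity
#
#     asyncio.run(main())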
def get_true_available_memory_gb() -> float: ...
def get_true_memory_usage_percent() -> float: ...
def get_memory_stats() -> tuple[float, float, float]: ...
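
# Memory-helper sketch; the order and meaning of the three floats returned by
# get_memory_stats() are an assumption (the stub only fixes the arity):
#
#     available_gb = get_true_available_memory_gb()
#     used_pct = get_true_memory_usage_percent()
#     a, b, c = get_memory_stats()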
def hooks_to_string(hooks: dict[str, Callable]) -> dict[str, str]: ...