import numpy as np
from _typeshed import Incomplete
from bs4 import Tag as Tag
from typing import Any, Callable, Generator, Iterable, Sequence
from urllib.parse import quote as quote, unquote as unquote
original_applies_to: Incomplete
def patched_applies_to(self, filename): ...
def chunk_documents(documents: Iterable[str], chunk_token_threshold: int, overlap: int, word_token_rate: float = 0.75, tokenizer: Callable[[str], list[str]] | None = None) -> Generator[str, None, None]: ...
def merge_chunks(docs: Sequence[str], target_size: int, overlap: int = 0, word_token_ratio: float = 1.0, splitter: Callable | None = None) -> list[str]: ...
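
# Usage sketch for the chunking helpers (values are illustrative; the import
# path `crawl4ai.utils` is an assumption about where this stub's module lives):
#
#     from crawl4ai.utils import chunk_documents, merge_chunks
#
#     docs = ["a very long document ...", "another long document ..."]
#     # With tokenizer=None, token counts are estimated from word counts via
#     # word_token_rate, so the threshold is approximate.
#     chunks = list(chunk_documents(docs, chunk_token_threshold=512, overlap=64))
#     # Re-pack small chunks toward a target size, optionally with overlap.
#     merged = merge_chunks(chunks, target_size=1024, overlap=0)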
class VersionManager:
    home_dir: Incomplete
    version_file: Incomplete
    def __init__(self) -> None: ...
    def get_installed_version(self): ...
    def update_version(self) -> None: ...
    def needs_update(self): ...
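
# Minimal sketch of the version-bookkeeping flow (method semantics are
# inferred from the names; treat them as an assumption):
#
#     vm = VersionManager()
#     if vm.needs_update():              # installed version differs from current
#         print(vm.get_installed_version())
#         vm.update_version()            # persist the current version to version_file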
class RobotsParser:
    CACHE_TTL: Incomplete
    cache_dir: Incomplete
    cache_ttl: Incomplete
    db_path: Incomplete
    def __init__(self, cache_dir=None, cache_ttl=None) -> None: ...
    async def can_fetch(self, url: str, user_agent: str = '*') -> bool: ...
    def clear_cache(self) -> None: ...
    def clear_expired(self) -> None: ...
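
# can_fetch() is async, so it needs an event loop; a usage sketch with asyncio
# (cache_dir/cache_ttl fall back to the class defaults when omitted):
#
#     import asyncio
#
#     async def main() -> None:
#         parser = RobotsParser()
#         allowed = await parser.can_fetch("https://example.com/page", user_agent="*")
#         if not allowed:
#             print("blocked by robots.txt")
#         parser.clear_expired()  # drop cached robots.txt entries past cache_ttl
#
#     asyncio.run(main())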
class InvalidCSSSelectorError(Exception): ...
SPLITS: Incomplete
HTML_CODE_CHARS: Incomplete
def advanced_split(text: str) -> list[str]: ...
def create_box_message(message: str, type: str = 'info', width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str: ...
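
# create_box_message wraps text in a drawn box for console output; a usage
# sketch (accepted `type` values beyond 'info', such as 'warning', are an
# assumption):
#
#     print(create_box_message("Crawl finished", type="info", width=80))
#     print(create_box_message("Low memory", type="warning", double_line=True))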
def calculate_semaphore_count(): ...
def get_system_memory(): ...
def get_home_folder(): ...
async def get_chromium_path(browser_type) -> str: ...
def beautify_html(escaped_html): ...
def split_and_parse_json_objects(json_string): ...
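
# Sketch for parsing concatenated JSON objects out of a single string; the
# (parsed, unparsed) return shape shown here is an assumption, since the stub
# leaves the return type unannotated:
#
#     raw = '{"a": 1}{"b": 2}'
#     parsed, unparsed = split_and_parse_json_objects(raw)
#     # parsed   -> [{"a": 1}, {"b": 2}]
#     # unparsed -> segments that failed to parse, if any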
def sanitize_html(html): ...
def sanitize_input_encode(text: str) -> str: ...
def escape_json_string(s): ...
def replace_inline_tags(soup, tags, only_text: bool = False): ...
def get_content_of_website(url, html, word_count_threshold=..., css_selector=None, **kwargs): ...
def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = ..., css_selector: str | None = None, **kwargs) -> dict[str, Any]: ...
def extract_metadata_using_lxml(html, doc=None): ...
def extract_metadata(html, soup=None): ...
def extract_xml_tags(string): ...
def extract_xml_data_legacy(tags, string): ...
def extract_xml_data(tags, string): ...
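
# Sketch for pulling tagged blocks out of LLM output; the dict-of-tag-contents
# return shape is an assumption (the stub leaves it unannotated):
#
#     response = "<blocks>[...]</blocks><index>0</index>"
#     data = extract_xml_data(["blocks", "index"], response)
#     # data -> {"blocks": "[...]", "index": "0"}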
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response: bool = False, base_url=None, **kwargs): ...
def extract_blocks(url, html, provider=..., api_token=None, base_url=None): ...
def extract_blocks_batch(batch_data, provider: str = 'groq/llama3-70b-8192', api_token=None): ...
def merge_chunks_based_on_token_threshold(chunks, token_threshold): ...
def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list: ...
def wrap_text(draw, text, font, max_width): ...
def format_html(html_string): ...
def fast_format_html(html_string): ...
def normalize_url(href, base_url): ...
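
# normalize_url resolves a (possibly relative) href against a base URL; a
# usage sketch, with the expected result following urljoin-style resolution
# (exact normalization rules, e.g. fragment handling, are an assumption):
#
#     absolute = normalize_url("../docs/page.html", "https://example.com/a/b/")
#     # expected: "https://example.com/a/docs/page.html"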
def normalize_url_for_deep_crawl(href, base_url, preserve_https: bool = False, original_scheme=None): ...
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https: bool = False, original_scheme=None): ...
def normalize_url_tmp(href, base_url): ...
def get_base_domain(url: str) -> str: ...
def is_external_url(url: str, base_domain: str) -> bool: ...
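
# Sketch combining the domain helpers (whether get_base_domain strips
# subdomains such as "www." is an assumption):
#
#     base = get_base_domain("https://example.com/start")
#     is_external_url("https://example.com/about", base)   # -> False
#     is_external_url("https://other.org/page", base)      # -> True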
def clean_tokens(tokens: list[str]) -> list[str]: ...
def profile_and_time(func): ...
def generate_content_hash(content: str) -> str: ...
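
# generate_content_hash yields a stable fingerprint for deduplicating crawled
# content; the underlying hash algorithm is not specified by the stub:
#
#     h1 = generate_content_hash("<html>...</html>")
#     h2 = generate_content_hash("<html>...</html>")
#     assert h1 == h2  # identical content hashes identically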
def ensure_content_dirs(base_path: str) -> dict[str, str]: ...
def configure_windows_event_loop() -> None: ...
def get_error_context(exc_info, context_lines: int = 5): ...
def truncate(value, threshold): ...
def optimize_html(html_str, threshold: int = 200): ...
class HeadPeekr:
    @staticmethod
    async def fetch_head_section(url, timeout: float = 0.3): ...
    @staticmethod
    async def peek_html(url, timeout: float = 0.3): ...
    @staticmethod
    def extract_meta_tags(head_content: str): ...
    @staticmethod
    def get_title(head_content: str): ...
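
# HeadPeekr fetches only a page's <head> for cheap metadata peeks; an asyncio
# sketch (timeout is in seconds, per the 0.3 default above):
#
#     import asyncio
#
#     async def peek(url: str) -> None:
#         head = await HeadPeekr.fetch_head_section(url, timeout=1.0)
#         if head:
#             print(HeadPeekr.get_title(head))
#             print(HeadPeekr.extract_meta_tags(head))
#
#     asyncio.run(peek("https://example.com"))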
def preprocess_html_for_schema(html_content, text_threshold: int = 100, attr_value_threshold: int = 200, max_size: int = 100000): ...
def start_colab_display_server() -> None: ...
def setup_colab_environment() -> None: ...
def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict: ...
def calculate_link_intrinsic_score(link_text: str, url: str, title_attr: str, class_attr: str, rel_attr: str, page_context: dict) -> float: ...
def calculate_total_score(intrinsic_score: float | None = None, contextual_score: float | None = None, score_links_enabled: bool = False, query_provided: bool = False) -> float: ...
async def get_text_embeddings(texts: list[str], llm_config: dict | None = None, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', batch_size: int = 32) -> np.ndarray: ...
def get_text_embeddings_sync(texts: list[str], llm_config: dict | None = None, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', batch_size: int = 32) -> np.ndarray: ...
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float: ...
def cosine_distance(vec1: np.ndarray, vec2: np.ndarray) -> float: ...
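
# Embedding + similarity sketch; the default model string suggests a
# sentence-transformers backend, but backend selection via llm_config is an
# assumption:
#
#     import asyncio
#
#     async def main() -> None:
#         vecs = await get_text_embeddings(["hello world", "hi there"])
#         print(cosine_similarity(vecs[0], vecs[1]))  # 1.0 = identical direction
#         print(cosine_distance(vecs[0], vecs[1]))    # typically 1 - similarity
#
#     asyncio.run(main())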
def get_true_available_memory_gb() -> float: ...
def get_true_memory_usage_percent() -> float: ...
def get_memory_stats() -> tuple[float, float, float]: ...
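
# Memory-helper sketch; the order and meaning of the three floats returned by
# get_memory_stats() are an assumption (the stub only fixes the arity):
#
#     available_gb = get_true_available_memory_gb()
#     used_pct = get_true_memory_usage_percent()
#     a, b, c = get_memory_stats()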
def hooks_to_string(hooks: dict[str, Callable]) -> dict[str, str]: ...