"""
This type stub file was generated by pyright.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
from lxml import html as lhtml
from .models import ScrapingResult
# Module-level constants. Their values are elided (`...`) in this stub, so
# only the names are part of the declared interface here.
# NOTE(review): the names suggest regex patterns for Open Graph metadata,
# Twitter metadata, and dimension strings — confirm against the implementation.
OG_REGEX = ...
TWITTER_REGEX = ...
DIMENSION_REGEX = ...
def parse_srcset(s: str) -> List[Dict]:
    """
    Parse the string `s` into a list of dictionaries.

    NOTE(review): presumably `s` is the value of an HTML `srcset` attribute
    and each dictionary describes one image candidate; the dictionary keys
    are not visible in this stub — confirm against the implementation.

    Args:
        s (str): The string to parse.

    Returns:
        List[Dict]: The parsed entries.
    """
    ...
def parse_dimension(dimension): # -> tuple[int, str | Any] | tuple[None, None]:
    """
    Parse a dimension value.

    The return type is not declared in the original source; the trailing
    comment on the `def` line is the type pyright inferred: either a
    two-item tuple whose first item is an `int`, or `(None, None)`.
    NOTE(review): presumably the second item is a unit string and
    `(None, None)` signals an unparseable input — confirm before promoting
    the comment to a real annotation.

    Args:
        dimension: The value to parse (type not declared in the stub).
    """
    ...
def fetch_image_file_size(img, base_url): # -> str | None:
    """
    Determine the file size of an image.

    The return type is not declared in the original source; the trailing
    comment on the `def` line is the type pyright inferred (`str` or `None`).
    NOTE(review): presumably `None` means the size could not be determined,
    and `base_url` is used to resolve a relative image URL — confirm against
    the implementation.

    Args:
        img: The image to inspect (type not declared in the stub).
        base_url: The base URL associated with the image (type not declared).
    """
    ...
class ContentScrapingStrategy(ABC):
    """
    Abstract interface for content scraping strategies.

    Subclasses must implement both a synchronous (`scrap`) and an
    asynchronous (`ascrap`) entry point with the same parameters and
    return type.
    """

    @abstractmethod
    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """
        Scrape content synchronously.

        Args:
            url (str): The URL of the page.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            ScrapingResult: The scraping result.
        """
        ...

    @abstractmethod
    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """
        Scrape content asynchronously; coroutine counterpart of `scrap`.

        Args:
            url (str): The URL of the page.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            ScrapingResult: The scraping result.
        """
        ...
class LXMLWebScrapingStrategy(ContentScrapingStrategy):
    """
    LXML-based implementation for fast web content scraping.

    This is the primary scraping strategy in Crawl4AI, providing
    high-performance HTML parsing and content extraction using the lxml
    library.

    Note: WebScrapingStrategy is now an alias for this class to maintain
    backward compatibility.
    """

    def __init__(self, logger=...) -> None:
        """
        Initialize the strategy.

        Args:
            logger: Optional logger; its default value is elided in this stub.
        """
        ...

    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """
        Main entry point for content scraping.

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            ScrapingResult: A structured result containing the scraped content.
        """
        ...

    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """
        Main entry point for asynchronous content scraping.

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            ScrapingResult: A structured result containing the scraped content.
        """
        ...

    def process_element(self, url, element: lhtml.HtmlElement, **kwargs) -> Dict[str, Any]:
        """
        Process an HTML element.

        How it works:
        1. Check if the element is an image, video, or audio.
        2. Extract the element's attributes and content.
        3. Process the element based on its type.
        4. Return the processed element information.

        Args:
            url (str): The URL of the page containing the element.
            element (lhtml.HtmlElement): The HTML element to process.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: A dictionary containing the processed element information.
        """
        ...

    def find_closest_parent_with_useful_text(self, element: lhtml.HtmlElement, **kwargs) -> Optional[str]:
        """
        Return text associated with an ancestor of `element`, or None.

        NOTE(review): judging by the name, this presumably walks up the tree
        until a parent with enough meaningful text is found; the criteria for
        "useful" are not visible in this stub — confirm against the
        implementation.

        Args:
            element (lhtml.HtmlElement): The element whose parents are searched.
            **kwargs: Additional keyword arguments.

        Returns:
            Optional[str]: A string, or None.
        """
        ...

    def flatten_nested_elements(self, element: lhtml.HtmlElement) -> lhtml.HtmlElement:
        """Flatten nested elements of the same type in LXML tree"""
        ...

    def process_image(self, img: lhtml.HtmlElement, url: str, index: int, total_images: int, **kwargs) -> Optional[List[Dict]]:
        """
        Process a single image element.

        NOTE(review): presumably `index` is the position of this image among
        `total_images` on the page, and None means the image was rejected;
        the dictionary schema is not visible in this stub — confirm against
        the implementation.

        Args:
            img (lhtml.HtmlElement): The image element.
            url (str): A URL associated with the image (presumably the page URL).
            index (int): Index of the image.
            total_images (int): Total number of images.
            **kwargs: Additional keyword arguments.

        Returns:
            Optional[List[Dict]]: A list of dictionaries, or None.
        """
        ...

    def remove_empty_elements_fast(self, root, word_count_threshold=...):
        """
        Remove elements that fall below the desired word threshold in a single pass from the bottom up.

        Skips non-element nodes like HtmlComment and bypasses certain tags that are allowed to have no content.

        Args:
            root: The root of the tree to prune (type not declared in the stub).
            word_count_threshold: The word threshold; its default value is
                elided in this stub.
        """
        ...

    def remove_unwanted_attributes_fast(self, root: lhtml.HtmlElement, important_attrs=..., keep_data_attributes=...) -> lhtml.HtmlElement:
        """
        Removes all attributes from each element (including root) except those in `important_attrs`.

        If `keep_data_attributes=True`, also retain any attribute starting with 'data-'.
        Returns the same root element, mutated in-place, for fluent usage.

        Args:
            root (lhtml.HtmlElement): The root element to clean.
            important_attrs: The attributes to keep; default elided in this stub.
            keep_data_attributes: Whether to keep 'data-' attributes; default
                elided in this stub.

        Returns:
            lhtml.HtmlElement: The same `root` element.
        """
        ...
# Backward-compatibility alias: code written against the older name
# `WebScrapingStrategy` gets the LXML-based implementation.
WebScrapingStrategy = LXMLWebScrapingStrategy