Dev Tool MCP

Overview Schema Related Servers Score Discussions

content_scraping_strategy.pyi•3.94 KiB

"""
This type stub file was generated by pyright.
"""

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
from lxml import html as lhtml
from .models import ScrapingResult

# Module-level patterns; their values are elided in this stub.
OG_REGEX = ...
TWITTER_REGEX = ...
DIMENSION_REGEX = ...

# NOTE(review): presumably parses an HTML `srcset` attribute value into one
# dict per candidate; the implementation is not visible in this stub.
def parse_srcset(s: str) -> List[Dict]:
    ...

# The trailing comment is the return type pyright inferred; it is not a
# declared annotation.
def parse_dimension(dimension): # -> tuple[int, str | Any] | tuple[None, None]:
    ...

# The trailing comment is the return type pyright inferred; it is not a
# declared annotation.
def fetch_image_file_size(img, base_url): # -> str | None:
    ...

class ContentScrapingStrategy(ABC):
    """
    Abstract interface for content scraping strategies.

    Subclasses must implement both the synchronous `scrap` and the
    asynchronous `ascrap` entry points; each takes the page URL and its HTML
    and returns a ScrapingResult.
    """
    @abstractmethod
    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        ...

    @abstractmethod
    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        ...


class LXMLWebScrapingStrategy(ContentScrapingStrategy):
    """
    LXML-based implementation for fast web content scraping.

    This is the primary scraping strategy in Crawl4AI, providing high-performance
    HTML parsing and content extraction using the lxml library.

    Note: WebScrapingStrategy is now an alias for this class to maintain
    backward compatibility.
    """
    def __init__(self, logger=...) -> None:
        ...

    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """
        Main entry point for content scraping.

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            ScrapingResult: A structured result containing the scraped content.
        """
        ...

    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """
        Main entry point for asynchronous content scraping.

        Args:
            url (str): The URL of the page to scrape.
            html (str): The HTML content of the page.
            **kwargs: Additional keyword arguments.

        Returns:
            ScrapingResult: A structured result containing the scraped content.
        """
        ...

    def process_element(self, url, element: lhtml.HtmlElement, **kwargs) -> Dict[str, Any]:
        """
        Process an HTML element.

        How it works:
        1. Check if the element is an image, video, or audio.
        2. Extract the element's attributes and content.
        3. Process the element based on its type.
        4. Return the processed element information.

        Args:
            url (str): The URL of the page containing the element.
            element (lhtml.HtmlElement): The HTML element to process.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: A dictionary containing the processed element information.
        """
        ...

    # NOTE(review): undocumented upstream; judging by the name and the
    # Optional[str] return, it yields text from an ancestor or None -- confirm
    # against the implementation.
    def find_closest_parent_with_useful_text(self, element: lhtml.HtmlElement, **kwargs) -> Optional[str]:
        ...

    def flatten_nested_elements(self, element: lhtml.HtmlElement) -> lhtml.HtmlElement:
        """Flatten nested elements of the same type in LXML tree"""
        ...

    # NOTE(review): undocumented upstream; returns a list of dicts or None for
    # a single <img> element -- the dict schema is not visible from this stub.
    def process_image(self, img: lhtml.HtmlElement, url: str, index: int, total_images: int, **kwargs) -> Optional[List[Dict]]:
        ...

    def remove_empty_elements_fast(self, root, word_count_threshold=...):
        """
        Remove elements that fall below the desired word threshold in a single pass from the bottom up.
        Skips non-element nodes like HtmlComment and bypasses certain tags that are allowed to have no content.
        """
        ...

    def remove_unwanted_attributes_fast(self, root: lhtml.HtmlElement, important_attrs=..., keep_data_attributes=...) -> lhtml.HtmlElement:
        """
        Removes all attributes from each element (including root) except those in `important_attrs`.
        If `keep_data_attributes=True`, also retain any attribute starting with 'data-'.

        Returns the same root element, mutated in-place, for fluent usage.
        """
        ...


# Backward-compatible alias for the LXML-based strategy.
WebScrapingStrategy = LXMLWebScrapingStrategy

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/osins/dev-tool-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

content_scraping_strategy.pyi•3.94 KiB