"""
This type stub file was generated by pyright.
"""
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Final, List, Optional, Union
from playwright.async_api import Page
from .models import AsyncCrawlResponse
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
from .async_logger import AsyncLogger
from .browser_adapter import BrowserAdapter
class AsyncCrawlerStrategy(ABC):
    """
    Abstract base class for crawler strategies.
    Subclasses must implement the crawl method.
    """
    @abstractmethod
    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """
        Crawl the given URL and return the crawl result.

        Args:
            url (str): The URL (or prefixed resource identifier) to crawl.
            **kwargs: Strategy-specific options; see concrete implementations.

        Returns:
            AsyncCrawlResponse: The result of the crawl.
        """
        ...
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
    """
    Crawler strategy using Playwright.

    Attributes:
        browser_config (BrowserConfig): Configuration object containing browser settings.
        logger (AsyncLogger): Logger instance for recording events and errors.
        _downloaded_files (List[str]): List of downloaded file paths.
        hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
        browser_manager (BrowserManager): Manager for browser creation and management.

    Methods:
        __init__(self, browser_config=None, logger=None, **kwargs):
            Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
        __aenter__(self):
            Start the browser and initialize the browser manager.
        __aexit__(self, exc_type, exc_val, exc_tb):
            Close the browser and clean up resources.
        start(self):
            Start the browser and initialize the browser manager.
        close(self):
            Close the browser and clean up resources.
        kill_session(self, session_id):
            Kill a browser session and clean up resources.
        crawl(self, url, **kwargs):
            Run the crawler for a single URL.
    """
    def __init__(self, browser_config: BrowserConfig = ..., logger: AsyncLogger = ..., browser_adapter: BrowserAdapter = ..., **kwargs) -> None:
        """
        Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.

        Args:
            browser_config (BrowserConfig): Configuration object containing browser settings.
                If None, will be created from kwargs for backwards compatibility.
            logger: Logger instance for recording events and errors.
            browser_adapter (BrowserAdapter): Browser adapter for handling browser-specific operations.
                If None, defaults to PlaywrightAdapter.
            **kwargs: Additional arguments for backwards compatibility and extending functionality.
        """
        ...
    async def __aenter__(self): # -> Self:
        """Async context entry: start the browser and return this strategy instance."""
        ...
    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context exit: close the browser and clean up resources."""
        ...
    async def start(self) -> None:
        """
        Start the browser and initialize the browser manager.
        """
        ...
    async def close(self) -> None:
        """
        Close the browser and clean up resources.
        """
        ...
    async def kill_session(self, session_id: str) -> None:
        """
        Kill a browser session and clean up resources.

        Args:
            session_id (str): The ID of the session to kill.

        Returns:
            None
        """
        ...
    def set_hook(self, hook_type: str, hook: Callable) -> None:
        """
        Set a hook function for a specific hook type. Following is the list of hook types:
        - on_browser_created: Called when a new browser instance is created.
        - on_page_context_created: Called when a new page context is created.
        - on_user_agent_updated: Called when the user agent is updated.
        - on_execution_started: Called when the execution starts.
        - before_goto: Called before a goto operation.
        - after_goto: Called after a goto operation.
        - before_return_html: Called before returning HTML content.
        - before_retrieve_html: Called before retrieving HTML content.

        All hooks except on_browser_created accept a context and a page as arguments plus **kwargs.
        However, on_browser_created accepts a browser and a context as arguments plus **kwargs.

        Args:
            hook_type (str): The type of the hook.
            hook (Callable): The hook function to set.

        Returns:
            None
        """
        ...
    async def execute_hook(self, hook_type: str, *args, **kwargs): # -> None:
        """
        Execute a hook function for a specific hook type.

        Args:
            hook_type (str): The type of the hook.
            *args: Variable length positional arguments.
            **kwargs: Keyword arguments.

        Returns:
            The return value of the hook function, if any.
            NOTE(review): pyright inferred the return as None (see comment above), which
            contradicts this description — confirm against the implementation.
        """
        ...
    def update_user_agent(self, user_agent: str) -> None:
        """
        Update the user agent for the browser.

        Args:
            user_agent (str): The new user agent string.

        Returns:
            None
        """
        ...
    def set_custom_headers(self, headers: Dict[str, str]) -> None:
        """
        Set custom headers for the browser.

        Args:
            headers (Dict[str, str]): A dictionary of headers to set.

        Returns:
            None
        """
        ...
    async def smart_wait(self, page: Page, wait_for: str, timeout: float = ...): # -> Any | Literal[False] | None:
        """
        Wait for a condition in a smart way. This function works as below:
        1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true.
        2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present.
        3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true.
        4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present.

        This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl().

        Args:
            page: Playwright page object
            wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.
            timeout (float): Maximum time to wait in milliseconds

        Returns:
            The result of the wait operation, False, or None per the inferred return
            type above — confirm exact semantics against the implementation.
        """
        ...
    async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = ...): # -> Any | Literal[False]:
        """
        Wait for a condition in a CSP-compliant way.

        Args:
            page: Playwright page object
            user_wait_function: JavaScript function as string that returns boolean
            timeout: Maximum time to wait in milliseconds

        Returns:
            bool: True if condition was met, False if timed out

        Raises:
            RuntimeError: If there's an error evaluating the condition
        """
        ...
    async def process_iframes(self, page: Page):
        """
        Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content.

        Args:
            page: Playwright page object

        Returns:
            Playwright page object
        """
        ...
    async def create_session(self, **kwargs) -> str:
        """
        Creates a new browser session and returns its ID. A browser session is a unique
        opened page that can be reused for multiple crawls.
        This function is asynchronous and returns a string representing the session ID.

        Args:
            **kwargs: Optional keyword arguments to configure the session.

        Returns:
            str: The session ID.
        """
        ...
    async def crawl(self, url: str, config: CrawlerRunConfig, **kwargs) -> AsyncCrawlResponse:
        """
        Crawls a given URL or processes raw HTML/local file content based on the URL prefix.

        Args:
            url (str): The URL to crawl. Supported prefixes:
                - 'http://' or 'https://': Web URL to crawl.
                - 'file://': Local file path to process.
                - 'raw://': Raw HTML content to process.
            config (CrawlerRunConfig): Per-run configuration controlling the crawl.
            **kwargs: Additional parameters:
                - 'screenshot' (bool): Whether to take a screenshot.
                - ... [other existing parameters]

        Returns:
            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
        """
        ...
    async def remove_overlay_elements(self, page: Page) -> None:
        """
        Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.

        Args:
            page (Page): The Playwright page instance
        """
        ...
    async def export_pdf(self, page: Page) -> bytes:
        """
        Exports the current page as a PDF.

        Args:
            page (Page): The Playwright page object

        Returns:
            bytes: The PDF data
        """
        ...
    async def capture_mhtml(self, page: Page) -> Optional[str]:
        """
        Captures the current page as MHTML using CDP.

        MHTML (MIME HTML) is a web page archive format that combines the HTML content
        with its resources (images, CSS, etc.) into a single MIME-encoded file.

        Args:
            page (Page): The Playwright page object

        Returns:
            Optional[str]: The MHTML content as a string, or None if there was an error
        """
        ...
    async def take_screenshot(self, page: Page, **kwargs) -> str:
        """
        Take a screenshot of the current page.

        Args:
            page (Page): The Playwright page object
            kwargs: Additional keyword arguments

        Returns:
            str: The base64-encoded screenshot data
        """
        ...
    async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
        """
        Convert the first page of the PDF to a screenshot.
        Requires pdf2image and poppler.

        Args:
            pdf_data (bytes): The PDF data

        Returns:
            str: The base64-encoded screenshot data
        """
        ...
    async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
        """
        Attempt to set a large viewport and take a full-page screenshot.
        If still too large, segment the page as before.
        Requires pdf2image and poppler.

        Args:
            page (Page): The Playwright page object
            kwargs: Additional keyword arguments

        Returns:
            str: The base64-encoded screenshot data
        """
        ...
    async def take_screenshot_naive(self, page: Page) -> str:
        """
        Takes a screenshot of the current page.

        Args:
            page (Page): The Playwright page instance

        Returns:
            str: Base64-encoded screenshot image
        """
        ...
    async def export_storage_state(self, path: str = ...) -> dict:
        """
        Exports the current storage state (cookies, localStorage, sessionStorage)
        to a JSON file at the specified path.

        Args:
            path (str): The path to save the storage state JSON file

        Returns:
            dict: The exported storage state
        """
        ...
    async def robust_execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
        """
        Executes user-provided JavaScript code with proper error handling and context,
        supporting both synchronous and async user code, plus navigations.

        How it works:
        1. Wait for load state 'domcontentloaded'
        2. If js_code is a string, execute it directly
        3. If js_code is a list, execute each element in sequence
        4. Wait for load state 'networkidle'
        5. Return results

        Args:
            page (Page): The Playwright page instance
            js_code (Union[str, List[str]]): The JavaScript code to execute

        Returns:
            Dict[str, Any]: The results of the execution
        """
        ...
    async def execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
        """
        Executes user-provided JavaScript code with proper error handling and context.

        Args:
            page: Playwright page object
            js_code: Single JavaScript string or list of JavaScript code strings

        Returns:
            Dict containing execution status and results/errors
        """
        ...
    async def check_visibility(self, page: Page): # -> Any:
        """
        Checks if an element is visible on the page.

        Args:
            page: Playwright page object

        Returns:
            Boolean indicating visibility
        """
        ...
    async def safe_scroll(self, page: Page, x: int, y: int, delay: float = ...): # -> Dict[str, Any]:
        """
        Safely scroll the page with rendering time.

        Args:
            page: Playwright page object
            x: Horizontal scroll position
            y: Vertical scroll position
            delay: Time to allow for rendering after the scroll, per the parameter name
                — confirm units against the implementation.

        Returns:
            Dict containing scroll status information (see inferred return type above).
        """
        ...
    async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]:
        """
        Performs a CSP-compliant scroll operation and returns the result status.

        Args:
            page: Playwright page object
            x: Horizontal scroll position
            y: Vertical scroll position

        Returns:
            Dict containing scroll status and position information
        """
        ...
    async def get_page_dimensions(self, page: Page): # -> Any:
        """
        Get the dimensions of the page.

        Args:
            page: Playwright page object

        Returns:
            Dict containing width and height of the page
        """
        ...
    async def page_need_scroll(self, page: Page) -> bool:
        """
        Determine whether the page needs to scroll.

        Args:
            page: Playwright page object

        Returns:
            bool: True if page needs scrolling
        """
        ...
class HTTPCrawlerError(Exception):
    """Base error class for HTTP crawler specific exceptions.

    Catching this type also catches ConnectionTimeoutError and HTTPStatusError.
    """
    ...
class ConnectionTimeoutError(HTTPCrawlerError):
    """Raised when a connection timeout occurs during an HTTP crawl."""
    ...
class HTTPStatusError(HTTPCrawlerError):
    """Raised for unexpected status codes"""
    def __init__(self, status_code: int, message: str) -> None:
        """
        Initialize the error with the offending status code and a description.

        Args:
            status_code (int): The unexpected HTTP status code.
            message (str): Human-readable description of the failure.
        """
        ...
class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
    """
    Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency.
    """
    # Declared instance attribute slots (concrete tuple elided in this stub).
    __slots__ = ...
    # Default network tuning constants; concrete values are elided in this stub.
    DEFAULT_TIMEOUT: Final[int] = ...
    DEFAULT_CHUNK_SIZE: Final[int] = ...
    DEFAULT_MAX_CONNECTIONS: Final[int] = ...
    DEFAULT_DNS_CACHE_TTL: Final[int] = ...
    # Accepted URL schemes (concrete value elided in this stub).
    VALID_SCHEMES: Final = ...
    # Base HTTP headers sent with every request (concrete value elided in this stub).
    _BASE_HEADERS: Final = ...
    def __init__(self, browser_config: Optional[HTTPCrawlerConfig] = ..., logger: Optional[AsyncLogger] = ..., max_connections: int = ..., dns_cache_ttl: int = ..., chunk_size: int = ...) -> None:
        """Initialize the HTTP crawler with config.

        Args:
            browser_config (Optional[HTTPCrawlerConfig]): HTTP crawler configuration.
            logger (Optional[AsyncLogger]): Logger for events and errors.
            max_connections (int): Maximum number of concurrent connections.
            dns_cache_ttl (int): DNS cache time-to-live.
            chunk_size (int): Chunk size used when reading responses.
        """
        ...
    async def __aenter__(self) -> AsyncHTTPCrawlerStrategy:
        """Async context entry: start the strategy and return this instance."""
        ...
    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context exit: close the strategy and release resources."""
        ...
    def set_hook(self, hook_type: str, hook_func: Callable) -> None:
        """
        Register a hook function for the given hook type.

        Args:
            hook_type (str): The type of the hook.
            hook_func (Callable): The hook function to set.
        """
        ...
    async def start(self) -> None:
        """Start the strategy and initialize any underlying resources."""
        ...
    async def close(self) -> None:
        """Close the strategy and clean up any underlying resources."""
        ...
    async def crawl(self, url: str, config: Optional[CrawlerRunConfig] = ..., **kwargs) -> AsyncCrawlResponse:
        """
        Crawl the given URL over plain HTTP.

        Args:
            url (str): The URL to crawl.
            config (Optional[CrawlerRunConfig]): Per-run configuration.
            **kwargs: Additional parameters for backwards compatibility.

        Returns:
            AsyncCrawlResponse: The response for the crawled URL.

        Raises:
            HTTPCrawlerError: Base class for crawler failures, including
                ConnectionTimeoutError and HTTPStatusError (see classes above).
        """
        ...