"""
This type stub file was generated by pyright.
"""
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Final, List, Optional, Union
from playwright.async_api import Page
from .models import AsyncCrawlResponse
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
from .async_logger import AsyncLogger
from .browser_adapter import BrowserAdapter
class AsyncCrawlerStrategy(ABC):
    """
    Abstract base class for crawler strategies.
    Subclasses must implement the crawl method.
    """
    @abstractmethod
    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """
        Crawl the given URL and return the crawl result.

        Args:
            url (str): The URL (or prefixed resource identifier) to crawl.
            **kwargs: Strategy-specific options; see concrete implementations.

        Returns:
            AsyncCrawlResponse: The result of the crawl.
        """
        ...
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
    """
    Crawler strategy using Playwright.

    Attributes:
        browser_config (BrowserConfig): Configuration object containing browser settings.
        logger (AsyncLogger): Logger instance for recording events and errors.
        _downloaded_files (List[str]): List of downloaded file paths.
        hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior.
        browser_manager (BrowserManager): Manager for browser creation and management.

    Methods:
        __init__(self, browser_config=None, logger=None, **kwargs):
            Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
        __aenter__(self):
            Start the browser and initialize the browser manager.
        __aexit__(self, exc_type, exc_val, exc_tb):
            Close the browser and clean up resources.
        start(self):
            Start the browser and initialize the browser manager.
        close(self):
            Close the browser and clean up resources.
        kill_session(self, session_id):
            Kill a browser session and clean up resources.
        crawl(self, url, **kwargs):
            Run the crawler for a single URL.
    """
    def __init__(self, browser_config: BrowserConfig = ..., logger: AsyncLogger = ..., browser_adapter: BrowserAdapter = ..., **kwargs) -> None:
        """
        Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.

        Args:
            browser_config (BrowserConfig): Configuration object containing browser settings.
                If None, will be created from kwargs for backwards compatibility.
            logger: Logger instance for recording events and errors.
            browser_adapter (BrowserAdapter): Browser adapter for handling browser-specific operations.
                If None, defaults to PlaywrightAdapter.
            **kwargs: Additional arguments for backwards compatibility and extending functionality.
        """
        ...
    async def __aenter__(self): # -> Self:
        """Async context entry: start the browser and return this strategy instance."""
        ...
    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context exit: close the browser and clean up resources."""
        ...
    async def start(self) -> None:
        """
        Start the browser and initialize the browser manager.
        """
        ...
    async def close(self) -> None:
        """
        Close the browser and clean up resources.
        """
        ...
    async def kill_session(self, session_id: str) -> None:
        """
        Kill a browser session and clean up resources.

        Args:
            session_id (str): The ID of the session to kill.

        Returns:
            None
        """
        ...
    def set_hook(self, hook_type: str, hook: Callable) -> None:
        """
        Set a hook function for a specific hook type. Following is the list of hook types:
        - on_browser_created: Called when a new browser instance is created.
        - on_page_context_created: Called when a new page context is created.
        - on_user_agent_updated: Called when the user agent is updated.
        - on_execution_started: Called when the execution starts.
        - before_goto: Called before a goto operation.
        - after_goto: Called after a goto operation.
        - before_return_html: Called before returning HTML content.
        - before_retrieve_html: Called before retrieving HTML content.

        All hooks except on_browser_created accept a context and a page as arguments plus **kwargs.
        However, on_browser_created accepts a browser and a context as arguments plus **kwargs.

        Args:
            hook_type (str): The type of the hook.
            hook (Callable): The hook function to set.

        Returns:
            None
        """
        ...
    async def execute_hook(self, hook_type: str, *args, **kwargs): # -> None:
        """
        Execute a hook function for a specific hook type.

        Args:
            hook_type (str): The type of the hook.
            *args: Variable length positional arguments.
            **kwargs: Keyword arguments.

        Returns:
            The return value of the hook function, if any.
            NOTE(review): pyright inferred the return as None (see comment above), which
            contradicts this description — confirm against the implementation.
        """
        ...
    def update_user_agent(self, user_agent: str) -> None:
        """
        Update the user agent for the browser.

        Args:
            user_agent (str): The new user agent string.

        Returns:
            None
        """
        ...
    def set_custom_headers(self, headers: Dict[str, str]) -> None:
        """
        Set custom headers for the browser.

        Args:
            headers (Dict[str, str]): A dictionary of headers to set.

        Returns:
            None
        """
        ...
    async def smart_wait(self, page: Page, wait_for: str, timeout: float = ...): # -> Any | Literal[False] | None:
        """
        Wait for a condition in a smart way. This function works as below:
        1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true.
        2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present.
        3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true.
        4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present.

        This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl().

        Args:
            page: Playwright page object
            wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'.
            timeout (float): Maximum time to wait in milliseconds

        Returns:
            The result of the wait operation, False, or None per the inferred return
            type above — confirm exact semantics against the implementation.
        """
        ...
    async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = ...): # -> Any | Literal[False]:
        """
        Wait for a condition in a CSP-compliant way.

        Args:
            page: Playwright page object
            user_wait_function: JavaScript function as string that returns boolean
            timeout: Maximum time to wait in milliseconds

        Returns:
            bool: True if condition was met, False if timed out

        Raises:
            RuntimeError: If there's an error evaluating the condition
        """
        ...
    async def process_iframes(self, page: Page):
        """
        Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content.

        Args:
            page: Playwright page object

        Returns:
            Playwright page object
        """
        ...
    async def create_session(self, **kwargs) -> str:
        """
        Creates a new browser session and returns its ID. A browser session is a unique
        opened page that can be reused for multiple crawls.
        This function is asynchronous and returns a string representing the session ID.

        Args:
            **kwargs: Optional keyword arguments to configure the session.

        Returns:
            str: The session ID.
        """
        ...
    async def crawl(self, url: str, config: CrawlerRunConfig, **kwargs) -> AsyncCrawlResponse:
        """
        Crawls a given URL or processes raw HTML/local file content based on the URL prefix.

        Args:
            url (str): The URL to crawl. Supported prefixes:
                - 'http://' or 'https://': Web URL to crawl.
                - 'file://': Local file path to process.
                - 'raw://': Raw HTML content to process.
            config (CrawlerRunConfig): Per-run configuration controlling the crawl.
            **kwargs: Additional parameters:
                - 'screenshot' (bool): Whether to take a screenshot.
                - ... [other existing parameters]

        Returns:
            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
        """
        ...
    async def remove_overlay_elements(self, page: Page) -> None:
        """
        Removes popup overlays, modals, cookie notices, and other intrusive elements from the page.

        Args:
            page (Page): The Playwright page instance
        """
        ...
    async def export_pdf(self, page: Page) -> bytes:
        """
        Exports the current page as a PDF.

        Args:
            page (Page): The Playwright page object

        Returns:
            bytes: The PDF data
        """
        ...
    async def capture_mhtml(self, page: Page) -> Optional[str]:
        """
        Captures the current page as MHTML using CDP.

        MHTML (MIME HTML) is a web page archive format that combines the HTML content
        with its resources (images, CSS, etc.) into a single MIME-encoded file.

        Args:
            page (Page): The Playwright page object

        Returns:
            Optional[str]: The MHTML content as a string, or None if there was an error
        """
        ...
    async def take_screenshot(self, page: Page, **kwargs) -> str:
        """
        Take a screenshot of the current page.

        Args:
            page (Page): The Playwright page object
            kwargs: Additional keyword arguments

        Returns:
            str: The base64-encoded screenshot data
        """
        ...
    async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
        """
        Convert the first page of the PDF to a screenshot.
        Requires pdf2image and poppler.

        Args:
            pdf_data (bytes): The PDF data

        Returns:
            str: The base64-encoded screenshot data
        """
        ...
    async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
        """
        Attempt to set a large viewport and take a full-page screenshot.
        If still too large, segment the page as before.
        Requires pdf2image and poppler.

        Args:
            page (Page): The Playwright page object
            kwargs: Additional keyword arguments

        Returns:
            str: The base64-encoded screenshot data
        """
        ...
    async def take_screenshot_naive(self, page: Page) -> str:
        """
        Takes a screenshot of the current page.

        Args:
            page (Page): The Playwright page instance

        Returns:
            str: Base64-encoded screenshot image
        """
        ...
    async def export_storage_state(self, path: str = ...) -> dict:
        """
        Exports the current storage state (cookies, localStorage, sessionStorage)
        to a JSON file at the specified path.

        Args:
            path (str): The path to save the storage state JSON file

        Returns:
            dict: The exported storage state
        """
        ...
    async def robust_execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
        """
        Executes user-provided JavaScript code with proper error handling and context,
        supporting both synchronous and async user code, plus navigations.

        How it works:
        1. Wait for load state 'domcontentloaded'
        2. If js_code is a string, execute it directly
        3. If js_code is a list, execute each element in sequence
        4. Wait for load state 'networkidle'
        5. Return results

        Args:
            page (Page): The Playwright page instance
            js_code (Union[str, List[str]]): The JavaScript code to execute

        Returns:
            Dict[str, Any]: The results of the execution
        """
        ...
    async def execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]:
        """
        Executes user-provided JavaScript code with proper error handling and context.

        Args:
            page: Playwright page object
            js_code: Single JavaScript string or list of JavaScript code strings

        Returns:
            Dict containing execution status and results/errors
        """
        ...
    async def check_visibility(self, page: Page): # -> Any:
        """
        Checks if an element is visible on the page.

        Args:
            page: Playwright page object

        Returns:
            Boolean indicating visibility
        """
        ...
    async def safe_scroll(self, page: Page, x: int, y: int, delay: float = ...): # -> Dict[str, Any]:
        """
        Safely scroll the page with rendering time.

        Args:
            page: Playwright page object
            x: Horizontal scroll position
            y: Vertical scroll position
            delay: Time to allow for rendering after the scroll, per the parameter name
                — confirm units against the implementation.

        Returns:
            Dict containing scroll status information (see inferred return type above).
        """
        ...
    async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]:
        """
        Performs a CSP-compliant scroll operation and returns the result status.

        Args:
            page: Playwright page object
            x: Horizontal scroll position
            y: Vertical scroll position

        Returns:
            Dict containing scroll status and position information
        """
        ...
    async def get_page_dimensions(self, page: Page): # -> Any:
        """
        Get the dimensions of the page.

        Args:
            page: Playwright page object

        Returns:
            Dict containing width and height of the page
        """
        ...
    async def page_need_scroll(self, page: Page) -> bool:
        """
        Determine whether the page needs to scroll.

        Args:
            page: Playwright page object

        Returns:
            bool: True if page needs scrolling
        """
        ...
class HTTPCrawlerError(Exception):
    """Base error class for HTTP crawler specific exceptions.

    Catching this type also catches ConnectionTimeoutError and HTTPStatusError.
    """
    ...
class ConnectionTimeoutError(HTTPCrawlerError):
    """Raised when a connection timeout occurs during an HTTP crawl."""
    ...
class HTTPStatusError(HTTPCrawlerError):
    """Raised for unexpected status codes"""
    def __init__(self, status_code: int, message: str) -> None:
        """
        Initialize the error with the offending status code and a description.

        Args:
            status_code (int): The unexpected HTTP status code.
            message (str): Human-readable description of the failure.
        """
        ...
class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
    """
    Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency.
    """
    # Declared instance attribute slots (concrete tuple elided in this stub).
    __slots__ = ...
    # Default network tuning constants; concrete values are elided in this stub.
    DEFAULT_TIMEOUT: Final[int] = ...
    DEFAULT_CHUNK_SIZE: Final[int] = ...
    DEFAULT_MAX_CONNECTIONS: Final[int] = ...
    DEFAULT_DNS_CACHE_TTL: Final[int] = ...
    # Accepted URL schemes (concrete value elided in this stub).
    VALID_SCHEMES: Final = ...
    # Base HTTP headers sent with every request (concrete value elided in this stub).
    _BASE_HEADERS: Final = ...
    def __init__(self, browser_config: Optional[HTTPCrawlerConfig] = ..., logger: Optional[AsyncLogger] = ..., max_connections: int = ..., dns_cache_ttl: int = ..., chunk_size: int = ...) -> None:
        """Initialize the HTTP crawler with config.

        Args:
            browser_config (Optional[HTTPCrawlerConfig]): HTTP crawler configuration.
            logger (Optional[AsyncLogger]): Logger for events and errors.
            max_connections (int): Maximum number of concurrent connections.
            dns_cache_ttl (int): DNS cache time-to-live.
            chunk_size (int): Chunk size used when reading responses.
        """
        ...
    async def __aenter__(self) -> AsyncHTTPCrawlerStrategy:
        """Async context entry: start the strategy and return this instance."""
        ...
    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Async context exit: close the strategy and release resources."""
        ...
    def set_hook(self, hook_type: str, hook_func: Callable) -> None:
        """
        Register a hook function for the given hook type.

        Args:
            hook_type (str): The type of the hook.
            hook_func (Callable): The hook function to set.
        """
        ...
    async def start(self) -> None:
        """Start the strategy and initialize any underlying resources."""
        ...
    async def close(self) -> None:
        """Close the strategy and clean up any underlying resources."""
        ...
    async def crawl(self, url: str, config: Optional[CrawlerRunConfig] = ..., **kwargs) -> AsyncCrawlResponse:
        """
        Crawl the given URL over plain HTTP.

        Args:
            url (str): The URL to crawl.
            config (Optional[CrawlerRunConfig]): Per-run configuration.
            **kwargs: Additional parameters for backwards compatibility.

        Returns:
            AsyncCrawlResponse: The response for the crawled URL.

        Raises:
            HTTPCrawlerError: Base class for crawler failures, including
                ConnectionTimeoutError and HTTPStatusError (see classes above).
        """
        ...