Scraper MCP

Overview Schema Related Servers Score Discussions

router.py•7.34 KiB

"""MCP tool definitions for web scraping."""

from __future__ import annotations

from mcp.server.fastmcp import FastMCP

from scraper_mcp.admin.service import DEFAULT_CONCURRENCY
from scraper_mcp.cache import clear_all_cache, clear_expired_cache, get_cache_stats
from scraper_mcp.models.links import BatchLinksResponse
from scraper_mcp.models.scrape import BatchScrapeResponse
from scraper_mcp.tools.service import (
    batch_extract_links,
    batch_scrape_urls,
    batch_scrape_urls_markdown,
    batch_scrape_urls_text,
)


async def scrape_url(
    urls: list[str],
    timeout: int = 30,
    max_retries: int = 3,
    css_selector: str | None = None,
    include_headers: bool = False,
) -> BatchScrapeResponse:
    """Scrape raw HTML content from one or more URLs.

    Args:
        urls: List of URLs to scrape (must be http:// or https://)
        timeout: Request timeout in seconds (default: 30)
        max_retries: Maximum number of retry attempts on failure (default: 3)
        css_selector: Optional CSS selector to filter HTML elements
            (e.g., "meta", "img, video", ".article-content")
        include_headers: Include HTTP response headers in metadata (default: False)

    Returns:
        BatchScrapeResponse with results for all URLs
    """
    # Concurrency is not caller-configurable; the shared default is always used.
    return await batch_scrape_urls(urls, timeout, max_retries, DEFAULT_CONCURRENCY, css_selector, include_headers)


async def scrape_url_markdown(
    urls: list[str],
    timeout: int = 30,
    max_retries: int = 3,
    strip_tags: list[str] | None = None,
    css_selector: str | None = None,
    include_headers: bool = False,
) -> BatchScrapeResponse:
    """Scrape one or more URLs and convert the content to markdown format.

    Args:
        urls: List of URLs to scrape (must be http:// or https://)
        timeout: Request timeout in seconds (default: 30)
        max_retries: Maximum number of retry attempts on failure (default: 3)
        strip_tags: List of HTML tags to strip (e.g., ['script', 'style'])
        css_selector: Optional CSS selector to filter HTML elements before conversion
            (e.g., ".article-content", "article p")
        include_headers: Include HTTP response headers in metadata (default: False)

    Returns:
        BatchScrapeResponse with markdown results for all URLs
    """
    return await batch_scrape_urls_markdown(
        urls, timeout, max_retries, strip_tags, DEFAULT_CONCURRENCY, css_selector, include_headers
    )


async def scrape_url_text(
    urls: list[str],
    timeout: int = 30,
    max_retries: int = 3,
    strip_tags: list[str] | None = None,
    css_selector: str | None = None,
    include_headers: bool = False,
) -> BatchScrapeResponse:
    """Scrape one or more URLs and extract plain text content.

    Args:
        urls: List of URLs to scrape (must be http:// or https://)
        timeout: Request timeout in seconds (default: 30)
        max_retries: Maximum number of retry attempts on failure (default: 3)
        strip_tags: List of HTML tags to strip (default: script, style, meta, link, noscript)
        css_selector: Optional CSS selector to filter HTML elements before text extraction
            (e.g., "#main-content", "article.post")
        include_headers: Include HTTP response headers in metadata (default: False)

    Returns:
        BatchScrapeResponse with text results for all URLs
    """
    return await batch_scrape_urls_text(
        urls, timeout, max_retries, strip_tags, DEFAULT_CONCURRENCY, css_selector, include_headers
    )


async def scrape_extract_links(
    urls: list[str],
    timeout: int = 30,
    max_retries: int = 3,
    css_selector: str | None = None,
    include_headers: bool = False,
) -> BatchLinksResponse:
    """Scrape one or more URLs and extract all links.

    Args:
        urls: List of URLs to scrape (must be http:// or https://)
        timeout: Request timeout in seconds (default: 30)
        max_retries: Maximum number of retry attempts on failure (default: 3)
        css_selector: Optional CSS selector to scope link extraction to specific sections
            (e.g., "nav", "article.main-content")
        include_headers: Include HTTP response headers in metadata
            (default: False, not used for links)

    Returns:
        BatchLinksResponse with link extraction results for all URLs
    """
    return await batch_extract_links(urls, timeout, max_retries, DEFAULT_CONCURRENCY, css_selector, include_headers)


async def cache_stats() -> dict[str, int | float]:
    """Get HTTP cache statistics.

    Returns:
        Dictionary with cache statistics including size, number of entries, and location
    """
    # NOTE(review): the docstring mentions "location", which is presumably a
    # string; confirm the annotation against get_cache_stats().
    return get_cache_stats()


async def cache_clear_expired() -> dict[str, str | int]:
    """Clear expired entries from HTTP cache.

    Returns:
        Dictionary with the operation status and the number of expired entries removed
    """
    removed = clear_expired_cache()
    # The payload mixes a string status with the removal count, so the return
    # annotation must admit both value types.
    return {
        "status": "success",
        "expired_entries_removed": removed,
    }


async def cache_clear_all() -> dict[str, str]:
    """Clear all entries from HTTP cache.

    WARNING: This will remove all cached responses.

    Returns:
        Dictionary with operation status
    """
    clear_all_cache()
    return {
        "status": "success",
        "message": "All cache entries cleared",
    }


def register_scraping_tools(mcp: FastMCP) -> None:
    """Register core scraping tools on the MCP server.

    Tool Registration Pattern:
    -------------------------
    This function uses FastMCP's decorator pattern to register async functions
    as MCP tools. The pattern `mcp.tool()(function)` is equivalent to:

        @mcp.tool()
        async def function(...):
            ...

    By using the functional approach, we can:
    1. Define tools in this module with proper type hints
    2. Keep business logic separate in service.py
    3. Register all tools in one place for clarity
    4. Make tools importable for testing

    The MCP framework automatically:
    - Extracts function signatures for tool schemas
    - Generates OpenAPI-compatible documentation from docstrings
    - Handles JSON serialization of Pydantic models
    - Routes incoming tool calls to the registered functions

    Args:
        mcp: FastMCP server instance to register tools on

    Example:
        >>> from mcp.server.fastmcp import FastMCP
        >>> mcp = FastMCP("Scraper")
        >>> register_scraping_tools(mcp)
        >>> # Tools are now available via MCP protocol
    """
    # Register core scraping tools
    # Each tool is exposed via MCP and documented in the API schema
    mcp.tool()(scrape_url)
    mcp.tool()(scrape_url_markdown)
    mcp.tool()(scrape_url_text)
    mcp.tool()(scrape_extract_links)


def register_cache_tools(mcp: FastMCP) -> None:
    """Register optional cache management tools on the MCP server.

    These tools are only registered when ENABLE_CACHE_TOOLS=true in the environment.
    They provide administrative access to cache operations and should be used
    carefully in production environments.

    Args:
        mcp: FastMCP server instance to register tools on

    Note:
        Cache tools are disabled by default for security. Enable them only in
        development environments or when explicit cache management is needed.
    """
    # Register optional cache management tools
    # These provide direct cache access and statistics
    mcp.tool()(cache_stats)
    mcp.tool()(cache_clear_expired)
    mcp.tool()(cache_clear_all)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cotdp/scraper-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

router.py•7.34 KiB