"""MCP server for context-efficient web scraping functionality."""
from __future__ import annotations
import json
import os
from mcp.server.fastmcp import FastMCP
from mcp.server.transport_security import TransportSecuritySettings
from scraper_mcp.admin.router import (
api_cache_clear,
api_config_get,
api_config_update,
api_request_details,
api_stats,
health_check,
)
from scraper_mcp.dashboard.router import dashboard as dashboard_view
from scraper_mcp.services.perplexity_service import PerplexityService
from scraper_mcp.tools.router import (
register_cache_tools,
register_perplexity_tools,
register_scraping_tools,
)
# Configuration: Enable cache management tools via environment variable
# Set ENABLE_CACHE_TOOLS=true to expose cache_stats, cache_clear_expired, and cache_clear_all
ENABLE_CACHE_TOOLS = os.getenv("ENABLE_CACHE_TOOLS", "false").lower() in ("true", "1", "yes")
# Configuration: Disable resources/prompts via environment variables
# Set DISABLE_RESOURCES=true or DISABLE_PROMPTS=true to reduce context overhead
DISABLE_RESOURCES_ENV = os.getenv("DISABLE_RESOURCES", "false").lower() in ("true", "1", "yes")
DISABLE_PROMPTS_ENV = os.getenv("DISABLE_PROMPTS", "false").lower() in ("true", "1", "yes")
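# Example (illustrative): a low-context deployment that also exposes cache tools could set,
# in the container or shell environment:
#   ENABLE_CACHE_TOOLS=true DISABLE_RESOURCES=true DISABLE_PROMPTS=true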
# Default allowed hosts/origins for Docker environments
# These are added automatically so Docker users don't need extra configuration
DEFAULT_ALLOWED_HOSTS = [
"localhost",
"localhost:8000",
"127.0.0.1",
"127.0.0.1:8000",
"host.docker.internal", # Docker Desktop (Mac/Windows) host access
"host.docker.internal:8000", # Host header includes port
]
DEFAULT_ALLOWED_ORIGINS = [
"http://localhost:8000",
"http://127.0.0.1:8000",
"http://host.docker.internal:8000", # Docker Desktop (Mac/Windows) host access
]


def _get_transport_security_settings() -> TransportSecuritySettings:
"""Read transport security settings from environment variables.
MCP 1.24+ enables DNS rebinding protection by default. This function reads
FASTMCP_TRANSPORT_SECURITY__ALLOWED_HOSTS and FASTMCP_TRANSPORT_SECURITY__ALLOWED_ORIGINS
environment variables to configure allowed hosts and origins.
Default hosts include localhost, 127.0.0.1, and host.docker.internal for Docker support.
Returns:
TransportSecuritySettings with configured or default allowed hosts/origins
"""
allowed_hosts_env = os.getenv("FASTMCP_TRANSPORT_SECURITY__ALLOWED_HOSTS")
allowed_origins_env = os.getenv("FASTMCP_TRANSPORT_SECURITY__ALLOWED_ORIGINS")
# Start with defaults for Docker support
allowed_hosts = DEFAULT_ALLOWED_HOSTS.copy()
allowed_origins = DEFAULT_ALLOWED_ORIGINS.copy()
# Add custom hosts from environment variable
if allowed_hosts_env:
try:
custom_hosts = json.loads(allowed_hosts_env)
except json.JSONDecodeError:
# Fallback: treat as comma-separated string
custom_hosts = [h.strip() for h in allowed_hosts_env.split(",") if h.strip()]
# Merge with defaults (avoid duplicates)
allowed_hosts = list(set(allowed_hosts + custom_hosts))
# Add custom origins from environment variable
if allowed_origins_env:
try:
custom_origins = json.loads(allowed_origins_env)
except json.JSONDecodeError:
# Fallback: treat as comma-separated string
custom_origins = [o.strip() for o in allowed_origins_env.split(",") if o.strip()]
# Merge with defaults (avoid duplicates)
allowed_origins = list(set(allowed_origins + custom_origins))
return TransportSecuritySettings(
enable_dns_rebinding_protection=True,
allowed_hosts=allowed_hosts,
allowed_origins=allowed_origins,
)
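
# Example values accepted above (illustrative host names; both formats are parsed):
#   FASTMCP_TRANSPORT_SECURITY__ALLOWED_HOSTS='["scraper.internal", "scraper.internal:8000"]'
#   FASTMCP_TRANSPORT_SECURITY__ALLOWED_ORIGINS=https://scraper.internal:8000,https://admin.internal
# JSON arrays and comma-separated strings are both accepted and merged with the Docker defaults.
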
# Create MCP server with stateless mode enabled
# Stateless mode auto-creates sessions for unknown session IDs, making the server
# resilient to restarts and eliminating "No valid session ID" errors
mcp = FastMCP(
"Scraper MCP",
instructions=(
"A web scraping MCP server that provides efficient webpage scraping tools. "
"Supports scraping HTML content, converting to markdown, extracting text, "
"and extracting links from webpages. Also includes resources for accessing "
"cache, configuration, and server information, plus prompts for common "
"analysis workflows."
),
stateless_http=True, # Accept requests without requiring initialize handshake
transport_security=_get_transport_security_settings(), # DNS rebinding protection config
)
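# With the streamable HTTP transport, MCP clients connect to this server over HTTP
# (the MCP Python SDK mounts the endpoint at "/mcp" by default) on the host/port
# configured in run_server() below.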
# Register MCP tools
register_scraping_tools(mcp)
# Register optional cache management tools
if ENABLE_CACHE_TOOLS:
register_cache_tools(mcp)
# Register Perplexity AI tools (if API key is configured)
if PerplexityService.is_available():
register_perplexity_tools(mcp)
# Register admin API routes
mcp.custom_route("/healthz", methods=["GET"])(health_check)
mcp.custom_route("/api/stats", methods=["GET"])(api_stats)
mcp.custom_route("/api/cache/clear", methods=["POST"])(api_cache_clear)
mcp.custom_route("/api/config", methods=["GET"])(api_config_get)
mcp.custom_route("/api/config", methods=["POST"])(api_config_update)
mcp.custom_route("/api/requests/{request_id}/details", methods=["GET"])(api_request_details)
# Register dashboard route
mcp.custom_route("/", methods=["GET"])(dashboard_view)


def run_server(
transport: str = "streamable-http",
host: str = "0.0.0.0",
port: int = 8000,
enable_resources: bool = True,
enable_prompts: bool = True,
) -> None:
"""Run the MCP server.
Args:
transport: Transport type ('streamable-http' or 'sse')
host: Host to bind to (default: 0.0.0.0)
port: Port to bind to (default: 8000)
enable_resources: Enable MCP resources (default: True)
enable_prompts: Enable MCP prompts (default: True)
"""
    # Register resources unless disabled by either the CLI flag or the DISABLE_RESOURCES env var
if enable_resources and not DISABLE_RESOURCES_ENV:
from scraper_mcp.resources import register_resources
register_resources(mcp)
    # Register prompts unless disabled by either the CLI flag or the DISABLE_PROMPTS env var
if enable_prompts and not DISABLE_PROMPTS_ENV:
from scraper_mcp.prompts import register_prompts
register_prompts(mcp)
# Configure host and port via settings
mcp.settings.host = host
mcp.settings.port = port
# Run server with specified transport
mcp.run(transport=transport)
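

# Example (illustrative): run with the SSE transport on a non-default port, without prompts:
#   run_server(transport="sse", port=9100, enable_prompts=False)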
if __name__ == "__main__":
run_server()