
Scrapy MCP Server

by ThreeFish-AI
config.py (4.53 kB)
"""Configuration settings for the Data Extractor MCP Server.""" from typing import Dict, Any, Optional, Union from pydantic import Field, field_validator from pydantic_settings import BaseSettings # 动态从 pyproject.toml 读取版本号 def _get_dynamic_version(): """从 pyproject.toml 动态读取版本号""" from pathlib import Path try: current_file = Path(__file__).resolve() project_root = ( current_file.parent.parent ) # config.py -> extractor -> project_root pyproject_path = project_root / "pyproject.toml" if pyproject_path.exists(): with open(pyproject_path, "r", encoding="utf-8") as f: content = f.read() for line in content.splitlines(): line = line.strip() if line.startswith('version = "') and line.endswith('"'): return line.split('"')[1] except Exception: pass return "0.0.0" class DataExtractorSettings(BaseSettings): """Settings for the Data Extractor MCP Server.""" # Server settings server_name: str = Field(default="data-extractor") server_version: str = Field(default_factory=_get_dynamic_version) # Data Extractor settings concurrent_requests: int = Field(default=16, gt=0) download_delay: float = Field(default=1.0, ge=0.0) randomize_download_delay: bool = Field(default=True) autothrottle_enabled: bool = Field(default=True) autothrottle_start_delay: float = Field(default=1.0, ge=0.0) autothrottle_max_delay: float = Field(default=60.0, ge=0.0) autothrottle_target_concurrency: float = Field(default=1.0, ge=0.0) # Rate limiting settings rate_limit_requests_per_minute: int = Field(default=60, ge=1) # Retry settings max_retries: int = Field(default=3, ge=0) retry_delay: float = Field(default=1.0, ge=0.0) # Cache settings enable_caching: bool = Field(default=True) cache_ttl_hours: int = Field(default=24, gt=0) cache_max_size: Optional[int] = Field(default=None) # Logging settings log_level: str = Field(default="INFO") log_requests: Optional[bool] = Field(default=None) log_responses: Optional[bool] = Field(default=None) # Browser settings enable_javascript: bool = Field(default=False) browser_headless: bool = Field(default=True) browser_timeout: int = Field(default=30, ge=0) browser_window_size: Union[str, tuple] = Field(default="1920x1080") # User agent settings use_random_user_agent: bool = Field(default=True) default_user_agent: str = Field( default="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) # Proxy settings use_proxy: bool = Field(default=False) proxy_url: Optional[str] = Field(default=None) # Request settings request_timeout: float = Field(default=30.0, gt=0.0) model_config = { "env_file": ".env", "env_file_encoding": "utf-8", "extra": "ignore", # Allow extra environment variables "env_prefix": "DATA_EXTRACTOR_", # Automatically map env vars with this prefix "env_ignore_empty": True, # Ignore empty environment variables "frozen": True, # Make instances immutable } @field_validator("log_level") @classmethod def validate_log_level(cls, v): """Validate log level is one of the standard logging levels.""" valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] if v.upper() not in valid_levels: raise ValueError(f"log_level must be one of: {valid_levels}") return v.upper() def get_scrapy_settings(self) -> Dict[str, Any]: """Get Scrapy-specific settings as a dictionary.""" return { "CONCURRENT_REQUESTS": self.concurrent_requests, "DOWNLOAD_DELAY": self.download_delay, "RANDOMIZE_DOWNLOAD_DELAY": self.randomize_download_delay, "AUTOTHROTTLE_ENABLED": self.autothrottle_enabled, "AUTOTHROTTLE_START_DELAY": self.autothrottle_start_delay, 
"AUTOTHROTTLE_MAX_DELAY": self.autothrottle_max_delay, "AUTOTHROTTLE_TARGET_CONCURRENCY": self.autothrottle_target_concurrency, "RETRY_TIMES": self.max_retries, "DOWNLOAD_TIMEOUT": self.request_timeout, "USER_AGENT": self.default_user_agent, } # 创建全局设置实例 settings = DataExtractorSettings()

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp'
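
The same request, sketched in Python. Only the URL is taken from this page; the requests library and the assumption of a JSON response body are not documented here:

# Hypothetical Python equivalent of the curl command above.
import requests

resp = requests.get("https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp")
resp.raise_for_status()
print(resp.json())  # assumes the endpoint returns JSON metadata for this server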

If you have feedback or need assistance with the MCP directory API, please join our Discord server.