Crawl4AI+SearXNG MCP Server

settings.py•13.8 KiB

"""Configuration settings for Crawl4AI MCP Server using Pydantic Settings.

This module provides type-safe configuration management with automatic
validation, environment variable loading, and documentation generation.
"""

import logging
from typing import Any

from pydantic import Field, ValidationInfo, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict

logger = logging.getLogger(__name__)

# Placeholder shipped as the default JWT secret. It is public knowledge, so
# get_settings() warns when OAuth2 is enabled while this value is still in use.
_DEFAULT_OAUTH2_SECRET_KEY = "change-me-in-production"


def _split_csv(raw: str) -> list[str]:
    """Split a comma-separated string into stripped, non-empty items."""
    return [item.strip() for item in raw.split(",") if item.strip()]


class Settings(BaseSettings):
    """Central configuration management with Pydantic validation.

    All settings are loaded from environment variables (and an optional
    ``.env`` file) with automatic type conversion and validation. Default
    values are provided for non-critical settings.

    Environment variable names are matched case-insensitively against the
    field name, except for fields that declare an explicit ``alias``.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",  # Ignore extra env vars
        # Also run validators on defaults; set_oauth2_issuer relies on this
        # to derive an issuer when none is configured.
        validate_default=True,
    )

    # ========================================
    # Debug Settings
    # ========================================
    debug: bool = Field(
        default=False,
        alias="MCP_DEBUG",
        description="Enable debug mode with verbose logging",
    )

    # ========================================
    # API Keys
    # ========================================
    openai_api_key: str | None = Field(
        default=None,
        description="OpenAI API key for LLM operations",
    )

    # ========================================
    # Server Settings
    # ========================================
    host: str = Field(
        default="0.0.0.0",
        description="Server host address",
    )
    port: int = Field(
        default=8051,
        ge=1024,
        le=65535,
        description="Server port number",
    )

    # ========================================
    # Database Settings
    # ========================================
    vector_database: str = Field(
        default="qdrant",
        description="Vector database type (qdrant or supabase)",
    )
    qdrant_url: str | None = Field(
        default=None,
        description="Qdrant server URL",
    )
    qdrant_api_key: str | None = Field(
        default=None,
        description="Qdrant API key for authentication",
    )

    # ========================================
    # Neo4j Settings
    # ========================================
    neo4j_uri: str | None = Field(
        default=None,
        description="Neo4j database URI",
    )
    neo4j_username: str | None = Field(
        default=None,
        description="Neo4j username",
    )
    neo4j_password: str | None = Field(
        default=None,
        description="Neo4j password",
    )
    use_knowledge_graph: bool = Field(
        default=False,
        description="Enable knowledge graph features",
    )
    neo4j_batch_size: int = Field(
        default=50,
        ge=1,
        le=1000,
        description="Batch size for Neo4j transaction processing",
    )
    neo4j_batch_timeout: int = Field(
        default=120,
        ge=10,
        le=600,
        description="Timeout in seconds for Neo4j batch operations",
    )

    # ========================================
    # SearXNG Settings
    # ========================================
    # Default port 8888 is for local SearXNG installation
    # Docker deployments override via SEARXNG_URL in docker-compose.yml
    searxng_url: str | None = Field(
        default="http://localhost:8888",
        description="SearXNG instance URL for web search",
    )
    searxng_user_agent: str = Field(
        default="MCP-Crawl4AI-RAG-Server/1.0",
        description="User agent for SearXNG requests",
    )
    searxng_timeout: int = Field(
        default=30,
        ge=5,
        le=120,
        description="Timeout in seconds for SearXNG requests",
    )
    searxng_default_engines: str = Field(
        default="",
        description="Comma-separated list of default search engines",
    )

    # ========================================
    # Feature Flags
    # ========================================
    use_reranking: bool = Field(
        default=False,
        description="Enable result reranking for improved relevance",
    )
    use_test_env: bool = Field(
        default=False,
        alias="USE_TEST_ENV",
        description="Use test environment configuration",
    )
    use_agentic_rag: bool = Field(
        default=False,
        description="Enable agentic RAG features",
    )

    # ========================================
    # Agentic Search Settings
    # ========================================
    agentic_search_enabled: bool = Field(
        default=False,
        description="Enable agentic search with iterative refinement",
    )
    agentic_search_completeness_threshold: float = Field(
        default=0.95,
        ge=0.0,
        le=1.0,
        description=(
            "Completeness threshold (0.0-1.0) for determining when answer is sufficient"
        ),
    )
    agentic_search_max_iterations: int = Field(
        default=3,
        ge=1,
        le=10,
        description="Maximum number of search iterations",
    )
    agentic_search_max_urls_per_iteration: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Maximum starting URLs to crawl per iteration",
    )
    agentic_search_max_pages_per_iteration: int = Field(
        default=15,
        ge=1,
        le=200,
        description=(
            "Maximum total pages to crawl recursively across all URLs in iteration"
        ),
    )
    agentic_search_max_crawl_depth: int = Field(
        default=2,
        ge=1,
        le=5,
        description=("Maximum crawl depth (1=only starting URLs, 2=+1 level of links)"),
    )
    agentic_search_url_score_threshold: float = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Minimum relevance score (0.0-1.0) for URLs to be crawled",
    )
    agentic_search_use_search_hints: bool = Field(
        default=False,
        description="Generate search hints from crawled content",
    )
    agentic_search_enable_url_filtering: bool = Field(
        default=True,
        description=(
            "Enable smart URL filtering to avoid GitHub commits, pagination, etc."
        ),
    )
    agentic_search_max_urls_to_rank: int = Field(
        default=20,
        ge=5,
        le=50,
        description=(
            "Maximum number of search results to rank with LLM (reduce for lower costs)"
        ),
    )
    agentic_search_llm_temperature: float = Field(
        default=0.3,
        ge=0.0,
        le=2.0,
        description="LLM temperature for agentic search evaluations",
    )
    agentic_search_max_qdrant_results: int = Field(
        default=10,
        ge=1,
        le=50,
        description="Maximum results to retrieve from Qdrant per query",
    )
    model_choice: str = Field(
        default="gpt-4o-mini",
        description="LLM model for evaluations and completeness checks",
    )

    # ========================================
    # Recency Decay Settings
    # ========================================
    recency_decay_enabled: bool = Field(
        default=True,
        description="Enable recency-based score decay for search results",
    )
    recency_decay_half_life_days: float = Field(
        default=14.0,
        ge=1.0,
        le=365.0,
        description="Half-life in days for exponential decay (after this time, score is halved)",
    )
    recency_decay_weight: float = Field(
        default=0.3,
        ge=0.0,
        le=1.0,
        description="Weight of recency in final score (0=ignore recency, 1=only recency)",
    )
    recency_decay_min_score: float = Field(
        default=0.1,
        ge=0.0,
        le=1.0,
        description="Minimum decay factor (prevents very old docs from being completely ignored)",
    )

    # ========================================
    # Crawler Settings
    # ========================================
    max_concurrent_sessions: int = Field(
        default=50,
        ge=1,
        le=500,
        description="Maximum concurrent browser sessions (global limit)",
    )

    # ========================================
    # Transport Settings
    # ========================================
    transport: str = Field(
        default="http",
        description="Transport mode (http or stdio)",
    )
    mcp_api_key: str | None = Field(
        default=None,
        description="MCP API key for authentication",
    )

    # ========================================
    # OAuth2 Settings
    # ========================================
    use_oauth2: bool = Field(
        default=False,
        description="Enable OAuth2 authentication",
    )
    oauth2_issuer: str | None = Field(
        default=None,
        description="OAuth2 issuer URL",
    )
    oauth2_secret_key: str = Field(
        default=_DEFAULT_OAUTH2_SECRET_KEY,
        description="OAuth2 JWT secret key",
    )
    oauth2_scopes: str = Field(
        default="read:data,write:data",
        description="Comma-separated list of valid OAuth2 scopes",
    )
    oauth2_required_scopes: str = Field(
        default="read:data",
        description="Comma-separated list of required OAuth2 scopes",
    )
    oauth_storage_dir: str = Field(
        default=".oauth_storage",
        description="Directory for persistent OAuth storage (clients, tokens)",
    )

    # ========================================
    # Repository Size Limits
    # ========================================
    repo_max_size_mb: int = Field(
        default=500,
        ge=1,
        le=10000,
        description="Maximum repository size in MB",
    )
    repo_max_file_count: int = Field(
        default=10000,
        ge=1,
        le=1000000,
        description="Maximum file count for repository",
    )
    repo_min_free_space_gb: float = Field(
        default=1.0,
        ge=0.1,
        le=1000.0,
        description="Minimum free disk space required in GB",
    )
    repo_allow_size_override: bool = Field(
        default=False,
        description="Allow overriding size limits",
    )

    # ========================================
    # Test Settings
    # ========================================
    test_openai_api_key: str | None = Field(
        default=None,
        description="OpenAI API key for tests (falls back to openai_api_key)",
    )
    test_model_choice: str = Field(
        default="gpt-4.1-nano",
        description="LLM model for integration tests (cheap and fast)",
    )

    # ========================================
    # Validators
    # ========================================
    @field_validator("oauth2_issuer", mode="before")
    @classmethod
    def set_oauth2_issuer(cls, v: str | None, info: ValidationInfo) -> str:
        """Set OAuth2 issuer default from host and port if not provided.

        Args:
            v: Raw issuer value; ``None`` or an empty string counts as unset.
            info: Validation context; ``info.data`` holds the fields that
                were validated before this one.

        Returns:
            The supplied issuer, or ``https://{host}:{port}`` built from the
            ``host`` and ``port`` settings.
        """
        if v:
            return v
        # host and port are declared above oauth2_issuer, so they are normally
        # present in info.data; the fallbacks cover the case where one of them
        # failed its own validation and was therefore left out.
        host = info.data.get("host", "0.0.0.0")
        port = info.data.get("port", 8051)
        return f"https://{host}:{port}"

    # ========================================
    # Helper Methods
    # ========================================
    def has_neo4j_config(self) -> bool:
        """Check if Neo4j URI, username and password are all configured."""
        return all([self.neo4j_uri, self.neo4j_username, self.neo4j_password])

    def get_neo4j_config(self) -> dict[str, Any]:
        """Get Neo4j configuration as a dictionary.

        Returns:
            A dict with ``"uri"`` and ``"auth"`` (a ``(username, password)``
            tuple); unset values are replaced by empty strings.
        """
        return {
            "uri": self.neo4j_uri or "",
            "auth": (self.neo4j_username or "", self.neo4j_password or ""),
        }

    def get_oauth2_scopes_list(self) -> list[str]:
        """Get OAuth2 scopes as a list of stripped, non-empty names."""
        return _split_csv(self.oauth2_scopes)

    def get_oauth2_required_scopes_list(self) -> list[str]:
        """Get required OAuth2 scopes as a list of stripped, non-empty names."""
        return _split_csv(self.oauth2_required_scopes)

    def to_dict(self) -> dict[str, Any]:
        """Export settings as a dictionary (safe version without secrets).

        Credentials are reported only as ``has_*`` booleans, never by value.
        """
        return {
            "debug": self.debug,
            "host": self.host,
            "port": self.port,
            "vector_database": self.vector_database,
            "use_knowledge_graph": self.use_knowledge_graph,
            "use_reranking": self.use_reranking,
            "use_test_env": self.use_test_env,
            "has_neo4j": self.has_neo4j_config(),
            "has_searxng": bool(self.searxng_url),
            "has_openai": bool(self.openai_api_key),
            "neo4j_batch_size": self.neo4j_batch_size,
            "neo4j_batch_timeout": self.neo4j_batch_timeout,
            "repo_max_size_mb": self.repo_max_size_mb,
            "repo_max_file_count": self.repo_max_file_count,
            "repo_min_free_space_gb": self.repo_min_free_space_gb,
            "repo_allow_size_override": self.repo_allow_size_override,
            "agentic_search_enabled": self.agentic_search_enabled,
            "agentic_search_completeness_threshold": (
                self.agentic_search_completeness_threshold
            ),
            "model_choice": self.model_choice,
            "test_model_choice": self.test_model_choice,
        }


# Singleton pattern with proper typing
_settings_instance: Settings | None = None


def get_settings() -> Settings:
    """Get cached settings instance (singleton pattern).

    The first call builds ``Settings`` and logs a warning for a missing
    OpenAI key and for OAuth2 running with the placeholder JWT secret.
    Later calls return the same object until ``reset_settings`` is called.
    """
    global _settings_instance
    if _settings_instance is None:
        _settings_instance = Settings()
        logger.info("Settings initialized from environment")
        logger.debug("Vector database: %s", _settings_instance.vector_database)
        if not _settings_instance.openai_api_key:
            logger.warning(
                "OPENAI_API_KEY is missing. OpenAI features will be unavailable.",
            )
        # Warn instead of raising so existing deployments keep starting.
        if (
            _settings_instance.use_oauth2
            and _settings_instance.oauth2_secret_key == _DEFAULT_OAUTH2_SECRET_KEY
        ):
            logger.warning(
                "OAuth2 is enabled but OAUTH2_SECRET_KEY is still the default "
                "placeholder. Set a unique secret before deploying.",
            )
    return _settings_instance


def reset_settings() -> None:
    """Reset settings instance (useful for testing)."""
    global _settings_instance
    _settings_instance = None

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/AI-enthusiasts/crawl4ai-rag-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

settings.py•13.8 KiB