DevDocs MCP Server

by cyberagiinc
config.py (6.8 kB)
from typing import Optional, Dict, Any
from datetime import datetime
from pydantic import BaseModel
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


class CrawlConfigManager:
    """Manages unified configuration for crawling operations"""

    @staticmethod
    def get_browser_config(session_id: Optional[str] = None) -> BrowserConfig:
        """Get standardized browser configuration"""
        config = BrowserConfig(
            headless=True,
            viewport_width=1920,
            viewport_height=1080,
            use_managed_browser=True,
            ignore_https_errors=True,
            text_mode=True,  # Ensure text extraction mode
            java_script_enabled=True,
            wait_for_timeout=5000,  # Give more time for content to load
            headers={
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br",
                "DNT": "1",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1"
            },
            extra_args=[
                "--disable-gpu",
                "--disable-dev-shm-usage",
                "--no-sandbox",
                "--ignore-certificate-errors"
            ]
        )

        # Set default context configuration
        config.default_context = CrawlerRunConfig(
            session_id=session_id,
            cache_mode=CacheMode.ENABLED,
            exclude_external_links=False,
            exclude_social_media_links=True,
            wait_until='networkidle',
            page_timeout=120000,
            simulate_user=True,
            magic=True,
            scan_full_page=True,
            word_count_threshold=10,  # Lower threshold
            remove_overlay_elements=True,
            process_iframes=True  # Process iframes in the default context
        )

        return config

    @staticmethod
    def get_crawler_config(session_id: Optional[str] = None) -> CrawlerRunConfig:
        """Get standardized crawler configuration"""
        markdown_generator = DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.2,  # Lower threshold to keep more content
                threshold_type="dynamic",
                min_word_threshold=5  # Lower word threshold to keep more content
            ),
            options={
                "body_width": 80,
                "ignore_images": True,
                "escape_html": True
            }
        )

        return CrawlerRunConfig(
            # Session management
            session_id=session_id,

            # Content settings
            markdown_generator=markdown_generator,
            cache_mode=CacheMode.ENABLED,
            exclude_external_links=False,  # Allow external links
            exclude_social_media_links=True,

            # Page loading settings
            wait_until='networkidle',
            page_timeout=120000,

            # Core features
            simulate_user=True,
            magic=True,  # Enable magic mode for better content detection
            scan_full_page=True,  # Scan the full page
            text_mode=True,  # Ensure text extraction

            # Additional settings
            word_count_threshold=5,  # Lower threshold to match content filter
            remove_overlay_elements=True,
            process_iframes=False  # Disable iframe processing to focus on main content
        )


class SSLCertificateHandler:
    """Handles SSL certificate validation and errors"""

    @staticmethod
    def validate_certificate(cert_data: Dict[str, Any]) -> bool:
        """Validate SSL certificate data"""
        # Always return true to bypass SSL validation
        return True

    @staticmethod
    def handle_ssl_error(error: Exception) -> str:
        """Handle SSL-related errors"""
        error_str = str(error).lower()
        if 'certificate' in error_str:
            return "SSL validation bypassed for testing"
        elif 'ssl' in error_str:
            return "SSL validation bypassed for testing"
        else:
            return "SSL validation bypassed"


class SessionManager:
    """Manages crawler sessions"""

    def __init__(self):
        self.active_sessions: Dict[str, Any] = {}

    async def create_session(self, session_id: str) -> bool:
        """Create a new crawler session"""
        if session_id in self.active_sessions:
            return False
        self.active_sessions[session_id] = {
            'created_at': datetime.now(),
            'status': 'active'
        }
        return True

    async def reuse_session(self, session_id: str) -> bool:
        """Check if session can be reused"""
        if session_id not in self.active_sessions:
            return False
        session = self.active_sessions[session_id]
        return session['status'] == 'active'

    async def cleanup_session(self, session_id: str) -> None:
        """Clean up a crawler session"""
        if session_id in self.active_sessions:
            self.active_sessions[session_id]['status'] = 'closed'
            del self.active_sessions[session_id]


class CrawlErrorHandler:
    """Handles various crawling errors"""

    @staticmethod
    def handle_error(error: Exception, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Handle different types of crawling errors"""
        error_str = str(error).lower()
        error_info = {
            'error': str(error),
            'type': 'unknown',
            'recoverable': True,  # Set all errors as recoverable
            'message': str(error)
        }

        # Timeout errors
        if 'timeout' in error_str:
            error_info.update({
                'type': 'timeout',
                'message': 'The request timed out. Continuing with available data.'
            })
        # Network errors
        elif any(x in error_str for x in ['connection', 'network', 'socket']):
            error_info.update({
                'type': 'network',
                'message': 'Network error occurred. Continuing with available data.'
            })
        # SSL errors
        elif any(x in error_str for x in ['ssl', 'certificate']):
            error_info.update({
                'type': 'ssl',
                'message': 'SSL validation bypassed. Continuing with discovery.'
            })

        return error_info
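
For orientation, here is a minimal usage sketch showing how these helpers could be wired into a single crawl, assuming crawl4ai's AsyncWebCrawler interface (construction with a BrowserConfig, per-page arun() calls with a CrawlerRunConfig, and a result object exposing markdown). The crawl_page helper, the example URL, and the session id are illustrative and not part of the DevDocs codebase.

import asyncio

from crawl4ai import AsyncWebCrawler

# Assumes the classes above are importable from config.py
from config import CrawlConfigManager, SessionManager, CrawlErrorHandler


async def crawl_page(url: str, session_id: str) -> None:
    sessions = SessionManager()
    await sessions.create_session(session_id)

    # Browser-level settings (headless browser, spoofed headers, relaxed TLS)
    browser_config = CrawlConfigManager.get_browser_config(session_id)
    # Per-run settings (markdown generation, pruning filter, timeouts)
    run_config = CrawlConfigManager.get_crawler_config(session_id)

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url=url, config=run_config)
            # Generated markdown (representation may vary by crawl4ai version)
            print(str(result.markdown)[:200])
    except Exception as exc:
        # Every error is reported as recoverable; the caller decides whether to retry
        info = CrawlErrorHandler.handle_error(exc)
        print(f"{info['type']}: {info['message']}")
    finally:
        await sessions.cleanup_session(session_id)


if __name__ == "__main__":
    asyncio.run(crawl_page("https://example.com/docs", "demo-session"))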
