Crawl4Claude

config.py•9.73 kB

""" Configuration file for Documentation Scraper This file contains all the configuration options that users typically need to customize when scraping different documentation sites or knowledge bases. """ from pathlib import Path from urllib.parse import urlparse # Basic scraping configuration SCRAPER_CONFIG = { # Target website - CUSTOMIZE THIS for your documentation site "base_url": "https://docs.example.com/", # Output settings "output_dir": "docs_db", # Crawling limits "max_depth": 3, # How many levels deep to crawl (1-5 recommended) "max_pages": 20, # Maximum pages to scrape # Rate limiting (be respectful to the target site!) "delay_between_requests": 0.5, # Seconds between requests (0.5-2.0 recommended) "page_timeout": 30000, # Timeout per page in milliseconds # Browser settings "headless": True, # Run browser in background (True recommended) "user_agent": "Documentation-Scraper/1.0 (Educational Purpose)", # Content processing "cache_enabled": True, # Enable caching for resumable crawls "content_filtering": True, # Remove navigation and boilerplate "generate_screenshots": False, # Don't generate screenshots (saves time/space) } # URL filtering rules - CUSTOMIZE these patterns for your target site URL_FILTER_CONFIG = { # Skip these URL patterns (regex patterns) "skip_patterns": [ r'/api/', r'/search', r'\.pdf$', r'\.zip$', r'\.tar\.gz$', r'/edit/', r'/history/', r'#', # Fragment-only URLs r'\?', # Query parameters (usually duplicates) ], # Only crawl these domains - UPDATE this for your target site "allowed_domains": ["docs.example.com"], # Priority sections (crawl these first if using BestFirst strategy) # UPDATE these section names for your documentation structure "priority_sections": [ "getting-started", "tutorials", "guides", "reference" ] } # Content cleaning patterns - CUSTOMIZE for your site's structure CONTENT_FILTER_CONFIG = { # Patterns to remove from markdown content "remove_patterns": [ r'', # HTML comments r'\[Edit this page.*?\]', # Edit links 
r'Table of Contents.*?\n', # TOC headers r'Skip to main content.*?\n', # Skip links r'Previous\s+Next.*?\n', # Navigation r'Improve this doc.*?\n', # Improvement links r'Was this helpful\?.*?\n', # Feedback sections r'Rate this page.*?\n', # Rating sections ], # Additional cleanup rules "clean_excessive_whitespace": True, "remove_empty_sections": True, "max_consecutive_newlines": 2, } # Database and export settings EXPORT_CONFIG = { # Export formats to generate "generate_json": True, "generate_sqlite": True, "generate_llm_context": True, "generate_summary": True, # LLM context options "create_master_file": True, # Single file with all content "create_section_files": True, # Separate files per section "max_file_size_mb": 10, # Split large files # Content organization "sort_by_title": True, "include_metadata": True, "include_word_counts": True, } # Advanced crawling options ADVANCED_CONFIG = { # Deep crawling strategy "crawl_strategy": "BFS", # Options: "BFS" (Breadth-First), "DFS" (Depth-First), "BestFirst" # For BestFirst strategy - UPDATE keywords for your domain "scoring_keywords": ["tutorial", "guide", "documentation", "reference"], # Browser optimization "wait_for_images": False, # Skip image loading for speed "disable_javascript": False, # Keep JS enabled for SPAs "block_resources": ["image", "media", "font"], # Block these resource types for speed # Error handling "max_retries": 3, "retry_delay": 2.0, "continue_on_error": True, } # Logging configuration LOGGING_CONFIG = { "log_level": "INFO", # Options: DEBUG, INFO, WARNING, ERROR "log_to_file": True, "log_to_console": True, "log_format": "%(asctime)s - %(levelname)s - %(message)s", } # Content extraction settings - FINE-TUNE these for your site's content structure CONTENT_CONFIG = { "content_filter": { "type": "PruningContentFilter", "threshold": 0.48, # Lower threshold for more content "threshold_type": "fixed", "min_word_threshold": 10, # Lower minimum to capture shorter content "sim_threshold": 0.5, 
"always_bypass_local_score_threshold": False, "tags_to_exclude": [ "nav", "footer", "header", "aside", "script", "style", "meta", "noscript", ".nav", ".footer", ".header", ".sidebar", ".advertisement", ".ad", ".cookie", ".search-bar", ".menu", ".breadcrumb" ] }, "wait_for_load": True, "wait_time": 5000, # Increased wait time for JS content (5 seconds) "js_wait_time": 3000, # Additional wait for JavaScript execution "remove_forms": True, "remove_overlay": True, "extract_blocks": True, "word_count_threshold": 5 # Lower threshold to capture more content } # Browser configuration for better JS handling BROWSER_CONFIG = { "headless": True, "browser_type": "chromium", "viewport": {"width": 1920, "height": 1080}, "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", "extra_args": [ "--no-sandbox", "--disable-dev-shm-usage", "--disable-blink-features=AutomationControlled", "--disable-extensions", "--disable-plugins", "--disable-images", # Faster loading "--disable-javascript-harmony-shipping", "--disable-ipc-flooding-protection" ], "java_script_enabled": True, "load_images": False, # Skip images for faster loading "accept_downloads": False, "ignore_https_errors": True } # Debug and testing configuration DEBUG_CONFIG = { # Test URLs for debugging content extraction - UPDATE these for your site "test_urls": [ # Primary test URL - should be a content-rich page "https://docs.example.com/getting-started/", # Secondary test URL - should be a different section "https://docs.example.com/tutorials/first-tutorial/", ], # Content validation keywords - UPDATE these for your domain # These are used to verify that meaningful content was extracted "content_validation_keywords": [ "tutorial", "guide", "documentation", "example", "reference" ], # Debug output settings "preview_length": 500, # Characters to show in preview "verbose_output": True, # Show detailed debug information "save_debug_files": False, # Save 
debug HTML/markdown to files } # MCP (Model Context Protocol) server configuration MCP_CONFIG = { # Server identification "server_name": "docs-server", # Name of the MCP server "server_description": "Documentation Search and Retrieval Server", # Documentation display name (will derive from base_url if None) "docs_display_name": None, # Will derive from base_url if None # Search and retrieval settings "default_search_limit": 10, # Default number of search results "max_search_limit": 50, # Maximum allowed search results "default_section_limit": 20, # Default pages per section "max_section_limit": 100, # Maximum pages per section # Content settings "include_full_urls": True, # Add full URLs to results "snippet_length": 32, # Words in search snippets "enable_fts_fallback": True, # Use LIKE search if FTS unavailable } # Helper functions for derived configuration values def get_database_path(): """Get the complete database path""" output_dir = SCRAPER_CONFIG.get("output_dir", "docs_db") return str(Path(output_dir) / "documentation.db") def get_docs_display_name(): """Get the display name for the documentation""" if MCP_CONFIG.get("docs_display_name"): return MCP_CONFIG["docs_display_name"] base_url = SCRAPER_CONFIG.get("base_url", "") if base_url: parsed = urlparse(base_url) return parsed.netloc or "Documentation" return "Documentation" def get_mcp_server_config(): """Get complete MCP server configuration with derived values""" return { "db_path": get_database_path(), "docs_name": get_docs_display_name(), "server_name": MCP_CONFIG.get("server_name", "docs-server"), "base_url": SCRAPER_CONFIG.get("base_url", ""), "server_description": MCP_CONFIG.get("server_description", "Documentation Server"), "search_limit": MCP_CONFIG.get("default_search_limit", 10), "max_search_limit": MCP_CONFIG.get("max_search_limit", 50), "section_limit": MCP_CONFIG.get("default_section_limit", 20), "max_section_limit": MCP_CONFIG.get("max_section_limit", 100), "snippet_length": 
MCP_CONFIG.get("snippet_length", 32), "enable_fts_fallback": MCP_CONFIG.get("enable_fts_fallback", True), "include_full_urls": MCP_CONFIG.get("include_full_urls", True), }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/dragomirweb/Crawl4Claude'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.