from typing import Dict, List, Literal, Optional
import json
import os
import subprocess
import asyncio
from functools import wraps
from loguru import logger
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from fastmcp import FastMCP
from crawl4ai import AsyncWebCrawler
from diskcache import Cache
# Remove default handler to allow configuration from __main__.py
logger.remove()
# Server metadata
SERVER_NAME = "Docy"
SERVER_VERSION = "0.5.0"
DEFAULT_USER_AGENT = f"ModelContextProtocol/1.0 {SERVER_NAME} (+https://github.com/modelcontextprotocol/servers)"
class Settings(BaseSettings):
"""Configuration settings for the Docy server."""
model_config = SettingsConfigDict(extra="ignore", env_file=".env")
docy_user_agent: str = Field(
default=DEFAULT_USER_AGENT,
description="Custom User-Agent string for HTTP requests",
)
docy_documentation_urls: Optional[str] = Field(
default=None,
description="Comma-separated list of URLs to documentation sites to include",
)
docy_documentation_urls_file: Optional[str] = Field(
default=".docy.urls",
description="Path to a file containing documentation URLs (one per line)",
)
docy_cache_ttl: int = Field(
        default=432000, description="Cache time-to-live in seconds (default 432000 = 5 days)"
)
docy_cache_directory: str = Field(
default=".docy.cache", description="Path to the cache directory"
)
docy_debug: bool = Field(default=False, description="Enable debug logging")
docy_skip_crawl4ai_setup: bool = Field(
default=False, description="Skip running crawl4ai-setup command at startup"
)
docy_transport: Literal["sse", "stdio"] = Field(
default="stdio", description="Transport protocol (sse or stdio)"
)
docy_host: str = Field(
default="127.0.0.1", description="Host address to bind the server to"
)
docy_port: int = Field(default=8000, description="Port to run the server on")
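    # Each of these fields maps to an environment variable of the same name
    # (pydantic-settings matches field names case-insensitively), so the server can be
    # configured with, for example (illustrative values):
    #   DOCY_DOCUMENTATION_URLS="https://docs.example.com,https://api.example.com/docs"
    #   DOCY_CACHE_TTL=86400
    #   DOCY_TRANSPORT=sse
    # or with equivalent entries in the .env file referenced by model_config above.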
@property
def user_agent(self) -> str:
return self.docy_user_agent
@property
def cache_ttl(self) -> int:
return self.docy_cache_ttl
@property
def cache_directory(self) -> str:
return self.docy_cache_directory
@property
def debug(self) -> bool:
return self.docy_debug
@property
def skip_crawl4ai_setup(self) -> bool:
return self.docy_skip_crawl4ai_setup
@property
def documentation_urls_str(self) -> Optional[str]:
return self.docy_documentation_urls
@property
def documentation_urls_file_path(self) -> Optional[str]:
return self.docy_documentation_urls_file
@property
def transport(self) -> Literal["sse", "stdio"]:
return self.docy_transport
@property
def host(self) -> str:
return self.docy_host
@property
def port(self) -> int:
return self.docy_port
def _read_urls_from_file(self) -> List[str]:
"""Read URLs from a file, one per line."""
if not self.documentation_urls_file_path:
return []
try:
if not os.path.exists(self.documentation_urls_file_path):
logger.debug(
f"URLs file not found: {self.documentation_urls_file_path}"
)
return []
with open(self.documentation_urls_file_path, "r") as f:
lines = f.readlines()
# Filter out empty lines and comments, strip whitespace
urls = [
line.strip()
for line in lines
if line.strip() and not line.strip().startswith("#")
]
logger.debug(
f"Read {len(urls)} URLs from file: {self.documentation_urls_file_path}"
)
return urls
except Exception as e:
logger.error(
f"Error reading URLs file {self.documentation_urls_file_path}: {str(e)}"
)
return []
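    # Example URLs file (illustrative), matching the parsing above: one URL per line,
    # with blank lines and lines starting with '#' ignored.
    #   # internal documentation sources
    #   https://docs.example.com/
    #   https://api.example.com/reference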
@property
def documentation_urls(self) -> List[str]:
"""Parse the comma-separated URLs into a list, or read from file if no env var provided."""
# Add debug output to help diagnose environment variable issues
logger.debug(f"Documentation URLs string: '{self.documentation_urls_str}'")
# First try to get URLs from environment variable
if self.documentation_urls_str:
# Split by comma and strip whitespace from each URL
urls = [
url.strip()
for url in self.documentation_urls_str.split(",")
if url.strip()
]
logger.debug(
f"Parsed {len(urls)} documentation URLs from environment variable: {urls}"
)
return urls
# If no URLs in env var, try to read from file
urls = self._read_urls_from_file()
if urls:
return urls
# No URLs found anywhere
logger.warning(
"No documentation URLs provided (neither via environment variable nor file)"
)
return []
settings = Settings()
# Cache for HTTP requests (will be initialized in create_server)
cache: Optional[Cache] = None
def async_cached(func):
"""Decorator to cache results of async functions using diskcache.
Uses an executor to prevent blocking the event loop during cache operations.
"""
@wraps(func)
async def wrapper(*args, **kwargs):
global cache
        loop = asyncio.get_running_loop()
if cache is None:
logger.warning("Cache not initialized, skipping caching")
return await func(*args, **kwargs)
# Create a cache key from the function name and arguments
key = f"{func.__name__}:{str(args)}:{str(kwargs)}"
# Try to get the result from cache (offload to executor)
cached_result = await loop.run_in_executor(None, lambda: cache.get(key))
if cached_result is not None:
logger.info(f"Cache HIT for {func.__name__}")
return cached_result
logger.info(f"Cache MISS for {func.__name__}")
# Call the original function
try:
result = await func(*args, **kwargs)
# Store the result in cache (offload to executor)
await loop.run_in_executor(
None, lambda: cache.set(key, result, expire=settings.cache_ttl)
)
return result
except Exception as e:
logger.error(f"Error executing {func.__name__}: {str(e)}")
raise
return wrapper
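# Illustrative cache key: a call such as
#   await cached_fetch_documentation_content("https://docs.example.com")
# is stored under "cached_fetch_documentation_content:('https://docs.example.com',):{}",
# so the key depends on whether arguments are passed positionally or by keyword.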
# Create the FastMCP server
mcp = FastMCP(
SERVER_NAME,
version=SERVER_VERSION,
description="Documentation search and access functionality for LLMs",
dependencies=["crawl4ai", "diskcache", "loguru", "pydantic-settings"],
)
async def fetch_documentation_content(url: str) -> Dict:
"""Fetch the content of a documentation page by direct URL."""
logger.info(f"Fetching documentation page content from {url}")
try:
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=url)
# Log the result for debugging
logger.debug(f"Crawler result for URL {url}: success={result.success}")
# Extract markdown from the result
markdown_content = ""
if result.markdown:
# Check if markdown is a string or a MarkdownGenerationResult object
if isinstance(result.markdown, str):
markdown_content = str(result.markdown)
else:
# If it's a MarkdownGenerationResult, use the appropriate field
markdown_content = str(
getattr(result.markdown, "markdown_with_citations", "")
or getattr(result.markdown, "raw_markdown", "")
)
# Get page title from metadata or use URL as fallback
title = ""
if result.metadata and isinstance(result.metadata, dict):
title = result.metadata.get(
"title", url.split("/")[-1] or "Documentation"
)
else:
title = url.split("/")[-1] or "Documentation"
# Return information about the documentation page
return {
"url": url,
"title": title,
"markdown": markdown_content,
"links": result.links or {},
"success": result.success,
}
except Exception as e:
logger.error(f"Failed to fetch documentation page content from {url}: {str(e)}")
raise ValueError(f"Failed to fetch documentation content: {str(e)}")
# Apply caching to the fetch_documentation_content function
@async_cached
async def cached_fetch_documentation_content(url: str) -> Dict:
"""Cached version of fetch_documentation_content."""
return await fetch_documentation_content(url)
@mcp.resource("documentation://sources")
def list_documentation_sources() -> str:
"""List all configured documentation sources."""
logger.info("Listing all documentation sources")
# Read the documentation URLs directly from the file to get fresh values
documentation_urls = []
# First check if URL is provided via environment variable
if settings.documentation_urls_str:
documentation_urls = [
url.strip()
for url in settings.documentation_urls_str.split(",")
if url.strip()
]
logger.info(
f"Using {len(documentation_urls)} documentation URLs from environment variable"
)
# Otherwise read from file for real-time updates
elif settings.documentation_urls_file_path:
documentation_urls = settings._read_urls_from_file()
logger.info(
f"Hot-reloaded {len(documentation_urls)} documentation URLs from file"
)
results = []
for url in documentation_urls:
results.append(
{"url": url, "type": "web", "description": "Web-based documentation"}
)
return f"Available documentation sources:\n{json.dumps(results, indent=2)}"
@mcp.tool()
def list_documentation_sources_tool() -> str:
"""List all available documentation sources this service has access to.
This tool requires no input parameters and returns a list of documentation sources configured for this service.
Use this tool first to discover what documentation sources are available.
Example usage:
```
list_documentation_sources_tool()
```
Response provides the URLs to documentation sources and their types.
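
    Example response (illustrative; the URLs depend on your configuration):
    ```
    Available documentation sources:
    [
      {
        "url": "https://docs.example.com",
        "type": "web",
        "description": "Web-based documentation"
      }
    ]
    ```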
"""
# Read the documentation URLs directly from the file to get fresh values
documentation_urls = []
# First check if URL is provided via environment variable
if settings.documentation_urls_str:
documentation_urls = [
url.strip()
for url in settings.documentation_urls_str.split(",")
if url.strip()
]
logger.info(
f"Using {len(documentation_urls)} documentation URLs from environment variable"
)
# Otherwise read from file for real-time updates
elif settings.documentation_urls_file_path:
documentation_urls = settings._read_urls_from_file()
logger.info(
f"Hot-reloaded {len(documentation_urls)} documentation URLs from file"
)
results = []
for url in documentation_urls:
results.append(
{"url": url, "type": "web", "description": "Web-based documentation"}
)
return f"Available documentation sources:\n{json.dumps(results, indent=2)}"
@mcp.tool()
async def fetch_documentation_page(url: str) -> str:
"""Fetch the content of a documentation page by URL as markdown.
This tool retrieves the full content from a documentation page at the specified URL and returns it as markdown.
The markdown format preserves headings, links, lists, and other formatting from the original documentation.
Example usage:
```
fetch_documentation_page(url="https://example.com/documentation/page")
```
Response includes the full markdown content of the page along with metadata like title and links.
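
    URLs passed without a scheme are assumed to be https:// before fetching.

    Example response (illustrative):
    ```
    # Getting Started

    Welcome to the example documentation...
    ```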
"""
logger.info(f"Tool call: fetching documentation page content for URL: {url}")
# Make sure the URL is properly formatted with scheme
if not url.startswith(("http://", "https://")):
url = f"https://{url}"
try:
result = await cached_fetch_documentation_content(url)
logger.info("Successfully fetched documentation page content")
if not result.get("success", True):
return f"# Failed to load content from {url}\n\nUnable to retrieve documentation content. Please verify the URL is valid and accessible."
title = result.get("title", "Documentation")
markdown = result.get("markdown", "")
return f"# {title}\n\n{markdown}"
except Exception as e:
logger.error(f"Error fetching documentation page: {str(e)}")
return f"# Error retrieving documentation\n\nFailed to retrieve documentation from {url}. Error: {str(e)}"
@mcp.prompt()
def documentation_sources() -> str:
"""List all available documentation sources with their URLs and types"""
# Get fresh documentation URLs (this will hot-reload from file if needed)
documentation_urls = []
if settings.documentation_urls_str:
documentation_urls = [
url.strip()
for url in settings.documentation_urls_str.split(",")
if url.strip()
]
elif settings.documentation_urls_file_path:
documentation_urls = settings._read_urls_from_file()
logger.info(
f"Hot-reloaded {len(documentation_urls)} documentation URLs for prompt"
)
# Format the response as a JSON string for consistency
results = []
for url in documentation_urls:
results.append(
{"url": url, "type": "web", "description": "Web-based documentation"}
)
return f"Please list all documentation sources available through this server. Here they are:\n{json.dumps(results, indent=2)}"
@mcp.prompt()
def documentation_page(url: str) -> str:
"""Fetch the full content of a documentation page at a specific URL as markdown"""
return (
f"Please provide the full documentation content from the following URL: {url}"
)
@mcp.tool()
async def fetch_document_links(url: str) -> str:
"""Fetch all links from a documentation page, categorized by internal and external links.
This tool retrieves all links from a web page at the specified URL and returns them categorized
as internal links (within the same domain) and external links (to other domains). Use this tool
to discover related documentation pages from a starting URL.
Example usage:
```
fetch_document_links(url="https://example.com/documentation/page")
```
Response includes a structured list of internal and external links found on the page, with their
URLs and link text when available.
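
    Example response (illustrative):
    ```
    # Links extracted from https://example.com/documentation/page

    ## Internal Links (2)

    - [Getting Started](https://example.com/documentation/getting-started)
    - [API Reference](https://example.com/documentation/api)

    ## External Links (1)

    - [Project repository](https://github.com/example/project)
    ```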
"""
logger.info(f"Tool call: fetching links from documentation page at URL: {url}")
# Make sure the URL is properly formatted with scheme
if not url.startswith(("http://", "https://")):
url = f"https://{url}"
try:
result = await cached_fetch_documentation_content(url)
logger.info("Successfully fetched links from documentation page")
if not result.get("success", True):
return f"# Failed to retrieve links from {url}\n\nUnable to access the page. Please verify the URL is valid and accessible."
# Get the links from the result
links = result.get("links", {})
# Format the links for output
formatted_output = [f"# Links extracted from {url}\n"]
# Add internal links section
internal_links = links.get("internal", [])
formatted_output.append(f"\n## Internal Links ({len(internal_links)})\n")
for link in internal_links:
href = link.get("href", "")
text = link.get("text", "").strip() or "[No text]"
formatted_output.append(f"- [{text}]({href})")
# Add external links section
external_links = links.get("external", [])
formatted_output.append(f"\n## External Links ({len(external_links)})\n")
for link in external_links:
href = link.get("href", "")
text = link.get("text", "").strip() or "[No text]"
formatted_output.append(f"- [{text}]({href})")
return "\n".join(formatted_output)
except Exception as e:
logger.error(f"Error fetching links from URL {url}: {str(e)}")
return f"# Error retrieving links\n\nFailed to retrieve links from {url}. Error: {str(e)}"
@mcp.prompt()
def documentation_links(url: str) -> str:
"""Fetch all links from a documentation page to discover related content"""
return f"Please list all links available on the documentation page at the following URL: {url}"
def ensure_crawl4ai_setup():
"""Ensure that crawl4ai is properly set up by running the crawl4ai-setup command."""
if settings.skip_crawl4ai_setup:
logger.info("Skipping crawl4ai setup (docy_skip_crawl4ai_setup=true)")
return
logger.info("Ensuring crawl4ai is properly set up...")
try:
result = subprocess.run(
["crawl4ai-setup"], capture_output=True, text=True, check=False
)
if result.returncode != 0:
logger.warning(f"crawl4ai-setup exited with code {result.returncode}")
logger.warning(f"STDERR: {result.stderr}")
logger.warning(
"crawl4ai setup might be incomplete, but we'll try to continue anyway"
)
else:
logger.info("crawl4ai setup completed successfully")
except FileNotFoundError:
logger.error(
"crawl4ai-setup command not found. Some functionality may be limited."
)
except Exception as e:
logger.error(f"Error running crawl4ai-setup: {str(e)}")
logger.warning(
"Continuing despite setup failure, but functionality may be limited"
)
async def cache_documentation_urls():
"""Pre-cache all documentation URLs configured in settings."""
docs_urls = settings.documentation_urls
if not docs_urls:
logger.warning("No documentation URLs to cache")
return
logger.info(f"Pre-caching {len(docs_urls)} documentation URLs...")
# Get cache statistics before pre-caching
cache_size_before = cache.volume()
for url in docs_urls:
try:
logger.info(f"Pre-caching documentation URL: {url}")
await cached_fetch_documentation_content(url)
logger.info(f"Successfully cached content from {url}")
except Exception as e:
logger.error(f"Failed to cache documentation URL {url}: {str(e)}")
# Get cache statistics after pre-caching
cache_size_after = cache.volume()
logger.info(
f"Cache size: {cache_size_before / 1024:.2f} KB -> {cache_size_after / 1024:.2f} KB"
)
def create_server() -> FastMCP:
"""Create and configure the MCP server instance."""
global cache
# Initialize the diskcache Cache
cache_dir = settings.cache_directory
logger.info(f"Initializing disk cache at: {cache_dir}")
# Ensure parent directory exists if cache_dir has a parent path
parent_dir = os.path.dirname(cache_dir)
if parent_dir:
os.makedirs(parent_dir, exist_ok=True)
# Initialize the cache with disk-based persistence
cache = Cache(cache_dir)
# Configure cache settings
cache.reset("size_limit", int(1e9)) # Default to 1GB size limit
logger.info(f"Disk cache initialized with TTL: {settings.cache_ttl}s")
# Ensure crawl4ai is properly set up
ensure_crawl4ai_setup()
# Pre-cache documentation URLs
if settings.documentation_urls:
logger.info("Pre-caching documentation URLs...")
asyncio.run(cache_documentation_urls())
# Log server creation
logger.info(f"Created MCP server with name: {SERVER_NAME}")
logger.info(
f"Configured with {len(settings.documentation_urls)} documentation URLs and cache TTL: {settings.cache_ttl}s"
)
    # Note: documentation URLs (when configured) are pre-cached above, before __main__.py starts the server transport
return mcp
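# Minimal usage sketch (illustrative; the real entry point lives in __main__.py, and the
# import path below is hypothetical):
#
#   from docy.server import create_server, settings
#
#   server = create_server()
#   server.run(transport=settings.transport)  # "stdio" or "sse", per Settings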