#!/usr/bin/env python3
"""
Generic Website Documentation Adapter
Adapter for scraping documentation from any website that doesn't have a specific API.
Supports automatic discovery through sitemaps, RSS/Atom feeds, recursive crawling, or manual URL lists.
"""
import asyncio
from typing import Any, Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse
from datetime import datetime
import re
import httpx
from bs4 import BeautifulSoup
from mcp.types import Resource
from .base import BaseDocumentAdapter, DocumentContent, SearchResult
from ..utils import get_logger
logger = get_logger(__name__)
class WebsiteAdapter(BaseDocumentAdapter):
"""Generic adapter for scraping documentation websites.
This adapter can work with any website containing documentation,
automatically discovering pages through various methods:
- Sitemap parsing (sitemap.xml)
- RSS/Atom feeds
- Manual URL patterns
- Recursive crawling from seed URLs
Configuration options:
- base_url: Base URL of the documentation site (required)
- discovery_method: "sitemap", "manual", "crawl", or "feed"
- url_patterns: List of URL patterns to include/exclude
- content_selectors: CSS selectors for content extraction
- max_pages: Maximum number of pages to process
- crawl_depth: Maximum depth for recursive crawling
- rate_limit: Delay between requests (seconds)
"""
def __init__(self, config: Dict[str, Any]):
super().__init__(config)
self.client: Optional[httpx.AsyncClient] = None
self.pages_cache: List[Dict[str, Any]] = []
self.discovered_urls: Set[str] = set()
# Default configuration
self.discovery_method = config.get("discovery_method", "sitemap")
self.max_pages = config.get("max_pages", 1000)
self.crawl_depth = config.get("crawl_depth", 3)
self.rate_limit = config.get("rate_limit", 1.0)
# Content extraction selectors (prioritized)
self.content_selectors = config.get("content_selectors", [
"main", "article", ".content", ".documentation",
".docs-content", ".doc-content", "#content",
".markdown-body", ".post-content", ".entry-content"
])
# URL filtering patterns
self.include_patterns = config.get("include_patterns", [])
self.exclude_patterns = config.get("exclude_patterns", [
r"/api/", r"/admin/", r"/login", r"/register",
r"\.(pdf|jpg|png|gif|svg|css|js)$"
])
async def initialize(self) -> None:
"""Initialize the website adapter."""
logger.info(f"Initializing Website adapter for {self.config.get('base_url')}")
# Validate required configuration
self._validate_config(["base_url"])
base_url = self.config["base_url"].rstrip("/")
# Setup HTTP client
headers = {
"User-Agent": "AnyDocs-MCP/1.0 (+https://github.com/anydocs-mcp)",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate",
"DNT": "1",
"Connection": "keep-alive"
}
# Add custom headers if provided
if "headers" in self.config:
headers.update(self.config["headers"])
self.client = httpx.AsyncClient(
headers=headers,
timeout=30.0,
follow_redirects=True,
limits=httpx.Limits(max_connections=5, max_keepalive_connections=2)
)
# Test connection to base URL
await self._test_connection()
# Discover pages based on method
await self._discover_pages()
self._initialized = True
logger.info(f"Website adapter initialized: discovered {len(self.pages_cache)} pages")
async def list_resources(self) -> List[Resource]:
"""List all discovered pages."""
self._ensure_initialized()
resources = []
for page in self.pages_cache:
resource = Resource(
uri=self.get_resource_uri(page["id"]),
name=page["title"],
description=page.get("description", f"Page from {self.config['base_url']}"),
mimeType="text/markdown"
)
resources.append(resource)
logger.debug(f"Listed {len(resources)} resources from website")
return resources
async def get_content(self, resource_path: str) -> DocumentContent:
"""Get content for a specific page."""
self._ensure_initialized()
# Find page in cache
page = None
for p in self.pages_cache:
if p["id"] == resource_path or p["url"] == resource_path:
page = p
break
if not page:
raise FileNotFoundError(f"Page not found: {resource_path}")
# Fetch and extract content
try:
await asyncio.sleep(self.rate_limit) # Rate limiting
response = await self.client.get(page["url"])
response.raise_for_status()
# Extract content from HTML
content = self._extract_content(response.text, page["url"])
return DocumentContent(
title=page["title"],
content=content,
url=page["url"],
source=self.name,
content_type="text/markdown",
last_modified=page.get("last_modified"),
metadata={
"page_id": page["id"],
"path": page.get("path", ""),
"depth": page.get("depth", 0)
}
)
except Exception as e:
logger.error(f"Error fetching content for {page['url']}: {e}")
raise
async def search(self, query: str, limit: int = 10) -> List[SearchResult]:
"""Search through cached page content."""
self._ensure_initialized()
if not query.strip():
return []
results = []
query_lower = query.lower()
# Simple text-based search (could be enhanced with proper indexing)
for page in self.pages_cache:
score = 0.0
# Check title match (higher weight)
if query_lower in page["title"].lower():
score += 10.0
# Check URL path match
if query_lower in page["url"].lower():
score += 5.0
# Check description match
description = page.get("description", "")
if query_lower in description.lower():
score += 3.0
if score > 0:
# Get content excerpt
try:
                    # Use the cached description as a lightweight snippet (avoids an extra fetch per result)
content_snippet = description or page["title"]
if len(content_snippet) > 200:
content_snippet = content_snippet[:200] + "..."
results.append(SearchResult(
title=page["title"],
content=content_snippet,
url=page["url"],
source=self.name,
score=score,
metadata={
"page_id": page["id"],
"path": page.get("path", "")
}
))
except Exception as e:
logger.error(f"Error creating search result for {page['url']}: {e}")
# Sort by score and limit results
results.sort(key=lambda x: x.score, reverse=True)
return results[:limit]
async def get_structure(self) -> str:
"""Get the hierarchical structure of discovered pages."""
self._ensure_initialized()
base_url = self.config["base_url"]
structure_lines = [f"# Documentation from {base_url}"]
structure_lines.append("")
structure_lines.append(f"**Discovery Method:** {self.discovery_method}")
structure_lines.append(f"**Total Pages:** {len(self.pages_cache)}")
structure_lines.append("")
# Group pages by path depth or domain structure
pages_by_path = {}
for page in self.pages_cache:
path_parts = urlparse(page["url"]).path.strip("/").split("/")
depth = len([p for p in path_parts if p]) # Count non-empty parts
if depth not in pages_by_path:
pages_by_path[depth] = []
pages_by_path[depth].append(page)
# Build hierarchical structure
for depth in sorted(pages_by_path.keys()):
if depth == 0:
structure_lines.append("## Root Pages")
else:
structure_lines.append(f"## Level {depth} Pages")
structure_lines.append("")
for page in pages_by_path[depth][:20]: # Limit to first 20 per level
indent = " " * min(depth, 3) # Max 3 levels of indent
structure_lines.append(f"{indent}- [{page['title']}]({page['url']})")
if len(pages_by_path[depth]) > 20:
structure_lines.append(f" ... and {len(pages_by_path[depth]) - 20} more")
structure_lines.append("")
return "\n".join(structure_lines)
async def _test_connection(self) -> None:
"""Test connection to the base URL."""
try:
base_url = self.config["base_url"]
response = await self.client.get(base_url)
response.raise_for_status()
logger.debug("Website connection test successful")
except Exception as e:
logger.error(f"Website connection test failed: {e}")
raise ConnectionError(f"Failed to connect to {self.config['base_url']}: {e}")
async def _discover_pages(self) -> None:
"""Discover pages using the configured method."""
method = self.discovery_method.lower()
if method == "sitemap":
await self._discover_via_sitemap()
elif method == "manual":
await self._discover_via_manual_urls()
elif method == "crawl":
await self._discover_via_crawling()
elif method == "feed":
await self._discover_via_feeds()
else:
logger.warning(f"Unknown discovery method: {method}, trying sitemap")
await self._discover_via_sitemap()
# If no pages found, fall back to base URL
if not self.pages_cache:
await self._add_single_page(self.config["base_url"])
async def _discover_via_sitemap(self) -> None:
"""Discover pages through sitemap.xml."""
base_url = self.config["base_url"].rstrip("/")
sitemap_urls = [
f"{base_url}/sitemap.xml",
f"{base_url}/sitemap_index.xml",
f"{base_url}/robots.txt" # Check for sitemap in robots.txt
]
for sitemap_url in sitemap_urls:
try:
await asyncio.sleep(self.rate_limit)
response = await self.client.get(sitemap_url)
if response.status_code == 200:
if sitemap_url.endswith("robots.txt"):
# Extract sitemap URLs from robots.txt
await self._parse_robots_txt(response.text)
else:
# Parse sitemap XML
await self._parse_sitemap(response.text, base_url)
if self.pages_cache: # Stop if we found pages
return
except Exception as e:
logger.debug(f"Failed to fetch {sitemap_url}: {e}")
logger.info("No sitemap found or sitemap empty")
async def _discover_via_manual_urls(self) -> None:
"""Discover pages from manually specified URLs."""
urls = self.config.get("urls", [])
for url in urls:
if self._should_include_url(url):
await self._add_single_page(url)
async def _discover_via_crawling(self) -> None:
"""Discover pages through recursive crawling."""
base_url = self.config["base_url"]
seed_urls = self.config.get("seed_urls", [base_url])
await self._crawl_recursive(seed_urls, 0)
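    # NOTE: _crawl_recursive is called above but was not defined in this file.
    # The sketch below is an assumed implementation: a depth-bounded crawl that
    # follows in-scope links and respects max_pages and rate_limit.
    async def _crawl_recursive(self, urls: List[str], depth: int) -> None:
        """Crawl pages recursively, collecting in-scope links up to crawl_depth."""
        if depth > self.crawl_depth:
            return
        next_urls: List[str] = []
        for url in urls:
            url = url.split("#")[0]
            if url in self.discovered_urls or len(self.pages_cache) >= self.max_pages:
                continue
            self.discovered_urls.add(url)
            if not self._should_include_url(url):
                continue
            try:
                await asyncio.sleep(self.rate_limit)
                response = await self.client.get(url)
                response.raise_for_status()
            except Exception as e:
                logger.debug(f"Failed to crawl {url}: {e}")
                continue
            await self._add_single_page(url)
            # _add_single_page always appends here because the include check already passed
            self.pages_cache[-1]["depth"] = depth
            # Queue in-scope links for the next crawl level
            soup = BeautifulSoup(response.text, 'html.parser')
            for anchor in soup.find_all('a', href=True):
                link = urljoin(url, anchor['href']).split("#")[0]
                if link not in self.discovered_urls and self._should_include_url(link):
                    next_urls.append(link)
        if next_urls and len(self.pages_cache) < self.max_pages:
            await self._crawl_recursive(next_urls, depth + 1)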
async def _discover_via_feeds(self) -> None:
"""Discover pages through RSS/Atom feeds."""
base_url = self.config["base_url"].rstrip("/")
feed_urls = self.config.get("feed_urls", [
f"{base_url}/feed.xml",
f"{base_url}/rss.xml",
f"{base_url}/atom.xml",
f"{base_url}/feed/",
f"{base_url}/rss/"
])
for feed_url in feed_urls:
try:
await asyncio.sleep(self.rate_limit)
response = await self.client.get(feed_url)
if response.status_code == 200:
await self._parse_feed(response.text)
except Exception as e:
logger.debug(f"Failed to fetch feed {feed_url}: {e}")
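    # NOTE: _parse_feed is called above but was not defined in this file. The
    # sketch below is an assumed implementation that handles both RSS (<item>)
    # and Atom (<entry>) documents.
    async def _parse_feed(self, feed_xml: str) -> None:
        """Parse an RSS/Atom feed and add linked pages to the cache."""
        try:
            soup = BeautifulSoup(feed_xml, 'xml')
            for entry in soup.find_all(['item', 'entry']):
                if len(self.pages_cache) >= self.max_pages:
                    break
                link = entry.find('link')
                if not link:
                    continue
                # RSS puts the URL in the tag text; Atom uses an href attribute
                url = (link.get('href') or link.text or "").strip()
                if url:
                    await self._add_single_page(url)
            logger.info(f"Parsed feed: {len(self.pages_cache)} pages cached so far")
        except Exception as e:
            logger.error(f"Error parsing feed: {e}")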
async def _parse_sitemap(self, sitemap_xml: str, base_url: str) -> None:
"""Parse sitemap XML to extract page URLs."""
try:
soup = BeautifulSoup(sitemap_xml, 'xml')
# Handle sitemap index files
sitemaps = soup.find_all('sitemap')
for sitemap in sitemaps:
loc = sitemap.find('loc')
if loc:
await asyncio.sleep(self.rate_limit)
try:
response = await self.client.get(loc.text)
if response.status_code == 200:
await self._parse_sitemap(response.text, base_url)
except Exception as e:
logger.debug(f"Failed to fetch nested sitemap {loc.text}: {e}")
# Handle URL entries
urls = soup.find_all('url')
for url_elem in urls:
loc = url_elem.find('loc')
if loc and self._should_include_url(loc.text):
# Get metadata
lastmod = url_elem.find('lastmod')
last_modified = None
if lastmod:
try:
last_modified = datetime.fromisoformat(lastmod.text.replace('Z', '+00:00'))
except Exception:
pass
# Create page entry
page_url = loc.text
path = urlparse(page_url).path
title = self._generate_title_from_url(page_url)
page = {
"id": path or "home",
"title": title,
"url": page_url,
"path": path,
"last_modified": last_modified
}
self.pages_cache.append(page)
if len(self.pages_cache) >= self.max_pages:
break
logger.info(f"Parsed sitemap: found {len(self.pages_cache)} URLs")
except Exception as e:
logger.error(f"Error parsing sitemap: {e}")
raise
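    # NOTE: _parse_robots_txt is called from _discover_via_sitemap but was not
    # defined in this file. The sketch below is an assumed implementation that
    # follows "Sitemap:" directives found in robots.txt.
    async def _parse_robots_txt(self, robots_txt: str) -> None:
        """Extract sitemap URLs from robots.txt and parse each one."""
        base_url = self.config["base_url"].rstrip("/")
        for line in robots_txt.splitlines():
            line = line.strip()
            if not line.lower().startswith("sitemap:"):
                continue
            sitemap_url = line.split(":", 1)[1].strip()
            if not sitemap_url:
                continue
            try:
                await asyncio.sleep(self.rate_limit)
                response = await self.client.get(sitemap_url)
                if response.status_code == 200:
                    await self._parse_sitemap(response.text, base_url)
            except Exception as e:
                logger.debug(f"Failed to fetch sitemap {sitemap_url} from robots.txt: {e}")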
def _should_include_url(self, url: str) -> bool:
"""Check if URL should be included based on patterns."""
# Check include patterns (if specified)
if self.include_patterns:
if not any(re.search(pattern, url) for pattern in self.include_patterns):
return False
# Check exclude patterns
if any(re.search(pattern, url) for pattern in self.exclude_patterns):
return False
        # Must be under the base URL (normalized without a trailing slash)
        return url.startswith(self.config["base_url"].rstrip("/"))
def _generate_title_from_url(self, url: str) -> str:
"""Generate a readable title from URL."""
path = urlparse(url).path.strip("/")
if not path:
return "Home"
# Convert path to title
parts = path.split("/")
title_parts = []
for part in parts:
# Replace hyphens and underscores with spaces
part = part.replace("-", " ").replace("_", " ")
# Capitalize each word
part = " ".join(word.capitalize() for word in part.split())
title_parts.append(part)
return " > ".join(title_parts)
def _extract_content(self, html: str, url: str) -> str:
"""Extract main content from HTML."""
soup = BeautifulSoup(html, 'html.parser')
# Remove unwanted elements
for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
element.decompose()
# Try content selectors in order
content_elem = None
for selector in self.content_selectors:
content_elem = soup.select_one(selector)
if content_elem:
break
# Fallback to body
if not content_elem:
content_elem = soup.find('body')
if not content_elem:
            return soup.get_text(separator="\n", strip=True)
# Convert to markdown-like format
return self._html_to_markdown(content_elem)
def _html_to_markdown(self, element) -> str:
"""Convert HTML element to markdown-like text."""
if not element:
return ""
# Get text content with basic formatting
text = element.get_text(separator="\n", strip=True)
# Basic cleanup and formatting
lines = [line.strip() for line in text.split("\n") if line.strip()]
# Join paragraphs
return "\n\n".join(lines)
async def _add_single_page(self, url: str) -> None:
"""Add a single page to the cache."""
if not self._should_include_url(url):
return
path = urlparse(url).path
title = self._generate_title_from_url(url)
page = {
"id": path or "home",
"title": title,
"url": url,
"path": path
}
self.pages_cache.append(page)
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
if self.client:
await self.client.aclose()