#!/usr/bin/env python3
"""
Generic Website Documentation Adapter
Adapter for scraping documentation from any website that doesn't have a specific API.
Supports automatic discovery through sitemaps, RSS/Atom feeds, recursive crawling, or manual URL lists.
"""
import asyncio
from typing import Any, Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse
from datetime import datetime
import re
import httpx
from bs4 import BeautifulSoup
from mcp.types import Resource
from .base import BaseDocumentAdapter, DocumentContent, SearchResult
from ..utils import get_logger
logger = get_logger(__name__)
class WebsiteAdapter(BaseDocumentAdapter):
"""Generic adapter for scraping documentation websites.
This adapter can work with any website containing documentation,
automatically discovering pages through various methods:
- Sitemap parsing (sitemap.xml)
- RSS/Atom feeds
- Manual URL patterns
- Recursive crawling from seed URLs
Configuration options:
- base_url: Base URL of the documentation site (required)
- discovery_method: "sitemap", "manual", "crawl", or "feed"
- url_patterns: List of URL patterns to include/exclude
- content_selectors: CSS selectors for content extraction
- max_pages: Maximum number of pages to process
- crawl_depth: Maximum depth for recursive crawling
- rate_limit: Delay between requests (seconds)
"""
def __init__(self, config: Dict[str, Any]):
super().__init__(config)
self.client: Optional[httpx.AsyncClient] = None
self.pages_cache: List[Dict[str, Any]] = []
self.discovered_urls: Set[str] = set()
# Default configuration
self.discovery_method = config.get("discovery_method", "sitemap")
self.max_pages = config.get("max_pages", 1000)
self.crawl_depth = config.get("crawl_depth", 3)
self.rate_limit = config.get("rate_limit", 1.0)
# Content extraction selectors (prioritized)
self.content_selectors = config.get("content_selectors", [
"main", "article", ".content", ".documentation",
".docs-content", ".doc-content", "#content",
".markdown-body", ".post-content", ".entry-content"
])
# URL filtering patterns
self.include_patterns = config.get("include_patterns", [])
self.exclude_patterns = config.get("exclude_patterns", [
r"/api/", r"/admin/", r"/login", r"/register",
r"\.(pdf|jpg|png|gif|svg|css|js)$"
])
async def initialize(self) -> None:
"""Initialize the website adapter."""
logger.info(f"Initializing Website adapter for {self.config.get('base_url')}")
# Validate required configuration
self._validate_config(["base_url"])
base_url = self.config["base_url"].rstrip("/")
# Setup HTTP client
headers = {
"User-Agent": "AnyDocs-MCP/1.0 (+https://github.com/anydocs-mcp)",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate",
"DNT": "1",
"Connection": "keep-alive"
}
# Add custom headers if provided
if "headers" in self.config:
headers.update(self.config["headers"])
self.client = httpx.AsyncClient(
headers=headers,
timeout=30.0,
follow_redirects=True,
limits=httpx.Limits(max_connections=5, max_keepalive_connections=2)
)
# Test connection to base URL
await self._test_connection()
# Discover pages based on method
await self._discover_pages()
self._initialized = True
logger.info(f"Website adapter initialized: discovered {len(self.pages_cache)} pages")
async def list_resources(self) -> List[Resource]:
"""List all discovered pages."""
self._ensure_initialized()
resources = []
for page in self.pages_cache:
resource = Resource(
uri=self.get_resource_uri(page["id"]),
name=page["title"],
description=page.get("description", f"Page from {self.config['base_url']}"),
mimeType="text/markdown"
)
resources.append(resource)
logger.debug(f"Listed {len(resources)} resources from website")
return resources
async def get_content(self, resource_path: str) -> DocumentContent:
"""Get content for a specific page."""
self._ensure_initialized()
# Find page in cache
page = None
for p in self.pages_cache:
if p["id"] == resource_path or p["url"] == resource_path:
page = p
break
if not page:
raise FileNotFoundError(f"Page not found: {resource_path}")
# Fetch and extract content
try:
await asyncio.sleep(self.rate_limit) # Rate limiting
response = await self.client.get(page["url"])
response.raise_for_status()
# Extract content from HTML
content = self._extract_content(response.text, page["url"])
return DocumentContent(
title=page["title"],
content=content,
url=page["url"],
source=self.name,
content_type="text/markdown",
last_modified=page.get("last_modified"),
metadata={
"page_id": page["id"],
"path": page.get("path", ""),
"depth": page.get("depth", 0)
}
)
except Exception as e:
logger.error(f"Error fetching content for {page['url']}: {e}")
raise
async def search(self, query: str, limit: int = 10) -> List[SearchResult]:
"""Search through cached page content."""
self._ensure_initialized()
if not query.strip():
return []
results = []
query_lower = query.lower()
# Simple text-based search (could be enhanced with proper indexing)
for page in self.pages_cache:
score = 0.0
# Check title match (higher weight)
if query_lower in page["title"].lower():
score += 10.0
# Check URL path match
if query_lower in page["url"].lower():
score += 5.0
# Check description match
description = page.get("description", "")
if query_lower in description.lower():
score += 3.0
if score > 0:
# Get content excerpt
try:
                    # Use the cached description as a lightweight snippet (avoids an extra fetch per result)
content_snippet = description or page["title"]
if len(content_snippet) > 200:
content_snippet = content_snippet[:200] + "..."
results.append(SearchResult(
title=page["title"],
content=content_snippet,
url=page["url"],
source=self.name,
score=score,
metadata={
"page_id": page["id"],
"path": page.get("path", "")
}
))
except Exception as e:
logger.error(f"Error creating search result for {page['url']}: {e}")
# Sort by score and limit results
results.sort(key=lambda x: x.score, reverse=True)
return results[:limit]
async def get_structure(self) -> str:
"""Get the hierarchical structure of discovered pages."""
self._ensure_initialized()
base_url = self.config["base_url"]
structure_lines = [f"# Documentation from {base_url}"]
structure_lines.append("")
structure_lines.append(f"**Discovery Method:** {self.discovery_method}")
structure_lines.append(f"**Total Pages:** {len(self.pages_cache)}")
structure_lines.append("")
# Group pages by path depth or domain structure
pages_by_path = {}
for page in self.pages_cache:
path_parts = urlparse(page["url"]).path.strip("/").split("/")
depth = len([p for p in path_parts if p]) # Count non-empty parts
if depth not in pages_by_path:
pages_by_path[depth] = []
pages_by_path[depth].append(page)
# Build hierarchical structure
for depth in sorted(pages_by_path.keys()):
if depth == 0:
structure_lines.append("## Root Pages")
else:
structure_lines.append(f"## Level {depth} Pages")
structure_lines.append("")
for page in pages_by_path[depth][:20]: # Limit to first 20 per level
indent = " " * min(depth, 3) # Max 3 levels of indent
structure_lines.append(f"{indent}- [{page['title']}]({page['url']})")
if len(pages_by_path[depth]) > 20:
structure_lines.append(f" ... and {len(pages_by_path[depth]) - 20} more")
structure_lines.append("")
return "\n".join(structure_lines)
async def _test_connection(self) -> None:
"""Test connection to the base URL."""
try:
base_url = self.config["base_url"]
response = await self.client.get(base_url)
response.raise_for_status()
logger.debug("Website connection test successful")
except Exception as e:
logger.error(f"Website connection test failed: {e}")
raise ConnectionError(f"Failed to connect to {self.config['base_url']}: {e}")
async def _discover_pages(self) -> None:
"""Discover pages using the configured method."""
method = self.discovery_method.lower()
if method == "sitemap":
await self._discover_via_sitemap()
elif method == "manual":
await self._discover_via_manual_urls()
elif method == "crawl":
await self._discover_via_crawling()
elif method == "feed":
await self._discover_via_feeds()
else:
logger.warning(f"Unknown discovery method: {method}, trying sitemap")
await self._discover_via_sitemap()
# If no pages found, fall back to base URL
if not self.pages_cache:
await self._add_single_page(self.config["base_url"])
async def _discover_via_sitemap(self) -> None:
"""Discover pages through sitemap.xml."""
base_url = self.config["base_url"].rstrip("/")
sitemap_urls = [
f"{base_url}/sitemap.xml",
f"{base_url}/sitemap_index.xml",
f"{base_url}/robots.txt" # Check for sitemap in robots.txt
]
for sitemap_url in sitemap_urls:
try:
await asyncio.sleep(self.rate_limit)
response = await self.client.get(sitemap_url)
if response.status_code == 200:
if sitemap_url.endswith("robots.txt"):
# Extract sitemap URLs from robots.txt
await self._parse_robots_txt(response.text)
else:
# Parse sitemap XML
await self._parse_sitemap(response.text, base_url)
if self.pages_cache: # Stop if we found pages
return
except Exception as e:
logger.debug(f"Failed to fetch {sitemap_url}: {e}")
logger.info("No sitemap found or sitemap empty")
async def _discover_via_manual_urls(self) -> None:
"""Discover pages from manually specified URLs."""
urls = self.config.get("urls", [])
for url in urls:
if self._should_include_url(url):
await self._add_single_page(url)
async def _discover_via_crawling(self) -> None:
"""Discover pages through recursive crawling."""
base_url = self.config["base_url"]
seed_urls = self.config.get("seed_urls", [base_url])
await self._crawl_recursive(seed_urls, 0)
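    # NOTE: _crawl_recursive is called above but was not defined in this file.
    # The sketch below is an assumed implementation: a depth-bounded crawl that
    # follows in-scope links and respects max_pages and rate_limit.
    async def _crawl_recursive(self, urls: List[str], depth: int) -> None:
        """Crawl pages recursively, collecting in-scope links up to crawl_depth."""
        if depth > self.crawl_depth:
            return
        next_urls: List[str] = []
        for url in urls:
            url = url.split("#")[0]
            if url in self.discovered_urls or len(self.pages_cache) >= self.max_pages:
                continue
            self.discovered_urls.add(url)
            if not self._should_include_url(url):
                continue
            try:
                await asyncio.sleep(self.rate_limit)
                response = await self.client.get(url)
                response.raise_for_status()
            except Exception as e:
                logger.debug(f"Failed to crawl {url}: {e}")
                continue
            await self._add_single_page(url)
            # _add_single_page always appends here because the include check already passed
            self.pages_cache[-1]["depth"] = depth
            # Queue in-scope links for the next crawl level
            soup = BeautifulSoup(response.text, 'html.parser')
            for anchor in soup.find_all('a', href=True):
                link = urljoin(url, anchor['href']).split("#")[0]
                if link not in self.discovered_urls and self._should_include_url(link):
                    next_urls.append(link)
        if next_urls and len(self.pages_cache) < self.max_pages:
            await self._crawl_recursive(next_urls, depth + 1)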
async def _discover_via_feeds(self) -> None:
"""Discover pages through RSS/Atom feeds."""
base_url = self.config["base_url"].rstrip("/")
feed_urls = self.config.get("feed_urls", [
f"{base_url}/feed.xml",
f"{base_url}/rss.xml",
f"{base_url}/atom.xml",
f"{base_url}/feed/",
f"{base_url}/rss/"
])
for feed_url in feed_urls:
try:
await asyncio.sleep(self.rate_limit)
response = await self.client.get(feed_url)
if response.status_code == 200:
await self._parse_feed(response.text)
except Exception as e:
logger.debug(f"Failed to fetch feed {feed_url}: {e}")
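    # NOTE: _parse_feed is called above but was not defined in this file. The
    # sketch below is an assumed implementation that handles both RSS (<item>)
    # and Atom (<entry>) documents.
    async def _parse_feed(self, feed_xml: str) -> None:
        """Parse an RSS/Atom feed and add linked pages to the cache."""
        try:
            soup = BeautifulSoup(feed_xml, 'xml')
            for entry in soup.find_all(['item', 'entry']):
                if len(self.pages_cache) >= self.max_pages:
                    break
                link = entry.find('link')
                if not link:
                    continue
                # RSS puts the URL in the tag text; Atom uses an href attribute
                url = (link.get('href') or link.text or "").strip()
                if url:
                    await self._add_single_page(url)
            logger.info(f"Parsed feed: {len(self.pages_cache)} pages cached so far")
        except Exception as e:
            logger.error(f"Error parsing feed: {e}")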
async def _parse_sitemap(self, sitemap_xml: str, base_url: str) -> None:
"""Parse sitemap XML to extract page URLs."""
try:
soup = BeautifulSoup(sitemap_xml, 'xml')
# Handle sitemap index files
sitemaps = soup.find_all('sitemap')
for sitemap in sitemaps:
loc = sitemap.find('loc')
if loc:
await asyncio.sleep(self.rate_limit)
try:
response = await self.client.get(loc.text)
if response.status_code == 200:
await self._parse_sitemap(response.text, base_url)
except Exception as e:
logger.debug(f"Failed to fetch nested sitemap {loc.text}: {e}")
# Handle URL entries
urls = soup.find_all('url')
for url_elem in urls:
loc = url_elem.find('loc')
if loc and self._should_include_url(loc.text):
# Get metadata
lastmod = url_elem.find('lastmod')
last_modified = None
if lastmod:
try:
last_modified = datetime.fromisoformat(lastmod.text.replace('Z', '+00:00'))
except Exception:
pass
# Create page entry
page_url = loc.text
path = urlparse(page_url).path
title = self._generate_title_from_url(page_url)
page = {
"id": path or "home",
"title": title,
"url": page_url,
"path": path,
"last_modified": last_modified
}
self.pages_cache.append(page)
if len(self.pages_cache) >= self.max_pages:
break
logger.info(f"Parsed sitemap: found {len(self.pages_cache)} URLs")
except Exception as e:
logger.error(f"Error parsing sitemap: {e}")
raise
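    # NOTE: _parse_robots_txt is called from _discover_via_sitemap but was not
    # defined in this file. The sketch below is an assumed implementation that
    # follows "Sitemap:" directives found in robots.txt.
    async def _parse_robots_txt(self, robots_txt: str) -> None:
        """Extract sitemap URLs from robots.txt and parse each one."""
        base_url = self.config["base_url"].rstrip("/")
        for line in robots_txt.splitlines():
            line = line.strip()
            if not line.lower().startswith("sitemap:"):
                continue
            sitemap_url = line.split(":", 1)[1].strip()
            if not sitemap_url:
                continue
            try:
                await asyncio.sleep(self.rate_limit)
                response = await self.client.get(sitemap_url)
                if response.status_code == 200:
                    await self._parse_sitemap(response.text, base_url)
            except Exception as e:
                logger.debug(f"Failed to fetch sitemap {sitemap_url} from robots.txt: {e}")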
def _should_include_url(self, url: str) -> bool:
"""Check if URL should be included based on patterns."""
# Check include patterns (if specified)
if self.include_patterns:
if not any(re.search(pattern, url) for pattern in self.include_patterns):
return False
# Check exclude patterns
if any(re.search(pattern, url) for pattern in self.exclude_patterns):
return False
        # Must be under the base URL (normalized without a trailing slash)
        return url.startswith(self.config["base_url"].rstrip("/"))
def _generate_title_from_url(self, url: str) -> str:
"""Generate a readable title from URL."""
path = urlparse(url).path.strip("/")
if not path:
return "Home"
# Convert path to title
parts = path.split("/")
title_parts = []
for part in parts:
# Replace hyphens and underscores with spaces
part = part.replace("-", " ").replace("_", " ")
# Capitalize each word
part = " ".join(word.capitalize() for word in part.split())
title_parts.append(part)
return " > ".join(title_parts)
def _extract_content(self, html: str, url: str) -> str:
"""Extract main content from HTML."""
soup = BeautifulSoup(html, 'html.parser')
# Remove unwanted elements
for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
element.decompose()
# Try content selectors in order
content_elem = None
for selector in self.content_selectors:
content_elem = soup.select_one(selector)
if content_elem:
break
# Fallback to body
if not content_elem:
content_elem = soup.find('body')
if not content_elem:
            return soup.get_text(separator="\n", strip=True)
# Convert to markdown-like format
return self._html_to_markdown(content_elem)
def _html_to_markdown(self, element) -> str:
"""Convert HTML element to markdown-like text."""
if not element:
return ""
# Get text content with basic formatting
text = element.get_text(separator="\n", strip=True)
# Basic cleanup and formatting
lines = [line.strip() for line in text.split("\n") if line.strip()]
# Join paragraphs
return "\n\n".join(lines)
async def _add_single_page(self, url: str) -> None:
"""Add a single page to the cache."""
if not self._should_include_url(url):
return
path = urlparse(url).path
title = self._generate_title_from_url(url)
page = {
"id": path or "home",
"title": title,
"url": url,
"path": path
}
self.pages_cache.append(page)
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
if self.client:
await self.client.aclose()