AnyDocs MCP Server

by funky1688
gitbook.py • 17.2 kB
#!/usr/bin/env python3
"""
GitBook Document Adapter

Adapter for integrating with GitBook documentation platforms.
"""

import asyncio
import json
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse
from datetime import datetime

import httpx
from bs4 import BeautifulSoup
from mcp.types import Resource

from .base import BaseDocumentAdapter, DocumentContent, SearchResult
from ..utils import get_logger

logger = get_logger(__name__)


class GitBookAdapter(BaseDocumentAdapter):
    """Adapter for GitBook documentation.

    Supports both GitBook.com hosted spaces and self-hosted GitBook instances.

    Required configuration:
    - base_url: Base URL of the GitBook space
    - api_token: GitBook API token (optional, for private spaces)
    - space_id: GitBook space ID (optional, auto-detected if not provided)
    """

    def __init__(self, config: Dict[str, Any]):
        super().__init__(config)
        self.client: Optional[httpx.AsyncClient] = None
        self.space_info: Dict[str, Any] = {}
        self.pages_cache: List[Dict[str, Any]] = []

    async def initialize(self) -> None:
        """Initialize the GitBook adapter."""
        logger.info(f"Initializing GitBook adapter for {self.config.get('base_url')}")

        # Validate required configuration
        self._validate_config(["base_url"])

        # Validate base_url format
        base_url = self.config["base_url"]
        if not base_url.startswith(('http://', 'https://')):
            raise ValueError(
                f"Invalid base_url format: {base_url}. Must start with http:// or https://"
            )

        # Setup HTTP client
        headers = {
            "User-Agent": "AnyDocs-MCP/0.1.0",
            "Accept": "application/json"
        }

        # Add API token if provided
        if "api_token" in self.config:
            headers["Authorization"] = f"Bearer {self.config['api_token']}"

        self.client = httpx.AsyncClient(
            headers=headers,
            timeout=30.0,
            follow_redirects=True
        )

        # Test connection and get space info
        await self._fetch_space_info()

        # Build pages cache
        await self._build_pages_cache()

        self._initialized = True
        logger.info(
            f"GitBook adapter initialized successfully for space: "
            f"{self.space_info.get('title', 'Unknown')}"
        )

    async def list_resources(self) -> List[Resource]:
        """List all pages in the GitBook space."""
        self._ensure_initialized()

        resources = []
        for page in self.pages_cache:
            resource = Resource(
                uri=self.get_resource_uri(page["id"]),
                name=page["title"],
                description=page.get("description", ""),
                mimeType="text/markdown"
            )
            resources.append(resource)

        logger.debug(f"Listed {len(resources)} resources from GitBook")
        return resources

    async def get_content(self, resource_path: str) -> DocumentContent:
        """Get content for a specific GitBook page."""
        self._ensure_initialized()

        # Find page in cache
        page = None
        for p in self.pages_cache:
            if p["id"] == resource_path or p["slug"] == resource_path:
                page = p
                break

        if not page:
            raise FileNotFoundError(f"Page not found: {resource_path}")

        # Fetch page content
        try:
            if "api_token" in self.config:
                # Use API if token is available
                content = await self._fetch_page_content_api(page["id"])
            else:
                # Fallback to web scraping
                content = await self._fetch_page_content_web(page["url"])

            return DocumentContent(
                title=page["title"],
                content=content,
                url=page["url"],
                source=self.name,
                content_type="text/markdown",
                last_modified=page.get("updatedAt"),
                metadata={
                    "page_id": page["id"],
                    "slug": page.get("slug", ""),
                    "parent": page.get("parent", "")
                }
            )
        except Exception as e:
            logger.error(f"Error fetching content for page {resource_path}: {e}")
            raise

    async def search(self, query: str, limit: int = 10) -> List[SearchResult]:
        """Search GitBook pages."""
        self._ensure_initialized()

        if not query.strip():
            raise ValueError("Search query cannot be empty")

        results = []
        query_lower = query.lower()

        # Simple text-based search through cached pages
        for page in self.pages_cache:
            score = 0.0

            # Check title match
            if query_lower in page["title"].lower():
                score += 10.0

            # Check description match
            if "description" in page and query_lower in page["description"].lower():
                score += 5.0

            # If we have content, search in it
            if "content" in page and query_lower in page["content"].lower():
                score += 2.0

            if score > 0:
                # Get content snippet
                content = page.get("content", page.get("description", ""))
                if len(content) > 200:
                    # Find query context
                    query_pos = content.lower().find(query_lower)
                    if query_pos >= 0:
                        start = max(0, query_pos - 100)
                        end = min(len(content), query_pos + 100)
                        content = "..." + content[start:end] + "..."
                    else:
                        content = content[:200] + "..."

                results.append(SearchResult(
                    title=page["title"],
                    content=content,
                    url=page["url"],
                    source=self.name,
                    score=score,
                    metadata={
                        "page_id": page["id"],
                        "slug": page.get("slug", "")
                    }
                ))

        # Sort by score and limit results
        results.sort(key=lambda x: x.score, reverse=True)
        return results[:limit]

    async def get_structure(self) -> str:
        """Get the structure of the GitBook space."""
        self._ensure_initialized()

        structure_lines = [f"# {self.space_info.get('title', 'GitBook Space')}"]
        structure_lines.append("")

        # Build hierarchical structure
        root_pages = [p for p in self.pages_cache if not p.get("parent")]

        def add_page_to_structure(page: Dict[str, Any], indent: int = 0):
            prefix = "  " * indent + "- "
            structure_lines.append(f"{prefix}[{page['title']}]({page['url']})")

            # Add child pages
            child_pages = [p for p in self.pages_cache if p.get("parent") == page["id"]]
            for child in child_pages:
                add_page_to_structure(child, indent + 1)

        for page in root_pages:
            add_page_to_structure(page)

        return "\n".join(structure_lines)

    async def _fetch_space_info(self) -> None:
        """Fetch information about the GitBook space."""
        base_url = self.config["base_url"].rstrip("/")

        try:
            # Try API first if token is available
            if "api_token" in self.config and "space_id" in self.config:
                api_url = f"https://api.gitbook.com/v1/spaces/{self.config['space_id']}"
                response = await self.client.get(api_url)
                response.raise_for_status()
                self.space_info = response.json()
            else:
                # Fallback to web scraping
                response = await self.client.get(base_url)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')
                title_tag = soup.find('title')

                self.space_info = {
                    "title": title_tag.text if title_tag else "GitBook Space",
                    "url": base_url
                }
        except Exception as e:
            logger.error(f"Error fetching space info: {e}")
            self.space_info = {"title": "GitBook Space", "url": base_url}

    async def _build_pages_cache(self) -> None:
        """Build cache of all pages in the space."""
        base_url = self.config["base_url"].rstrip("/")

        try:
            # Try API first if available
            if "api_token" in self.config and "space_id" in self.config:
                await self._build_pages_cache_api()
            else:
                # Fallback to sitemap or web crawling
                await self._build_pages_cache_web(base_url)
        except Exception as e:
            logger.error(f"Error building pages cache: {e}")
            # Create minimal cache with just the home page
            self.pages_cache = [{
                "id": "home",
                "title": self.space_info.get("title", "Home"),
                "url": base_url,
                "slug": ""
            }]

    async def _build_pages_cache_api(self) -> None:
        """Build pages cache using GitBook API."""
        api_url = f"https://api.gitbook.com/v1/spaces/{self.config['space_id']}/content"

        try:
            response = await self.client.get(api_url)
            response.raise_for_status()
            data = response.json()

            self.pages_cache = []

            def process_page(page_data: Dict[str, Any], parent_id: str = None):
                page = {
                    "id": page_data["id"],
                    "title": page_data["title"],
                    "url": page_data.get("urls", {}).get("public", ""),
                    "slug": page_data.get("slug", ""),
                    "description": page_data.get("description", ""),
                    "updatedAt": page_data.get("updatedAt"),
                    "parent": parent_id
                }
                self.pages_cache.append(page)

                # Process child pages
                for child in page_data.get("pages", []):
                    process_page(child, page_data["id"])

            # Process all pages
            for page in data.get("pages", []):
                process_page(page)
        except Exception as e:
            logger.error(f"Error building pages cache via API: {e}")
            raise

    async def _build_pages_cache_web(self, base_url: str) -> None:
        """Build pages cache by web crawling."""
        # Try to get sitemap first
        sitemap_urls = [f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml"]

        for sitemap_url in sitemap_urls:
            try:
                response = await self.client.get(sitemap_url)
                if response.status_code == 200:
                    await self._parse_sitemap(response.text, base_url)
                    return
            except Exception:
                continue

        # Fallback: create basic cache with home page
        self.pages_cache = [{
            "id": "home",
            "title": self.space_info.get("title", "Home"),
            "url": base_url,
            "slug": ""
        }]

    async def _parse_sitemap(self, sitemap_xml: str, base_url: str) -> None:
        """Parse sitemap XML to extract page URLs."""
        try:
            soup = BeautifulSoup(sitemap_xml, 'xml')
            urls = soup.find_all('url')

            self.pages_cache = []

            for url_elem in urls:
                loc = url_elem.find('loc')
                if loc:
                    url = loc.text
                    if url.startswith(base_url):
                        # Extract page info from URL
                        path = url[len(base_url):].strip('/')
                        title = path.replace('-', ' ').replace('/', ' > ').title() or "Home"

                        page = {
                            "id": path or "home",
                            "title": title,
                            "url": url,
                            "slug": path
                        }

                        # Try to get last modified date
                        lastmod = url_elem.find('lastmod')
                        if lastmod:
                            try:
                                page["updatedAt"] = datetime.fromisoformat(
                                    lastmod.text.replace('Z', '+00:00')
                                )
                            except Exception:
                                pass

                        self.pages_cache.append(page)
        except Exception as e:
            logger.error(f"Error parsing sitemap: {e}")
            raise

    async def _fetch_page_content_api(self, page_id: str) -> str:
        """Fetch page content using GitBook API."""
        api_url = f"https://api.gitbook.com/v1/spaces/{self.config['space_id']}/content/{page_id}"

        try:
            response = await self.client.get(api_url)
            response.raise_for_status()
            data = response.json()

            # Convert GitBook content to markdown
            return self._convert_gitbook_content_to_markdown(data.get("document", {}))
        except Exception as e:
            logger.error(f"Error fetching page content via API: {e}")
            raise

    async def _fetch_page_content_web(self, page_url: str) -> str:
        """Fetch page content by web scraping."""
        try:
            response = await self.client.get(page_url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Find main content area (GitBook specific selectors)
            content_selectors = [
                '[data-testid="page-content"]',
                '.page-content',
                'main',
                '.content'
            ]

            content_elem = None
            for selector in content_selectors:
                content_elem = soup.select_one(selector)
                if content_elem:
                    break

            if not content_elem:
                content_elem = soup.find('body')

            # Convert HTML to markdown-like text
            return self._html_to_markdown(content_elem)
        except Exception as e:
            logger.error(f"Error fetching page content via web: {e}")
            raise

    def _convert_gitbook_content_to_markdown(self, document: Dict[str, Any]) -> str:
        """Convert GitBook document structure to markdown."""
        # This is a simplified converter - GitBook's document format is complex
        # In a production implementation, you'd want a more sophisticated converter

        def process_node(node: Dict[str, Any]) -> str:
            node_type = node.get("type", "")
            text = ""

            if node_type == "paragraph":
                text += "\n\n"
            elif node_type == "heading":
                level = node.get("data", {}).get("level", 1)
                text += "\n\n" + "#" * level + " "
            elif node_type == "list":
                text += "\n\n"
            elif node_type == "list-item":
                text += "- "
            elif node_type == "code-block":
                text += "\n\n```\n"

            # Process text content
            if "text" in node:
                text += node["text"]

            # Process child nodes
            for child in node.get("nodes", []):
                text += process_node(child)

            if node_type == "code-block":
                text += "\n```\n\n"

            return text

        return process_node(document).strip()

    def _html_to_markdown(self, element) -> str:
        """Convert HTML element to markdown-like text."""
        if not element:
            return ""

        # Remove script and style elements
        for script in element(["script", "style"]):
            script.decompose()

        # Get text content with some basic formatting
        text = element.get_text(separator="\n", strip=True)

        # Basic cleanup
        lines = [line.strip() for line in text.split("\n") if line.strip()]
        return "\n\n".join(lines)

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.client:
            await self.client.aclose()
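A minimal usage sketch is shown below. The import path, package name, and the example base URL are assumptions for illustration only; the adapter's actual location within the AnyDocs-MCP package may differ.

# usage_example.py -- a sketch, not part of gitbook.py.
# Assumes GitBookAdapter is importable from the project's adapters package;
# the module path below is hypothetical.
import asyncio

from anydocs_mcp.adapters.gitbook import GitBookAdapter  # hypothetical import path


async def main() -> None:
    config = {
        "base_url": "https://docs.example.com",  # your GitBook space URL (placeholder)
        # "api_token": "...",                    # optional, enables the GitBook API path
        # "space_id": "...",                     # optional, required together with api_token
    }

    # __aexit__ closes the underlying httpx.AsyncClient when the block ends
    async with GitBookAdapter(config) as adapter:
        await adapter.initialize()

        # List every page the adapter discovered (API, sitemap, or home-page fallback)
        for resource in await adapter.list_resources():
            print(resource.name, resource.uri)

        # Simple text search over the cached pages
        for hit in await adapter.search("installation", limit=5):
            print(f"{hit.score:5.1f}  {hit.title}  {hit.url}")


asyncio.run(main())

Note that initialize() must be called explicitly; entering the async context manager only scopes the HTTP client's lifetime.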

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/funky1688/AnyDocs-MCP'
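The same request can be made from Python, for example with httpx. This is only a sketch: the endpoint returns JSON per the directory documentation, but the exact response schema is not shown here, so inspect the payload before relying on specific fields.

# Sketch: fetch this server's entry from the Glama MCP directory API.
# The response structure is an assumption; print it first to see the fields.
import httpx

response = httpx.get("https://glama.ai/api/mcp/v1/servers/funky1688/AnyDocs-MCP")
response.raise_for_status()
print(response.json())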

If you have feedback or need assistance with the MCP directory API, please join our Discord server.