#!/usr/bin/env python3
"""
GitBook Document Adapter
Adapter for integrating with GitBook documentation platforms.
"""
from typing import Any, Dict, List, Optional
from datetime import datetime
import httpx
from bs4 import BeautifulSoup
from mcp.types import Resource
from .base import BaseDocumentAdapter, DocumentContent, SearchResult
from ..utils import get_logger
logger = get_logger(__name__)
class GitBookAdapter(BaseDocumentAdapter):
"""Adapter for GitBook documentation.
Supports both GitBook.com hosted spaces and self-hosted GitBook instances.
Required configuration:
- base_url: Base URL of the GitBook space
- api_token: GitBook API token (optional, for private spaces)
- space_id: GitBook space ID (optional, auto-detected if not provided)
"""
def __init__(self, config: Dict[str, Any]):
super().__init__(config)
self.client: Optional[httpx.AsyncClient] = None
self.space_info: Dict[str, Any] = {}
self.pages_cache: List[Dict[str, Any]] = []
async def initialize(self) -> None:
"""Initialize the GitBook adapter."""
logger.info(f"Initializing GitBook adapter for {self.config.get('base_url')}")
# Validate required configuration
self._validate_config(["base_url"])
# Validate base_url format
base_url = self.config["base_url"]
if not base_url.startswith(('http://', 'https://')):
raise ValueError(f"Invalid base_url format: {base_url}. Must start with http:// or https://")
# Setup HTTP client
headers = {
"User-Agent": "AnyDocs-MCP/0.1.0",
"Accept": "application/json"
}
# Add API token if provided
if "api_token" in self.config:
headers["Authorization"] = f"Bearer {self.config['api_token']}"
self.client = httpx.AsyncClient(
headers=headers,
timeout=30.0,
follow_redirects=True
)
# Test connection and get space info
await self._fetch_space_info()
# Build pages cache
await self._build_pages_cache()
self._initialized = True
logger.info(f"GitBook adapter initialized successfully for space: {self.space_info.get('title', 'Unknown')}")
async def list_resources(self) -> List[Resource]:
"""List all pages in the GitBook space."""
self._ensure_initialized()
resources = []
for page in self.pages_cache:
resource = Resource(
uri=self.get_resource_uri(page["id"]),
name=page["title"],
description=page.get("description", ""),
mimeType="text/markdown"
)
resources.append(resource)
logger.debug(f"Listed {len(resources)} resources from GitBook")
return resources
async def get_content(self, resource_path: str) -> DocumentContent:
"""Get content for a specific GitBook page."""
self._ensure_initialized()
# Find page in cache
page = None
for p in self.pages_cache:
if p["id"] == resource_path or p["slug"] == resource_path:
page = p
break
if not page:
raise FileNotFoundError(f"Page not found: {resource_path}")
# Fetch page content
try:
if "api_token" in self.config:
# Use API if token is available
content = await self._fetch_page_content_api(page["id"])
else:
# Fallback to web scraping
content = await self._fetch_page_content_web(page["url"])
return DocumentContent(
title=page["title"],
content=content,
url=page["url"],
source=self.name,
content_type="text/markdown",
last_modified=page.get("updatedAt"),
metadata={
"page_id": page["id"],
"slug": page.get("slug", ""),
"parent": page.get("parent", "")
}
)
except Exception as e:
logger.error(f"Error fetching content for page {resource_path}: {e}")
raise
async def search(self, query: str, limit: int = 10) -> List[SearchResult]:
"""Search GitBook pages."""
self._ensure_initialized()
if not query.strip():
raise ValueError("Search query cannot be empty")
results = []
query_lower = query.lower()
# Simple text-based search through cached pages
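        # Scoring is a heuristic: a substring match in the title weighs most,
        # then the description, then any page content already in the cache.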
for page in self.pages_cache:
score = 0.0
# Check title match
if query_lower in page["title"].lower():
score += 10.0
# Check description match
if "description" in page and query_lower in page["description"].lower():
score += 5.0
# If we have content, search in it
if "content" in page and query_lower in page["content"].lower():
score += 2.0
if score > 0:
# Get content snippet
content = page.get("content", page.get("description", ""))
if len(content) > 200:
# Find query context
query_pos = content.lower().find(query_lower)
if query_pos >= 0:
start = max(0, query_pos - 100)
end = min(len(content), query_pos + 100)
content = "..." + content[start:end] + "..."
else:
content = content[:200] + "..."
results.append(SearchResult(
title=page["title"],
content=content,
url=page["url"],
source=self.name,
score=score,
metadata={
"page_id": page["id"],
"slug": page.get("slug", "")
}
))
# Sort by score and limit results
results.sort(key=lambda x: x.score, reverse=True)
return results[:limit]
async def get_structure(self) -> str:
"""Get the structure of the GitBook space."""
self._ensure_initialized()
structure_lines = [f"# {self.space_info.get('title', 'GitBook Space')}"]
structure_lines.append("")
# Build hierarchical structure
root_pages = [p for p in self.pages_cache if not p.get("parent")]
def add_page_to_structure(page: Dict[str, Any], indent: int = 0):
prefix = " " * indent + "- "
structure_lines.append(f"{prefix}[{page['title']}]({page['url']})")
# Add child pages
child_pages = [p for p in self.pages_cache if p.get("parent") == page["id"]]
for child in child_pages:
add_page_to_structure(child, indent + 1)
for page in root_pages:
add_page_to_structure(page)
return "\n".join(structure_lines)
async def _fetch_space_info(self) -> None:
"""Fetch information about the GitBook space."""
base_url = self.config["base_url"].rstrip("/")
try:
# Try API first if token is available
if "api_token" in self.config and "space_id" in self.config:
api_url = f"https://api.gitbook.com/v1/spaces/{self.config['space_id']}"
response = await self.client.get(api_url)
response.raise_for_status()
self.space_info = response.json()
else:
# Fallback to web scraping
response = await self.client.get(base_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
title_tag = soup.find('title')
self.space_info = {
"title": title_tag.text if title_tag else "GitBook Space",
"url": base_url
}
except Exception as e:
logger.error(f"Error fetching space info: {e}")
self.space_info = {"title": "GitBook Space", "url": base_url}
async def _build_pages_cache(self) -> None:
"""Build cache of all pages in the space."""
base_url = self.config["base_url"].rstrip("/")
try:
# Try API first if available
if "api_token" in self.config and "space_id" in self.config:
await self._build_pages_cache_api()
else:
# Fallback to sitemap or web crawling
await self._build_pages_cache_web(base_url)
except Exception as e:
logger.error(f"Error building pages cache: {e}")
# Create minimal cache with just the home page
self.pages_cache = [{
"id": "home",
"title": self.space_info.get("title", "Home"),
"url": base_url,
"slug": ""
}]
async def _build_pages_cache_api(self) -> None:
"""Build pages cache using GitBook API."""
api_url = f"https://api.gitbook.com/v1/spaces/{self.config['space_id']}/content"
try:
response = await self.client.get(api_url)
response.raise_for_status()
data = response.json()
self.pages_cache = []
            def process_page(page_data: Dict[str, Any], parent_id: Optional[str] = None) -> None:
page = {
"id": page_data["id"],
"title": page_data["title"],
"url": page_data.get("urls", {}).get("public", ""),
"slug": page_data.get("slug", ""),
"description": page_data.get("description", ""),
"updatedAt": page_data.get("updatedAt"),
"parent": parent_id
}
self.pages_cache.append(page)
# Process child pages
for child in page_data.get("pages", []):
process_page(child, page_data["id"])
# Process all pages
for page in data.get("pages", []):
process_page(page)
except Exception as e:
logger.error(f"Error building pages cache via API: {e}")
raise
async def _build_pages_cache_web(self, base_url: str) -> None:
"""Build pages cache by web crawling."""
# Try to get sitemap first
sitemap_urls = [f"{base_url}/sitemap.xml", f"{base_url}/sitemap_index.xml"]
for sitemap_url in sitemap_urls:
try:
response = await self.client.get(sitemap_url)
if response.status_code == 200:
await self._parse_sitemap(response.text, base_url)
return
except Exception:
continue
# Fallback: create basic cache with home page
self.pages_cache = [{
"id": "home",
"title": self.space_info.get("title", "Home"),
"url": base_url,
"slug": ""
}]
async def _parse_sitemap(self, sitemap_xml: str, base_url: str) -> None:
"""Parse sitemap XML to extract page URLs."""
try:
soup = BeautifulSoup(sitemap_xml, 'xml')
urls = soup.find_all('url')
self.pages_cache = []
for url_elem in urls:
loc = url_elem.find('loc')
if loc:
url = loc.text
if url.startswith(base_url):
# Extract page info from URL
path = url[len(base_url):].strip('/')
title = path.replace('-', ' ').replace('/', ' > ').title() or "Home"
page = {
"id": path or "home",
"title": title,
"url": url,
"slug": path
}
# Try to get last modified date
lastmod = url_elem.find('lastmod')
if lastmod:
try:
page["updatedAt"] = datetime.fromisoformat(lastmod.text.replace('Z', '+00:00'))
except Exception:
pass
self.pages_cache.append(page)
except Exception as e:
logger.error(f"Error parsing sitemap: {e}")
raise
async def _fetch_page_content_api(self, page_id: str) -> str:
"""Fetch page content using GitBook API."""
api_url = f"https://api.gitbook.com/v1/spaces/{self.config['space_id']}/content/{page_id}"
try:
response = await self.client.get(api_url)
response.raise_for_status()
data = response.json()
# Convert GitBook content to markdown
return self._convert_gitbook_content_to_markdown(data.get("document", {}))
except Exception as e:
logger.error(f"Error fetching page content via API: {e}")
raise
async def _fetch_page_content_web(self, page_url: str) -> str:
"""Fetch page content by web scraping."""
try:
response = await self.client.get(page_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Find main content area (GitBook specific selectors)
content_selectors = [
'[data-testid="page-content"]',
'.page-content',
'main',
'.content'
]
content_elem = None
for selector in content_selectors:
content_elem = soup.select_one(selector)
if content_elem:
break
if not content_elem:
content_elem = soup.find('body')
# Convert HTML to markdown-like text
return self._html_to_markdown(content_elem)
except Exception as e:
logger.error(f"Error fetching page content via web: {e}")
raise
def _convert_gitbook_content_to_markdown(self, document: Dict[str, Any]) -> str:
"""Convert GitBook document structure to markdown."""
# This is a simplified converter - GitBook's document format is complex
# In a production implementation, you'd want a more sophisticated converter
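        # Illustrative example (node shape assumed from the types handled
        # below, not from the official GitBook document schema):
        #   {"type": "heading", "data": {"level": 2},
        #    "nodes": [{"type": "text", "text": "Install"}]}
        #   -> process_node(...) yields "\n\n## Install"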
def process_node(node: Dict[str, Any]) -> str:
node_type = node.get("type", "")
text = ""
if node_type == "paragraph":
text += "\n\n"
elif node_type == "heading":
level = node.get("data", {}).get("level", 1)
text += "\n\n" + "#" * level + " "
elif node_type == "list":
text += "\n\n"
elif node_type == "list-item":
text += "- "
elif node_type == "code-block":
text += "\n\n```\n"
# Process text content
if "text" in node:
text += node["text"]
# Process child nodes
for child in node.get("nodes", []):
text += process_node(child)
if node_type == "code-block":
text += "\n```\n\n"
return text
return process_node(document).strip()
def _html_to_markdown(self, element) -> str:
"""Convert HTML element to markdown-like text."""
if not element:
return ""
# Remove script and style elements
for script in element(["script", "style"]):
script.decompose()
# Get text content with some basic formatting
text = element.get_text(separator="\n", strip=True)
# Basic cleanup
lines = [line.strip() for line in text.split("\n") if line.strip()]
return "\n\n".join(lines)
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
if self.client:
await self.client.aclose()
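# Minimal usage sketch (illustrative only, not part of the adapter). It
# assumes the module is run in a context where its relative imports resolve
# (e.g. via `python -m <package>.gitbook`) and that SearchResult exposes its
# constructor fields as attributes; the base_url below is a placeholder.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        config = {
            "base_url": "https://docs.example.com",  # placeholder space URL
            # "api_token": "<token>",     # optional, for private spaces
            # "space_id": "<space-id>",   # optional, enables API access
        }
        async with GitBookAdapter(config) as adapter:
            await adapter.initialize()
            for result in await adapter.search("getting started", limit=3):
                print(f"{result.score:.1f}  {result.title}  {result.url}")

    asyncio.run(_demo())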