api_docs.py
"""API documentation search and extraction.""" from __future__ import annotations import re from dataclasses import dataclass from urllib.parse import urlparse import httpx @dataclass(slots=True) class APIDocumentation: """Represents extracted API documentation.""" api_name: str topic: str docs_url: str overview: str parameters: list[dict] examples: list[dict] related_links: list[dict] notes: list[str] source_urls: list[str] class APIDocsDetector: """Intelligently find API documentation URLs.""" # Common documentation URL patterns to try # Order matters: try most common patterns first (.com before .io) DOC_PATTERNS = [ # .com patterns (most common) "https://docs.{api}.com", "https://{api}.com/docs", "https://{api}.com/docs/api", # Stripe-style "https://www.{api}.com/docs", "https://developers.{api}.com", "https://developer.{api}.com", "https://{api}.com/documentation", "https://api.{api}.com/docs", # Framework-specific patterns "https://{api}.dev", # Vite, Nuxt, etc. "https://www.{api}.dev", "https://{api}.ng", # Angular-based (Spartan) "https://www.{api}.ng", # .io patterns (less common, try after .com) "https://docs.{api}.io", "https://{api}.io/docs", "https://www.{api}.io/docs", # .org patterns "https://{api}.org/docs", "https://www.{api}.org/docs", "https://docs.{api}.org", # .ai patterns "https://docs.{api}.ai", "https://{api}.ai/docs", ] # API name aliases - map common variations to canonical names/URLs # This handles cases like "Meta Graph API" -> "facebook" API_ALIASES: dict[str, str | dict] = { # Meta/Facebook "meta": "facebook", "meta graph": "facebook", "meta graph api": "facebook", "facebook graph": "facebook", "facebook graph api": "facebook", "instagram api": "facebook", "instagram graph": "facebook", # Google "google site verification": { "name": "google", "docs_url": "https://developers.google.com/site-verification", }, "google site verification api": { "name": "google", "docs_url": "https://developers.google.com/site-verification", }, "google analytics": { "name": "google", "docs_url": "https://developers.google.com/analytics", }, "google analytics 4": { "name": "google", "docs_url": "https://developers.google.com/analytics/devguides/config/admin/v1", }, "ga4": { "name": "google", "docs_url": "https://developers.google.com/analytics/devguides/config/admin/v1", }, "gemini": {"name": "google", "docs_url": "https://ai.google.dev/docs"}, "google gemini": {"name": "google", "docs_url": "https://ai.google.dev/docs"}, "vertex ai": {"name": "google", "docs_url": "https://cloud.google.com/vertex-ai/docs"}, "google cloud": {"name": "google", "docs_url": "https://cloud.google.com/docs"}, # TikTok "tiktok": {"name": "tiktok", "docs_url": "https://developers.tiktok.com/doc"}, "tiktok business": { "name": "tiktok", "docs_url": "https://business-api.tiktok.com/portal/docs", }, "tiktok business api": { "name": "tiktok", "docs_url": "https://business-api.tiktok.com/portal/docs", }, "tiktok ads": {"name": "tiktok", "docs_url": "https://business-api.tiktok.com/portal/docs"}, # OpenAI "openai": {"name": "openai", "docs_url": "https://platform.openai.com/docs"}, "chatgpt": {"name": "openai", "docs_url": "https://platform.openai.com/docs"}, "gpt": {"name": "openai", "docs_url": "https://platform.openai.com/docs"}, "dall-e": {"name": "openai", "docs_url": "https://platform.openai.com/docs/guides/images"}, "dalle": {"name": "openai", "docs_url": "https://platform.openai.com/docs/guides/images"}, # Anthropic "anthropic": {"name": "anthropic", "docs_url": "https://docs.anthropic.com"}, "claude": 
{"name": "anthropic", "docs_url": "https://docs.anthropic.com"}, "claude api": {"name": "anthropic", "docs_url": "https://docs.anthropic.com"}, # Notion "notion": {"name": "notion", "docs_url": "https://developers.notion.com"}, "notion api": {"name": "notion", "docs_url": "https://developers.notion.com/reference"}, # Slack "slack": {"name": "slack", "docs_url": "https://api.slack.com/docs"}, "slack api": {"name": "slack", "docs_url": "https://api.slack.com/docs"}, "slack block kit": {"name": "slack", "docs_url": "https://api.slack.com/block-kit"}, # ElevenLabs "elevenlabs": {"name": "elevenlabs", "docs_url": "https://elevenlabs.io/docs"}, "eleven labs": {"name": "elevenlabs", "docs_url": "https://elevenlabs.io/docs"}, "11labs": {"name": "elevenlabs", "docs_url": "https://elevenlabs.io/docs"}, # Fal.ai "fal": {"name": "fal", "docs_url": "https://fal.ai/docs"}, "fal.ai": {"name": "fal", "docs_url": "https://fal.ai/docs"}, "fal ai": {"name": "fal", "docs_url": "https://fal.ai/docs"}, # Cloudflare "cloudflare": {"name": "cloudflare", "docs_url": "https://developers.cloudflare.com"}, "cloudflare waf": { "name": "cloudflare", "docs_url": "https://developers.cloudflare.com/waf", }, # AWS "aws": {"name": "aws", "docs_url": "https://docs.aws.amazon.com"}, "amazon": {"name": "aws", "docs_url": "https://docs.aws.amazon.com"}, # Stripe "stripe": {"name": "stripe", "docs_url": "https://docs.stripe.com/api"}, # Twilio "twilio": {"name": "twilio", "docs_url": "https://www.twilio.com/docs/messaging"}, "twilio sms": {"name": "twilio", "docs_url": "https://www.twilio.com/docs/sms"}, # SendGrid "sendgrid": {"name": "sendgrid", "docs_url": "https://www.twilio.com/docs/sendgrid"}, # Plaid "plaid": {"name": "plaid", "docs_url": "https://plaid.com/docs"}, # Vercel "vercel": {"name": "vercel", "docs_url": "https://vercel.com/docs"}, # Spartan (Angular UI) "spartan": {"name": "spartan", "docs_url": "https://www.spartan.ng/documentation"}, "spartan ui": {"name": "spartan", "docs_url": "https://www.spartan.ng/documentation"}, # Mureka "mureka": {"name": "mureka", "docs_url": "https://docs.mureka.ai"}, # Replicate "replicate": {"name": "replicate", "docs_url": "https://replicate.com/docs"}, # Hugging Face "huggingface": {"name": "huggingface", "docs_url": "https://huggingface.co/docs"}, "hugging face": {"name": "huggingface", "docs_url": "https://huggingface.co/docs"}, "hf": {"name": "huggingface", "docs_url": "https://huggingface.co/docs"}, # Supabase "supabase": {"name": "supabase", "docs_url": "https://supabase.com/docs"}, # Firebase "firebase": {"name": "firebase", "docs_url": "https://firebase.google.com/docs"}, # Vercel "vercel": {"name": "vercel", "docs_url": "https://vercel.com/docs"}, # Netlify "netlify": {"name": "netlify", "docs_url": "https://docs.netlify.com"}, # Discord "discord": {"name": "discord", "docs_url": "https://discord.com/developers/docs"}, "discord api": {"name": "discord", "docs_url": "https://discord.com/developers/docs"}, } def __init__(self): self.http_client = httpx.AsyncClient( timeout=10.0, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (compatible; API-Docs-Explorer/1.0)"}, ) def normalize_api_name(self, api_name: str) -> tuple[str, str | None]: """ Normalize API name using aliases. 
    def normalize_api_name(self, api_name: str) -> tuple[str, str | None]:
        """
        Normalize API name using aliases.

        Returns:
            Tuple of (normalized_name, known_docs_url or None)
        """
        api_lower = api_name.lower().strip()

        # Check for exact match in aliases
        if api_lower in self.API_ALIASES:
            alias = self.API_ALIASES[api_lower]
            if isinstance(alias, str):
                return alias, None
            elif isinstance(alias, dict):
                return alias.get("name", api_lower), alias.get("docs_url")

        # Check for partial matches (e.g., "Meta Graph API" contains "meta graph")
        for key, alias in self.API_ALIASES.items():
            if key in api_lower or api_lower in key:
                if isinstance(alias, str):
                    return alias, None
                elif isinstance(alias, dict):
                    return alias.get("name", api_lower), alias.get("docs_url")

        # Clean up common suffixes
        cleaned = api_lower
        for suffix in [" api", " sdk", " docs", " documentation"]:
            if cleaned.endswith(suffix):
                cleaned = cleaned[: -len(suffix)].strip()

        # Remove special characters and spaces for URL generation
        url_safe = re.sub(r"[^a-z0-9]", "", cleaned)
        return url_safe, None

    async def find_docs_url(self, api_name: str) -> str | None:
        """
        Dynamically find the official documentation URL for an API.

        Strategy:
        1. Check API aliases for known documentation URLs
        2. Try common URL patterns (docs.X.com, X.com/docs, etc.)
        3. If patterns fail, return None to trigger search fallback
        """
        # Normalize the API name and check for known URLs
        normalized_name, known_url = self.normalize_api_name(api_name)

        # If we have a known URL from aliases, verify it's accessible
        if known_url:
            if await self._is_valid_docs_site(known_url):
                return known_url

        # Try common patterns with normalized name
        for pattern in self.DOC_PATTERNS:
            url = pattern.format(api=normalized_name)
            if await self._is_valid_docs_site(url):
                return url

        # If all patterns fail, return None and let caller search
        return None
""" api_lower = api_name.lower().strip() terms = [api_name] # Original # Add normalized version normalized, _ = self.normalize_api_name(api_name) if normalized != api_lower: terms.append(normalized) # Add common variations if "api" not in api_lower: terms.append(f"{api_name} API") # Check aliases for the canonical name if api_lower in self.API_ALIASES: alias = self.API_ALIASES[api_lower] if isinstance(alias, str): terms.append(alias) elif isinstance(alias, dict) and "name" in alias: terms.append(alias["name"]) return list(dict.fromkeys(terms)) # Deduplicate while preserving order async def _is_valid_docs_site(self, url: str) -> bool: """Check if a URL is a valid documentation site.""" try: response = await self.http_client.head(url, timeout=5.0) # Check for successful response and likely docs content if response.status_code == 200: # Optionally verify it looks like a docs site # by checking content-type or doing a quick GET return True return False except Exception: return False def get_docs_domain(self, docs_url: str) -> str: """Extract the domain from a documentation URL for site-specific search.""" parsed = urlparse(docs_url) return parsed.netloc async def close(self): """Close the HTTP client.""" await self.http_client.aclose() class APIDocsExtractor: """Extract and format API documentation content.""" def extract_overview(self, content: str) -> str: """Extract the overview/description from documentation.""" # Look for common overview sections patterns = [ r"(?:^|\n)#{1,3}\s*(?:Overview|Description|About|Introduction)\s*\n(.*?)(?:\n#{1,3}|\Z)", r"(?:^|\n)(?:Overview|Description):\s*(.*?)(?:\n\n|\Z)", # First substantial paragraph r"(?:^|\n)([A-Z][^.\n]{50,}\.(?:\s+[A-Z][^.\n]+\.){0,3})", ] for pattern in patterns: match = re.search(pattern, content, re.DOTALL | re.MULTILINE) if match: overview = match.group(1).strip() # Clean up and limit length overview = re.sub(r"\s+", " ", overview) if len(overview) > 500: overview = overview[:500] + "..." return overview # Fallback: first paragraph lines = content.strip().split("\n") for line in lines: line = line.strip() if len(line) > 50 and not line.startswith("#"): return line[:500] return "No overview available." 
class APIDocsExtractor:
    """Extract and format API documentation content."""

    def extract_overview(self, content: str) -> str:
        """Extract the overview/description from documentation."""
        # Look for common overview sections
        patterns = [
            r"(?:^|\n)#{1,3}\s*(?:Overview|Description|About|Introduction)\s*\n(.*?)(?:\n#{1,3}|\Z)",
            r"(?:^|\n)(?:Overview|Description):\s*(.*?)(?:\n\n|\Z)",
            # First substantial paragraph
            r"(?:^|\n)([A-Z][^.\n]{50,}\.(?:\s+[A-Z][^.\n]+\.){0,3})",
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.DOTALL | re.MULTILINE)
            if match:
                overview = match.group(1).strip()
                # Clean up and limit length
                overview = re.sub(r"\s+", " ", overview)
                if len(overview) > 500:
                    overview = overview[:500] + "..."
                return overview

        # Fallback: first paragraph
        lines = content.strip().split("\n")
        for line in lines:
            line = line.strip()
            if len(line) > 50 and not line.startswith("#"):
                return line[:500]

        return "No overview available."

    def extract_parameters(self, content: str) -> list[dict]:
        """Extract parameter information from documentation."""
        parameters = []

        # Pattern for parameter documentation
        # Matches: "param_name (type, required/optional) - description"
        param_pattern = r"[\*\-]?\s*`?(\w+)`?\s*\(([^)]+)\)\s*[-–—:]\s*(.+?)(?=\n[\*\-]?\s*`?\w+`?\s*\(|\n\n|\Z)"

        matches = re.finditer(param_pattern, content, re.DOTALL)
        for match in matches:
            name = match.group(1)
            type_info = match.group(2).strip()
            description = match.group(3).strip()

            # Clean up description
            description = re.sub(r"\s+", " ", description)

            # Extract if required/optional
            required = "required" in type_info.lower()

            parameters.append(
                {
                    "name": name,
                    "type": type_info,
                    "required": required,
                    "description": description[:300],  # Limit length
                }
            )

        return parameters

    def extract_examples(self, content: str) -> list[dict]:
        """Extract code examples from documentation."""
        examples = []

        # Match code blocks with language specifier
        code_block_pattern = r"```(\w+)\n(.*?)```"

        matches = re.finditer(code_block_pattern, content, re.DOTALL)
        for match in matches:
            language = match.group(1)
            code = match.group(2).strip()

            # Skip very short snippets (likely not examples)
            if len(code) > 20:
                examples.append({"language": language, "code": code})

        return examples[:10]  # Limit to 10 examples

    def extract_notes(self, content: str) -> list[str]:
        """Extract important notes, warnings, and tips."""
        notes = []

        # Look for note/warning/tip sections
        patterns = [
            r"(?:⚠️|⚡|💡|📝|Note|Warning|Important|Tip):\s*(.+?)(?:\n\n|\Z)",
            r"> (.+?)(?:\n\n|\Z)",  # Blockquotes
        ]

        for pattern in patterns:
            matches = re.finditer(pattern, content, re.DOTALL)
            for match in matches:
                note = match.group(1).strip()
                note = re.sub(r"\s+", " ", note)
                if len(note) > 30 and len(note) < 500:
                    notes.append(note)

        return notes[:5]  # Limit to 5 notes

    def extract_links(self, content: str, base_url: str) -> list[dict]:
        """Extract related documentation links."""
        links = []

        # Match markdown links
        link_pattern = r"\[([^\]]+)\]\(([^)]+)\)"

        matches = re.finditer(link_pattern, content)
        for match in matches:
            title = match.group(1)
            url = match.group(2)

            # Filter for documentation links (not random external links)
            if any(
                keyword in title.lower()
                for keyword in ["api", "docs", "guide", "reference", "tutorial", "see"]
            ):
                # Make relative URLs absolute
                if url.startswith("/"):
                    parsed = urlparse(base_url)
                    url = f"{parsed.scheme}://{parsed.netloc}{url}"

                links.append({"title": title, "url": url})

        # Deduplicate
        seen_urls = set()
        unique_links = []
        for link in links:
            if link["url"] not in seen_urls:
                seen_urls.add(link["url"])
                unique_links.append(link)

        return unique_links[:10]  # Limit to 10 links
    def format_documentation(self, doc: APIDocumentation) -> str:
        """Format extracted documentation into readable text."""
        lines = [
            f"API Documentation: {doc.api_name.title()} - {doc.topic}",
            "═" * 70,
            "",
        ]

        # Overview
        if doc.overview:
            lines.extend(
                [
                    "📖 Overview:",
                    f"   {doc.overview}",
                    "",
                ]
            )

        # Main documentation URL
        lines.extend(
            [
                f"📚 Documentation: {doc.docs_url}",
                "",
            ]
        )

        # Parameters
        if doc.parameters:
            lines.extend(
                [
                    "📋 Parameters:",
                    "─" * 70,
                    "",
                ]
            )
            for param in doc.parameters:
                req_marker = "required" if param["required"] else "optional"
                lines.append(f"   {param['name']} ({param['type']}, {req_marker})")
                lines.append(f"      {param['description']}")
                lines.append("")

        # Code Examples
        if doc.examples:
            lines.extend(
                [
                    "💡 Code Examples:",
                    "─" * 70,
                    "",
                ]
            )
            for i, example in enumerate(doc.examples, 1):
                lines.append(f"   Example {i} ({example['language']}):")
                lines.append(f"   ```{example['language']}")
                # Indent code
                for code_line in example["code"].split("\n"):
                    lines.append(f"   {code_line}")
                lines.append("   ```")
                lines.append("")

        # Important Notes
        if doc.notes:
            lines.extend(
                [
                    "⚠️ Important Notes:",
                    "─" * 70,
                    "",
                ]
            )
            for note in doc.notes:
                lines.append(f"   • {note}")
            lines.append("")

        # Related Links
        if doc.related_links:
            lines.extend(
                [
                    "🔗 Related Documentation:",
                    "─" * 70,
                    "",
                ]
            )
            for link in doc.related_links:
                lines.append(f"   • {link['title']}")
                lines.append(f"     {link['url']}")
            lines.append("")

        # Source URLs
        if doc.source_urls:
            lines.extend(
                [
                    "📄 Sources:",
                    "─" * 70,
                    "",
                ]
            )
            for url in doc.source_urls:
                lines.append(f"   • {url}")

        return "\n".join(lines)
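A minimal end-to-end sketch, not part of the original module: fetch one documentation page and assemble an APIDocumentation from the extractor's pieces. The URL, name, and topic are placeholders; real pages are HTML, and a production caller would convert HTML to text or markdown before extraction, which this sketch skips for brevity.

import asyncio

async def _demo_extract(docs_url: str, api_name: str, topic: str) -> None:
    # Fetch a single page of documentation content.
    async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
        response = await client.get(docs_url)
        content = response.text

    # Run each extractor over the same content and collect the results.
    extractor = APIDocsExtractor()
    doc = APIDocumentation(
        api_name=api_name,
        topic=topic,
        docs_url=docs_url,
        overview=extractor.extract_overview(content),
        parameters=extractor.extract_parameters(content),
        examples=extractor.extract_examples(content),
        related_links=extractor.extract_links(content, docs_url),
        notes=extractor.extract_notes(content),
        source_urls=[docs_url],
    )
    print(extractor.format_documentation(doc))

if __name__ == "__main__":
    # Placeholder arguments for illustration.
    asyncio.run(_demo_extract("https://docs.stripe.com/api", "stripe", "payments"))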
