"""API documentation search and extraction."""
from __future__ import annotations
import re
from dataclasses import dataclass
from urllib.parse import urlparse
import httpx
@dataclass(slots=True)
class APIDocumentation:
    """Represents extracted API documentation.

    Aggregates everything pulled from one or more documentation pages
    for a single (api_name, topic) query; produced by the extractor and
    rendered by ``APIDocsExtractor.format_documentation``.
    """

    api_name: str  # API name; title-cased when rendered
    topic: str  # sub-topic the documentation was requested for
    docs_url: str  # primary documentation URL shown in the report
    overview: str  # short prose summary (may be empty)
    parameters: list[dict]  # [{"name", "type", "required", "description"}, ...]
    examples: list[dict]  # [{"language", "code"}, ...]
    related_links: list[dict]  # [{"title", "url"}, ...]
    notes: list[str]  # important notes/warnings pulled from the docs
    source_urls: list[str]  # every URL that contributed content
class APIDocsDetector:
    """Intelligently find API documentation URLs.

    Combines a curated alias table (for APIs whose docs live at
    non-obvious hosts) with a list of common documentation URL
    templates that are probed in priority order.
    """

    # Common documentation URL patterns to try.
    # Order matters: try most common patterns first (.com before .io).
    DOC_PATTERNS = [
        # .com patterns (most common)
        "https://docs.{api}.com",
        "https://{api}.com/docs",
        "https://{api}.com/docs/api",  # Stripe-style
        "https://www.{api}.com/docs",
        "https://developers.{api}.com",
        "https://developer.{api}.com",
        "https://{api}.com/documentation",
        "https://api.{api}.com/docs",
        # Framework-specific patterns
        "https://{api}.dev",  # Vite, Nuxt, etc.
        "https://www.{api}.dev",
        "https://{api}.ng",  # Angular-based (Spartan)
        "https://www.{api}.ng",
        # .io patterns (less common, try after .com)
        "https://docs.{api}.io",
        "https://{api}.io/docs",
        "https://www.{api}.io/docs",
        # .org patterns
        "https://{api}.org/docs",
        "https://www.{api}.org/docs",
        "https://docs.{api}.org",
        # .ai patterns
        "https://docs.{api}.ai",
        "https://{api}.ai/docs",
    ]

    # API name aliases - map common variations to canonical names/URLs.
    # A plain-string value is just a canonical name ("meta" -> "facebook");
    # a dict value additionally pins the exact docs URL.
    # This handles cases like "Meta Graph API" -> "facebook".
    API_ALIASES: dict[str, str | dict[str, str]] = {
        # Meta/Facebook
        "meta": "facebook",
        "meta graph": "facebook",
        "meta graph api": "facebook",
        "facebook graph": "facebook",
        "facebook graph api": "facebook",
        "instagram api": "facebook",
        "instagram graph": "facebook",
        # Google
        "google site verification": {
            "name": "google",
            "docs_url": "https://developers.google.com/site-verification",
        },
        "google site verification api": {
            "name": "google",
            "docs_url": "https://developers.google.com/site-verification",
        },
        "google analytics": {
            "name": "google",
            "docs_url": "https://developers.google.com/analytics",
        },
        "google analytics 4": {
            "name": "google",
            "docs_url": "https://developers.google.com/analytics/devguides/config/admin/v1",
        },
        "ga4": {
            "name": "google",
            "docs_url": "https://developers.google.com/analytics/devguides/config/admin/v1",
        },
        "gemini": {"name": "google", "docs_url": "https://ai.google.dev/docs"},
        "google gemini": {"name": "google", "docs_url": "https://ai.google.dev/docs"},
        "vertex ai": {"name": "google", "docs_url": "https://cloud.google.com/vertex-ai/docs"},
        "google cloud": {"name": "google", "docs_url": "https://cloud.google.com/docs"},
        # TikTok
        "tiktok": {"name": "tiktok", "docs_url": "https://developers.tiktok.com/doc"},
        "tiktok business": {
            "name": "tiktok",
            "docs_url": "https://business-api.tiktok.com/portal/docs",
        },
        "tiktok business api": {
            "name": "tiktok",
            "docs_url": "https://business-api.tiktok.com/portal/docs",
        },
        "tiktok ads": {"name": "tiktok", "docs_url": "https://business-api.tiktok.com/portal/docs"},
        # OpenAI
        "openai": {"name": "openai", "docs_url": "https://platform.openai.com/docs"},
        "chatgpt": {"name": "openai", "docs_url": "https://platform.openai.com/docs"},
        "gpt": {"name": "openai", "docs_url": "https://platform.openai.com/docs"},
        "dall-e": {"name": "openai", "docs_url": "https://platform.openai.com/docs/guides/images"},
        "dalle": {"name": "openai", "docs_url": "https://platform.openai.com/docs/guides/images"},
        # Anthropic
        "anthropic": {"name": "anthropic", "docs_url": "https://docs.anthropic.com"},
        "claude": {"name": "anthropic", "docs_url": "https://docs.anthropic.com"},
        "claude api": {"name": "anthropic", "docs_url": "https://docs.anthropic.com"},
        # Notion
        "notion": {"name": "notion", "docs_url": "https://developers.notion.com"},
        "notion api": {"name": "notion", "docs_url": "https://developers.notion.com/reference"},
        # Slack
        "slack": {"name": "slack", "docs_url": "https://api.slack.com/docs"},
        "slack api": {"name": "slack", "docs_url": "https://api.slack.com/docs"},
        "slack block kit": {"name": "slack", "docs_url": "https://api.slack.com/block-kit"},
        # ElevenLabs
        "elevenlabs": {"name": "elevenlabs", "docs_url": "https://elevenlabs.io/docs"},
        "eleven labs": {"name": "elevenlabs", "docs_url": "https://elevenlabs.io/docs"},
        "11labs": {"name": "elevenlabs", "docs_url": "https://elevenlabs.io/docs"},
        # Fal.ai
        "fal": {"name": "fal", "docs_url": "https://fal.ai/docs"},
        "fal.ai": {"name": "fal", "docs_url": "https://fal.ai/docs"},
        "fal ai": {"name": "fal", "docs_url": "https://fal.ai/docs"},
        # Cloudflare
        "cloudflare": {"name": "cloudflare", "docs_url": "https://developers.cloudflare.com"},
        "cloudflare waf": {
            "name": "cloudflare",
            "docs_url": "https://developers.cloudflare.com/waf",
        },
        # AWS
        "aws": {"name": "aws", "docs_url": "https://docs.aws.amazon.com"},
        "amazon": {"name": "aws", "docs_url": "https://docs.aws.amazon.com"},
        # Stripe
        "stripe": {"name": "stripe", "docs_url": "https://docs.stripe.com/api"},
        # Twilio
        "twilio": {"name": "twilio", "docs_url": "https://www.twilio.com/docs/messaging"},
        "twilio sms": {"name": "twilio", "docs_url": "https://www.twilio.com/docs/sms"},
        # SendGrid
        "sendgrid": {"name": "sendgrid", "docs_url": "https://www.twilio.com/docs/sendgrid"},
        # Plaid
        "plaid": {"name": "plaid", "docs_url": "https://plaid.com/docs"},
        # Vercel (single entry; a duplicate key was removed)
        "vercel": {"name": "vercel", "docs_url": "https://vercel.com/docs"},
        # Spartan (Angular UI)
        "spartan": {"name": "spartan", "docs_url": "https://www.spartan.ng/documentation"},
        "spartan ui": {"name": "spartan", "docs_url": "https://www.spartan.ng/documentation"},
        # Mureka
        "mureka": {"name": "mureka", "docs_url": "https://docs.mureka.ai"},
        # Replicate
        "replicate": {"name": "replicate", "docs_url": "https://replicate.com/docs"},
        # Hugging Face
        "huggingface": {"name": "huggingface", "docs_url": "https://huggingface.co/docs"},
        "hugging face": {"name": "huggingface", "docs_url": "https://huggingface.co/docs"},
        "hf": {"name": "huggingface", "docs_url": "https://huggingface.co/docs"},
        # Supabase
        "supabase": {"name": "supabase", "docs_url": "https://supabase.com/docs"},
        # Firebase
        "firebase": {"name": "firebase", "docs_url": "https://firebase.google.com/docs"},
        # Netlify
        "netlify": {"name": "netlify", "docs_url": "https://docs.netlify.com"},
        # Discord
        "discord": {"name": "discord", "docs_url": "https://discord.com/developers/docs"},
        "discord api": {"name": "discord", "docs_url": "https://discord.com/developers/docs"},
    }
def __init__(self):
    # One shared async client for all probes. Redirects are followed so
    # docs hosts that 30x to a landing page still validate; a browser-like
    # UA avoids trivial bot rejections.
    default_headers = {"User-Agent": "Mozilla/5.0 (compatible; API-Docs-Explorer/1.0)"}
    self.http_client = httpx.AsyncClient(
        headers=default_headers,
        follow_redirects=True,
        timeout=10.0,
    )
def normalize_api_name(self, api_name: str) -> tuple[str, str | None]:
"""
Normalize API name using aliases.
Returns:
Tuple of (normalized_name, known_docs_url or None)
"""
api_lower = api_name.lower().strip()
# Check for exact match in aliases
if api_lower in self.API_ALIASES:
alias = self.API_ALIASES[api_lower]
if isinstance(alias, str):
return alias, None
elif isinstance(alias, dict):
return alias.get("name", api_lower), alias.get("docs_url")
# Check for partial matches (e.g., "Meta Graph API" contains "meta graph")
for key, alias in self.API_ALIASES.items():
if key in api_lower or api_lower in key:
if isinstance(alias, str):
return alias, None
elif isinstance(alias, dict):
return alias.get("name", api_lower), alias.get("docs_url")
# Clean up common suffixes
cleaned = api_lower
for suffix in [" api", " sdk", " docs", " documentation"]:
if cleaned.endswith(suffix):
cleaned = cleaned[: -len(suffix)].strip()
# Remove special characters and spaces for URL generation
url_safe = re.sub(r"[^a-z0-9]", "", cleaned)
return url_safe, None
async def find_docs_url(self, api_name: str) -> str | None:
    """
    Dynamically find the official documentation URL for an API.

    Tries, in order: a known alias URL from normalize_api_name, then
    each template in DOC_PATTERNS. Returns None when nothing responds,
    letting the caller fall back to a web search.
    """
    normalized, known_url = self.normalize_api_name(api_name)

    # A curated alias URL wins, but only if it is actually reachable.
    if known_url and await self._is_valid_docs_site(known_url):
        return known_url

    # Probe the generic URL templates in priority order.
    for template in self.DOC_PATTERNS:
        candidate = template.format(api=normalized)
        if await self._is_valid_docs_site(candidate):
            return candidate

    # Nothing answered; the caller should fall back to searching.
    return None
def get_search_terms(self, api_name: str) -> list[str]:
"""
Get alternative search terms for an API.
Returns multiple variations to try when searching.
"""
api_lower = api_name.lower().strip()
terms = [api_name] # Original
# Add normalized version
normalized, _ = self.normalize_api_name(api_name)
if normalized != api_lower:
terms.append(normalized)
# Add common variations
if "api" not in api_lower:
terms.append(f"{api_name} API")
# Check aliases for the canonical name
if api_lower in self.API_ALIASES:
alias = self.API_ALIASES[api_lower]
if isinstance(alias, str):
terms.append(alias)
elif isinstance(alias, dict) and "name" in alias:
terms.append(alias["name"])
return list(dict.fromkeys(terms)) # Deduplicate while preserving order
async def _is_valid_docs_site(self, url: str) -> bool:
    """Return True when *url* answers a HEAD request with HTTP 200."""
    try:
        response = await self.http_client.head(url, timeout=5.0)
    except Exception:
        # DNS failures, timeouts, TLS errors, etc. all mean "not usable".
        return False
    # A plain 200 is treated as good enough; deeper content inspection
    # is deliberately skipped to keep probing cheap.
    return response.status_code == 200
def get_docs_domain(self, docs_url: str) -> str:
    """Return the host portion of *docs_url*, for site-restricted search."""
    return urlparse(docs_url).netloc
async def close(self):
    """Close the HTTP client.

    Call once when the detector is no longer needed to release the
    httpx connection pool.
    """
    await self.http_client.aclose()
class APIDocsExtractor:
    """Extract and format API documentation content.

    Stateless helpers that parse raw documentation text (markdown-ish)
    into overview, parameters, examples, notes, and links, plus a
    renderer that turns an APIDocumentation record into readable text.
    """
def extract_overview(self, content: str) -> str:
    """Pull a short overview/description paragraph out of documentation text."""
    # Candidate regexes, most specific first: explicit Overview-style
    # headings, "Overview:"/"Description:" labels, then any substantial
    # run of capitalized sentences.
    candidate_patterns = (
        r"(?:^|\n)#{1,3}\s*(?:Overview|Description|About|Introduction)\s*\n(.*?)(?:\n#{1,3}|\Z)",
        r"(?:^|\n)(?:Overview|Description):\s*(.*?)(?:\n\n|\Z)",
        r"(?:^|\n)([A-Z][^.\n]{50,}\.(?:\s+[A-Z][^.\n]+\.){0,3})",
    )
    for candidate in candidate_patterns:
        found = re.search(candidate, content, re.DOTALL | re.MULTILINE)
        if not found:
            continue
        # Collapse internal whitespace and cap the length.
        text = re.sub(r"\s+", " ", found.group(1).strip())
        if len(text) > 500:
            text = text[:500] + "..."
        return text

    # Fallback: the first non-heading line of reasonable length.
    for raw_line in content.strip().split("\n"):
        stripped = raw_line.strip()
        if len(stripped) > 50 and not stripped.startswith("#"):
            return stripped[:500]
    return "No overview available."
def extract_parameters(self, content: str) -> list[dict]:
    """Parse "name (type) - description" style parameter listings."""
    # Bullet-style entries: optional */- marker, optionally backticked
    # name, parenthesised type info, a dash/colon separator, then the
    # description up to the next entry, a blank line, or end of text.
    entry_re = r"[\*\-]?\s*`?(\w+)`?\s*\(([^)]+)\)\s*[-–—:]\s*(.+?)(?=\n[\*\-]?\s*`?\w+`?\s*\(|\n\n|\Z)"
    results: list[dict] = []
    for entry in re.finditer(entry_re, content, re.DOTALL):
        type_info = entry.group(2).strip()
        results.append(
            {
                "name": entry.group(1),
                "type": type_info,
                # "required" anywhere in the type blurb marks it mandatory.
                "required": "required" in type_info.lower(),
                # Collapse whitespace, then cap the description length.
                "description": re.sub(r"\s+", " ", entry.group(3).strip())[:300],
            }
        )
    return results
def extract_examples(self, content: str) -> list[dict]:
    """Collect fenced code blocks (with a language tag) as examples."""
    fence_re = r"```(\w+)\n(.*?)```"
    collected: list[dict] = []
    for fence in re.finditer(fence_re, content, re.DOTALL):
        snippet = fence.group(2).strip()
        # Tiny fragments are usually inline snippets, not real examples.
        if len(snippet) > 20:
            collected.append({"language": fence.group(1), "code": snippet})
        if len(collected) == 10:  # cap at 10 examples
            break
    return collected
def extract_notes(self, content: str) -> list[str]:
    """Gather callout-style notes, warnings, tips, and blockquotes."""
    callout_patterns = (
        r"(?:⚠️|⚡|💡|📝|Note|Warning|Important|Tip):\s*(.+?)(?:\n\n|\Z)",
        r"> (.+?)(?:\n\n|\Z)",  # Blockquotes
    )
    collected: list[str] = []
    for callout in callout_patterns:
        for found in re.finditer(callout, content, re.DOTALL):
            text = re.sub(r"\s+", " ", found.group(1).strip())
            # Keep only substantive, reasonably sized notes.
            if 30 < len(text) < 500:
                collected.append(text)
    return collected[:5]  # Limit to 5 notes
def extract_links(self, content: str, base_url: str) -> list[dict]:
    """Pull documentation-related markdown links, absolutized and deduped."""
    doc_keywords = ("api", "docs", "guide", "reference", "tutorial", "see")
    seen_urls: set[str] = set()
    results: list[dict] = []
    for found in re.finditer(r"\[([^\]]+)\]\(([^)]+)\)", content):
        title = found.group(1)
        target = found.group(2)
        # Keep only links whose title suggests documentation, not random
        # external links.
        lowered = title.lower()
        if not any(word in lowered for word in doc_keywords):
            continue
        # Resolve site-relative paths against the docs base URL.
        if target.startswith("/"):
            base = urlparse(base_url)
            target = f"{base.scheme}://{base.netloc}{target}"
        # De-duplicate by URL, keeping the first occurrence.
        if target not in seen_urls:
            seen_urls.add(target)
            results.append({"title": title, "url": target})
    return results[:10]  # Limit to 10 links
def format_documentation(self, doc: APIDocumentation) -> str:
    """Format extracted documentation into readable text.

    Renders *doc* as a sectioned plain-text report: header, overview,
    docs URL, parameters, code examples, notes, related links, and
    source URLs. Sections whose backing field is empty are omitted.
    """
    # Report header: title line, heavy rule, blank spacer.
    lines = [
        f"API Documentation: {doc.api_name.title()} - {doc.topic}",
        "═" * 70,
        "",
    ]
    # Overview
    if doc.overview:
        lines.extend(
            [
                "📖 Overview:",
                f" {doc.overview}",
                "",
            ]
        )
    # Main documentation URL (always present)
    lines.extend(
        [
            f"📚 Documentation: {doc.docs_url}",
            "",
        ]
    )
    # Parameters — one "name (type, required/optional)" line plus an
    # indented description per entry.
    if doc.parameters:
        lines.extend(
            [
                "📋 Parameters:",
                "─" * 70,
                "",
            ]
        )
        for param in doc.parameters:
            req_marker = "required" if param["required"] else "optional"
            lines.append(f" {param['name']} ({param['type']}, {req_marker})")
            lines.append(f" {param['description']}")
            lines.append("")
    # Code Examples — numbered, re-fenced with the original language tag.
    if doc.examples:
        lines.extend(
            [
                "💡 Code Examples:",
                "─" * 70,
                "",
            ]
        )
        for i, example in enumerate(doc.examples, 1):
            lines.append(f" Example {i} ({example['language']}):")
            lines.append(f" ```{example['language']}")
            # Indent code line by line so multi-line snippets stay aligned.
            for code_line in example["code"].split("\n"):
                lines.append(f" {code_line}")
            lines.append(" ```")
            lines.append("")
    # Important Notes — bulleted.
    if doc.notes:
        lines.extend(
            [
                "⚠️ Important Notes:",
                "─" * 70,
                "",
            ]
        )
        for note in doc.notes:
            lines.append(f" • {note}")
            lines.append("")
    # Related Links — bulleted title with the URL on the following line.
    if doc.related_links:
        lines.extend(
            [
                "🔗 Related Documentation:",
                "─" * 70,
                "",
            ]
        )
        for link in doc.related_links:
            lines.append(f" • {link['title']}")
            lines.append(f" {link['url']}")
            lines.append("")
    # Source URLs — bulleted, no trailing blank line.
    if doc.source_urls:
        lines.extend(
            [
                "📄 Sources:",
                "─" * 70,
                "",
            ]
        )
        for url in doc.source_urls:
            lines.append(f" • {url}")
    return "\n".join(lines)