"""API documentation search and extraction."""
from __future__ import annotations
import re
from dataclasses import dataclass
from urllib.parse import urlparse
import httpx
@dataclass(slots=True)
class APIDocumentation:
    """Represents extracted API documentation.

    Aggregates everything pulled from one or more documentation pages
    for a single (api_name, topic) query; produced by the extractor and
    rendered by ``APIDocsExtractor.format_documentation``.
    """

    api_name: str  # API name; title-cased when rendered
    topic: str  # sub-topic the documentation was requested for
    docs_url: str  # primary documentation URL shown in the report
    overview: str  # short prose summary (may be empty)
    parameters: list[dict]  # [{"name", "type", "required", "description"}, ...]
    examples: list[dict]  # [{"language", "code"}, ...]
    related_links: list[dict]  # [{"title", "url"}, ...]
    notes: list[str]  # important notes/warnings pulled from the docs
    source_urls: list[str]  # every URL that contributed content
class APIDocsDetector:
    """Intelligently find API documentation URLs.

    Combines a curated alias table (for APIs whose docs live at
    non-obvious hosts) with a list of common documentation URL
    templates that are probed in priority order.
    """

    # Common documentation URL patterns to try.
    # Order matters: try most common patterns first (.com before .io).
    DOC_PATTERNS = [
        # .com patterns (most common)
        "https://docs.{api}.com",
        "https://{api}.com/docs",
        "https://{api}.com/docs/api",  # Stripe-style
        "https://www.{api}.com/docs",
        "https://developers.{api}.com",
        "https://developer.{api}.com",
        "https://{api}.com/documentation",
        "https://api.{api}.com/docs",
        # Framework-specific patterns
        "https://{api}.dev",  # Vite, Nuxt, etc.
        "https://www.{api}.dev",
        "https://{api}.ng",  # Angular-based (Spartan)
        "https://www.{api}.ng",
        # .io patterns (less common, try after .com)
        "https://docs.{api}.io",
        "https://{api}.io/docs",
        "https://www.{api}.io/docs",
        # .org patterns
        "https://{api}.org/docs",
        "https://www.{api}.org/docs",
        "https://docs.{api}.org",
        # .ai patterns
        "https://docs.{api}.ai",
        "https://{api}.ai/docs",
    ]

    # API name aliases - map common variations to canonical names/URLs.
    # A plain-string value is just a canonical name ("meta" -> "facebook");
    # a dict value additionally pins the exact docs URL.
    # This handles cases like "Meta Graph API" -> "facebook".
    API_ALIASES: dict[str, str | dict[str, str]] = {
        # Meta/Facebook
        "meta": "facebook",
        "meta graph": "facebook",
        "meta graph api": "facebook",
        "facebook graph": "facebook",
        "facebook graph api": "facebook",
        "instagram api": "facebook",
        "instagram graph": "facebook",
        # Google
        "google site verification": {
            "name": "google",
            "docs_url": "https://developers.google.com/site-verification",
        },
        "google site verification api": {
            "name": "google",
            "docs_url": "https://developers.google.com/site-verification",
        },
        "google analytics": {
            "name": "google",
            "docs_url": "https://developers.google.com/analytics",
        },
        "google analytics 4": {
            "name": "google",
            "docs_url": "https://developers.google.com/analytics/devguides/config/admin/v1",
        },
        "ga4": {
            "name": "google",
            "docs_url": "https://developers.google.com/analytics/devguides/config/admin/v1",
        },
        "gemini": {"name": "google", "docs_url": "https://ai.google.dev/docs"},
        "google gemini": {"name": "google", "docs_url": "https://ai.google.dev/docs"},
        "vertex ai": {"name": "google", "docs_url": "https://cloud.google.com/vertex-ai/docs"},
        "google cloud": {"name": "google", "docs_url": "https://cloud.google.com/docs"},
        # TikTok
        "tiktok": {"name": "tiktok", "docs_url": "https://developers.tiktok.com/doc"},
        "tiktok business": {
            "name": "tiktok",
            "docs_url": "https://business-api.tiktok.com/portal/docs",
        },
        "tiktok business api": {
            "name": "tiktok",
            "docs_url": "https://business-api.tiktok.com/portal/docs",
        },
        "tiktok ads": {"name": "tiktok", "docs_url": "https://business-api.tiktok.com/portal/docs"},
        # OpenAI
        "openai": {"name": "openai", "docs_url": "https://platform.openai.com/docs"},
        "chatgpt": {"name": "openai", "docs_url": "https://platform.openai.com/docs"},
        "gpt": {"name": "openai", "docs_url": "https://platform.openai.com/docs"},
        "dall-e": {"name": "openai", "docs_url": "https://platform.openai.com/docs/guides/images"},
        "dalle": {"name": "openai", "docs_url": "https://platform.openai.com/docs/guides/images"},
        # Anthropic
        "anthropic": {"name": "anthropic", "docs_url": "https://docs.anthropic.com"},
        "claude": {"name": "anthropic", "docs_url": "https://docs.anthropic.com"},
        "claude api": {"name": "anthropic", "docs_url": "https://docs.anthropic.com"},
        # Notion
        "notion": {"name": "notion", "docs_url": "https://developers.notion.com"},
        "notion api": {"name": "notion", "docs_url": "https://developers.notion.com/reference"},
        # Slack
        "slack": {"name": "slack", "docs_url": "https://api.slack.com/docs"},
        "slack api": {"name": "slack", "docs_url": "https://api.slack.com/docs"},
        "slack block kit": {"name": "slack", "docs_url": "https://api.slack.com/block-kit"},
        # ElevenLabs
        "elevenlabs": {"name": "elevenlabs", "docs_url": "https://elevenlabs.io/docs"},
        "eleven labs": {"name": "elevenlabs", "docs_url": "https://elevenlabs.io/docs"},
        "11labs": {"name": "elevenlabs", "docs_url": "https://elevenlabs.io/docs"},
        # Fal.ai
        "fal": {"name": "fal", "docs_url": "https://fal.ai/docs"},
        "fal.ai": {"name": "fal", "docs_url": "https://fal.ai/docs"},
        "fal ai": {"name": "fal", "docs_url": "https://fal.ai/docs"},
        # Cloudflare
        "cloudflare": {"name": "cloudflare", "docs_url": "https://developers.cloudflare.com"},
        "cloudflare waf": {
            "name": "cloudflare",
            "docs_url": "https://developers.cloudflare.com/waf",
        },
        # AWS
        "aws": {"name": "aws", "docs_url": "https://docs.aws.amazon.com"},
        "amazon": {"name": "aws", "docs_url": "https://docs.aws.amazon.com"},
        # Stripe
        "stripe": {"name": "stripe", "docs_url": "https://docs.stripe.com/api"},
        # Twilio
        "twilio": {"name": "twilio", "docs_url": "https://www.twilio.com/docs/messaging"},
        "twilio sms": {"name": "twilio", "docs_url": "https://www.twilio.com/docs/sms"},
        # SendGrid
        "sendgrid": {"name": "sendgrid", "docs_url": "https://www.twilio.com/docs/sendgrid"},
        # Plaid
        "plaid": {"name": "plaid", "docs_url": "https://plaid.com/docs"},
        # Vercel (single entry; a duplicate key was removed)
        "vercel": {"name": "vercel", "docs_url": "https://vercel.com/docs"},
        # Spartan (Angular UI)
        "spartan": {"name": "spartan", "docs_url": "https://www.spartan.ng/documentation"},
        "spartan ui": {"name": "spartan", "docs_url": "https://www.spartan.ng/documentation"},
        # Mureka
        "mureka": {"name": "mureka", "docs_url": "https://docs.mureka.ai"},
        # Replicate
        "replicate": {"name": "replicate", "docs_url": "https://replicate.com/docs"},
        # Hugging Face
        "huggingface": {"name": "huggingface", "docs_url": "https://huggingface.co/docs"},
        "hugging face": {"name": "huggingface", "docs_url": "https://huggingface.co/docs"},
        "hf": {"name": "huggingface", "docs_url": "https://huggingface.co/docs"},
        # Supabase
        "supabase": {"name": "supabase", "docs_url": "https://supabase.com/docs"},
        # Firebase
        "firebase": {"name": "firebase", "docs_url": "https://firebase.google.com/docs"},
        # Netlify
        "netlify": {"name": "netlify", "docs_url": "https://docs.netlify.com"},
        # Discord
        "discord": {"name": "discord", "docs_url": "https://discord.com/developers/docs"},
        "discord api": {"name": "discord", "docs_url": "https://discord.com/developers/docs"},
    }
def __init__(self):
    # One shared async client for all probes. Redirects are followed so
    # docs hosts that 30x to a landing page still validate; a browser-like
    # UA avoids trivial bot rejections.
    default_headers = {"User-Agent": "Mozilla/5.0 (compatible; API-Docs-Explorer/1.0)"}
    self.http_client = httpx.AsyncClient(
        headers=default_headers,
        follow_redirects=True,
        timeout=10.0,
    )
def normalize_api_name(self, api_name: str) -> tuple[str, str | None]:
"""
Normalize API name using aliases.
Returns:
Tuple of (normalized_name, known_docs_url or None)
"""
api_lower = api_name.lower().strip()
# Check for exact match in aliases
if api_lower in self.API_ALIASES:
alias = self.API_ALIASES[api_lower]
if isinstance(alias, str):
return alias, None
elif isinstance(alias, dict):
return alias.get("name", api_lower), alias.get("docs_url")
# Check for partial matches (e.g., "Meta Graph API" contains "meta graph")
for key, alias in self.API_ALIASES.items():
if key in api_lower or api_lower in key:
if isinstance(alias, str):
return alias, None
elif isinstance(alias, dict):
return alias.get("name", api_lower), alias.get("docs_url")
# Clean up common suffixes
cleaned = api_lower
for suffix in [" api", " sdk", " docs", " documentation"]:
if cleaned.endswith(suffix):
cleaned = cleaned[: -len(suffix)].strip()
# Remove special characters and spaces for URL generation
url_safe = re.sub(r"[^a-z0-9]", "", cleaned)
return url_safe, None
async def find_docs_url(self, api_name: str) -> str | None:
    """
    Dynamically find the official documentation URL for an API.

    Tries, in order: a known alias URL from normalize_api_name, then
    each template in DOC_PATTERNS. Returns None when nothing responds,
    letting the caller fall back to a web search.
    """
    normalized, known_url = self.normalize_api_name(api_name)

    # A curated alias URL wins, but only if it is actually reachable.
    if known_url and await self._is_valid_docs_site(known_url):
        return known_url

    # Probe the generic URL templates in priority order.
    for template in self.DOC_PATTERNS:
        candidate = template.format(api=normalized)
        if await self._is_valid_docs_site(candidate):
            return candidate

    # Nothing answered; the caller should fall back to searching.
    return None
def get_search_terms(self, api_name: str) -> list[str]:
"""
Get alternative search terms for an API.
Returns multiple variations to try when searching.
"""
api_lower = api_name.lower().strip()
terms = [api_name] # Original
# Add normalized version
normalized, _ = self.normalize_api_name(api_name)
if normalized != api_lower:
terms.append(normalized)
# Add common variations
if "api" not in api_lower:
terms.append(f"{api_name} API")
# Check aliases for the canonical name
if api_lower in self.API_ALIASES:
alias = self.API_ALIASES[api_lower]
if isinstance(alias, str):
terms.append(alias)
elif isinstance(alias, dict) and "name" in alias:
terms.append(alias["name"])
return list(dict.fromkeys(terms)) # Deduplicate while preserving order
async def _is_valid_docs_site(self, url: str) -> bool:
    """Return True when *url* answers a HEAD request with HTTP 200."""
    try:
        response = await self.http_client.head(url, timeout=5.0)
    except Exception:
        # DNS failures, timeouts, TLS errors, etc. all mean "not usable".
        return False
    # A plain 200 is treated as good enough; deeper content inspection
    # is deliberately skipped to keep probing cheap.
    return response.status_code == 200
def get_docs_domain(self, docs_url: str) -> str:
    """Return the host portion of *docs_url*, for site-restricted search."""
    return urlparse(docs_url).netloc
async def close(self):
    """Close the HTTP client.

    Call once when the detector is no longer needed to release the
    httpx connection pool.
    """
    await self.http_client.aclose()
class APIDocsExtractor:
    """Extract and format API documentation content.

    Stateless helpers that parse raw documentation text (markdown-ish)
    into overview, parameters, examples, notes, and links, plus a
    renderer that turns an APIDocumentation record into readable text.
    """
def extract_overview(self, content: str) -> str:
    """Pull a short overview/description paragraph out of documentation text."""
    # Candidate regexes, most specific first: explicit Overview-style
    # headings, "Overview:"/"Description:" labels, then any substantial
    # run of capitalized sentences.
    candidate_patterns = (
        r"(?:^|\n)#{1,3}\s*(?:Overview|Description|About|Introduction)\s*\n(.*?)(?:\n#{1,3}|\Z)",
        r"(?:^|\n)(?:Overview|Description):\s*(.*?)(?:\n\n|\Z)",
        r"(?:^|\n)([A-Z][^.\n]{50,}\.(?:\s+[A-Z][^.\n]+\.){0,3})",
    )
    for candidate in candidate_patterns:
        found = re.search(candidate, content, re.DOTALL | re.MULTILINE)
        if not found:
            continue
        # Collapse internal whitespace and cap the length.
        text = re.sub(r"\s+", " ", found.group(1).strip())
        if len(text) > 500:
            text = text[:500] + "..."
        return text

    # Fallback: the first non-heading line of reasonable length.
    for raw_line in content.strip().split("\n"):
        stripped = raw_line.strip()
        if len(stripped) > 50 and not stripped.startswith("#"):
            return stripped[:500]
    return "No overview available."
def extract_parameters(self, content: str) -> list[dict]:
    """Parse "name (type) - description" style parameter listings."""
    # Bullet-style entries: optional */- marker, optionally backticked
    # name, parenthesised type info, a dash/colon separator, then the
    # description up to the next entry, a blank line, or end of text.
    entry_re = r"[\*\-]?\s*`?(\w+)`?\s*\(([^)]+)\)\s*[-–—:]\s*(.+?)(?=\n[\*\-]?\s*`?\w+`?\s*\(|\n\n|\Z)"
    results: list[dict] = []
    for entry in re.finditer(entry_re, content, re.DOTALL):
        type_info = entry.group(2).strip()
        results.append(
            {
                "name": entry.group(1),
                "type": type_info,
                # "required" anywhere in the type blurb marks it mandatory.
                "required": "required" in type_info.lower(),
                # Collapse whitespace, then cap the description length.
                "description": re.sub(r"\s+", " ", entry.group(3).strip())[:300],
            }
        )
    return results
def extract_examples(self, content: str) -> list[dict]:
    """Collect fenced code blocks (with a language tag) as examples."""
    fence_re = r"```(\w+)\n(.*?)```"
    collected: list[dict] = []
    for fence in re.finditer(fence_re, content, re.DOTALL):
        snippet = fence.group(2).strip()
        # Tiny fragments are usually inline snippets, not real examples.
        if len(snippet) > 20:
            collected.append({"language": fence.group(1), "code": snippet})
        if len(collected) == 10:  # cap at 10 examples
            break
    return collected
def extract_notes(self, content: str) -> list[str]:
    """Gather callout-style notes, warnings, tips, and blockquotes."""
    callout_patterns = (
        r"(?:⚠️|⚡|💡|📝|Note|Warning|Important|Tip):\s*(.+?)(?:\n\n|\Z)",
        r"> (.+?)(?:\n\n|\Z)",  # Blockquotes
    )
    collected: list[str] = []
    for callout in callout_patterns:
        for found in re.finditer(callout, content, re.DOTALL):
            text = re.sub(r"\s+", " ", found.group(1).strip())
            # Keep only substantive, reasonably sized notes.
            if 30 < len(text) < 500:
                collected.append(text)
    return collected[:5]  # Limit to 5 notes
def extract_links(self, content: str, base_url: str) -> list[dict]:
    """Pull documentation-related markdown links, absolutized and deduped."""
    doc_keywords = ("api", "docs", "guide", "reference", "tutorial", "see")
    seen_urls: set[str] = set()
    results: list[dict] = []
    for found in re.finditer(r"\[([^\]]+)\]\(([^)]+)\)", content):
        title = found.group(1)
        target = found.group(2)
        # Keep only links whose title suggests documentation, not random
        # external links.
        lowered = title.lower()
        if not any(word in lowered for word in doc_keywords):
            continue
        # Resolve site-relative paths against the docs base URL.
        if target.startswith("/"):
            base = urlparse(base_url)
            target = f"{base.scheme}://{base.netloc}{target}"
        # De-duplicate by URL, keeping the first occurrence.
        if target not in seen_urls:
            seen_urls.add(target)
            results.append({"title": title, "url": target})
    return results[:10]  # Limit to 10 links
def format_documentation(self, doc: APIDocumentation) -> str:
    """Format extracted documentation into readable text.

    Renders *doc* as a sectioned plain-text report: header, overview,
    docs URL, parameters, code examples, notes, related links, and
    source URLs. Sections whose backing field is empty are omitted.
    """
    # Report header: title line, heavy rule, blank spacer.
    lines = [
        f"API Documentation: {doc.api_name.title()} - {doc.topic}",
        "═" * 70,
        "",
    ]
    # Overview
    if doc.overview:
        lines.extend(
            [
                "📖 Overview:",
                f" {doc.overview}",
                "",
            ]
        )
    # Main documentation URL (always present)
    lines.extend(
        [
            f"📚 Documentation: {doc.docs_url}",
            "",
        ]
    )
    # Parameters — one "name (type, required/optional)" line plus an
    # indented description per entry.
    if doc.parameters:
        lines.extend(
            [
                "📋 Parameters:",
                "─" * 70,
                "",
            ]
        )
        for param in doc.parameters:
            req_marker = "required" if param["required"] else "optional"
            lines.append(f" {param['name']} ({param['type']}, {req_marker})")
            lines.append(f" {param['description']}")
            lines.append("")
    # Code Examples — numbered, re-fenced with the original language tag.
    if doc.examples:
        lines.extend(
            [
                "💡 Code Examples:",
                "─" * 70,
                "",
            ]
        )
        for i, example in enumerate(doc.examples, 1):
            lines.append(f" Example {i} ({example['language']}):")
            lines.append(f" ```{example['language']}")
            # Indent code line by line so multi-line snippets stay aligned.
            for code_line in example["code"].split("\n"):
                lines.append(f" {code_line}")
            lines.append(" ```")
            lines.append("")
    # Important Notes — bulleted.
    if doc.notes:
        lines.extend(
            [
                "⚠️ Important Notes:",
                "─" * 70,
                "",
            ]
        )
        for note in doc.notes:
            lines.append(f" • {note}")
            lines.append("")
    # Related Links — bulleted title with the URL on the following line.
    if doc.related_links:
        lines.extend(
            [
                "🔗 Related Documentation:",
                "─" * 70,
                "",
            ]
        )
        for link in doc.related_links:
            lines.append(f" • {link['title']}")
            lines.append(f" {link['url']}")
            lines.append("")
    # Source URLs — bulleted, no trailing blank line.
    if doc.source_urls:
        lines.extend(
            [
                "📄 Sources:",
                "─" * 70,
                "",
            ]
        )
        for url in doc.source_urls:
            lines.append(f" • {url}")
    return "\n".join(lines)