check_ai_bot_access
Audits a domain's robots.txt and root URL to report which AI bots are allowed or blocked, including Cloudflare AI-bot-default warnings.
Instructions
Check whether AI bots can read this site.
Fetches /robots.txt and the root URL. Reports per-bot allow/disallow
plus Cloudflare AI-bot-default warning signals.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| domain | Yes | e.g. `example.com` or `https://example.com` | |
Output Schema
| Name | Required | Description | Default |
|---|---|---|---|
| No arguments | | | |
Implementation Reference
- src/ai_visibility_mcp/server.py:124-210 (handler) The core implementation of the check_ai_bot_access tool. Fetches robots.txt and the root URL, evaluates per-bot access using KNOWN_AI_BOTS and robots.txt parsing, detects Cloudflare bot-challenge signals, and returns structured JSON with domain, robots_txt info, per-bot verdicts, summary stats, Cloudflare signals, and warnings.
def _rule_source(records: list, bot_ua: str) -> str:
    """Report which kind of robots.txt record governs ``bot_ua``.

    Uses the same matching rule as ``access_for`` (a record's User-agent
    token must be a case-insensitive substring of the bot's UA) so the
    reported source never contradicts the verdict.

    Returns:
        ``"explicit"`` if a non-wildcard record names the bot,
        ``"wildcard"`` if only a ``*`` record applies,
        ``"default"`` if no record applies at all.
    """
    ua_lower = bot_ua.lower()
    has_wildcard = False
    for rec in records:
        for ua in rec.user_agents:
            ua_norm = ua.strip().lower()
            if ua_norm == "*":
                has_wildcard = True
            elif ua_norm and ua_norm in ua_lower:
                return "explicit"
    return "wildcard" if has_wildcard else "default"


async def check_ai_bot_access_impl(domain: str) -> dict[str, Any]:
    """Audit which known AI bots may crawl a site's root URL.

    Fetches ``/robots.txt`` and ``/`` for the normalized domain, evaluates
    every entry of ``KNOWN_AI_BOTS`` against the parsed robots.txt records,
    and inspects the root response for Cloudflare bot-challenge signals.

    Args:
        domain: Bare host or URL, e.g. ``example.com`` or
            ``https://example.com``.

    Returns:
        On success, a dict with ``domain``, ``robots_txt``, ``bots``,
        ``summary``, ``cloudflare`` and ``warnings``. If robots.txt cannot
        be fetched (``SSRFBlocked`` or ``httpx.HTTPError``), a dict holding
        only ``domain`` and ``error``.
    """
    base = normalize_domain(domain)
    robots_url = f"{base}/robots.txt"
    warnings: list[str] = []

    try:
        # Response headers of robots.txt are not used by this audit.
        rstatus, _, rbody = await _fetch(robots_url)
    except SSRFBlocked as exc:
        return {
            "domain": base,
            "error": f"refused: {exc}",
        }
    except httpx.HTTPError as exc:
        return {
            "domain": base,
            "error": f"failed to fetch robots.txt: {exc.__class__.__name__}: {exc}",
        }

    records: list = []
    if rstatus == 200:
        records = parse(rbody)
    elif rstatus == 404:
        warnings.append("no robots.txt — all bots implicitly allowed")
    else:
        warnings.append(f"robots.txt returned HTTP {rstatus} — treating as unspecified")

    # A failed root fetch is non-fatal: report it and continue with an
    # empty response so the robots.txt findings are still returned.
    try:
        root_status, root_headers, root_body = await _fetch(base + "/")
    except (httpx.HTTPError, SSRFBlocked) as exc:
        root_status, root_headers, root_body = 0, {}, ""
        warnings.append(f"failed to fetch root: {exc.__class__.__name__}")

    cf = _cloudflare_signals(root_headers, root_status, root_body)
    if cf["likely_bot_challenge"]:
        warnings.append(
            "Cloudflare bot-challenge detected at root — AI bots without JS will be blocked "
            "even if robots.txt allows them. Check Cloudflare → Security → Bots → AI Scrapers."
        )

    # Verdict used when there are no parsed records to consult.
    fallback_verdict = "allowed" if rstatus == 404 else "unspecified"

    bots_report: list[dict[str, Any]] = []
    for bot in KNOWN_AI_BOTS:
        verdict = access_for(records, bot.ua, "/") if records else fallback_verdict
        bots_report.append({
            "user_agent": bot.ua,
            "vendor": bot.vendor,
            "purpose": bot.purpose,
            "verdict": verdict,
            "rule_source": _rule_source(records, bot.ua),
        })

    allowed = sum(1 for b in bots_report if b["verdict"] == "allowed")
    disallowed = sum(1 for b in bots_report if b["verdict"] == "disallowed")
    # Guard against an empty bot list, where 0 >= 0 would emit a "0/0" warning.
    if bots_report and disallowed >= len(bots_report) * 0.75:
        warnings.append(
            f"{disallowed}/{len(bots_report)} known AI bots are disallowed — "
            "site is largely invisible to AI search."
        )

    return {
        "domain": base,
        "robots_txt": {
            "url": robots_url,
            "status": rstatus,
            # NOTE(review): this is len() of the fetched body; if _fetch
            # returns str it counts characters, not bytes — confirm.
            "size_bytes": len(rbody) if rstatus == 200 else 0,
            "records_parsed": len(records),
        },
        "bots": bots_report,
        "summary": {
            "total": len(bots_report),
            "allowed": allowed,
            "disallowed": disallowed,
            "unspecified": len(bots_report) - allowed - disallowed,
        },
        "cloudflare": cf,
        "warnings": warnings,
    }
# - src/ai_visibility_mcp/server.py:213-227 (registration) The @mcp.tool()
#   decorator registration that exposes 'check_ai_bot_access' as an MCP tool.
#   Includes the docstring and delegates to check_ai_bot_access_impl.
@mcp.tool()
async def check_ai_bot_access(domain: str) -> dict[str, Any]:
    """Check whether AI bots can read this site.

    Fetches `/robots.txt` and the root URL. Reports per-bot allow/disallow
    plus Cloudflare AI-bot-default warning signals.

    Args:
        domain: e.g. `example.com` or `https://example.com`

    Returns:
        JSON with `domain`, `robots_txt`, `bots` (list of per-bot verdicts),
        `summary` (total/allowed/disallowed/unspecified counts),
        `cloudflare`, and `warnings`. If robots.txt cannot be fetched, only
        `domain` and `error` are returned.
    """
    # Thin MCP-facing wrapper: all logic lives in check_ai_bot_access_impl.
    return await check_ai_bot_access_impl(domain)
# - src/ai_visibility_mcp/bots.py:1-36 (helper) Defines the KNOWN_AI_BOTS tuple
#   (list of known AI bot User-Agent strings grouped by vendor and purpose)
#   used by check_ai_bot_access_impl to iterate over and evaluate each bot's
#   access.
"""Known AI bot user-agents as of 2026-05.""" from dataclasses import dataclass @dataclass(frozen=True) class Bot: ua: str vendor: str purpose: str KNOWN_AI_BOTS: tuple[Bot, ...] = ( Bot("GPTBot", "OpenAI", "training"), Bot("ChatGPT-User", "OpenAI", "user-fetch"), Bot("OAI-SearchBot", "OpenAI", "search-index"), Bot("ClaudeBot", "Anthropic", "training"), Bot("Claude-User", "Anthropic", "user-fetch"), Bot("Claude-SearchBot", "Anthropic", "search-index"), Bot("anthropic-ai", "Anthropic", "legacy"), Bot("PerplexityBot", "Perplexity", "search-index"), Bot("Perplexity-User", "Perplexity", "user-fetch"), Bot("Google-Extended", "Google", "gemini-training"), Bot("GoogleOther", "Google", "research"), Bot("Applebot-Extended", "Apple", "training"), Bot("Bytespider", "ByteDance", "training"), Bot("CCBot", "Common Crawl", "open-dataset"), Bot("Meta-ExternalAgent", "Meta", "training"), Bot("FacebookBot", "Meta", "user-fetch"), Bot("Amazonbot", "Amazon", "alexa-llm"), Bot("DuckAssistBot", "DuckDuckGo", "assist"), Bot("cohere-ai", "Cohere", "training"), Bot("Diffbot", "Diffbot", "knowledge-graph"), Bot("YouBot", "You.com", "search-index"), Bot("MistralAI-User", "Mistral", "user-fetch"), ) - The access_for function used by check_ai_bot_access_impl to determine allow/disallow/unspecified for each bot against the parsed robots.txt records.
def access_for(records: list[Record], user_agent: str, url_path: str = "/") -> str:
    """Returns 'allowed' | 'disallowed' | 'unspecified'.

    Resolution: pick the record whose UA token matches `user_agent` most
    specifically, i.e. the longest token that is a case-insensitive
    substring of `user_agent`. Among equally long tokens the later record
    wins. If no specific match, fall back to the (last) `*` wildcard record.
    Within the chosen record, longest matching pattern wins; ties go to
    Allow.

    Args:
        records: Parsed robots.txt records exposing `user_agents`, `allows`
            and `disallows`.
        user_agent: The crawler's User-Agent token, e.g. `GPTBot`.
        url_path: Path to test against the record's patterns.

    Returns:
        'unspecified' when no record applies or the chosen record has no
        rules; otherwise 'allowed' or 'disallowed'.
    """
    ua_lower = user_agent.lower()
    specific: Record | None = None
    specific_len = 0
    wildcard: Record | None = None
    for rec in records:
        for ua in rec.user_agents:
            ua_norm = ua.strip().lower()
            if ua_norm == "*":
                wildcard = rec
            elif ua_norm and ua_norm in ua_lower and len(ua_norm) >= specific_len:
                # Longest token wins, so a generic "Claude" record cannot
                # shadow a dedicated "ClaudeBot" record by appearing later.
                specific = rec
                specific_len = len(ua_norm)

    chosen = specific if specific is not None else wildcard
    if chosen is None:
        return "unspecified"
    if not chosen.allows and not chosen.disallows:
        return "unspecified"

    best_len = -1
    best_verdict = "allowed"
    for pattern in chosen.disallows:
        # An empty Disallow value blocks nothing, so it never matches.
        if pattern == "":
            continue
        if _path_matches(pattern, url_path) and len(pattern) > best_len:
            best_len = len(pattern)
            best_verdict = "disallowed"
    for pattern in chosen.allows:
        # ">=" lets an Allow of equal length override a Disallow (tie rule).
        if _path_matches(pattern, url_path) and len(pattern) >= best_len:
            best_len = len(pattern)
            best_verdict = "allowed"
    # When nothing matched, best_verdict still holds its "allowed" default.
    return best_verdict


def normalize_domain(domain: str) -> str:
    """Reduce a host or URL to an `https://<host>` base with no path.

    Surrounding whitespace is stripped first. Any scheme is replaced with
    `https`, and path, query and fragment are dropped.

    Args:
        domain: e.g. `example.com` or `https://example.com/some/page`.

    Returns:
        The base URL, e.g. `https://example.com`.
    """
    # Pasted input often carries stray spaces or newlines that would
    # otherwise end up inside the host part of the URL.
    domain = domain.strip()
    if "://" not in domain:
        domain = "https://" + domain
    parts = urlsplit(domain)
    host = parts.netloc or parts.path
    return f"https://{host}"
# - The _cloudflare_signals helper function used by check_ai_bot_access_impl
#   to detect Cloudflare presence and bot-challenge state from root page
#   response headers/body.
def _cloudflare_signals(headers: dict[str, str], status: int, body: str) -> dict[str, Any]: h = {k.lower(): v for k, v in headers.items()} server = h.get("server", "").lower() has_cf = "cloudflare" in server or "cf-ray" in h mitigated = h.get("cf-mitigated", "").lower() challenged = ( mitigated in ("challenge", "block") or status == 403 or "just a moment" in body[:2000].lower() or "attention required" in body[:2000].lower() ) return { "cloudflare_detected": has_cf, "cf_ray": h.get("cf-ray"), "cf_mitigated": mitigated or None, "likely_bot_challenge": bool(has_cf and challenged), }