check_ai_bot_access

Audits a domain's robots.txt and root URL to report which AI bots are allowed or blocked, including Cloudflare AI-bot-default warnings.

Instructions

Check whether AI bots can read this site.

Fetches /robots.txt and the root URL. Reports per-bot allow/disallow plus Cloudflare AI-bot-default warning signals.

Input Schema

Table | JSON Schema

Name	Required	Description	Default
`domain`	Yes	e.g. `example.com` or `https://example.com`

Output Schema

Table | JSON Schema

Name	Required	Description	Default
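No output schema is declared. The handler returns a JSON object with `domain`, `robots_txt`, `bots`, `summary`, `cloudflare`, and `warnings` (or only `domain` and `error` when robots.txt cannot be fetched).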

Implementation Reference

src/ai_visibility_mcp/server.py:124-210 (handler)

The core implementation of the check_ai_bot_access tool. Fetches robots.txt and root URL, evaluates per-bot access using KNOWN_AI_BOTS and robots.txt parsing, detects Cloudflare bot-challenge signals, and returns structured JSON with domain, robots_txt info, per-bot verdicts, summary stats, Cloudflare signals, and warnings.

async def check_ai_bot_access_impl(domain: str) -> dict[str, Any]:
    """Audit which known AI bots may crawl the root of ``domain``.

    Fetches ``/robots.txt`` and the root URL of the normalized domain,
    evaluates every entry of ``KNOWN_AI_BOTS`` against the parsed robots.txt
    records for path ``/``, and inspects the root response for Cloudflare
    bot-challenge signals.

    Args:
        domain: Bare host or URL, e.g. ``example.com`` or
            ``https://example.com``.

    Returns:
        On success, a dict with the keys ``domain``, ``robots_txt``, ``bots``,
        ``summary``, ``cloudflare`` and ``warnings``. If the robots.txt request
        is refused (``SSRFBlocked``) or fails (``httpx.HTTPError``), a dict
        with only ``domain`` and ``error``. A failure to fetch the root URL is
        not fatal; it is recorded in ``warnings``.
    """
    base = normalize_domain(domain)
    robots_url = f"{base}/robots.txt"

    warnings: list[str] = []
    try:
        rstatus, _, rbody = await _fetch(robots_url)
    except SSRFBlocked as exc:
        return {
            "domain": base,
            "error": f"refused: {exc}",
        }
    except httpx.HTTPError as exc:
        return {
            "domain": base,
            "error": f"failed to fetch robots.txt: {exc.__class__.__name__}: {exc}",
        }

    records: list = []
    if rstatus == 200:
        records = parse(rbody)
    elif rstatus == 404:
        warnings.append("no robots.txt — all bots implicitly allowed")
    else:
        warnings.append(f"robots.txt returned HTTP {rstatus} — treating as unspecified")

    try:
        root_status, root_headers, root_body = await _fetch(base + "/")
    except (httpx.HTTPError, SSRFBlocked) as exc:
        # Best effort: fall back to neutral values so the robots.txt verdicts
        # are still reported.
        root_status, root_headers, root_body = 0, {}, ""
        warnings.append(f"failed to fetch root: {exc.__class__.__name__}")

    cf = _cloudflare_signals(root_headers, root_status, root_body)
    if cf["likely_bot_challenge"]:
        warnings.append(
            "Cloudflare bot-challenge detected at root — AI bots without JS will be blocked "
            "even if robots.txt allows them. Check Cloudflare → Security → Bots → AI Scrapers."
        )

    # Normalise the robots.txt user-agent tokens once, the same way
    # access_for() does (strip + lowercase), so rule_source agrees with the
    # record that actually produced the verdict.
    ua_tokens = [ua.strip().lower() for rec in records for ua in rec.user_agents]
    has_wildcard = "*" in ua_tokens
    named_tokens = [token for token in ua_tokens if token and token != "*"]

    bots_report: list[dict[str, Any]] = []
    for bot in KNOWN_AI_BOTS:
        if records:
            verdict = access_for(records, bot.ua, "/")
        else:
            # No parsed rules: a missing robots.txt (404) allows everything;
            # any other outcome leaves the verdict unknown.
            verdict = "allowed" if rstatus == 404 else "unspecified"

        # access_for() treats a record as specific when its token is a
        # substring of the bot's UA, so the same direction is used here.
        bot_ua = bot.ua.lower()
        if any(token in bot_ua for token in named_tokens):
            rule_source = "explicit"
        elif has_wildcard:
            rule_source = "wildcard"
        else:
            # Neither a named group nor a `*` group applies to this bot.
            rule_source = "default"

        bots_report.append({
            "user_agent": bot.ua,
            "vendor": bot.vendor,
            "purpose": bot.purpose,
            "verdict": verdict,
            "rule_source": rule_source,
        })

    allowed = sum(1 for b in bots_report if b["verdict"] == "allowed")
    disallowed = sum(1 for b in bots_report if b["verdict"] == "disallowed")

    # Guard against an empty bot list, where 0 >= 0 would emit a "0/0" warning.
    if bots_report and disallowed >= len(bots_report) * 0.75:
        warnings.append(
            f"{disallowed}/{len(bots_report)} known AI bots are disallowed — "
            "site is largely invisible to AI search."
        )

    return {
        "domain": base,
        "robots_txt": {
            "url": robots_url,
            "status": rstatus,
            # NOTE(review): len() of the decoded body counts characters, not
            # bytes — confirm whether _fetch can expose the raw byte length.
            "size_bytes": len(rbody) if rstatus == 200 else 0,
            "records_parsed": len(records),
        },
        "bots": bots_report,
        "summary": {
            "total": len(bots_report),
            "allowed": allowed,
            "disallowed": disallowed,
            "unspecified": len(bots_report) - allowed - disallowed,
        },
        "cloudflare": cf,
        "warnings": warnings,
    }

src/ai_visibility_mcp/server.py:213-227 (registration)

The @mcp.tool() decorator registration that exposes 'check_ai_bot_access' as an MCP tool. Includes the docstring and delegates to check_ai_bot_access_impl.

@mcp.tool()
async def check_ai_bot_access(domain: str) -> dict[str, Any]:
    """Check whether AI bots can read this site.

    Fetches `/robots.txt` and the root URL. Reports per-bot allow/disallow
    plus Cloudflare AI-bot-default warning signals.

    Args:
        domain: e.g. `example.com` or `https://example.com`

    Returns:
        JSON with `domain`, `robots_txt`, `bots` (list of per-bot verdicts),
        `summary` (total/allowed/disallowed/unspecified counts),
        `cloudflare`, and `warnings`. If `/robots.txt` cannot be fetched,
        JSON with only `domain` and `error`.
    """
    # Thin MCP entry point: all of the work happens in the implementation.
    return await check_ai_bot_access_impl(domain)

src/ai_visibility_mcp/bots.py:1-36 (helper)

Defines the Bot dataclass and the KNOWN_AI_BOTS tuple of Bot records (user-agent token, vendor, and purpose for each known AI bot), which check_ai_bot_access_impl iterates over to evaluate each bot's access.

"""Known AI bot user-agents as of 2026-05."""

from dataclasses import dataclass


@dataclass(frozen=True)
class Bot:
    """Immutable description of one known AI bot."""

    # User-agent token for the bot, e.g. "GPTBot".
    ua: str
    # Organisation the bot is attributed to, e.g. "OpenAI".
    vendor: str
    # Short free-form tag for what the bot is used for, e.g. "training".
    purpose: str


# Registry of AI bots to audit. Each entry is Bot(ua, vendor, purpose).
# NOTE(review): the purpose tags look like an informal vocabulary
# ("training", "user-fetch", "search-index", ...) — confirm whether any
# consumer depends on the exact strings before renaming them.
KNOWN_AI_BOTS: tuple[Bot, ...] = (
    Bot("GPTBot", "OpenAI", "training"),
    Bot("ChatGPT-User", "OpenAI", "user-fetch"),
    Bot("OAI-SearchBot", "OpenAI", "search-index"),
    Bot("ClaudeBot", "Anthropic", "training"),
    Bot("Claude-User", "Anthropic", "user-fetch"),
    Bot("Claude-SearchBot", "Anthropic", "search-index"),
    Bot("anthropic-ai", "Anthropic", "legacy"),
    Bot("PerplexityBot", "Perplexity", "search-index"),
    Bot("Perplexity-User", "Perplexity", "user-fetch"),
    Bot("Google-Extended", "Google", "gemini-training"),
    Bot("GoogleOther", "Google", "research"),
    Bot("Applebot-Extended", "Apple", "training"),
    Bot("Bytespider", "ByteDance", "training"),
    Bot("CCBot", "Common Crawl", "open-dataset"),
    Bot("Meta-ExternalAgent", "Meta", "training"),
    Bot("FacebookBot", "Meta", "user-fetch"),
    Bot("Amazonbot", "Amazon", "alexa-llm"),
    Bot("DuckAssistBot", "DuckDuckGo", "assist"),
    Bot("cohere-ai", "Cohere", "training"),
    Bot("Diffbot", "Diffbot", "knowledge-graph"),
    Bot("YouBot", "You.com", "search-index"),
    Bot("MistralAI-User", "Mistral", "user-fetch"),
)

src/ai_visibility_mcp/robots.py:73-123 (helper)

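The access_for function used by check_ai_bot_access_impl to determine allow/disallow/unspecified for each bot against the parsed robots.txt records. The snippet also includes normalize_domain, which turns the user-supplied domain into an https:// base URL.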

def access_for(records: list[Record], user_agent: str, url_path: str = "/") -> str:
    """Return the robots.txt verdict for ``user_agent`` on ``url_path``.

    Resolution: pick the record whose UA token matches ``user_agent`` most
    specifically, i.e. the longest token that is a case-insensitive substring
    of ``user_agent``. If no specific token matches, fall back to the ``*``
    wildcard record. Within the chosen record, the longest matching pattern
    wins; ties go to Allow.

    Args:
        records: Parsed robots.txt records; each exposes ``user_agents``,
            ``allows`` and ``disallows``.
        user_agent: Bot user-agent token to evaluate, e.g. ``"GPTBot"``.
        url_path: Path to test against the record's patterns.

    Returns:
        ``'allowed'``, ``'disallowed'`` or ``'unspecified'``. The last is
        returned when no record applies or the chosen record has no rules.
    """
    ua_lower = user_agent.lower()
    specific: Record | None = None
    specific_len = 0
    wildcard: Record | None = None

    for rec in records:
        for ua in rec.user_agents:
            ua_norm = ua.strip().lower()
            if ua_norm == "*":
                wildcard = rec
            elif ua_norm and ua_norm in ua_lower and len(ua_norm) >= specific_len:
                # Longest token is the most specific match. `>=` keeps the
                # later record on equal length, as before for repeated tokens.
                specific = rec
                specific_len = len(ua_norm)

    # Compare against None explicitly so the choice never depends on how a
    # record object evaluates in a boolean context.
    chosen = specific if specific is not None else wildcard
    if chosen is None:
        return "unspecified"
    if not chosen.allows and not chosen.disallows:
        return "unspecified"

    best_len = -1
    best_verdict = "allowed"
    for pattern in chosen.disallows:
        if pattern == "":
            # An empty Disallow value blocks nothing.
            continue
        if _path_matches(pattern, url_path) and len(pattern) > best_len:
            best_len = len(pattern)
            best_verdict = "disallowed"
    for pattern in chosen.allows:
        # `>=` lets an Allow of equal length override a Disallow.
        if _path_matches(pattern, url_path) and len(pattern) >= best_len:
            best_len = len(pattern)
            best_verdict = "allowed"

    # If no pattern matched, best_verdict is still the initial "allowed".
    return best_verdict


def normalize_domain(domain: str) -> str:
    """Reduce a user-supplied domain or URL to an ``https://host`` base URL.

    Surrounding whitespace is ignored. Any scheme in the input is replaced by
    ``https``, and the path, query and fragment are dropped. The host keeps
    whatever port the input carried.

    Args:
        domain: e.g. ``example.com`` or ``https://example.com/some/page``.

    Returns:
        The base URL without a trailing slash, e.g. ``https://example.com``.
    """
    # Whitespace would otherwise end up inside the netloc once the scheme is
    # prepended, producing an unusable URL.
    domain = domain.strip()
    if "://" not in domain:
        domain = "https://" + domain
    parts = urlsplit(domain)
    # Fall back to the path for inputs that parse with an empty netloc.
    host = parts.netloc or parts.path
    return f"https://{host}"

src/ai_visibility_mcp/server.py:105-122 (helper)

The _cloudflare_signals helper function used by check_ai_bot_access_impl to detect Cloudflare presence and bot-challenge state from root page response headers/body.

def _cloudflare_signals(headers: dict[str, str], status: int, body: str) -> dict[str, Any]:
    """Summarise Cloudflare presence and bot-challenge hints for a response.

    Cloudflare counts as detected when the ``Server`` header mentions
    "cloudflare" or a ``CF-Ray`` header is present (header names are compared
    case-insensitively). A challenge is suspected when ``CF-Mitigated`` is
    "challenge" or "block", when the status is 403, or when the first 2000
    characters of the body contain a known interstitial phrase. A challenge is
    only reported as likely when Cloudflare was detected as well.

    Returns:
        Dict with ``cloudflare_detected``, ``cf_ray``, ``cf_mitigated`` and
        ``likely_bot_challenge``.
    """
    lowered = {name.lower(): value for name, value in headers.items()}
    server_banner = lowered.get("server", "").lower()
    mitigation = lowered.get("cf-mitigated", "").lower()

    behind_cloudflare = "cloudflare" in server_banner or "cf-ray" in lowered

    if mitigation in ("challenge", "block") or status == 403:
        looks_challenged = True
    else:
        # Only the start of the page is scanned for interstitial wording.
        preview = body[:2000].lower()
        looks_challenged = "just a moment" in preview or "attention required" in preview

    report: dict[str, Any] = {}
    report["cloudflare_detected"] = behind_cloudflare
    report["cf_ray"] = lowered.get("cf-ray")
    report["cf_mitigated"] = mitigation if mitigation else None
    report["likely_bot_challenge"] = bool(behind_cloudflare and looks_challenged)
    return report

ai-visibility-mcp

check_ai_bot_access

Instructions

Input Schema

Output Schema

Implementation Reference

Tool Definition Quality

Other Tools

Latest Blog Posts

MCP directory API