audit_canonical
Audit a page's canonical link integrity, including self-reference, cross-domain mismatches, trailing-slash hygiene, and og:url consistency. Identifies duplicate-content issues.
Instructions
Audit a page's canonical link integrity: presence, self-reference, cross-domain mismatches, trailing-slash hygiene, and og:url consistency.
Read-only. One HTTP GET to fetch the HEAD section.
Deterministic, rule-based; no LLM.
When to use: a focused canonical-only audit (e.g. debugging a duplicate-content issue). For a full HEAD audit including OpenGraph, hreflang, noindex, title, use check_technical. For everything-on-a-page, use audit_page.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | Public URL whose canonical link tag and og:url consistency you want to audit. Must be a fully-qualified http(s) URL. The tool fetches the URL (following redirects) and inspects only the <head> section; the body is not parsed. | |
| respect_robots | No | If true (default), respect robots.txt before fetching. Set false only for auditing your own site where you've intentionally blocked crawlers. | true |
Implementation Reference
- src/tools/audit-canonical.ts:34-155 (handler) The main handler function `auditCanonical` that fetches the URL, parses the <head> section, and audits canonical link integrity: presence, self-reference, cross-domain mismatches, trailing-slash hygiene, and og:url consistency.
/**
 * Audit the canonical link of a single page.
 *
 * Fetches `input.url` through `politeFetch`, parses the response body with
 * `parseHead`, and checks:
 *  - presence of `<link rel="canonical">`,
 *  - whether the canonical self-references the fetched page,
 *  - whether it points at a different hostname,
 *  - trailing-slash consistency between page and canonical pathnames,
 *  - agreement between `og:url` and the canonical (hostname + pathname).
 *
 * All URL comparisons are made against `result.finalUrl` (the URL reported by
 * `politeFetch` for the document that was actually served), and relative
 * `href` / `og:url` values are resolved against that same URL so that a
 * redirect between `input.url` and the served page cannot skew the result.
 *
 * @param input - Validated tool input (`url`, `respect_robots`).
 * @param hostDelays - Optional per-host delay map forwarded to `politeFetch`.
 * @param robotsCache - Optional robots.txt cache forwarded to `politeFetch`.
 * @returns The audit flags plus the list of findings.
 */
export async function auditCanonical(
  input: AuditCanonicalInput,
  hostDelays?: HostDelayMap,
  robotsCache?: Map<string, string>
): Promise<CanonicalResult> {
  const result = await politeFetch(input.url, {
    respectRobots: input.respect_robots,
    hostDelays,
    robotsCache,
  });
  const head = parseHead(result.body);
  const findings: Finding[] = [];
  const finalUrl = result.finalUrl;

  let isSelfRef = false;
  let isCrossDomain = false;
  let trailingSlashConsistent = true;
  let canonicalOgUrlMatch = true;

  if (!head.canonical) {
    findings.push({
      severity: "warning",
      category: "technical",
      where: '<link rel="canonical">',
      message: "No canonical link element found.",
      fix: `Add <link rel="canonical" href="${finalUrl}"> to <head>.`,
      estimated_impact: "medium",
    });
  } else {
    try {
      const pageUrl = new URL(finalUrl);
      // Resolve relative hrefs against the served document's URL, not the
      // originally requested one: after a redirect the two can differ in host
      // or path, which would otherwise yield a false cross-domain / non-self
      // verdict.
      const canonUrl = new URL(head.canonical, finalUrl);

      // Self-reference ignores a single trailing slash on either pathname.
      isSelfRef =
        canonUrl.hostname === pageUrl.hostname &&
        canonUrl.pathname.replace(/\/$/, "") === pageUrl.pathname.replace(/\/$/, "");
      isCrossDomain = canonUrl.hostname !== pageUrl.hostname;

      // Trailing slash consistency (the bare root "/" is exempt on both sides)
      const pageHasSlash = pageUrl.pathname.endsWith("/");
      const canonHasSlash = canonUrl.pathname.endsWith("/");
      if (
        pageUrl.pathname !== "/" &&
        canonUrl.pathname !== "/" &&
        pageHasSlash !== canonHasSlash
      ) {
        trailingSlashConsistent = false;
        findings.push({
          severity: "warning",
          category: "technical",
          where: '<link rel="canonical">',
          message: `Trailing slash inconsistency: page "${pageUrl.pathname}" vs canonical "${canonUrl.pathname}".`,
          fix: "Ensure canonical href and page URL use the same trailing slash convention. Pick one and redirect all variants.",
          estimated_impact: "low",
        });
      }

      if (isCrossDomain) {
        findings.push({
          severity: "warning",
          category: "technical",
          where: '<link rel="canonical">',
          message: `Canonical points to a different domain: ${canonUrl.hostname}.`,
          fix: "Verify this is intentional (syndicated content). If not, update to a self-referencing canonical.",
          estimated_impact: "medium",
        });
      } else if (!isSelfRef) {
        findings.push({
          severity: "warning",
          category: "technical",
          where: '<link rel="canonical">',
          message: "Canonical does not self-reference the current page URL.",
          fix: `Update canonical to: <link rel="canonical" href="${finalUrl}">.`,
          estimated_impact: "medium",
        });
      }
    } catch {
      // `new URL` throws on unparseable input; surface it as a finding
      // instead of failing the whole audit.
      findings.push({
        severity: "warning",
        category: "technical",
        where: '<link rel="canonical">',
        message: `Canonical value "${head.canonical}" is not a valid URL.`,
        fix: "Replace with a valid absolute URL.",
        estimated_impact: "medium",
      });
    }

    // og:url vs canonical mismatch
    if (head.ogUrl && head.canonical) {
      try {
        // Same base as above so both values are resolved consistently.
        const canonUrl = new URL(head.canonical, finalUrl);
        const ogUrl = new URL(head.ogUrl, finalUrl);
        canonicalOgUrlMatch =
          canonUrl.hostname === ogUrl.hostname && canonUrl.pathname === ogUrl.pathname;
        if (!canonicalOgUrlMatch) {
          findings.push({
            severity: "warning",
            category: "technical",
            where: "og:url vs canonical",
            message: "og:url does not match canonical URL.",
            fix: `Set og:url to match the canonical URL: <meta property="og:url" content="${head.canonical}">.`,
            estimated_impact: "low",
          });
        }
      } catch {
        // ignore URL parse errors: an invalid canonical was already reported
        // above, and an unparseable og:url is outside this audit's scope.
      }
    }
  }

  return {
    url: input.url,
    final_url: finalUrl,
    canonical_value: head.canonical,
    is_self_referential: isSelfRef,
    is_cross_domain: isCrossDomain,
    trailing_slash_consistent: trailingSlashConsistent,
    canonical_og_url_match: canonicalOgUrlMatch,
    findings,
  };
}
- src/tools/audit-canonical.ts:9-19 (schema)Input validation schema `auditCanonicalInputSchema` using Zod: requires a URL string and optional respect_robots boolean.
/** `url` field: a string that must pass Zod's `.url()` validation. */
const auditCanonicalUrlField = z
  .string()
  .url()
  .describe(
    "Public URL whose canonical link tag and og:url consistency you want to audit. Must be a fully-qualified http(s) URL. The tool fetches the URL (following redirects) and inspects only the <head> section; the body is not parsed."
  );

/** `respect_robots` field: optional boolean that defaults to `true`. */
const auditCanonicalRespectRobotsField = z
  .boolean()
  .optional()
  .default(true)
  .describe(
    "If true (default), respect robots.txt before fetching. Set false only for auditing your own site where you've intentionally blocked crawlers."
  );

/**
 * Zod schema for the `audit_canonical` tool input: the page URL to audit and
 * an optional switch controlling whether robots.txt is honoured.
 */
export const auditCanonicalInputSchema = z.object({
  url: auditCanonicalUrlField,
  respect_robots: auditCanonicalRespectRobotsField,
});
- src/tools/audit-canonical.ts:23-32 (schema)Output interface `CanonicalResult` with fields for url, final_url, canonical_value, is_self_referential, is_cross_domain, trailing_slash_consistent, canonical_og_url_match, and findings array.
/** Result object describing the outcome of a canonical-link audit. */
export interface CanonicalResult {
  /** The URL that was requested. */
  url: string;
  /** The URL reported for the fetched document; comparisons are made against it. */
  final_url: string;
  /** Raw canonical `href` value from the page head, or `null` if none was found. */
  canonical_value: string | null;
  /** Whether the canonical resolves to the fetched page itself. */
  is_self_referential: boolean;
  /** Whether the canonical's hostname differs from the fetched page's hostname. */
  is_cross_domain: boolean;
  /** Whether page and canonical pathnames agree on ending with a trailing slash. */
  trailing_slash_consistent: boolean;
  /** Whether `og:url` and the canonical agree on hostname and pathname. */
  canonical_og_url_match: boolean;
  /** Issues detected during the audit. */
  findings: Finding[];
}
- src/index.ts:103-114 (registration)Registration of the 'audit_canonical' tool on the MCP server via `server.tool()` with its description, input schema shape, and handler invocation.
// --- Tool 3: audit_canonical ---
// Tool description: four paragraphs joined by blank lines.
const auditCanonicalDescription = [
  "Audit a page's canonical link integrity: presence, self-reference, cross-domain mismatches, trailing-slash hygiene, and og:url consistency.",
  "Read-only. One HTTP GET to fetch the HEAD section.",
  "Deterministic, rule-based; no LLM.",
  "When to use: a focused canonical-only audit (e.g. debugging a duplicate-content issue). For a full HEAD audit including OpenGraph, hreflang, noindex, title, use `check_technical`. For everything-on-a-page, use `audit_page`.",
].join("\n\n");

// Register the tool with its input schema shape; the handler delegates to
// `auditCanonical` through `wrapHandler`.
server.tool(
  "audit_canonical",
  auditCanonicalDescription,
  auditCanonicalInputSchema.shape,
  async (input) => {
    return wrapHandler(() => auditCanonical(input));
  }
);
- src/lib/html.ts:41-100 (helper)The `parseHead` helper function used by the handler to extract canonical, og:url, and other HEAD signals from HTML using cheerio.
export function parseHead( html: string, xRobotsTag?: string | string[] ): HeadData { const $ = cheerio.load(html); const head = $("head"); const title = head.find("title").first().text().trim() || null; const metaDescription = head .find('meta[name="description"]') .attr("content") ?.trim() ?? null; const canonical = head.find('link[rel="canonical"]').attr("href")?.trim() ?? null; const ogTitle = head.find('meta[property="og:title"]').attr("content")?.trim() ?? null; const ogDescription = head.find('meta[property="og:description"]').attr("content")?.trim() ?? null; const ogImage = head.find('meta[property="og:image"]').attr("content")?.trim() ?? null; const ogUrl = head.find('meta[property="og:url"]').attr("content")?.trim() ?? null; const ogType = head.find('meta[property="og:type"]').attr("content")?.trim() ?? null; const twitterCard = head.find('meta[name="twitter:card"]').attr("content")?.trim() ?? null; const twitterTitle = head.find('meta[name="twitter:title"]').attr("content")?.trim() ?? null; const twitterDescription = head.find('meta[name="twitter:description"]').attr("content")?.trim() ?? null; // Check noindex from meta tags const robotsMeta = head .find('meta[name="robots"]') .attr("content") ?.toLowerCase() ?? ""; let noindex = robotsMeta.includes("noindex"); // Check X-Robots-Tag header let noindexHeader = false; if (xRobotsTag) { const tags = Array.isArray(xRobotsTag) ? xRobotsTag : [xRobotsTag]; noindexHeader = tags.some((t) => t.toLowerCase().includes("noindex")); if (noindexHeader) noindex = true; } const hreflangTags: Array<{ lang: string; href: string }> = []; head.find('link[rel="alternate"][hreflang]').each((_, el) => { const lang = $(el).attr("hreflang") ?? ""; const href = $(el).attr("href") ?? ""; if (lang && href) hreflangTags.push({ lang, href }); }); const charset = head.find('meta[charset]').attr("charset")?.trim() ?? head.find('meta[http-equiv="Content-Type"]').attr("content")?.match(/charset=([^;]+)/i)?.[1]?.trim() ?? 
null; const viewport = head.find('meta[name="viewport"]').attr("content")?.trim() ?? null;