Skip to main content
Glama
RichardDillman

SEO Audit MCP Server

plan-audit.ts (22.9 kB)
// src/tools/plan-audit.ts
// Intelligent audit planning tool - analyzes sitemaps first to create sampling strategy

import type {
  SitemapAnalysis,
  RobotsAnalysis,
} from '../types/index.js';
import {
  fetchRobots,
  fetchSitemap,
  checkUrlStatus,
} from '../utils/http.js';

// ============================================================================
// Types
// ============================================================================

/** A group of URLs that share the same path structure. */
export interface RoutePattern {
  pattern: string;           // Regex pattern or path prefix
  name: string;              // Human-readable name (e.g., "Job Detail Pages")
  type: 'job_detail' | 'job_listing' | 'category' | 'location' | 'company' | 'static' | 'blog' | 'other';
  exampleUrls: string[];     // Sample URLs matching this pattern
  estimatedCount: number;    // Estimated total URLs of this type
  sitemapSource?: string;    // Which sitemap these came from
  hasLastmod: boolean;       // Whether URLs have lastmod dates
  dateRange?: {
    oldest?: string;
    newest?: string;
  };
}

/** Result of checking one sitemap against best practices. */
export interface SitemapValidation {
  url: string;
  isValid: boolean;
  issues: string[];
  warnings: string[];
  stats: {
    urlCount: number;
    hasLastmod: number;
    hasChangefreq: number;
    hasPriority: number;
    compressionUsed: boolean;
    isIndex: boolean;
    childCount?: number;
    duplicatesRemoved?: number;
    invalidUrls?: number;
    crossDomainUrls?: number;
    protocolMismatches?: number;
    fileSizeMB?: number;
  };
}

/** Full output of {@link planAudit}. */
export interface AuditPlan {
  baseUrl: string;
  timestamp: string;

  // Discovery results
  robots: RobotsAnalysis;
  sitemaps: SitemapValidation[];

  // Route analysis
  routePatterns: RoutePattern[];

  // Totals
  totals: {
    estimatedPages: number;
    sitemapsFound: number;
    routeTypesFound: number;
    jobPagesEstimate: number;
  };

  // Recommended sampling strategy
  samplingStrategy: {
    totalSamplesToTake: number;
    samplesPerRouteType: Record<string, number>;
    lighthouseTargets: string[];  // URLs recommended for Lighthouse
    rationale: string;
  };

  // Issues found during planning
  issues: string[];
  warnings: string[];
  recommendations: string[];
}

/** Input accepted by {@link planAudit}. */
export interface PlanAuditInput {
  baseUrl: string;
  maxSitemapsToProcess?: number;  // Limit sitemap processing (default: 20)
  maxUrlsPerSitemap?: number;     // Limit URLs to analyze per sitemap (default: 5000)
}

// ============================================================================
// Main Function
// ============================================================================

/**
 * Build an audit plan for a site.
 *
 * Reads robots.txt, discovers and validates sitemaps, groups the sitemap URLs
 * into route patterns, and derives totals, a sampling strategy and
 * recommendations from them. Progress is logged with `console.error`.
 *
 * @param input - Base URL plus optional processing limits.
 * @returns The assembled {@link AuditPlan}.
 */
export async function planAudit(input: PlanAuditInput): Promise<AuditPlan> {
  const {
    baseUrl,
    maxSitemapsToProcess = 20,
    maxUrlsPerSitemap = 5000,
  } = input;

  console.error(`\n=== Planning audit for: ${baseUrl} ===\n`);

  const issues: string[] = [];
  const warnings: string[] = [];
  const recommendations: string[] = [];

  // Step 1: Fetch and analyze robots.txt
  console.error('Step 1: Analyzing robots.txt...');
  const robots = await fetchRobots(baseUrl);

  if (!robots.found) {
    warnings.push('No robots.txt found');
  } else {
    issues.push(...robots.issues);
    warnings.push(...robots.warnings);
  }

  // Step 2: Discover all sitemaps
  console.error('Step 2: Discovering sitemaps...');
  const sitemapUrls = await discoverAllSitemaps(baseUrl, robots, maxSitemapsToProcess);
  console.error(` Found ${sitemapUrls.length} sitemaps to analyze`);

  // Step 3: Fetch and validate each sitemap
  console.error('Step 3: Fetching and validating sitemaps...');
  const sitemapValidations: SitemapValidation[] = [];
  const allUrls: Array<{ url: string; sitemap: string; lastmod?: string }> = [];

  for (const sitemapUrl of sitemapUrls.slice(0, maxSitemapsToProcess)) {
    console.error(` Processing: ${sitemapUrl}`);

    const sitemap = await fetchSitemap(sitemapUrl, { maxUrls: maxUrlsPerSitemap });
    const validation = validateSitemap(sitemap);
    sitemapValidations.push(validation);

    // Collect URLs for pattern analysis
    for (const urlEntry of sitemap.urls.slice(0, maxUrlsPerSitemap)) {
      allUrls.push({
        url: urlEntry.loc,
        sitemap: sitemapUrl,
        lastmod: urlEntry.lastmod,
      });
    }

    issues.push(...validation.issues);
    warnings.push(...validation.warnings);
  }

  // Step 4: Analyze URL patterns to identify route types
  console.error('Step 4: Analyzing URL patterns...');
  const routePatterns = analyzeRoutePatterns(allUrls, sitemapValidations);
  console.error(` Identified ${routePatterns.length} distinct route patterns`);

  // Step 5: Calculate totals, then derive the sampling strategy from them
  const totals = calculateTotals(sitemapValidations, routePatterns);
  console.error('Step 5: Generating sampling strategy...');
  const samplingStrategy = generateSamplingStrategy(routePatterns, totals);

  // Step 6: Generate recommendations
  recommendations.push(...generateRecommendations(robots, sitemapValidations, routePatterns, totals));

  return {
    baseUrl,
    timestamp: new Date().toISOString(),
    robots,
    sitemaps: sitemapValidations,
    routePatterns,
    totals,
    samplingStrategy,
    issues,
    warnings,
    recommendations,
  };
}

// ============================================================================
// Helper Functions
// ============================================================================

/**
 * Return the pathname of an absolute URL, or `null` when the string cannot be
 * parsed. `new URL()` throws on invalid input; sitemap data is external, so
 * callers need a non-throwing variant.
 */
function tryGetPathname(url: string): string | null {
  try {
    return new URL(url).pathname;
  } catch {
    return null;
  }
}

/**
 * Discover all sitemaps from robots.txt and common locations.
 *
 * Sitemaps listed in robots.txt are used when present; otherwise a list of
 * common paths is probed and the first one answering 200 is taken. Each
 * discovered sitemap is then fetched once and, if it is an index, its child
 * sitemaps are added (one level deep).
 *
 * @param baseUrl - Site root used to resolve the common sitemap paths.
 * @param robots - Parsed robots.txt; its `sitemaps` list seeds discovery.
 * @param maxSitemaps - Upper bound on the number of URLs returned.
 * @returns At most `maxSitemaps` sitemap URLs, without duplicates.
 */
async function discoverAllSitemaps(
  baseUrl: string,
  robots: RobotsAnalysis,
  maxSitemaps: number
): Promise<string[]> {
  const sitemapUrls = new Set<string>(robots.sitemaps);

  // Check common sitemap locations if none found in robots.txt
  if (sitemapUrls.size === 0) {
    const commonPaths = [
      '/sitemap.xml',
      '/sitemap_index.xml',
      '/sitemap-index.xml',
      '/sitemaps/sitemap.xml',
      '/wp-sitemap.xml',
      '/sitemap/sitemap-index.xml',
    ];

    for (const path of commonPaths) {
      const url = new URL(path, baseUrl).toString();
      const status = await checkUrlStatus(url);
      if (status.statusCode === 200) {
        sitemapUrls.add(url);
        break; // Found one, that's enough to start
      }
    }
  }

  // Expand sitemap indexes to find child sitemaps.
  // The cap is checked before every add so the result never exceeds it.
  const expandedUrls = new Set<string>();

  for (const url of sitemapUrls) {
    if (expandedUrls.size >= maxSitemaps) break;
    expandedUrls.add(url);
    if (expandedUrls.size >= maxSitemaps) break; // cap reached: skip the fetch

    const sitemap = await fetchSitemap(url, { maxUrls: 100 });

    if (sitemap.type === 'sitemapindex' && sitemap.childSitemaps) {
      for (const childUrl of sitemap.childSitemaps) {
        if (expandedUrls.size >= maxSitemaps) break;
        expandedUrls.add(childUrl);
      }
    }
  }

  return Array.from(expandedUrls);
}

/**
 * Validate a sitemap against best practices.
 *
 * Checks accessibility, the 50,000-URL limit, lastmod coverage, staleness
 * (newest lastmod older than 30 days) and gzip usage for large sitemaps.
 *
 * @param sitemap - Fetched sitemap analysis.
 * @returns Validation result; `isValid` is false when any issue was recorded.
 */
function validateSitemap(sitemap: SitemapAnalysis): SitemapValidation {
  const issues: string[] = [];
  const warnings: string[] = [];
  const isCompressed = sitemap.url.endsWith('.gz');

  // Check if found
  if (!sitemap.found) {
    return {
      url: sitemap.url,
      isValid: false,
      issues: ['Sitemap not accessible'],
      warnings: [],
      stats: {
        urlCount: 0,
        hasLastmod: 0,
        hasChangefreq: 0,
        hasPriority: 0,
        compressionUsed: isCompressed,
        isIndex: false,
      },
    };
  }

  // URL count validation
  if (sitemap.urlCount > 50000) {
    issues.push(`Exceeds 50,000 URL limit (has ${sitemap.urlCount.toLocaleString()})`);
  } else if (sitemap.urlCount > 45000) {
    warnings.push(`Approaching 50,000 URL limit (has ${sitemap.urlCount.toLocaleString()})`);
  }

  // Lastmod coverage. Only meaningful when the sitemap lists URLs: a sitemap
  // with zero URLs would otherwise always be reported as "0%" coverage.
  if (sitemap.urlCount > 0) {
    const lastmodPercent = Math.round((sitemap.urlsWithLastmod / sitemap.urlCount) * 100);
    if (lastmodPercent < 50) {
      warnings.push(`Only ${lastmodPercent}% of URLs have lastmod dates`);
    }
  }

  // Check for stale sitemap
  if (sitemap.newestLastmod) {
    const msPerDay = 1000 * 60 * 60 * 24;
    const daysSinceUpdate = Math.floor(
      (Date.now() - new Date(sitemap.newestLastmod).getTime()) / msPerDay
    );
    // An unparseable date yields NaN, which fails this comparison.
    if (daysSinceUpdate > 30) {
      warnings.push(`Sitemap appears stale - newest lastmod is ${daysSinceUpdate} days ago`);
    }
  }

  // Check compression
  if (sitemap.urlCount > 10000 && !isCompressed) {
    warnings.push('Large sitemap should use gzip compression');
  }

  // Count metadata usage
  const hasChangefreq = sitemap.urls.filter(u => u.changefreq).length;
  const hasPriority = sitemap.urls.filter(u => u.priority).length;

  return {
    url: sitemap.url,
    isValid: issues.length === 0,
    issues,
    warnings,
    stats: {
      urlCount: sitemap.urlCount,
      hasLastmod: sitemap.urlsWithLastmod,
      hasChangefreq,
      hasPriority,
      compressionUsed: isCompressed,
      isIndex: sitemap.type === 'sitemapindex',
      childCount: sitemap.childSitemaps?.length,
    },
  };
}

/**
 * Analyze URLs to identify distinct route patterns using adaptive detection.
 * Patterns are learned from the actual URLs rather than hardcoded: every path
 * segment that looks variable is replaced by a `{placeholder}` and URLs with
 * the same resulting structure are grouped together.
 *
 * URLs that cannot be parsed are skipped (and counted in a log line) instead
 * of aborting the analysis.
 *
 * @param urls - URL entries collected from the sitemaps.
 * @param sitemaps - Currently unused; kept so existing callers stay valid.
 * @returns One pattern per distinct structure, largest group first.
 */
function analyzeRoutePatterns(
  urls: Array<{ url: string; sitemap: string; lastmod?: string }>,
  sitemaps: SitemapValidation[]
): RoutePattern[] {
  // First pass: Extract path segments and group URLs by structure
  const structureGroups = new Map<string, { urls: typeof urls; structure: string[] }>();
  let skipped = 0;

  for (const urlEntry of urls) {
    const pathname = tryGetPathname(urlEntry.url);
    if (pathname === null) {
      skipped += 1;
      continue;
    }

    const segments = pathname.split('/').filter(Boolean);

    // Replace each variable-looking segment (ID, slug, etc.) with a placeholder
    const structure = segments.map((seg, i) =>
      isVariableSegment(seg) ? `{${guessSegmentType(seg, i, segments)}}` : seg
    );
    const structureKey = '/' + structure.join('/');

    let group = structureGroups.get(structureKey);
    if (!group) {
      group = { urls: [], structure };
      structureGroups.set(structureKey, group);
    }
    group.urls.push(urlEntry);
  }

  if (skipped > 0) {
    console.error(` Skipped ${skipped} unparseable URL(s) during pattern analysis`);
  }

  // Second pass: classify each structure and aggregate its URL data
  const patterns: RoutePattern[] = [];

  for (const [structureKey, group] of structureGroups) {
    const type = classifyRouteType(group.structure);

    let hasLastmod = false;
    let oldestDate: string | undefined;
    let newestDate: string | undefined;

    // lastmod values are compared as strings, as delivered by the sitemap
    for (const { lastmod } of group.urls) {
      if (!lastmod) continue;
      hasLastmod = true;
      if (!oldestDate || lastmod < oldestDate) oldestDate = lastmod;
      if (!newestDate || lastmod > newestDate) newestDate = lastmod;
    }

    patterns.push({
      pattern: structureKey,
      name: generatePatternName(group.structure, type),
      type,
      exampleUrls: group.urls.slice(0, 10).map(u => u.url),
      estimatedCount: group.urls.length,
      sitemapSource: group.urls[0]?.sitemap,
      hasLastmod,
      // Leave the range out entirely when no URL carried a lastmod
      dateRange: oldestDate || newestDate ? { oldest: oldestDate, newest: newestDate } : undefined,
    });
  }

  // Sort by count descending
  return patterns.sort((a, b) => b.estimatedCount - a.estimatedCount);
}

/**
 * Determine if a URL segment is likely a variable (ID, slug, etc.)
 * rather than a fixed part of the route.
 */
function isVariableSegment(segment: string): boolean {
  // Numeric IDs
  if (/^\d+$/.test(segment)) return true;

  // UUIDs
  if (/^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test(segment)) return true;

  // Hash-like strings (hex, base64-ish)
  if (/^[a-f0-9]{16,}$/i.test(segment)) return true;

  // Slugs with hyphens AND numbers (like "software-engineer-12345")
  if (/^[a-z0-9]+-[a-z0-9-]+-\d+$/i.test(segment)) return true;

  // Very long segments (likely slugs or encoded data)
  if (segment.length > 50) return true;

  // Segments that look like encoded IDs
  if (/^[a-zA-Z0-9_-]{20,}$/.test(segment)) return true;

  // Common job board prefixed patterns (l-location, k-keyword, w-worktype)
  // e.g., "l-new-york-ny", "k-software-engineer", "l-united-states-w-remote"
  if (/^[lkwj]-[a-z0-9-]+$/i.test(segment)) return true;

  return false;
}

/**
 * Guess what type of variable a segment represents.
 *
 * @param segment - The variable-looking segment.
 * @param position - Index of `segment` within `allSegments`.
 * @param allSegments - All path segments; the preceding one provides context.
 * @returns Placeholder name such as `id`, `uuid`, `job-slug` or `param`.
 */
function guessSegmentType(segment: string, position: number, allSegments: string[]): string {
  // UUID-like
  if (/^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(segment)) return 'uuid';

  // Pure numeric
  if (/^\d+$/.test(segment)) return 'id';

  // Job board prefixed patterns (talent.com style)
  // l-{location}, k-{keyword}, w-{worktype}, j-{jobtype}
  if (/^l-[a-z0-9-]+$/i.test(segment)) return 'location-filter';
  if (/^k-[a-z0-9-]+$/i.test(segment)) return 'keyword-filter';
  if (/^w-[a-z0-9-]+$/i.test(segment)) return 'work-type-filter';
  if (/^j-[a-z0-9-]+$/i.test(segment)) return 'job-type-filter';

  // Context-based guessing from previous segment
  const prevSegment = position > 0 ? allSegments[position - 1] : undefined;
  if (prevSegment) {
    const prev = prevSegment.toLowerCase();
    if (['job', 'jobs', 'position', 'vacancy', 'opening'].includes(prev)) return 'job-slug';
    if (['company', 'employer', 'organization', 'org'].includes(prev)) return 'company-slug';
    if (['location', 'city', 'region', 'area', 'l'].includes(prev)) return 'location';
    if (['category', 'cat', 'type', 'k'].includes(prev)) return 'category';
    if (['user', 'profile', 'candidate'].includes(prev)) return 'user-id';
    if (['blog', 'article', 'post', 'news'].includes(prev)) return 'post-slug';
  }

  // Default based on format
  if (segment.includes('-') && segment.length > 10) return 'slug';

  return 'param';
}

/**
 * Classify route type based on path structure.
 *
 * Rules are evaluated in order and the first match wins.
 *
 * @param structure - Path segments, with variables already replaced by
 *   `{placeholder}` names.
 */
function classifyRouteType(structure: string[]): RoutePattern['type'] {
  const segments = structure.map(s => s.toLowerCase());
  const hasAny = (names: string[]): boolean => segments.some(s => names.includes(s));

  // Job detail detection: a job keyword (or a {job-slug}) plus a variable slug/id
  const jobDetailIndicators = ['job', 'position', 'vacancy', 'opening', 'career'];
  const hasJobIndicator = segments.some(s =>
    jobDetailIndicators.some(ind => s === ind || s === `{${ind}-slug}`)
  );
  const hasVariableSlug = segments.some(s =>
    s.includes('{') && (s.includes('slug') || s.includes('id') || s.includes('uuid'))
  );
  if (hasJobIndicator && hasVariableSlug) return 'job_detail';

  // Job listing root (no filters)
  if (segments.length <= 1 && hasAny(['jobs', 'careers', 'positions', 'vacancies'])) {
    return 'job_listing';
  }

  // Filter-based patterns (e.g., {location-filter}, {keyword-filter})
  const hasLocationFilter = segments.some(s => s.includes('{location-filter}') || s === '{location}');
  const hasKeywordFilter = segments.some(s => s.includes('{keyword-filter}') || s === '{category}');
  const hasWorkTypeFilter = segments.some(s =>
    s.includes('{work-type-filter}') || s.includes('{job-type-filter}')
  );

  // Location pages: location filter without a keyword filter. This also covers
  // location combined with a work-type filter.
  if (hasLocationFilter && !hasKeywordFilter) return 'location';

  // Category pages (have keyword filter, may also have location)
  if (hasKeywordFilter) return 'category';

  // Legacy 'l' / 'k' markers. The prefix must be at the START of the segment;
  // a substring test would also match words such as "legal-notice" or
  // "book-demo".
  if (segments.some(s => s === 'l' || s === 'location' || s.startsWith('l-'))) {
    return 'location';
  }
  if (
    hasWorkTypeFilter || // work-type / job-type filter pages are filter pages
    segments.some(s => s === 'k' || s === 'category' || s.startsWith('k-'))
  ) {
    return 'category';
  }

  // Company pages
  if (hasAny(['company', 'employer', 'organization', 'org'])) return 'company';

  // Blog/content
  if (hasAny(['blog', 'article', 'news', 'resources', 'insights'])) return 'blog';

  // Static pages
  if (hasAny(['about', 'contact', 'privacy', 'terms', 'faq', 'help', 'support', 'legal'])) {
    return 'static';
  }

  // Default
  return 'other';
}

/**
 * Generate human-readable name for a pattern.
 *
 * The first fixed (non-placeholder) segment, capitalised, is used as a prefix
 * when one exists, e.g. `Jobs Job Detail Pages`.
 */
function generatePatternName(structure: string[], type: RoutePattern['type']): string {
  const typeNames: Record<RoutePattern['type'], string> = {
    job_detail: 'Job Detail Pages',
    job_listing: 'Job Listing Pages',
    category: 'Category/Filter Pages',
    location: 'Location Pages',
    company: 'Company Pages',
    blog: 'Blog/Content Pages',
    static: 'Static Pages',
    other: 'Other Pages',
  };

  const firstStatic = structure.find(s => !s.startsWith('{'));
  if (firstStatic !== undefined) {
    const prefix = firstStatic.charAt(0).toUpperCase() + firstStatic.slice(1);
    return `${prefix} ${typeNames[type]}`;
  }

  return typeNames[type];
}

/**
 * Calculate total estimates.
 *
 * @returns Page total summed over all sitemaps, the number of sitemaps that
 *   list at least one URL, the number of route patterns, and the URL count of
 *   all `job_detail` patterns.
 */
function calculateTotals(
  sitemaps: SitemapValidation[],
  routePatterns: RoutePattern[]
): AuditPlan['totals'] {
  const estimatedPages = sitemaps.reduce((sum, s) => sum + s.stats.urlCount, 0);

  const jobPagesEstimate = routePatterns
    .filter(p => p.type === 'job_detail')
    .reduce((sum, p) => sum + p.estimatedCount, 0);

  return {
    estimatedPages,
    sitemapsFound: sitemaps.filter(s => s.stats.urlCount > 0).length,
    routeTypesFound: routePatterns.length,
    jobPagesEstimate,
  };
}

/**
 * Sample size wanted for a pattern before it is limited to the number of
 * URLs actually available.
 */
function baseSampleCount(pattern: RoutePattern): number {
  const count = pattern.estimatedCount;

  switch (pattern.type) {
    case 'job_detail':
      // For job pages: sample based on log scale due to volume
      // 10 for < 1000, 20 for < 10000, 30 for < 100000, 50 for more
      if (count < 1000) return 10;
      if (count < 10000) return 20;
      if (count < 100000) return 30;
      return 50;

    case 'category':
    case 'location':
      // Landing pages are important - sample more proportionally
      return Math.min(20, Math.max(5, Math.ceil(count * 0.1)));

    case 'company':
      return Math.min(15, Math.max(5, Math.ceil(count * 0.05)));

    case 'job_listing':
    case 'static':
      // Sample all or most static pages
      return Math.min(10, count);

    case 'blog':
      return Math.min(10, Math.max(3, Math.ceil(count * 0.1)));

    default:
      return Math.min(10, Math.max(3, count));
  }
}

/**
 * Generate intelligent sampling strategy.
 *
 * Each pattern gets a sample size based on its type and volume, never more
 * than the number of URLs known for it. Lighthouse targets are the homepage
 * (when present) followed by one example URL for each major page type.
 *
 * @param routePatterns - Patterns produced by route analysis.
 * @param totals - Totals; only `jobPagesEstimate` is read, for the rationale.
 */
function generateSamplingStrategy(
  routePatterns: RoutePattern[],
  totals: AuditPlan['totals']
): AuditPlan['samplingStrategy'] {
  const samplesPerRouteType: Record<string, number> = {};
  let totalSamples = 0;

  // Strategy: More samples for important page types, fewer for massive collections
  for (const pattern of routePatterns) {
    // The per-type minimums must not ask for more pages than exist
    const samples = Math.min(baseSampleCount(pattern), pattern.estimatedCount);

    samplesPerRouteType[`${pattern.type}:${pattern.pattern}`] = samples;
    totalSamples += samples;
  }

  const lighthouseTargets: string[] = [];

  // Homepage first
  const homepageUrl = routePatterns
    .flatMap(p => p.exampleUrls)
    .find(u => tryGetPathname(u) === '/');
  if (homepageUrl !== undefined) {
    lighthouseTargets.push(homepageUrl);
  }

  // Then one URL from each major type, skipping anything already listed
  const typesForLighthouse: Array<RoutePattern['type']> = ['job_detail', 'category', 'location', 'job_listing'];
  for (const type of typesForLighthouse) {
    const example = routePatterns.find(p => p.type === type && p.exampleUrls.length > 0)?.exampleUrls[0];
    if (example !== undefined && !lighthouseTargets.includes(example)) {
      lighthouseTargets.push(example);
    }
  }

  const rationale = `Sampling ${totalSamples} pages across ${routePatterns.length} route types. ` +
    `Job detail pages use logarithmic sampling due to volume (${totals.jobPagesEstimate.toLocaleString()} estimated). ` +
    `Landing pages (category/location) get higher proportional coverage for SEO impact. ` +
    `Lighthouse will run on ${lighthouseTargets.length} representative pages.`;

  return {
    totalSamplesToTake: totalSamples,
    samplesPerRouteType,
    lighthouseTargets,
    rationale,
  };
}

/**
 * Generate actionable recommendations from the discovery results.
 *
 * @param robots - Parsed robots.txt.
 * @param sitemaps - Validation results for every processed sitemap.
 * @param routePatterns - Patterns produced by route analysis.
 * @param totals - Totals computed for the plan.
 * @returns Recommendation messages, possibly empty.
 */
function generateRecommendations(
  robots: RobotsAnalysis,
  sitemaps: SitemapValidation[],
  routePatterns: RoutePattern[],
  totals: AuditPlan['totals']
): string[] {
  const recommendations: string[] = [];

  // Sitemap recommendations
  if (sitemaps.length === 0) {
    recommendations.push('CRITICAL: Create XML sitemaps to improve crawlability');
  }

  const hasJobSitemap = sitemaps.some(s =>
    s.url.toLowerCase().includes('job') && s.stats.urlCount > 0
  );
  if (!hasJobSitemap && totals.jobPagesEstimate > 1000) {
    recommendations.push('Consider creating a dedicated jobs sitemap for better job indexing');
  }

  // Lastmod recommendations
  const hasLowLastmodCoverage = sitemaps.some(s =>
    s.stats.urlCount > 0 && (s.stats.hasLastmod / s.stats.urlCount) < 0.5
  );
  if (hasLowLastmodCoverage) {
    recommendations.push('Add lastmod dates to sitemap URLs to help search engines prioritize fresh content');
  }

  // Compression recommendations
  const hasUncompressedLarge = sitemaps.some(s =>
    s.stats.urlCount > 10000 && !s.stats.compressionUsed
  );
  if (hasUncompressedLarge) {
    recommendations.push('Use gzip compression for large sitemaps (>10k URLs)');
  }

  // Job board specific
  if (totals.jobPagesEstimate > 10000) {
    recommendations.push('Consider implementing Google Indexing API for faster job posting indexation');
  }

  // Landing page coverage. A type can be spread over several patterns, so the
  // counts of all patterns of that type are added up.
  const countByType = (type: RoutePattern['type']): number =>
    routePatterns
      .filter(p => p.type === type)
      .reduce((sum, p) => sum + p.estimatedCount, 0);

  if (countByType('category') < 10) {
    recommendations.push('Consider creating more category landing pages (e.g., /marketing-jobs, /engineering-jobs)');
  }
  if (countByType('location') < 10) {
    recommendations.push('Consider creating location landing pages (e.g., /jobs-in-new-york, /remote-jobs)');
  }

  // Robots.txt recommendations
  if (robots.found && robots.sitemaps.length === 0) {
    recommendations.push('Add Sitemap directive(s) to robots.txt');
  }

  return recommendations;
}

export default planAudit;

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/RichardDillman/seo-audit-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.