// src/tools/plan-audit.ts
// Intelligent audit planning tool - analyzes sitemaps first to create sampling strategy
import type {
SitemapAnalysis,
RobotsAnalysis,
} from '../types/index.js';
import {
fetchRobots,
fetchSitemap,
checkUrlStatus,
} from '../utils/http.js';
// ============================================================================
// Types
// ============================================================================
/**
 * One group of collected sitemap URLs that share the same path structure.
 * `analyzeRoutePatterns` emits one entry per distinct structure.
 */
export interface RoutePattern {
pattern: string; // Path template, e.g. "/jobs/{id}"; variable segments are shown as {placeholder}
name: string; // Human-readable name (e.g., "Job Detail Pages")
type: 'job_detail' | 'job_listing' | 'category' | 'location' | 'company' | 'static' | 'blog' | 'other';
exampleUrls: string[]; // Sample URLs matching this pattern (analyzeRoutePatterns keeps at most 10)
estimatedCount: number; // Number of collected URLs in this group (bounded by the per-sitemap URL limit)
sitemapSource?: string; // Sitemap of the first URL in the group
hasLastmod: boolean; // True when at least one URL in the group has a lastmod value
// Smallest/largest lastmod string seen in the group (compared as strings);
// analyzeRoutePatterns sets this to {} when no URL has a lastmod.
dateRange?: {
oldest?: string;
newest?: string;
};
}
/**
 * Result of checking one fetched sitemap (see `validateSitemap`).
 */
export interface SitemapValidation {
url: string;
isValid: boolean; // False when the sitemap was not accessible or any issue was recorded
issues: string[];
warnings: string[];
stats: {
urlCount: number;
hasLastmod: number; // Count of URLs with a lastmod value (not a boolean)
hasChangefreq: number; // Count of URL entries with a truthy changefreq
hasPriority: number; // Count of URL entries with a truthy priority
compressionUsed: boolean; // validateSitemap derives this from the URL ending in ".gz"
isIndex: boolean; // True for a sitemap index
childCount?: number; // Number of child sitemaps, when the fetched sitemap lists any
// NOTE(review): the optional fields below are not populated by validateSitemap
// in this file — presumably filled elsewhere; confirm before relying on them.
duplicatesRemoved?: number;
invalidUrls?: number;
crossDomainUrls?: number;
protocolMismatches?: number;
fileSizeMB?: number;
};
}
/**
 * Complete output of `planAudit`: discovery results, detected route patterns,
 * totals, the recommended sampling strategy and everything flagged on the way.
 */
export interface AuditPlan {
baseUrl: string;
timestamp: string; // ISO 8601 string from Date#toISOString, taken when the plan is assembled
// Discovery results
robots: RobotsAnalysis;
sitemaps: SitemapValidation[];
// Route analysis
routePatterns: RoutePattern[]; // Sorted by estimatedCount, largest first
// Totals
totals: {
estimatedPages: number; // Sum of urlCount over all validated sitemaps
sitemapsFound: number; // Sitemaps that reported at least one URL
routeTypesFound: number; // Number of entries in routePatterns
jobPagesEstimate: number; // Sum of estimatedCount over 'job_detail' patterns
};
// Recommended sampling strategy
samplingStrategy: {
totalSamplesToTake: number;
samplesPerRouteType: Record<string, number>; // Keyed by "<type>:<pattern>"
lighthouseTargets: string[]; // URLs recommended for Lighthouse
rationale: string;
};
// Issues found during planning
issues: string[]; // robots.txt issues followed by per-sitemap validation issues
warnings: string[];
recommendations: string[];
}
/**
 * Options accepted by `planAudit`.
 */
export interface PlanAuditInput {
baseUrl: string; // Site to plan the audit for; also used to resolve common sitemap paths
maxSitemapsToProcess?: number; // Limit sitemap processing (default: 20)
maxUrlsPerSitemap?: number; // Limit URLs to analyze per sitemap (default: 5000)
}
// ============================================================================
// Main Function
// ============================================================================
/**
 * Build an audit plan for `input.baseUrl`.
 *
 * Stages run strictly in sequence: robots.txt analysis, sitemap discovery,
 * per-sitemap fetch and validation, URL pattern analysis, totals, sampling
 * strategy, recommendations. Progress messages are written with
 * `console.error`.
 *
 * @param input - Base URL plus optional limits (`maxSitemapsToProcess`
 *   defaults to 20, `maxUrlsPerSitemap` defaults to 5000).
 * @returns The assembled plan. `issues` and `warnings` contain the robots.txt
 *   findings first, followed by the findings of each sitemap in processing
 *   order.
 */
export async function planAudit(input: PlanAuditInput): Promise<AuditPlan> {
  const {
    baseUrl,
    maxSitemapsToProcess: sitemapLimit = 20,
    maxUrlsPerSitemap: urlLimit = 5000,
  } = input;

  console.error(`\n=== Planning audit for: ${baseUrl} ===\n`);

  const collectedIssues: string[] = [];
  const collectedWarnings: string[] = [];

  // Stage 1: robots.txt
  console.error('Step 1: Analyzing robots.txt...');
  const robots = await fetchRobots(baseUrl);
  if (robots.found) {
    collectedIssues.push(...robots.issues);
    collectedWarnings.push(...robots.warnings);
  } else {
    collectedWarnings.push('No robots.txt found');
  }

  // Stage 2: sitemap discovery
  console.error('Step 2: Discovering sitemaps...');
  const discovered = await discoverAllSitemaps(baseUrl, robots, sitemapLimit);
  console.error(` Found ${discovered.length} sitemaps to analyze`);

  // Stage 3: fetch + validate each sitemap, harvesting URLs for pattern analysis
  console.error('Step 3: Fetching and validating sitemaps...');
  const validations: SitemapValidation[] = [];
  const harvested: Array<{ url: string; sitemap: string; lastmod?: string }> = [];
  const queue = discovered.slice(0, sitemapLimit);

  for (let i = 0; i < queue.length; i += 1) {
    const sitemapUrl = queue[i];
    console.error(` Processing: ${sitemapUrl}`);

    const fetched = await fetchSitemap(sitemapUrl, { maxUrls: urlLimit });
    const verdict = validateSitemap(fetched);
    validations.push(verdict);

    const entries = fetched.urls.slice(0, urlLimit);
    for (let j = 0; j < entries.length; j += 1) {
      const entry = entries[j];
      harvested.push({ url: entry.loc, sitemap: sitemapUrl, lastmod: entry.lastmod });
    }

    collectedIssues.push(...verdict.issues);
    collectedWarnings.push(...verdict.warnings);
  }

  // Stage 4: route pattern detection
  console.error('Step 4: Analyzing URL patterns...');
  const routePatterns = analyzeRoutePatterns(harvested, validations);
  console.error(` Identified ${routePatterns.length} distinct route patterns`);

  // Stage 5: totals (no progress message is emitted for this stage)
  const totals = calculateTotals(validations, routePatterns);

  // Stage 6: sampling strategy.
  // NOTE(review): the message below says "Step 5" although this is the sixth stage.
  console.error('Step 5: Generating sampling strategy...');
  const samplingStrategy = generateSamplingStrategy(routePatterns, totals);

  // Stage 7: recommendations
  const recommendations: string[] = [
    ...generateRecommendations(robots, validations, routePatterns, totals),
  ];

  return {
    baseUrl,
    timestamp: new Date().toISOString(),
    robots,
    sitemaps: validations,
    routePatterns,
    totals,
    samplingStrategy,
    issues: collectedIssues,
    warnings: collectedWarnings,
    recommendations,
  };
}
// ============================================================================
// Helper Functions
// ============================================================================
/**
 * Discover sitemap URLs from robots.txt, falling back to common locations.
 *
 * When robots.txt lists no sitemaps, a fixed list of well-known paths is
 * probed in order and the first one answering HTTP 200 is used. Every
 * discovered sitemap is then fetched once; if it is a sitemap index its
 * children are appended. The index URL itself stays in the result, and child
 * indexes are not expanded further.
 *
 * @param baseUrl - Site root used to resolve the well-known sitemap paths.
 * @param robots - robots.txt analysis; its `sitemaps` list seeds discovery.
 * @param maxSitemaps - Upper bound on the number of URLs returned.
 * @returns Unique sitemap URLs in discovery order, never more than
 *   `maxSitemaps` entries.
 */
async function discoverAllSitemaps(
  baseUrl: string,
  robots: RobotsAnalysis,
  maxSitemaps: number
): Promise<string[]> {
  const sitemapUrls = new Set<string>(robots.sitemaps);

  // Check common sitemap locations if none found in robots.txt
  if (sitemapUrls.size === 0) {
    const commonPaths = [
      '/sitemap.xml',
      '/sitemap_index.xml',
      '/sitemap-index.xml',
      '/sitemaps/sitemap.xml',
      '/wp-sitemap.xml',
      '/sitemap/sitemap-index.xml',
    ];
    for (const path of commonPaths) {
      const url = new URL(path, baseUrl).toString();
      const status = await checkUrlStatus(url);
      if (status.statusCode === 200) {
        sitemapUrls.add(url);
        break; // Found one, that's enough to start
      }
    }
  }

  // Expand sitemap indexes to find child sitemaps
  const expandedUrls = new Set<string>();
  for (const url of sitemapUrls) {
    // Check the cap BEFORE adding: adding first let the result grow to
    // maxSitemaps + 1 once the children of an earlier index had filled it.
    if (expandedUrls.size >= maxSitemaps) break;
    expandedUrls.add(url);

    // No room left for children, so skip the extra fetch.
    if (expandedUrls.size >= maxSitemaps) break;

    const sitemap = await fetchSitemap(url, { maxUrls: 100 });
    if (sitemap.type === 'sitemapindex' && sitemap.childSitemaps) {
      for (const childUrl of sitemap.childSitemaps) {
        if (expandedUrls.size >= maxSitemaps) break;
        expandedUrls.add(childUrl);
      }
    }
  }

  return Array.from(expandedUrls);
}
/**
 * Validate a fetched sitemap against best practices.
 *
 * Checks performed:
 *  - accessibility (`found`); an inaccessible sitemap is returned as invalid
 *    with zeroed stats,
 *  - URL count against the 50,000 limit (issue above it, warning above 45,000),
 *  - lastmod coverage (warning below 50%, only when the sitemap has URLs),
 *  - staleness (warning when the newest lastmod is more than 30 days old),
 *  - gzip usage for sitemaps with more than 10,000 URLs, judged by a `.gz`
 *    URL suffix.
 *
 * @param sitemap - Fetched sitemap analysis.
 * @returns Validation result; `isValid` is true when no issue was recorded.
 */
function validateSitemap(sitemap: SitemapAnalysis): SitemapValidation {
  const isCompressed = sitemap.url.endsWith('.gz');

  // Check if found
  if (!sitemap.found) {
    return {
      url: sitemap.url,
      isValid: false,
      issues: ['Sitemap not accessible'],
      warnings: [],
      stats: {
        urlCount: 0,
        hasLastmod: 0,
        hasChangefreq: 0,
        hasPriority: 0,
        compressionUsed: isCompressed,
        isIndex: false,
      },
    };
  }

  const issues: string[] = [];
  const warnings: string[] = [];

  // URL count validation
  if (sitemap.urlCount > 50000) {
    issues.push(`Exceeds 50,000 URL limit (has ${sitemap.urlCount.toLocaleString()})`);
  } else if (sitemap.urlCount > 45000) {
    warnings.push(`Approaching 50,000 URL limit (has ${sitemap.urlCount.toLocaleString()})`);
  }

  // Lastmod coverage. Skipped for sitemaps without URLs: "0% of URLs" is
  // meaningless there, and generateRecommendations applies the same guard.
  if (sitemap.urlCount > 0) {
    const lastmodPercent = Math.round((sitemap.urlsWithLastmod / sitemap.urlCount) * 100);
    if (lastmodPercent < 50) {
      warnings.push(`Only ${lastmodPercent}% of URLs have lastmod dates`);
    }
  }

  // Check for stale sitemap. An unparseable date yields NaN and is ignored.
  if (sitemap.newestLastmod) {
    const newestMs = new Date(sitemap.newestLastmod).getTime();
    if (!Number.isNaN(newestMs)) {
      const daysSinceUpdate = Math.floor((Date.now() - newestMs) / (1000 * 60 * 60 * 24));
      if (daysSinceUpdate > 30) {
        warnings.push(`Sitemap appears stale - newest lastmod is ${daysSinceUpdate} days ago`);
      }
    }
  }

  // Check compression
  if (sitemap.urlCount > 10000 && !isCompressed) {
    warnings.push('Large sitemap should use gzip compression');
  }

  // Count metadata usage (truthiness check, as before).
  // NOTE(review): a numeric priority of 0 would not be counted — confirm the
  // type of `priority` in SitemapAnalysis.
  const hasChangefreq = sitemap.urls.filter(u => u.changefreq).length;
  const hasPriority = sitemap.urls.filter(u => u.priority).length;

  return {
    url: sitemap.url,
    isValid: issues.length === 0,
    issues,
    warnings,
    stats: {
      urlCount: sitemap.urlCount,
      hasLastmod: sitemap.urlsWithLastmod,
      hasChangefreq,
      hasPriority,
      compressionUsed: isCompressed,
      isIndex: sitemap.type === 'sitemapindex',
      childCount: sitemap.childSitemaps?.length,
    },
  };
}
/**
 * Group URLs by path structure and describe each group as a route pattern.
 *
 * Each path segment is either kept literally or, when `isVariableSegment`
 * says it is an ID/slug, replaced by a `{placeholder}` chosen by
 * `guessSegmentType`. URLs with the same resulting template form one group,
 * which is then classified (`classifyRouteType`) and named
 * (`generatePatternName`).
 *
 * Entries whose `url` cannot be parsed are skipped (and counted in a log
 * line) instead of aborting the whole analysis.
 *
 * @param urls - Collected sitemap entries.
 * @param sitemaps - Currently unused; kept so the signature stays stable.
 * @returns One pattern per distinct template, sorted by URL count descending
 *   (ties keep first-seen order).
 */
function analyzeRoutePatterns(
  urls: Array<{ url: string; sitemap: string; lastmod?: string }>,
  sitemaps: SitemapValidation[]
): RoutePattern[] {
  // First pass: extract path segments and group URLs by structure
  const structureGroups = new Map<string, { urls: typeof urls; structure: string[] }>();
  let skipped = 0;

  for (const urlEntry of urls) {
    let pathname: string;
    try {
      pathname = new URL(urlEntry.url).pathname;
    } catch {
      // `new URL` throws on malformed input; one bad <loc> must not kill the plan.
      skipped += 1;
      continue;
    }

    const segments = pathname.split('/').filter(Boolean);
    const structure = segments.map((seg, i) =>
      isVariableSegment(seg) ? `{${guessSegmentType(seg, i, segments)}}` : seg
    );
    const structureKey = '/' + structure.join('/');

    let group = structureGroups.get(structureKey);
    if (!group) {
      group = { urls: [], structure };
      structureGroups.set(structureKey, group);
    }
    group.urls.push(urlEntry);
  }

  if (skipped > 0) {
    console.error(` Skipped ${skipped} malformed URL(s) during pattern analysis`);
  }

  // Second pass: classify each structure and aggregate its lastmod data
  const patterns: RoutePattern[] = [];
  for (const [structureKey, group] of structureGroups.entries()) {
    const type = classifyRouteType(group.structure);
    const name = generatePatternName(group.structure, type);

    let hasLastmod = false;
    let oldestDate: string | undefined;
    let newestDate: string | undefined;
    for (const urlEntry of group.urls) {
      if (urlEntry.lastmod) {
        hasLastmod = true;
        // Plain string comparison, as before.
        if (!oldestDate || urlEntry.lastmod < oldestDate) oldestDate = urlEntry.lastmod;
        if (!newestDate || urlEntry.lastmod > newestDate) newestDate = urlEntry.lastmod;
      }
    }

    patterns.push({
      pattern: structureKey,
      name,
      type,
      exampleUrls: group.urls.slice(0, 10).map(u => u.url),
      estimatedCount: group.urls.length,
      sitemapSource: group.urls[0]?.sitemap,
      hasLastmod,
      dateRange: oldestDate || newestDate ? { oldest: oldestDate, newest: newestDate } : {},
    });
  }

  // Sort by count descending
  return patterns.sort((a, b) => b.estimatedCount - a.estimatedCount);
}
// Shapes that mark a path segment as variable. Any single match is enough,
// so the order of the entries does not matter.
const VARIABLE_SEGMENT_SHAPES: readonly RegExp[] = [
  // Numeric IDs
  /^\d+$/,
  // UUIDs
  /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i,
  // Hash-like strings (16 or more hex characters)
  /^[a-f0-9]{16,}$/i,
  // Hyphenated slugs ending in a number (like "software-engineer-12345")
  /^[a-z0-9]+-[a-z0-9-]+-\d+$/i,
  // Segments that look like encoded IDs (20+ URL-safe characters)
  /^[a-zA-Z0-9_-]{20,}$/,
  // Job board prefixed filters: l-, k-, w-, j-
  // e.g., "l-new-york-ny", "k-software-engineer", "l-united-states-w-remote"
  /^[lkwj]-[a-z0-9-]+$/i,
];

/**
 * Decide whether a URL path segment is likely a variable (ID, slug, filter)
 * rather than a fixed part of the route.
 *
 * @param segment - A single path segment, without slashes.
 * @returns True when the segment is longer than 50 characters or matches one
 *   of the known variable shapes.
 */
function isVariableSegment(segment: string): boolean {
  // Very long segments (likely slugs or encoded data)
  if (segment.length > 50) {
    return true;
  }
  return VARIABLE_SEGMENT_SHAPES.some((shape) => shape.test(segment));
}
// Placeholder name for each single-letter job-board filter prefix
// (l-{location}, k-{keyword}, w-{worktype}, j-{jobtype}).
const FILTER_PREFIX_LABELS: Record<string, string> = {
  l: 'location-filter',
  k: 'keyword-filter',
  w: 'work-type-filter',
  j: 'job-type-filter',
};

// Placeholder name implied by the segment that precedes a variable segment.
const PARENT_SEGMENT_LABELS: ReadonlyArray<readonly [string, readonly string[]]> = [
  ['job-slug', ['job', 'jobs', 'position', 'vacancy', 'opening']],
  ['company-slug', ['company', 'employer', 'organization', 'org']],
  ['location', ['location', 'city', 'region', 'area', 'l']],
  ['category', ['category', 'cat', 'type', 'k']],
  ['user-id', ['user', 'profile', 'candidate']],
  ['post-slug', ['blog', 'article', 'post', 'news']],
];

/**
 * Pick a placeholder name for a variable path segment.
 *
 * Rules are tried in this order: UUID-like prefix, pure number, job-board
 * filter prefix, context from the preceding segment, and finally the
 * segment's own shape.
 *
 * @param segment - The variable segment.
 * @param position - Index of `segment` within `allSegments`.
 * @param allSegments - Every segment of the path, in order.
 * @returns A placeholder name such as 'id', 'uuid', 'job-slug' or 'param'.
 */
function guessSegmentType(segment: string, position: number, allSegments: string[]): string {
  // UUID-like (only the first two groups are checked)
  if (/^[0-9a-f]{8}-[0-9a-f]{4}-/i.test(segment)) {
    return 'uuid';
  }

  // Pure numeric
  if (/^\d+$/.test(segment)) {
    return 'id';
  }

  // Job board prefixed patterns (talent.com style)
  const prefixed = /^([lkwj])-[a-z0-9-]+$/i.exec(segment);
  if (prefixed) {
    return FILTER_PREFIX_LABELS[prefixed[1].toLowerCase()];
  }

  // Context clues from the preceding segment
  const parent = position > 0 ? allSegments[position - 1] : null;
  if (parent) {
    const parentKey = parent.toLowerCase();
    for (const [label, parents] of PARENT_SEGMENT_LABELS) {
      if (parents.includes(parentKey)) {
        return label;
      }
    }
  }

  // Default based on format
  return segment.includes('-') && segment.length > 10 ? 'slug' : 'param';
}
/**
 * Classify a route by its path structure.
 *
 * `structure` holds the path segments with variable parts already replaced
 * by `{placeholder}` tokens. Rules are evaluated top to bottom and the first
 * match wins:
 *  1. job detail — a singular job word (or `{<word>-slug}`) together with an
 *     ID-like placeholder anywhere in the path, or a job listing root
 *     (`jobs`, `careers`, ...) directly followed by an ID-like placeholder,
 *  2. job listing root — a single-segment path such as `/jobs`,
 *  3. filter pages — location filter without keyword filter is `location`;
 *     a keyword filter, or a work-type/job-type filter on its own, is
 *     `category`,
 *  4. legacy literal `l` / `location` / `l-...` and `k` / `category` /
 *     `k-...` segments,
 *  5. company, blog and static keywords,
 *  6. otherwise `other`.
 *
 * @param structure - Path segments with placeholders.
 * @returns The route type.
 */
function classifyRouteType(structure: string[]): RoutePattern['type'] {
  const segments = structure.map(s => s.toLowerCase());
  const hasAnyOf = (words: readonly string[]): boolean => segments.some(s => words.includes(s));
  const isIdLike = (s: string): boolean =>
    s.includes('{') && (s.includes('slug') || s.includes('id') || s.includes('uuid'));

  const jobDetailIndicators = ['job', 'position', 'vacancy', 'opening', 'career'];
  const jobListingRoots = ['jobs', 'careers', 'positions', 'vacancies'];

  // Job detail detection
  const hasJobIndicator = segments.some(s =>
    jobDetailIndicators.some(ind => s === ind || s === `{${ind}-slug}`)
  );
  if (hasJobIndicator && segments.some(isIdLike)) return 'job_detail';

  // A listing root directly followed by an ID-like placeholder is a detail
  // page too. Numeric IDs and UUIDs are labelled {id}/{uuid} regardless of
  // their parent, so "/jobs/12345" used to fall through to 'other'.
  const listingRootWithId = segments.some(
    (s, i) => jobListingRoots.includes(s) && i + 1 < segments.length && isIdLike(segments[i + 1])
  );
  if (listingRootWithId) return 'job_detail';

  // Job listing root (no filters)
  if (segments.length <= 1 && hasAnyOf(jobListingRoots)) {
    return 'job_listing';
  }

  // Check for filter-based patterns (e.g., {location-filter}, {keyword-filter})
  const hasLocationFilter = segments.some(s => s.includes('{location-filter}') || s === '{location}');
  const hasKeywordFilter = segments.some(s => s.includes('{keyword-filter}') || s === '{category}');
  const hasWorkTypeFilter = segments.some(s => s.includes('{work-type-filter}') || s.includes('{job-type-filter}'));

  // Location pages (location filter, optionally with a work type, but no keyword)
  if (hasLocationFilter && !hasKeywordFilter) {
    return 'location';
  }
  // Category pages (have keyword filter, may also have location)
  if (hasKeywordFilter) {
    return 'category';
  }
  // Work-type / job-type filter on its own: a filter page. Previously
  // {work-type-filter} only reached 'category' by accident (the text
  // "work-type" contains "k-") and {job-type-filter} did not at all.
  if (hasWorkTypeFilter) {
    return 'category';
  }

  // Legacy literal segments with 'l-' or 'k-' PREFIXES. A substring test
  // misfiles ordinary words such as "email-tips" or "work-from-home".
  if (segments.some(s => s === 'l' || s === 'location' || s.startsWith('l-'))) {
    return 'location';
  }
  if (segments.some(s => s === 'k' || s === 'category' || s.startsWith('k-'))) {
    return 'category';
  }

  // Company pages
  if (hasAnyOf(['company', 'employer', 'organization', 'org'])) {
    return 'company';
  }

  // Blog/content
  if (hasAnyOf(['blog', 'article', 'news', 'resources', 'insights'])) {
    return 'blog';
  }

  // Static pages
  if (hasAnyOf(['about', 'contact', 'privacy', 'terms', 'faq', 'help', 'support', 'legal'])) {
    return 'static';
  }

  // Default
  return 'other';
}
/**
 * Build a display name for a route pattern.
 *
 * The name is the generic label of the route type, prefixed with the first
 * non-placeholder segment (first letter upper-cased) when the structure has
 * one.
 *
 * @param structure - Path segments; placeholders start with `{`.
 * @param type - Route type that selects the generic label.
 * @returns E.g. "Jobs Job Detail Pages", or just "Other Pages" when every
 *   segment is a placeholder.
 */
function generatePatternName(structure: string[], type: RoutePattern['type']): string {
  const labelByType: Record<RoutePattern['type'], string> = {
    job_detail: 'Job Detail Pages',
    job_listing: 'Job Listing Pages',
    category: 'Category/Filter Pages',
    location: 'Location Pages',
    company: 'Company Pages',
    blog: 'Blog/Content Pages',
    static: 'Static Pages',
    other: 'Other Pages',
  };
  const genericLabel = labelByType[type];

  // The first literal (non-placeholder) segment makes the name more specific.
  const leadingLiteral = structure.find((part) => !part.startsWith('{'));
  if (leadingLiteral === undefined) {
    return genericLabel;
  }

  const capitalised = leadingLiteral.slice(0, 1).toUpperCase() + leadingLiteral.substring(1);
  return capitalised + ' ' + genericLabel;
}
/**
 * Roll sitemap and route-pattern figures up into the plan totals.
 *
 * @param sitemaps - Validation results; only `stats.urlCount` is read.
 * @param routePatterns - Detected patterns; `type` and `estimatedCount` are read.
 * @returns `estimatedPages` (sum of all sitemap URL counts), `sitemapsFound`
 *   (sitemaps with at least one URL), `routeTypesFound` (number of patterns)
 *   and `jobPagesEstimate` (URL count summed over `job_detail` patterns).
 */
function calculateTotals(
  sitemaps: SitemapValidation[],
  routePatterns: RoutePattern[]
): AuditPlan['totals'] {
  let estimatedPages = 0;
  let sitemapsFound = 0;
  for (const { stats } of sitemaps) {
    estimatedPages += stats.urlCount;
    if (stats.urlCount > 0) {
      sitemapsFound += 1;
    }
  }

  let jobPagesEstimate = 0;
  for (const route of routePatterns) {
    if (route.type === 'job_detail') {
      jobPagesEstimate += route.estimatedCount;
    }
  }

  return {
    estimatedPages,
    sitemapsFound,
    routeTypesFound: routePatterns.length,
    jobPagesEstimate,
  };
}
/**
 * Number of pages to sample for one route pattern.
 *
 * The per-type rule sets the target; the result is then capped at the
 * pattern's own URL count so the plan never asks for more pages than exist.
 */
function sampleCountForPattern(pattern: RoutePattern): number {
  const available = pattern.estimatedCount;
  let samples: number;

  switch (pattern.type) {
    case 'job_detail':
      // Stepped by volume: 10 for < 1000, 20 for < 10000, 30 for < 100000, 50 for more
      if (available < 1000) samples = 10;
      else if (available < 10000) samples = 20;
      else if (available < 100000) samples = 30;
      else samples = 50;
      break;
    case 'category':
    case 'location':
      // Landing pages are important - sample more proportionally (10%, 5..20)
      samples = Math.min(20, Math.max(5, Math.ceil(available * 0.1)));
      break;
    case 'company':
      // 5%, 5..15
      samples = Math.min(15, Math.max(5, Math.ceil(available * 0.05)));
      break;
    case 'job_listing':
    case 'static':
      // Sample all or most static pages
      samples = Math.min(10, available);
      break;
    case 'blog':
      // 10%, 3..10
      samples = Math.min(10, Math.max(3, Math.ceil(available * 0.1)));
      break;
    default:
      samples = Math.min(10, Math.max(3, available));
  }

  // The minimums above (3/5/10) may exceed what a small pattern contains.
  return Math.max(0, Math.min(samples, available));
}

/**
 * Generate the sampling strategy for the detected route patterns.
 *
 * Every pattern gets a sample count (see `sampleCountForPattern`) stored
 * under the key `<type>:<pattern>`. Lighthouse targets are the homepage
 * (first, when one of the example URLs has path "/") followed by the first
 * example URL of the first pattern of each of these types: job_detail,
 * category, location, job_listing.
 *
 * @param routePatterns - Detected patterns.
 * @param totals - Plan totals; `jobPagesEstimate` is quoted in the rationale.
 * @returns Sample counts, Lighthouse targets and a human-readable rationale.
 */
function generateSamplingStrategy(
  routePatterns: RoutePattern[],
  totals: AuditPlan['totals']
): AuditPlan['samplingStrategy'] {
  const samplesPerRouteType: Record<string, number> = {};
  let totalSamples = 0;

  // Strategy: More samples for important page types, fewer for massive collections
  for (const pattern of routePatterns) {
    const samples = sampleCountForPattern(pattern);
    samplesPerRouteType[`${pattern.type}:${pattern.pattern}`] = samples;
    totalSamples += samples;
  }

  // Select Lighthouse targets (one from each major type)
  const lighthouseTargets: string[] = [];
  const typesForLighthouse = ['job_detail', 'category', 'location', 'job_listing'];
  for (const type of typesForLighthouse) {
    const pattern = routePatterns.find(p => p.type === type && p.exampleUrls.length > 0);
    if (pattern) {
      lighthouseTargets.push(pattern.exampleUrls[0]);
    }
  }

  // Add homepage. An unparseable example URL simply is not the homepage;
  // `new URL` would otherwise throw and abort the strategy.
  const isHomepage = (candidate: string): boolean => {
    try {
      return new URL(candidate).pathname === '/';
    } catch {
      return false;
    }
  };
  let homepageUrl: string | undefined;
  for (const pattern of routePatterns) {
    homepageUrl = pattern.exampleUrls.find(isHomepage);
    if (homepageUrl !== undefined) break;
  }
  if (homepageUrl !== undefined) {
    lighthouseTargets.unshift(homepageUrl);
  }

  const rationale = `Sampling ${totalSamples} pages across ${routePatterns.length} route types. ` +
    `Job detail pages use logarithmic sampling due to volume (${totals.jobPagesEstimate.toLocaleString()} estimated). ` +
    `Landing pages (category/location) get higher proportional coverage for SEO impact. ` +
    `Lighthouse will run on ${lighthouseTargets.length} representative pages.`;

  return {
    totalSamplesToTake: totalSamples,
    samplesPerRouteType,
    lighthouseTargets,
    rationale,
  };
}
/**
 * Generate actionable recommendations from the planning results.
 *
 * Recommendations are appended in a fixed order: missing sitemaps, missing
 * dedicated jobs sitemap, low lastmod coverage, uncompressed large sitemaps,
 * Indexing API for large job volumes, category/location landing page
 * coverage, and a missing Sitemap directive in robots.txt.
 *
 * @param robots - robots.txt analysis (`found`, `sitemaps` are read).
 * @param sitemaps - Validation result of every processed sitemap.
 * @param routePatterns - Detected route patterns.
 * @param totals - Plan totals (`jobPagesEstimate` is read).
 * @returns Recommendation texts; empty when nothing applies.
 */
function generateRecommendations(
  robots: RobotsAnalysis,
  sitemaps: SitemapValidation[],
  routePatterns: RoutePattern[],
  totals: AuditPlan['totals']
): string[] {
  const recommendations: string[] = [];

  // Sitemap recommendations
  if (sitemaps.length === 0) {
    recommendations.push('CRITICAL: Create XML sitemaps to improve crawlability');
  }
  const jobSitemap = sitemaps.find(s =>
    s.url.toLowerCase().includes('job') && s.stats.urlCount > 0
  );
  if (!jobSitemap && totals.jobPagesEstimate > 1000) {
    recommendations.push('Consider creating a dedicated jobs sitemap for better job indexing');
  }

  // Lastmod recommendations
  const hasLowLastmodCoverage = sitemaps.some(s =>
    s.stats.urlCount > 0 &&
    (s.stats.hasLastmod / s.stats.urlCount) < 0.5
  );
  if (hasLowLastmodCoverage) {
    recommendations.push('Add lastmod dates to sitemap URLs to help search engines prioritize fresh content');
  }

  // Compression recommendations
  const hasLargeUncompressed = sitemaps.some(s =>
    s.stats.urlCount > 10000 && !s.stats.compressionUsed
  );
  if (hasLargeUncompressed) {
    recommendations.push('Use gzip compression for large sitemaps (>10k URLs)');
  }

  // Job board specific
  if (totals.jobPagesEstimate > 10000) {
    recommendations.push('Consider implementing Google Indexing API for faster job posting indexation');
  }

  // Landing page coverage. A route type is usually spread over several
  // patterns (e.g. "/{location-filter}" and "/jobs/{location-filter}"), so
  // the pages of ALL patterns of that type are counted; looking at a single
  // pattern under-reports coverage.
  const pagesOfType = (type: RoutePattern['type']): number => {
    let total = 0;
    for (const pattern of routePatterns) {
      if (pattern.type === type) total += pattern.estimatedCount;
    }
    return total;
  };
  if (pagesOfType('category') < 10) {
    recommendations.push('Consider creating more category landing pages (e.g., /marketing-jobs, /engineering-jobs)');
  }
  if (pagesOfType('location') < 10) {
    recommendations.push('Consider creating location landing pages (e.g., /jobs-in-new-york, /remote-jobs)');
  }

  // Robots.txt recommendations
  if (robots.found && robots.sitemaps.length === 0) {
    recommendations.push('Add Sitemap directive(s) to robots.txt');
  }

  return recommendations;
}
// Default export: the same function as the named `planAudit` export.
export default planAudit;