Skip to main content
Glama
RichardDillman

SEO Audit MCP Server

sample-pages.ts (13.5 kB)
// src/tools/sample-pages.ts // Intelligent page sampling tool - samples pages based on audit plan import type { PageAnalysis } from '../types/index.js'; import type { AuditPlan, RoutePattern } from './plan-audit.js'; import { analyzePage } from './crawl-page.js'; import { closeBrowser } from '../utils/browser.js'; // ============================================================================ // Types // ============================================================================ export interface SamplePagesInput { plan: AuditPlan; // The audit plan from plan_audit routeTypes?: string[]; // Specific route types to sample (default: all) samplesOverride?: Record<string, number>; // Override sample counts per route includeOldest?: boolean; // Include oldest URLs by lastmod (default: true) includeNewest?: boolean; // Include newest URLs by lastmod (default: true) randomSeed?: number; // Seed for reproducible random sampling concurrency?: number; // Concurrent page analyses (default: 2) onProgress?: (completed: number, total: number, url: string) => void; } export interface SampledPage extends PageAnalysis { routeType: string; routePattern: string; sampleReason: 'random' | 'oldest' | 'newest' | 'first' | 'manual'; } export interface SampleResult { baseUrl: string; timestamp: string; // Sampling summary summary: { totalSampled: number; successfulAnalyses: number; failedAnalyses: number; routeTypesCovered: number; totalAnalysisTimeMs: number; }; // Results by route type byRouteType: Record<string, { pattern: RoutePattern; samples: SampledPage[]; aggregatedIssues: string[]; }>; // All sampled pages (flat list) pages: SampledPage[]; // Failed URLs failures: Array<{ url: string; routeType: string; error: string; }>; // Cross-cutting findings findings: { pagesWithoutTitle: number; pagesWithoutDescription: number; pagesWithoutH1: number; pagesWithJobPostingSchema: number; pagesWithJobPostingErrors: number; pagesRequiringJsRendering: number; averageLoadTimeMs: number; 
commonIssues: Array<{ issue: string; count: number; percentage: number }>; }; } // ============================================================================ // Main Function // ============================================================================ export async function samplePages(input: SamplePagesInput): Promise<SampleResult> { const { plan, routeTypes, samplesOverride = {}, includeOldest = true, includeNewest = true, concurrency = 2, onProgress, } = input; const startTime = Date.now(); const pages: SampledPage[] = []; const failures: SampleResult['failures'] = []; const byRouteType: SampleResult['byRouteType'] = {}; // Filter route patterns if specific types requested let patternsToSample = plan.routePatterns; if (routeTypes && routeTypes.length > 0) { patternsToSample = plan.routePatterns.filter(p => routeTypes.includes(p.type) || routeTypes.includes(`${p.type}:${p.pattern}`) ); } // Build URL list with sampling strategy const urlsToSample: Array<{ url: string; routeType: string; routePattern: string; reason: SampledPage['sampleReason']; }> = []; for (const pattern of patternsToSample) { const key = `${pattern.type}:${pattern.pattern}`; const targetCount = samplesOverride[key] ?? samplesOverride[pattern.type] ?? plan.samplingStrategy.samplesPerRouteType[key] ?? 
5; const selectedUrls = selectSampleUrls( pattern, targetCount, includeOldest, includeNewest ); for (const { url, reason } of selectedUrls) { urlsToSample.push({ url, routeType: pattern.type, routePattern: pattern.pattern, reason, }); } // Initialize byRouteType entry byRouteType[key] = { pattern, samples: [], aggregatedIssues: [], }; } console.error(`\n=== Sampling ${urlsToSample.length} pages across ${patternsToSample.length} route types ===\n`); // Process URLs with concurrency control let completed = 0; const total = urlsToSample.length; // Simple concurrency using chunks for (let i = 0; i < urlsToSample.length; i += concurrency) { const chunk = urlsToSample.slice(i, i + concurrency); const results = await Promise.all( chunk.map(async ({ url, routeType, routePattern, reason }) => { try { console.error(`[${completed + 1}/${total}] Analyzing: ${url}`); const analysis = await analyzePage({ url, timeout: 25000, device: 'desktop', }); const sampledPage: SampledPage = { ...analysis, routeType, routePattern, sampleReason: reason, }; return { success: true, page: sampledPage, url, routeType, routePattern }; } catch (error: any) { console.error(` Failed: ${error.message}`); return { success: false, error: error.message, url, routeType, routePattern, }; } finally { completed++; onProgress?.(completed, total, url); } }) ); // Process results for (const result of results) { if (result.success && result.page) { pages.push(result.page); const key = `${result.routeType}:${result.routePattern}`; byRouteType[key]?.samples.push(result.page); } else { failures.push({ url: result.url, routeType: result.routeType, error: result.error || 'Unknown error', }); } } // Small delay between chunks to be polite if (i + concurrency < urlsToSample.length) { await new Promise(r => setTimeout(r, 500)); } } // Close browser after all analyses await closeBrowser(); // Aggregate issues by route type for (const [key, data] of Object.entries(byRouteType)) { data.aggregatedIssues = 
aggregateIssuesForRouteType(data.samples); } // Calculate findings const findings = calculateFindings(pages); const totalTimeMs = Date.now() - startTime; return { baseUrl: plan.baseUrl, timestamp: new Date().toISOString(), summary: { totalSampled: urlsToSample.length, successfulAnalyses: pages.length, failedAnalyses: failures.length, routeTypesCovered: Object.keys(byRouteType).filter(k => byRouteType[k].samples.length > 0).length, totalAnalysisTimeMs: totalTimeMs, }, byRouteType, pages, failures, findings, }; } // ============================================================================ // Helper Functions // ============================================================================ /** * Select URLs to sample from a route pattern */ function selectSampleUrls( pattern: RoutePattern, targetCount: number, includeOldest: boolean, includeNewest: boolean ): Array<{ url: string; reason: SampledPage['sampleReason'] }> { const selected: Array<{ url: string; reason: SampledPage['sampleReason'] }> = []; const available = [...pattern.exampleUrls]; if (available.length === 0) return []; // Always include first URL selected.push({ url: available[0], reason: 'first' }); // Try to include oldest/newest if we have date info // Note: In a real implementation, we'd need the full URL list with dates // For now, we'll just sample from available examples // Fill remaining with pseudo-random selection const remaining = targetCount - selected.length; const unselected = available.filter(u => !selected.some(s => s.url === u)); // Shuffle and take remaining const shuffled = shuffleArray(unselected); for (let i = 0; i < Math.min(remaining, shuffled.length); i++) { selected.push({ url: shuffled[i], reason: 'random' }); } return selected; } /** * Fisher-Yates shuffle */ function shuffleArray<T>(array: T[]): T[] { const result = [...array]; for (let i = result.length - 1; i > 0; i--) { const j = Math.floor(Math.random() * (i + 1)); [result[i], result[j]] = [result[j], result[i]]; } return 
result; } /** * Aggregate issues found across sampled pages for a route type */ function aggregateIssuesForRouteType(pages: SampledPage[]): string[] { const issues: string[] = []; if (pages.length === 0) return issues; // Check for missing titles const missingTitles = pages.filter(p => !p.title); if (missingTitles.length > 0) { issues.push(`${missingTitles.length}/${pages.length} pages missing title tag`); } // Check for missing descriptions const missingDesc = pages.filter(p => !p.metaDescription); if (missingDesc.length > 0) { issues.push(`${missingDesc.length}/${pages.length} pages missing meta description`); } // Check for missing H1 const missingH1 = pages.filter(p => p.headings.h1.length === 0); if (missingH1.length > 0) { issues.push(`${missingH1.length}/${pages.length} pages missing H1`); } // Check for multiple H1s const multipleH1 = pages.filter(p => p.headings.h1.length > 1); if (multipleH1.length > 0) { issues.push(`${multipleH1.length}/${pages.length} pages have multiple H1 tags`); } // Check for JS rendering requirement const jsRendered = pages.filter(p => p.rendering.jsRenderingRequired); if (jsRendered.length === pages.length) { issues.push('All pages require JavaScript rendering - potential indexing issues'); } else if (jsRendered.length > pages.length / 2) { issues.push(`${jsRendered.length}/${pages.length} pages require JavaScript rendering`); } // Check for JobPosting schema issues (for job detail pages) const withJobPosting = pages.filter(p => p.structuredData.hasJobPosting); const withJobPostingErrors = pages.filter(p => p.structuredData.jobPostingErrors.length > 0); if (withJobPosting.length > 0 && withJobPostingErrors.length > 0) { issues.push(`${withJobPostingErrors.length}/${withJobPosting.length} job pages have schema errors`); } // Check for slow pages const slowPages = pages.filter(p => p.loadTimeMs > 5000); if (slowPages.length > 0) { issues.push(`${slowPages.length}/${pages.length} pages are slow (>5s load time)`); } return issues; } 
/** * Calculate cross-cutting findings from all sampled pages */ function calculateFindings(pages: SampledPage[]): SampleResult['findings'] { if (pages.length === 0) { return { pagesWithoutTitle: 0, pagesWithoutDescription: 0, pagesWithoutH1: 0, pagesWithJobPostingSchema: 0, pagesWithJobPostingErrors: 0, pagesRequiringJsRendering: 0, averageLoadTimeMs: 0, commonIssues: [], }; } const pagesWithoutTitle = pages.filter(p => !p.title).length; const pagesWithoutDescription = pages.filter(p => !p.metaDescription).length; const pagesWithoutH1 = pages.filter(p => p.headings.h1.length === 0).length; const pagesWithJobPostingSchema = pages.filter(p => p.structuredData.hasJobPosting).length; const pagesWithJobPostingErrors = pages.filter(p => p.structuredData.jobPostingErrors.length > 0).length; const pagesRequiringJsRendering = pages.filter(p => p.rendering.jsRenderingRequired).length; const totalLoadTime = pages.reduce((sum, p) => sum + p.loadTimeMs, 0); const averageLoadTimeMs = Math.round(totalLoadTime / pages.length); // Collect all issues and count occurrences const issueCounts: Record<string, number> = {}; for (const page of pages) { if (!page.title) increment(issueCounts, 'Missing title tag'); if (!page.metaDescription) increment(issueCounts, 'Missing meta description'); if (page.headings.h1.length === 0) increment(issueCounts, 'Missing H1 tag'); if (page.headings.h1.length > 1) increment(issueCounts, 'Multiple H1 tags'); if (!page.canonicalUrl) increment(issueCounts, 'Missing canonical URL'); if (page.hasMixedContent) increment(issueCounts, 'Mixed content (HTTP on HTTPS)'); if (page.rendering.jsRenderingRequired) increment(issueCounts, 'Requires JavaScript rendering'); if (page.loadTimeMs > 5000) increment(issueCounts, 'Slow page load (>5s)'); if (page.loadTimeMs > 3000) increment(issueCounts, 'Moderate page load (>3s)'); // Image issues if (page.images.withoutAlt > 0) increment(issueCounts, 'Images without alt text'); // Heading issues for (const issue of 
page.headings.issues) { increment(issueCounts, issue); } // Schema issues for (const error of page.structuredData.jobPostingErrors) { increment(issueCounts, `Schema: ${error.message}`); } } // Sort by count and calculate percentage const commonIssues = Object.entries(issueCounts) .map(([issue, count]) => ({ issue, count, percentage: Math.round((count / pages.length) * 100), })) .sort((a, b) => b.count - a.count) .slice(0, 20); // Top 20 issues return { pagesWithoutTitle, pagesWithoutDescription, pagesWithoutH1, pagesWithJobPostingSchema, pagesWithJobPostingErrors, pagesRequiringJsRendering, averageLoadTimeMs, commonIssues, }; } function increment(obj: Record<string, number>, key: string): void { obj[key] = (obj[key] || 0) + 1; } export default samplePages;

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/RichardDillman/seo-audit-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.