// src/tools/sample-pages.ts
// Intelligent page sampling tool - samples pages based on audit plan
import type { PageAnalysis } from '../types/index.js';
import type { AuditPlan, RoutePattern } from './plan-audit.js';
import { analyzePage } from './crawl-page.js';
import { closeBrowser } from '../utils/browser.js';
// ============================================================================
// Types
// ============================================================================
/**
 * Options accepted by `samplePages`.
 *
 * Only `plan` is required; every other field falls back to the default noted
 * next to it.
 */
export interface SamplePagesInput {
plan: AuditPlan; // Audit plan (from plan_audit); supplies baseUrl, routePatterns and samplingStrategy
routeTypes?: string[]; // Route types to sample, matched against a pattern's `type` or `type:pattern`; omitted or empty means all patterns
samplesOverride?: Record<string, number>; // Sample counts keyed by `type:pattern` or `type`; take precedence over the plan's per-route counts
includeOldest?: boolean; // Include oldest URLs by lastmod (default: true). NOTE(review): forwarded to URL selection, which does not act on it yet
includeNewest?: boolean; // Include newest URLs by lastmod (default: true). NOTE(review): forwarded to URL selection, which does not act on it yet
randomSeed?: number; // Seed for reproducible random sampling
concurrency?: number; // Concurrent page analyses (default: 2)
onProgress?: (completed: number, total: number, url: string) => void; // Invoked after each URL finishes, whether the analysis succeeded or failed
}
/**
 * A page analysis annotated with the route pattern it was sampled for and
 * the reason the URL was picked.
 */
export interface SampledPage extends PageAnalysis {
routeType: string; // `type` of the route pattern the URL was sampled from
routePattern: string; // `pattern` of that same route pattern
sampleReason: 'random' | 'oldest' | 'newest' | 'first' | 'manual'; // Why the URL was chosen; the URL selection in this file only assigns 'first' and 'random'
}
/**
 * Everything `samplePages` reports for one sampling run.
 */
export interface SampleResult {
baseUrl: string; // Copied from the audit plan's baseUrl
timestamp: string; // ISO-8601 time at which the result object was built
// Sampling summary
summary: {
totalSampled: number; // URLs selected for analysis (successes + failures)
successfulAnalyses: number; // URLs whose analysis completed (length of `pages`)
failedAnalyses: number; // URLs whose analysis threw (length of `failures`)
routeTypesCovered: number; // `byRouteType` entries that ended up with at least one analyzed sample
totalAnalysisTimeMs: number; // Elapsed milliseconds from the start of samplePages until the result was built
};
// Results by route type, keyed by `type:pattern`
byRouteType: Record<string, {
pattern: RoutePattern;
samples: SampledPage[];
aggregatedIssues: string[]; // Human-readable issue summaries for this entry's samples
}>;
// All successfully analyzed pages (flat list across every route pattern)
pages: SampledPage[];
// URLs whose analysis failed, with the error message that was caught
failures: Array<{
url: string;
routeType: string;
error: string;
}>;
// Cross-cutting findings computed over `pages`
findings: {
pagesWithoutTitle: number;
pagesWithoutDescription: number;
pagesWithoutH1: number;
pagesWithJobPostingSchema: number;
pagesWithJobPostingErrors: number;
pagesRequiringJsRendering: number;
averageLoadTimeMs: number; // Rounded mean of loadTimeMs over `pages`; 0 when there are none
commonIssues: Array<{ issue: string; count: number; percentage: number }>; // At most 20 entries, highest count first; percentage is relative to the number of analyzed pages
};
}
// ============================================================================
// Main Function
// ============================================================================
/**
 * Sample URLs for each route pattern of an audit plan, analyze them, and
 * aggregate the outcome.
 *
 * For every selected route pattern a handful of example URLs is chosen (see
 * `selectSampleUrls`), each URL is analyzed with `analyzePage`, and the
 * results are grouped per `type:pattern` and summarized across all pages.
 * A failing page analysis does not abort the run; it is recorded in
 * `failures` instead.
 *
 * @param input - Sampling options; see `SamplePagesInput`. When `randomSeed`
 *   is provided the random part of the URL selection is reproducible.
 *   `concurrency` values below 1 (or non-numeric) are treated as 1.
 * @returns The per-route and cross-cutting results of the run.
 */
export async function samplePages(input: SamplePagesInput): Promise<SampleResult> {
  const {
    plan,
    routeTypes,
    samplesOverride = {},
    includeOldest = true,
    includeNewest = true,
    randomSeed,
    concurrency = 2,
    onProgress,
  } = input;

  const startTime = Date.now();
  const pages: SampledPage[] = [];
  const failures: SampleResult['failures'] = [];
  const byRouteType: SampleResult['byRouteType'] = {};

  // A seed makes the sampling reproducible; without one use Math.random.
  const random = randomSeed == null ? Math.random : createSeededRandom(randomSeed);

  // A chunk size below 1 (or NaN) would never advance the chunk loop below.
  const chunkSize = Math.max(1, Math.floor(concurrency) || 1);

  // Filter route patterns if specific types requested
  let patternsToSample = plan.routePatterns;
  if (routeTypes && routeTypes.length > 0) {
    patternsToSample = plan.routePatterns.filter(p =>
      routeTypes.includes(p.type) || routeTypes.includes(`${p.type}:${p.pattern}`)
    );
  }

  // Build URL list with sampling strategy
  const urlsToSample: Array<{
    url: string;
    routeType: string;
    routePattern: string;
    reason: SampledPage['sampleReason'];
  }> = [];

  for (const pattern of patternsToSample) {
    const key = `${pattern.type}:${pattern.pattern}`;
    // Most specific override first; `??` keeps an explicit 0 ("sample nothing").
    const targetCount = samplesOverride[key] ??
      samplesOverride[pattern.type] ??
      plan.samplingStrategy.samplesPerRouteType[key] ??
      5;

    const selectedUrls = selectSampleUrls(
      pattern,
      targetCount,
      includeOldest,
      includeNewest,
      random
    );
    for (const { url, reason } of selectedUrls) {
      urlsToSample.push({
        url,
        routeType: pattern.type,
        routePattern: pattern.pattern,
        reason,
      });
    }

    // Initialize byRouteType entry
    byRouteType[key] = {
      pattern,
      samples: [],
      aggregatedIssues: [],
    };
  }

  console.error(`\n=== Sampling ${urlsToSample.length} pages across ${patternsToSample.length} route types ===\n`);

  // Process URLs with concurrency control
  let completed = 0;
  const total = urlsToSample.length;

  try {
    // Simple concurrency using chunks
    for (let i = 0; i < total; i += chunkSize) {
      const chunk = urlsToSample.slice(i, i + chunkSize);
      const results = await Promise.all(
        chunk.map(async ({ url, routeType, routePattern, reason }, offset) => {
          try {
            // Label by position in the list: `completed` is identical for
            // every task of a chunk at the moment the tasks start.
            console.error(`[${i + offset + 1}/${total}] Analyzing: ${url}`);
            const analysis = await analyzePage({
              url,
              timeout: 25000,
              device: 'desktop',
            });
            const sampledPage: SampledPage = {
              ...analysis,
              routeType,
              routePattern,
              sampleReason: reason,
            };
            return { success: true as const, page: sampledPage, url, routeType, routePattern };
          } catch (error: unknown) {
            // Anything can be thrown; only Error instances carry `.message`.
            const message = error instanceof Error ? error.message : String(error);
            console.error(` Failed: ${message}`);
            return {
              success: false as const,
              error: message,
              url,
              routeType,
              routePattern,
            };
          } finally {
            completed++;
            onProgress?.(completed, total, url);
          }
        })
      );

      // Process results
      for (const result of results) {
        if (result.success) {
          pages.push(result.page);
          const key = `${result.routeType}:${result.routePattern}`;
          byRouteType[key]?.samples.push(result.page);
        } else {
          failures.push({
            url: result.url,
            routeType: result.routeType,
            error: result.error || 'Unknown error',
          });
        }
      }

      // Small delay between chunks to be polite
      if (i + chunkSize < total) {
        await new Promise(r => setTimeout(r, 500));
      }
    }
  } finally {
    // Release the browser even when the loop above throws
    // (for example from an onProgress callback).
    await closeBrowser();
  }

  // Aggregate issues by route type
  for (const data of Object.values(byRouteType)) {
    data.aggregatedIssues = aggregateIssuesForRouteType(data.samples);
  }

  // Calculate findings
  const findings = calculateFindings(pages);
  const totalTimeMs = Date.now() - startTime;

  return {
    baseUrl: plan.baseUrl,
    timestamp: new Date().toISOString(),
    summary: {
      totalSampled: urlsToSample.length,
      successfulAnalyses: pages.length,
      failedAnalyses: failures.length,
      routeTypesCovered: Object.values(byRouteType).filter(data => data.samples.length > 0).length,
      totalAnalysisTimeMs: totalTimeMs,
    },
    byRouteType,
    pages,
    failures,
    findings,
  };
}

// ============================================================================
// Helper Functions
// ============================================================================

/**
 * Create a deterministic pseudo-random generator (mulberry32).
 *
 * @param seed - Any number; it is truncated to an unsigned 32-bit integer.
 * @returns A function yielding values in [0, 1), usable in place of
 *   `Math.random`. The same seed always produces the same sequence.
 */
function createSeededRandom(seed: number): () => number {
  let state = seed >>> 0;
  return () => {
    state = (state + 0x6d2b79f5) >>> 0;
    let t = state;
    t = Math.imul(t ^ (t >>> 15), t | 1);
    t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}

/**
 * Select URLs to sample from a route pattern.
 *
 * The first distinct example URL is always taken (reason 'first'); the rest
 * of the quota is filled with a random pick from the remaining distinct
 * example URLs (reason 'random').
 *
 * @param pattern - Route pattern whose `exampleUrls` are sampled.
 * @param targetCount - Desired number of URLs; 0 or less selects nothing.
 * @param includeOldest - Accepted for interface stability; not acted on yet.
 * @param includeNewest - Accepted for interface stability; not acted on yet.
 * @param random - Source of randomness in [0, 1); defaults to `Math.random`.
 * @returns The chosen URLs, each tagged with the reason it was selected.
 */
function selectSampleUrls(
  pattern: RoutePattern,
  targetCount: number,
  includeOldest: boolean,
  includeNewest: boolean,
  random: () => number = Math.random
): Array<{ url: string; reason: SampledPage['sampleReason'] }> {
  // De-duplicate so the same URL is never analyzed twice for one pattern.
  const available = [...new Set(pattern.exampleUrls)];
  const first = available[0];

  // Nothing to sample, or the caller explicitly asked for zero samples.
  if (first === undefined || targetCount <= 0) return [];

  // Always include first URL
  const selected: Array<{ url: string; reason: SampledPage['sampleReason'] }> = [
    { url: first, reason: 'first' },
  ];

  // Try to include oldest/newest if we have date info
  // Note: In a real implementation, we'd need the full URL list with dates
  // For now, we'll just sample from available examples

  // Fill remaining with pseudo-random selection
  const remaining = targetCount - selected.length;
  const shuffled = shuffleArray(available.slice(1), random);
  for (let i = 0; i < Math.min(remaining, shuffled.length); i++) {
    selected.push({ url: shuffled[i], reason: 'random' });
  }
  return selected;
}

/**
 * Fisher-Yates shuffle.
 *
 * @param array - Items to shuffle; the input array is not modified.
 * @param random - Source of randomness in [0, 1); defaults to `Math.random`.
 * @returns A new array holding the same items in shuffled order.
 */
function shuffleArray<T>(array: T[], random: () => number = Math.random): T[] {
  const result = [...array];
  for (let i = result.length - 1; i > 0; i--) {
    const j = Math.floor(random() * (i + 1));
    [result[i], result[j]] = [result[j], result[i]];
  }
  return result;
}
/**
 * Summarize the problems seen across the sampled pages of one route type.
 *
 * Each entry of the returned list is a human-readable sentence such as
 * "2/5 pages missing title tag". An empty input yields an empty list.
 *
 * @param pages - Successfully analyzed samples belonging to one route type.
 * @returns Issue summaries in a fixed order: title, description, H1 (missing,
 *   then multiple), JavaScript rendering, JobPosting schema, load time.
 */
function aggregateIssuesForRouteType(pages: SampledPage[]): string[] {
  const total = pages.length;
  if (total === 0) {
    return [];
  }

  // Number of pages for which the check is truthy (same rule as Array#filter).
  const tally = (matches: (page: SampledPage) => unknown): number => {
    let hits = 0;
    for (const page of pages) {
      if (matches(page)) hits += 1;
    }
    return hits;
  };

  const report: string[] = [];

  // Checks that are reported as "<hits>/<total> ..." whenever any page matches.
  const ratioChecks: Array<{
    matches: (page: SampledPage) => unknown;
    describe: (hits: number) => string;
  }> = [
    {
      matches: page => !page.title,
      describe: hits => `${hits}/${total} pages missing title tag`,
    },
    {
      matches: page => !page.metaDescription,
      describe: hits => `${hits}/${total} pages missing meta description`,
    },
    {
      matches: page => page.headings.h1.length === 0,
      describe: hits => `${hits}/${total} pages missing H1`,
    },
    {
      matches: page => page.headings.h1.length > 1,
      describe: hits => `${hits}/${total} pages have multiple H1 tags`,
    },
  ];
  for (const { matches, describe } of ratioChecks) {
    const hits = tally(matches);
    if (hits > 0) {
      report.push(describe(hits));
    }
  }

  // JavaScript rendering is only reported when it affects all pages or a majority.
  const jsCount = tally(page => page.rendering.jsRenderingRequired);
  if (jsCount === total) {
    report.push('All pages require JavaScript rendering - potential indexing issues');
  } else if (jsCount > total / 2) {
    report.push(`${jsCount}/${total} pages require JavaScript rendering`);
  }

  // JobPosting schema errors, expressed relative to the pages carrying the schema.
  const jobPostingCount = tally(page => page.structuredData.hasJobPosting);
  const jobPostingErrorCount = tally(page => page.structuredData.jobPostingErrors.length > 0);
  if (jobPostingCount > 0 && jobPostingErrorCount > 0) {
    report.push(`${jobPostingErrorCount}/${jobPostingCount} job pages have schema errors`);
  }

  // Pages slower than five seconds.
  const slowCount = tally(page => page.loadTimeMs > 5000);
  if (slowCount > 0) {
    report.push(`${slowCount}/${total} pages are slow (>5s load time)`);
  }

  return report;
}
/**
 * Calculate cross-cutting findings from all sampled pages.
 *
 * Besides a few headline counters, this builds `commonIssues`: for every
 * issue label, the number of pages showing it and that number as a rounded
 * percentage of all pages, limited to the 20 most frequent labels.
 *
 * An issue is counted at most once per page, so `count` never exceeds the
 * number of pages and `percentage` never exceeds 100, even when a page
 * reports the same heading issue or schema error message more than once.
 *
 * Note that a page slower than 5s is counted under both the "slow" and the
 * "moderate" (>3s) labels.
 *
 * @param pages - All successfully analyzed pages.
 * @returns The findings; all-zero counters and no issues for an empty input.
 */
function calculateFindings(pages: SampledPage[]): SampleResult['findings'] {
  if (pages.length === 0) {
    return {
      pagesWithoutTitle: 0,
      pagesWithoutDescription: 0,
      pagesWithoutH1: 0,
      pagesWithJobPostingSchema: 0,
      pagesWithJobPostingErrors: 0,
      pagesRequiringJsRendering: 0,
      averageLoadTimeMs: 0,
      commonIssues: [],
    };
  }

  const pagesWithoutTitle = pages.filter(p => !p.title).length;
  const pagesWithoutDescription = pages.filter(p => !p.metaDescription).length;
  const pagesWithoutH1 = pages.filter(p => p.headings.h1.length === 0).length;
  const pagesWithJobPostingSchema = pages.filter(p => p.structuredData.hasJobPosting).length;
  const pagesWithJobPostingErrors = pages.filter(p => p.structuredData.jobPostingErrors.length > 0).length;
  const pagesRequiringJsRendering = pages.filter(p => p.rendering.jsRenderingRequired).length;

  const totalLoadTime = pages.reduce((sum, p) => sum + p.loadTimeMs, 0);
  const averageLoadTimeMs = Math.round(totalLoadTime / pages.length);

  // Count, per issue label, the number of pages on which it occurs
  const issueCounts: Record<string, number> = {};
  for (const page of pages) {
    // Collected in a Set so a label repeated on one page is counted once;
    // insertion order is kept, so ties keep their first-seen order.
    const pageIssues = new Set<string>();

    if (!page.title) pageIssues.add('Missing title tag');
    if (!page.metaDescription) pageIssues.add('Missing meta description');
    if (page.headings.h1.length === 0) pageIssues.add('Missing H1 tag');
    if (page.headings.h1.length > 1) pageIssues.add('Multiple H1 tags');
    if (!page.canonicalUrl) pageIssues.add('Missing canonical URL');
    if (page.hasMixedContent) pageIssues.add('Mixed content (HTTP on HTTPS)');
    if (page.rendering.jsRenderingRequired) pageIssues.add('Requires JavaScript rendering');
    if (page.loadTimeMs > 5000) pageIssues.add('Slow page load (>5s)');
    if (page.loadTimeMs > 3000) pageIssues.add('Moderate page load (>3s)');

    // Image issues
    if (page.images.withoutAlt > 0) pageIssues.add('Images without alt text');

    // Heading issues
    for (const issue of page.headings.issues) {
      pageIssues.add(issue);
    }

    // Schema issues
    for (const error of page.structuredData.jobPostingErrors) {
      pageIssues.add(`Schema: ${error.message}`);
    }

    for (const issue of pageIssues) {
      increment(issueCounts, issue);
    }
  }

  // Sort by count and calculate percentage
  const commonIssues = Object.entries(issueCounts)
    .map(([issue, count]) => ({
      issue,
      count,
      percentage: Math.round((count / pages.length) * 100),
    }))
    .sort((a, b) => b.count - a.count)
    .slice(0, 20); // Top 20 issues

  return {
    pagesWithoutTitle,
    pagesWithoutDescription,
    pagesWithoutH1,
    pagesWithJobPostingSchema,
    pagesWithJobPostingErrors,
    pagesRequiringJsRendering,
    averageLoadTimeMs,
    commonIssues,
  };
}

/**
 * Add one to the counter stored under `key`, starting from zero when the
 * key has not been counted yet.
 *
 * Only own properties are read, so keys that collide with inherited members
 * of plain objects (for example "constructor") are still counted correctly.
 *
 * @param obj - Counter map, mutated in place.
 * @param key - Counter to increase.
 */
function increment(obj: Record<string, number>, key: string): void {
  const current = Object.prototype.hasOwnProperty.call(obj, key) ? (obj[key] ?? 0) : 0;
  obj[key] = current + 1;
}
// Default export: the same function as the named `samplePages` export.
export default samplePages;