// src/utils/page-capture.ts
// Single-fetch page capture with HAR recording and comprehensive data extraction
import type { BrowserContext, Page } from 'playwright';
import type { PageSnapshot, StoredPageAnalysis, AuditStorage } from './storage.js';
import {
createPage,
extractMetaTags,
extractHeadings,
extractJsonLd,
extractLinks,
extractImages,
checkMixedContent,
detectFramework,
} from './browser.js';
import {
savePageSnapshot,
savePageAnalysis,
saveHarFile,
hasPageSnapshot,
loadPageSnapshot,
loadPageAnalysis,
} from './storage.js';
// ============================================================================
// Types
// ============================================================================
export interface CaptureOptions {
  /** Navigation timeout in milliseconds (default 30000). */
  timeout?: number;
  /** Device emulation profile passed to createPage (default 'desktop'). */
  device?: 'desktop' | 'mobile';
  /** Record network activity to a file referenced by snapshot.harFile (default false). */
  captureHar?: boolean;
  /** Save a full-page PNG screenshot referenced by snapshot.screenshotFile (default false). */
  captureScreenshot?: boolean;
  /** Optional CSS selector to wait for before extraction (best-effort, capped at 10s). */
  waitForSelector?: string;
}
export interface CaptureResult {
  /** Raw capture artifacts: URL, timestamp, HTTP status, headers, pre-JS and rendered HTML. */
  snapshot: PageSnapshot;
  /** SEO data extracted from the captured page. */
  analysis: StoredPageAnalysis;
  cached: boolean; // True if loaded from storage instead of fetched
}
// ============================================================================
// Main Capture Function
// ============================================================================
/**
 * Capture a page once and extract all data needed for SEO analysis.
 *
 * Checks storage first and returns the cached snapshot/analysis when both
 * load successfully (`cached: true`); otherwise opens a fresh browser
 * context, navigates, extracts all SEO data in a single pass, persists the
 * results, and always closes the context.
 *
 * @param storage - Audit storage used for the cache lookup and persistence
 * @param url - Absolute URL to capture
 * @param options - Timeout, device profile, HAR/screenshot capture, optional selector
 * @returns Snapshot + analysis, with `cached` indicating a storage hit
 */
export async function capturePage(
  storage: AuditStorage,
  url: string,
  options: CaptureOptions = {}
): Promise<CaptureResult> {
  const {
    timeout = 30000,
    device = 'desktop',
    captureHar = false,
    captureScreenshot = false,
    waitForSelector,
  } = options;
  // Cache check: only short-circuit when BOTH snapshot and analysis load;
  // a partial cache falls through to a fresh fetch.
  if (await hasPageSnapshot(storage, url)) {
    const snapshot = await loadPageSnapshot(storage, url);
    const analysis = await loadPageAnalysis(storage, url);
    if (snapshot && analysis) {
      // console.error keeps progress logging off stdout — presumably so
      // stdout stays clean for machine-readable output; TODO confirm.
      console.error(`[CACHE HIT] ${url}`);
      return { snapshot, analysis, cached: true };
    }
  }
  console.error(`[FETCHING] ${url}`);
  const { context, page } = await createPage({ device, timeout });
  // NOTE(review): context.tracing.start/stop produces a Playwright trace
  // archive, not a HAR file, yet it is saved below with a .har extension.
  // Real HAR capture requires the recordHar option at context creation —
  // verify what consumers of snapshot.harFile expect. (The imported
  // saveHarFile helper is currently unused.)
  if (captureHar) {
    await context.tracing.start({ screenshots: false, snapshots: false });
  }
  try {
    // Capture the raw server response body (pre-JS "initial HTML") so it can
    // later be compared against the rendered DOM.
    // NOTE(review): the listener matches response.url() === url exactly, so
    // if the navigation redirects, initialHtml stays '' — confirm intended.
    let initialHtml = '';
    page.on('response', async (response) => {
      if (response.url() === url && response.headers()['content-type']?.includes('text/html')) {
        try {
          initialHtml = await response.text();
        } catch {
          // Ignore — response body may no longer be available.
        }
      }
    });
    // Navigate and time the full load.
    const startTime = Date.now();
    const response = await page.goto(url, {
      waitUntil: 'networkidle',
      timeout,
    });
    // Wait for an optional caller-supplied selector; failure is swallowed
    // deliberately so a missing selector does not abort the capture.
    if (waitForSelector) {
      await page.waitForSelector(waitForSelector, { timeout: 10000 }).catch(() => {});
    }
    // Second network-idle wait catches requests triggered after the selector
    // appeared (largely redundant with the goto waitUntil above).
    await page.waitForLoadState('networkidle').catch(() => {});
    const loadTimeMs = Date.now() - startTime;
    // goto can return null (e.g. same-document navigation) — report status 0.
    const httpStatus = response?.status() || 0;
    // Copy response headers into a plain object for JSON serialization.
    const headers: Record<string, string> = {};
    if (response) {
      const respHeaders = response.headers();
      for (const [key, value] of Object.entries(respHeaders)) {
        headers[key] = value;
      }
    }
    // Rendered HTML: the DOM after JS execution.
    const renderedHtml = await page.content();
    const snapshot: PageSnapshot = {
      url,
      fetchedAt: new Date().toISOString(),
      httpStatus,
      headers,
      initialHtml,
      renderedHtml,
    };
    // Persist the trace (see NOTE above about the .har naming).
    if (captureHar) {
      const harPath = `${storage.paths.har}/${Date.now()}.har`;
      await context.tracing.stop({ path: harPath });
      snapshot.harFile = harPath;
    }
    // Full-page screenshot saved alongside the page data.
    if (captureScreenshot) {
      const screenshotPath = `${storage.paths.pages}/${Date.now()}.png`;
      await page.screenshot({ path: screenshotPath, fullPage: true });
      snapshot.screenshotFile = screenshotPath;
    }
    // Extract all SEO data from the page (single pass).
    const analysis = await extractAllSeoData(page, url, snapshot, loadTimeMs);
    // Persist both artifacts so the next call is a cache hit.
    await savePageSnapshot(storage, snapshot);
    await savePageAnalysis(storage, url, analysis);
    return { snapshot, analysis, cached: false };
  } finally {
    // Always release the browser context, even on navigation failure.
    await context.close();
  }
}
/**
* Extract all SEO-relevant data from a page in a single pass
*/
async function extractAllSeoData(
page: Page,
url: string,
snapshot: PageSnapshot,
loadTimeMs: number
): Promise<StoredPageAnalysis> {
// Run all extractions in parallel for efficiency
const [
metaTags,
headings,
jsonLd,
links,
images,
hasMixedContent,
framework,
openGraph,
twitterCard,
language,
] = await Promise.all([
extractMetaTags(page),
extractHeadings(page),
extractJsonLd(page),
extractLinks(page, url),
extractImages(page),
checkMixedContent(page),
detectFramework(page),
extractOpenGraph(page),
extractTwitterCard(page),
page.$eval('html', el => el.getAttribute('lang')).catch(() => null),
]);
// Analyze structured data
const structuredData = analyzeStructuredData(jsonLd);
// Determine if JS rendering is required
const jsRenderingRequired = snapshot.renderedHtml.length > snapshot.initialHtml.length * 1.5;
return {
snapshot,
seo: {
title: metaTags.title,
metaDescription: metaTags.description,
canonical: metaTags.canonical,
robots: metaTags.robots,
headings,
viewport: metaTags.viewport,
language,
},
structuredData,
openGraph,
twitter: twitterCard,
technical: {
isHttps: url.startsWith('https'),
hasMixedContent,
jsRenderingRequired,
framework,
loadTimeMs,
},
links: {
internal: links.internal,
external: links.external,
totalCount: links.internal.length + links.external.length,
},
images: {
total: images.total,
withAlt: images.withAlt,
withoutAlt: images.withoutAlt,
images: images.images.slice(0, 50), // Limit for storage
},
};
}
/**
 * Read the standard Open Graph <meta property="og:*"> tags from the page.
 * Absent tags are reported as null rather than omitted.
 */
async function extractOpenGraph(page: Page): Promise<StoredPageAnalysis['openGraph']> {
  return page.evaluate(() => {
    const read = (property: string): string | null =>
      document.querySelector(`meta[property="${property}"]`)?.getAttribute('content') ?? null;
    return {
      title: read('og:title'),
      description: read('og:description'),
      image: read('og:image'),
      type: read('og:type'),
      url: read('og:url'),
    };
  });
}
/**
 * Read the Twitter Card <meta name="twitter:*"> tags from the page.
 * Absent tags are reported as null rather than omitted.
 */
async function extractTwitterCard(page: Page): Promise<StoredPageAnalysis['twitter']> {
  return page.evaluate(() => {
    const read = (metaName: string): string | null =>
      document.querySelector(`meta[name="${metaName}"]`)?.getAttribute('content') ?? null;
    return {
      card: read('twitter:card'),
      title: read('twitter:title'),
      description: read('twitter:description'),
      image: read('twitter:image'),
    };
  });
}
/**
 * Analyze extracted JSON-LD with a JobPosting focus.
 *
 * Tolerates real-world JSON-LD variance: `@type` given as an array
 * (e.g. ["JobPosting"]), entities wrapped in a top-level `@graph` array,
 * and null/primitive entries (skipped instead of throwing, which the
 * previous `item['@type']` access did on null items).
 */
function analyzeStructuredData(jsonLd: any[]): StoredPageAnalysis['structuredData'] {
  const result = {
    jsonLd,
    hasJobPosting: false,
    jobPostings: [] as any[],
    jobPostingErrors: [] as any[],
    jobPostingWarnings: [] as any[],
  };
  // Collect candidate entities, flattening one level of @graph nesting.
  const candidates: any[] = [];
  for (const item of jsonLd) {
    if (!item || typeof item !== 'object') continue; // skip null / primitives
    candidates.push(item);
    if (Array.isArray(item['@graph'])) {
      for (const nested of item['@graph']) {
        if (nested && typeof nested === 'object') candidates.push(nested);
      }
    }
  }
  for (const item of candidates) {
    // Per JSON-LD, @type may be a single string or an array of strings.
    const type = item['@type'];
    const isJobPosting =
      type === 'JobPosting' || (Array.isArray(type) && type.includes('JobPosting'));
    if (isJobPosting) {
      result.hasJobPosting = true;
      const { errors, warnings } = validateJobPosting(item);
      result.jobPostings.push(item);
      result.jobPostingErrors.push(...errors);
      result.jobPostingWarnings.push(...warnings);
    }
  }
  return result;
}

/**
 * Validate a JobPosting object against Google's required and recommended
 * structured-data fields.
 *
 * @returns errors for missing required fields; warnings for missing
 *          recommended fields and for postings whose validThrough is past.
 */
function validateJobPosting(schema: any): { errors: any[]; warnings: any[] } {
  const errors: any[] = [];
  const warnings: any[] = [];
  // Required by Google's JobPosting structured-data guidelines.
  const required = ['title', 'description', 'datePosted', 'hiringOrganization', 'jobLocation'];
  for (const field of required) {
    if (!schema[field]) {
      errors.push({ field, message: `Missing required field: ${field}`, severity: 'error' });
    }
  }
  // Recommended fields improve rich-result eligibility.
  const recommended = ['validThrough', 'baseSalary', 'employmentType', 'directApply'];
  for (const field of recommended) {
    if (!schema[field]) {
      warnings.push({ field, message: `Missing recommended field: ${field}`, severity: 'warning' });
    }
  }
  // Expired postings are ineligible. An unparseable date yields NaN and is
  // skipped (NaN comparisons are always false).
  if (schema.validThrough) {
    const expiry = new Date(schema.validThrough);
    if (expiry < new Date()) {
      warnings.push({ field: 'validThrough', message: 'Job posting has expired', severity: 'warning' });
    }
  }
  return { errors, warnings };
}
/**
 * Batch capture multiple pages with bounded concurrency.
 *
 * Pages are processed in chunks of `concurrency`; a failed capture does not
 * abort the batch — it yields a placeholder result (httpStatus 0, null
 * analysis) carrying the error message so callers can report per-URL
 * failures.
 *
 * Fixes over previous version: concurrency <= 0 no longer causes an infinite
 * loop (the step is clamped to >= 1), and non-Error throws no longer produce
 * an undefined error message.
 *
 * @param storage - Audit storage shared by all captures
 * @param urls - URLs to capture, in order
 * @param options - Capture options plus `concurrency` (default 2, minimum 1)
 */
export async function capturePages(
  storage: AuditStorage,
  urls: string[],
  options: CaptureOptions & { concurrency?: number } = {}
): Promise<CaptureResult[]> {
  const { concurrency = 2, ...captureOptions } = options;
  // Clamp: a step of 0 or less would never advance the loop index below.
  const step = Math.max(1, Math.floor(concurrency));
  const results: CaptureResult[] = [];
  // Process in chunks for controlled concurrency.
  for (let i = 0; i < urls.length; i += step) {
    const chunk = urls.slice(i, i + step);
    const chunkResults = await Promise.all(
      chunk.map(url =>
        capturePage(storage, url, captureOptions).catch((err: unknown) => ({
          snapshot: {
            url,
            fetchedAt: new Date().toISOString(),
            httpStatus: 0,
            headers: {},
            initialHtml: '',
            renderedHtml: '',
          },
          analysis: null as any,
          cached: false,
          // Throws are not guaranteed to be Error instances.
          error: err instanceof Error ? err.message : String(err),
        }))
      )
    );
    results.push(...chunkResults);
    // Brief pause between chunks to avoid hammering the target host.
    if (i + step < urls.length) {
      await new Promise(r => setTimeout(r, 500));
    }
  }
  return results;
}
// Convenience aggregate mirroring the named exports above.
export default {
  capturePage,
  capturePages,
};