// src/tools/crawl-page.ts
// Single page SEO analysis tool
import type {
PageAnalysis,
StructuredDataResult,
JobPostingSchema,
SchemaError,
AnalyzePageInput
} from '../types/index.js';
import {
createPage,
navigateToUrl,
extractMetaTags,
extractHeadings,
extractJsonLd,
extractLinks,
extractImages,
checkMixedContent,
detectFramework,
getRenderedHtml,
checkCriticalContent,
} from '../utils/browser.js';
/**
 * Analyze a single page for SEO factors.
 *
 * Opens a page via `createPage`, navigates to `input.url`, extracts meta tags,
 * headings, JSON-LD, links, images, mixed-content status and framework in
 * parallel, and assembles the results into a `PageAnalysis`.
 *
 * The browser context returned by `createPage` is always closed before this
 * function settles, whether the analysis succeeds or throws.
 *
 * NOTE: `input.checkLinks` is not acted on here; `links.broken` is always
 * returned as an empty array, as is `images.oversized`.
 *
 * @param input - Target URL plus optional `waitForSelector`, `timeout`
 *   (defaults to 30000) and `device` (defaults to `'desktop'`).
 * @returns The assembled analysis for the page.
 */
export async function analyzePage(input: AnalyzePageInput): Promise<PageAnalysis> {
  const { url, waitForSelector, timeout = 30000, device = 'desktop' } = input;

  const { context, page } = await createPage({ device, timeout });

  try {
    // Navigate and capture timing
    const { response, loadTimeMs, initialHtml, redirectChain } = await navigateToUrl(
      page,
      url,
      { waitForSelector, timeout }
    );
    // 0 signals that navigation produced no response object.
    const httpStatus = response?.status() ?? 0;

    // Get rendered HTML
    const renderedHtml = await getRenderedHtml(page);

    // Extract all SEO elements in parallel
    const [
      metaTags,
      headings,
      jsonLd,
      links,
      images,
      hasMixedContent,
      framework,
    ] = await Promise.all([
      extractMetaTags(page),
      extractHeadings(page),
      extractJsonLd(page),
      extractLinks(page, url),
      extractImages(page),
      checkMixedContent(page),
      detectFramework(page),
    ]);

    // Analyze structured data
    const structuredData = analyzeStructuredData(jsonLd);

    // Heuristic: the page is considered JS-dependent when the rendered DOM is
    // more than 1.5x the size of the HTML that was initially delivered.
    const jsRenderingRequired = renderedHtml.length > initialHtml.length * 1.5;
    const criticalPatterns = ['job', 'position', 'career', 'apply', 'salary'];
    const criticalContentInInitialHtml = checkCriticalContent(initialHtml, criticalPatterns);

    // Get page language; a failed lookup is reported as null instead of failing
    // the whole analysis.
    const language = await page.$eval('html', el => el.getAttribute('lang')).catch(() => null);

    // URL schemes are case-insensitive, and a bare 'https' prefix test would
    // also match unrelated schemes that merely start with those letters.
    const isHttps = /^https:\/\//i.test(url);

    const analysis: PageAnalysis = {
      url,
      timestamp: new Date().toISOString(),
      httpStatus,
      redirectChain,
      responseTime: loadTimeMs,
      title: metaTags.title,
      metaDescription: metaTags.description,
      canonicalUrl: metaTags.canonical,
      robotsMeta: metaTags.robots,
      headings,
      structuredData,
      isHttps,
      hasMixedContent,
      viewport: metaTags.viewport,
      charset: metaTags.charset,
      language,
      rendering: {
        initialHtmlLength: initialHtml.length,
        renderedHtmlLength: renderedHtml.length,
        jsRenderingRequired,
        // Guard against division by zero when no initial HTML was captured.
        jsRenderingRatio: initialHtml.length > 0 ? renderedHtml.length / initialHtml.length : 0,
        criticalContentInInitialHtml,
        framework,
      },
      links: {
        internal: links.internal,
        external: links.external,
        broken: [], // Not populated by this function
        nofollow: links.nofollow,
        totalCount: links.internal.length + links.external.length,
      },
      images: {
        total: images.total,
        withAlt: images.withAlt,
        withoutAlt: images.withoutAlt,
        lazyLoaded: images.lazyLoaded,
        oversized: [], // Would need size checking
        images: images.images.slice(0, 50), // Limit for response size
      },
      loadTimeMs,
    };

    return analysis;
  } finally {
    await context.close();
  }
}
/**
 * Analyze structured data, with special focus on JobPosting.
 *
 * Walks the extracted JSON-LD and records which schema types are present.
 * Every JobPosting node is passed to `validateJobPosting`; its parsed schema,
 * errors and warnings are accumulated on the result.
 *
 * The walk tolerates the shapes JSON-LD allows:
 * - entries that are themselves arrays of nodes,
 * - nodes wrapped in an `@graph` container,
 * - `@type` given as either a string or an array of strings,
 * - `potentialAction` given as either a single object or an array.
 * Entries that are not objects (e.g. `null`) are skipped.
 *
 * @param jsonLd - Parsed JSON-LD values as extracted from the page.
 * @returns Summary flags plus JobPosting validation output. `jsonLd` is
 *   returned as passed in; `microdata` and `rdfa` are always empty.
 */
function analyzeStructuredData(jsonLd: any[]): StructuredDataResult {
  const result: StructuredDataResult = {
    jsonLd,
    microdata: [], // Would need separate extraction
    rdfa: [],
    hasJobPosting: false,
    jobPostings: [],
    jobPostingErrors: [],
    jobPostingWarnings: [],
    hasOrganization: false,
    hasBreadcrumb: false,
    hasWebSite: false,
    hasSearchAction: false,
  };

  // Normalize a node's "@type" (string, array, or absent) to a string list.
  const typesOf = (node: any): string[] => {
    const raw = node?.['@type'];
    const list: unknown[] = Array.isArray(raw) ? raw : [raw];
    return list.filter((t): t is string => typeof t === 'string');
  };

  const visit = (node: any): void => {
    if (Array.isArray(node)) {
      for (const child of node) {
        visit(child);
      }
      return;
    }
    if (node === null || typeof node !== 'object') {
      return;
    }

    // A node may carry several types at once, so each check is independent.
    const types = typesOf(node);

    if (types.includes('JobPosting')) {
      result.hasJobPosting = true;
      const { schema, errors, warnings } = validateJobPosting(node);
      result.jobPostings.push(schema);
      result.jobPostingErrors.push(...errors);
      result.jobPostingWarnings.push(...warnings);
    }
    if (types.includes('Organization') || types.includes('Corporation')) {
      result.hasOrganization = true;
    }
    if (types.includes('BreadcrumbList')) {
      result.hasBreadcrumb = true;
    }
    if (types.includes('WebSite')) {
      result.hasWebSite = true;
      const actions: unknown[] = Array.isArray(node.potentialAction)
        ? node.potentialAction
        : [node.potentialAction];
      if (actions.some((action) => typesOf(action).includes('SearchAction'))) {
        result.hasSearchAction = true;
      }
    }

    // Descend into graph containers so nested nodes are inspected too.
    if (node['@graph'] !== undefined) {
      visit(node['@graph']);
    }
  };

  visit(jsonLd);

  return result;
}
/**
 * Validate a JobPosting JSON-LD node and build a trimmed-down copy of it.
 *
 * Checks performed:
 * - the required fields (title, description, datePosted, hiringOrganization,
 *   jobLocation) are present, otherwise an error is recorded per field;
 * - the recommended fields (validThrough, baseSalary, employmentType,
 *   identifier, directApply) are present, otherwise a warning is recorded;
 * - `validThrough` parses as a date (error if not) and is not in the past
 *   (warning if it is);
 * - a string `description` is at least 100 characters (warning if shorter);
 * - `hiringOrganization` carries a `name` (error if not);
 * - TELECOMMUTE postings specify `applicantLocationRequirements` (warning);
 * - `baseSalary` carries a `currency` (warning).
 *
 * A field counts as missing only when it is `undefined`, `null` or an empty
 * string, so an explicit `directApply: false` is treated as present.
 *
 * @param schema - The raw JobPosting node.
 * @returns The parsed schema (description truncated to 500 characters, with
 *   `'...'` appended only when truncation happened; the untouched input is
 *   kept under `raw`), plus the collected errors and warnings.
 */
function validateJobPosting(schema: any): {
  schema: JobPostingSchema;
  errors: SchemaError[];
  warnings: SchemaError[];
} {
  const errors: SchemaError[] = [];
  const warnings: SchemaError[] = [];

  // Presence test that keeps legitimate falsy values such as `false` and `0`.
  const isMissing = (value: unknown): boolean =>
    value === undefined || value === null || value === '';

  // Required fields per Google
  const requiredFields = [
    'title',
    'description',
    'datePosted',
    'hiringOrganization',
    'jobLocation',
  ];
  for (const field of requiredFields) {
    if (isMissing(schema[field])) {
      errors.push({
        field,
        message: `Missing required field: ${field}`,
        severity: 'error',
      });
    }
  }

  // Recommended fields
  const recommendedFields = [
    'validThrough',
    'baseSalary',
    'employmentType',
    'identifier',
    'directApply',
  ];
  for (const field of recommendedFields) {
    if (isMissing(schema[field])) {
      warnings.push({
        field,
        message: `Missing recommended field: ${field}`,
        severity: 'warning',
      });
    }
  }

  // Specific validations
  if (schema.validThrough) {
    const expiry = new Date(schema.validThrough);
    if (isNaN(expiry.getTime())) {
      errors.push({
        field: 'validThrough',
        message: 'Invalid date format for validThrough',
        severity: 'error',
      });
    } else if (expiry < new Date()) {
      warnings.push({
        field: 'validThrough',
        message: 'Job posting has expired (validThrough is in the past)',
        severity: 'warning',
      });
    }
  }

  if (typeof schema.description === 'string' && schema.description && schema.description.length < 100) {
    warnings.push({
      field: 'description',
      message: 'Description is shorter than recommended (should be comprehensive)',
      severity: 'warning',
    });
  }

  if (schema.hiringOrganization && !schema.hiringOrganization.name) {
    errors.push({
      field: 'hiringOrganization.name',
      message: 'hiringOrganization must include name',
      severity: 'error',
    });
  }

  // Remote job validation
  if (schema.jobLocationType === 'TELECOMMUTE' && !schema.applicantLocationRequirements) {
    warnings.push({
      field: 'applicantLocationRequirements',
      message: 'Remote jobs should specify applicantLocationRequirements',
      severity: 'warning',
    });
  }

  // Salary validation
  if (schema.baseSalary && !schema.baseSalary.currency) {
    warnings.push({
      field: 'baseSalary.currency',
      message: 'Salary should include currency',
      severity: 'warning',
    });
  }

  // Truncate long descriptions for response size; only mark with an ellipsis
  // when text was actually cut, so short descriptions come back unchanged.
  const maxDescriptionLength = 500;
  let description: string | undefined;
  if (typeof schema.description === 'string') {
    description =
      schema.description.length > maxDescriptionLength
        ? schema.description.substring(0, maxDescriptionLength) + '...'
        : schema.description;
  }

  const parsedSchema: JobPostingSchema = {
    title: schema.title,
    description,
    datePosted: schema.datePosted,
    validThrough: schema.validThrough,
    employmentType: schema.employmentType,
    hiringOrganization: schema.hiringOrganization,
    jobLocation: schema.jobLocation,
    jobLocationType: schema.jobLocationType,
    applicantLocationRequirements: schema.applicantLocationRequirements,
    baseSalary: schema.baseSalary,
    directApply: schema.directApply,
    raw: schema,
  };

  return { schema: parsedSchema, errors, warnings };
}
export default analyzePage;