// src/tools/crawl-site.ts
// Multi-page site crawler for comprehensive SEO analysis
import type {
CrawlConfig,
SiteCrawlResult,
CrawlStats,
CrawlSummary,
PageTypeClassification,
DuplicateGroup,
PageAnalysis,
CrawlSiteInput,
} from '../types/index.js';
import { analyzePage } from './crawl-page.js';
import { closeBrowser } from '../utils/browser.js';
/**
 * Crawl a site breadth-first, starting from `input.startUrl`.
 *
 * Pages are analyzed one at a time with `analyzePage` (desktop device, 20s timeout).
 * Internal links reported for each page are queued at `depth + 1` when they resolve to
 * the same host as the start URL. Each distinct URL is queued at most once, so
 * `stats.urlsDiscovered` counts unique URLs (the start URL included).
 *
 * A queued URL is skipped (and counted in `stats.urlsSkipped`) when its depth exceeds
 * `maxDepth`, when it matches any exclude pattern, or when include patterns are given
 * and none of them match. A page whose analysis throws is logged, counted in
 * `stats.urlsFailed`, and the crawl continues.
 *
 * The shared browser is always closed via `closeBrowser()` before this function
 * settles, whether the crawl succeeds or throws.
 *
 * @param input - Start URL plus optional overrides. Defaults: `maxPages` 50,
 *   `maxDepth` 5, `runLighthouse` false, and a built-in exclude list (query strings,
 *   anchors, API/admin paths, static assets).
 * @returns The effective config, timing, crawl statistics, per-page analyses, the
 *   aggregated summary and the page-type classification.
 * @throws If `input.startUrl` is not a valid absolute URL, or if an include/exclude
 *   pattern is not a valid regular expression.
 */
export async function crawlSite(input: CrawlSiteInput): Promise<SiteCrawlResult> {
  const config: CrawlConfig = {
    startUrl: input.startUrl,
    maxPages: input.maxPages ?? 50,
    maxDepth: input.maxDepth ?? 5,
    includePatterns: input.includePatterns,
    excludePatterns: input.excludePatterns ?? [
      '\\?', // Query strings
      '#', // Anchors
      '/api/', // API endpoints
      '/admin/', // Admin pages
      '/wp-admin/', // WordPress admin
      '/cdn-cgi/', // Cloudflare
      '\\.(jpg|jpeg|png|gif|svg|webp|pdf|css|js|ico|woff|woff2|ttf|eot)$',
    ],
    requestDelayMs: 1000,
    maxConcurrent: 1, // Sequential for politeness
    followRedirects: true,
    runLighthouse: input.runLighthouse ?? false,
  };

  const startedAtMs = Date.now();
  const startedAt = new Date(startedAtMs).toISOString();

  const startUrl = new URL(config.startUrl);
  const baseHost = startUrl.host;
  const maxDepth = config.maxDepth ?? 5;
  const delayMs = config.requestDelayMs;

  // Every URL that has ever been queued. Seeding it with both the raw and the
  // normalized start URL prevents "https://host" from being re-crawled when a page
  // links back to "https://host/".
  const seen = new Set<string>([config.startUrl, startUrl.href]);
  const toVisit: Array<{ url: string; depth: number }> = [{ url: config.startUrl, depth: 0 }];
  const pages: PageAnalysis[] = [];
  const stats: CrawlStats = {
    urlsDiscovered: 1,
    urlsCrawled: 0,
    urlsSkipped: 0,
    urlsFailed: 0,
    statusCodes: {},
    contentTypes: {},
  };

  // Compile regex patterns
  const includeRegexes = config.includePatterns?.map(p => new RegExp(p, 'i')) ?? [];
  const excludeRegexes = config.excludePatterns?.map(p => new RegExp(p, 'i')) ?? [];

  console.error(`Starting crawl of ${config.startUrl} (max ${config.maxPages} pages)`);

  try {
    while (toVisit.length > 0 && pages.length < config.maxPages) {
      const next = toVisit.shift();
      if (!next) break;
      const { url, depth } = next;

      // Check depth limit
      if (depth > maxDepth) {
        stats.urlsSkipped++;
        continue;
      }

      // Check patterns
      const shouldExclude = excludeRegexes.some(r => r.test(url));
      const shouldInclude = includeRegexes.length === 0 || includeRegexes.some(r => r.test(url));
      if (shouldExclude || !shouldInclude) {
        stats.urlsSkipped++;
        continue;
      }

      console.error(`[${pages.length + 1}/${config.maxPages}] Crawling: ${url}`);

      try {
        const analysis = await analyzePage({
          url,
          timeout: 20000,
          device: 'desktop',
        });
        pages.push(analysis);
        stats.urlsCrawled++;

        // Track status codes
        stats.statusCodes[analysis.httpStatus] = (stats.statusCodes[analysis.httpStatus] ?? 0) + 1;

        // Add same-host internal links to the queue, each distinct URL only once
        for (const link of analysis.links.internal) {
          let linkUrl: URL;
          try {
            linkUrl = new URL(link.href);
          } catch {
            // Invalid URL, skip
            continue;
          }
          if (linkUrl.host !== baseHost || seen.has(linkUrl.href)) continue;
          seen.add(linkUrl.href);
          toVisit.push({ url: linkUrl.href, depth: depth + 1 });
          stats.urlsDiscovered++;
        }
      } catch (error: unknown) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(`Failed to crawl ${url}: ${message}`);
        stats.urlsFailed++;
      }

      // Rate limiting: pause after every request attempt (failed ones included),
      // but only when another request can still follow.
      if (delayMs && toVisit.length > 0 && pages.length < config.maxPages) {
        await new Promise(resolve => setTimeout(resolve, delayMs));
      }
    }

    const completedAtMs = Date.now();
    const completedAt = new Date(completedAtMs).toISOString();
    const durationMs = completedAtMs - startedAtMs;

    // Generate summary
    const summary = generateCrawlSummary(pages);
    const pageTypes = classifyPages(pages);

    return {
      config,
      startedAt,
      completedAt,
      durationMs,
      stats,
      pages,
      summary,
      pageTypes,
    };
  } finally {
    await closeBrowser();
  }
}
/**
 * Aggregate per-page analyses into site-wide summary statistics.
 *
 * Besides plain counts (titles, meta descriptions, H1s, canonicals, schema usage,
 * broken links, redirect chains), this collects:
 * - `criticalIssues`: pages missing a title, pages with JobPosting schema errors;
 * - `warnings`: more than half of the pages requiring JavaScript rendering, and
 *   pages with mixed content;
 * - duplicate titles / meta descriptions (empty values are ignored).
 *
 * @param pages - Analyses of all successfully crawled pages; may be empty.
 * @returns The populated summary; issue lists are empty when nothing was found.
 */
function generateCrawlSummary(pages: PageAnalysis[]): CrawlSummary {
  // Small helper so each metric reads as a single predicate.
  const countWhere = (predicate: (page: PageAnalysis) => unknown): number =>
    pages.filter(predicate).length;

  // Find duplicates
  const duplicateTitles = findDuplicates(
    pages.map(p => ({ value: p.title || '', url: p.url }))
  );
  const duplicateDescriptions = findDuplicates(
    pages.map(p => ({ value: p.metaDescription || '', url: p.url }))
  );

  const pagesWithTitle = countWhere(p => p.title);
  const pagesMissingTitle = pages.length - pagesWithTitle;
  const pagesWithJobPostingErrors = countWhere(p => p.structuredData.jobPostingErrors.length > 0);
  const pagesWithJsRendering = countWhere(p => p.rendering.jsRenderingRequired);
  const pagesWithMixedContent = countWhere(p => p.hasMixedContent);

  // Collect issues
  const criticalIssues: string[] = [];
  const warnings: string[] = [];

  if (pagesMissingTitle > 0) {
    criticalIssues.push(`${pagesMissingTitle} pages missing title tags`);
  }
  if (pagesWithJobPostingErrors > 0) {
    criticalIssues.push(`${pagesWithJobPostingErrors} pages with JobPosting schema errors`);
  }
  // Only warn when a majority of the crawled pages depend on client-side rendering.
  if (pagesWithJsRendering > pages.length * 0.5) {
    warnings.push(`${pagesWithJsRendering}/${pages.length} pages require JavaScript rendering`);
  }
  if (pagesWithMixedContent > 0) {
    warnings.push(`${pagesWithMixedContent} pages have mixed content (HTTP resources on HTTPS)`);
  }

  // Total broken links across all pages
  const brokenLinksCount = pages.reduce((sum, p) => sum + p.links.broken.length, 0);

  // Pages whose redirect chain has more than one entry
  const redirectChainsCount = countWhere(p => p.redirectChain.length > 1);

  return {
    pagesWithTitle,
    pagesWithMetaDescription: countWhere(p => p.metaDescription),
    pagesWithH1: countWhere(p => p.headings.h1.length > 0),
    pagesWithCanonical: countWhere(p => p.canonicalUrl),
    pagesWithJobPosting: countWhere(p => p.structuredData.hasJobPosting),
    pagesWithAnySchema: countWhere(p => p.structuredData.jsonLd.length > 0),
    jobPostingErrors: pages.reduce(
      (sum, p) => sum + p.structuredData.jobPostingErrors.length,
      0
    ),
    pagesWithJsRendering,
    pagesWithMixedContent,
    brokenLinksCount,
    redirectChainsCount,
    duplicateTitles,
    duplicateDescriptions,
    criticalIssues,
    warnings,
  };
}
/**
 * Find values that are shared by more than one page.
 *
 * Items with an empty `value` are ignored. Grouping uses a `Map`, so values that
 * collide with inherited object properties (e.g. "constructor", "toString",
 * "__proto__") are handled like any other string, and groups are returned in the
 * order their value was first seen.
 *
 * @param items - One `{ value, url }` pair per page.
 * @returns One group per value that occurs on at least two items, listing the URLs
 *   in input order.
 */
function findDuplicates(items: Array<{ value: string; url: string }>): DuplicateGroup[] {
  const urlsByValue = new Map<string, string[]>();

  for (const { value, url } of items) {
    if (!value) continue;
    const urls = urlsByValue.get(value);
    if (urls) {
      urls.push(url);
    } else {
      urlsByValue.set(value, [url]);
    }
  }

  return Array.from(urlsByValue.entries())
    .filter(([, urls]) => urls.length > 1)
    .map(([value, urls]) => ({ value, urls }));
}
/**
 * Sort crawled pages into page-type buckets.
 *
 * Each page lands in exactly one bucket: the first rule that matches wins, checked in
 * this order: homepage, job detail, job listing, category landing, location landing,
 * company profile, blog, static, and finally `other`. Matching is done against the
 * lower-cased URL and its pathname, while the URL stored in the bucket is the page's
 * original `url`. A page with JobPosting structured data is always a job detail page
 * (unless it is the homepage). The `search` bucket is created but never filled here;
 * search-like URLs fall under `jobListing`.
 *
 * @param pages - Analyses of the crawled pages.
 * @returns The classification with every bucket present (possibly empty).
 */
function classifyPages(pages: PageAnalysis[]): PageTypeClassification {
  const buckets: PageTypeClassification = {
    homepage: [],
    jobDetail: [],
    jobListing: [],
    categoryLanding: [],
    locationLanding: [],
    companyProfile: [],
    search: [],
    blog: [],
    static: [],
    other: [],
  };

  // Pathname patterns per page type; none use the `g` flag, so they are stateless.
  const jobDetailPatterns = [
    /\/job\/[a-z0-9-]+$/i,
    /\/jobs\/\d+/i,
    /\/position\/[a-z0-9-]+/i,
    /\/career\/[a-z0-9-]+/i,
  ];
  const jobListingPatterns = [
    /\/jobs\/?$/,
    /\/careers\/?$/,
    /\/jobs\/search/i,
    /search|results/i,
  ];
  const categoryPatterns = [
    /\/jobs\/[a-z-]+\/?$/i,
    /\/category\/[a-z-]+/i,
    /\/department\/[a-z-]+/i,
    /\/(engineering|marketing|sales|design|finance|hr|operations)-jobs/i,
  ];
  const locationPatterns = [
    /\/location\/[a-z-]+/i,
    /\/city\/[a-z-]+/i,
    /\/jobs-in-[a-z-]+/i,
    /\/(new-york|san-francisco|london|remote)-jobs/i,
  ];
  const companyPatterns = [
    /\/company\/[a-z0-9-]+/i,
    /\/employer\/[a-z0-9-]+/i,
    /\/companies\//i,
  ];
  const blogPatterns = [
    /\/blog\//i,
    /\/article\//i,
    /\/news\//i,
    /\/resources\//i,
  ];
  const staticPatterns = [/\/(about|contact|privacy|terms|faq|help)/i];

  const matchesAny = (text: string, patterns: RegExp[]): boolean =>
    patterns.some(pattern => pattern.test(text));

  // Returns the bucket array the page belongs to; rule order is significant.
  const selectBucket = (page: PageAnalysis): string[] => {
    const loweredUrl = page.url.toLowerCase();
    const pathname = new URL(loweredUrl).pathname;

    if (pathname === '/' || pathname === '') {
      return buckets.homepage;
    }
    // JobPosting schema takes precedence over URL shape
    if (page.structuredData.hasJobPosting || matchesAny(pathname, jobDetailPatterns)) {
      return buckets.jobDetail;
    }
    // Listing/search pages, including query-string based searches
    if (
      matchesAny(pathname, jobListingPatterns) ||
      loweredUrl.includes('?q=') ||
      loweredUrl.includes('?keyword')
    ) {
      return buckets.jobListing;
    }
    if (matchesAny(pathname, categoryPatterns)) {
      return buckets.categoryLanding;
    }
    if (matchesAny(pathname, locationPatterns)) {
      return buckets.locationLanding;
    }
    if (matchesAny(pathname, companyPatterns)) {
      return buckets.companyProfile;
    }
    if (matchesAny(pathname, blogPatterns)) {
      return buckets.blog;
    }
    if (matchesAny(pathname, staticPatterns)) {
      return buckets.static;
    }
    return buckets.other;
  };

  pages.forEach(page => {
    selectBucket(page).push(page.url);
  });

  return buckets;
}
// Default export of the same crawlSite function that is also exported by name above.
export default crawlSite;