SEO Audit MCP Server

crawl-site.ts (10.1 kB)
// src/tools/crawl-site.ts
// Multi-page site crawler for comprehensive SEO analysis

import type {
  CrawlConfig,
  SiteCrawlResult,
  CrawlStats,
  CrawlSummary,
  PageTypeClassification,
  DuplicateGroup,
  PageAnalysis,
  CrawlSiteInput,
} from '../types/index.js';
import { analyzePage } from './crawl-page.js';
import { closeBrowser } from '../utils/browser.js';

/**
 * Crawl a site starting from a URL
 */
export async function crawlSite(input: CrawlSiteInput): Promise<SiteCrawlResult> {
  const config: CrawlConfig = {
    startUrl: input.startUrl,
    maxPages: input.maxPages ?? 50,
    maxDepth: input.maxDepth ?? 5,
    includePatterns: input.includePatterns,
    excludePatterns: input.excludePatterns ?? [
      '\\?',        // Query strings
      '#',          // Anchors
      '/api/',      // API endpoints
      '/admin/',    // Admin pages
      '/wp-admin/', // WordPress admin
      '/cdn-cgi/',  // Cloudflare
      '\\.(jpg|jpeg|png|gif|svg|webp|pdf|css|js|ico|woff|woff2|ttf|eot)$',
    ],
    requestDelayMs: 1000,
    maxConcurrent: 1, // Sequential for politeness
    followRedirects: true,
    runLighthouse: input.runLighthouse ?? false,
  };

  const startedAt = new Date().toISOString();
  const baseHost = new URL(config.startUrl).host;

  const visited = new Set<string>();
  const toVisit: Array<{ url: string; depth: number }> = [{ url: config.startUrl, depth: 0 }];
  const pages: PageAnalysis[] = [];

  const stats: CrawlStats = {
    urlsDiscovered: 1,
    urlsCrawled: 0,
    urlsSkipped: 0,
    urlsFailed: 0,
    statusCodes: {},
    contentTypes: {},
  };

  // Compile regex patterns
  const includeRegexes = config.includePatterns?.map(p => new RegExp(p, 'i')) ?? [];
  const excludeRegexes = config.excludePatterns?.map(p => new RegExp(p, 'i')) ?? [];

  console.error(`Starting crawl of ${config.startUrl} (max ${config.maxPages} pages)`);

  try {
    while (toVisit.length > 0 && pages.length < config.maxPages) {
      const { url, depth } = toVisit.shift()!;

      // Skip if already visited
      if (visited.has(url)) continue;
      visited.add(url);

      // Check depth limit
      if (depth > (config.maxDepth ?? 5)) {
        stats.urlsSkipped++;
        continue;
      }

      // Check patterns
      const shouldExclude = excludeRegexes.some(r => r.test(url));
      const shouldInclude = includeRegexes.length === 0 || includeRegexes.some(r => r.test(url));
      if (shouldExclude || !shouldInclude) {
        stats.urlsSkipped++;
        continue;
      }

      console.error(`[${pages.length + 1}/${config.maxPages}] Crawling: ${url}`);

      try {
        const analysis = await analyzePage({
          url,
          timeout: 20000,
          device: 'desktop',
        });
        pages.push(analysis);
        stats.urlsCrawled++;

        // Track status codes
        stats.statusCodes[analysis.httpStatus] =
          (stats.statusCodes[analysis.httpStatus] || 0) + 1;

        // Add internal links to queue
        for (const link of analysis.links.internal) {
          try {
            const linkUrl = new URL(link.href);
            if (linkUrl.host === baseHost && !visited.has(linkUrl.href)) {
              toVisit.push({ url: linkUrl.href, depth: depth + 1 });
              stats.urlsDiscovered++;
            }
          } catch {
            // Invalid URL, skip
          }
        }

        // Rate limiting
        if (config.requestDelayMs) {
          await new Promise(r => setTimeout(r, config.requestDelayMs));
        }
      } catch (error: any) {
        console.error(`Failed to crawl ${url}: ${error.message}`);
        stats.urlsFailed++;
      }
    }

    const completedAt = new Date().toISOString();
    const durationMs = new Date(completedAt).getTime() - new Date(startedAt).getTime();

    // Generate summary
    const summary = generateCrawlSummary(pages);
    const pageTypes = classifyPages(pages);

    return {
      config,
      startedAt,
      completedAt,
      durationMs,
      stats,
      pages,
      summary,
      pageTypes,
    };
  } finally {
    await closeBrowser();
  }
}

/**
 * Generate summary statistics from crawl results
 */
function generateCrawlSummary(pages: PageAnalysis[]): CrawlSummary {
  // Find duplicates
  const duplicateTitles = findDuplicates(
    pages.map(p => ({ value: p.title || '', url: p.url }))
  );
  const duplicateDescriptions = findDuplicates(
    pages.map(p => ({ value: p.metaDescription || '', url: p.url }))
  );

  // Count issues
  const criticalIssues: string[] = [];
  const warnings: string[] = [];

  const pagesWithoutTitle = pages.filter(p => !p.title);
  if (pagesWithoutTitle.length > 0) {
    criticalIssues.push(`${pagesWithoutTitle.length} pages missing title tags`);
  }

  const pagesWithJobPostingErrors = pages.filter(
    p => p.structuredData.jobPostingErrors.length > 0
  );
  if (pagesWithJobPostingErrors.length > 0) {
    criticalIssues.push(`${pagesWithJobPostingErrors.length} pages with JobPosting schema errors`);
  }

  const pagesWithJsRendering = pages.filter(p => p.rendering.jsRenderingRequired);
  if (pagesWithJsRendering.length > pages.length * 0.5) {
    warnings.push(`${pagesWithJsRendering.length}/${pages.length} pages require JavaScript rendering`);
  }

  const pagesWithMixedContent = pages.filter(p => p.hasMixedContent);
  if (pagesWithMixedContent.length > 0) {
    warnings.push(`${pagesWithMixedContent.length} pages have mixed content (HTTP resources on HTTPS)`);
  }

  // Count broken links
  const brokenLinksCount = pages.reduce(
    (sum, p) => sum + p.links.broken.length,
    0
  );

  // Count redirect chains
  const redirectChainsCount = pages.filter(
    p => p.redirectChain.length > 1
  ).length;

  return {
    pagesWithTitle: pages.filter(p => p.title).length,
    pagesWithMetaDescription: pages.filter(p => p.metaDescription).length,
    pagesWithH1: pages.filter(p => p.headings.h1.length > 0).length,
    pagesWithCanonical: pages.filter(p => p.canonicalUrl).length,
    pagesWithJobPosting: pages.filter(p => p.structuredData.hasJobPosting).length,
    pagesWithAnySchema: pages.filter(p => p.structuredData.jsonLd.length > 0).length,
    jobPostingErrors: pages.reduce(
      (sum, p) => sum + p.structuredData.jobPostingErrors.length,
      0
    ),
    pagesWithJsRendering: pagesWithJsRendering.length,
    pagesWithMixedContent: pagesWithMixedContent.length,
    brokenLinksCount,
    redirectChainsCount,
    duplicateTitles,
    duplicateDescriptions,
    criticalIssues,
    warnings,
  };
}

/**
 * Find duplicate values across pages
 */
function findDuplicates(items: Array<{ value: string; url: string }>): DuplicateGroup[] {
  const groups: Record<string, string[]> = {};

  for (const item of items) {
    if (!item.value) continue;
    if (!groups[item.value]) {
      groups[item.value] = [];
    }
    groups[item.value].push(item.url);
  }

  return Object.entries(groups)
    .filter(([_, urls]) => urls.length > 1)
    .map(([value, urls]) => ({ value, urls }));
}

/**
 * Classify pages by type based on URL patterns and content
 */
function classifyPages(pages: PageAnalysis[]): PageTypeClassification {
  const classification: PageTypeClassification = {
    homepage: [],
    jobDetail: [],
    jobListing: [],
    categoryLanding: [],
    locationLanding: [],
    companyProfile: [],
    search: [],
    blog: [],
    static: [],
    other: [],
  };

  for (const page of pages) {
    const url = page.url.toLowerCase();
    const path = new URL(url).pathname;

    // Homepage
    if (path === '/' || path === '') {
      classification.homepage.push(page.url);
      continue;
    }

    // Job detail pages (have JobPosting schema or match patterns)
    if (page.structuredData.hasJobPosting ||
        /\/job\/[a-z0-9-]+$/i.test(path) ||
        /\/jobs\/\d+/i.test(path) ||
        /\/position\/[a-z0-9-]+/i.test(path) ||
        /\/career\/[a-z0-9-]+/i.test(path)) {
      classification.jobDetail.push(page.url);
      continue;
    }

    // Job listing/search pages
    if (/\/jobs\/?$/.test(path) ||
        /\/careers\/?$/.test(path) ||
        /\/jobs\/search/i.test(path) ||
        /search|results/i.test(path) ||
        url.includes('?q=') ||
        url.includes('?keyword')) {
      classification.jobListing.push(page.url);
      continue;
    }

    // Category landing pages
    if (/\/jobs\/[a-z-]+\/?$/i.test(path) ||
        /\/category\/[a-z-]+/i.test(path) ||
        /\/department\/[a-z-]+/i.test(path) ||
        /\/(engineering|marketing|sales|design|finance|hr|operations)-jobs/i.test(path)) {
      classification.categoryLanding.push(page.url);
      continue;
    }

    // Location landing pages
    if (/\/location\/[a-z-]+/i.test(path) ||
        /\/city\/[a-z-]+/i.test(path) ||
        /\/jobs-in-[a-z-]+/i.test(path) ||
        /\/(new-york|san-francisco|london|remote)-jobs/i.test(path)) {
      classification.locationLanding.push(page.url);
      continue;
    }

    // Company profile pages
    if (/\/company\/[a-z0-9-]+/i.test(path) ||
        /\/employer\/[a-z0-9-]+/i.test(path) ||
        /\/companies\//i.test(path)) {
      classification.companyProfile.push(page.url);
      continue;
    }

    // Blog/content pages
    if (/\/blog\//i.test(path) ||
        /\/article\//i.test(path) ||
        /\/news\//i.test(path) ||
        /\/resources\//i.test(path)) {
      classification.blog.push(page.url);
      continue;
    }

    // Static pages
    if (/\/(about|contact|privacy|terms|faq|help)/i.test(path)) {
      classification.static.push(page.url);
      continue;
    }

    // Everything else
    classification.other.push(page.url);
  }

  return classification;
}

export default crawlSite;
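
For reference, a minimal caller might look like the sketch below. It relies only on the shapes visible in the file above: startUrl is the one required CrawlSiteInput field (maxPages, maxDepth, and runLighthouse all have defaults), and the result exposes stats, durationMs, and the summary produced by generateCrawlSummary. The file name example-crawl.ts, the import path, and the target URL are placeholders for illustration; adjust them to your project layout.

// example-crawl.ts
// Usage sketch, assuming the CrawlSiteInput/SiteCrawlResult shapes above.
import { crawlSite } from './tools/crawl-site.js';

async function main() {
  const result = await crawlSite({
    startUrl: 'https://example.com', // placeholder target
    maxPages: 25,                    // quicker pass than the 50-page default
    maxDepth: 3,
    runLighthouse: false,
  });

  // Headline numbers the crawler already computes
  console.log(`Crawled ${result.stats.urlsCrawled} pages in ${result.durationMs} ms`);
  console.log(`Skipped: ${result.stats.urlsSkipped}, failed: ${result.stats.urlsFailed}`);

  // Issues are pre-bucketed into critical vs. warning strings
  for (const issue of result.summary.criticalIssues) {
    console.log(`CRITICAL: ${issue}`);
  }
  for (const warning of result.summary.warnings) {
    console.log(`WARNING: ${warning}`);
  }

  // Duplicate titles come back as DuplicateGroup ({ value, urls })
  for (const group of result.summary.duplicateTitles) {
    console.log(`Duplicate title "${group.value}" on ${group.urls.length} URLs`);
  }
}

main().catch(err => {
  console.error(err);
  process.exit(1);
});

Note that because maxConcurrent is hard-coded to 1 and each successful request waits requestDelayMs (1,000 ms), a 25-page crawl takes roughly 25 seconds plus per-page analysis time, so budget any outer timeouts accordingly.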
