RichardDillman

SEO Audit MCP Server

http.ts (18.1 kB)
// src/utils/http.ts
// HTTP utilities for fetching sitemaps, robots.txt, and checking URLs

import { XMLParser } from 'fast-xml-parser';
import type { RobotsAnalysis, RobotsRule, SitemapAnalysis, SitemapUrl } from '../types/index.js';

const DEFAULT_TIMEOUT = 30000;
const USER_AGENT = 'Mozilla/5.0 (compatible; SEOAuditBot/1.0)';

/**
 * Fetch with timeout and error handling
 */
async function fetchWithTimeout(
  url: string,
  options: { timeout?: number; followRedirects?: boolean } = {}
): Promise<Response> {
  const { timeout = DEFAULT_TIMEOUT } = options;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': '*/*',
      },
      redirect: options.followRedirects === false ? 'manual' : 'follow',
    });
    clearTimeout(timeoutId);
    return response;
  } catch (error: any) {
    clearTimeout(timeoutId);
    if (error.name === 'AbortError') {
      throw new Error(`Request timeout after ${timeout}ms`);
    }
    throw error;
  }
}

/**
 * Check URL status code
 */
export async function checkUrlStatus(url: string, options: {
  timeout?: number;
  followRedirects?: boolean;
} = {}): Promise<{
  url: string;
  statusCode: number;
  redirectUrl?: string;
  error?: string;
  responseTimeMs: number;
}> {
  const startTime = Date.now();

  try {
    const response = await fetchWithTimeout(url, {
      timeout: options.timeout,
      followRedirects: options.followRedirects ?? true,
    });

    return {
      url,
      statusCode: response.status,
      redirectUrl: response.redirected ? response.url : undefined,
      responseTimeMs: Date.now() - startTime,
    };
  } catch (error: any) {
    return {
      url,
      statusCode: 0,
      error: error.message,
      responseTimeMs: Date.now() - startTime,
    };
  }
}

/**
 * Check multiple URLs in parallel with concurrency limit
 */
export async function checkUrls(
  urls: string[],
  options: {
    concurrency?: number;
    timeout?: number;
    onProgress?: (completed: number, total: number) => void;
  } = {}
): Promise<Array<{
  url: string;
  statusCode: number;
  redirectUrl?: string;
  error?: string;
  responseTimeMs: number;
}>> {
  const { concurrency = 5, timeout, onProgress } = options;

  // Dynamic import for p-limit (ESM)
  const pLimit = (await import('p-limit')).default;
  const limit = pLimit(concurrency);

  let completed = 0;

  const results = await Promise.all(
    urls.map(url =>
      limit(async () => {
        const result = await checkUrlStatus(url, { timeout });
        completed++;
        onProgress?.(completed, urls.length);
        return result;
      })
    )
  );

  return results;
}

/**
 * Fetch and parse robots.txt
 */
export async function fetchRobots(baseUrl: string): Promise<RobotsAnalysis> {
  const robotsUrl = new URL('/robots.txt', baseUrl).toString();
  const issues: string[] = [];
  const warnings: string[] = [];

  try {
    const response = await fetchWithTimeout(robotsUrl);

    if (!response.ok) {
      return {
        url: robotsUrl,
        found: false,
        content: '',
        sitemaps: [],
        rules: [],
        issues: [`robots.txt not found (HTTP ${response.status})`],
        warnings: [],
      };
    }

    const content = await response.text();
    const lines = content.split('\n').map(l => l.trim());

    const sitemaps: string[] = [];
    const rules: RobotsRule[] = [];
    let currentRule: RobotsRule | null = null;

    for (const line of lines) {
      // Skip comments and empty lines
      if (line.startsWith('#') || !line) continue;

      const lower = line.toLowerCase();

      if (lower.startsWith('sitemap:')) {
        sitemaps.push(line.substring(8).trim());
      } else if (lower.startsWith('user-agent:')) {
        // Save previous rule
        if (currentRule) {
          rules.push(currentRule);
        }
        currentRule = {
          userAgent: line.substring(11).trim(),
          disallow: [],
          allow: [],
        };
      } else if (currentRule) {
        if (lower.startsWith('disallow:')) {
          const path = line.substring(9).trim();
          if (path) currentRule.disallow.push(path);
        } else if (lower.startsWith('allow:')) {
          currentRule.allow.push(line.substring(6).trim());
        } else if (lower.startsWith('crawl-delay:')) {
          currentRule.crawlDelay = parseInt(line.substring(12).trim(), 10);
        }
      }
    }

    // Save last rule
    if (currentRule) {
      rules.push(currentRule);
    }

    // Analyze for issues
    const wildcardRule = rules.find(r => r.userAgent === '*');
    if (wildcardRule?.disallow.includes('/')) {
      issues.push('CRITICAL: Entire site is blocked (Disallow: /)');
    }

    // Check for common job board paths being blocked
    const jobPaths = ['/jobs', '/job', '/careers', '/positions'];
    for (const rule of rules) {
      for (const path of jobPaths) {
        if (rule.disallow.some(d => d.startsWith(path))) {
          issues.push(`Job paths may be blocked: ${path} (User-Agent: ${rule.userAgent})`);
        }
      }
    }

    if (sitemaps.length === 0) {
      warnings.push('No sitemap referenced in robots.txt');
    }

    const highCrawlDelay = rules.find(r => r.crawlDelay && r.crawlDelay > 10);
    if (highCrawlDelay) {
      warnings.push(`High crawl-delay (${highCrawlDelay.crawlDelay}s) may slow indexing`);
    }

    return {
      url: robotsUrl,
      found: true,
      content,
      sitemaps,
      rules,
      issues,
      warnings,
    };
  } catch (error: any) {
    return {
      url: robotsUrl,
      found: false,
      content: '',
      sitemaps: [],
      rules: [],
      issues: [`Error fetching robots.txt: ${error.message}`],
      warnings: [],
    };
  }
}

/**
 * Fetch and parse XML sitemap with comprehensive validation
 */
export async function fetchSitemap(
  sitemapUrl: string,
  options: { maxUrls?: number } = {}
): Promise<SitemapAnalysis> {
  const { maxUrls = 50000 } = options;
  const issues: string[] = [];
  const warnings: string[] = [];

  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: '@_',
  });

  try {
    const response = await fetchWithTimeout(sitemapUrl);

    if (!response.ok) {
      return {
        url: sitemapUrl,
        found: false,
        type: 'unknown',
        urlCount: 0,
        urls: [],
        issues: [`Sitemap not found (HTTP ${response.status})`],
        warnings: [],
        jobUrlCount: 0,
        jobUrls: [],
        urlsWithLastmod: 0,
      };
    }

    const xml = await response.text();
    const xmlSizeBytes = new TextEncoder().encode(xml).length;

    // Check file size (50MB limit for uncompressed)
    const maxSizeBytes = 50 * 1024 * 1024; // 50MB
    if (xmlSizeBytes > maxSizeBytes) {
      issues.push(`Sitemap exceeds 50MB size limit (${Math.round(xmlSizeBytes / (1024 * 1024))}MB)`);
    } else if (xmlSizeBytes > maxSizeBytes * 0.8) {
      warnings.push(`Sitemap approaching 50MB size limit (${Math.round(xmlSizeBytes / (1024 * 1024))}MB)`);
    }

    // Check if it's valid XML
    if (!xml.trim().startsWith('<?xml') && !xml.trim().startsWith('<')) {
      return {
        url: sitemapUrl,
        found: true,
        type: 'unknown',
        urlCount: 0,
        urls: [],
        issues: ['Sitemap is not valid XML'],
        warnings: [],
        jobUrlCount: 0,
        jobUrls: [],
        urlsWithLastmod: 0,
      };
    }

    // Check for proper XML declaration
    if (!xml.includes('<?xml')) {
      warnings.push('Missing XML declaration (<?xml version="1.0" encoding="UTF-8"?>)');
    }

    // Check for sitemap namespace
    if (!xml.includes('http://www.sitemaps.org/schemas/sitemap/0.9')) {
      warnings.push('Missing or incorrect sitemap namespace');
    }

    const parsed = parser.parse(xml);

    // Check if it's a sitemap index
    if (parsed.sitemapindex) {
      const sitemaps = Array.isArray(parsed.sitemapindex.sitemap)
        ? parsed.sitemapindex.sitemap
        : [parsed.sitemapindex.sitemap].filter(Boolean);

      const childSitemaps = sitemaps.map((s: any) => s.loc);

      // Validate sitemap index entries
      const sitemapHost = new URL(sitemapUrl).hostname;
      const crossDomainSitemaps = childSitemaps.filter((url: string) => {
        try {
          return new URL(url).hostname !== sitemapHost;
        } catch {
          return false;
        }
      });

      if (crossDomainSitemaps.length > 0) {
        warnings.push(`${crossDomainSitemaps.length} child sitemaps point to different domains`);
      }

      return {
        url: sitemapUrl,
        found: true,
        type: 'sitemapindex',
        urlCount: childSitemaps.length,
        urls: [],
        childSitemaps,
        issues,
        warnings: [...warnings, 'This is a sitemap index - contains references to child sitemaps'],
        jobUrlCount: 0,
        jobUrls: [],
        urlsWithLastmod: 0,
      };
    }

    // Regular urlset sitemap
    if (!parsed.urlset?.url) {
      return {
        url: sitemapUrl,
        found: true,
        type: 'urlset',
        urlCount: 0,
        urls: [],
        issues: ['Sitemap is empty or malformed'],
        warnings,
        jobUrlCount: 0,
        jobUrls: [],
        urlsWithLastmod: 0,
      };
    }

    const urlEntries = Array.isArray(parsed.urlset.url)
      ? parsed.urlset.url
      : [parsed.urlset.url];

    // Limit URLs if needed
    const limitedEntries = urlEntries.slice(0, maxUrls);
    if (urlEntries.length > maxUrls) {
      warnings.push(`Sitemap has ${urlEntries.length} URLs, only processed first ${maxUrls}`);
    }

    // Parse and validate URLs
    const urls: SitemapUrl[] = [];
    const seenUrls = new Set<string>();
    const sitemapHost = new URL(sitemapUrl).hostname;
    const sitemapProtocol = new URL(sitemapUrl).protocol;

    let duplicateCount = 0;
    let invalidUrlCount = 0;
    let crossDomainCount = 0;
    let protocolMismatchCount = 0;
    let invalidLastmodCount = 0;
    let invalidPriorityCount = 0;
    let invalidChangefreqCount = 0;

    const validChangefreqs = ['always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'];

    for (const entry of limitedEntries) {
      const loc = entry.loc;

      // Validate URL format
      if (!loc || typeof loc !== 'string') {
        invalidUrlCount++;
        continue;
      }

      try {
        const parsedUrl = new URL(loc);

        // Check for duplicates
        if (seenUrls.has(loc)) {
          duplicateCount++;
          continue;
        }
        seenUrls.add(loc);

        // Check for cross-domain URLs
        if (parsedUrl.hostname !== sitemapHost &&
            !parsedUrl.hostname.endsWith('.' + sitemapHost) &&
            !sitemapHost.endsWith('.' + parsedUrl.hostname)) {
          crossDomainCount++;
        }

        // Check protocol consistency
        if (parsedUrl.protocol !== sitemapProtocol) {
          protocolMismatchCount++;
        }
      } catch {
        invalidUrlCount++;
        continue;
      }

      // Validate lastmod format (ISO 8601)
      let lastmod = entry.lastmod;
      if (lastmod) {
        // Valid formats: YYYY, YYYY-MM, YYYY-MM-DD, YYYY-MM-DDThh:mm:ss+00:00
        const iso8601Regex = /^\d{4}(-\d{2}(-\d{2}(T\d{2}:\d{2}(:\d{2})?([+-]\d{2}:\d{2}|Z)?)?)?)?$/;
        if (!iso8601Regex.test(lastmod) && isNaN(Date.parse(lastmod))) {
          invalidLastmodCount++;
          lastmod = undefined;
        }
      }

      // Validate priority (0.0 to 1.0)
      let priority = entry.priority;
      if (priority !== undefined) {
        const priorityNum = parseFloat(priority);
        if (isNaN(priorityNum) || priorityNum < 0 || priorityNum > 1) {
          invalidPriorityCount++;
          priority = undefined;
        }
      }

      // Validate changefreq
      let changefreq = entry.changefreq;
      if (changefreq && !validChangefreqs.includes(changefreq.toLowerCase())) {
        invalidChangefreqCount++;
        changefreq = undefined;
      }

      urls.push({
        loc,
        lastmod,
        changefreq,
        priority,
      });
    }

    // Add validation issues/warnings
    if (duplicateCount > 0) {
      warnings.push(`${duplicateCount} duplicate URLs found and removed`);
    }
    if (invalidUrlCount > 0) {
      issues.push(`${invalidUrlCount} URLs are invalid or malformed`);
    }
    if (crossDomainCount > 0) {
      warnings.push(`${crossDomainCount} URLs point to different domains`);
    }
    if (protocolMismatchCount > 0) {
      warnings.push(`${protocolMismatchCount} URLs use different protocol (HTTP vs HTTPS)`);
    }
    if (invalidLastmodCount > 0) {
      warnings.push(`${invalidLastmodCount} URLs have invalid lastmod format`);
    }
    if (invalidPriorityCount > 0) {
      warnings.push(`${invalidPriorityCount} URLs have invalid priority values (must be 0.0-1.0)`);
    }
    if (invalidChangefreqCount > 0) {
      warnings.push(`${invalidChangefreqCount} URLs have invalid changefreq values`);
    }

    // Analyze coverage
    const urlsWithLastmod = urls.filter(u => u.lastmod).length;
    if (urlsWithLastmod < urls.length * 0.5) {
      warnings.push(`Only ${urlsWithLastmod}/${urls.length} URLs have lastmod dates`);
    }

    if (urls.length > 50000) {
      issues.push('Sitemap exceeds 50,000 URL limit per Google guidelines');
    }

    // Find job-related URLs
    const jobPatterns = ['/job', '/jobs', '/career', '/position', '/vacancy', '/opening'];
    const jobUrls = urls
      .filter(u => jobPatterns.some(p => u.loc.toLowerCase().includes(p)))
      .map(u => u.loc);

    // Date analysis
    const dates = urls
      .filter(u => u.lastmod)
      .map(u => new Date(u.lastmod!).getTime())
      .filter(d => !isNaN(d))
      .sort((a, b) => a - b);

    return {
      url: sitemapUrl,
      found: true,
      type: 'urlset',
      urlCount: urls.length,
      urls,
      issues,
      warnings,
      jobUrlCount: jobUrls.length,
      jobUrls: jobUrls.slice(0, 100), // Limit for response size
      urlsWithLastmod,
      oldestLastmod: dates.length > 0 ? new Date(dates[0]).toISOString() : undefined,
      newestLastmod: dates.length > 0 ? new Date(dates[dates.length - 1]).toISOString() : undefined,
    };
  } catch (error: any) {
    return {
      url: sitemapUrl,
      found: false,
      type: 'unknown',
      urlCount: 0,
      urls: [],
      issues: [`Error fetching sitemap: ${error.message}`],
      warnings: [],
      jobUrlCount: 0,
      jobUrls: [],
      urlsWithLastmod: 0,
    };
  }
}

/**
 * Discover all sitemaps for a site
 */
export async function discoverSitemaps(baseUrl: string): Promise<{
  robotsSitemaps: string[];
  commonLocations: Array<{ url: string; found: boolean }>;
  allSitemaps: SitemapAnalysis[];
}> {
  // First check robots.txt
  const robots = await fetchRobots(baseUrl);
  const robotsSitemaps = robots.sitemaps;

  // Check common sitemap locations
  const commonPaths = [
    '/sitemap.xml',
    '/sitemap_index.xml',
    '/sitemap-index.xml',
    '/sitemaps/sitemap.xml',
    '/wp-sitemap.xml',
    '/sitemap/sitemap-index.xml',
    '/jobs-sitemap.xml',
  ];

  const commonLocations: Array<{ url: string; found: boolean }> = [];
  const sitemapUrls = new Set<string>(robotsSitemaps);

  for (const path of commonPaths) {
    const url = new URL(path, baseUrl).toString();
    if (!sitemapUrls.has(url)) {
      const status = await checkUrlStatus(url);
      commonLocations.push({ url, found: status.statusCode === 200 });
      if (status.statusCode === 200) {
        sitemapUrls.add(url);
      }
    }
  }

  // Fetch all discovered sitemaps
  const allSitemaps: SitemapAnalysis[] = [];
  for (const url of sitemapUrls) {
    const sitemap = await fetchSitemap(url);
    allSitemaps.push(sitemap);

    // If it's a sitemap index, fetch child sitemaps too
    if (sitemap.type === 'sitemapindex' && sitemap.childSitemaps) {
      for (const childUrl of sitemap.childSitemaps.slice(0, 10)) { // Limit to first 10
        const childSitemap = await fetchSitemap(childUrl);
        allSitemaps.push(childSitemap);
      }
    }
  }

  return {
    robotsSitemaps,
    commonLocations,
    allSitemaps,
  };
}

/**
 * Check if a URL would be blocked by robots.txt
 */
export function isUrlBlocked(
  url: string,
  rules: RobotsRule[],
  userAgent: string = '*'
): boolean {
  const path = new URL(url).pathname;

  // Find applicable rules (specific user-agent first, then wildcard)
  const applicableRules = rules.filter(r =>
    r.userAgent === userAgent || r.userAgent === '*'
  );

  // Sort: specific user-agent rules before wildcard
  applicableRules.sort((a, b) => {
    if (a.userAgent === userAgent && b.userAgent !== userAgent) return -1;
    if (b.userAgent === userAgent && a.userAgent !== userAgent) return 1;
    return 0;
  });

  for (const rule of applicableRules) {
    // Check allow rules first (they take precedence for same specificity)
    for (const allowPath of rule.allow) {
      if (pathMatches(path, allowPath)) {
        return false;
      }
    }

    // Then check disallow rules
    for (const disallowPath of rule.disallow) {
      if (pathMatches(path, disallowPath)) {
        return true;
      }
    }
  }

  return false;
}

/**
 * Check if a path matches a robots.txt pattern
 */
function pathMatches(path: string, pattern: string): boolean {
  if (!pattern) return false;

  // Handle wildcards
  if (pattern.includes('*')) {
    const regex = new RegExp(
      '^' + pattern.replace(/\*/g, '.*').replace(/\$/g, '$') + '$'
    );
    return regex.test(path);
  }

  // Handle $ (end of URL)
  if (pattern.endsWith('$')) {
    return path === pattern.slice(0, -1);
  }

  // Simple prefix match
  return path.startsWith(pattern);
}
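
The exported functions can be composed into a small audit script. Below is a minimal usage sketch, not part of the server code: it assumes an ESM TypeScript project where this module is imported from src/utils/http.js after compilation, and the domain and job URL are placeholders.

// audit-example.ts (hypothetical usage sketch)
import { fetchRobots, discoverSitemaps, isUrlBlocked } from './utils/http.js';

async function main() {
  const base = 'https://example.com'; // placeholder domain

  // Parse robots.txt and check whether a sample job page would be crawlable
  const robots = await fetchRobots(base);
  console.log('robots.txt issues:', robots.issues);
  console.log('Job page blocked for Googlebot?',
    isUrlBlocked(`${base}/jobs/12345`, robots.rules, 'Googlebot'));

  // Discover sitemaps via robots.txt and common locations, then summarize each
  const { allSitemaps } = await discoverSitemaps(base);
  for (const sitemap of allSitemaps) {
    console.log(sitemap.url, sitemap.type, sitemap.urlCount, sitemap.issues);
  }
}

main().catch(console.error);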

