// src/utils/http.ts
// HTTP utilities for fetching sitemaps, robots.txt, and checking URLs
import { XMLParser } from 'fast-xml-parser';
import type {
RobotsAnalysis,
RobotsRule,
SitemapAnalysis,
SitemapUrl
} from '../types/index.js';
// Request timeout in milliseconds applied by fetchWithTimeout when the caller gives none.
const DEFAULT_TIMEOUT = 30 * 1000;
// Value of the User-Agent header sent by fetchWithTimeout.
const USER_AGENT = 'Mozilla/5.0 (compatible; SEOAuditBot/1.0)';
/**
 * Issue a `fetch` request that is aborted when no response arrives in time.
 *
 * The timer is cleared as soon as `fetch` settles, so it bounds the wait for
 * the response object only; reading the body afterwards is not covered.
 *
 * @param url - URL to request.
 * @param options - `timeout` in milliseconds (defaults to `DEFAULT_TIMEOUT`);
 *   `followRedirects: false` switches fetch's redirect mode from `'follow'`
 *   to `'manual'`.
 * @returns The `Response` produced by `fetch`, whatever its status code.
 * @throws Error with message `Request timeout after <n>ms` when the request
 *   was aborted; every other rejection from `fetch` is rethrown unchanged.
 */
async function fetchWithTimeout(
  url: string,
  options: { timeout?: number; followRedirects?: boolean } = {}
): Promise<Response> {
  const { timeout = DEFAULT_TIMEOUT, followRedirects } = options;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    return await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': USER_AGENT,
        Accept: '*/*',
      },
      redirect: followRedirects === false ? 'manual' : 'follow',
    });
  } catch (error: unknown) {
    // Anything can be thrown in JavaScript, so inspect the value defensively
    // instead of assuming it is an object with a `name`.
    const errorName =
      typeof error === 'object' && error !== null
        ? (error as { name?: unknown }).name
        : undefined;
    if (errorName === 'AbortError') {
      throw new Error(`Request timeout after ${timeout}ms`);
    }
    throw error;
  } finally {
    // Single cleanup point for both the success and the failure path.
    clearTimeout(timeoutId);
  }
}
/**
 * Request a URL and report its HTTP status, redirect target and timing.
 *
 * This function does not reject: any failure while fetching is returned as
 * `statusCode: 0` together with an `error` message.
 *
 * @param url - URL to check.
 * @param options - `timeout` in milliseconds; `followRedirects` (default
 *   `true`) controls whether redirects are followed or reported.
 * @returns The checked `url`, the response `statusCode` (0 on failure), the
 *   `redirectUrl` when a redirect was followed or announced, an `error`
 *   message on failure, and the elapsed `responseTimeMs`.
 */
export async function checkUrlStatus(url: string, options: {
  timeout?: number;
  followRedirects?: boolean;
} = {}): Promise<{
  url: string;
  statusCode: number;
  redirectUrl?: string;
  error?: string;
  responseTimeMs: number;
}> {
  const startTime = Date.now();
  try {
    const response = await fetchWithTimeout(url, {
      timeout: options.timeout,
      followRedirects: options.followRedirects ?? true,
    });

    let redirectUrl: string | undefined;
    if (response.redirected) {
      // Redirects were followed: the final URL is the redirect target.
      redirectUrl = response.url;
    } else if (response.status >= 300 && response.status < 400) {
      // Redirects were not followed: the target is only announced in the
      // Location header, which may be relative to the requested URL.
      const location = response.headers.get('location');
      if (location) {
        try {
          redirectUrl = new URL(location, url).toString();
        } catch {
          redirectUrl = location;
        }
      }
    }

    // Only metadata is needed; cancel the unread body (best effort) so the
    // underlying connection is not kept waiting for it to be consumed.
    void response.body?.cancel().catch(() => undefined);

    return {
      url,
      statusCode: response.status,
      redirectUrl,
      responseTimeMs: Date.now() - startTime,
    };
  } catch (error: unknown) {
    return {
      url,
      statusCode: 0,
      error: error instanceof Error ? error.message : String(error),
      responseTimeMs: Date.now() - startTime,
    };
  }
}
/**
 * Run checkUrlStatus over a list of URLs with a cap on simultaneous requests.
 *
 * @param urls - URLs to check; results come back in the same order.
 * @param options - `concurrency` (default 5) is handed to p-limit, `timeout`
 *   is forwarded to every check, and `onProgress` is called after each
 *   finished check with the running count and the total.
 * @returns One status record per input URL.
 */
export async function checkUrls(
  urls: string[],
  options: {
    concurrency?: number;
    timeout?: number;
    onProgress?: (completed: number, total: number) => void;
  } = {}
): Promise<Array<{
  url: string;
  statusCode: number;
  redirectUrl?: string;
  error?: string;
  responseTimeMs: number;
}>> {
  const { concurrency = 5, timeout, onProgress } = options;

  // p-limit is loaded lazily through a dynamic import (ESM package).
  const { default: createLimiter } = await import('p-limit');
  const runLimited = createLimiter(concurrency);

  let finished = 0;

  // One unit of work: check a single URL, then report progress.
  const probe = async (target: string) => {
    const outcome = await checkUrlStatus(target, { timeout });
    finished += 1;
    onProgress?.(finished, urls.length);
    return outcome;
  };

  const pending = urls.map(target => runLimited(() => probe(target)));
  return Promise.all(pending);
}
/**
 * Split robots.txt text into sitemap references and per-user-agent rules.
 *
 * - Everything after `#` on a line is treated as a comment.
 * - Field names are case-insensitive and may have spaces before the colon.
 * - Consecutive `User-agent` lines form one group: the rule lines that follow
 *   apply to every agent of that group.
 * - Empty `Allow`/`Disallow`/`Sitemap` values and non-numeric or negative
 *   `Crawl-delay` values are ignored.
 */
function parseRobotsContent(content: string): { sitemaps: string[]; rules: RobotsRule[] } {
  const sitemaps: string[] = [];
  const rules: RobotsRule[] = [];
  // Rules of the group currently being filled (one entry per User-agent line).
  let group: RobotsRule[] = [];
  // True while we are still reading the User-agent lines that open a group.
  let collectingAgents = false;

  for (const rawLine of content.split(/\r\n|\r|\n/)) {
    const hashIndex = rawLine.indexOf('#');
    const line = (hashIndex === -1 ? rawLine : rawLine.slice(0, hashIndex)).trim();
    const colonIndex = line.indexOf(':');
    if (colonIndex === -1) continue;

    const field = line.slice(0, colonIndex).trim().toLowerCase();
    // Split on the first colon only, so URL values keep their own colons.
    const value = line.slice(colonIndex + 1).trim();

    if (field === 'sitemap') {
      if (value) sitemaps.push(value);
      continue;
    }

    if (field === 'user-agent') {
      if (!collectingAgents) group = [];
      collectingAgents = true;
      const rule: RobotsRule = { userAgent: value, disallow: [], allow: [] };
      group.push(rule);
      rules.push(rule);
      continue;
    }

    if (field !== 'disallow' && field !== 'allow' && field !== 'crawl-delay') continue;

    // The first rule line closes the list of agents for this group.
    collectingAgents = false;
    for (const rule of group) {
      if (field === 'disallow') {
        if (value) rule.disallow.push(value);
      } else if (field === 'allow') {
        if (value) rule.allow.push(value);
      } else {
        const delay = Number.parseFloat(value);
        if (Number.isFinite(delay) && delay >= 0) rule.crawlDelay = delay;
      }
    }
  }

  return { sitemaps, rules };
}

/**
 * Fetch `/robots.txt` for a site, parse it and flag common SEO problems.
 *
 * Fetch failures and non-2xx responses are reported in the result
 * (`found: false` plus one entry in `issues`) instead of being thrown.
 *
 * @param baseUrl - Any absolute URL of the site; only its origin is used.
 * @returns The robots.txt URL, raw content, sitemap references, parsed rules
 *   and the issues/warnings derived from them.
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
export async function fetchRobots(baseUrl: string): Promise<RobotsAnalysis> {
  const robotsUrl = new URL('/robots.txt', baseUrl).toString();

  const notFound = (issue: string): RobotsAnalysis => ({
    url: robotsUrl,
    found: false,
    content: '',
    sitemaps: [],
    rules: [],
    issues: [issue],
    warnings: [],
  });

  try {
    const response = await fetchWithTimeout(robotsUrl);
    if (!response.ok) {
      return notFound(`robots.txt not found (HTTP ${response.status})`);
    }

    const content = await response.text();
    const { sitemaps, rules } = parseRobotsContent(content);
    const issues: string[] = [];
    const warnings: string[] = [];

    if (rules.some(r => r.userAgent === '*' && r.disallow.includes('/'))) {
      issues.push('CRITICAL: Entire site is blocked (Disallow: /)');
    }

    // Check for common job board paths being blocked
    const jobPaths = ['/jobs', '/job', '/careers', '/positions'];
    for (const rule of rules) {
      for (const path of jobPaths) {
        if (rule.disallow.some(d => d.startsWith(path))) {
          issues.push(`Job paths may be blocked: ${path} (User-Agent: ${rule.userAgent})`);
        }
      }
    }

    if (sitemaps.length === 0) {
      warnings.push('No sitemap referenced in robots.txt');
    }

    const highCrawlDelay = rules.find(r => r.crawlDelay !== undefined && r.crawlDelay > 10);
    if (highCrawlDelay) {
      warnings.push(`High crawl-delay (${highCrawlDelay.crawlDelay}s) may slow indexing`);
    }

    return {
      url: robotsUrl,
      found: true,
      content,
      sitemaps,
      rules,
      issues,
      warnings,
    };
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    return notFound(`Error fetching robots.txt: ${reason}`);
  }
}
/** Accepted `lastmod` shapes: YYYY, YYYY-MM, YYYY-MM-DD, YYYY-MM-DDThh:mm[:ss][offset|Z]. */
const SITEMAP_LASTMOD_PATTERN = /^\d{4}(-\d{2}(-\d{2}(T\d{2}:\d{2}(:\d{2})?([+-]\d{2}:\d{2}|Z)?)?)?)?$/;
/** `changefreq` values accepted by the validation below. */
const SITEMAP_CHANGEFREQS = ['always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'];
/** Maximum number of URLs a single sitemap may list before it is flagged. */
const SITEMAP_URL_LIMIT = 50000;
/** Maximum uncompressed sitemap size in bytes (50MB). */
const SITEMAP_SIZE_LIMIT_BYTES = 50 * 1024 * 1024;

/**
 * Fetch an XML sitemap (or sitemap index), parse it and validate its entries.
 *
 * Failures are reported in the result rather than thrown: a fetch/parse error
 * or a non-2xx response yields `found: false` with one entry in `issues`.
 *
 * For a sitemap index only the child sitemap URLs are returned
 * (`childSitemaps`); they are not fetched here.
 *
 * @param sitemapUrl - Absolute URL of the sitemap.
 * @param options - `maxUrls` caps how many `<url>` entries are processed
 *   (default 50000); the URL-count limit is still checked on the full list.
 * @returns The analysis: unique valid URLs, job-related URLs (first 100),
 *   lastmod coverage and range, plus issues and warnings.
 */
export async function fetchSitemap(
  sitemapUrl: string,
  options: { maxUrls?: number } = {}
): Promise<SitemapAnalysis> {
  const { maxUrls = SITEMAP_URL_LIMIT } = options;
  const issues: string[] = [];
  const warnings: string[] = [];

  // Shared shape for every outcome that carries no URL entries.
  const emptyResult = (
    found: boolean,
    type: SitemapAnalysis['type'],
    resultIssues: string[],
    resultWarnings: string[] = []
  ): SitemapAnalysis => ({
    url: sitemapUrl,
    found,
    type,
    urlCount: 0,
    urls: [],
    issues: resultIssues,
    warnings: resultWarnings,
    jobUrlCount: 0,
    jobUrls: [],
    urlsWithLastmod: 0,
  });

  try {
    const response = await fetchWithTimeout(sitemapUrl);
    if (!response.ok) {
      return emptyResult(false, 'unknown', [`Sitemap not found (HTTP ${response.status})`]);
    }

    const xml = await response.text();

    // Check file size (50MB limit for uncompressed)
    const xmlSizeBytes = new TextEncoder().encode(xml).length;
    const xmlSizeMb = Math.round(xmlSizeBytes / (1024 * 1024));
    if (xmlSizeBytes > SITEMAP_SIZE_LIMIT_BYTES) {
      issues.push(`Sitemap exceeds 50MB size limit (${xmlSizeMb}MB)`);
    } else if (xmlSizeBytes > SITEMAP_SIZE_LIMIT_BYTES * 0.8) {
      warnings.push(`Sitemap approaching 50MB size limit (${xmlSizeMb}MB)`);
    }

    // Cheap sanity check before parsing: XML has to start with a tag.
    if (!xml.trim().startsWith('<')) {
      return emptyResult(true, 'unknown', [...issues, 'Sitemap is not valid XML'], warnings);
    }
    if (!xml.includes('<?xml')) {
      warnings.push('Missing XML declaration (<?xml version="1.0" encoding="UTF-8"?>)');
    }
    if (!xml.includes('http://www.sitemaps.org/schemas/sitemap/0.9')) {
      warnings.push('Missing or incorrect sitemap namespace');
    }

    const parser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
    });
    const parsed = parser.parse(xml);
    const sitemapLocation = new URL(sitemapUrl);
    const sitemapHost = sitemapLocation.hostname;
    const sitemapProtocol = sitemapLocation.protocol;

    // Sitemap index: collect child sitemap URLs only.
    if (parsed.sitemapindex) {
      const rawChildren = Array.isArray(parsed.sitemapindex.sitemap)
        ? parsed.sitemapindex.sitemap
        : [parsed.sitemapindex.sitemap];
      // Entries without a usable <loc> are dropped instead of leaking `undefined`.
      const childSitemaps: string[] = rawChildren
        .map((child: { loc?: unknown } | undefined) => child?.loc)
        .filter((loc: unknown): loc is string => typeof loc === 'string' && loc.length > 0);

      const crossDomainSitemaps = childSitemaps.filter(childUrl => {
        try {
          return new URL(childUrl).hostname !== sitemapHost;
        } catch {
          return false;
        }
      });
      if (crossDomainSitemaps.length > 0) {
        warnings.push(`${crossDomainSitemaps.length} child sitemaps point to different domains`);
      }

      return {
        url: sitemapUrl,
        found: true,
        type: 'sitemapindex',
        urlCount: childSitemaps.length,
        urls: [],
        childSitemaps,
        issues,
        warnings: [...warnings, 'This is a sitemap index - contains references to child sitemaps'],
        jobUrlCount: 0,
        jobUrls: [],
        urlsWithLastmod: 0,
      };
    }

    // Regular urlset sitemap
    if (!parsed.urlset?.url) {
      return emptyResult(true, 'urlset', [...issues, 'Sitemap is empty or malformed'], warnings);
    }
    const urlEntries = Array.isArray(parsed.urlset.url)
      ? parsed.urlset.url
      : [parsed.urlset.url];

    // The limit applies to what the file lists, not to what we chose to process.
    if (urlEntries.length > SITEMAP_URL_LIMIT) {
      issues.push('Sitemap exceeds 50,000 URL limit per Google guidelines');
    }
    if (urlEntries.length > maxUrls) {
      warnings.push(`Sitemap has ${urlEntries.length} URLs, only processed first ${maxUrls}`);
    }

    const urls: SitemapUrl[] = [];
    const seenUrls = new Set<string>();
    let duplicateCount = 0;
    let invalidUrlCount = 0;
    let crossDomainCount = 0;
    let protocolMismatchCount = 0;
    let invalidLastmodCount = 0;
    let invalidPriorityCount = 0;
    let invalidChangefreqCount = 0;

    for (const entry of urlEntries.slice(0, maxUrls)) {
      const loc = entry?.loc;
      if (!loc || typeof loc !== 'string') {
        invalidUrlCount++;
        continue;
      }

      let parsedUrl: URL;
      try {
        parsedUrl = new URL(loc);
      } catch {
        invalidUrlCount++;
        continue;
      }

      if (seenUrls.has(loc)) {
        duplicateCount++;
        continue;
      }
      seenUrls.add(loc);

      // Sub-domains and parent domains of the sitemap host count as same-site.
      if (parsedUrl.hostname !== sitemapHost &&
          !parsedUrl.hostname.endsWith('.' + sitemapHost) &&
          !sitemapHost.endsWith('.' + parsedUrl.hostname)) {
        crossDomainCount++;
      }
      if (parsedUrl.protocol !== sitemapProtocol) {
        protocolMismatchCount++;
      }

      // The XML parser may hand back numbers for numeric-looking text (e.g. a
      // year-only lastmod), so normalise to text before validating; otherwise
      // `new Date(2024)` would be read as milliseconds since 1970.
      let lastmod: string | undefined =
        entry.lastmod == null || entry.lastmod === '' ? undefined : String(entry.lastmod);
      if (lastmod !== undefined &&
          !SITEMAP_LASTMOD_PATTERN.test(lastmod) && isNaN(Date.parse(lastmod))) {
        invalidLastmodCount++;
        lastmod = undefined;
      }

      // Validate priority (0.0 to 1.0); the original value is kept when valid.
      let priority = entry.priority;
      if (priority !== undefined) {
        const priorityNum = parseFloat(String(priority));
        if (isNaN(priorityNum) || priorityNum < 0 || priorityNum > 1) {
          invalidPriorityCount++;
          priority = undefined;
        }
      }

      // String() guards against non-text values, which would otherwise throw
      // on toLowerCase() and fail the whole analysis.
      let changefreq = entry.changefreq;
      if (changefreq && !SITEMAP_CHANGEFREQS.includes(String(changefreq).toLowerCase())) {
        invalidChangefreqCount++;
        changefreq = undefined;
      }

      urls.push({
        loc,
        lastmod,
        changefreq,
        priority,
      });
    }

    // Add validation issues/warnings
    if (duplicateCount > 0) {
      warnings.push(`${duplicateCount} duplicate URLs found and removed`);
    }
    if (invalidUrlCount > 0) {
      issues.push(`${invalidUrlCount} URLs are invalid or malformed`);
    }
    if (crossDomainCount > 0) {
      warnings.push(`${crossDomainCount} URLs point to different domains`);
    }
    if (protocolMismatchCount > 0) {
      warnings.push(`${protocolMismatchCount} URLs use different protocol (HTTP vs HTTPS)`);
    }
    if (invalidLastmodCount > 0) {
      warnings.push(`${invalidLastmodCount} URLs have invalid lastmod format`);
    }
    if (invalidPriorityCount > 0) {
      warnings.push(`${invalidPriorityCount} URLs have invalid priority values (must be 0.0-1.0)`);
    }
    if (invalidChangefreqCount > 0) {
      warnings.push(`${invalidChangefreqCount} URLs have invalid changefreq values`);
    }

    // Analyze coverage
    const urlsWithLastmod = urls.filter(u => u.lastmod).length;
    if (urlsWithLastmod < urls.length * 0.5) {
      warnings.push(`Only ${urlsWithLastmod}/${urls.length} URLs have lastmod dates`);
    }

    // Find job-related URLs
    const jobPatterns = ['/job', '/jobs', '/career', '/position', '/vacancy', '/opening'];
    const jobUrls = urls
      .map(u => u.loc)
      .filter(candidate => {
        const lowered = candidate.toLowerCase();
        return jobPatterns.some(p => lowered.includes(p));
      });

    // Date analysis: ascending timestamps of every parseable lastmod.
    const dates = urls
      .filter(u => u.lastmod)
      .map(u => new Date(u.lastmod!).getTime())
      .filter(d => !isNaN(d))
      .sort((a, b) => a - b);

    return {
      url: sitemapUrl,
      found: true,
      type: 'urlset',
      urlCount: urls.length,
      urls,
      issues,
      warnings,
      jobUrlCount: jobUrls.length,
      jobUrls: jobUrls.slice(0, 100), // Limit for response size
      urlsWithLastmod,
      oldestLastmod: dates.length > 0 ? new Date(dates[0]).toISOString() : undefined,
      newestLastmod: dates.length > 0 ? new Date(dates[dates.length - 1]).toISOString() : undefined,
    };
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    return emptyResult(false, 'unknown', [`Error fetching sitemap: ${reason}`]);
  }
}
/**
 * Discover and analyze the sitemaps of a site.
 *
 * Candidates come from two places: the `Sitemap:` references returned by
 * fetchRobots, and a fixed list of common sitemap paths that answer with
 * HTTP 200. Every candidate is analyzed with fetchSitemap; for a sitemap
 * index, its first 10 child sitemaps are analyzed as well. Each sitemap URL is
 * fetched and reported at most once, even when it is both listed directly and
 * referenced by an index.
 *
 * @param baseUrl - Absolute URL of the site; common paths are resolved
 *   against it.
 * @returns `robotsSitemaps` (as listed in robots.txt), `commonLocations`
 *   (each probed common path and whether it returned 200) and `allSitemaps`
 *   (one analysis per distinct sitemap URL, in discovery order).
 * @throws TypeError if `baseUrl` is not a valid absolute URL.
 */
export async function discoverSitemaps(baseUrl: string): Promise<{
  robotsSitemaps: string[];
  commonLocations: Array<{ url: string; found: boolean }>;
  allSitemaps: SitemapAnalysis[];
}> {
  // Upper bound on child sitemaps followed per sitemap index.
  const maxChildSitemaps = 10;

  // First check robots.txt
  const robots = await fetchRobots(baseUrl);
  const robotsSitemaps = robots.sitemaps;

  // Check common sitemap locations
  const commonPaths = [
    '/sitemap.xml',
    '/sitemap_index.xml',
    '/sitemap-index.xml',
    '/sitemaps/sitemap.xml',
    '/wp-sitemap.xml',
    '/sitemap/sitemap-index.xml',
    '/jobs-sitemap.xml',
  ];
  const commonLocations: Array<{ url: string; found: boolean }> = [];
  const sitemapUrls = new Set<string>(robotsSitemaps);

  for (const path of commonPaths) {
    const url = new URL(path, baseUrl).toString();
    // Already known from robots.txt: no need to probe it.
    if (sitemapUrls.has(url)) continue;

    const { statusCode } = await checkUrlStatus(url);
    const found = statusCode === 200;
    commonLocations.push({ url, found });
    if (found) {
      sitemapUrls.add(url);
    }
  }

  // Fetch all discovered sitemaps, skipping URLs that were already analyzed
  // (for example a child sitemap that robots.txt also lists directly).
  const allSitemaps: SitemapAnalysis[] = [];
  const analyzedUrls = new Set<string>();
  const analyze = async (url: string): Promise<SitemapAnalysis | undefined> => {
    if (analyzedUrls.has(url)) return undefined;
    analyzedUrls.add(url);
    const sitemap = await fetchSitemap(url);
    allSitemaps.push(sitemap);
    return sitemap;
  };

  for (const url of sitemapUrls) {
    const sitemap = await analyze(url);
    // If it's a sitemap index, fetch child sitemaps too
    if (sitemap?.type === 'sitemapindex' && sitemap.childSitemaps) {
      for (const childUrl of sitemap.childSitemaps.slice(0, maxChildSitemaps)) {
        await analyze(childUrl);
      }
    }
  }

  return {
    robotsSitemaps,
    commonLocations,
    allSitemaps,
  };
}
/**
 * Check if a URL would be blocked by robots.txt rules.
 *
 * Matching follows the Robots Exclusion Protocol:
 * - Only one group applies: the rules whose user-agent equals `userAgent`
 *   (case-insensitive), or the `*` rules when no such group exists.
 * - The URL's path plus query string is matched against every Allow and
 *   Disallow pattern of that group; the longest matching pattern wins, and
 *   Allow wins a tie.
 *
 * @param url - Absolute URL to test.
 * @param rules - Parsed robots.txt rules.
 * @param userAgent - Crawler name to evaluate for (default `'*'`).
 * @returns `true` when the winning rule is a Disallow, `false` otherwise
 *   (including when nothing matches).
 * @throws TypeError if `url` is not a valid absolute URL.
 */
export function isUrlBlocked(
  url: string,
  rules: RobotsRule[],
  userAgent: string = '*'
): boolean {
  const { pathname, search } = new URL(url);
  const target = pathname + search;

  const agent = userAgent.toLowerCase();
  const specificRules = rules.filter(r => r.userAgent.toLowerCase() === agent);
  // The wildcard group is a fallback, not an addition to a specific group.
  const applicableRules = specificRules.length > 0
    ? specificRules
    : rules.filter(r => r.userAgent === '*');

  // Length of the longest matching pattern of each kind (-1 = no match).
  let longestAllow = -1;
  let longestDisallow = -1;
  for (const rule of applicableRules) {
    for (const allowPath of rule.allow) {
      if (allowPath.length > longestAllow && pathMatches(target, allowPath)) {
        longestAllow = allowPath.length;
      }
    }
    for (const disallowPath of rule.disallow) {
      if (disallowPath.length > longestDisallow && pathMatches(target, disallowPath)) {
        longestDisallow = disallowPath.length;
      }
    }
  }

  return longestDisallow > longestAllow;
}

/**
 * Check if a path matches a robots.txt pattern.
 *
 * A pattern is a prefix match in which `*` stands for any run of characters
 * and a trailing `$` anchors the match to the end of the path. All other
 * characters, including regex metacharacters such as `.` or `?`, are literal.
 * An empty pattern matches nothing.
 */
function pathMatches(path: string, pattern: string): boolean {
  if (!pattern) return false;

  const anchoredAtEnd = pattern.endsWith('$');
  const body = anchoredAtEnd ? pattern.slice(0, -1) : pattern;

  if (!body.includes('*')) {
    return anchoredAtEnd ? path === body : path.startsWith(body);
  }

  // Build a regex from the literal pieces between wildcards. Runs of `*` are
  // collapsed first because they are equivalent to a single one.
  const source = body
    .replace(/\*+/g, '*')
    .split('*')
    .map(piece => piece.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('.*');
  return new RegExp('^' + source + (anchoredAtEnd ? '$' : '')).test(path);
}