// src/tools/sitemap.ts
// Sitemap and robots.txt analysis tool
import type {
RobotsAnalysis,
SitemapAnalysis,
AnalyzeSitemapInput,
} from '../types/index.js';
import {
fetchRobots,
fetchSitemap,
discoverSitemaps,
isUrlBlocked,
} from '../utils/http.js';
/**
 * Combined report produced by analyzeSiteAccess: the raw robots.txt and
 * sitemap analyses plus an aggregated summary and sample URLs for follow-up.
 */
export interface SiteAccessAnalysis {
  /** The site base URL that was analyzed. */
  baseUrl: string;
  /** ISO-8601 timestamp of when the analysis ran. */
  timestamp: string;
  /** Parsed robots.txt analysis. */
  robots: RobotsAnalysis;
  /** One analysis entry per discovered sitemap. */
  sitemaps: SitemapAnalysis[];
  /** Aggregated findings across robots.txt and all sitemaps. */
  summary: {
    robotsFound: boolean;
    sitemapsFound: number;
    totalUrlsInSitemaps: number;
    jobUrlsFound: number;
    issues: string[];
    warnings: string[];
    recommendations: string[];
  };
  // Sample URLs for further analysis
  /** Up to 10 job URLs collected from the sitemaps. */
  sampleJobUrls: string[];
  /** Up to 10 likely category/landing-page URLs. */
  sampleCategoryUrls: string[];
}
/**
* Comprehensive sitemap and robots.txt analysis
*/
/**
 * Comprehensive sitemap and robots.txt analysis.
 *
 * Fetches robots.txt, discovers and fetches every sitemap for `baseUrl`,
 * aggregates their issues/warnings, flags common misconfigurations (no
 * sitemap at all, robots.txt missing a Sitemap: directive, job URLs blocked
 * by robots rules, stale sitemaps), and returns a combined report with
 * sample job/category URLs for follow-up analysis.
 *
 * @param input - Base URL to analyze, plus an optional `maxUrls` cap
 *   (default 1000) on how many job URLs are collected from sitemaps.
 * @returns The aggregated SiteAccessAnalysis report.
 */
export async function analyzeSiteAccess(input: AnalyzeSitemapInput): Promise<SiteAccessAnalysis> {
  const { baseUrl, includeSitemapUrls = true, maxUrls = 1000 } = input;

  // NOTE(review): `includeSitemapUrls` is accepted but never consulted below —
  // presumably it should gate URL collection; confirm intended semantics.
  void includeSitemapUrls;

  // Progress goes to stderr — presumably to keep stdout free for protocol
  // output; confirm against the server's logging convention.
  console.error(`Analyzing site access for: ${baseUrl}`);

  // Fetch robots.txt
  const robots = await fetchRobots(baseUrl);

  // Discover and fetch all sitemaps
  const sitemapDiscovery = await discoverSitemaps(baseUrl);

  // Aggregate issues/warnings and URL counts across all sources.
  const issues: string[] = [...robots.issues];
  const warnings: string[] = [...robots.warnings];
  const recommendations: string[] = [];

  let totalUrls = 0;
  const jobUrls: string[] = []; // never reassigned — was declared with `let`

  for (const sitemap of sitemapDiscovery.allSitemaps) {
    issues.push(...sitemap.issues);
    warnings.push(...sitemap.warnings);
    totalUrls += sitemap.urlCount;
    // Honor the caller's `maxUrls` cap (it was previously ignored) so very
    // large sites don't accumulate unbounded job URL lists; `totalUrls`
    // still reflects the full per-sitemap counts.
    if (jobUrls.length < maxUrls) {
      jobUrls.push(...sitemap.jobUrls.slice(0, maxUrls - jobUrls.length));
    }
  }

  // Common issue: no sitemap found anywhere.
  if (sitemapDiscovery.allSitemaps.length === 0) {
    issues.push('No XML sitemap found');
    recommendations.push('Create and submit an XML sitemap to help search engines discover your pages');
  }

  // robots.txt present but carries no Sitemap: directive.
  if (robots.found && robots.sitemaps.length === 0) {
    warnings.push('robots.txt exists but does not reference any sitemap');
    recommendations.push('Add Sitemap: directive to robots.txt');
  }

  // Check whether a representative job URL is disallowed by robots rules.
  if (robots.found && jobUrls.length > 0) {
    const sampleJobUrl = jobUrls[0];
    if (isUrlBlocked(sampleJobUrl, robots.rules)) {
      issues.push('Job URLs may be blocked by robots.txt');
      recommendations.push('Review robots.txt rules to ensure job pages are crawlable');
    }
  }

  // Pull likely category/landing pages out of all sitemap URLs.
  const sampleCategoryUrls = extractCategoryPatterns(
    sitemapDiscovery.allSitemaps.flatMap(s => s.urls.map(u => u.loc))
  );

  // Is there a dedicated jobs sitemap (by name or by job-URL content)?
  const hasJobSitemap = sitemapDiscovery.allSitemaps.some(
    s => s.url.includes('job') || s.jobUrlCount > 0
  );
  if (!hasJobSitemap && jobUrls.length === 0) {
    recommendations.push('Consider creating a dedicated jobs sitemap for better indexing');
  }

  // Flag sitemaps whose newest lastmod is older than a week.
  for (const sitemap of sitemapDiscovery.allSitemaps) {
    if (sitemap.newestLastmod) {
      const daysSinceUpdate = Math.floor(
        (Date.now() - new Date(sitemap.newestLastmod).getTime()) / (1000 * 60 * 60 * 24)
      );
      if (daysSinceUpdate > 7) {
        warnings.push(`Sitemap ${sitemap.url} hasn't been updated in ${daysSinceUpdate} days`);
      }
    }
  }

  // Unconditional recommendation for job boards: push-based indexing.
  recommendations.push('Consider implementing Google Indexing API for faster job posting indexation');

  return {
    baseUrl,
    timestamp: new Date().toISOString(),
    robots,
    sitemaps: sitemapDiscovery.allSitemaps,
    summary: {
      robotsFound: robots.found,
      sitemapsFound: sitemapDiscovery.allSitemaps.filter(s => s.found).length,
      totalUrlsInSitemaps: totalUrls,
      jobUrlsFound: jobUrls.length,
      issues,
      warnings,
      recommendations,
    },
    sampleJobUrls: jobUrls.slice(0, 10),
    sampleCategoryUrls: sampleCategoryUrls.slice(0, 10),
  };
}
/**
* Extract likely category/landing page URLs from sitemap
*/
/**
 * Extract likely category/landing-page URLs from a list of sitemap URLs.
 *
 * A URL qualifies when it matches any of a fixed set of path patterns
 * (e.g. /jobs/<slug>, /category/<slug>, /location/<slug>, /<x>-jobs,
 * /jobs-in-<place>). Collection stops after 20 unique matches; duplicates
 * are collapsed via a Set, preserving first-seen order.
 */
function extractCategoryPatterns(urls: string[]): string[] {
  const patterns = [
    /\/jobs\/[a-z-]+\/?$/i,
    /\/category\/[a-z-]+/i,
    /\/location\/[a-z-]+/i,
    /\/[a-z]+-jobs\/?$/i,
    /\/jobs-in-[a-z-]+/i,
  ];

  const matches = new Set<string>();
  for (const candidate of urls) {
    // Enough samples collected — no need to scan further.
    if (matches.size >= 20) break;
    if (patterns.some(pattern => pattern.test(candidate))) {
      matches.add(candidate);
    }
  }
  return [...matches];
}
/**
* Analyze a specific sitemap
*/
/**
 * Analyze a single sitemap by URL.
 *
 * Thin convenience wrapper that delegates to fetchSitemap.
 *
 * @param url - Absolute URL of the sitemap to fetch and analyze.
 * @returns The analysis for that one sitemap.
 */
export async function analyzeSitemap(url: string): Promise<SitemapAnalysis> {
  const analysis = await fetchSitemap(url);
  return analysis;
}
/**
* Analyze robots.txt
*/
/**
 * Analyze a site's robots.txt.
 *
 * Thin convenience wrapper that delegates to fetchRobots.
 *
 * @param baseUrl - Site base URL whose robots.txt should be fetched.
 * @returns The robots.txt analysis.
 */
export async function analyzeRobots(baseUrl: string): Promise<RobotsAnalysis> {
  const analysis = await fetchRobots(baseUrl);
  return analysis;
}
// Re-export the underlying HTTP helpers so callers can use them directly
// without importing from utils/http.
export { fetchRobots, fetchSitemap, discoverSitemaps };
// Default export is the main entry point of this tool.
export default analyzeSiteAccess;