import { logger } from '@/utils/logger.js';
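
/** A single User-agent group from robots.txt: its path rules and optional crawl delay (in seconds). */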
interface RobotsRule {
  userAgent: string;
  disallow: string[];
  allow: string[];
  crawlDelay?: number;
}
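
/**
 * Minimal robots.txt fetcher and parser: groups rules by User-agent and answers
 * whether a path may be crawled and how long to wait between requests.
 */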
export class RobotsParser {
  private rules: RobotsRule[] = [];
  private sitemaps: string[] = [];
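
  /**
   * Fetches and parses robots.txt for the given site. Never throws: if the file
   * is missing or the request fails (it is aborted after 5 seconds), an empty,
   * fully permissive parser is returned.
   */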
  static async fetch(baseUrl: string, userAgent: string): Promise<RobotsParser> {
    const robotsUrl = new URL('/robots.txt', baseUrl).toString();
    const parser = new RobotsParser();

    try {
      const response = await fetch(robotsUrl, {
        headers: { 'User-Agent': userAgent },
        signal: AbortSignal.timeout(5000),
      });

      if (response.ok) {
        const text = await response.text();
        parser.parse(text);
        logger.info(`Fetched robots.txt from ${robotsUrl}`);
      } else {
        logger.warn(`No robots.txt found at ${robotsUrl} (${response.status})`);
      }
    } catch (error) {
      logger.warn(`Failed to fetch robots.txt from ${robotsUrl}:`, error);
    }

    return parser;
  }
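
  /** Parses robots.txt text line by line into User-agent rule groups and sitemap URLs. */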
  private parse(content: string): void {
    const lines = content.split('\n').map(line => line.trim());
    let currentRule: Partial<RobotsRule> | null = null;

    for (const line of lines) {
      // Skip blank lines and comments
      if (line.startsWith('#') || !line) continue;

      // Split on the first colon only; the value may itself contain colons (e.g. sitemap URLs)
      const [directive, ...valueParts] = line.split(':');
      const value = valueParts.join(':').trim();

      switch (directive.toLowerCase()) {
        case 'user-agent':
          // A new User-agent line starts a new group; flush the previous one
          if (currentRule) {
            this.rules.push(currentRule as RobotsRule);
          }
          currentRule = {
            userAgent: value,
            disallow: [],
            allow: [],
          };
          break;
        case 'disallow':
          // An empty Disallow value means "nothing is disallowed", so only record non-empty paths
          if (currentRule && value) {
            currentRule.disallow!.push(value);
          }
          break;
        case 'allow':
          if (currentRule && value) {
            currentRule.allow!.push(value);
          }
          break;
        case 'crawl-delay':
          if (currentRule) {
            // Crawl-delay may be fractional (e.g. "0.5"), so parse it as a float
            const delay = parseFloat(value);
            if (!Number.isNaN(delay)) {
              currentRule.crawlDelay = delay;
            }
          }
          break;
        case 'sitemap':
          this.sitemaps.push(value);
          break;
      }
    }

    if (currentRule) {
      this.rules.push(currentRule as RobotsRule);
    }

    logger.debug(`Parsed robots.txt: ${this.rules.length} rules, ${this.sitemaps.length} sitemaps`);
  }
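
  /**
   * Returns true if the given URL may be crawled as the given user agent.
   * Allow patterns are checked before Disallow patterns (a simplification of
   * the standard longest-match precedence); paths matched by neither are allowed.
   */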
  canFetch(url: string, userAgent: string): boolean {
    const path = new URL(url).pathname;
    const applicableRules = this.getApplicableRules(userAgent);

    for (const rule of applicableRules) {
      // Check Allow rules first
      for (const allowPattern of rule.allow) {
        if (this.matchesPattern(path, allowPattern)) {
          return true;
        }
      }
      // Then check Disallow rules
      for (const disallowPattern of rule.disallow) {
        if (this.matchesPattern(path, disallowPattern)) {
          return false;
        }
      }
    }

    return true;
  }
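
  /** Returns the declared crawl delay for this user agent in milliseconds, or 0 if none is set. */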
  getCrawlDelay(userAgent: string): number {
    const applicableRules = this.getApplicableRules(userAgent);
    for (const rule of applicableRules) {
      if (rule.crawlDelay !== undefined) {
        return rule.crawlDelay * 1000; // Convert seconds to milliseconds
      }
    }
    return 0;
  }
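
  /** Returns the Sitemap URLs declared in robots.txt. */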
  getSitemaps(): string[] {
    return [...this.sitemaps];
  }
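
  /** Returns the rule groups addressed to this user agent, falling back to wildcard (*) groups. */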
  private getApplicableRules(userAgent: string): RobotsRule[] {
    // A group applies specifically to us when its User-agent token appears
    // (case-insensitively) in our user agent string, e.g. a "mybot" group
    // matches "MyBot/1.0"; otherwise fall back to the wildcard (*) groups.
    const specificRules = this.rules.filter(rule =>
      rule.userAgent !== '*' &&
      userAgent.toLowerCase().includes(rule.userAgent.toLowerCase())
    );
    if (specificRules.length > 0) {
      return specificRules;
    }
    return this.rules.filter(rule => rule.userAgent === '*');
  }
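
  /** Tests a URL path against one robots.txt pattern; supports * wildcards and a trailing $ anchor. */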
  private matchesPattern(path: string, pattern: string): boolean {
    if (pattern === '/') return true;
    if (pattern === '') return false; // an empty pattern imposes no restriction

    // Convert the robots.txt pattern to a regex: escape regex metacharacters,
    // expand * wildcards, and honour a trailing $ as an end-of-path anchor
    const regexPattern = pattern
      .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
      .replace(/\*/g, '.*')
      .replace(/\\\$$/, '$');
    const regex = new RegExp(`^${regexPattern}`);
    return regex.test(path);
  }
}
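
// Illustrative usage sketch (not part of this module): assumes an async context
// and a hypothetical crawler identifying itself as "MyBot/1.0".
//
//   const robots = await RobotsParser.fetch('https://example.com', 'MyBot/1.0');
//   if (robots.canFetch('https://example.com/docs/page', 'MyBot/1.0')) {
//     const delayMs = robots.getCrawlDelay('MyBot/1.0');
//     // ...wait delayMs, then fetch the page
//   }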