RobotsTxtService.ts (5.59 kB)
import axios from 'axios';
import robotsParser from 'robots-parser';
import { IRobotsTxtService } from '../interfaces/IRobotsTxtService';
import { UrlUtils } from '../utils/UrlUtils';
import { LoggingUtils } from '../utils/LoggingUtils';
import { DelayUtils } from '../utils/DelayUtils';

/**
 * Implementation of the robots.txt service
 */
export class RobotsTxtService implements IRobotsTxtService {
  private robotsTxt: ReturnType<typeof robotsParser> | null = null;
  private baseUrl: string = '';
  private userAgent: string = '';
  private crawlDelay: number | null = null;
  private readonly logger = LoggingUtils.createTaggedLogger('robots');
  private cache = new Map<string, boolean>();

  /**
   * Load and parse the robots.txt file from a URL
   * @param baseUrl The base URL to load robots.txt from
   * @param userAgent The user agent to use when checking permissions
   */
  async loadRobotsTxt(baseUrl: string, userAgent: string): Promise<void> {
    try {
      this.baseUrl = baseUrl;
      this.userAgent = userAgent;

      // Get the root URL
      const rootUrl = UrlUtils.getRootUrl(baseUrl);
      const robotsUrl = `${rootUrl}/robots.txt`;

      this.logger.info(`Loading robots.txt from ${robotsUrl}`);

      // Fetch the robots.txt file
      const response = await DelayUtils.withRetry(
        () =>
          axios.get(robotsUrl, {
            headers: { 'User-Agent': userAgent },
            timeout: 10000
          }),
        3
      );

      // Parse the robots.txt content
      if (response.status === 200) {
        const content = response.data;
        // robots-parser expects the robots.txt URL first, then its contents
        this.robotsTxt = robotsParser(robotsUrl, content);

        // Scan for a Crawl-delay directive in the section for our user agent or the * wildcard
        const lines = content.split('\n');
        let inUserAgentSection = false;
        let inWildcardSection = false;

        for (const line of lines) {
          const trimmedLine = line.trim();

          // Check if we're entering a new User-agent section
          if (trimmedLine.toLowerCase().startsWith('user-agent:')) {
            const agentValue = trimmedLine.substring('user-agent:'.length).trim();
            inUserAgentSection = agentValue === userAgent;
            inWildcardSection = agentValue === '*';
          }

          // Try to find Crawl-delay within a relevant section
          if (
            (inUserAgentSection || inWildcardSection) &&
            trimmedLine.toLowerCase().startsWith('crawl-delay:')
          ) {
            const delayValue = trimmedLine.substring('crawl-delay:'.length).trim();
            this.crawlDelay = parseFloat(delayValue) * 1000; // Convert to milliseconds
            this.logger.info(`Found crawl delay: ${this.crawlDelay}ms`);

            // A delay in the section for our specific user agent takes precedence, so stop here
            if (inUserAgentSection) {
              break;
            }
          }
        }

        this.logger.info(`Successfully loaded and parsed robots.txt from ${robotsUrl}`);
      } else {
        // No robots.txt or error (assume everything is allowed)
        this.robotsTxt = null;
        this.logger.warn(`No robots.txt found at ${robotsUrl} or server returned non-200 status`);
      }
    } catch (error) {
      // Handle errors (can't load robots.txt, assume everything is allowed)
      this.robotsTxt = null;
      this.logger.error(`Error loading robots.txt: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  /**
   * Check if a URL is allowed to be crawled
   * @param url The URL to check
   * @returns True if the URL is allowed, false otherwise
   */
  isAllowed(url: string): boolean {
    try {
      // If we couldn't load robots.txt, assume everything is allowed
      if (!this.robotsTxt) {
        return true;
      }

      // Normalize the URL
      const normalizedUrl = UrlUtils.normalize(url);

      // Check cache first
      if (this.cache.has(normalizedUrl)) {
        const cacheResult = this.cache.get(normalizedUrl);
        return cacheResult === true;
      }

      // Check if the URL is allowed
      const allowed = this.robotsTxt.isAllowed(normalizedUrl, this.userAgent);

      // Cache the result (if allowed is undefined, treat as allowed)
      this.cache.set(normalizedUrl, allowed === true || allowed === undefined);
      return allowed === true || allowed === undefined;
    } catch (error) {
      this.logger.error(`Error checking if URL is allowed: ${error instanceof Error ? error.message : String(error)}`);
      return true; // Assume allowed in case of error
    }
  }

  /**
   * Get the crawl delay specified in robots.txt
   * @returns The crawl delay in milliseconds, or null if not specified
   */
  getCrawlDelay(): number | null {
    return this.crawlDelay;
  }

  /**
   * Get the sitemap URLs specified in robots.txt
   * @returns Array of sitemap URLs, or an empty array if none
   */
  getSitemapUrls(): string[] {
    try {
      if (!this.robotsTxt) {
        return [];
      }

      // The getSitemaps method might return undefined
      const sitemaps = this.robotsTxt.getSitemaps();
      return sitemaps || [];
    } catch (error) {
      this.logger.error(`Error getting sitemap URLs: ${error instanceof Error ? error.message : String(error)}`);
      return [];
    }
  }

  /**
   * Reset the robots.txt service
   */
  reset(): void {
    this.robotsTxt = null;
    this.baseUrl = '';
    this.userAgent = '';
    this.crawlDelay = null;
    this.cache.clear();
  }
}
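A minimal usage sketch (not part of the file above): it assumes the class is imported from this module's path and uses a placeholder user agent string; both are illustrative, and any retry or crawl-loop logic around it is up to the caller.

import { RobotsTxtService } from './RobotsTxtService'; // import path is illustrative

async function crawlIfAllowed(pageUrl: string): Promise<void> {
  const robots = new RobotsTxtService();

  // Fetch and parse robots.txt for the target site (user agent is a placeholder)
  await robots.loadRobotsTxt(pageUrl, 'docmcp-crawler/1.0');

  // Respect Disallow rules before fetching the page
  if (!robots.isAllowed(pageUrl)) {
    console.log(`Skipping ${pageUrl}: disallowed by robots.txt`);
    return;
  }

  // Honor Crawl-delay if one was declared (getCrawlDelay returns milliseconds)
  const delayMs = robots.getCrawlDelay();
  if (delayMs !== null) {
    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }

  // Sitemap URLs from robots.txt can seed further crawling
  console.log('Sitemaps:', robots.getSitemapUrls());

  // ...fetch and process pageUrl here...
}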

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'
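
For example, the same request could be made from TypeScript as sketched below (assuming Node 18+ where fetch is available globally, inside an ES module or async function; the response schema is whatever the API returns, so it is just logged):

// Equivalent of the curl command above.
const res = await fetch('https://glama.ai/api/mcp/v1/servers/visheshd/docmcp');
if (!res.ok) {
  throw new Error(`MCP directory API request failed with status ${res.status}`);
}
// Log the JSON payload as-is; consult the API documentation for the exact schema.
console.log(await res.json());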

If you have feedback or need assistance with the MCP directory API, please join our Discord server.