docmcp

RobotsTxtService.test.ts•8.19 KiB

import { RobotsTxtService } from '../RobotsTxtService';
import axios from 'axios';
import { DelayUtils } from '../../utils/DelayUtils';

/**
 * Unit tests for RobotsTxtService.
 *
 * All HTTP traffic goes through a mocked axios, so every test feeds the
 * service a hand-written robots.txt body (or a failure) and then checks the
 * public API: loadRobotsTxt, isAllowed, getCrawlDelay, getSitemapUrls, reset.
 */

// Mock axios and DelayUtils
jest.mock('axios');
jest.mock('../../utils/DelayUtils', () => ({
  // Invoke the wrapped function once, right away, instead of retrying.
  // NOTE(review): the factory exposes `withRetry` at module level while the
  // import above is the named `DelayUtils` export — confirm this matches how
  // RobotsTxtService actually calls it.
  withRetry: jest.fn((fn) => fn())
}));

const mockedAxios = axios as jest.Mocked<typeof axios>;

describe('RobotsTxtService', () => {
  let robotsTxtService: RobotsTxtService;
  const userAgent = 'DocMCPBot';

  beforeEach(() => {
    // Fresh service and clean mock call history for every test.
    robotsTxtService = new RobotsTxtService();
    jest.clearAllMocks();
  });

  afterEach(() => {
    robotsTxtService.reset();
  });

  describe('loadRobotsTxt', () => {
    it('should load and parse robots.txt file', async () => {
      const robotsTxt = `
        User-agent: DocMCPBot
        Disallow: /private/
        Allow: /public/

        User-agent: *
        Disallow: /admin/

        Sitemap: https://example.com/sitemap.xml
      `;

      mockedAxios.get.mockResolvedValueOnce({
        status: 200,
        data: robotsTxt
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      // Verify axios was called correctly
      expect(mockedAxios.get).toHaveBeenCalledWith(
        'https://example.com/robots.txt',
        expect.objectContaining({
          headers: expect.objectContaining({
            'User-Agent': userAgent
          })
        })
      );

      // Test the isAllowed method after loading
      expect(robotsTxtService.isAllowed('https://example.com/public/page')).toBe(true);
      expect(robotsTxtService.isAllowed('https://example.com/private/page')).toBe(false);

      // Other user-agent rules should not apply
      expect(robotsTxtService.isAllowed('https://example.com/admin/page')).toBe(true);
    });

    it('should handle crawl-delay directive', async () => {
      const robotsTxt = `
        User-agent: DocMCPBot
        Disallow: /private/
        Crawl-delay: 3
      `;

      mockedAxios.get.mockResolvedValueOnce({
        status: 200,
        data: robotsTxt
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      // 3 seconds = 3000 milliseconds
      expect(robotsTxtService.getCrawlDelay()).toBe(3000);
    });

    it('should handle wildcard (*) crawl-delay when no specific one is found', async () => {
      const robotsTxt = `
        User-agent: *
        Disallow: /private/
        Crawl-delay: 2

        User-agent: Googlebot
        Disallow: /admin/
      `;

      mockedAxios.get.mockResolvedValueOnce({
        status: 200,
        data: robotsTxt
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      // 2 seconds = 2000 milliseconds
      expect(robotsTxtService.getCrawlDelay()).toBe(2000);
    });

    it('should use specific user agent rules over wildcard', async () => {
      const robotsTxt = `
        User-agent: *
        Disallow: /all-private/

        User-agent: DocMCPBot
        Disallow: /bot-private/
      `;

      mockedAxios.get.mockResolvedValueOnce({
        status: 200,
        data: robotsTxt
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      // Bot-specific rules should apply
      expect(robotsTxtService.isAllowed('https://example.com/bot-private/page')).toBe(false);
      expect(robotsTxtService.isAllowed('https://example.com/all-private/page')).toBe(true);
    });

    it('should allow all URLs if robots.txt cannot be loaded', async () => {
      mockedAxios.get.mockRejectedValueOnce(new Error('Network error'));

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      // Should allow all URLs when robots.txt can't be loaded
      expect(robotsTxtService.isAllowed('https://example.com/any/page')).toBe(true);
      expect(robotsTxtService.getCrawlDelay()).toBeNull();
    });

    it('should allow all URLs if robots.txt returns non-200 status', async () => {
      mockedAxios.get.mockResolvedValueOnce({
        status: 404,
        data: 'Not found'
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      // Should allow all URLs when robots.txt returns non-200
      expect(robotsTxtService.isAllowed('https://example.com/any/page')).toBe(true);
    });
  });

  describe('isAllowed', () => {
    it('should allow all URLs if robots.txt is not loaded', () => {
      // Without loading robots.txt
      expect(robotsTxtService.isAllowed('https://example.com/any/page')).toBe(true);
    });

    it('should cache results for performance', async () => {
      const robotsTxt = `
        User-agent: DocMCPBot
        Disallow: /private/
      `;

      mockedAxios.get.mockResolvedValueOnce({
        status: 200,
        data: robotsTxt
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      // First check
      expect(robotsTxtService.isAllowed('https://example.com/private/page')).toBe(false);

      // Modify the internal state of the parser (this is a bit of a hack for testing)
      // @ts-ignore: Accessing private property for test
      robotsTxtService.robotsTxt = null;

      // Second check should use cached result and still return false
      expect(robotsTxtService.isAllowed('https://example.com/private/page')).toBe(false);
    });

    it('should normalize URLs before checking if allowed', async () => {
      const robotsTxt = `
        User-agent: DocMCPBot
        Disallow: /private/
      `;

      mockedAxios.get.mockResolvedValueOnce({
        status: 200,
        data: robotsTxt
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      // Check with trailing slash variation
      expect(robotsTxtService.isAllowed('https://example.com/private')).toBe(false);
      expect(robotsTxtService.isAllowed('https://example.com/private/')).toBe(false);
    });
  });

  describe('getSitemapUrls', () => {
    it('should return sitemap URLs if present in robots.txt', async () => {
      const robotsTxt = `
        User-agent: *
        Disallow: /private/

        Sitemap: https://example.com/sitemap.xml
        Sitemap: https://example.com/sitemap2.xml
      `;

      mockedAxios.get.mockResolvedValueOnce({
        status: 200,
        data: robotsTxt
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      const sitemaps = robotsTxtService.getSitemapUrls();
      expect(sitemaps).toContain('https://example.com/sitemap.xml');
      expect(sitemaps).toContain('https://example.com/sitemap2.xml');
      expect(sitemaps.length).toBe(2);
    });

    it('should return empty array if no sitemaps in robots.txt', async () => {
      const robotsTxt = `
        User-agent: *
        Disallow: /private/
      `;

      mockedAxios.get.mockResolvedValueOnce({
        status: 200,
        data: robotsTxt
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      const sitemaps = robotsTxtService.getSitemapUrls();
      expect(sitemaps).toEqual([]);
    });

    it('should return empty array if robots.txt is not loaded', () => {
      const sitemaps = robotsTxtService.getSitemapUrls();
      expect(sitemaps).toEqual([]);
    });
  });

  describe('reset', () => {
    it('should reset the service state', async () => {
      const robotsTxt = `
        User-agent: DocMCPBot
        Disallow: /private/
        Crawl-delay: 5
      `;

      mockedAxios.get.mockResolvedValueOnce({
        status: 200,
        data: robotsTxt
      });

      await robotsTxtService.loadRobotsTxt('https://example.com', userAgent);

      // Verify state before reset
      expect(robotsTxtService.isAllowed('https://example.com/private/page')).toBe(false);
      expect(robotsTxtService.getCrawlDelay()).toBe(5000);

      // Reset the service
      robotsTxtService.reset();

      // Verify state after reset
      expect(robotsTxtService.isAllowed('https://example.com/private/page')).toBe(true);
      expect(robotsTxtService.getCrawlDelay()).toBeNull();
    });
  });
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

RobotsTxtService.test.ts•8.19 KiB