Skip to main content
Glama
crawler.service.test.ts13.4 kB
import { CrawlerService } from '../../services/crawler.service'; import { getTestPrismaClient } from '../utils/testDb'; import nock from 'nock'; import { Job, JobStatus, JobType, JobStage, Prisma } from '../../generated/prisma'; describe('CrawlerService Integration Tests', () => { let crawlerService: CrawlerService; const prisma = getTestPrismaClient(); beforeAll(async () => { crawlerService = new CrawlerService(); }); beforeEach(async () => { // Clean up the database before each test await prisma.chunk.deleteMany(); await prisma.document.deleteMany(); await prisma.job.deleteMany(); // Ensure jobs are cleared too // Reset nock nock.cleanAll(); // Reset Prisma mocks/spies if needed (using jest.clearAllMocks() or specific resets) jest.clearAllMocks(); // Remove prisma method mocks from beforeEach }); describe('crawl', () => { it('should crawl a simple documentation site', async () => { const baseUrl = 'https://test.com'; const jobId = 'test-job-crawl-simple'; // >>> Create the job record with minimal necessary data <<< await prisma.job.create({ data: { id: jobId, url: baseUrl, startDate: new Date(), // Let Prisma handle defaults for status, type, progress, stats, etc. }, }); const testHtml = ` <html> <head> <title>Test Documentation</title> <meta name="package" content="test-package" /> <meta name="version" content="1.0.0" /> </head> <body> <main> <h1>Welcome to Test Docs</h1> <p>This is a test documentation page.</p> <a href="/docs/page1">Page 1</a> <a href="/docs/page2">Page 2</a> </main> </body> </html> `; const page1Html = ` <html> <head> <title>Page 1 - Test Documentation</title> </head> <body> <main> <h1>Page 1</h1> <p>This is page 1 content.</p> </main> </body> </html> `; const page2Html = ` <html> <head> <title>Page 2 - Test Documentation</title> </head> <body> <main> <h1>Page 2</h1> <p>This is page 2 content.</p> </main> </body> </html> `; // Mock HTTP requests nock(baseUrl) .get('/') .reply(200, testHtml) .get('/docs/page1') .reply(200, page1Html) .get('/docs/page2') .reply(200, page2Html); // Start crawling await crawlerService.crawl(jobId, baseUrl, {}); // Verify documents were created const documents = await prisma.document.findMany({ orderBy: { url: 'asc' }, }); expect(documents).toHaveLength(3); // Check main page const mainDoc = documents.find(d => d.url === baseUrl); expect(mainDoc).toBeDefined(); expect(mainDoc?.title).toBe('Test Documentation'); expect(mainDoc?.metadata).toEqual({ package: 'test-package', version: '1.0.0', type: 'documentation', tags: ['auto-generated'], }); // Check child pages const page1Doc = documents.find(d => d.url === `${baseUrl}/docs/page1`); expect(page1Doc).toBeDefined(); expect(page1Doc?.title).toBe('Page 1 - Test Documentation'); const page2Doc = documents.find(d => d.url === `${baseUrl}/docs/page2`); expect(page2Doc).toBeDefined(); expect(page2Doc?.title).toBe('Page 2 - Test Documentation'); // Verify job updates by checking the final DB state const finalJob = await prisma.job.findUnique({ where: { id: jobId } }); expect(finalJob).toBeDefined(); expect(finalJob?.status).toBe('completed' as JobStatus); expect(finalJob?.progress).toBe(1); expect(finalJob?.error).toBeNull(); // Add more specific checks for job updates if necessary, e.g., stats }); it('should respect maxDepth option', async () => { const baseUrl = 'https://test.com'; const jobIdMaxDepth = 'test-job-max-depth'; // >>> Create the job record in the test DB before crawling <<< await prisma.job.create({ data: { id: jobIdMaxDepth, url: baseUrl, startDate: new Date(), }, }); const testHtml = ` <html> <head><title>Test</title></head> <body> <a href="/page1">Page 1</a> </body> </html> `; const page1Html = ` <html> <head><title>Page 1</title></head> <body> <a href="/page2">Page 2</a> </body> </html> `; const page2Html = ` <html> <head><title>Page 2</title></head> <body> <a href="/page3">Page 3</a> </body> </html> `; // Mock HTTP requests nock(baseUrl) .get('/').reply(200, testHtml) .get('/page1').reply(200, page1Html) .get('/page2').reply(200, page2Html); // Start crawling with maxDepth = 1 await crawlerService.crawl(jobIdMaxDepth, baseUrl, { maxDepth: 1 }); // Document assertions (keep as is) const documents = await prisma.document.findMany(); expect(documents).toHaveLength(2); expect(documents.map(d => d.url).sort()).toEqual([ baseUrl, `${baseUrl}/page1`, ]); // Job completion check - Check final DB state const finalJob = await prisma.job.findUnique({ where: { id: jobIdMaxDepth } }); expect(finalJob).toBeDefined(); expect(finalJob?.status).toBe('completed' as JobStatus); }); it('should handle errors gracefully', async () => { const baseUrl = 'https://test.com'; const jobIdError = 'test-job-error'; // >>> Create the job record in the test DB before crawling <<< await prisma.job.create({ data: { id: jobIdError, url: baseUrl, startDate: new Date(), }, }); // Define the actual testHtml needed for nock mocks const testHtml = ` <html> <head><title>Test</title></head> <body> <a href="/good-page">Good Page</a> <a href="/error-page">Error Page</a> </body> </html> `; const goodPageHtml = ` <html> <head><title>Good Page</title></head> <body><p>This page works</p></body> </html> `; // Mock HTTP requests (keep as is) nock(baseUrl) .get('/') .reply(200, testHtml) .get('/good-page') .reply(200, goodPageHtml) .get('/error-page') .reply(500, 'Internal Server Error'); // Start crawling try { await crawlerService.crawl(jobIdError, baseUrl, {}); } catch (error) { // Error is expected due to the 500 internal server error during crawlPage console.error("Caught error during crawl (expected for 500 response):", error); } // Should have crawled the main page and good page const documents = await prisma.document.findMany(); expect(documents).toHaveLength(2); expect(documents.map(d => d.url).sort()).toEqual([ baseUrl, `${baseUrl}/good-page`, ]); // Verify job was updated with error (inside crawlPage) and finally marked completed // Check the update call that includes the error message // (prisma.job.update).toHaveBeenCalledWith(expect.objectContaining({ // where: { id: jobIdError }, // data: expect.objectContaining({ error: expect.stringContaining('Error crawling https://test.com/error-page') }) // })); // Check the final update call in the finally block // (prisma.job.update).toHaveBeenCalledWith(expect.objectContaining({ // where: { id: jobIdError }, // data: expect.objectContaining({ status: 'completed' }) // })); }); it('should respect robots.txt rules', async () => { const baseUrl = 'https://test.com'; const jobIdRobots = 'test-job-robots'; // >>> Create the job record in the test DB before crawling <<< await prisma.job.create({ data: { id: jobIdRobots, url: baseUrl, startDate: new Date(), }, }); const robotsTxt = ` User-agent: DocMCPBot Disallow: /private/ User-agent: * Allow: / `; const testHtml = ` <html> <head><title>Test</title></head> <body> <a href="/public">Public Page</a> <a href="/private/secret">Private Page</a> </body> </html> `; const publicPageHtml = ` <html> <head><title>Public Page</title></head> <body><p>This is a public page</p></body> </html> `; const privatePageHtml = ` <html> <head><title>Private Page</title></head> <body><p>This is a private page</p></body> </html> `; // Mock HTTP requests (keep as is) nock(baseUrl) .get('/robots.txt').reply(200, robotsTxt) .get('/').reply(200, testHtml) .get('/public').reply(200, publicPageHtml) .get('/private/secret').reply(200, privatePageHtml); // Nock still intercepts this // Start crawling await crawlerService.crawl(jobIdRobots, baseUrl, { respectRobotsTxt: true }); // Document assertions (keep as is) const documents = await prisma.document.findMany({ where: { jobId: jobIdRobots } // Filter by job ID for safety }); expect(documents).toHaveLength(2); // Should only have home and public expect(documents.map(d => d.url).sort()).toEqual([ baseUrl, `${baseUrl}/public`, ]); // Verify the private page document was not created in the DB const privateDoc = await prisma.document.findFirst({ where: { url: `${baseUrl}/private/secret`, jobId: jobIdRobots }, }); expect(privateDoc).toBeNull(); // Job completion check - Check final DB state const finalJob = await prisma.job.findUnique({ where: { id: jobIdRobots } }); expect(finalJob).toBeDefined(); expect(finalJob?.status).toBe('completed' as JobStatus); }); it('should detect and follow pagination links', async () => { const baseUrl = 'https://test.com'; const jobIdPagination = 'test-job-pagination'; // >>> Create the job record in the test DB before crawling <<< await prisma.job.create({ data: { id: jobIdPagination, url: baseUrl, startDate: new Date(), }, }); const page1Html = ` <html> <head><title>Page 1</title></head> <body> <main> <h1>Documentation - Page 1</h1> <p>First page content.</p> <nav class="pagination"> <a href="/page1" class="current">1</a> <a href="/page2">2</a> <a href="/page3">3</a> </nav> </main> </body> </html> `; const page2Html = ` <html> <head><title>Page 2</title></head> <body> <main> <h1>Documentation - Page 2</h1> <p>Second page content.</p> <nav class="pagination"> <a href="/page1">1</a> <a href="/page2" class="current">2</a> <a href="/page3">3</a> </nav> </main> </body> </html> `; const page3Html = ` <html> <head><title>Page 3</title></head> <body> <main> <h1>Documentation - Page 3</h1> <p>Third page content.</p> <nav class="pagination"> <a href="/page1">1</a> <a href="/page2">2</a> <a href="/page3" class="current">3</a> </nav> </main> </body> </html> `; // Mock HTTP requests (keep as is) nock(baseUrl) .get('/').reply(200, page1Html) .get('/page1').reply(200, page1Html) .get('/page2').reply(200, page2Html) .get('/page3').reply(200, page3Html); // Start crawling await crawlerService.crawl(jobIdPagination, baseUrl, {}); // Document assertions (keep as is) const documents = await prisma.document.findMany({ where: { jobId: jobIdPagination } // Filter by job ID for safety }); expect(documents).toHaveLength(4); // Base URL + 3 pages const urls = documents.map(d => d.url).sort(); expect(urls).toContain(`${baseUrl}/page1`); expect(urls).toContain(`${baseUrl}/page2`); expect(urls).toContain(`${baseUrl}/page3`); // Job completion check - Check final DB state const finalJob = await prisma.job.findUnique({ where: { id: jobIdPagination } }); expect(finalJob).toBeDefined(); expect(finalJob?.status).toBe('completed' as JobStatus); }); }); });

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server