// crawler.service.test.ts
import { CrawlerService } from '../../services/crawler.service';
import { getTestPrismaClient } from '../utils/testDb';
import nock from 'nock';
import { JobStatus } from '../../generated/prisma';
describe('CrawlerService Integration Tests', () => {
let crawlerService: CrawlerService;
const prisma = getTestPrismaClient();
beforeAll(async () => {
crawlerService = new CrawlerService();
});
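// A minimal teardown sketch (assumption: getTestPrismaClient() returns a plain
// PrismaClient owned by this suite, so disconnecting here is safe; drop the
// $disconnect call if the helper manages the connection globally).
afterAll(async () => {
  nock.cleanAll();
  await prisma.$disconnect();
});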
beforeEach(async () => {
// Clean up the database before each test
await prisma.chunk.deleteMany();
await prisma.document.deleteMany();
await prisma.job.deleteMany(); // Ensure jobs are cleared too
// Reset nock
nock.cleanAll();
// Reset any Jest mocks/spies between tests (the database itself is real, not mocked)
jest.clearAllMocks();
});
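// Optional guard, purely informational: nock.pendingMocks() lists interceptors a
// test declared but never hit, which usually points at a URL the crawler was
// expected to fetch and did not.
afterEach(() => {
  const pending = nock.pendingMocks();
  if (pending.length > 0) {
    console.warn('Unconsumed nock interceptors:', pending);
  }
});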
describe('crawl', () => {
it('should crawl a simple documentation site', async () => {
const baseUrl = 'https://test.com';
const jobId = 'test-job-crawl-simple';
// Create the job record with minimal necessary data
await prisma.job.create({
data: {
id: jobId,
url: baseUrl,
startDate: new Date(),
// Let Prisma handle defaults for status, type, progress, stats, etc.
},
});
const testHtml = `
<html>
<head>
<title>Test Documentation</title>
<meta name="package" content="test-package" />
<meta name="version" content="1.0.0" />
</head>
<body>
<main>
<h1>Welcome to Test Docs</h1>
<p>This is a test documentation page.</p>
<a href="/docs/page1">Page 1</a>
<a href="/docs/page2">Page 2</a>
</main>
</body>
</html>
`;
const page1Html = `
<html>
<head>
<title>Page 1 - Test Documentation</title>
</head>
<body>
<main>
<h1>Page 1</h1>
<p>This is page 1 content.</p>
</main>
</body>
</html>
`;
const page2Html = `
<html>
<head>
<title>Page 2 - Test Documentation</title>
</head>
<body>
<main>
<h1>Page 2</h1>
<p>This is page 2 content.</p>
</main>
</body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/')
.reply(200, testHtml)
.get('/docs/page1')
.reply(200, page1Html)
.get('/docs/page2')
.reply(200, page2Html);
// Start crawling
await crawlerService.crawl(jobId, baseUrl, {});
// Verify documents were created
const documents = await prisma.document.findMany({
orderBy: { url: 'asc' },
});
expect(documents).toHaveLength(3);
// Check main page
const mainDoc = documents.find(d => d.url === baseUrl);
expect(mainDoc).toBeDefined();
expect(mainDoc?.title).toBe('Test Documentation');
expect(mainDoc?.metadata).toEqual({
package: 'test-package',
version: '1.0.0',
type: 'documentation',
tags: ['auto-generated'],
});
// Check child pages
const page1Doc = documents.find(d => d.url === `${baseUrl}/docs/page1`);
expect(page1Doc).toBeDefined();
expect(page1Doc?.title).toBe('Page 1 - Test Documentation');
const page2Doc = documents.find(d => d.url === `${baseUrl}/docs/page2`);
expect(page2Doc).toBeDefined();
expect(page2Doc?.title).toBe('Page 2 - Test Documentation');
// Verify job updates by checking the final DB state
const finalJob = await prisma.job.findUnique({ where: { id: jobId } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
expect(finalJob?.progress).toBe(1);
expect(finalJob?.error).toBeNull();
// Add more specific checks for job updates if necessary, e.g., stats
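// Hypothetical sketch of such a stats check (the field names below are assumed,
// not taken from the schema; adjust to whatever CrawlerService actually writes):
// expect(finalJob?.stats).toMatchObject({ pagesCrawled: 3 });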
});
it('should respect maxDepth option', async () => {
const baseUrl = 'https://test.com';
const jobIdMaxDepth = 'test-job-max-depth';
// Create the job record in the test DB before crawling
await prisma.job.create({
data: {
id: jobIdMaxDepth,
url: baseUrl,
startDate: new Date(),
},
});
const testHtml = `
<html>
<head><title>Test</title></head>
<body>
<a href="/page1">Page 1</a>
</body>
</html>
`;
const page1Html = `
<html>
<head><title>Page 1</title></head>
<body>
<a href="/page2">Page 2</a>
</body>
</html>
`;
const page2Html = `
<html>
<head><title>Page 2</title></head>
<body>
<a href="/page3">Page 3</a>
</body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/').reply(200, testHtml)
.get('/page1').reply(200, page1Html)
.get('/page2').reply(200, page2Html);
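// With maxDepth = 1 only '/' and '/page1' should actually be requested; '/page2'
// is mocked defensively and '/page3' has no interceptor at all, so a crawl that
// went too deep would surface as a nock "no match for request" error.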
// Start crawling with maxDepth = 1
await crawlerService.crawl(jobIdMaxDepth, baseUrl, { maxDepth: 1 });
// Document assertions
const documents = await prisma.document.findMany();
expect(documents).toHaveLength(2);
expect(documents.map(d => d.url).sort()).toEqual([
baseUrl,
`${baseUrl}/page1`,
]);
// Job completion check: verify the final DB state
const finalJob = await prisma.job.findUnique({ where: { id: jobIdMaxDepth } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
});
it('should handle errors gracefully', async () => {
const baseUrl = 'https://test.com';
const jobIdError = 'test-job-error';
// Create the job record in the test DB before crawling
await prisma.job.create({
data: {
id: jobIdError,
url: baseUrl,
startDate: new Date(),
},
});
// Define the actual testHtml needed for nock mocks
const testHtml = `
<html>
<head><title>Test</title></head>
<body>
<a href="/good-page">Good Page</a>
<a href="/error-page">Error Page</a>
</body>
</html>
`;
const goodPageHtml = `
<html>
<head><title>Good Page</title></head>
<body><p>This page works</p></body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/')
.reply(200, testHtml)
.get('/good-page')
.reply(200, goodPageHtml)
.get('/error-page')
.reply(500, 'Internal Server Error');
// Start crawling
try {
await crawlerService.crawl(jobIdError, baseUrl, {});
} catch (error) {
// Error is expected due to the 500 internal server error during crawlPage
console.error("Caught error during crawl (expected for 500 response):", error);
}
// Should have crawled the main page and good page
const documents = await prisma.document.findMany();
expect(documents).toHaveLength(2);
expect(documents.map(d => d.url).sort()).toEqual([
baseUrl,
`${baseUrl}/good-page`,
]);
// Verify the job recorded the failure from /error-page but was still marked
// completed; check the final DB state instead of spying on prisma.job.update.
const finalJob = await prisma.job.findUnique({ where: { id: jobIdError } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
// If the service persists per-page errors on the job, the message should
// reference the failing URL, e.g.:
// expect(finalJob?.error).toContain(`${baseUrl}/error-page`);
});
it('should respect robots.txt rules', async () => {
const baseUrl = 'https://test.com';
const jobIdRobots = 'test-job-robots';
// Create the job record in the test DB before crawling
await prisma.job.create({
data: {
id: jobIdRobots,
url: baseUrl,
startDate: new Date(),
},
});
const robotsTxt = `
User-agent: DocMCPBot
Disallow: /private/
User-agent: *
Allow: /
`;
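// Note: the Disallow rule targets the DocMCPBot user agent specifically, so this
// test implicitly assumes CrawlerService identifies itself as DocMCPBot when
// evaluating robots.txt.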
const testHtml = `
<html>
<head><title>Test</title></head>
<body>
<a href="/public">Public Page</a>
<a href="/private/secret">Private Page</a>
</body>
</html>
`;
const publicPageHtml = `
<html>
<head><title>Public Page</title></head>
<body><p>This is a public page</p></body>
</html>
`;
const privatePageHtml = `
<html>
<head><title>Private Page</title></head>
<body><p>This is a private page</p></body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/robots.txt').reply(200, robotsTxt)
.get('/').reply(200, testHtml)
.get('/public').reply(200, publicPageHtml)
.get('/private/secret').reply(200, privatePageHtml); // Nock still intercepts this
// Start crawling
await crawlerService.crawl(jobIdRobots, baseUrl, { respectRobotsTxt: true });
// Document assertions
const documents = await prisma.document.findMany({
where: { jobId: jobIdRobots } // Filter by job ID for safety
});
expect(documents).toHaveLength(2); // Should only have home and public
expect(documents.map(d => d.url).sort()).toEqual([
baseUrl,
`${baseUrl}/public`,
]);
// Verify the private page document was not created in the DB
const privateDoc = await prisma.document.findFirst({
where: { url: `${baseUrl}/private/secret`, jobId: jobIdRobots },
});
expect(privateDoc).toBeNull();
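// Optional stricter check (assumes the crawler skips disallowed URLs before
// fetching, so the /private/secret interceptor is never consumed):
// expect(nock.pendingMocks().some(m => m.includes('/private/secret'))).toBe(true);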
// Job completion check: verify the final DB state
const finalJob = await prisma.job.findUnique({ where: { id: jobIdRobots } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
});
it('should detect and follow pagination links', async () => {
const baseUrl = 'https://test.com';
const jobIdPagination = 'test-job-pagination';
// Create the job record in the test DB before crawling
await prisma.job.create({
data: {
id: jobIdPagination,
url: baseUrl,
startDate: new Date(),
},
});
const page1Html = `
<html>
<head><title>Page 1</title></head>
<body>
<main>
<h1>Documentation - Page 1</h1>
<p>First page content.</p>
<nav class="pagination">
<a href="/page1" class="current">1</a>
<a href="/page2">2</a>
<a href="/page3">3</a>
</nav>
</main>
</body>
</html>
`;
const page2Html = `
<html>
<head><title>Page 2</title></head>
<body>
<main>
<h1>Documentation - Page 2</h1>
<p>Second page content.</p>
<nav class="pagination">
<a href="/page1">1</a>
<a href="/page2" class="current">2</a>
<a href="/page3">3</a>
</nav>
</main>
</body>
</html>
`;
const page3Html = `
<html>
<head><title>Page 3</title></head>
<body>
<main>
<h1>Documentation - Page 3</h1>
<p>Third page content.</p>
<nav class="pagination">
<a href="/page1">1</a>
<a href="/page2">2</a>
<a href="/page3" class="current">3</a>
</nav>
</main>
</body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/').reply(200, page1Html)
.get('/page1').reply(200, page1Html)
.get('/page2').reply(200, page2Html)
.get('/page3').reply(200, page3Html);
// Start crawling
await crawlerService.crawl(jobIdPagination, baseUrl, {});
// Document assertions
const documents = await prisma.document.findMany({
where: { jobId: jobIdPagination } // Filter by job ID for safety
});
expect(documents).toHaveLength(4); // Base URL + 3 pages
const urls = documents.map(d => d.url).sort();
expect(urls).toContain(`${baseUrl}/page1`);
expect(urls).toContain(`${baseUrl}/page2`);
expect(urls).toContain(`${baseUrl}/page3`);
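// The root document should be present as well, completing the count of 4 above
// (base URL + 3 pages); other tests in this suite store the root URL as `baseUrl`.
expect(urls).toContain(baseUrl);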
// Job completion check: verify the final DB state
const finalJob = await prisma.job.findUnique({ where: { id: jobIdPagination } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
});
});
});