// crawler.service.test.ts
import { CrawlerService } from '../../services/crawler.service';
import { getTestPrismaClient } from '../utils/testDb';
import nock from 'nock';
import { JobStatus } from '../../generated/prisma';
describe('CrawlerService Integration Tests', () => {
let crawlerService: CrawlerService;
const prisma = getTestPrismaClient();
beforeAll(async () => {
crawlerService = new CrawlerService();
});
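// A minimal teardown sketch (assumption: getTestPrismaClient() returns a plain
// PrismaClient owned by this suite, so disconnecting here is safe; drop the
// $disconnect call if the helper manages the connection globally).
afterAll(async () => {
  nock.cleanAll();
  await prisma.$disconnect();
});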
beforeEach(async () => {
// Clean up the database before each test
await prisma.chunk.deleteMany();
await prisma.document.deleteMany();
await prisma.job.deleteMany(); // Ensure jobs are cleared too
// Reset nock
nock.cleanAll();
// Reset any Jest mocks/spies between tests (the database itself is real, not mocked)
jest.clearAllMocks();
});
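// Optional guard, purely informational: nock.pendingMocks() lists interceptors a
// test declared but never hit, which usually points at a URL the crawler was
// expected to fetch and did not.
afterEach(() => {
  const pending = nock.pendingMocks();
  if (pending.length > 0) {
    console.warn('Unconsumed nock interceptors:', pending);
  }
});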
describe('crawl', () => {
it('should crawl a simple documentation site', async () => {
const baseUrl = 'https://test.com';
const jobId = 'test-job-crawl-simple';
// Create the job record with minimal necessary data
await prisma.job.create({
data: {
id: jobId,
url: baseUrl,
startDate: new Date(),
// Let Prisma handle defaults for status, type, progress, stats, etc.
},
});
const testHtml = `
<html>
<head>
<title>Test Documentation</title>
<meta name="package" content="test-package" />
<meta name="version" content="1.0.0" />
</head>
<body>
<main>
<h1>Welcome to Test Docs</h1>
<p>This is a test documentation page.</p>
<a href="/docs/page1">Page 1</a>
<a href="/docs/page2">Page 2</a>
</main>
</body>
</html>
`;
const page1Html = `
<html>
<head>
<title>Page 1 - Test Documentation</title>
</head>
<body>
<main>
<h1>Page 1</h1>
<p>This is page 1 content.</p>
</main>
</body>
</html>
`;
const page2Html = `
<html>
<head>
<title>Page 2 - Test Documentation</title>
</head>
<body>
<main>
<h1>Page 2</h1>
<p>This is page 2 content.</p>
</main>
</body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/')
.reply(200, testHtml)
.get('/docs/page1')
.reply(200, page1Html)
.get('/docs/page2')
.reply(200, page2Html);
// Start crawling
await crawlerService.crawl(jobId, baseUrl, {});
// Verify documents were created
const documents = await prisma.document.findMany({
orderBy: { url: 'asc' },
});
expect(documents).toHaveLength(3);
// Check main page
const mainDoc = documents.find(d => d.url === baseUrl);
expect(mainDoc).toBeDefined();
expect(mainDoc?.title).toBe('Test Documentation');
expect(mainDoc?.metadata).toEqual({
package: 'test-package',
version: '1.0.0',
type: 'documentation',
tags: ['auto-generated'],
});
// Check child pages
const page1Doc = documents.find(d => d.url === `${baseUrl}/docs/page1`);
expect(page1Doc).toBeDefined();
expect(page1Doc?.title).toBe('Page 1 - Test Documentation');
const page2Doc = documents.find(d => d.url === `${baseUrl}/docs/page2`);
expect(page2Doc).toBeDefined();
expect(page2Doc?.title).toBe('Page 2 - Test Documentation');
// Verify job updates by checking the final DB state
const finalJob = await prisma.job.findUnique({ where: { id: jobId } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
expect(finalJob?.progress).toBe(1);
expect(finalJob?.error).toBeNull();
// Add more specific checks for job updates if necessary, e.g., stats
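// Hypothetical sketch of such a stats check (the field names below are assumed,
// not taken from the schema; adjust to whatever CrawlerService actually writes):
// expect(finalJob?.stats).toMatchObject({ pagesCrawled: 3 });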
});
it('should respect maxDepth option', async () => {
const baseUrl = 'https://test.com';
const jobIdMaxDepth = 'test-job-max-depth';
// Create the job record in the test DB before crawling
await prisma.job.create({
data: {
id: jobIdMaxDepth,
url: baseUrl,
startDate: new Date(),
},
});
const testHtml = `
<html>
<head><title>Test</title></head>
<body>
<a href="/page1">Page 1</a>
</body>
</html>
`;
const page1Html = `
<html>
<head><title>Page 1</title></head>
<body>
<a href="/page2">Page 2</a>
</body>
</html>
`;
const page2Html = `
<html>
<head><title>Page 2</title></head>
<body>
<a href="/page3">Page 3</a>
</body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/').reply(200, testHtml)
.get('/page1').reply(200, page1Html)
.get('/page2').reply(200, page2Html);
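// With maxDepth = 1 only '/' and '/page1' should actually be requested; '/page2'
// is mocked defensively and '/page3' has no interceptor at all, so a crawl that
// went too deep would surface as a nock "no match for request" error.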
// Start crawling with maxDepth = 1
await crawlerService.crawl(jobIdMaxDepth, baseUrl, { maxDepth: 1 });
// Document assertions
const documents = await prisma.document.findMany();
expect(documents).toHaveLength(2);
expect(documents.map(d => d.url).sort()).toEqual([
baseUrl,
`${baseUrl}/page1`,
]);
// Job completion check: verify the final DB state
const finalJob = await prisma.job.findUnique({ where: { id: jobIdMaxDepth } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
});
it('should handle errors gracefully', async () => {
const baseUrl = 'https://test.com';
const jobIdError = 'test-job-error';
// Create the job record in the test DB before crawling
await prisma.job.create({
data: {
id: jobIdError,
url: baseUrl,
startDate: new Date(),
},
});
// Define the actual testHtml needed for nock mocks
const testHtml = `
<html>
<head><title>Test</title></head>
<body>
<a href="/good-page">Good Page</a>
<a href="/error-page">Error Page</a>
</body>
</html>
`;
const goodPageHtml = `
<html>
<head><title>Good Page</title></head>
<body><p>This page works</p></body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/')
.reply(200, testHtml)
.get('/good-page')
.reply(200, goodPageHtml)
.get('/error-page')
.reply(500, 'Internal Server Error');
// Start crawling
try {
await crawlerService.crawl(jobIdError, baseUrl, {});
} catch (error) {
// Error is expected due to the 500 internal server error during crawlPage
console.error("Caught error during crawl (expected for 500 response):", error);
}
// Should have crawled the main page and good page
const documents = await prisma.document.findMany();
expect(documents).toHaveLength(2);
expect(documents.map(d => d.url).sort()).toEqual([
baseUrl,
`${baseUrl}/good-page`,
]);
// Verify the job recorded the failure from /error-page but was still marked
// completed; check the final DB state instead of spying on prisma.job.update.
const finalJob = await prisma.job.findUnique({ where: { id: jobIdError } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
// If the service persists per-page errors on the job, the message should
// reference the failing URL, e.g.:
// expect(finalJob?.error).toContain(`${baseUrl}/error-page`);
});
it('should respect robots.txt rules', async () => {
const baseUrl = 'https://test.com';
const jobIdRobots = 'test-job-robots';
// Create the job record in the test DB before crawling
await prisma.job.create({
data: {
id: jobIdRobots,
url: baseUrl,
startDate: new Date(),
},
});
const robotsTxt = `
User-agent: DocMCPBot
Disallow: /private/
User-agent: *
Allow: /
`;
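// Note: the Disallow rule targets the DocMCPBot user agent specifically, so this
// test implicitly assumes CrawlerService identifies itself as DocMCPBot when
// evaluating robots.txt.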
const testHtml = `
<html>
<head><title>Test</title></head>
<body>
<a href="/public">Public Page</a>
<a href="/private/secret">Private Page</a>
</body>
</html>
`;
const publicPageHtml = `
<html>
<head><title>Public Page</title></head>
<body><p>This is a public page</p></body>
</html>
`;
const privatePageHtml = `
<html>
<head><title>Private Page</title></head>
<body><p>This is a private page</p></body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/robots.txt').reply(200, robotsTxt)
.get('/').reply(200, testHtml)
.get('/public').reply(200, publicPageHtml)
.get('/private/secret').reply(200, privatePageHtml); // Nock still intercepts this
// Start crawling
await crawlerService.crawl(jobIdRobots, baseUrl, { respectRobotsTxt: true });
// Document assertions
const documents = await prisma.document.findMany({
where: { jobId: jobIdRobots } // Filter by job ID for safety
});
expect(documents).toHaveLength(2); // Should only have home and public
expect(documents.map(d => d.url).sort()).toEqual([
baseUrl,
`${baseUrl}/public`,
]);
// Verify the private page document was not created in the DB
const privateDoc = await prisma.document.findFirst({
where: { url: `${baseUrl}/private/secret`, jobId: jobIdRobots },
});
expect(privateDoc).toBeNull();
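// Optional stricter check (assumes the crawler skips disallowed URLs before
// fetching, so the /private/secret interceptor is never consumed):
// expect(nock.pendingMocks().some(m => m.includes('/private/secret'))).toBe(true);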
// Job completion check: verify the final DB state
const finalJob = await prisma.job.findUnique({ where: { id: jobIdRobots } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
});
it('should detect and follow pagination links', async () => {
const baseUrl = 'https://test.com';
const jobIdPagination = 'test-job-pagination';
// Create the job record in the test DB before crawling
await prisma.job.create({
data: {
id: jobIdPagination,
url: baseUrl,
startDate: new Date(),
},
});
const page1Html = `
<html>
<head><title>Page 1</title></head>
<body>
<main>
<h1>Documentation - Page 1</h1>
<p>First page content.</p>
<nav class="pagination">
<a href="/page1" class="current">1</a>
<a href="/page2">2</a>
<a href="/page3">3</a>
</nav>
</main>
</body>
</html>
`;
const page2Html = `
<html>
<head><title>Page 2</title></head>
<body>
<main>
<h1>Documentation - Page 2</h1>
<p>Second page content.</p>
<nav class="pagination">
<a href="/page1">1</a>
<a href="/page2" class="current">2</a>
<a href="/page3">3</a>
</nav>
</main>
</body>
</html>
`;
const page3Html = `
<html>
<head><title>Page 3</title></head>
<body>
<main>
<h1>Documentation - Page 3</h1>
<p>Third page content.</p>
<nav class="pagination">
<a href="/page1">1</a>
<a href="/page2">2</a>
<a href="/page3" class="current">3</a>
</nav>
</main>
</body>
</html>
`;
// Mock HTTP requests
nock(baseUrl)
.get('/').reply(200, page1Html)
.get('/page1').reply(200, page1Html)
.get('/page2').reply(200, page2Html)
.get('/page3').reply(200, page3Html);
// Start crawling
await crawlerService.crawl(jobIdPagination, baseUrl, {});
// Document assertions
const documents = await prisma.document.findMany({
where: { jobId: jobIdPagination } // Filter by job ID for safety
});
expect(documents).toHaveLength(4); // Base URL + 3 pages
const urls = documents.map(d => d.url).sort();
expect(urls).toContain(`${baseUrl}/page1`);
expect(urls).toContain(`${baseUrl}/page2`);
expect(urls).toContain(`${baseUrl}/page3`);
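// The root document should be present as well, completing the count of 4 above
// (base URL + 3 pages); other tests in this suite store the root URL as `baseUrl`.
expect(urls).toContain(baseUrl);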
// Job completion check: verify the final DB state
const finalJob = await prisma.job.findUnique({ where: { id: jobIdPagination } });
expect(finalJob).toBeDefined();
expect(finalJob?.status).toBe('completed' as JobStatus);
});
});
});