Skip to main content
Glama
documentation-pipeline.test.ts (6.27 kB)
import { PrismaClient } from '../../generated/prisma';
import { startCrawlingProcess } from '../../services/mcp-tools/add-documentation.tool';
import { CrawlerService } from '../../services/crawler.service';
import { DocumentService } from '../../services/document.service';
import { DocumentProcessorService } from '../../services/document-processor.service';
import { JobService } from '../../services/job.service';
import { setupTestDatabase, teardownTestDatabase, getTestPrismaClient } from '../utils/testDb';
import axios from 'axios';

// Mock axios to avoid making actual HTTP requests
jest.mock('axios');
const mockedAxios = axios as jest.Mocked<typeof axios>;

/**
 * End-to-end test of the documentation pipeline.
 *
 * A job is created in the test database, `startCrawlingProcess` is run against
 * mocked HTTP responses (robots.txt, a docs home page linking to one sub-page,
 * and an embeddings endpoint), and the resulting job, document and chunk rows
 * are inspected.
 */
describe('Documentation Processing Pipeline Integration', () => {
  /** Length of the fake embedding vector returned by the mocked POST call. */
  const EMBEDDING_DIMENSION = 1536;

  // HTML fixtures served by the mocked `axios.get`.
  const DOCS_HOME_HTML = ` <html> <head> <title>Documentation Home</title> <meta name="package" content="example-docs"> <meta name="version" content="1.0.0"> </head> <body> <h1>Example Documentation</h1> <p>This is the main documentation page.</p> <a href="https://example.com/docs/page1">Page 1</a> </body> </html> `;
  const DOCS_PAGE1_HTML = ` <html> <head> <title>Page 1</title> </head> <body> <h1>Page 1</h1> <p>This is page 1 of the documentation.</p> <pre><code class="language-javascript"> function example() { return "hello world"; } </code></pre> </body> </html> `;

  let prisma: PrismaClient;
  let jobId: string;

  beforeAll(async () => {
    prisma = await setupTestDatabase();
  });

  afterAll(async () => {
    await teardownTestDatabase();
  });

  beforeEach(async () => {
    // Clear database tables.
    // NOTE(review): the chunk -> document -> job order presumably follows the
    // foreign-key dependencies; confirm against the Prisma schema.
    await prisma.chunk.deleteMany();
    await prisma.document.deleteMany();
    await prisma.job.deleteMany();

    // Create a new job to use in tests
    const jobService = new JobService(prisma);
    const job = await jobService.createJob({
      url: 'https://example.com/docs',
      status: 'pending',
      startDate: new Date(),
      progress: 0,
      error: null,
      stats: { pagesProcessed: 0, pagesSkipped: 0, totalChunks: 0 },
    });
    jobId = job.id;

    // Setup axios mock to return HTML content
    mockedAxios.get.mockImplementation(async (url) => {
      if (url === 'https://example.com/robots.txt') {
        return { status: 200, data: 'User-agent: *\nAllow: /' };
      }

      // First page with link to second
      if (url === 'https://example.com/docs') {
        return { status: 200, data: DOCS_HOME_HTML };
      }

      // Second page
      if (url === 'https://example.com/docs/page1') {
        return { status: 200, data: DOCS_PAGE1_HTML };
      }

      // Anything else is reported as missing.
      return { status: 404, data: '404 Not Found' };
    });

    // Setup axios mock for embeddings
    mockedAxios.post.mockResolvedValue({
      data: {
        embedding: Array(EMBEDDING_DIMENSION).fill(0.1),
      },
    });
  });

  it('should process documents end-to-end', async () => {
    // Start the crawling process
    await startCrawlingProcess(jobId, {
      url: 'https://example.com/docs',
      maxDepth: 1,
      _prisma: prisma,
      _bypassAsync: true,
    });

    // Check that job was completed
    const jobService = new JobService(prisma);
    const updatedJob = await jobService.findJobById(jobId);
    // toBeTruthy (rather than toBeDefined) also rejects a `null` lookup result.
    expect(updatedJob).toBeTruthy();
    expect(updatedJob?.status).toBe('completed');
    expect(updatedJob?.progress).toBe(1.0);

    // Check that documents were created
    const documents = await prisma.document.findMany({
      where: { jobId: jobId },
      orderBy: { url: 'asc' },
    });
    expect(documents.length).toBe(2);
    expect(documents[0].title).toBe('Documentation Home');
    expect(documents[1].title).toBe('Page 1');

    // Check that chunks were created. Only ids are selected here; the
    // embedding column is read separately through a raw query below.
    const chunksData = await prisma.chunk.findMany({
      where: { documentId: { in: documents.map((d) => d.id) } },
      select: {
        id: true,
      },
    });
    expect(chunksData.length).toBeGreaterThan(0);

    // Now, fetch embeddings using raw query. The `::text` cast means each
    // embedding arrives as a string, so the row type declares it as such.
    const chunkIds = chunksData.map((c) => c.id);
    type ChunkEmbeddingRow = { id: string; embedding: string };
    const embeddingRows = await prisma.$queryRaw<ChunkEmbeddingRow[]>` SELECT id, embedding::text FROM chunks WHERE id = ANY(${chunkIds}) `;

    // Convert the text representation back to an array.
    // Assumes the ::text cast gives a JSON-parseable string like "[0.1, ... ]".
    // A string that cannot be parsed fails the test immediately instead of
    // being logged and replaced with an empty array.
    const embeddingsById = new Map<string, number[]>(
      embeddingRows.map((row): [string, number[]] => {
        try {
          return [row.id, JSON.parse(row.embedding) as number[]];
        } catch (parseError) {
          throw new Error(
            `Failed to parse embedding string for chunk ${row.id}: ${row.embedding} (${String(parseError)})`,
          );
        }
      }),
    );

    // Every chunk must have a parsed embedding of the mocked dimension.
    for (const chunk of chunksData) {
      const embedding = embeddingsById.get(chunk.id);
      expect(embedding).toBeDefined();
      expect(Array.isArray(embedding)).toBe(true);
      expect(embedding?.length).toBe(EMBEDDING_DIMENSION);
    }

    // Check that job stats were updated
    expect(updatedJob?.stats).toEqual(
      expect.objectContaining({
        pagesProcessed: 2,
        totalChunks: expect.any(Number),
      }),
    );
  });
});

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.