Skip to main content
Glama
document-processor.service.test.ts (15.9 kB)
import { DocumentProcessorService } from '../../services/document-processor.service';
import { DocumentService } from '../../services/document.service';
import { ChunkService } from '../../services/chunk.service';
import { getTestPrismaClient } from '../utils/testDb';
import axios from 'axios';
import config from '../../config';

// Mock the DocumentService and ChunkService
jest.mock('../../services/document.service');
jest.mock('../../services/chunk.service');

// Mock axios
jest.mock('axios');
const mockedAxios = axios as jest.Mocked<typeof axios>;

/**
 * Unit tests for DocumentProcessorService.processDocument.
 *
 * DocumentService, ChunkService and axios are replaced with mocks, so the
 * tests only observe what the processor passes to its collaborators and what
 * markdown it returns.
 */
describe('DocumentProcessorService', () => {
  // Embedding endpoint the assertions expect the service to POST to.
  // NOTE(review): must match the service's configured Ollama URL — confirm against config.
  const EMBEDDINGS_URL = 'http://localhost:11434/api/embeddings';

  let documentProcessorService: DocumentProcessorService;
  let mockDocumentService: jest.Mocked<DocumentService>;
  let mockChunkService: jest.Mocked<ChunkService>;
  const prisma = getTestPrismaClient();

  beforeEach(() => {
    jest.clearAllMocks();

    // Create a mock DocumentService
    mockDocumentService = {
      updateDocument: jest.fn().mockResolvedValue({ id: 'test-doc-id' }),
      findDocumentById: jest.fn().mockResolvedValue(null),
      createDocument: jest.fn().mockResolvedValue({ id: 'test-doc-id' }),
    } as unknown as jest.Mocked<DocumentService>;

    // Create a mock ChunkService
    mockChunkService = {
      createChunk: jest.fn().mockResolvedValue(undefined),
      createManyChunks: jest.fn().mockResolvedValue(undefined),
      findSimilarChunks: jest.fn().mockResolvedValue([]),
      deleteChunksByDocumentId: jest.fn().mockResolvedValue(undefined),
      // Add any other methods from ChunkService if they are used indirectly
    } as unknown as jest.Mocked<ChunkService>;

    // Reset axios mock completely before each test (clears implementations too,
    // which jest.clearAllMocks() does not)
    mockedAxios.post.mockReset();

    // Create the service with mocks
    documentProcessorService = new DocumentProcessorService(
      prisma,
      mockDocumentService,
      mockChunkService
    );
  });

  describe('processDocument', () => {
    /**
     * Installs a default successful embedding response for every axios.post call.
     * Used by tests that do not focus on embedding errors.
     */
    const mockSuccessfulEmbedding = (embeddingData = [0.1, 0.2, 0.3]) => {
      mockedAxios.post.mockResolvedValue({ data: { embedding: embeddingData } });
    };

    it('should generate embeddings and call createManyChunks', async () => {
      const documentId = 'test-doc-id';
      const html = `<html><body><h1>Title</h1><p>Chunk 1 content.</p><h2>Subtitle</h2><p>Chunk 2 content.</p></body></html>`;
      const dummyEmbedding1 = Array(1536).fill(0.1); // Dimension based on granite-30m
      const dummyEmbedding2 = Array(1536).fill(0.2);

      // Mock the Ollama API response
      mockedAxios.post
        .mockResolvedValueOnce({ data: { embedding: dummyEmbedding1 } }) // First call for first chunk
        .mockResolvedValueOnce({ data: { embedding: dummyEmbedding2 } }); // Second call for second chunk

      await documentProcessorService.processDocument(documentId, html);

      // Verify axios was called for each chunk
      expect(mockedAxios.post).toHaveBeenCalledTimes(2); // Assuming 2 chunks are generated
      expect(mockedAxios.post).toHaveBeenCalledWith(
        EMBEDDINGS_URL,
        expect.objectContaining({ prompt: expect.stringContaining('Chunk 1 content') })
      );
      expect(mockedAxios.post).toHaveBeenCalledWith(
        EMBEDDINGS_URL,
        expect.objectContaining({ prompt: expect.stringContaining('Chunk 2 content') })
      );

      // Verify createManyChunks was called with the embeddings
      expect(mockChunkService.createManyChunks).toHaveBeenCalledTimes(1);
      expect(mockChunkService.createManyChunks).toHaveBeenCalledWith(
        expect.arrayContaining([
          expect.objectContaining({ documentId, embedding: dummyEmbedding1 }),
          expect.objectContaining({ documentId, embedding: dummyEmbedding2 }),
        ]),
        documentId
      );

      // Verify document update still happens
      expect(mockDocumentService.updateDocument).toHaveBeenCalledTimes(1);
    });

    it('should handle Ollama API errors gracefully', async () => {
      const documentId = 'test-doc-id';
      const html = `<html><body><h1>Title</h1><p>Chunk 1 content.</p><h2>Subtitle</h2><p>Chunk 2 content.</p></body></html>`;
      const dummyEmbedding2 = Array(1536).fill(0.2);

      // Silence expected error logging for the duration of this test.
      const consoleErrorSpy = jest.spyOn(console, 'error').mockImplementation(() => undefined);

      try {
        // Custom mock implementation for axios.post for this test
        let callCount = 0;
        mockedAxios.post.mockImplementation(async () => {
          callCount++;
          if (callCount <= 3) {
            // Fail the first 3 attempts (expected to be the first chunk's retries)
            const connectionError = {
              message: 'Ollama connection refused',
              code: 'ECONNREFUSED',
              response: undefined, // Ensure response is undefined
            };
            return Promise.reject(connectionError);
          }
          // Succeed on the 4th call (expected to be the second chunk)
          return Promise.resolve({ data: { embedding: dummyEmbedding2 } });
        });

        await documentProcessorService.processDocument(documentId, html);

        // Verify axios was called 4 times (3 failed + 1 successful)
        expect(mockedAxios.post).toHaveBeenCalledTimes(4);

        // Verify createManyChunks was called only with the successful embedding
        expect(mockChunkService.createManyChunks).toHaveBeenCalledTimes(1);
        expect(mockChunkService.createManyChunks).toHaveBeenCalledWith(
          expect.arrayContaining([
            expect.objectContaining({ documentId, embedding: dummyEmbedding2 }),
          ]),
          documentId
        );
        expect(mockChunkService.createManyChunks).not.toHaveBeenCalledWith(
          expect.arrayContaining([
            expect.objectContaining({ content: expect.stringContaining('Chunk 1 content') }),
          ]),
          expect.anything()
        );

        // Verify document update still happens
        expect(mockDocumentService.updateDocument).toHaveBeenCalledTimes(1);
      } finally {
        // Restore console.error even when an assertion above throws, so a
        // failure here cannot hide error output in later tests.
        consoleErrorSpy.mockRestore();
      }
    });

    it('should convert HTML to markdown', async () => {
      mockSuccessfulEmbedding(); // Add default success mock
      const documentId = 'test-doc-id';
      const html = `
        <html>
          <head>
            <title>Test Document</title>
            <meta name="package" content="test-package">
            <meta name="version" content="1.0.0">
          </head>
          <body>
            <header>This should be removed</header>
            <main>
              <h1>Main Heading</h1>
              <p>This is a paragraph with <strong>bold</strong> text.</p>
              <pre><code class="language-javascript">
function test() {
  console.log('Hello world');
}
              </code></pre>
              <table>
                <tr>
                  <th>Header 1</th>
                  <th>Header 2</th>
                </tr>
                <tr>
                  <td>Value 1</td>
                  <td>Value 2</td>
                </tr>
              </table>
            </main>
            <footer>This should be removed</footer>
          </body>
        </html>
      `;

      const markdown = await documentProcessorService.processDocument(documentId, html);

      // Verify basic conversion worked
      expect(markdown).toContain('# Main Heading');
      expect(markdown).toContain('This is a paragraph with **bold** text.');
      expect(markdown).toContain('function test()');
      expect(markdown).toContain('| Header 1 | Header 2 |');
      expect(markdown).toContain('| Value 1 | Value 2 |');

      // Verify headers and footers were removed
      expect(markdown).not.toContain('This should be removed');

      // Verify the DocumentService was called to update the document
      expect(mockDocumentService.updateDocument).toHaveBeenCalledWith(
        documentId,
        expect.objectContaining({
          content: expect.any(String),
          metadata: expect.objectContaining({
            title: expect.any(String),
            package: 'test-package',
            version: '1.0.0',
          }),
        })
      );
    });

    it('should handle empty HTML', async () => {
      // No axios call expected for empty HTML, so no mock needed here
      const documentId = 'test-doc-id';
      const html = '';

      const markdown = await documentProcessorService.processDocument(documentId, html);

      expect(markdown).toBe('');

      // Verify the DocumentService was still called
      expect(mockDocumentService.updateDocument).toHaveBeenCalledWith(
        documentId,
        expect.objectContaining({
          content: '',
        })
      );
    });

    it('should extract metadata from HTML', async () => {
      mockSuccessfulEmbedding(); // Add default success mock
      const documentId = 'test-doc-id';
      const html = `
        <html>
          <head>
            <title>Test Document</title>
            <meta name="package" content="test-package">
            <meta name="version" content="1.0.0">
          </head>
          <body>
            <h1>Main Heading</h1>
            <h2>Subheading 1</h2>
            <p>Content for subheading 1.</p>
            <h2>Subheading 2</h2>
            <p>Content for subheading 2.</p>
            <a href="https://example.com">Example Link</a>
          </body>
        </html>
      `;

      await documentProcessorService.processDocument(documentId, html);

      // Verify the DocumentService was called with the correct metadata
      expect(mockDocumentService.updateDocument).toHaveBeenCalledWith(
        documentId,
        expect.objectContaining({
          metadata: expect.objectContaining({
            title: 'Test Document',
            package: 'test-package',
            version: '1.0.0',
            headings: expect.arrayContaining([
              expect.objectContaining({ level: 1, text: 'Main Heading' }),
              expect.objectContaining({ level: 2, text: 'Subheading 1' }),
              expect.objectContaining({ level: 2, text: 'Subheading 2' }),
            ]),
            links: expect.arrayContaining([
              expect.objectContaining({ text: 'Example Link', href: 'https://example.com' }),
            ]),
            type: 'documentation',
          }),
        })
      );
    });

    it('should handle code blocks properly', async () => {
      mockSuccessfulEmbedding(); // Add default success mock
      const documentId = 'test-doc-id';
      const html = `
        <html>
          <body>
            <h1>Code Example</h1>
            <pre><code class="language-javascript">
function hello() {
  return "world";
}
            </code></pre>
            <pre><code class="language-python">
def hello():
    return "world"
            </code></pre>
          </body>
        </html>
      `;

      const markdown = await documentProcessorService.processDocument(documentId, html);

      // Verify code blocks are converted with language tags
      expect(markdown).toMatch(/```javascript\n\s*function hello\(\)/);
      expect(markdown).toMatch(/```python\n\s*def hello\(\):/);

      // Verify metadata extraction
      expect(mockDocumentService.updateDocument).toHaveBeenCalledWith(
        documentId,
        expect.objectContaining({
          metadata: expect.objectContaining({
            codeBlocks: expect.arrayContaining([
              expect.objectContaining({
                language: 'javascript',
              }),
              expect.objectContaining({
                language: 'python',
              }),
            ]),
          }),
        })
      );
    });

    it('should handle tables properly', async () => {
      mockSuccessfulEmbedding(); // Add default success mock
      const documentId = 'test-doc-id';
      const html = `
        <html>
          <body>
            <h1>Table Example</h1>
            <table>
              <tr>
                <th>Name</th>
                <th>Age</th>
                <th>Role</th>
              </tr>
              <tr>
                <td>John</td>
                <td>30</td>
                <td>Developer</td>
              </tr>
              <tr>
                <td>Jane</td>
                <td>28</td>
                <td>Designer</td>
              </tr>
            </table>
          </body>
        </html>
      `;

      const markdown = await documentProcessorService.processDocument(documentId, html);

      // Verify table formatting
      expect(markdown).toContain('| Name | Age | Role |');
      expect(markdown).toContain('| --- | --- | --- |');
      expect(markdown).toContain('| John | 30 | Developer |');
      expect(markdown).toContain('| Jane | 28 | Designer |');

      // Verify metadata extraction
      expect(mockDocumentService.updateDocument).toHaveBeenCalledWith(
        documentId,
        expect.objectContaining({
          metadata: expect.objectContaining({
            tableCount: 1,
          }),
        })
      );
    });

    it('should use fixed-size chunking when configured', async () => {
      const documentId = 'test-doc-id';

      // Snapshot every chunking setting this test overrides so all of them
      // can be put back afterwards, not just the strategy.
      const originalStrategy = config.chunking.strategy;
      const originalFixedChunkSize = config.chunking.fixedChunkSize;
      const originalFixedChunkOverlap = config.chunking.fixedChunkOverlap;

      config.chunking.strategy = 'fixed';
      config.chunking.fixedChunkSize = 15;
      config.chunking.fixedChunkOverlap = 5;

      try {
        const html = `<html><body><p>This is the first part.</p><p>This is the second part.</p></body></html>`;

        // Markdown: "This is the first part.\n\nThis is the second part." (49 chars)
        // With size 15 and overlap 5 the window advances 10 chars per chunk.
        const expectedMarkdown = 'This is the first part.\n\nThis is the second part.';
        const expectedChunk1 = expectedMarkdown.substring(0, 15); // "This is the fir"
        const expectedChunk2 = expectedMarkdown.substring(10, 25); // "e first part.\n\n"
        const expectedChunk3 = expectedMarkdown.substring(20, 35); // "rt.\n\nThis is th"
        const expectedChunk4 = expectedMarkdown.substring(30, 45); // "is the second p"
        const expectedChunk5 = expectedMarkdown.substring(40, 55); // "ond part." (clamped at end of string)

        const dummyEmbedding = Array(1536).fill(0.3);
        mockSuccessfulEmbedding(dummyEmbedding);

        await documentProcessorService.processDocument(documentId, html);

        // Verify createManyChunks was called with fixed-size chunks
        expect(mockChunkService.createManyChunks).toHaveBeenCalledTimes(1);
        const passedChunksArgs = mockChunkService.createManyChunks.mock.calls[0][0] as Array<{
          content: string;
          metadata: any;
        }>; // Type assertion
        expect(passedChunksArgs.length).toBe(5);
        expect(passedChunksArgs[0].content).toBe(expectedChunk1);
        expect(passedChunksArgs[1].content).toBe(expectedChunk2);
        expect(passedChunksArgs[2].content).toBe(expectedChunk3);
        expect(passedChunksArgs[3].content).toBe(expectedChunk4);
        expect(passedChunksArgs[4].content).toBe(expectedChunk5);

        // Safely access metadata
        expect(passedChunksArgs[0]?.metadata?.chunkIndex).toBe(0);
        expect(passedChunksArgs[1]?.metadata?.chunkIndex).toBe(1);
        expect(passedChunksArgs[0]?.metadata?.start).toBe(0);
        expect(passedChunksArgs[0]?.metadata?.end).toBe(15);
        expect(passedChunksArgs[1]?.metadata?.start).toBe(10); // 15 - 5
        expect(passedChunksArgs[1]?.metadata?.end).toBe(25);
      } finally {
        // Restore the shared config even if an assertion failed, so the
        // overrides never leak into other tests.
        config.chunking.strategy = originalStrategy;
        config.chunking.fixedChunkSize = originalFixedChunkSize;
        config.chunking.fixedChunkOverlap = originalFixedChunkOverlap;
      }
    });
  });
});

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/visheshd/docmcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.