Skip to main content
Glama
PSPDFKit

Nutrient Document Engine MCP Server

by PSPDFKit
extractText.test.ts (11.5 kB)
import { describe, it, expect, vi, beforeEach } from 'vitest';
import { extractText } from '../../../src/tools/extraction/extractText.js';
import { TextLine } from '../../../src/api/DocumentEngineSchema.js';
import { createMockClient, MockedDocumentEngineClient } from '../../utils/mockTypes.js';

// Replace the logger with inert spies so the suite produces no console output.
vi.mock('../../../src/utils/Logger.js', () => ({
  logger: { error: vi.fn(), info: vi.fn(), debug: vi.fn(), warn: vi.fn() },
}));

/** Document id shared by every test case. */
const DOCUMENT_ID = 'doc_123';

/** Arguments common to every call: no coordinates, no OCR. */
const plainArgs = {
  document_fingerprint: { document_id: DOCUMENT_ID },
  include_coordinates: false,
  ocr_enabled: false,
};

/** Builds the nested payload shape the tests feed to the `fetch-document-info` mock. */
const docInfoResponse = (pageCount: number) => ({ data: { data: { pageCount } } });

/** Builds one text line with the fixed geometry used across the fixtures. */
const textLine = (contents: string, top = 10) => ({
  contents,
  left: 10,
  top,
  width: 100,
  height: 20,
});

/** Wraps text lines in the payload shape the tests feed to the `get-document-page-text` mock. */
const pageTextResponse = (...lines: ReturnType<typeof textLine>[]) => ({
  data: { textLines: lines as TextLine[] },
});

/** Two-line fixture for "page N" (1-based label); each line has five words. */
const twoLinePage = (pageLabel: number) =>
  pageTextResponse(
    textLine(`Line 1 on page ${pageLabel}`),
    textLine(`Line 2 on page ${pageLabel}`, 40)
  );

describe('extractText', () => {
  let mockClient: MockedDocumentEngineClient;

  beforeEach(() => {
    mockClient = createMockClient();
  });

  /** Asserts that the page-text endpoint was queried (without OCR) for the given zero-based index. */
  const expectPageRequested = (pageIndex: number): void => {
    expect(mockClient['get-document-page-text']).toHaveBeenCalledWith({
      documentId: DOCUMENT_ID,
      pageIndex,
    });
  };

  describe('successful scenarios', () => {
    it('should extract text from all pages when no page range is provided', async () => {
      // Arrange: a two-page document, each page answering with its own lines.
      const responsesByIndex = new Map([
        [0, twoLinePage(1)],
        [1, twoLinePage(2)],
      ]);
      mockClient['fetch-document-info'].mockResolvedValue(docInfoResponse(2));
      mockClient['get-document-page-text'].mockImplementation(
        ({ pageIndex }: { pageIndex: number }) => {
          const response = responsesByIndex.get(pageIndex);
          return response
            ? Promise.resolve(response)
            : Promise.reject(new Error('Invalid page index'));
        }
      );

      // Act
      const result = await extractText(mockClient, { ...plainArgs });

      // Assert: document info looked up once by id, then both pages fetched.
      expect(mockClient['fetch-document-info']).toHaveBeenCalledWith({
        documentId: DOCUMENT_ID,
      });
      expect(mockClient['get-document-page-text']).toHaveBeenCalledTimes(2);
      expectPageRequested(0);
      expectPageRequested(1);

      // Assert: markdown summary and per-page sections.
      const { markdown } = result;
      expect(markdown).toContain('# Text Extraction');
      expect(markdown).toContain('**Total Pages:** 2');
      expect(markdown).toContain('**Total Words:** 20');
      expect(markdown).toContain('**OCR Applied:** No');
      expect(markdown).toContain('## Page 1 (10 words)');
      expect(markdown).toContain('Line 1 on page 1 Line 2 on page 1');
      expect(markdown).toContain('## Page 2 (10 words)');
      expect(markdown).toContain('Line 1 on page 2 Line 2 on page 2');
      expect(markdown).not.toContain('### Coordinates for Page');
    });

    it('should extract text from specified page range', async () => {
      // Arrange: three-page document where only index 1 is expected to be requested.
      const secondPage = twoLinePage(2);
      mockClient['fetch-document-info'].mockResolvedValue(docInfoResponse(3));
      mockClient['get-document-page-text'].mockImplementation(
        ({ pageIndex }: { pageIndex: number }) =>
          pageIndex === 1
            ? Promise.resolve(secondPage)
            : Promise.reject(new Error('Invalid page index'))
      );

      // Act
      const result = await extractText(mockClient, {
        ...plainArgs,
        page_range: { start: 1, end: 1 },
      });

      // Assert: exactly one fetch, for index 1.
      expect(mockClient['get-document-page-text']).toHaveBeenCalledTimes(1);
      expectPageRequested(1);

      // Assert: only the "Page 2" section is rendered.
      expect(result.markdown).toContain('## Page 2 (10 words)');
      expect(result.markdown).not.toContain('## Page 1');
      expect(result.markdown).not.toContain('## Page 3');
    });

    it('should include coordinates when requested', async () => {
      // Arrange: single page, single line.
      mockClient['fetch-document-info'].mockResolvedValue(docInfoResponse(1));
      mockClient['get-document-page-text'].mockResolvedValue(pageTextResponse(textLine('Line 1')));

      // Act
      const result = await extractText(mockClient, {
        ...plainArgs,
        include_coordinates: true,
      });

      // Assert: a JSON coordinates section accompanies the page.
      expect(result.markdown).toContain('### Coordinates for Page 1');
      expect(result.markdown).toContain('```json');
      expect(result.markdown).toContain('"contents": "Line 1"');
      expect(result.markdown).toContain(
        '"boundingBox": "(left:10, top:10, width:100, height:20)"'
      );
    });

    it('should enable OCR when requested', async () => {
      // Arrange: single page, single line.
      mockClient['fetch-document-info'].mockResolvedValue(docInfoResponse(1));
      mockClient['get-document-page-text'].mockResolvedValue(
        pageTextResponse(textLine('OCR extracted text'))
      );

      // Act
      const result = await extractText(mockClient, {
        ...plainArgs,
        ocr_enabled: true,
      });

      // Assert: the OCR flag is forwarded as the string 'true'.
      expect(mockClient['get-document-page-text']).toHaveBeenCalledWith({
        documentId: DOCUMENT_ID,
        pageIndex: 0,
        ocr: 'true',
      });

      // Assert: the summary reports OCR as applied.
      expect(result.markdown).toContain('**OCR Applied:** Yes');
    });

    it('should handle page ranges with start and end', async () => {
      // Arrange: five-page document, every page answering with the same content.
      mockClient['fetch-document-info'].mockResolvedValue(docInfoResponse(5));
      mockClient['get-document-page-text'].mockResolvedValue(
        pageTextResponse(textLine('Page content'))
      );

      // Act: request indices 0 through 3 (inclusive).
      const result = await extractText(mockClient, {
        ...plainArgs,
        page_range: { start: 0, end: 3 },
      });

      // Assert: four fetches, one per index in the range.
      expect(mockClient['get-document-page-text']).toHaveBeenCalledTimes(4);
      for (const pageIndex of [0, 1, 2, 3]) {
        expectPageRequested(pageIndex);
      }

      // Assert: sections for pages 1-4 are present, page 5 is absent.
      for (const pageLabel of [1, 2, 3, 4]) {
        expect(result.markdown).toContain(`## Page ${pageLabel}`);
      }
      expect(result.markdown).not.toContain('## Page 5');
    });
  });

  describe('error scenarios', () => {
    it('should handle API errors gracefully', async () => {
      // Arrange: the document info lookup rejects.
      const errorMessage = 'API Error: Document not found';
      mockClient['fetch-document-info'].mockRejectedValue(new Error(errorMessage));

      // Act
      const result = await extractText(mockClient, { ...plainArgs });

      // Assert: the failure is reported in the markdown rather than thrown.
      expect(result.markdown).toContain('# Error Extracting Text');
      expect(result.markdown).toContain(
        `An error occurred while trying to extract text: ${errorMessage}`
      );
    });

    it('should handle invalid page ranges', async () => {
      // Arrange: a two-page document.
      mockClient['fetch-document-info'].mockResolvedValue(docInfoResponse(2));
      mockClient['get-document-page-text'].mockResolvedValue({
        data: { textLines: [] },
      });

      // Act: request a range that lies entirely beyond the last page.
      const result = await extractText(mockClient, {
        ...plainArgs,
        page_range: { start: 10, end: 20 }, // Out of bounds
      });

      // Assert: an out-of-bounds error is reported in the markdown.
      expect(result.markdown).toContain('# Error Extracting Text');
      expect(result.markdown).toContain('out of bounds');
    });
  });
});

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PSPDFKit/nutrient-document-engine-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.