Scientific Paper Harvester MCP Server

fetch-pdf-content.test.ts•18.1 KiB

import { describe, it, expect, vi, beforeEach } from 'vitest';
import nock from 'nock';
import { fetchPdfContent, fetchPdfContentSchema } from '../../src/tools/fetch-pdf-content.js';

/**
 * Tests for the `fetchPdfContent` tool and `fetchPdfContentSchema`.
 *
 * Both the PdfExtractor module and the constants module are replaced with
 * mocks, so these tests exercise only the tool's own behavior: input
 * validation, the options handed to the PdfExtractor constructor, the mapping
 * of extractor results onto the tool response, and the callbacks passed to
 * `extractText`.
 */

// Mock the PdfExtractor since we want to test the tool logic, not the extraction implementation
vi.mock('../../src/extractors/pdf-extractor.js', () => ({
  PdfExtractor: vi.fn().mockImplementation(() => ({
    extractText: vi.fn()
  }))
}));

// Fixed stand-in for the default extraction config; the tests only assert that
// *some* object is passed as the first constructor argument.
vi.mock('../../src/config/constants.js', () => ({
  DEFAULT_TEXT_EXTRACTION_CONFIG: {
    maxSizeMB: 6,
    timeoutMs: 30000
  }
}));

describe('fetchPdfContent', () => {
  // Deliberately loose: this is a hand-rolled stub exposing only `extractText`.
  let mockPdfExtractor: any;

  beforeEach(async () => {
    vi.clearAllMocks();
    nock.cleanAll();

    // Reset the mock implementation so every test gets a fresh `extractText` stub
    const { PdfExtractor } = await import('../../src/extractors/pdf-extractor.js');
    mockPdfExtractor = {
      extractText: vi.fn()
    };
    (PdfExtractor as any).mockImplementation(() => mockPdfExtractor);
  });

  describe('Input validation', () => {
    it('should validate URL format', async () => {
      const result1 = await fetchPdfContent({
        url: 'not-a-valid-url'
      });
      expect(result1.success).toBe(false);
      expect(result1.error).toContain('Invalid url');

      // FTP URLs are technically valid URLs but will fail during extraction
      // Mock extractor to return error for unsupported protocols
      mockPdfExtractor.extractText.mockRejectedValue(new Error('Unsupported protocol: ftp'));

      const result2 = await fetchPdfContent({
        url: 'ftp://example.com/file.pdf'
      });
      expect(result2.success).toBe(false);
      expect(result2.error).toContain('Unsupported protocol');
    });

    it('should validate maxSizeMB parameter', async () => {
      const result1 = await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        maxSizeMB: 0
      });
      expect(result1.success).toBe(false);
      expect(result1.error).toContain('Number must be greater than or equal to 1');

      const result2 = await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        maxSizeMB: 101
      });
      expect(result2.success).toBe(false);
      expect(result2.error).toContain('Number must be less than or equal to 100');

      const result3 = await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        maxSizeMB: -5
      });
      expect(result3.success).toBe(false);
      expect(result3.error).toContain('Number must be greater than or equal to 1');
    });

    it('should validate maxPages parameter', async () => {
      const result1 = await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        maxPages: 0
      });
      expect(result1.success).toBe(false);
      expect(result1.error).toContain('Number must be greater than or equal to 1');

      const result2 = await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        maxPages: 501
      });
      expect(result2.success).toBe(false);
      expect(result2.error).toContain('Number must be less than or equal to 500');

      const result3 = await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        maxPages: -10
      });
      expect(result3.success).toBe(false);
      expect(result3.error).toContain('Number must be greater than or equal to 1');
    });

    it('should validate timeout parameter', async () => {
      const result1 = await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        timeout: 5
      });
      expect(result1.success).toBe(false);
      expect(result1.error).toContain('Number must be greater than or equal to 10');

      const result2 = await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        timeout: 301
      });
      expect(result2.success).toBe(false);
      expect(result2.error).toContain('Number must be less than or equal to 300');

      const result3 = await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        timeout: -30
      });
      expect(result3.success).toBe(false);
      expect(result3.error).toContain('Number must be greater than or equal to 10');
    });

    it('should use default values when parameters are not provided', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'Default parameters test',
        metadata: {
          pageCount: 5,
          pdfSize: 2.0,
          url: 'https://arxiv.org/pdf/test.pdf'
        }
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://arxiv.org/pdf/test.pdf'
      });

      expect(result.success).toBe(true);

      // Verify PdfExtractor was called with default values
      const { PdfExtractor } = await import('../../src/extractors/pdf-extractor.js');
      expect(PdfExtractor).toHaveBeenCalledWith(
        expect.any(Object), // DEFAULT_TEXT_EXTRACTION_CONFIG
        {
          maxSizeMB: 50,
          timeoutMs: 120000, // 120 seconds * 1000
          maxPages: 100,
          requireConfirmation: false,
          interactive: false
        }
      );
    });
  });

  describe('Successful PDF extraction', () => {
    it('should extract text from a PDF successfully', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'This is the extracted PDF content. It contains important research findings.',
        metadata: {
          pageCount: 10,
          pdfSize: 1.5,
          extractionTime: 2345,
          url: 'https://arxiv.org/pdf/2305.11176.pdf'
        },
        truncated: false
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://arxiv.org/pdf/2305.11176.pdf',
        maxSizeMB: 10,
        maxPages: 50,
        timeout: 60,
        confirmLargeFiles: false
      });

      expect(result.success).toBe(true);
      expect(result.text).toBe('This is the extracted PDF content. It contains important research findings.');
      // The extractor's `pdfSize` (MB) is expected to surface as both `sizeMB` and `sizeBytes`
      expect(result.metadata).toEqual({
        pageCount: 10,
        sizeBytes: Math.round(1.5 * 1024 * 1024),
        sizeMB: 1.5,
        extractionTime: 2345,
        extractionSource: 'pdf',
        textTruncated: false,
        contextWarning: undefined
      });

      // Verify PdfExtractor was called with correct parameters
      expect(mockPdfExtractor.extractText).toHaveBeenCalledWith(
        'https://arxiv.org/pdf/2305.11176.pdf',
        expect.any(Function), // progress callback
        expect.any(Function) // confirmation callback
      );
    });

    it('should handle large PDFs with context warning', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'A'.repeat(100000), // Large text content
        metadata: {
          pageCount: 200,
          pdfSize: 25.0,
          contextWarning: 'Warning: This is a large PDF that may impact your context window. Consider using smaller page limits.',
          url: 'https://example.com/large-paper.pdf'
        },
        truncated: true
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://example.com/large-paper.pdf',
        maxSizeMB: 50,
        maxPages: 200,
        timeout: 180
      });

      expect(result.success).toBe(true);
      expect(result.text).toBe('A'.repeat(100000));
      expect(result.metadata?.contextWarning).toContain('large PDF');
      expect(result.metadata?.pageCount).toBe(200);
      expect(result.metadata?.textTruncated).toBe(true);
    });

    it('should handle missing optional metadata gracefully', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'Minimal metadata test',
        metadata: {
          url: 'https://example.com/minimal.pdf'
        },
        truncated: false
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://example.com/minimal.pdf'
      });

      expect(result.success).toBe(true);
      expect(result.text).toBe('Minimal metadata test');
      expect(result.metadata).toEqual({
        pageCount: undefined,
        sizeBytes: undefined,
        sizeMB: undefined,
        extractionTime: undefined,
        extractionSource: 'pdf',
        textTruncated: false,
        contextWarning: undefined
      });
    });
  });

  describe('User cancellation scenarios', () => {
    it('should handle user cancellation gracefully', async () => {
      const mockResult = {
        extractionSuccess: false,
        metadata: {
          userCancelled: true,
          reason: 'PDF size (75MB) exceeds the configured limit (50MB)',
          sizeMB: 75.0,
          url: 'https://example.com/huge-paper.pdf'
        }
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://example.com/huge-paper.pdf',
        maxSizeMB: 50
      });

      expect(result.success).toBe(false);
      expect(result.cancelled).toBe(true);
      expect(result.error).toContain('PDF size (75MB) exceeds the configured limit');
    });

    it('should handle cancellation without specific reason', async () => {
      const mockResult = {
        extractionSuccess: false,
        metadata: {
          userCancelled: true,
          url: 'https://example.com/cancelled.pdf'
        }
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://example.com/cancelled.pdf'
      });

      expect(result.success).toBe(false);
      expect(result.cancelled).toBe(true);
      expect(result.error).toBe('PDF extraction cancelled');
    });
  });

  describe('Extraction failures', () => {
    it('should handle extraction failures without cancellation', async () => {
      const mockResult = {
        extractionSuccess: false,
        metadata: {
          userCancelled: false,
          error: 'Failed to parse PDF: corrupted file',
          url: 'https://example.com/corrupted.pdf'
        }
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://example.com/corrupted.pdf'
      });

      expect(result.success).toBe(false);
      expect(result.cancelled).toBeUndefined();
      // The generic message is expected here, not the `metadata.error` text
      expect(result.error).toBe('PDF extraction failed');
    });

    it('should handle network errors', async () => {
      mockPdfExtractor.extractText.mockRejectedValue(new Error('Network timeout: Unable to download PDF'));

      const result = await fetchPdfContent({
        url: 'https://example.com/unreachable.pdf'
      });

      expect(result.success).toBe(false);
      expect(result.error).toBe('Network timeout: Unable to download PDF');
    });

    it('should handle parsing errors', async () => {
      mockPdfExtractor.extractText.mockRejectedValue(new Error('PDF parsing failed: Invalid PDF structure'));

      const result = await fetchPdfContent({
        url: 'https://example.com/invalid.pdf'
      });

      expect(result.success).toBe(false);
      expect(result.error).toBe('PDF parsing failed: Invalid PDF structure');
    });

    it('should handle unknown errors', async () => {
      // Rejecting with a non-Error value exercises the fallback error message
      mockPdfExtractor.extractText.mockRejectedValue('Some unknown error');

      const result = await fetchPdfContent({
        url: 'https://example.com/error.pdf'
      });

      expect(result.success).toBe(false);
      expect(result.error).toBe('Unknown error during PDF extraction');
    });
  });

  describe('Configuration handling', () => {
    it('should pass correct configuration to PdfExtractor', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'Config test',
        metadata: {
          url: 'https://example.com/test.pdf'
        }
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        maxSizeMB: 25,
        maxPages: 75,
        timeout: 90,
        confirmLargeFiles: true
      });

      const { PdfExtractor } = await import('../../src/extractors/pdf-extractor.js');
      expect(PdfExtractor).toHaveBeenCalledWith(
        expect.any(Object), // DEFAULT_TEXT_EXTRACTION_CONFIG
        {
          maxSizeMB: 25,
          timeoutMs: 90000, // timeout converted to milliseconds
          maxPages: 75,
          requireConfirmation: true,
          interactive: false // Always false for MCP mode
        }
      );
    });

    it('should always set interactive to false for MCP mode', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'Interactive test',
        metadata: {
          url: 'https://example.com/test.pdf'
        }
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        confirmLargeFiles: true
      });

      const { PdfExtractor } = await import('../../src/extractors/pdf-extractor.js');
      expect(PdfExtractor).toHaveBeenCalledWith(
        expect.any(Object),
        expect.objectContaining({
          interactive: false
        })
      );
    });
  });

  describe('Progress and confirmation callbacks', () => {
    it('should provide progress callback to extractor', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'Progress test',
        metadata: {
          url: 'https://example.com/test.pdf'
        }
      };

      // Captured from the second argument the tool passes to `extractText`
      let progressCallback: any;
      mockPdfExtractor.extractText.mockImplementation((_url: unknown, progress: unknown) => {
        progressCallback = progress;
        return Promise.resolve(mockResult);
      });

      await fetchPdfContent({
        url: 'https://example.com/test.pdf'
      });

      expect(typeof progressCallback).toBe('function');

      // Test that progress callback doesn't throw
      expect(() => progressCallback({
        phase: 'downloading',
        progress: 50,
        message: 'Downloading PDF...'
      })).not.toThrow();
    });

    it('should provide auto-confirmation callback that respects size limits', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'Confirmation test',
        metadata: {
          url: 'https://example.com/test.pdf'
        }
      };

      // Captured from the third argument the tool passes to `extractText`
      let confirmationCallback: any;
      mockPdfExtractor.extractText.mockImplementation(
        (_url: unknown, _progress: unknown, confirm: unknown) => {
          confirmationCallback = confirm;
          return Promise.resolve(mockResult);
        }
      );

      await fetchPdfContent({
        url: 'https://example.com/test.pdf',
        maxSizeMB: 10
      });

      expect(typeof confirmationCallback).toBe('function');

      // Test auto-approval for small files
      const smallFileResult = await confirmationCallback({
        url: 'https://example.com/test.pdf',
        sizeMB: 5.0
      });
      expect(smallFileResult).toBe(true);

      // Test auto-rejection for large files
      const largeFileResult = await confirmationCallback({
        url: 'https://example.com/test.pdf',
        sizeMB: 15.0
      });
      expect(largeFileResult).toBe(false);
    });
  });

  describe('Different PDF sources', () => {
    it('should handle arXiv PDFs', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'arXiv paper content about machine learning and neural networks.',
        metadata: {
          pageCount: 12,
          pdfSize: 3.2,
          url: 'https://arxiv.org/pdf/2305.11176.pdf'
        }
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://arxiv.org/pdf/2305.11176.pdf'
      });

      expect(result.success).toBe(true);
      expect(result.text).toContain('machine learning');
    });

    it('should handle research institution PDFs', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'Research paper from university repository about quantum computing.',
        metadata: {
          pageCount: 8,
          pdfSize: 2.1,
          url: 'https://university.edu/papers/quantum-research.pdf'
        }
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://university.edu/papers/quantum-research.pdf'
      });

      expect(result.success).toBe(true);
      expect(result.text).toContain('quantum computing');
    });

    it('should handle journal publisher PDFs', async () => {
      const mockResult = {
        extractionSuccess: true,
        text: 'Published journal article about climate change research and environmental science.',
        metadata: {
          pageCount: 15,
          pdfSize: 4.8,
          url: 'https://journal.com/articles/climate-research.pdf'
        }
      };

      mockPdfExtractor.extractText.mockResolvedValue(mockResult);

      const result = await fetchPdfContent({
        url: 'https://journal.com/articles/climate-research.pdf'
      });

      expect(result.success).toBe(true);
      expect(result.text).toContain('climate change');
    });
  });

  describe('Schema validation', () => {
    it('should validate input schema correctly', () => {
      // Valid input should parse
      const validInput = {
        url: 'https://example.com/test.pdf',
        maxSizeMB: 50,
        maxPages: 100,
        timeout: 120,
        confirmLargeFiles: false
      };

      expect(() => fetchPdfContentSchema.parse(validInput)).not.toThrow();

      // Invalid URL should fail
      expect(() => fetchPdfContentSchema.parse({
        url: 'not-a-url'
      })).toThrow();

      // Out of range values should fail
      expect(() => fetchPdfContentSchema.parse({
        url: 'https://example.com/test.pdf',
        maxSizeMB: 101
      })).toThrow();

      expect(() => fetchPdfContentSchema.parse({
        url: 'https://example.com/test.pdf',
        timeout: 5
      })).toThrow();
    });

    it('should apply default values correctly', () => {
      const input = {
        url: 'https://example.com/test.pdf'
      };

      const parsed = fetchPdfContentSchema.parse(input);

      expect(parsed).toEqual({
        url: 'https://example.com/test.pdf',
        maxSizeMB: 50,
        maxPages: 100,
        timeout: 120,
        confirmLargeFiles: false
      });
    });
  });
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/benedict2310/Scientific-Papers-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

fetch-pdf-content.test.ts•18.1 KiB