Skip to main content
Glama
binary-handler.test.ts17.7 kB
/** * Unified Binary Handler Tests * * These tests define the contract for extractTextFromResponse(). * The function handles content-type detection, PDF extraction, HTML parsing, * and plain text handling with fallback strategies. * * Feature Requirements: * - Detect content type from Response headers * - Fallback to URL extension when content-type is missing/generic * - Extract text from PDF buffers * - Extract text from HTML with Jina fallback for JS-rendered content * - Handle plain text directly * - Provide meaningful errors with URL and content-type context * * Integration Points: * - src/tools/binary-handler.ts - Main extraction logic */ import { describe, it, expect } from 'vitest'; // Import the function and types import { extractTextFromResponse, type PdfParseFunction } from '../src/tools/binary-handler.js'; // ============================================================================ // Test Helpers // ============================================================================ /** * Create a mock Response object for testing * UBH-001: Mock Response factory helper */ function createMockResponse(options: { contentType?: string; body: string | Buffer; status?: number; url?: string; }): Response { const { contentType, body, status = 200, url = 'https://example.com/test' } = options; const headers = new Headers(); if (contentType) { headers.set('Content-Type', contentType); } // Convert body to appropriate format for Response constructor // Buffer needs to be converted to Uint8Array for Web API compatibility const bodyContent: BodyInit = Buffer.isBuffer(body) ? new Uint8Array(body) : body; // Create a mock Response const response = new Response(bodyContent, { status, headers }); // Override URL property (Response.url is read-only, so we need to mock it) Object.defineProperty(response, 'url', { value: url, writable: false }); return response; } /** * Create a mock PDF parser that returns valid text */ function createMockPdfParser(text: string = 'Hello World - This is test content from a valid PDF document.'): PdfParseFunction { return async (_buffer: Buffer) => ({ text, numpages: 1 }); } /** * Create a mock PDF parser that throws an error (corrupted PDF) */ function createCorruptedPdfParser(): PdfParseFunction { return async (_buffer: Buffer) => { throw new Error('Invalid PDF structure'); }; } /** * Create a mock PDF parser that returns empty text (image-only PDF) */ function createImageOnlyPdfParser(): PdfParseFunction { return async (_buffer: Buffer) => ({ text: '', numpages: 1 }); } /** * Create a minimal valid PDF buffer for testing * This is a mock PDF - the actual parsing is handled by injected mock parser */ function createValidPdfBuffer(): Buffer { // Simple PDF header to identify as PDF - actual parsing uses injected mock return Buffer.from('%PDF-1.4\n%Test PDF for mocking\n%%EOF\n', 'utf-8'); } /** * Create a corrupted PDF buffer (invalid structure) */ function createCorruptedPdfBuffer(): Buffer { return Buffer.from('%PDF-1.4\n\nCORRUPTED DATA - NOT A VALID PDF STRUCTURE\n%%EOF', 'utf-8'); } /** * Create an image-only PDF buffer (no extractable text) * This simulates a scanned document with only images */ function createImageOnlyPdfBuffer(): Buffer { return Buffer.from('%PDF-1.4\n%Image-only PDF\n%%EOF\n', 'utf-8'); } // ============================================================================ // Content-Type Detection Tests (UBH-100 to UBH-103) // ============================================================================ describe('Content-Type Detection', () => { it('UBH-100: should use PDF extractor when content-type is application/pdf', async () => { const pdfBuffer = createValidPdfBuffer(); const response = createMockResponse({ contentType: 'application/pdf', body: pdfBuffer, url: 'https://example.com/document.pdf' }); const result = await extractTextFromResponse({ url: 'https://example.com/document.pdf', response, _pdfParser: createMockPdfParser() }); expect(result.extractorUsed).toBe('pdf'); expect(result.contentType).toBe('application/pdf'); expect(result.text).toBeDefined(); }); it('UBH-101: should use HTML extractor when content-type is text/html', async () => { const htmlContent = `<!DOCTYPE html> <html> <head><title>Test Page</title></head> <body> <h1>Hello World</h1> <p>This is a test paragraph with meaningful content that should be extracted properly.</p> </body> </html>`; const response = createMockResponse({ contentType: 'text/html; charset=utf-8', body: htmlContent, url: 'https://example.com/page.html' }); const result = await extractTextFromResponse({ url: 'https://example.com/page.html', response }); expect(result.extractorUsed).toBe('html'); expect(result.contentType).toContain('text/html'); expect(result.text).toContain('Hello World'); expect(result.text).not.toContain('<h1>'); }); it('UBH-102: should use plain text extractor when content-type is text/plain', async () => { const plainText = 'This is plain text content that should be returned as-is.'; const response = createMockResponse({ contentType: 'text/plain', body: plainText, url: 'https://example.com/readme.txt' }); const result = await extractTextFromResponse({ url: 'https://example.com/readme.txt', response }); expect(result.extractorUsed).toBe('plain'); expect(result.contentType).toBe('text/plain'); expect(result.text).toBe(plainText); }); it('UBH-103: should throw descriptive error for unknown binary content-type', async () => { const binaryData = Buffer.from([0x00, 0x01, 0x02, 0xFF, 0xFE, 0xFD]); const response = createMockResponse({ contentType: 'application/x-unknown-binary', body: binaryData, url: 'https://example.com/file.bin' }); await expect(extractTextFromResponse({ url: 'https://example.com/file.bin', response })).rejects.toThrow(/unsupported|unknown|cannot extract/i); }); }); // ============================================================================ // URL Extension Fallback Tests (UBH-110 to UBH-112) // ============================================================================ describe('URL Extension Fallback', () => { it('UBH-110: should use PDF extractor when content-type is missing but URL ends with .pdf', async () => { const pdfBuffer = createValidPdfBuffer(); const response = createMockResponse({ contentType: undefined, // No content-type header body: pdfBuffer, url: 'https://example.com/document.pdf' }); const result = await extractTextFromResponse({ url: 'https://example.com/document.pdf', response, _pdfParser: createMockPdfParser() }); expect(result.extractorUsed).toBe('pdf'); expect(result.text).toBeDefined(); }); it('UBH-111: should use plain text extractor when content-type is missing but URL ends with .txt', async () => { const textContent = 'Plain text file content.'; const response = createMockResponse({ contentType: undefined, // No content-type header body: textContent, url: 'https://example.com/notes.txt' }); const result = await extractTextFromResponse({ url: 'https://example.com/notes.txt', response }); expect(result.extractorUsed).toBe('plain'); expect(result.text).toBe(textContent); }); it('UBH-112: should use PDF extractor when content-type is application/octet-stream but URL ends with .pdf', async () => { const pdfBuffer = createValidPdfBuffer(); const response = createMockResponse({ contentType: 'application/octet-stream', // Generic binary body: pdfBuffer, url: 'https://example.com/document.pdf' }); const result = await extractTextFromResponse({ url: 'https://example.com/document.pdf', response, _pdfParser: createMockPdfParser() }); expect(result.extractorUsed).toBe('pdf'); expect(result.text).toBeDefined(); }); }); // ============================================================================ // PDF Extraction Tests (UBH-120 to UBH-122) // ============================================================================ describe('PDF Extraction', () => { it('UBH-120: should extract text from valid PDF buffer', async () => { const pdfBuffer = createValidPdfBuffer(); const response = createMockResponse({ contentType: 'application/pdf', body: pdfBuffer, url: 'https://example.com/document.pdf' }); const result = await extractTextFromResponse({ url: 'https://example.com/document.pdf', response, _pdfParser: createMockPdfParser('Hello World - This is test content from a valid PDF document.') }); expect(result.extractorUsed).toBe('pdf'); expect(result.text.length).toBeGreaterThan(0); // The mock PDF returns "Hello World" expect(result.text).toContain('Hello'); }); it('UBH-121: should throw meaningful error for corrupted PDF', async () => { const corruptedPdf = createCorruptedPdfBuffer(); const response = createMockResponse({ contentType: 'application/pdf', body: corruptedPdf, url: 'https://example.com/corrupted.pdf' }); await expect(extractTextFromResponse({ url: 'https://example.com/corrupted.pdf', response, _pdfParser: createCorruptedPdfParser() })).rejects.toThrow(/corrupt|invalid|parse|failed/i); }); it('UBH-122: should throw insufficient text error for image-only PDF', async () => { const imageOnlyPdf = createImageOnlyPdfBuffer(); const response = createMockResponse({ contentType: 'application/pdf', body: imageOnlyPdf, url: 'https://example.com/scanned.pdf' }); await expect(extractTextFromResponse({ url: 'https://example.com/scanned.pdf', response, _pdfParser: createImageOnlyPdfParser() })).rejects.toThrow(/insufficient|no text|empty|ocr/i); }); }); // ============================================================================ // HTML Extraction Tests (UBH-130 to UBH-132) // ============================================================================ describe('HTML Extraction', () => { it('UBH-130: should extract text without HTML tags from valid HTML', async () => { const htmlContent = `<!DOCTYPE html> <html> <head> <title>Test Document</title> <script>console.log("ignored");</script> <style>body { color: red; }</style> </head> <body> <nav>Navigation Menu</nav> <main> <h1>Main Heading</h1> <p>This is a paragraph with <strong>bold text</strong> and <em>italic text</em>.</p> <p>Another paragraph with sufficient content to pass extraction thresholds and provide meaningful text.</p> </main> <footer>Footer content</footer> </body> </html>`; const response = createMockResponse({ contentType: 'text/html', body: htmlContent, url: 'https://example.com/article.html' }); const result = await extractTextFromResponse({ url: 'https://example.com/article.html', response }); expect(result.extractorUsed).toBe('html'); expect(result.text).toContain('Main Heading'); expect(result.text).toContain('paragraph'); // Should not contain HTML tags expect(result.text).not.toMatch(/<[^>]+>/); // Should not contain script content expect(result.text).not.toContain('console.log'); }); it('UBH-131: should trigger Jina fallback for JavaScript-rendered shell HTML', async () => { // This represents a React/Vue/Angular app shell that has minimal content const jsShellHtml = `<!DOCTYPE html> <html> <head> <title>App</title> </head> <body> <div id="root"></div> <noscript>You need to enable JavaScript to run this app.</noscript> <script src="/static/js/main.chunk.js"></script> </body> </html>`; const response = createMockResponse({ contentType: 'text/html', body: jsShellHtml, url: 'https://example.com/app' }); const result = await extractTextFromResponse({ url: 'https://example.com/app', response }); // When HTML has minimal content, should fall back to Jina expect(result.extractorUsed).toBe('jina'); }); it('UBH-132: should trigger Jina fallback when extracted HTML text is less than 200 characters', async () => { const minimalHtml = `<!DOCTYPE html> <html> <head><title>Page</title></head> <body><p>Short.</p></body> </html>`; const response = createMockResponse({ contentType: 'text/html', body: minimalHtml, url: 'https://example.com/short-page.html' }); const result = await extractTextFromResponse({ url: 'https://example.com/short-page.html', response }); // When extracted text is too short, should fall back to Jina expect(result.extractorUsed).toBe('jina'); }); }); // ============================================================================ // Error Handling Tests (UBH-140 to UBH-141) // ============================================================================ describe('Error Handling', () => { it('UBH-140: should throw error when content exceeds size limit', async () => { // Create a large buffer that exceeds the default size limit const largeBuffer = Buffer.alloc(50 * 1024 * 1024, 'x'); // 50MB const response = createMockResponse({ contentType: 'text/plain', body: largeBuffer, url: 'https://example.com/huge-file.txt' }); await expect(extractTextFromResponse({ url: 'https://example.com/huge-file.txt', response, maxSizeBytes: 10 * 1024 * 1024 // 10MB limit })).rejects.toThrow(/size|limit|too large|exceeds/i); }); it('UBH-141: should include URL and content-type in error messages', async () => { const binaryData = Buffer.from([0x00, 0x01, 0x02, 0xFF]); const testUrl = 'https://example.com/mystery-file.xyz'; const testContentType = 'application/x-mystery'; const response = createMockResponse({ contentType: testContentType, body: binaryData, url: testUrl }); try { await extractTextFromResponse({ url: testUrl, response }); // Should not reach here expect(true).toBe(false); } catch (error) { const errorMessage = (error as Error).message; // Error should include both URL and content-type for debugging expect(errorMessage).toContain(testUrl); expect(errorMessage).toContain(testContentType); } }); }); // ============================================================================ // Additional Edge Case Tests // ============================================================================ describe('Edge Cases', () => { it('should handle Response with empty body', async () => { const response = createMockResponse({ contentType: 'text/plain', body: '', url: 'https://example.com/empty.txt' }); const result = await extractTextFromResponse({ url: 'https://example.com/empty.txt', response }); expect(result.text).toBe(''); expect(result.extractorUsed).toBe('plain'); }); it('should handle content-type with additional parameters', async () => { const response = createMockResponse({ contentType: 'text/html; charset=utf-8; boundary=something', body: '<html><body><main><p>Test content that is long enough to pass threshold requirements. More text here to ensure it passes.</p></main></body></html>', url: 'https://example.com/page.html' }); const result = await extractTextFromResponse({ url: 'https://example.com/page.html', response }); expect(result.extractorUsed).toBe('html'); }); it('should handle URL with query parameters when detecting extension', async () => { const pdfBuffer = createValidPdfBuffer(); const response = createMockResponse({ contentType: undefined, body: pdfBuffer, url: 'https://example.com/download.pdf?token=abc123&expires=12345' }); const result = await extractTextFromResponse({ url: 'https://example.com/download.pdf?token=abc123&expires=12345', response, _pdfParser: createMockPdfParser() }); expect(result.extractorUsed).toBe('pdf'); }); it('should handle URL with fragment when detecting extension', async () => { const textContent = 'Text file content.'; const response = createMockResponse({ contentType: undefined, body: textContent, url: 'https://example.com/readme.txt#section-1' }); const result = await extractTextFromResponse({ url: 'https://example.com/readme.txt#section-1', response }); expect(result.extractorUsed).toBe('plain'); }); it('should handle markdown content-type', async () => { const markdownContent = `# Heading This is **bold** and *italic* text. - List item 1 - List item 2`; const response = createMockResponse({ contentType: 'text/markdown', body: markdownContent, url: 'https://example.com/readme.md' }); const result = await extractTextFromResponse({ url: 'https://example.com/readme.md', response }); // Markdown should be treated as plain text expect(result.extractorUsed).toBe('plain'); expect(result.text).toBe(markdownContent); }); });

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Mnehmos/mnehmos.index-foundry.mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server