MCP PDF Reader

pdf-tools.test.ts•14.6 KiB

import {
  extractTextFromPDF,
  extractMetadata,
  searchInPDF,
  getPageCount,
  extractPages,
  listPDFImages,
  extractPDFImage
} from '../pdf-tools.js';
import path from 'path';
import { fileURLToPath } from 'url';

// ES modules do not provide __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Path to test PDF
const TEST_PDF_PATH = path.join(__dirname, '../../test-files/sample.pdf');

/**
 * Renders rows of cells as an ASCII table with `+---+` separators.
 *
 * The first row is treated as the header and is followed by a separator line;
 * a closing separator is emitted after the last row. A header-only table is
 * closed by the header separator itself (no duplicated bottom line).
 *
 * Rows may have different lengths: the column count is the widest row, and
 * missing cells are rendered as empty, padded cells.
 *
 * @param rows - Table content, one string array per row.
 * @returns The formatted table, or an empty string when `rows` is empty.
 */
function formatTableASCII(rows: string[][]): string {
  if (rows.length === 0) return '';

  // Size the table by the widest row so ragged rows still line up.
  const columnCount = Math.max(...rows.map(row => row.length));
  const colWidths = Array.from({ length: columnCount }, (_, colIndex) =>
    Math.max(...rows.map(row => (row[colIndex] ?? '').length))
  );

  // Each column gets one space of padding on both sides, hence width + 2.
  const separator = '+' + colWidths.map(w => '-'.repeat(w + 2)).join('+') + '+';

  const formattedRows = rows.map(row =>
    '| ' + colWidths.map((width, i) => (row[i] ?? '').padEnd(width)).join(' | ') + ' |'
  );

  const lines: string[] = [separator];
  formattedRows.forEach((formattedRow, index) => {
    lines.push(formattedRow);
    // Separator after the header and after the final row; when the header is
    // the only row these coincide, so a single separator is emitted.
    if (index === 0 || index === formattedRows.length - 1) {
      lines.push(separator);
    }
  });

  return lines.join('\n');
}

/**
 * Builds the expected sample.pdf table from extracted page text.
 *
 * The function does not parse cell values out of the text. It scans the
 * trimmed, non-empty lines for marker words and, when a marker is found,
 * appends a fixed row: the header once 'Título' and 'Campo' appear on the same
 * line, then the 'quijote' and 'Romeo' rows (only after the header was seen).
 * Scanning stops at the 'Romeo' row.
 *
 * @param text - Text extracted from a PDF page.
 * @returns The recognised rows; empty when the header marker never appears.
 */
function extractTableFromText(text: string): string[][] {
  const lines = text
    .split('\n')
    .map(l => l.trim())
    .filter(l => l);

  const tableData: string[][] = [];
  let foundTitle = false;

  for (const line of lines) {
    // Detect header row
    if (line.includes('Título') && line.includes('Campo')) {
      tableData.push(['Título', 'Campo 1', 'Campo 2']);
      foundTitle = true;
      continue;
    }

    // Skip the explanatory text
    if (line.includes('Además de una Tabla')) {
      continue;
    }

    // Data rows are only accepted once the header has been seen
    if (foundTitle && line.includes('quijote')) {
      tableData.push(['El quijote', 'Cervantes', 'España']);
    }

    if (foundTitle && line.includes('Romeo')) {
      tableData.push(['Romeo y Julieta', 'Shakespeare', 'Inglaterra']);
      break; // End of table
    }
  }

  return tableData;
}

describe('PDF Tools - Tests específicos con sample.pdf', () => {
  describe('getPageCount', () => {
    it('should throw error for nonexistent file', async () => {
      await expect(
        getPageCount('/nonexistent/file.pdf')
      ).rejects.toThrow('Failed to get page count');
    });

    it('should return exactly 6 pages for sample.pdf', async () => {
      const pageCount = await getPageCount(TEST_PDF_PATH);
      expect(pageCount).toBe(6);
    });
  });

  describe('extractMetadata', () => {
    it('should throw error for nonexistent file', async () => {
      await expect(
        extractMetadata('/nonexistent/file.pdf')
      ).rejects.toThrow('Failed to extract metadata');
    });

    it('should extract correct metadata from sample.pdf', async () => {
      const metadata = await extractMetadata(TEST_PDF_PATH);

      expect(metadata.totalPages).toBe(6);
      expect(metadata.author).toBe('Ramon Tur');
      expect(metadata.creator).toBe('Microsoft® Word 2019');
      expect(metadata.producer).toBe('Microsoft® Word 2019');
      expect(metadata.creationDate).toContain('D:20260114');
    });
  });

  describe('extractTextFromPDF', () => {
    it('should throw error for nonexistent file', async () => {
      await expect(
        extractTextFromPDF('/nonexistent/file.pdf')
      ).rejects.toThrow('Failed to read PDF');
    });

    it('should extract complete text and metadata from sample.pdf', async () => {
      const result = await extractTextFromPDF(TEST_PDF_PATH);

      expect(result).toHaveProperty('text');
      expect(result).toHaveProperty('metadata');
      expect(result).toHaveProperty('pageCount');
      expect(result.pageCount).toBe(6);
      expect(result.metadata?.author).toBe('Ramon Tur');
      expect(typeof result.text).toBe('string');
      expect(result.text.length).toBeGreaterThan(100);
    });

    it('should contain expected text from sample.pdf', async () => {
      const result = await extractTextFromPDF(TEST_PDF_PATH);

      // Verify that it contains specific text from the PDF
      expect(result.text).toContain('Encabezado documento pruebas pdf');
      expect(result.text).toContain('Este es n archivo Pdf para pruebas');
      expect(result.text).toContain('El quijote');
      expect(result.text).toContain('Cervantes');
      expect(result.text).toContain('Romeo y Julieta');
      expect(result.text).toContain('Shakespeare');
      expect(result.text).toContain('Lorem ipsum');
    });
  });

  describe('extractPages', () => {
    it('should throw error for nonexistent file', async () => {
      await expect(
        extractPages('/nonexistent/file.pdf', 1, 2)
      ).rejects.toThrow('Failed to extract pages');
    });

    it('should extract first page containing specific text', async () => {
      const result = await extractPages(TEST_PDF_PATH, 1, 1);

      expect(result.text).toContain('Encabezado documento pruebas pdf');
      expect(result.text).toContain('Este es n archivo Pdf para pruebas');
      expect(result.startPage).toBe(1);
      expect(result.endPage).toBe(1);
      expect(result.totalPages).toBe(6);
    });

    it('should extract second page containing table data', async () => {
      const result = await extractPages(TEST_PDF_PATH, 2, 2);

      expect(result.text).toContain('Tabla');
      expect(result.text).toContain('El quijote');
      expect(result.text).toContain('Cervantes');
      expect(result.text).toContain('Romeo y Julieta');
      expect(result.startPage).toBe(2);
      expect(result.endPage).toBe(2);
    });

    it('should extract page range (pages 1-3)', async () => {
      const result = await extractPages(TEST_PDF_PATH, 1, 3);

      expect(result.startPage).toBe(1);
      expect(result.endPage).toBe(3);
      expect(result.text).toContain('Encabezado');
      expect(result.text).toContain('Lorem ipsum');
    });

    it('should handle single page extraction without endPage', async () => {
      const result = await extractPages(TEST_PDF_PATH, 2);

      expect(result.startPage).toBe(2);
      expect(result.endPage).toBe(2);
      expect(result.text).toContain('El quijote');
    });

    it('should throw error for invalid page numbers', async () => {
      await expect(
        extractPages(TEST_PDF_PATH, 0, 1)
      ).rejects.toThrow();

      await expect(
        extractPages(TEST_PDF_PATH, 7, 8)
      ).rejects.toThrow();
    });
  });

  describe('searchInPDF', () => {
    it('should throw error for nonexistent file', async () => {
      await expect(
        searchInPDF('/nonexistent/file.pdf', 'test')
      ).rejects.toThrow('Failed to search PDF');
    });

    it('should find "Cervantes" in the PDF', async () => {
      const results = await searchInPDF(TEST_PDF_PATH, 'Cervantes', false);

      expect(results.length).toBeGreaterThan(0);

      const firstResult = results[0];
      if (firstResult) {
        expect(firstResult.text).toBe('Cervantes');
        expect(firstResult.context).toContain('Cervantes');
        expect(firstResult.page).toBeGreaterThanOrEqual(1);
        expect(firstResult.position).toBeGreaterThanOrEqual(0);
      }
    });

    it('should find "Shakespeare" exactly once', async () => {
      const results = await searchInPDF(TEST_PDF_PATH, 'Shakespeare', false);

      expect(results.length).toBe(1);
      expect(results[0]?.text).toBe('Shakespeare');
      expect(results[0]?.context).toContain('Romeo y Julieta');
    });

    it('should find multiple occurrences of "Encabezado"', async () => {
      const results = await searchInPDF(TEST_PDF_PATH, 'Encabezado', false);

      // Should appear on several pages (it is the page header)
      expect(results.length).toBeGreaterThanOrEqual(6);

      results.forEach(result => {
        expect(result.text).toBe('Encabezado');
      });
    });

    it('should return empty array for non-existent text', async () => {
      const results = await searchInPDF(TEST_PDF_PATH, 'xyzabc123notfound', false);

      expect(Array.isArray(results)).toBe(true);
      expect(results.length).toBe(0);
    });

    it('should respect case sensitivity for "CERVANTES"', async () => {
      const caseSensitiveResults = await searchInPDF(TEST_PDF_PATH, 'CERVANTES', true);
      const caseInsensitiveResults = await searchInPDF(TEST_PDF_PATH, 'CERVANTES', false);

      // Case-sensitive search should find nothing (the PDF spells it "Cervantes")
      expect(caseSensitiveResults.length).toBe(0);

      // Case-insensitive search should find at least one match
      expect(caseInsensitiveResults.length).toBeGreaterThan(0);
    });

    it('should find "Lorem ipsum" and provide proper context', async () => {
      const results = await searchInPDF(TEST_PDF_PATH, 'Lorem ipsum', false);

      expect(results.length).toBeGreaterThan(0);

      const firstResult = results[0];
      if (firstResult) {
        expect(firstResult.text).toBe('Lorem ipsum');
        expect(firstResult.context.length).toBeGreaterThan(20);
        expect(firstResult.context).toContain('Lorem ipsum');
      }
    });

    it('should find "El quijote" in table data', async () => {
      const results = await searchInPDF(TEST_PDF_PATH, 'El quijote', false);

      expect(results.length).toBeGreaterThan(0);
      expect(results[0]?.text).toBe('El quijote');

      // The context should include table information
      expect(results[0]?.context).toBeTruthy();
    });
  });

  describe('Table Extraction', () => {
    it('should extract and display table in ASCII format', async () => {
      const result = await extractPages(TEST_PDF_PATH, 2, 2);

      // Extract table from page 2
      const tableData = extractTableFromText(result.text);

      expect(tableData.length).toBeGreaterThan(0);
      expect(tableData[0]).toEqual(['Título', 'Campo 1', 'Campo 2']);

      // Format as ASCII table
      const asciiTable = formatTableASCII(tableData);

      // Print to console
      console.log('\n╔═══════════════════════════════════════════════════════════╗');
      console.log('║ TABLA EXTRAÍDA DEL PDF (Página 2) ║');
      console.log('╚═══════════════════════════════════════════════════════════╝\n');
      console.log(asciiTable);
      console.log('');

      // Verify table content
      expect(asciiTable).toContain('Título');
      expect(asciiTable).toContain('El quijote');
      expect(asciiTable).toContain('Cervantes');
      expect(asciiTable).toContain('España');
      expect(asciiTable).toContain('Romeo y Julieta');
      expect(asciiTable).toContain('Shakespeare');
      expect(asciiTable).toContain('Inglaterra');
    });
  });

  describe('PDF Images', () => {
    it('should list all images in the PDF', async () => {
      const images = await listPDFImages(TEST_PDF_PATH);

      console.log('\n╔═══════════════════════════════════════════════════════════╗');
      console.log('║ IMÁGENES ENCONTRADAS EN EL PDF ║');
      console.log('╚═══════════════════════════════════════════════════════════╝\n');
      console.log(`Total de imágenes: ${images.length}\n`);

      images.forEach(img => {
        console.log(` [${img.index}] ${img.name}`);
        console.log(` Página: ${img.page}`);
        console.log(` Dimensiones: ${img.width}x${img.height}`);
        console.log(` Tipo: ${img.type}`);
        console.log('');
      });

      // Verify it's an array
      expect(Array.isArray(images)).toBe(true);

      // If there are images, verify structure
      if (images.length > 0) {
        const firstImage = images[0];
        expect(firstImage).toHaveProperty('index');
        expect(firstImage).toHaveProperty('page');
        expect(firstImage).toHaveProperty('name');
        expect(firstImage).toHaveProperty('width');
        expect(firstImage).toHaveProperty('height');
        expect(firstImage).toHaveProperty('type');
      }
    });

    it('should throw error when listing images from nonexistent file', async () => {
      await expect(
        listPDFImages('/nonexistent/file.pdf')
      ).rejects.toThrow('Failed to list PDF images');
    });

    it('should extract a specific image if images exist', async () => {
      const images = await listPDFImages(TEST_PDF_PATH);

      // The sample PDF may contain no images; in that case only a notice is logged.
      if (images.length > 0) {
        const imageData = await extractPDFImage(TEST_PDF_PATH, 0);

        console.log('\n╔═══════════════════════════════════════════════════════════╗');
        console.log('║ IMAGEN EXTRAÍDA (Primera imagen) ║');
        console.log('╚═══════════════════════════════════════════════════════════╝\n');
        console.log(` Índice: ${imageData.index}`);
        console.log(` Página: ${imageData.page}`);
        console.log(` Nombre: ${imageData.name}`);
        console.log(` Dimensiones: ${imageData.width}x${imageData.height}`);
        console.log(` Tipo: ${imageData.type}`);
        console.log(` Tamaño de datos (Base64): ${imageData.data.length} caracteres`);
        console.log('');

        expect(imageData).toHaveProperty('index');
        expect(imageData).toHaveProperty('page');
        expect(imageData).toHaveProperty('data');
        expect(imageData.data).toBeTruthy();
        expect(typeof imageData.data).toBe('string');
        expect(imageData.data.length).toBeGreaterThan(0);
      } else {
        console.log('\n⚠️ No hay imágenes en el PDF para extraer\n');
      }
    });

    it('should throw error when extracting from nonexistent file', async () => {
      await expect(
        extractPDFImage('/nonexistent/file.pdf', 0)
      ).rejects.toThrow('Failed to extract PDF image');
    });

    it('should throw error when extracting non-existent image index', async () => {
      await expect(
        extractPDFImage(TEST_PDF_PATH, 9999)
      ).rejects.toThrow('not found');
    });
  });
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/rturv/mcp-pdf-reader'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

pdf-tools.test.ts•14.6 KiB