Skip to main content
Glama

Calibre RAG MCP Server

by ispyridis
test-pdf-approaches.js (8.38 kB)
#!/usr/bin/env node
/*
 * Diagnostic script: finds one PDF in a library folder and reports which
 * text-extraction strategy works for it — direct extraction with pdf-parse,
 * or rasterising the first page with pdf2pic and running Tesseract OCR on it.
 *
 * Usage: node test-pdf-approaches.js [libraryPath]
 */

const tesseract = require('node-tesseract-ocr');
const pdf2pic = require('pdf2pic');
const pdfParse = require('pdf-parse');
const fs = require('fs');
const path = require('path');
const os = require('os');
const { spawn } = require('child_process');

/** Library folder searched when no path is supplied. */
const DEFAULT_LIBRARY_PATH = 'D:\\e-library';

/** Windows location probed for the Tesseract binary. */
const WINDOWS_TESSERACT_PATH = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe';

/** Maximum number of directory entries inspected per directory. */
const MAX_ENTRIES_PER_DIR = 50;

/**
 * Depth-first search for the first file whose name ends in `.pdf`.
 * Only the first MAX_ENTRIES_PER_DIR entries of each directory are examined.
 *
 * @param {string} dir - Directory to search.
 * @returns {string|null} Full path of the first PDF found, or null.
 */
function findFirstPDF(dir) {
  try {
    const items = fs.readdirSync(dir);
    for (const item of items.slice(0, MAX_ENTRIES_PER_DIR)) {
      const fullPath = path.join(dir, item);
      const stat = fs.statSync(fullPath);
      if (stat.isFile() && item.toLowerCase().endsWith('.pdf')) {
        return fullPath;
      }
      if (stat.isDirectory()) {
        const found = findFirstPDF(fullPath);
        if (found) return found;
      }
    }
  } catch (error) {
    // Deliberate best-effort: skip directories we can't access.
  }
  return null;
}

/**
 * Runs `<command> --version` and collects everything it prints.
 *
 * @param {string} command - Executable name to probe.
 * @returns {Promise<string>} Combined stdout/stderr output on exit code 0.
 * Rejects when the process cannot be spawned or exits with a non-zero code.
 */
function runVersionCommand(command) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, ['--version'], { stdio: 'pipe' });
    let output = '';
    const collect = (data) => {
      output += data.toString();
    };
    child.stdout.on('data', collect);
    child.stderr.on('data', collect);
    child.on('close', (code) => {
      if (code === 0) {
        resolve(output);
      } else {
        reject(new Error(`${command} command failed with code ${code}`));
      }
    });
    child.on('error', reject);
  });
}

/**
 * Probes one ImageMagick executable and logs its version banner.
 *
 * @param {string} command - `magick` or `convert`.
 * @returns {Promise<void>} Rejects when the command is unavailable or fails.
 */
async function probeImageMagickCommand(command) {
  const output = await runVersionCommand(command);
  console.log(`✅ ImageMagick (${command}) command works!`);
  // Split on a real line break (LF or CRLF) so only the first line is shown.
  console.log('📋 Version info:', output.split(/\r?\n/)[0]);
}

/**
 * Checks that ImageMagick is reachable, trying `magick` first and falling
 * back to `convert`.
 *
 * @returns {Promise<boolean>} true when either command works.
 */
async function checkImageMagick() {
  console.log('\n🔄 Testing ImageMagick installation...');
  try {
    await probeImageMagickCommand('magick');
    return true;
  } catch (magickError) {
    console.log('❌ ImageMagick magick command failed:', magickError.message);
  }

  try {
    await probeImageMagickCommand('convert');
    return true;
  } catch (convertError) {
    console.log('❌ ImageMagick convert command also failed:', convertError.message);
    console.log('⚠️ ImageMagick may not be properly installed or in PATH');
    return false;
  }
}

/**
 * Tries direct text extraction with pdf-parse and logs the outcome.
 *
 * @param {string} pdfFile - Path of the PDF to read.
 * @returns {Promise<boolean>} true when more than 50 non-whitespace-trimmed
 * characters were extracted, i.e. OCR is unnecessary for this file.
 */
async function testPdfParse(pdfFile) {
  console.log('\n🔄 Testing PDF text extraction with pdf-parse...');
  try {
    const dataBuffer = fs.readFileSync(pdfFile);
    const pdfData = await pdfParse(dataBuffer);

    console.log('✅ pdf-parse successful!');
    console.log('📄 Pages:', pdfData.numpages);
    console.log('📝 Text length:', pdfData.text.length, 'characters');
    console.log('ℹ️ PDF Info:', JSON.stringify(pdfData.info));

    if (pdfData.text && pdfData.text.trim().length > 50) {
      console.log('✅ Good text extraction - this PDF has readable text');
      console.log('📄 Sample text (first 200 chars):');
      console.log(pdfData.text.substring(0, 200));
      console.log('\n🎉 pdf-parse approach will work for this PDF!');
      return true;
    }
    console.log('⚠️ Minimal text found, this PDF likely needs OCR');
  } catch (pdfParseError) {
    console.log('❌ pdf-parse failed:', pdfParseError.message);
    console.log('⚠️ This PDF will need OCR approach');
  }
  return false;
}

/**
 * Runs Tesseract on an image file and logs timing and a text sample.
 *
 * @param {string} imagePath - Path of the image to recognise.
 * @returns {Promise<void>}
 */
async function testOcrOnImage(imagePath) {
  console.log('\n🔍 Testing OCR on converted image...');

  if (fs.existsSync(WINDOWS_TESSERACT_PATH)) {
    process.env.TESSERACT_BINARY_PATH = WINDOWS_TESSERACT_PATH;
  }

  const ocrConfig = {
    lang: 'eng',
    oem: 1,
    psm: 3,
  };

  try {
    const startTime = Date.now();
    const text = await tesseract.recognize(imagePath, ocrConfig);
    const duration = Date.now() - startTime;

    console.log('✅ OCR completed in', duration, 'ms');
    console.log('📝 Text length:', text.length, 'characters');

    if (text.length > 0) {
      console.log('📄 Sample OCR text (first 200 chars):');
      console.log(text.substring(0, 200));
      console.log('\n🎉 Full OCR pipeline is working!');
    } else {
      console.log('⚠️ OCR returned empty text - page might be blank or poor quality');
    }
  } catch (ocrError) {
    // Reported separately so an OCR failure is not mistaken for a conversion failure.
    console.log('❌ OCR error:', ocrError.message);
  }
}

/**
 * Converts the first page of the PDF to a PNG in a temporary directory and
 * runs OCR on it. The temporary directory is removed afterwards whether or
 * not the conversion or OCR succeeded.
 *
 * @param {string} pdfFile - Path of the PDF to rasterise.
 * @returns {Promise<void>}
 */
async function testPdfToImageOcr(pdfFile) {
  console.log('\n🔄 Testing PDF to image conversion + OCR...');

  const tempDir = path.join(os.tmpdir(), 'ocr-test');
  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(tempDir, { recursive: true });

  try {
    const convert = pdf2pic.fromPath(pdfFile, {
      density: 150,
      saveFilename: 'test-page',
      savePath: tempDir,
      format: 'png',
      width: 1000,
      height: 1000,
    });

    let result;
    try {
      console.log('🖼️ Converting first page to image...');
      result = await convert(1);
    } catch (conversionError) {
      console.log('❌ PDF conversion error:', conversionError.message);
      return;
    }

    if (!(result && result.path && fs.existsSync(result.path))) {
      console.log('❌ PDF conversion failed - no image created');
      return;
    }

    console.log('✅ PDF conversion successful:', result.path);
    console.log('📏 Image size:', Math.round(fs.statSync(result.path).size / 1024), 'KB');

    await testOcrOnImage(result.path);
  } finally {
    // Always clean up; removing the directory also removes the generated image.
    try {
      fs.rmSync(tempDir, { recursive: true, force: true });
    } catch (cleanupError) {
      console.log('⚠️ Could not remove temp directory:', cleanupError.message);
    }
  }
}

/**
 * Entry point: locates one PDF and tests pdf-parse first, then (only if text
 * extraction was insufficient) ImageMagick availability and the
 * pdf2pic + Tesseract OCR pipeline. Failures are logged rather than thrown.
 *
 * @param {string} [libraryPath] - Folder to search; defaults to the first CLI
 * argument, then to DEFAULT_LIBRARY_PATH.
 * @returns {Promise<void>}
 */
async function testBothPDFApproaches(libraryPath = process.argv[2] || DEFAULT_LIBRARY_PATH) {
  console.log('🔍 Testing both PDF processing approaches...');

  try {
    console.log(`🔍 Searching for PDF files in: ${libraryPath}`);

    const pdfFile = findFirstPDF(libraryPath);
    if (!pdfFile) {
      console.log('❌ No PDF files found in library');
      return;
    }

    console.log('📄 Found PDF file:', pdfFile);
    console.log('📏 File size:', Math.round(fs.statSync(pdfFile).size / 1024), 'KB');

    // Test 1: direct text extraction. No need to test OCR if this works.
    if (await testPdfParse(pdfFile)) {
      return;
    }

    // Test 2: ImageMagick must be reachable before attempting rasterisation.
    if (!(await checkImageMagick())) {
      return;
    }

    // Test 3: pdf2pic + OCR.
    await testPdfToImageOcr(pdfFile);
  } catch (error) {
    console.error('❌ Test failed:', error.message);
  }
}

testBothPDFApproaches();

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ispyridis/calibre-rag-mcp-nodejs'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.