Calibre RAG MCP Server

test-pdf-approaches.js•8.18 KiB

#!/usr/bin/env node
/**
 * Diagnostic script that checks which PDF processing approach works on this
 * machine: plain text extraction via pdf-parse, or page rendering via pdf2pic
 * (needs ImageMagick) followed by OCR via node-tesseract-ocr.
 *
 * Usage: node test-pdf-approaches.js [libraryPath]
 */
const tesseract = require('node-tesseract-ocr');
const pdf2pic = require('pdf2pic');
const pdfParse = require('pdf-parse');
const fs = require('fs');
const path = require('path');
const os = require('os');
const { spawn } = require('child_process');

// Library root that is scanned when no path is supplied.
const DEFAULT_LIBRARY_PATH = 'D:\\e-library';
// Only the first N entries of each directory are inspected, to keep the scan short.
const MAX_ENTRIES_PER_DIR = 50;
// Extracted text must be longer than this (after trimming) to count as "readable".
const MIN_READABLE_TEXT_LENGTH = 50;
// Location probed for a Tesseract binary on Windows.
const TESSERACT_WINDOWS_PATH = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe';

/**
 * Depth-first search for the first file whose name ends in ".pdf".
 *
 * @param {string} dir - Directory to scan.
 * @returns {string|null} Full path of the first PDF found, or null if none.
 */
function findFirstPDF(dir) {
  try {
    const items = fs.readdirSync(dir);
    for (const item of items.slice(0, MAX_ENTRIES_PER_DIR)) {
      const fullPath = path.join(dir, item);
      const stat = fs.statSync(fullPath);
      if (stat.isFile() && item.toLowerCase().endsWith('.pdf')) {
        return fullPath;
      }
      if (stat.isDirectory()) {
        const found = findFirstPDF(fullPath);
        if (found) return found;
      }
    }
  } catch (error) {
    // Deliberate best-effort: skip directories we can't read or stat.
  }
  return null;
}

/**
 * Runs `<command> --version` and resolves with the first line it printed.
 *
 * @param {string} command - Executable name to probe (e.g. 'magick').
 * @returns {Promise<string>} First line of combined stdout/stderr output.
 * @throws {Error} (rejects) if the process cannot be spawned or exits non-zero.
 */
function getCommandVersion(command) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, ['--version'], { stdio: 'pipe' });
    let output = '';
    const collect = (data) => {
      output += data.toString();
    };
    child.stdout.on('data', collect);
    child.stderr.on('data', collect);
    child.on('error', reject);
    child.on('close', (code) => {
      if (code === 0) {
        // Split on real line breaks (LF or CRLF) so only the first line is returned.
        resolve(output.split(/\r?\n/)[0]);
      } else {
        reject(new Error(`${command} command failed with code ${code}`));
      }
    });
  });
}

/**
 * Test 1: tries to extract text from the PDF with pdf-parse.
 *
 * @param {string} pdfFile - Path of the PDF to read.
 * @returns {Promise<boolean>} true when readable text was extracted, so OCR is
 *   unnecessary; false when the text is minimal or pdf-parse failed.
 */
async function tryTextExtraction(pdfFile) {
  console.log('\n🔄 Testing PDF text extraction with pdf-parse...');
  try {
    const dataBuffer = fs.readFileSync(pdfFile);
    const pdfData = await pdfParse(dataBuffer);

    console.log('✅ pdf-parse successful!');
    console.log('📄 Pages:', pdfData.numpages);
    console.log('📝 Text length:', pdfData.text.length, 'characters');
    console.log('ℹ️ PDF Info:', JSON.stringify(pdfData.info));

    if (pdfData.text && pdfData.text.trim().length > MIN_READABLE_TEXT_LENGTH) {
      console.log('✅ Good text extraction - this PDF has readable text');
      console.log('📄 Sample text (first 200 chars):');
      console.log(pdfData.text.substring(0, 200));
      console.log('\n🎉 pdf-parse approach will work for this PDF!');
      return true;
    }
    console.log('⚠️ Minimal text found, this PDF likely needs OCR');
  } catch (pdfParseError) {
    console.log('❌ pdf-parse failed:', pdfParseError.message);
    console.log('⚠️ This PDF will need OCR approach');
  }
  return false;
}

/**
 * Test 2: checks that an ImageMagick command is runnable, trying `magick`
 * first and falling back to `convert`.
 *
 * @returns {Promise<boolean>} true if either command ran successfully.
 */
async function checkImageMagick() {
  console.log('\n🔄 Testing ImageMagick installation...');
  try {
    const version = await getCommandVersion('magick');
    console.log('✅ ImageMagick (magick) command works!');
    console.log('📋 Version info:', version);
    return true;
  } catch (magickError) {
    console.log('❌ ImageMagick magick command failed:', magickError.message);
  }

  try {
    const version = await getCommandVersion('convert');
    console.log('✅ ImageMagick (convert) command works!');
    console.log('📋 Version info:', version);
    return true;
  } catch (convertError) {
    console.log('❌ ImageMagick convert command also failed:', convertError.message);
    console.log('⚠️ ImageMagick may not be properly installed or in PATH');
    return false;
  }
}

/**
 * Test 3: renders page 1 of the PDF to a PNG with pdf2pic and runs Tesseract
 * OCR on the image. The temporary directory is removed even when a step fails.
 *
 * @param {string} pdfFile - Path of the PDF to convert.
 * @returns {Promise<void>}
 */
async function tryOcrPipeline(pdfFile) {
  console.log('\n🔄 Testing PDF to image conversion + OCR...');

  // A unique directory per run avoids clashing with a previous or concurrent run.
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'ocr-test-'));
  try {
    const convert = pdf2pic.fromPath(pdfFile, {
      density: 150,
      saveFilename: 'test-page',
      savePath: tempDir,
      format: 'png',
      width: 1000,
      height: 1000,
    });

    let result;
    try {
      console.log('🖼️ Converting first page to image...');
      result = await convert(1);
    } catch (conversionError) {
      console.log('❌ PDF conversion error:', conversionError.message);
      return;
    }

    if (!(result && result.path && fs.existsSync(result.path))) {
      console.log('❌ PDF conversion failed - no image created');
      return;
    }
    console.log('✅ PDF conversion successful:', result.path);
    console.log('📏 Image size:', Math.round(fs.statSync(result.path).size / 1024), 'KB');

    console.log('\n🔍 Testing OCR on converted image...');
    if (fs.existsSync(TESSERACT_WINDOWS_PATH)) {
      // NOTE(review): node-tesseract-ocr may expect the binary location via its
      // `binary` config option rather than this env var - confirm against its docs.
      process.env.TESSERACT_BINARY_PATH = TESSERACT_WINDOWS_PATH;
    }

    const ocrConfig = {
      lang: 'eng',
      oem: 1,
      psm: 3,
    };

    try {
      const startTime = Date.now();
      const text = await tesseract.recognize(result.path, ocrConfig);
      const duration = Date.now() - startTime;

      console.log('✅ OCR completed in', duration, 'ms');
      console.log('📝 Text length:', text.length, 'characters');

      if (text.length > 0) {
        console.log('📄 Sample OCR text (first 200 chars):');
        console.log(text.substring(0, 200));
        console.log('\n🎉 Full OCR pipeline is working!');
      } else {
        console.log('⚠️ OCR returned empty text - page might be blank or poor quality');
      }
    } catch (ocrError) {
      // Reported separately so an OCR failure is not mistaken for a conversion failure.
      console.log('❌ OCR error:', ocrError.message);
    }
  } finally {
    try {
      fs.rmSync(tempDir, { recursive: true, force: true });
    } catch (cleanupError) {
      // Cleanup is best-effort; a leftover temp dir must not fail the diagnostic.
      console.warn('⚠️ Could not remove temp directory:', cleanupError.message);
    }
  }
}

/**
 * Finds the first PDF under the library path and reports which processing
 * approach works for it. Stops after pdf-parse when it yields readable text;
 * otherwise checks ImageMagick and then exercises the pdf2pic + OCR pipeline.
 * All errors are logged rather than thrown.
 *
 * @param {string} [libraryPath] - Root directory to search for a PDF.
 * @returns {Promise<void>}
 */
async function testBothPDFApproaches(libraryPath = DEFAULT_LIBRARY_PATH) {
  console.log('🔍 Testing both PDF processing approaches...');

  try {
    console.log(`🔍 Searching for PDF files in: ${libraryPath}`);

    const pdfFile = findFirstPDF(libraryPath);
    if (!pdfFile) {
      console.log('❌ No PDF files found in library');
      return;
    }

    console.log('📄 Found PDF file:', pdfFile);
    console.log('📏 File size:', Math.round(fs.statSync(pdfFile).size / 1024), 'KB');

    // No need to test OCR if text extraction works.
    if (await tryTextExtraction(pdfFile)) return;

    // pdf2pic cannot render pages without ImageMagick, so stop here if it is missing.
    if (!(await checkImageMagick())) return;

    await tryOcrPipeline(pdfFile);
  } catch (error) {
    console.error('❌ Test failed:', error.message);
  }
}

// An optional first CLI argument overrides the default library path.
testBothPDFApproaches(process.argv[2]);

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ispyridis/calibre-rag-mcp-nodejs'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test-pdf-approaches.js•8.18 KiB