#!/usr/bin/env node
// test-pdf-approaches.js
const { spawn } = require('child_process');
const fs = require('fs');
const os = require('os');
const path = require('path');

const tesseract = require('node-tesseract-ocr');
const pdf2pic = require('pdf2pic');
const pdfParse = require('pdf-parse');
/** Maximum number of entries inspected in each directory while searching for a PDF. */
const MAX_ENTRIES_PER_DIR = 50;

/** Default Windows install location of the Tesseract executable. */
const TESSERACT_WINDOWS_PATH = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe';

/**
 * Depth-first search for the first file whose name ends in ".pdf"
 * (case-insensitive). Only the first MAX_ENTRIES_PER_DIR entries of each
 * directory are inspected, which keeps the scan bounded on large libraries.
 *
 * @param {string} dir - Directory to search.
 * @returns {string|null} Full path of the first PDF found, or null.
 */
function findFirstPDF(dir) {
  let entries;
  try {
    entries = fs.readdirSync(dir);
  } catch (error) {
    // Missing or unreadable directory: treat it as containing no PDFs.
    return null;
  }

  for (const entry of entries.slice(0, MAX_ENTRIES_PER_DIR)) {
    const fullPath = path.join(dir, entry);
    let stat;
    try {
      stat = fs.statSync(fullPath);
    } catch (error) {
      // A single unreadable entry (e.g. a broken symlink) must not abort the
      // scan of its siblings, so skip only this entry.
      continue;
    }

    if (stat.isFile() && entry.toLowerCase().endsWith('.pdf')) {
      return fullPath;
    }
    if (stat.isDirectory()) {
      const found = findFirstPDF(fullPath);
      if (found) return found;
    }
  }
  return null;
}

/**
 * Runs `<command> --version` and returns the first line of its combined
 * stdout/stderr output.
 *
 * @param {string} command - Executable name to probe (looked up on PATH).
 * @returns {Promise<string>} First line of the version output.
 * @throws {Error} Rejects when the process cannot be spawned or exits non-zero.
 */
function getCommandVersion(command) {
  return new Promise((resolve, reject) => {
    const child = spawn(command, ['--version'], { stdio: 'pipe' });
    let output = '';
    const collect = (chunk) => {
      output += chunk.toString();
    };
    child.stdout.on('data', collect);
    child.stderr.on('data', collect);
    child.on('error', reject);
    child.on('close', (code) => {
      if (code === 0) {
        // Split on real line breaks (LF or CRLF) so only the banner line is kept.
        resolve(output.split(/\r?\n/)[0]);
      } else {
        reject(new Error(`${command} command failed with code ${code}`));
      }
    });
  });
}

/**
 * Checks whether ImageMagick is callable, trying `magick` first and falling
 * back to `convert`.
 *
 * @returns {Promise<boolean>} true when either command reports its version.
 */
async function checkImageMagick() {
  try {
    const version = await getCommandVersion('magick');
    console.log('✅ ImageMagick (magick) command works!');
    console.log('📋 Version info:', version);
    return true;
  } catch (magickError) {
    console.log('❌ ImageMagick magick command failed:', magickError.message);
  }

  // Try alternative convert command
  try {
    const version = await getCommandVersion('convert');
    console.log('✅ ImageMagick (convert) command works!');
    console.log('📋 Version info:', version);
    return true;
  } catch (convertError) {
    console.log('❌ ImageMagick convert command also failed:', convertError.message);
    console.log('⚠️ ImageMagick may not be properly installed or in PATH');
    return false;
  }
}

/**
 * Test 1: tries to extract embedded text from the PDF with pdf-parse.
 *
 * @param {string} pdfFile - Path of the PDF to read.
 * @returns {Promise<boolean>} true when more than 50 non-whitespace-trimmed
 *   characters were extracted, i.e. OCR is not needed for this file.
 */
async function tryPdfParse(pdfFile) {
  console.log('\n🔄 Testing PDF text extraction with pdf-parse...');
  try {
    const dataBuffer = fs.readFileSync(pdfFile);
    const pdfData = await pdfParse(dataBuffer);
    // Guard against a missing text field before measuring it.
    const text = pdfData.text || '';
    console.log('✅ pdf-parse successful!');
    console.log('📄 Pages:', pdfData.numpages);
    console.log('📝 Text length:', text.length, 'characters');
    console.log('ℹ️ PDF Info:', JSON.stringify(pdfData.info));
    if (text.trim().length > 50) {
      console.log('✅ Good text extraction - this PDF has readable text');
      console.log('📄 Sample text (first 200 chars):');
      console.log(text.substring(0, 200));
      console.log('\n🎉 pdf-parse approach will work for this PDF!');
      return true;
    }
    console.log('⚠️ Minimal text found, this PDF likely needs OCR');
  } catch (pdfParseError) {
    console.log('❌ pdf-parse failed:', pdfParseError.message);
    console.log('⚠️ This PDF will need OCR approach');
  }
  return false;
}

/**
 * Test 3: renders the first page of the PDF to a PNG with pdf2pic and runs
 * Tesseract OCR on it. The image is written to a freshly created temporary
 * directory that is removed afterwards, whether or not the test succeeded.
 *
 * @param {string} pdfFile - Path of the PDF to convert.
 * @returns {Promise<void>}
 */
async function testOcrPipeline(pdfFile) {
  console.log('\n🔄 Testing PDF to image conversion + OCR...');
  // A unique directory avoids clashing with (or deleting) another run's files.
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'ocr-test-'));
  try {
    const convert = pdf2pic.fromPath(pdfFile, {
      density: 150,
      saveFilename: 'test-page',
      savePath: tempDir,
      format: 'png',
      width: 1000,
      height: 1000,
    });

    console.log('🖼️ Converting first page to image...');
    const result = await convert(1);
    if (!(result && result.path && fs.existsSync(result.path))) {
      console.log('❌ PDF conversion failed - no image created');
      return;
    }
    console.log('✅ PDF conversion successful:', result.path);
    console.log('📏 Image size:', Math.round(fs.statSync(result.path).size / 1024), 'KB');

    // Test OCR
    console.log('\n🔍 Testing OCR on converted image...');
    const ocrConfig = {
      lang: 'eng',
      oem: 1,
      psm: 3,
    };
    if (fs.existsSync(TESSERACT_WINDOWS_PATH)) {
      process.env.TESSERACT_BINARY_PATH = TESSERACT_WINDOWS_PATH;
      // NOTE(review): node-tesseract-ocr appears to take the executable via the
      // `binary` config option and builds a shell command from it, hence the
      // quotes around a path containing spaces — confirm against the installed
      // version of the library.
      ocrConfig.binary = `"${TESSERACT_WINDOWS_PATH}"`;
    }

    const startTime = Date.now();
    const text = await tesseract.recognize(result.path, ocrConfig);
    const duration = Date.now() - startTime;
    console.log('✅ OCR completed in', duration, 'ms');
    console.log('📝 Text length:', text.length, 'characters');
    if (text.length > 0) {
      console.log('📄 Sample OCR text (first 200 chars):');
      console.log(text.substring(0, 200));
      console.log('\n🎉 Full OCR pipeline is working!');
    } else {
      console.log('⚠️ OCR returned empty text - page might be blank or poor quality');
    }
  } catch (conversionError) {
    console.log('❌ PDF conversion error:', conversionError.message);
  } finally {
    // Clean up on every path; a failed cleanup is not worth failing the test.
    try {
      fs.rmSync(tempDir, { recursive: true, force: true });
    } catch (cleanupError) {
      // Best effort only.
    }
  }
}

/**
 * Finds the first PDF under `libraryPath` and checks which processing
 * approach works for it: direct text extraction with pdf-parse, or, when that
 * yields little or no text, ImageMagick + pdf2pic + Tesseract OCR.
 * All progress and failures are reported on the console; the returned promise
 * does not reject.
 *
 * @param {string} [libraryPath='D:\\e-library'] - Root directory to search.
 * @returns {Promise<void>}
 */
async function testBothPDFApproaches(libraryPath = 'D:\\e-library') {
  console.log('🔍 Testing both PDF processing approaches...');
  try {
    console.log(`🔍 Searching for PDF files in: ${libraryPath}`);
    const pdfFile = findFirstPDF(libraryPath);
    if (!pdfFile) {
      console.log('❌ No PDF files found in library');
      return;
    }
    console.log('📄 Found PDF file:', pdfFile);
    console.log('📏 File size:', Math.round(fs.statSync(pdfFile).size / 1024), 'KB');

    // Test 1: pdf-parse approach. No need to test OCR if text extraction works.
    if (await tryPdfParse(pdfFile)) {
      return;
    }

    // Test 2: Check ImageMagick installation
    console.log('\n🔄 Testing ImageMagick installation...');
    if (!(await checkImageMagick())) {
      return;
    }

    // Test 3: pdf2pic + OCR approach
    await testOcrPipeline(pdfFile);
  } catch (error) {
    console.error('❌ Test failed:', error.message);
  }
}

// An optional first CLI argument overrides the default library location.
testBothPDFApproaches(process.argv[2]);