#!/usr/bin/env node
// test-ocr-full.js
const tesseract = require('node-tesseract-ocr');
const pdf2pic = require('pdf2pic');
const fs = require('fs');
const path = require('path');
const os = require('os');
/**
 * Depth-first search for the first PDF file below `dir`.
 *
 * Only the first `maxEntries` entries of every directory are inspected so the
 * search stays quick on large libraries. Directories that cannot be listed and
 * entries that cannot be stat'ed are skipped individually, so one unreadable
 * item no longer hides the rest of its directory.
 *
 * @param {string} dir - Directory to search.
 * @param {number} [maxEntries=50] - Number of entries examined per directory.
 * @returns {string|null} Full path of the first `.pdf` file found, or `null`.
 */
function findFirstPDF(dir, maxEntries = 50) {
  let entries;
  try {
    entries = fs.readdirSync(dir);
  } catch (error) {
    // Skip directories we can't access
    return null;
  }

  for (const entry of entries.slice(0, maxEntries)) {
    const fullPath = path.join(dir, entry);

    let stat;
    try {
      stat = fs.statSync(fullPath);
    } catch (error) {
      // Broken link / locked file: ignore this entry only and keep scanning.
      continue;
    }

    if (stat.isFile() && entry.toLowerCase().endsWith('.pdf')) {
      return fullPath;
    }
    if (stat.isDirectory()) {
      const found = findFirstPDF(fullPath, maxEntries);
      if (found) return found;
    }
  }

  return null;
}

/**
 * Smoke test for the OCR pipeline: locates one PDF in the library folder,
 * renders its first page to a PNG with pdf2pic, runs Tesseract on that image
 * and prints timing plus a text sample.
 *
 * All progress and failures are written to the console; the function never
 * rejects. The temporary image directory is removed whether or not the
 * conversion/OCR steps succeed.
 *
 * @returns {Promise<void>}
 */
async function testOCROnLibrary() {
  console.log('🔍 Testing OCR on library files...');

  try {
    const ocrConfig = {
      lang: 'eng',
      oem: 1,
      psm: 3,
    };

    // Set up Tesseract path
    const tesseractPath = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe';
    if (fs.existsSync(tesseractPath)) {
      console.log('✅ Found Tesseract at:', tesseractPath);
      // Kept from the original script for anything else that reads it.
      process.env.TESSERACT_BINARY_PATH = tesseractPath;
      // node-tesseract-ocr takes the executable from `binary` and splices it
      // into a shell command, so a path containing spaces has to be quoted.
      ocrConfig.binary = `"${tesseractPath}"`;
    } else {
      console.log('⚠️ Using Tesseract from PATH');
    }

    const libraryPath = 'D:\\e-library';
    if (!fs.existsSync(libraryPath)) {
      // Distinguish "folder missing" from "folder has no PDFs".
      console.log('❌ Library path not found:', libraryPath);
      return;
    }
    console.log(`🔍 Searching for PDF files in: ${libraryPath}`);

    // Find first PDF file
    const pdfFile = findFirstPDF(libraryPath);
    if (!pdfFile) {
      console.log('❌ No PDF files found in library');
      return;
    }

    console.log('📄 Found PDF file:', pdfFile);
    console.log('📏 File size:', Math.round(fs.statSync(pdfFile).size / 1024), 'KB');

    // Test PDF to image conversion
    console.log('\n🔄 Testing PDF to image conversion...');
    const tempDir = path.join(os.tmpdir(), 'ocr-test');
    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(tempDir, { recursive: true });

    try {
      const convert = pdf2pic.fromPath(pdfFile, {
        density: 150, // Lower density for faster testing
        saveFilename: 'test-page',
        savePath: tempDir,
        format: 'png',
        width: 1000,
        height: 1000,
      });

      let result;
      try {
        console.log('🖼️ Converting first page to image...');
        result = await convert(1);
      } catch (conversionError) {
        console.log('❌ PDF conversion error:', conversionError.message);
        return;
      }

      if (!result || !result.path || !fs.existsSync(result.path)) {
        console.log('❌ PDF conversion failed - no image created');
        return;
      }

      console.log('✅ PDF conversion successful:', result.path);
      console.log('📏 Image size:', Math.round(fs.statSync(result.path).size / 1024), 'KB');

      // Test OCR on the image. A failure here propagates to the outer catch so
      // it is reported as an OCR failure rather than a conversion error.
      console.log('\n🔍 Testing OCR on converted image...');
      const startTime = Date.now();
      const text = await tesseract.recognize(result.path, ocrConfig);
      const duration = Date.now() - startTime;

      console.log('✅ OCR completed in', duration, 'ms');
      console.log('📝 Text length:', text.length, 'characters');

      if (text.length > 0) {
        console.log('📄 Sample text (first 200 chars):');
        console.log(text.substring(0, 200));
        console.log('\n🎉 OCR is working correctly!');
      } else {
        console.log('⚠️ OCR returned empty text - the page might be blank or image quality poor');
      }
    } finally {
      // Clean up on every exit path; the generated image lives inside tempDir,
      // so removing the directory removes it too. Cleanup stays best-effort.
      try {
        fs.rmSync(tempDir, { recursive: true, force: true });
      } catch (cleanupError) {
        console.warn('⚠️ Could not remove temp directory:', cleanupError.message);
      }
    }
  } catch (error) {
    console.error('❌ OCR test failed:', error.message);
    console.error('Stack:', error.stack);
  }
}
// Run the check as soon as the script is executed. The returned promise is not
// awaited: testOCROnLibrary wraps its work in try/catch and logs failures itself.
testOCROnLibrary();