/**
* PDF Content Extractor
* Extracts text content from PDF files (local or remote)
*/
// import pdfParse from 'pdf-parse'; // Temporarily disabled due to dependency issues
import fetch from 'node-fetch';
import { createLearnLogger } from '../utils/custom-logger.js';
/**
 * Extractor for PDF documents referenced by URL.
 *
 * NOTE: actual PDF parsing is currently stubbed out (the `pdf-parse`
 * dependency is disabled), so `extract()` always rejects. The helper
 * methods (title derivation, word counting, validation, estimates,
 * accessibility probing) are fully functional.
 */
export class PDFExtractor {
  constructor() {
    this.logger = createLearnLogger('PDFExtractor');
  }

  /**
   * Check whether a URL or path looks like it points at a PDF.
   *
   * The match is deliberately loose: any case-insensitive occurrence of
   * ".pdf" qualifies, so URLs with query strings or fragments after the
   * file name (e.g. "report.pdf?download=1") are accepted.
   *
   * @param {string} url - URL or path to inspect.
   * @returns {boolean} True when ".pdf" occurs in `url`; false for non-strings.
   */
  canHandle(url) {
    if (typeof url !== 'string') {
      return false;
    }
    return url.toLowerCase().includes('.pdf');
  }

  /**
   * Extract content from a PDF.
   *
   * Currently this always rejects: remote PDFs are fetched and validated,
   * but parsing is disabled; local paths are not supported at all.
   *
   * @param {string} url - http(s) URL of the PDF.
   * @returns {Promise<never>} Always rejects while parsing is disabled.
   * @throws {Error} "Failed to extract PDF content: <reason>", with the
   *   underlying error attached as `cause`.
   */
  async extract(url) {
    try {
      this.logger.debug('Starting PDF extraction', { url });

      // Require a real http(s) scheme. A bare prefix test on "http" would
      // misroute local paths such as "httpdocs/report.pdf" to fetch().
      if (!/^https?:\/\//i.test(url)) {
        // Handle local file (if needed in future)
        throw new Error('Local PDF files not supported yet');
      }

      const response = await fetch(url);
      if (!response.ok) {
        throw new Error(`Failed to fetch PDF: ${response.status} ${response.statusText}`);
      }

      // A mismatched Content-Type is only a warning: some servers deliver
      // PDFs as application/octet-stream.
      const contentType = response.headers.get('content-type');
      if (contentType && !contentType.includes('pdf')) {
        this.logger.warn('Content-Type is not PDF', { contentType, url });
      }

      const buffer = await response.buffer();
      const fileSize = buffer.length;
      this.logger.debug('Downloaded PDF', { url, fileSize });

      // Parse PDF - temporarily stubbed
      // const pdfData = await pdfParse(buffer);
      throw new Error(
        'PDF extraction temporarily disabled - dependency issues. Please use YouTube or article URLs for now.',
      );
    } catch (error) {
      this.logger.error('PDF extraction failed', {
        url,
        error: error.message,
        stack: error.stack,
      });
      // Keep the original error reachable for callers that need the stack.
      throw new Error(`Failed to extract PDF content: ${error.message}`, { cause: error });
    }
  }

  /**
   * Derive a human-readable title from the file name in a URL.
   *
   * The last path segment is percent-decoded, a trailing ".pdf" extension
   * (any case) is removed, and dashes/underscores become spaces.
   *
   * @param {string} url - Absolute URL of the PDF.
   * @returns {string} The derived title, or "Unknown PDF" when the URL
   *   cannot be parsed or yields an empty name.
   */
  extractTitleFromUrl(url) {
    try {
      const { pathname } = new globalThis.URL(url);
      const rawName = pathname.split('/').pop();

      let filename = rawName;
      try {
        filename = decodeURIComponent(rawName);
      } catch {
        // Malformed percent-encoding: fall back to the raw segment.
      }

      // Strip only a trailing extension so names like "a.pdf.notes.pdf"
      // keep their inner ".pdf".
      const title = filename
        .replace(/\.pdf$/i, '')
        .replace(/[-_]/g, ' ')
        .trim();
      return title || 'Unknown PDF';
    } catch {
      return 'Unknown PDF';
    }
  }

  /**
   * Count whitespace-separated words in a text.
   *
   * @param {string} text - Text to count.
   * @returns {number} Number of words; 0 for empty or non-string input.
   */
  countWords(text) {
    if (!text || typeof text !== 'string') return 0;
    return text.match(/\S+/g)?.length ?? 0;
  }

  /**
   * Validate that a value is a syntactically valid absolute URL that also
   * looks like a PDF according to `canHandle`.
   *
   * @param {string} url - Candidate URL.
   * @returns {boolean} True when the URL parses and `canHandle` accepts it.
   */
  validateUrl(url) {
    try {
      // eslint-disable-next-line no-new
      new globalThis.URL(url);
      return this.canHandle(url);
    } catch {
      return false;
    }
  }

  /**
   * Get an estimated processing time window based on file size.
   *
   * Buckets:
   *   under 1 MB   ->  5-15 seconds
   *   under 10 MB  -> 15-60 seconds
   *   otherwise    -> 60-180 seconds
   *   size unknown -> 10-60 seconds
   *
   * @param {string} url - Unused; kept for interface compatibility.
   * @param {?number} [fileSize=null] - File size in bytes; falsy values
   *   (null, 0, NaN) are treated as unknown.
   * @returns {{min: number, max: number, unit: string}} Estimated range.
   */
  getEstimatedProcessingTime(url, fileSize = null) {
    if (fileSize) {
      const sizeMB = fileSize / (1024 * 1024);
      if (sizeMB < 1) {
        return { min: 5, max: 15, unit: 'seconds' };
      }
      if (sizeMB < 10) {
        return { min: 15, max: 60, unit: 'seconds' };
      }
      return { min: 60, max: 180, unit: 'seconds' };
    }
    return { min: 10, max: 60, unit: 'seconds' };
  }

  /**
   * Probe a URL with a HEAD request to see whether it is reachable.
   *
   * Never rejects: network failures are reported in the result object.
   *
   * @param {string} url - URL to probe.
   * @returns {Promise<{accessible: boolean, status?: number,
   *   contentType?: ?string, contentLength?: ?string, error?: string}>}
   *   Response details on success (header values as returned by the
   *   server), or `{ accessible: false, error }` on failure.
   */
  async checkAccessibility(url) {
    try {
      const response = await fetch(url, { method: 'HEAD' });
      return {
        accessible: response.ok,
        status: response.status,
        contentType: response.headers.get('content-type'),
        contentLength: response.headers.get('content-length'),
      };
    } catch (error) {
      return {
        accessible: false,
        error: error.message,
      };
    }
  }
}