/**
* Content Extractor Factory
* Routes URLs to appropriate extractors and provides unified interface
*/
import { YouTubeExtractor } from './youtube-extractor.js';
import { PDFExtractor } from './pdf-extractor.js';
import { ArticleExtractor } from './article-extractor.js';
import { createLearnLogger } from '../utils/custom-logger.js';
/**
 * Routes a URL to the first registered extractor that claims it and exposes
 * a unified interface for extraction, validation and accessibility checks.
 */
export class ContentExtractorFactory {
  /**
   * Creates the logger and one instance of each extractor.
   */
  constructor() {
    this.logger = createLearnLogger('ContentExtractorFactory');
    // Insertion order doubles as priority order: getExtractor() walks this
    // object front to back and returns the first extractor that matches.
    this.extractors = {
      youtube: new YouTubeExtractor(),
      pdf: new PDFExtractor(),
      article: new ArticleExtractor(),
    };
  }

  /**
   * Get appropriate extractor for URL.
   *
   * @param {string} url - URL to route.
   * @returns {{type: string, extractor: object}|null} The first registered
   *   extractor whose `canHandle(url)` is truthy, together with its registry
   *   key, or `null` when no extractor accepts the URL.
   */
  getExtractor(url) {
    for (const [type, extractor] of Object.entries(this.extractors)) {
      if (extractor.canHandle(url)) {
        return { type, extractor };
      }
    }
    return null;
  }

  /**
   * Extract content from URL using appropriate extractor.
   *
   * The extractor's result object is tagged in place with `extractorType`
   * and returned (it is not copied).
   *
   * @param {string} url - URL to extract content from.
   * @returns {Promise<object>} The extractor's result with `extractorType` set.
   * @throws {Error} When no extractor accepts the URL, or when the extractor
   *   resolves to something that cannot carry the `extractorType` tag.
   *   Failures raised by the extractor itself are logged and rethrown as-is.
   */
  async extractContent(url) {
    const extractorInfo = this.getExtractor(url);
    if (!extractorInfo) {
      throw new Error(`No suitable extractor found for URL: ${url}`);
    }

    const { type, extractor } = extractorInfo;
    this.logger.debug('Using extractor', {
      url,
      extractorType: type,
    });

    try {
      const content = await extractor.extract(url);
      // null/undefined/primitives cannot be tagged; fail with a message that
      // names the extractor instead of an opaque property-assignment TypeError.
      if (content === null || (typeof content !== 'object' && typeof content !== 'function')) {
        throw new Error(`Extractor '${type}' returned no content object for URL: ${url}`);
      }
      // Add extractor type to content
      content.extractorType = type;
      return content;
    } catch (error) {
      this.logger.error('Content extraction failed', {
        url,
        extractorType: type,
        // Thrown values are not guaranteed to be Error instances.
        error: error?.message ?? String(error),
      });
      throw error;
    }
  }

  /**
   * Check if URL can be processed.
   *
   * @param {string} url - URL to test.
   * @returns {boolean} `true` when some extractor accepts the URL.
   */
  canProcess(url) {
    return this.getExtractor(url) !== null;
  }

  /**
   * Get supported URL types.
   *
   * The list is static; it is not derived from the registered extractors.
   *
   * @returns {Array<{type: string, description: string, examples: string[]}>}
   */
  getSupportedTypes() {
    return [
      {
        type: 'youtube',
        description: 'YouTube videos with transcript extraction',
        examples: ['https://youtube.com/watch?v=...', 'https://youtu.be/...'],
      },
      {
        type: 'pdf',
        description: 'PDF documents (remote URLs)',
        examples: ['https://example.com/document.pdf'],
      },
      {
        type: 'article',
        description: 'Web articles and blog posts',
        examples: ['https://example.com/article', 'https://blog.example.com/post'],
      },
    ];
  }

  /**
   * Validate multiple URLs.
   *
   * @param {Iterable<string>|null|undefined} urls - URLs to classify; a
   *   missing list is treated as empty.
   * @returns {Array<{url: string, valid: boolean, type: string, reason: string}>}
   *   One entry per URL, in input order. `type` is `'unsupported'` when no
   *   extractor accepts the URL.
   */
  validateUrls(urls) {
    const results = [];
    for (const url of urls ?? []) {
      const extractorInfo = this.getExtractor(url);
      results.push({
        url,
        valid: !!extractorInfo,
        type: extractorInfo?.type || 'unsupported',
        reason: extractorInfo ? 'Supported' : 'No suitable extractor found',
      });
    }
    return results;
  }

  /**
   * Get estimated processing time for URL.
   *
   * @param {string} url - URL to estimate.
   * @returns {{min: number, max: number, unit: string}|*} A zero estimate
   *   (`{ min: 0, max: 0, unit: 'seconds' }`) when no extractor accepts the
   *   URL; otherwise whatever the matching extractor's
   *   `getEstimatedProcessingTime(url)` returns (presumably the same shape —
   *   confirm against the extractor implementations).
   */
  getEstimatedProcessingTime(url) {
    const extractorInfo = this.getExtractor(url);
    if (!extractorInfo) {
      return { min: 0, max: 0, unit: 'seconds' };
    }
    return extractorInfo.extractor.getEstimatedProcessingTime(url);
  }

  /**
   * Batch validate URLs accessibility.
   *
   * URLs are checked one at a time, in input order. A failure on one URL is
   * recorded in its result entry and does not stop the remaining checks.
   *
   * @param {Iterable<string>|null|undefined} urls - URLs to check; a missing
   *   list is treated as empty.
   * @returns {Promise<Array<object>>} One entry per URL. Each entry has `url`
   *   and, unless the extractor's own result says otherwise, `accessible`;
   *   entries for supported URLs also carry `extractorType`. Extractors
   *   without a `checkAccessibility` method are reported as accessible.
   */
  async checkUrlsAccessibility(urls) {
    const results = [];
    for (const url of urls ?? []) {
      const extractorInfo = this.getExtractor(url);
      if (!extractorInfo) {
        results.push({
          url,
          accessible: false,
          reason: 'Unsupported URL type',
        });
        continue;
      }

      const { type, extractor } = extractorInfo;
      try {
        // Check if extractor has accessibility check method
        if (typeof extractor.checkAccessibility === 'function') {
          const accessibilityResult = await extractor.checkAccessibility(url);
          results.push({
            url,
            ...accessibilityResult,
            extractorType: type,
          });
        } else {
          results.push({
            url,
            accessible: true,
            extractorType: type,
            reason: 'Accessibility check not available for this extractor',
          });
        }
      } catch (error) {
        results.push({
          url,
          accessible: false,
          extractorType: type,
          // Thrown values are not guaranteed to be Error instances; reading
          // `.message` off null/undefined would abort the whole batch.
          error: error?.message ?? String(error),
        });
      }
    }
    return results;
  }

  /**
   * Get extractor statistics.
   *
   * @returns {{availableExtractors: string[], supportedTypes: string[], totalExtractors: number}}
   *   Registry keys, the `type` of every entry from getSupportedTypes(), and
   *   the number of registered extractors.
   */
  getExtractorStats() {
    const availableExtractors = Object.keys(this.extractors);
    return {
      availableExtractors,
      supportedTypes: this.getSupportedTypes().map((t) => t.type),
      totalExtractors: availableExtractors.length,
    };
  }
}