/**
* YouTube Content Extractor
* Extracts video metadata and transcripts from YouTube URLs using yt-dlp
*/
import { spawn } from 'child_process';
import { constants as fsConstants, promises as fsPromises } from 'fs';
import { homedir } from 'os';
import { join } from 'path';
import { YoutubeTranscript } from 'youtube-transcript';
import { createLearnLogger } from '../utils/custom-logger.js';
import { TranscriptAcquisition } from '../transcript-acquisition.js';
/**
 * Extracts metadata (via the yt-dlp CLI) and transcripts (via
 * TranscriptAcquisition) for a YouTube video URL.
 */
export class YouTubeExtractor {
  constructor() {
    this.logger = createLearnLogger('YouTubeExtractor');
    this.transcriptAcquisition = new TranscriptAcquisition({
      strategies: [
        'youtube_transcript_api',
        'yt_dlp_auto_subs',
        'yt_dlp_manual_subs'
      ],
      fallbackToAudio: false // Disable audio fallback for now
    });
  }

  /**
   * Check if URL is a valid YouTube URL.
   *
   * This is a case-insensitive substring test, not a full URL parse.
   *
   * @param {string} url - Candidate URL.
   * @returns {boolean} true when the string contains a known YouTube video
   *   URL pattern; false otherwise, including for non-string input.
   */
  canHandle(url) {
    try {
      const urlLower = url.toLowerCase();
      const patterns = [
        'youtube.com/watch',
        'youtu.be/',
        'youtube.com/embed/',
        'youtube.com/v/'
      ];
      return patterns.some((pattern) => urlLower.includes(pattern));
    } catch {
      // Non-string input (null, undefined, numbers, ...) is not a URL.
      return false;
    }
  }

  /**
   * Find yt-dlp executable path.
   *
   * Probes, in order: the YT_DLP_PATH environment variable (if set), the
   * per-user Python 3.9 bin directory under the current user's home, and
   * /usr/local/bin. The first candidate that exists and is executable wins.
   *
   * @returns {Promise<string>} Path to an executable file, or the bare
   *   command name 'yt-dlp' so that spawn() resolves it through PATH.
   */
  async findYtDlpPath() {
    const candidates = [
      process.env.YT_DLP_PATH,
      join(homedir(), 'Library', 'Python', '3.9', 'bin', 'yt-dlp'),
      '/usr/local/bin/yt-dlp'
    ].filter(Boolean);

    for (const candidate of candidates) {
      try {
        await fsPromises.access(candidate, fsConstants.X_OK);
        return candidate;
      } catch {
        // Missing or not executable: try the next candidate.
      }
    }
    return 'yt-dlp'; // fallback: let spawn() search PATH
  }

  /**
   * Get video information using yt-dlp.
   *
   * @param {string} url - Video URL handed to yt-dlp.
   * @returns {Promise<object>} The JSON document printed by
   *   `yt-dlp --dump-json`, parsed.
   * @throws {Error} When yt-dlp cannot be located or spawned, exits with a
   *   non-zero code, or prints output that is not valid JSON.
   */
  async getVideoInfoWithYtDlp(url) {
    const ytDlpArgs = [
      '--dump-json',
      '--no-download',
      '--no-warnings',
      // End of options: a URL that begins with '-' must never be parsed as
      // a yt-dlp flag.
      '--',
      url
    ];
    this.logger.debug('Running yt-dlp for metadata', { url, args: ytDlpArgs });

    // Resolve the path before building the Promise so that a failure here
    // rejects this call instead of being lost inside an async executor.
    const ytDlpPath = await this.findYtDlpPath();

    return new Promise((resolve, reject) => {
      const ytDlp = spawn(ytDlpPath, ytDlpArgs, {
        stdio: ['pipe', 'pipe', 'pipe']
      });
      let stdout = '';
      let stderr = '';

      // Decode through the stream so multi-byte UTF-8 characters that span
      // two chunks are not corrupted.
      ytDlp.stdout.setEncoding('utf8');
      ytDlp.stderr.setEncoding('utf8');
      ytDlp.stdout.on('data', (chunk) => {
        stdout += chunk;
      });
      ytDlp.stderr.on('data', (chunk) => {
        stderr += chunk;
      });

      ytDlp.on('close', (code) => {
        if (code !== 0) {
          reject(new Error(`yt-dlp failed with code ${code}: ${stderr}`));
          return;
        }
        try {
          resolve(JSON.parse(stdout));
        } catch (error) {
          reject(new Error(`Failed to parse yt-dlp output: ${error.message}`));
        }
      });
      ytDlp.on('error', (error) => {
        reject(new Error(`Failed to spawn yt-dlp: ${error.message}. Make sure yt-dlp is installed.`));
      });
    });
  }

  /**
   * Extract video ID from YouTube URL.
   *
   * Supported shapes: `youtu.be/<id>`, `youtube.com/watch?v=<id>`, and the
   * path-based forms `youtube.com/{embed,v,shorts,live}/<id>`.
   *
   * @param {string} url - YouTube URL.
   * @returns {string|null} The video ID, or null when the URL cannot be
   *   parsed or carries no ID.
   */
  extractVideoId(url) {
    try {
      const { hostname, pathname, searchParams } = new globalThis.URL(url);
      const segments = pathname.split('/').filter(Boolean);

      if (hostname.includes('youtu.be')) {
        // First path segment only, so a trailing slash or extra path does
        // not leak into the ID.
        return segments[0] || null;
      }

      if (hostname.includes('youtube.com')) {
        const fromQuery = searchParams.get('v');
        if (fromQuery) {
          return fromQuery;
        }
        // Path-based forms, which canHandle() also accepts.
        const pathPrefixes = ['embed', 'v', 'shorts', 'live'];
        if (pathPrefixes.includes(segments[0])) {
          return segments[1] || null;
        }
      }
      return null;
    } catch {
      // Not a parseable absolute URL.
      return null;
    }
  }

  /**
   * Run the transcript acquisition strategies for one video (helper for
   * extract()). Never throws: failures are reported in `transcriptError`.
   *
   * @param {string} url - Original video URL.
   * @param {string|null} videoId - ID from extractVideoId(); when falsy no
   *   acquisition is attempted.
   * @param {object} videoInfo - Metadata returned by yt-dlp.
   * @returns {Promise<{transcript: (string|null), transcriptData: *,
   *   transcriptError: (string|null), transcriptMethod: *,
   *   transcriptConfidence: *}>}
   */
  async acquireTranscript(url, videoId, videoInfo) {
    const result = {
      transcript: null,
      transcriptData: null,
      transcriptError: null,
      transcriptMethod: null,
      transcriptConfidence: 0
    };
    if (!videoId) {
      return result;
    }

    try {
      this.logger.info('Starting robust transcript acquisition', { videoId });
      const acquired = await this.transcriptAcquisition.getTranscript(
        url,
        videoId,
        videoInfo
      );
      if (acquired.transcript) {
        result.transcriptData = acquired.transcript;
        result.transcriptMethod = acquired.method;
        result.transcriptConfidence = acquired.confidence;

        // Convert to text format for backward compatibility
        const isSegmented = Array.isArray(result.transcriptData);
        result.transcript = isSegmented
          ? result.transcriptData.map((item) => item.text).join(' ')
          : result.transcriptData;

        this.logger.info('Transcript acquired successfully', {
          videoId,
          method: result.transcriptMethod,
          confidence: result.transcriptConfidence,
          transcriptLength: result.transcript.length,
          segmentCount: isSegmented ? result.transcriptData.length : 0
        });
      }
    } catch (error) {
      result.transcriptError = error.message;
      this.logger.warn('All transcript acquisition strategies failed', {
        videoId,
        error: error.message
      });
    }
    return result;
  }

  /**
   * Extract content from YouTube video using yt-dlp.
   *
   * Metadata retrieval is mandatory; transcript retrieval is best-effort
   * and its failure is reported in `content.transcriptError`.
   *
   * @param {string} url - YouTube video URL.
   * @returns {Promise<object>} Object with `type`, `url`, `metadata`,
   *   `content`, `extractedAt` and `extractionMethod`.
   * @throws {Error} When metadata retrieval fails; the underlying error is
   *   attached as `cause`.
   */
  async extract(url) {
    const startTime = Date.now();
    try {
      this.logger.extractionStart(url, 'youtube');

      // First try to get video metadata using yt-dlp
      const videoInfo = await this.getVideoInfoWithYtDlp(url);
      const videoId = this.extractVideoId(url);
      const {
        transcript,
        transcriptData,
        transcriptError,
        transcriptMethod,
        transcriptConfidence
      } = await this.acquireTranscript(url, videoId, videoInfo);

      const extractedContent = {
        type: 'youtube',
        url,
        metadata: {
          title: videoInfo.title || 'Unknown Title',
          description: videoInfo.description || '',
          duration: videoInfo.duration || 0,
          author: videoInfo.uploader || videoInfo.channel || 'Unknown',
          channelId: videoInfo.channel_id || videoInfo.uploader_id || null,
          videoId: videoId,
          publishDate: videoInfo.upload_date || null,
          viewCount: videoInfo.view_count || 0,
          language: videoInfo.language || 'unknown',
          keywords: videoInfo.tags || [],
          thumbnail: videoInfo.thumbnail || null
        },
        content: {
          transcript,
          transcriptData, // Raw transcript data with timestamps
          transcriptAvailable: !!transcript,
          transcriptError,
          transcriptMethod,
          transcriptConfidence,
          description: videoInfo.description || ''
        },
        extractedAt: new Date().toISOString(),
        extractionMethod: 'yt-dlp + youtube-transcript'
      };

      this.logger.extractionComplete(url, 'youtube', Date.now() - startTime, {
        title: extractedContent.metadata.title,
        hasTranscript: !!transcript,
        duration: extractedContent.metadata.duration
      });
      return extractedContent;
    } catch (error) {
      this.logger.extractionFailed(url, 'youtube', error);
      // Keep the original error reachable for callers that need the stack.
      throw new Error(`Failed to extract YouTube content: ${error.message}`, { cause: error });
    }
  }

  /**
   * Get video ID from YouTube URL (alias for extractVideoId).
   *
   * @param {string} url - YouTube URL.
   * @returns {string|null} The video ID, or null.
   */
  getVideoId(url) {
    return this.extractVideoId(url);
  }

  /**
   * Validate and normalize YouTube URL.
   *
   * @param {string} url - YouTube URL in any supported shape.
   * @returns {string} The canonical `https://www.youtube.com/watch?v=<id>`
   *   form when an ID can be extracted; otherwise the input unchanged.
   */
  normalizeUrl(url) {
    try {
      const videoId = this.extractVideoId(url);
      return videoId ? `https://www.youtube.com/watch?v=${videoId}` : url;
    } catch {
      return url;
    }
  }

  /**
   * Get estimated processing time.
   *
   * @param {string} url - Currently unused; the estimate is a fixed range.
   * @returns {{min: number, max: number, unit: string}} Estimated range.
   */
  getEstimatedProcessingTime(url) {
    // YouTube extraction is typically fast (5-30 seconds)
    return {
      min: 5,
      max: 30,
      unit: 'seconds',
    };
  }
}