"""
YouTube metadata and subtitle extraction utilities
Uses yt-dlp to extract video metadata and subtitles
"""
import json
import asyncio
import logging
from typing import Dict, Optional, Tuple, Any
logger = logging.getLogger(__name__)
async def extract_youtube_metadata(url: str, timeout: float = 10) -> Dict[str, Any]:
    """
    Extract YouTube video metadata using yt-dlp.

    Args:
        url: YouTube video URL
        timeout: Maximum seconds to wait for yt-dlp before giving up (default: 10)

    Returns:
        dict: Video metadata including title, description, etc.

    Raises:
        Exception: If yt-dlp is not installed, times out, exits non-zero,
            or produces output that is not valid JSON.
    """
    try:
        # --dump-json prints the metadata as one JSON object on stdout;
        # --no-playlist restricts extraction to the single video.
        cmd = [
            'yt-dlp',
            '--dump-json',
            '--no-playlist',
            url
        ]
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        try:
            stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
        except asyncio.TimeoutError:
            # Kill the stuck subprocess and reap it before surfacing the error.
            process.kill()
            await process.wait()
            raise Exception(f"YouTube metadata extraction timeout after {timeout} seconds")
        if process.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown error"
            logger.error(f"yt-dlp metadata extraction error: {error_msg}")
            raise Exception(f"Failed to extract metadata: {error_msg}")
        return json.loads(stdout.decode())
    except FileNotFoundError:
        raise Exception("yt-dlp not found. Please install yt-dlp: pip install yt-dlp")
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse metadata JSON: {str(e)}")
        raise Exception(f"Failed to parse metadata: {str(e)}")
    except Exception as e:
        logger.error(f"Error extracting YouTube metadata: {str(e)}")
        raise
async def extract_youtube_subtitles(url: str, duration: int = 7, timeout: float = 30) -> str:
    """
    Extract YouTube subtitles for the first N seconds.

    Best-effort: returns an empty string on any failure so callers can fall
    back to audio transcription instead of crashing.

    Args:
        url: YouTube video URL
        duration: Duration in seconds to extract (default: 7)
        timeout: Maximum seconds to wait for the yt-dlp subprocess (default: 30)

    Returns:
        str: Extracted subtitle text, or "" if no subtitles were found.
    """
    try:
        import tempfile
        import os
        with tempfile.TemporaryDirectory() as temp_dir:
            subtitle_path = os.path.join(temp_dir, "subtitle")
            # Download subtitle files only (no media): prefer manual subs but
            # also accept auto-generated ones, in several language variants.
            cmd = [
                'yt-dlp',
                '--write-subs',
                '--write-auto-subs',
                '--sub-lang', 'en,ko,en-US,ko-KR',  # Try multiple languages
                '--sub-format', 'vtt',
                '--skip-download',
                '--no-playlist',
                '--output', subtitle_path,
                url
            ]
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            try:
                # Bound the wait so a hung yt-dlp cannot stall the caller
                # indefinitely (same safeguard as the metadata extractor).
                await asyncio.wait_for(process.communicate(), timeout=timeout)
            except asyncio.TimeoutError:
                process.kill()
                await process.wait()
                logger.warning(f"Subtitle download timed out after {timeout} seconds")
                return ""
            # yt-dlp appends the language code to the filename, so scan the
            # directory for the first .vtt file rather than guessing the name.
            subtitle_text = ""
            for file in os.listdir(temp_dir):
                if file.endswith('.vtt'):
                    subtitle_file = os.path.join(temp_dir, file)
                    try:
                        with open(subtitle_file, 'r', encoding='utf-8') as f:
                            content = f.read()
                        # Keep only cues starting within the first N seconds.
                        subtitle_text = parse_vtt_for_duration(content, duration)
                        break
                    except Exception as e:
                        logger.warning(f"Failed to read subtitle file: {str(e)}")
                        continue
            if not subtitle_text:
                logger.info("No subtitles found, will use audio transcription")
                return ""
            return subtitle_text
    except Exception as e:
        logger.warning(f"Error extracting YouTube subtitles: {str(e)}")
        return ""
def parse_vtt_for_duration(vtt_content: str, max_duration: int) -> str:
    """
    Parse VTT subtitle content and extract text for the first N seconds.

    Cues whose start time lies within ``max_duration`` seconds are kept; the
    scan stops at the first cue starting later, since VTT cues are listed in
    chronological order.

    Args:
        vtt_content: VTT file content
        max_duration: Maximum duration in seconds

    Returns:
        str: Extracted subtitle text, cue texts joined by single spaces
    """
    import re
    # Cue timing line, e.g. "00:00:10.000 --> 00:00:15.000".
    # The hours component is optional ("MM:SS.mmm") per the WebVTT spec.
    timestamp_re = re.compile(
        r'(?:(\d{1,2}):)?(\d{1,2}):(\d{2})\.(\d{3})\s*-->\s*'
        r'(?:(\d{1,2}):)?(\d{1,2}):(\d{2})\.(\d{3})'
    )
    lines = vtt_content.split('\n')
    subtitle_texts = []
    i = 0
    while i < len(lines):
        match = timestamp_re.match(lines[i].strip())
        if not match:
            # Header, cue identifier, blank line, or text outside a cue.
            i += 1
            continue
        hours = int(match.group(1) or 0)
        minutes = int(match.group(2))
        seconds = int(match.group(3))
        milliseconds = int(match.group(4))
        start_time = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000.0
        # Cues are time-ordered, so the first cue past the cutoff ends the scan.
        if start_time > max_duration:
            break
        # Collect this cue's text lines (until a blank line or the next cue).
        i += 1
        text_lines = []
        while i < len(lines):
            text = lines[i].strip()
            if not text or timestamp_re.match(text):
                break
            if not text.startswith('<'):  # skip lines that are pure markup
                text_lines.append(text)
            i += 1
        if text_lines:
            subtitle_texts.append(' '.join(text_lines))
    return ' '.join(subtitle_texts)
async def get_youtube_text_info(url: str, duration: int = 7) -> Tuple[str, str, str]:
    """
    Get YouTube video text information: title, description, and subtitles.

    Args:
        url: YouTube video URL
        duration: Duration in seconds to extract subtitles (default: 7)

    Returns:
        tuple: (title, description, subtitles); empty strings on failure
    """
    try:
        metadata = await extract_youtube_metadata(url)
        subtitles = await extract_youtube_subtitles(url, duration)
        # Missing fields default to empty strings so the caller never sees None.
        return metadata.get('title', ''), metadata.get('description', ''), subtitles
    except Exception as e:
        logger.error(f"Error getting YouTube text info: {str(e)}")
        return "", "", ""