"""Category B: Video Intelligence (Deep Dive) Tools
Provides detailed video analysis including transcripts, metadata,
chapters, thumbnails, and comments.
"""
import asyncio
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
from ..middleware.rate_limiter import rate_limiter
def extract_video_id(url_or_id: str) -> str:
    """Extract a YouTube video ID from a URL, or return the input as-is.

    Handles standard watch URLs (?v=...), short links (youtu.be/...), and
    shorts/embed/live path formats. Trailing query strings, fragments, and
    extra path segments are stripped from the extracted ID.

    Args:
        url_or_id: A YouTube URL in any common format, or a bare video ID.

    Returns:
        The extracted video ID, or the original string unchanged if it
        does not look like a YouTube URL.
    """
    if "youtube.com" in url_or_id or "youtu.be" in url_or_id:
        # Standard watch URL: .../watch?v=<id>[&...][#...]
        if "v=" in url_or_id:
            return url_or_id.split("v=")[1].split("&")[0].split("#")[0]
        # Path-based formats: youtu.be/<id>, /shorts/<id>, /embed/<id>, /live/<id>
        for marker in ("youtu.be/", "/shorts/", "/embed/", "/live/"):
            if marker in url_or_id:
                candidate = url_or_id.split(marker)[1]
                # Cut at the first delimiter that may follow the ID.
                for sep in ("?", "&", "#", "/"):
                    candidate = candidate.split(sep)[0]
                return candidate
    return url_or_id
@rate_limiter
async def get_transcript(video_id: str, languages: str = "en") -> str:
    """
    Fetch time-synced transcript/subtitles for a YouTube video.

    Retrieves both manual and auto-generated subtitles. This is CRITICAL for
    video content analysis and summarization.

    Args:
        video_id: YouTube video ID or URL
        languages: Comma-separated language codes (e.g., 'en,de,fr'). Default: 'en'

    Returns:
        Markdown-formatted transcript with timestamps, or an error message
        string (prefixed with ❌) if the transcript cannot be fetched.
    """
    video_id = extract_video_id(video_id)
    lang_list = [lang.strip() for lang in languages.split(",")]

    def _fetch():
        # Blocking network call; executed in a worker thread below so the
        # event loop is not stalled.
        return YouTubeTranscriptApi.get_transcript(video_id, languages=lang_list)

    try:
        transcript = await asyncio.to_thread(_fetch)
        # Accumulate into a list and join once -- repeated `+=` on a string
        # is quadratic for long transcripts.
        parts = [
            f"# Transcript for Video: {video_id}\n",
            f"**URL**: https://www.youtube.com/watch?v={video_id}\n\n",
            "---\n\n",
        ]
        for entry in transcript:
            total_seconds = int(entry['start'])
            hours, remainder = divmod(total_seconds, 3600)
            minutes, seconds = divmod(remainder, 60)
            # Include an hours field for long-form videos (>= 1 hour) so
            # timestamps never show minute values above 59.
            if hours:
                stamp = f"{hours:d}:{minutes:02d}:{seconds:02d}"
            else:
                stamp = f"{minutes:02d}:{seconds:02d}"
            parts.append(f"**[{stamp}]** {entry['text']}\n\n")
        return "".join(parts)
    except TranscriptsDisabled:
        return f"❌ Transcripts are disabled for video: {video_id}"
    except NoTranscriptFound:
        return f"❌ No transcript found for video: {video_id} in languages: {languages}"
    except Exception as e:
        return f"❌ Error fetching transcript for {video_id}: {str(e)}"
@rate_limiter
async def get_video_metadata(video_id: str) -> str:
    """
    Extract comprehensive metadata for a YouTube video.

    Returns views, tags, description, likes, publish date, duration,
    and more using yt-dlp's powerful extraction engine.

    Args:
        video_id: YouTube video ID or URL

    Returns:
        Markdown-formatted comprehensive video metadata, or an error
        message string (prefixed with ❌) on failure.
    """
    video_id = extract_video_id(video_id)
    url = f"https://www.youtube.com/watch?v={video_id}"
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
    }

    def _fetch():
        # Blocking network extraction; run in a worker thread below.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            return ydl.extract_info(url, download=False)

    try:
        info = await asyncio.to_thread(_fetch)
        # yt-dlp often returns explicit None for these keys (likes hidden,
        # comments disabled, etc.). dict.get's default does NOT apply to
        # present-but-None values, so coalesce with `or` to avoid a
        # TypeError from the `:,` format spec.
        views = info.get('view_count') or 0
        likes = info.get('like_count') or 0
        comment_count = info.get('comment_count') or 0
        duration = info.get('duration') or 0
        output = f"# Video Metadata: {info.get('title', 'N/A')}\n\n"
        output += f"**URL**: {url}\n\n"
        output += "## Basic Information\n"
        output += f"- **Title**: {info.get('title', 'N/A')}\n"
        output += f"- **Channel**: {info.get('channel', 'N/A')}\n"
        output += f"- **Channel ID**: {info.get('channel_id', 'N/A')}\n"
        output += f"- **Channel URL**: {info.get('channel_url', 'N/A')}\n"
        output += f"- **Upload Date**: {info.get('upload_date', 'N/A')}\n"
        output += f"- **Duration**: {duration} seconds\n\n"
        output += "## Engagement Metrics\n"
        output += f"- **Views**: {views:,}\n"
        output += f"- **Likes**: {likes:,}\n"
        output += f"- **Comments**: {comment_count:,}\n\n"
        # description can also be present-but-None; `or` keeps len() safe.
        description = info.get('description') or 'No description'
        output += "## Description\n"
        # Truncate very long descriptions to keep output compact.
        output += f"{description[:500]}...\n\n" if len(description) > 500 else f"{description}\n\n"
        tags = info.get('tags') or []
        if tags:
            output += "## Tags\n"
            output += ", ".join(tags[:20]) + "\n\n"
        categories = info.get('categories') or []
        if categories:
            output += "## Categories\n"
            output += ", ".join(categories) + "\n\n"
        thumbnail = info.get('thumbnail') or ''
        if thumbnail:
            output += f"## Thumbnail\n{thumbnail}\n\n"
        return output
    except Exception as e:
        return f"❌ Error fetching metadata for {video_id}: {str(e)}"
@rate_limiter
async def get_video_chapters(video_id: str) -> str:
    """
    Extract video chapters/key moments if available.

    Chapters provide structured navigation points for long-form content.

    Args:
        video_id: YouTube video ID or URL

    Returns:
        Markdown-formatted chapter timestamps and titles, a plain notice
        when the video has no chapters, or an error message (❌ prefix).
    """
    video_id = extract_video_id(video_id)
    url = f"https://www.youtube.com/watch?v={video_id}"
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
    }

    def _fetch():
        # Blocking metadata extraction; dispatched to a worker thread below.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            return ydl.extract_info(url, download=False)

    def _mmss(seconds_value):
        # Render a second count as zero-padded MM:SS.
        mins, secs = divmod(int(seconds_value), 60)
        return f"{mins:02d}:{secs:02d}"

    try:
        info = await asyncio.to_thread(_fetch)
        chapters = info.get('chapters', [])
        if not chapters:
            return f"No chapters available for video: {video_id}"
        pieces = [
            f"# Chapters for: {info.get('title', 'N/A')}\n",
            f"**URL**: {url}\n\n",
        ]
        for num, chap in enumerate(chapters, 1):
            begin = _mmss(chap.get('start_time', 0))
            finish = _mmss(chap.get('end_time', 0))
            pieces.append(f"## {num}. {chap.get('title', 'Untitled Chapter')}\n")
            pieces.append(f"**Time**: {begin} - {finish}\n\n")
        return "".join(pieces)
    except Exception as e:
        return f"❌ Error fetching chapters for {video_id}: {str(e)}"
@rate_limiter
async def get_thumbnail(video_id: str, quality: str = "maxres") -> str:
    """
    Get high-resolution thumbnail URL(s) for a video.

    Args:
        video_id: YouTube video ID or URL
        quality: Thumbnail quality - 'maxres', 'sd', 'hq', 'mq', 'default'

    Returns:
        Markdown-formatted thumbnail URLs. An unrecognized quality value
        yields a listing of every available quality tier instead.
    """
    video_id = extract_video_id(video_id)
    # YouTube serves thumbnails at predictable i.ytimg.com paths, one file
    # name suffix per quality tier -- no API call needed.
    suffix_by_quality = {
        'maxres': 'maxresdefault',
        'sd': 'sddefault',
        'hq': 'hqdefault',
        'mq': 'mqdefault',
        'default': 'default',
    }
    thumb_urls = {
        name: f'https://i.ytimg.com/vi/{video_id}/{suffix}.jpg'
        for name, suffix in suffix_by_quality.items()
    }
    header = (
        f"# Thumbnails for Video: {video_id}\n\n"
        f"**Video URL**: https://www.youtube.com/watch?v={video_id}\n\n"
    )
    if quality in thumb_urls:
        return header + f"## {quality.upper()} Quality\n{thumb_urls[quality]}\n\n"
    # Unknown quality requested: fall back to listing every tier.
    listing = "\n".join(
        f"- **{name.upper()}**: {link}" for name, link in thumb_urls.items()
    )
    return header + "## All Available Qualities\n" + listing + "\n"
@rate_limiter
async def get_comments(video_id: str, max_comments: int = 20) -> str:
    """
    Fetch top comments from a YouTube video.

    ⚠️ WARNING: This operation is SLOWER than others. Use strict limits.

    Args:
        video_id: YouTube video ID or URL
        max_comments: Maximum number of comments (1-50, default: 20)

    Returns:
        Markdown-formatted top comments, a plain notice when the video has
        no comments, or an error message string (❌ prefix) on failure.
    """
    video_id = extract_video_id(video_id)
    url = f"https://www.youtube.com/watch?v={video_id}"
    # Clamp to a sane range regardless of caller input.
    max_comments = max(1, min(max_comments, 50))
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,
        'getcomments': True,
        # NOTE: 'max_comments' is not a top-level YoutubeDL option -- the
        # YouTube extractor reads it from extractor_args. Without this cap,
        # yt-dlp downloads ALL comments before we slice, which is very slow.
        'extractor_args': {'youtube': {'max_comments': [str(max_comments)]}},
    }

    def _fetch():
        # Blocking network extraction; run in a worker thread below.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            # 'comments' can be present-but-None; normalize to a list.
            return info.get('comments') or []

    try:
        comments = await asyncio.to_thread(_fetch)
        if not comments:
            return f"No comments available for video: {video_id}"
        # Slice BEFORE building the header so the advertised count matches
        # the number of comments actually shown.
        comments = comments[:max_comments]
        output = f"# Top {len(comments)} Comments\n"
        output += f"**Video**: https://www.youtube.com/watch?v={video_id}\n\n"
        for idx, comment in enumerate(comments, 1):
            author = comment.get('author') or 'Unknown'
            text = comment.get('text') or ''
            # like_count may be present-but-None; `or 0` keeps `:,` safe.
            likes = comment.get('like_count') or 0
            output += f"## {idx}. {author} ({likes:,} likes)\n"
            output += f"{text}\n\n"
            output += "---\n\n"
        return output
    except Exception as e:
        return f"❌ Error fetching comments for {video_id}: {str(e)}"