"""
YouTube metadata and subtitle extraction utilities
Uses yt-dlp to extract video metadata and subtitles
"""
import json
import asyncio
import logging
from typing import Dict, Optional, Tuple, Any
logger = logging.getLogger(__name__)
async def extract_youtube_metadata(url: str, timeout: float = 10) -> Dict[str, Any]:
    """
    Extract YouTube video metadata using yt-dlp.

    Args:
        url: YouTube video URL
        timeout: Maximum seconds to wait for yt-dlp before giving up (default: 10)

    Returns:
        dict: Video metadata including title, description, etc.

    Raises:
        Exception: If yt-dlp is not installed, times out, exits non-zero,
            or produces output that is not valid JSON.
    """
    try:
        # --dump-json prints the metadata as one JSON object on stdout;
        # --no-playlist restricts extraction to the single video.
        cmd = [
            'yt-dlp',
            '--dump-json',
            '--no-playlist',
            url
        ]
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        try:
            stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout)
        except asyncio.TimeoutError:
            # Kill the stuck subprocess and reap it before surfacing the error.
            process.kill()
            await process.wait()
            raise Exception(f"YouTube metadata extraction timeout after {timeout} seconds")
        if process.returncode != 0:
            error_msg = stderr.decode() if stderr else "Unknown error"
            logger.error(f"yt-dlp metadata extraction error: {error_msg}")
            raise Exception(f"Failed to extract metadata: {error_msg}")
        return json.loads(stdout.decode())
    except FileNotFoundError:
        raise Exception("yt-dlp not found. Please install yt-dlp: pip install yt-dlp")
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse metadata JSON: {str(e)}")
        raise Exception(f"Failed to parse metadata: {str(e)}")
    except Exception as e:
        logger.error(f"Error extracting YouTube metadata: {str(e)}")
        raise
async def extract_youtube_subtitles(url: str, duration: int = 7, timeout: float = 30) -> str:
    """
    Extract YouTube subtitles for the first N seconds.

    Best-effort: returns an empty string on any failure so callers can fall
    back to audio transcription instead of crashing.

    Args:
        url: YouTube video URL
        duration: Duration in seconds to extract (default: 7)
        timeout: Maximum seconds to wait for the yt-dlp subprocess (default: 30)

    Returns:
        str: Extracted subtitle text, or "" if no subtitles were found.
    """
    try:
        import tempfile
        import os
        with tempfile.TemporaryDirectory() as temp_dir:
            subtitle_path = os.path.join(temp_dir, "subtitle")
            # Download subtitle files only (no media): prefer manual subs but
            # also accept auto-generated ones, in several language variants.
            cmd = [
                'yt-dlp',
                '--write-subs',
                '--write-auto-subs',
                '--sub-lang', 'en,ko,en-US,ko-KR',  # Try multiple languages
                '--sub-format', 'vtt',
                '--skip-download',
                '--no-playlist',
                '--output', subtitle_path,
                url
            ]
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )
            try:
                # Bound the wait so a hung yt-dlp cannot stall the caller
                # indefinitely (same safeguard as the metadata extractor).
                await asyncio.wait_for(process.communicate(), timeout=timeout)
            except asyncio.TimeoutError:
                process.kill()
                await process.wait()
                logger.warning(f"Subtitle download timed out after {timeout} seconds")
                return ""
            # yt-dlp appends the language code to the filename, so scan the
            # directory for the first .vtt file rather than guessing the name.
            subtitle_text = ""
            for file in os.listdir(temp_dir):
                if file.endswith('.vtt'):
                    subtitle_file = os.path.join(temp_dir, file)
                    try:
                        with open(subtitle_file, 'r', encoding='utf-8') as f:
                            content = f.read()
                        # Keep only cues starting within the first N seconds.
                        subtitle_text = parse_vtt_for_duration(content, duration)
                        break
                    except Exception as e:
                        logger.warning(f"Failed to read subtitle file: {str(e)}")
                        continue
            if not subtitle_text:
                logger.info("No subtitles found, will use audio transcription")
                return ""
            return subtitle_text
    except Exception as e:
        logger.warning(f"Error extracting YouTube subtitles: {str(e)}")
        return ""
def parse_vtt_for_duration(vtt_content: str, max_duration: int) -> str:
    """
    Parse VTT subtitle content and extract text for the first N seconds.

    Cues whose start time lies within ``max_duration`` seconds are kept; the
    scan stops at the first cue starting later, since VTT cues are listed in
    chronological order.

    Args:
        vtt_content: VTT file content
        max_duration: Maximum duration in seconds

    Returns:
        str: Extracted subtitle text, cue texts joined by single spaces
    """
    import re
    # Cue timing line, e.g. "00:00:10.000 --> 00:00:15.000".
    # The hours component is optional ("MM:SS.mmm") per the WebVTT spec.
    timestamp_re = re.compile(
        r'(?:(\d{1,2}):)?(\d{1,2}):(\d{2})\.(\d{3})\s*-->\s*'
        r'(?:(\d{1,2}):)?(\d{1,2}):(\d{2})\.(\d{3})'
    )
    lines = vtt_content.split('\n')
    subtitle_texts = []
    i = 0
    while i < len(lines):
        match = timestamp_re.match(lines[i].strip())
        if not match:
            # Header, cue identifier, blank line, or text outside a cue.
            i += 1
            continue
        hours = int(match.group(1) or 0)
        minutes = int(match.group(2))
        seconds = int(match.group(3))
        milliseconds = int(match.group(4))
        start_time = hours * 3600 + minutes * 60 + seconds + milliseconds / 1000.0
        # Cues are time-ordered, so the first cue past the cutoff ends the scan.
        if start_time > max_duration:
            break
        # Collect this cue's text lines (until a blank line or the next cue).
        i += 1
        text_lines = []
        while i < len(lines):
            text = lines[i].strip()
            if not text or timestamp_re.match(text):
                break
            if not text.startswith('<'):  # skip lines that are pure markup
                text_lines.append(text)
            i += 1
        if text_lines:
            subtitle_texts.append(' '.join(text_lines))
    return ' '.join(subtitle_texts)
async def get_youtube_text_info(url: str, duration: int = 7) -> Tuple[str, str, str]:
    """
    Get YouTube video text information: title, description, and subtitles.

    Args:
        url: YouTube video URL
        duration: Duration in seconds to extract subtitles (default: 7)

    Returns:
        tuple: (title, description, subtitles); empty strings on failure
    """
    try:
        metadata = await extract_youtube_metadata(url)
        subtitles = await extract_youtube_subtitles(url, duration)
        # Missing fields default to empty strings so the caller never sees None.
        return metadata.get('title', ''), metadata.get('description', ''), subtitles
    except Exception as e:
        logger.error(f"Error getting YouTube text info: {str(e)}")
        return "", "", ""