Crawl-MCP

youtube.py•22.6 kB

"""
YouTube tools for Crawl4AI MCP Server.

Contains complete YouTube transcript extraction and video information tools.
"""

import asyncio
import os
import sys
from typing import Any, Dict, List, Optional, Annotated

from pydantic import Field

# Import models from the models module
from ..models import (
    YouTubeTranscriptResponse,
    YouTubeBatchRequest,
    YouTubeBatchResponse
)

# Import the YouTube processor
from ..youtube_processor import YouTubeProcessor

# Initialize YouTube processor
youtube_processor = YouTubeProcessor()

# Batch limits applied by batch_extract_youtube_transcripts.
_MAX_BATCH_URLS = 20
_DEFAULT_BATCH_CONCURRENCY = 3
_MAX_BATCH_CONCURRENCY = 5


def _as_dict(response: Any) -> Dict[str, Any]:
    """Return a transcript response as a plain dict.

    extract_youtube_transcript returns the ``model_dump()`` of its response
    model, i.e. a dict. Accepting both shapes here keeps the callers in this
    module working even if a model instance is handed in instead.
    """
    if isinstance(response, dict):
        return response
    return response.model_dump()


def _clamp_concurrency(requested: Any) -> int:
    """Coerce a requested concurrency into the range 1.._MAX_BATCH_CONCURRENCY.

    A value of 0 would create a semaphore nobody can acquire (the batch would
    hang forever) and a negative value makes asyncio.Semaphore raise, so the
    lower bound matters as much as the upper one.
    """
    try:
        value = int(requested)
    except (TypeError, ValueError):
        value = _DEFAULT_BATCH_CONCURRENCY
    return max(1, min(value, _MAX_BATCH_CONCURRENCY))


def _is_uvx_environment() -> bool:
    """Detect a UVX-style environment from the env vars / interpreter path."""
    return 'UV_PROJECT_ENVIRONMENT' in os.environ or 'UVX' in str(sys.executable)


def _enhance_extraction_error(base_error: Any, is_uvx_env: bool) -> str:
    """Append environment-specific troubleshooting hints to an error message."""
    if is_uvx_env:
        return (
            f"{base_error}\n\nUVX Environment Detected:\n"
            "- If this worked in STDIO local setup, the issue may be UVX environment isolation\n"
            "- YouTube API may behave differently in UVX vs local environments\n"
            "- Try running system diagnostics: get_system_diagnostics()\n"
            "- Consider switching to STDIO local setup for YouTube functionality"
        )
    return (
        f"{base_error}\n\nTroubleshooting:\n"
        "- Correct method name: 'extract_youtube_transcript' (not 'get_transcript')\n"
        "- Alternative methods: get_youtube_video_info, batch_extract_youtube_transcripts\n"
        "- Check if video has available captions"
    )


async def _apply_auto_summary(
    url: str,
    source_video_id: str,
    transcript_data: Dict[str, Any],
    metadata: Dict[str, Any],
    max_content_tokens: int,
    summary_length: str,
    include_timestamps: bool,
    llm_provider: Optional[str],
    llm_model: Optional[str],
) -> None:
    """Summarize the transcript in place when it exceeds the token budget.

    Mutates ``transcript_data`` (replacing 'full_text' / 'clean_text' with the
    summary on success) and ``metadata`` (recording what happened). Never
    raises: a summarization failure is recorded in ``metadata`` and the
    original transcript is left untouched.
    """
    full_text = transcript_data['full_text']

    # Rough token estimation: 1 token ≈ 4 characters
    estimated_tokens = len(full_text) // 4

    if estimated_tokens <= max_content_tokens:
        # Content is below threshold - preserve original content and add info
        metadata.update({
            'auto_summarize_requested': True,
            'original_content_preserved': True,
            'content_below_threshold': True,
            'tokens_estimate': estimated_tokens,
            'max_tokens_threshold': max_content_tokens,
            'reason': f'Content ({estimated_tokens} tokens) is below threshold ({max_content_tokens} tokens)'
        })
        return

    try:
        # Prepare video metadata for enhanced summarization
        video_metadata = {
            'title': metadata.get('title', ''),
            'url': url,
            'video_id': source_video_id,
            'channel': metadata.get('channel', ''),
            'description': metadata.get('description', '')
        }

        summary_result = await youtube_processor.summarize_transcript(
            transcript_text=full_text,
            summary_length=summary_length,
            include_timestamps=include_timestamps,
            llm_provider=llm_provider,
            llm_model=llm_model,
            video_metadata=video_metadata,
            target_tokens=max_content_tokens
        )

        if summary_result.get('success'):
            # Read the summary before touching metadata so that a malformed
            # result cannot leave 'summarization_applied': True behind while
            # the original text is still in place.
            summary_text = summary_result['summary']

            # Add enhanced summarization info to metadata
            metadata.update({
                'summarization_applied': True,
                'original_length': len(full_text),
                'original_tokens_estimate': estimated_tokens,
                'summary_length_setting': summary_length,
                'target_tokens': summary_result.get('target_tokens', max_content_tokens),
                'estimated_summary_tokens': summary_result.get('estimated_summary_tokens', 0),
                'compression_ratio': summary_result.get('compression_ratio', 0),
                'llm_provider': summary_result.get('llm_provider', 'unknown'),
                'llm_model': summary_result.get('llm_model', 'unknown'),
                'summarization_trigger': f'Content exceeded {max_content_tokens} tokens',
                # Preserve video metadata from summary
                'video_title_preserved': summary_result.get('video_title', ''),
                'video_url_preserved': summary_result.get('video_url', ''),
                'channel_name_preserved': summary_result.get('channel_name', ''),
                'key_topics_identified': summary_result.get('key_topics', [])
            })

            # Replace content with summary
            transcript_data['full_text'] = summary_text
            transcript_data['clean_text'] = summary_text
        else:
            # Summarization failed, add error info
            metadata.update({
                'summarization_attempted': True,
                'summarization_error': summary_result.get('error', 'Unknown error'),
                'original_content_preserved': True
            })
    except Exception as e:
        # Best effort by design: keep the original transcript and report why
        # the summary could not be produced.
        metadata.update({
            'summarization_attempted': True,
            'summarization_error': f'Exception during summarization: {str(e)}',
            'original_content_preserved': True
        })


async def extract_youtube_transcript(
    url: Annotated[str, Field(description="YouTube video URL")],
    languages: Annotated[Optional[List[str]], Field(description="Preferred languages in order of preference (default: ['ja', 'en'])")] = ["ja", "en"],
    translate_to: Annotated[Optional[str], Field(description="Target language for translation (default: None)")] = None,
    include_timestamps: Annotated[bool, Field(description="Include timestamps in transcript (default: True)")] = True,
    preserve_formatting: Annotated[bool, Field(description="Preserve original formatting (default: True)")] = True,
    include_metadata: Annotated[bool, Field(description="Include video metadata (default: True)")] = True,
    auto_summarize: Annotated[bool, Field(description="Automatically summarize long transcripts using LLM (default: False)")] = False,
    max_content_tokens: Annotated[int, Field(description="Maximum tokens before triggering auto-summarization (default: 15000)")] = 15000,
    summary_length: Annotated[str, Field(description="Summary length: 'short', 'medium', 'long' (default: 'medium')")] = "medium",
    llm_provider: Annotated[Optional[str], Field(description="LLM provider for summarization, auto-detected if not specified (default: None)")] = None,
    llm_model: Annotated[Optional[str], Field(description="Specific LLM model for summarization, auto-detected if not specified (default: None)")] = None
) -> YouTubeTranscriptResponse:
    """
    Extract YouTube video transcripts with timestamps and optional AI summarization.

    Works with public videos that have captions. No authentication required.
    Auto-detects available languages and falls back appropriately.

    Note: Automatic transcription may contain errors.

    Returns:
        The ``model_dump()`` of a YouTubeTranscriptResponse, i.e. a plain dict
        with at least 'success' and 'url'; failures carry 'error'. This
        function does not raise: every exception is reported in the result.
    """
    # NOTE(review): the return annotation names the model, but every path
    # returns its model_dump() dict. The annotation is left untouched because
    # tooling outside this file may introspect it - confirm before changing.
    try:
        # Check if URL is valid YouTube URL
        if not youtube_processor.is_youtube_url(url):
            response = YouTubeTranscriptResponse(
                success=False,
                url=url,
                error="URL is not a valid YouTube video URL"
            )
            return response.model_dump()

        # Extract video ID
        video_id = youtube_processor.extract_video_id(url)
        if not video_id:
            response = YouTubeTranscriptResponse(
                success=False,
                url=url,
                error="Could not extract video ID from URL"
            )
            return response.model_dump()

        # Hand the processor its own list so that nothing downstream can
        # mutate the default list object shared by every call.
        preferred_languages = languages.copy() if isinstance(languages, list) else languages

        # Process the YouTube URL
        result = await youtube_processor.process_youtube_url(
            url=url,
            languages=preferred_languages,
            translate_to=translate_to,
            include_timestamps=include_timestamps,
            preserve_formatting=preserve_formatting,
            include_metadata=include_metadata
        )

        if not result['success']:
            # Enhanced error messaging for different environments
            is_uvx_env = _is_uvx_environment()
            base_error = result.get('error', 'Unknown error during transcript extraction')

            response = YouTubeTranscriptResponse(
                success=False,
                url=url,
                video_id=video_id,
                error=_enhance_extraction_error(base_error, is_uvx_env),
                metadata={
                    'uvx_environment': is_uvx_env,
                    'correct_method_name': 'extract_youtube_transcript',
                    'alternative_methods': ['get_youtube_video_info', 'batch_extract_youtube_transcripts'],
                    'diagnostic_tool': 'get_system_diagnostics'
                }
            )
            return response.model_dump()

        # Get transcript data
        transcript_data = result['transcript']
        language_info = result.get('language_info', {})
        metadata = result.get('metadata', {})

        # Apply auto-summarization if requested and there is text to summarize
        if auto_summarize and transcript_data.get('full_text'):
            if metadata is None:
                # The summarization bookkeeping needs a dict to write into.
                metadata = {}
            await _apply_auto_summary(
                url=url,
                source_video_id=result.get('video_id', ''),
                transcript_data=transcript_data,
                metadata=metadata,
                max_content_tokens=max_content_tokens,
                summary_length=summary_length,
                include_timestamps=include_timestamps,
                llm_provider=llm_provider,
                llm_model=llm_model,
            )

        response = YouTubeTranscriptResponse(
            success=True,
            url=url,
            video_id=video_id,
            transcript=transcript_data,
            language_info=language_info,
            metadata=metadata,
            processing_method="youtube_transcript_api"
        )
        return response.model_dump()

    except Exception as e:
        # Tool boundary: report the failure in the response instead of raising.
        response = YouTubeTranscriptResponse(
            success=False,
            url=url,
            error=f"YouTube transcript processing error: {str(e)}"
        )
        return response.model_dump()


async def batch_extract_youtube_transcripts(
    request: Annotated[Dict[str, Any], Field(description="YouTubeBatchRequest with URLs and extraction parameters")]
) -> YouTubeBatchResponse:
    """
    Extract transcripts from multiple YouTube videos using youtube-transcript-api.

    Processes multiple YouTube URLs concurrently with controlled rate limiting.
    No authentication required for public videos with captions.

    Note: Automatic transcription may contain errors.

    Request keys read: 'urls', 'languages', 'max_concurrent' (clamped to 1-5),
    'include_timestamps', 'translate_to', 'preserve_formatting',
    'include_metadata'. At most 20 URLs are processed; extra URLs are dropped
    and the truncation is reported in the response metadata.

    Returns:
        A YouTubeBatchResponse; its ``success`` is False only when no URLs
        were given or the batch itself failed, per-URL failures are counted in
        ``failed_extractions``.
    """
    try:
        # Extract parameters from request
        urls = request.get('urls') or []
        languages = request.get('languages', ['ja', 'en'])
        max_concurrent = _clamp_concurrency(
            request.get('max_concurrent', _DEFAULT_BATCH_CONCURRENCY)
        )
        include_timestamps = request.get('include_timestamps', True)
        translate_to = request.get('translate_to')
        preserve_formatting = request.get('preserve_formatting', True)
        include_metadata = request.get('include_metadata', True)

        if not urls:
            return YouTubeBatchResponse(
                success=False,
                total_urls=0,
                successful_extractions=0,
                failed_extractions=0,
                results=[],
                error="No URLs provided in request"
            )

        # Limit number of URLs to prevent abuse
        requested_count = len(urls)
        if requested_count > _MAX_BATCH_URLS:
            urls = urls[:_MAX_BATCH_URLS]

        # Process URLs with controlled concurrency
        semaphore = asyncio.Semaphore(max_concurrent)

        async def process_single_url(url: str) -> Dict[str, Any]:
            async with semaphore:
                # Add small delay to prevent rate limiting
                await asyncio.sleep(0.1)
                return await extract_youtube_transcript(
                    url=url,
                    languages=languages,
                    translate_to=translate_to,
                    include_timestamps=include_timestamps,
                    preserve_formatting=preserve_formatting,
                    include_metadata=include_metadata
                )

        # Process all URLs concurrently
        tasks = [process_single_url(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Convert exceptions to error responses
        processed_results = []
        successful_count = 0
        failed_count = 0

        for url, result in zip(urls, results):
            if isinstance(result, Exception):
                processed_results.append(YouTubeTranscriptResponse(
                    success=False,
                    url=url,
                    error=f"Processing exception: {str(result)}"
                ))
                failed_count += 1
                continue

            processed_results.append(result)
            # Each result is the dict produced by extract_youtube_transcript,
            # so 'success' has to be looked up as a key, not as an attribute.
            if _as_dict(result).get('success'):
                successful_count += 1
            else:
                failed_count += 1

        return YouTubeBatchResponse(
            success=True,
            total_urls=len(urls),
            successful_extractions=successful_count,
            failed_extractions=failed_count,
            results=processed_results,
            metadata={
                'max_concurrent_used': max_concurrent,
                'processing_method': 'batch_youtube_transcript_api',
                'rate_limiting_applied': True,
                'requested_urls': requested_count,
                'urls_truncated': requested_count > len(urls)
            }
        )

    except Exception as e:
        # Count defensively: the request itself may be what was malformed.
        url_count = len(request.get('urls') or []) if isinstance(request, dict) else 0
        return YouTubeBatchResponse(
            success=False,
            total_urls=url_count,
            successful_extractions=0,
            failed_extractions=url_count,
            results=[],
            error=f"Batch processing error: {str(e)}"
        )


async def get_youtube_video_info(
    video_url: Annotated[str, Field(description="YouTube video URL")],
    summarize_transcript: Annotated[bool, Field(description="Summarize long transcripts using LLM (default: False)")] = False,
    max_tokens: Annotated[int, Field(description="Token limit before triggering summarization (default: 25000)")] = 25000,
    llm_provider: Annotated[Optional[str], Field(description="LLM provider for summarization (default: auto-detected)")] = None,
    llm_model: Annotated[Optional[str], Field(description="Specific model to use (default: auto-detected)")] = None,
    summary_length: Annotated[str, Field(description="Summary length - 'short', 'medium', 'long' (default: 'medium')")] = "medium",
    include_timestamps: Annotated[bool, Field(description="Preserve key timestamps in summary (default: True)")] = True
) -> Dict[str, Any]:
    """
    Get YouTube video information with optional transcript summarization.

    Retrieves basic video information and transcript availability using
    youtube-transcript-api. No authentication required for public videos.

    Note: Automatic transcription may contain errors.

    Returns:
        A dict with 'success' and 'url'; on success also 'video_id',
        'video_info', 'transcript_info' and 'processing_method', on failure
        'error'. A failure while inspecting transcripts does not fail the
        call; it is reported inside 'transcript_info'.
    """
    try:
        # Check if URL is valid YouTube URL
        if not youtube_processor.is_youtube_url(video_url):
            return {
                "success": False,
                "url": video_url,
                "error": "URL is not a valid YouTube video URL"
            }

        # Extract video ID
        video_id = youtube_processor.extract_video_id(video_url)
        if not video_id:
            return {
                "success": False,
                "url": video_url,
                "error": "Could not extract video ID from URL"
            }

        # Get basic video information
        video_info = youtube_processor.get_video_info(video_id)

        # Try to get transcript information
        transcript_info = {}
        try:
            # Get available transcript languages
            available_transcripts = youtube_processor.get_available_transcript_languages(video_id)
            # Entries without an 'is_generated' flag count as auto-generated.
            auto_generated_count = sum(
                1 for t in available_transcripts if t.get('is_generated', True)
            )

            transcript_info = {
                "has_transcript": len(available_transcripts) > 0,
                "available_languages": available_transcripts,
                "manually_created_count": len(available_transcripts) - auto_generated_count,
                "auto_generated_count": auto_generated_count
            }

            # If transcript is available and summarization is requested
            if transcript_info["has_transcript"] and summarize_transcript:
                # Extract transcript
                transcript_result = _as_dict(await extract_youtube_transcript(
                    url=video_url,
                    languages=["en", "ja"],  # Default languages
                    include_timestamps=include_timestamps,
                    auto_summarize=True,
                    max_content_tokens=max_tokens,
                    summary_length=summary_length,
                    llm_provider=llm_provider,
                    llm_model=llm_model
                ))

                if transcript_result.get('success'):
                    # Either field may be absent or None in the dumped response.
                    transcript = transcript_result.get('transcript') or {}
                    extraction_metadata = transcript_result.get('metadata') or {}
                    transcript_info.update({
                        "transcript_summary": transcript.get('full_text', ''),
                        "original_length": extraction_metadata.get('original_length', 0),
                        "summarization_applied": extraction_metadata.get('summarization_applied', False),
                        "processing_method": transcript_result.get('processing_method')
                    })
                else:
                    transcript_info["transcript_error"] = transcript_result.get('error')

        except Exception as e:
            # Transcript details are optional; report the problem and still
            # return the basic video information.
            transcript_info = {
                "has_transcript": False,
                "transcript_check_error": str(e)
            }

        return {
            "success": True,
            "url": video_url,
            "video_id": video_id,
            "video_info": video_info,
            "transcript_info": transcript_info,
            "processing_method": "youtube_video_info_api"
        }

    except Exception as e:
        return {
            "success": False,
            "url": video_url,
            "error": f"Video info processing error: {str(e)}"
        }


async def get_youtube_api_setup_guide() -> Dict[str, Any]:
    """
    Get setup information for youtube-transcript-api integration.

    Provides information about current youtube-transcript-api setup.
    No authentication or API keys required for basic transcript extraction.

    Returns:
        A static dict describing features, supported URL formats, languages,
        usage tips, limitations, error meanings and summarization settings.
    """
    # The guide is built from constants only, so nothing here can fail and no
    # error branch is needed.
    return {
        "success": True,
        "api_name": "youtube-transcript-api",
        "authentication_required": False,
        "api_key_required": False,
        "rate_limits": {
            "requests_per_minute": "No official limit, but recommended: 60 requests/minute",
            "concurrent_requests": "Recommended: 3-5 concurrent requests",
            "note": "Rate limiting is applied automatically in batch operations"
        },
        "supported_features": {
            "transcript_extraction": True,
            "multiple_languages": True,
            "auto_generated_transcripts": True,
            "manually_created_transcripts": True,
            "timestamp_support": True,
            "translation_support": True,
            "batch_processing": True,
            "video_metadata": True,
            "ai_summarization": True
        },
        "supported_youtube_formats": [
            "https://www.youtube.com/watch?v=VIDEO_ID",
            "https://youtu.be/VIDEO_ID",
            "https://youtube.com/watch?v=VIDEO_ID",
            "https://www.youtube.com/embed/VIDEO_ID",
            "https://www.youtube.com/v/VIDEO_ID"
        ],
        "language_support": {
            "extraction_languages": [
                "English (en)", "Japanese (ja)", "Spanish (es)", "French (fr)",
                "German (de)", "Italian (it)", "Portuguese (pt)", "Russian (ru)",
                "Chinese (zh)", "Korean (ko)", "Arabic (ar)", "Hindi (hi)",
                "And many more..."
            ],
            "translation_support": True,
            "auto_language_detection": True
        },
        "usage_tips": [
            "No API key required - works directly with YouTube's public transcript data",
            "Best results with videos that have manually created captions",
            "Auto-generated captions available for many videos",
            "Use batch processing for multiple videos to improve efficiency",
            "Enable AI summarization for long transcripts to reduce token usage",
            "Respect YouTube's terms of service when using transcript data"
        ],
        "limitations": [
            "Only works with public videos that have transcripts available",
            "Some videos may not have transcripts (especially older videos)",
            "Private or unlisted videos are not accessible",
            "Age-restricted videos may have limited access",
            "Live streams may not have stable transcript access"
        ],
        "error_handling": {
            "transcript_disabled": "Video owner has disabled transcripts",
            "no_transcript_found": "No transcript available in requested language",
            "video_unavailable": "Video is private, deleted, or restricted",
            "connection_error": "Network or YouTube service issue"
        },
        "ai_summarization": {
            "supported": True,
            "trigger_threshold": "15,000 tokens (approximately 60,000 characters)",
            "summary_lengths": ["short", "medium", "long"],
            "preserves_timestamps": True,
            "llm_providers": "Auto-detected based on configuration"
        }
    }

Loading blob content...

Latest Blog Posts

Don't Use Large Strings as Cache Keys
By punkpeye on January 11, 2026.
markdown
node-js
cache
What are Claude Skills?
By punkpeye on January 10, 2026.
mcp
skills
How to Test MCP Streamable HTTP Endpoints Using cURL
By punkpeye on January 2, 2026.
tutorial
bash

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/walksoda/crawl-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

youtube.py•22.6 kB