MCP-YouTube-Transcribe

youtube_tool.py•12.5 KiB

import logging
import os
import subprocess
import json
import multiprocessing
import whisper
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
from yt_dlp.utils import DownloadError
from pytube import YouTube
from pytube.exceptions import PytubeError

# This will get the logger that was configured in mcp_server.py
logger = logging.getLogger(__name__)

# A simple cache for the Whisper model so we don't reload it every time
_whisper_model = None


def _check_whisper_cpp():
    """Check if whisper.cpp is available in the system PATH."""
    try:
        result = subprocess.run(['whisper-cli', '--help'],
                                capture_output=True,
                                text=True)
        logger.info("whisper.cpp check result: %s", result.returncode == 0)
        return result.returncode == 0
    except FileNotFoundError:
        logger.info("whisper.cpp not found in PATH")
        return False


def _transcribe_with_whisper_cpp(audio_path):
    """
    Transcribe audio using whisper.cpp.
    Returns the transcript text if successful, None if failed.
    """
    try:
        logger.info("Attempting transcription with whisper.cpp...")
        model_path = os.path.join(os.path.dirname(__file__), "models", "ggml-tiny.bin")
        if not os.path.exists(model_path):
            logger.error(f"Model file not found at {model_path}")
            return None

        # Convert MP3 to WAV
        wav_path = audio_path.replace('.mp3', '.wav')
        logger.info(f"Converting {audio_path} to WAV format...")
        convert_result = subprocess.run(
            ['ffmpeg', '-y', '-i', audio_path, wav_path],
            capture_output=True,
            text=True
        )
        if convert_result.returncode != 0:
            logger.error(f"Failed to convert audio to WAV: {convert_result.stderr}")
            return None

        # Run whisper.cpp on the WAV file
        json_output_path = wav_path + '.json'

        # Determine thread count
        thread_env = os.getenv("WHISPER_THREADS")
        try:
            threads = int(thread_env) if thread_env else multiprocessing.cpu_count()
        except ValueError:
            threads = multiprocessing.cpu_count()

        cmd = [
            'whisper-cli',
            '-m', model_path,
            '-t', str(threads),  # multi-threading
            '--output-json-full',
            '--no-timestamps',
            wav_path
        ]
        logger.info(f"Running whisper.cpp command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True)
        logger.info(f"whisper.cpp return code: {result.returncode}")
        logger.info(f"whisper.cpp stderr: {result.stderr}")

        # Clean up the WAV file
        try:
            os.remove(wav_path)
        except Exception as e:
            logger.warning(f"Failed to remove temporary WAV file: {e}")

        if result.returncode == 0:
            try:
                # Read the JSON output from the file
                with open(json_output_path, 'r') as f:
                    output = json.load(f)

                # Clean up the JSON file
                try:
                    os.remove(json_output_path)
                except Exception as e:
                    logger.warning(f"Failed to remove JSON output file: {e}")

                # Extract text from the full JSON output
                segments = output.get('transcription', [])
                text = ' '.join(seg.get('text', '') for seg in segments)
                return text.strip()
            except (json.JSONDecodeError, FileNotFoundError) as e:
                logger.error(f"Failed to read whisper.cpp JSON output: {e}")
                return None
        else:
            logger.error(f"whisper.cpp failed with error: {result.stderr}")
            return None
    except Exception as e:
        logger.error(f"Error running whisper.cpp: {e}")
        return None


def _yt_dlp_hook(d):
    """
    A hook for yt-dlp to capture its progress. We use this to prevent it from
    printing directly to stdout, which would corrupt the JSON-RPC communication.
    """
    if d['status'] == 'finished':
        # The 'filename' key is provided once the download is complete.
        logger.info(f"yt-dlp hook: Finished downloading '{d.get('filename', 'unknown file')}'")
    # We ignore the 'downloading' status to prevent log spam.


def _download_audio_with_fallbacks(video_url: str, audio_path: str) -> bool:
    """
    Tries to download audio, first with yt-dlp, then with pytube as a fallback.
    Returns True if download is successful, False otherwise.
    """
    # --- Attempt 1: yt-dlp (Primary Method) ---
    try:
        logger.info(f"Attempting audio download with yt-dlp...")
        ydl_opts_audio = {
            'format': 'bestaudio/best',
            'outtmpl': audio_path,
            'quiet': True,
            'noprogress': True,
            'overwrites': True,
            'logger': logger,
            'progress_hooks': [_yt_dlp_hook],
        }
        with yt_dlp.YoutubeDL(ydl_opts_audio) as ydl:
            ydl.download([video_url])
        logger.info("Audio download with yt-dlp complete.")
        return True
    except DownloadError as e:
        logger.warning(f"yt-dlp failed to download audio: {e}. Falling back to pytube.")
    except Exception as e:
        logger.error(f"An unexpected error occurred with yt-dlp: {e}. Falling back to pytube.")

    # --- Attempt 2: pytube (Fallback Method) ---
    try:
        logger.info("Attempting audio download with pytube...")
        yt = YouTube(video_url)
        # Get the best available audio stream (usually .mp4 container with audio)
        audio_stream = yt.streams.filter(only_audio=True).order_by('abr').desc().first()
        if not audio_stream:
            logger.error("Pytube could not find any audio-only streams.")
            return False

        # pytube downloads to a file with its own name, so we download and then rename if needed.
        # Or, we can specify the filename directly.
        output_dir = os.path.dirname(audio_path)
        file_name = os.path.basename(audio_path)

        audio_stream.download(output_path=output_dir, filename=file_name)
        logger.info("Audio download with pytube complete.")
        return True
    except PytubeError as e:
        logger.error(f"Pytube also failed to download audio: {e}")
        return False
    except Exception as e:
        logger.error(f"An unexpected error occurred with pytube: {e}")
        return False


def get_youtube_transcript(query: str, force_whisper: bool = False) -> dict:
    """
    Searches for a YouTube video, downloads it, and returns the transcript.
    It will try to get an official transcript first, unless force_whisper is True.
    If force_whisper is True, it will try whisper.cpp first, then fall back to Python whisper.
    """
    global _whisper_model
    logger.info(f"get_youtube_transcript called with query: '{query}', force_whisper: {force_whisper}")

    try:
        # --- 1. Search for the video and get its info (using yt-dlp for robust search) ---
        logger.info("Searching for video with yt-dlp to get info...")
        ydl_opts_info = {
            'format': 'bestaudio/best',
            'noplaylist': True,
            'default_search': 'ytsearch',
            'quiet': True,
            'noprogress': True,
            'logger': logger,
        }
        with yt_dlp.YoutubeDL(ydl_opts_info) as ydl:
            info = ydl.extract_info(query, download=False)
            video_info = info['entries'][0] if 'entries' in info else info

        video_url = video_info.get("webpage_url")
        video_title = video_info.get("title", "Unknown Title")
        video_id = video_info.get("id")  # Get video ID for transcript API
        logger.info(f"Found video: '{video_title}' (ID: {video_id}) at {video_url}")

        transcript_text = None
        transcript_source = "Not Available"

        # --- 2. Try to get the official transcript FIRST (unless forced to Whisper) ---
        if not force_whisper and video_id:
            logger.info("Attempting to fetch official YouTube transcript...")
            try:
                transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
                # Prioritize manually created transcripts, then auto-generated
                transcript_obj = None
                for t in transcript_list:
                    if t.is_generated:
                        # Keep auto-generated as a fallback, but prefer manual
                        if transcript_obj is None:
                            transcript_obj = t
                    else:
                        # Found a manual transcript, use it immediately
                        transcript_obj = t
                        break  # Found best option, exit loop

                if transcript_obj:
                    # Fetch the actual transcript parts
                    transcript_parts = transcript_obj.fetch()
                    transcript_text = " ".join([item.text for item in transcript_parts])
                    transcript_source = f"Official YouTube Captions ({'Generated' if transcript_obj.is_generated else 'Manual'})"
                    logger.info("Successfully fetched official YouTube transcript.")
                else:
                    logger.info("No official transcript found for this video.")

            except NoTranscriptFound:
                logger.info("No official transcript found for this video (NoTranscriptFound exception).")
            except TranscriptsDisabled:
                logger.info("Transcripts are disabled for this video.")
            except Exception as e:
                logger.warning(f"Error fetching official transcript: {e}", exc_info=True)

        if transcript_text:
            return {
                "status": "success",
                "title": video_title,
                "url": video_url,
                "source": transcript_source,
                "transcript": transcript_text
            }

        # --- 3. If no official transcript or force_whisper, use Whisper (audio download + transcribe) ---
        # This block only runs if transcript_text is still None after the official transcript attempt
        logger.info(
            "Proceeding to audio download and Whisper transcription as no official transcript was available or force_whisper is True.")
        output_dir = os.path.join(os.path.dirname(__file__), "testing", "audio_cache")
        os.makedirs(output_dir, exist_ok=True)
        audio_path = os.path.join(output_dir, f"{video_info.get('id', 'default_id')}.mp3")

        logger.info(f"Downloading audio to: {audio_path}")
        download_successful = _download_audio_with_fallbacks(video_url, audio_path)

        if not download_successful:
            message = "Failed to download audio using all available methods (yt-dlp, pytube)."
            logger.error(message)
            return {"status": "error", "message": message}

        # First try whisper.cpp if available
        if _check_whisper_cpp():
            transcript_text = _transcribe_with_whisper_cpp(audio_path)
            if transcript_text:
                transcript_source = "whisper.cpp (AI Generated)"
                logger.info("Successfully transcribed using whisper.cpp")

        # Fall back to Python whisper if whisper.cpp failed or isn't available
        if transcript_text is None:
            logger.info("Falling back to Python whisper...")
            transcript_source = "Python Whisper (AI Generated)"

            if _whisper_model is None:
                logger.info("Whisper model not loaded. Loading 'tiny' model now...")
                _whisper_model = whisper.load_model("tiny")
                logger.info("Whisper model 'tiny' loaded successfully.")

            logger.info("Starting Python Whisper transcription...")
            result = _whisper_model.transcribe(audio_path, fp16=False)
            transcript_text = result["text"]
            logger.info("Python Whisper transcription complete.")

        return {
            "status": "success",
            "title": video_title,
            "url": video_url,
            "source": transcript_source,
            "transcript": transcript_text
        }

    except DownloadError as e:
        logger.error(f"yt-dlp download error during info extraction: {e}", exc_info=True)
        return {"status": "error", "message": f"Could not find video info or initial download failed: {e}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred in get_youtube_transcript: {e}", exc_info=True)
        return {"status": "error", "message": f"An unexpected error occurred: {str(e)}"}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/JackHP/MCP-YouTube-Transcribe'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

youtube_tool.py•12.5 KiB

import logging
import os
import subprocess
import json
import multiprocessing
import whisper
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
from yt_dlp.utils import DownloadError
from pytube import YouTube
from pytube.exceptions import PytubeError

# This will get the logger that was configured in mcp_server.py
logger = logging.getLogger(__name__)

# A simple cache for the Whisper model so we don't reload it every time
_whisper_model = None


def _check_whisper_cpp():
    """Check if whisper.cpp is available in the system PATH."""
    try:
        result = subprocess.run(['whisper-cli', '--help'],
                                capture_output=True,
                                text=True)
        logger.info("whisper.cpp check result: %s", result.returncode == 0)
        return result.returncode == 0
    except FileNotFoundError:
        logger.info("whisper.cpp not found in PATH")
        return False


def _transcribe_with_whisper_cpp(audio_path):
    """
    Transcribe audio using whisper.cpp.
    Returns the transcript text if successful, None if failed.
    """
    try:
        logger.info("Attempting transcription with whisper.cpp...")
        model_path = os.path.join(os.path.dirname(__file__), "models", "ggml-tiny.bin")
        if not os.path.exists(model_path):
            logger.error(f"Model file not found at {model_path}")
            return None

        # Convert MP3 to WAV
        wav_path = audio_path.replace('.mp3', '.wav')
        logger.info(f"Converting {audio_path} to WAV format...")
        convert_result = subprocess.run(
            ['ffmpeg', '-y', '-i', audio_path, wav_path],
            capture_output=True,
            text=True
        )
        if convert_result.returncode != 0:
            logger.error(f"Failed to convert audio to WAV: {convert_result.stderr}")
            return None

        # Run whisper.cpp on the WAV file
        json_output_path = wav_path + '.json'

        # Determine thread count
        thread_env = os.getenv("WHISPER_THREADS")
        try:
            threads = int(thread_env) if thread_env else multiprocessing.cpu_count()
        except ValueError:
            threads = multiprocessing.cpu_count()

        cmd = [
            'whisper-cli',
            '-m', model_path,
            '-t', str(threads),  # multi-threading
            '--output-json-full',
            '--no-timestamps',
            wav_path
        ]
        logger.info(f"Running whisper.cpp command: {' '.join(cmd)}")
        result = subprocess.run(cmd, capture_output=True, text=True)
        logger.info(f"whisper.cpp return code: {result.returncode}")
        logger.info(f"whisper.cpp stderr: {result.stderr}")

        # Clean up the WAV file
        try:
            os.remove(wav_path)
        except Exception as e:
            logger.warning(f"Failed to remove temporary WAV file: {e}")

        if result.returncode == 0:
            try:
                # Read the JSON output from the file
                with open(json_output_path, 'r') as f:
                    output = json.load(f)

                # Clean up the JSON file
                try:
                    os.remove(json_output_path)
                except Exception as e:
                    logger.warning(f"Failed to remove JSON output file: {e}")

                # Extract text from the full JSON output
                segments = output.get('transcription', [])
                text = ' '.join(seg.get('text', '') for seg in segments)
                return text.strip()
            except (json.JSONDecodeError, FileNotFoundError) as e:
                logger.error(f"Failed to read whisper.cpp JSON output: {e}")
                return None
        else:
            logger.error(f"whisper.cpp failed with error: {result.stderr}")
            return None
    except Exception as e:
        logger.error(f"Error running whisper.cpp: {e}")
        return None


def _yt_dlp_hook(d):
    """
    A hook for yt-dlp to capture its progress. We use this to prevent it from
    printing directly to stdout, which would corrupt the JSON-RPC communication.
    """
    if d['status'] == 'finished':
        # The 'filename' key is provided once the download is complete.
        logger.info(f"yt-dlp hook: Finished downloading '{d.get('filename', 'unknown file')}'")
    # We ignore the 'downloading' status to prevent log spam.


def _download_audio_with_fallbacks(video_url: str, audio_path: str) -> bool:
    """
    Tries to download audio, first with yt-dlp, then with pytube as a fallback.
    Returns True if download is successful, False otherwise.
    """
    # --- Attempt 1: yt-dlp (Primary Method) ---
    try:
        logger.info(f"Attempting audio download with yt-dlp...")
        ydl_opts_audio = {
            'format': 'bestaudio/best',
            'outtmpl': audio_path,
            'quiet': True,
            'noprogress': True,
            'overwrites': True,
            'logger': logger,
            'progress_hooks': [_yt_dlp_hook],
        }
        with yt_dlp.YoutubeDL(ydl_opts_audio) as ydl:
            ydl.download([video_url])
        logger.info("Audio download with yt-dlp complete.")
        return True
    except DownloadError as e:
        logger.warning(f"yt-dlp failed to download audio: {e}. Falling back to pytube.")
    except Exception as e:
        logger.error(f"An unexpected error occurred with yt-dlp: {e}. Falling back to pytube.")

    # --- Attempt 2: pytube (Fallback Method) ---
    try:
        logger.info("Attempting audio download with pytube...")
        yt = YouTube(video_url)
        # Get the best available audio stream (usually .mp4 container with audio)
        audio_stream = yt.streams.filter(only_audio=True).order_by('abr').desc().first()
        if not audio_stream:
            logger.error("Pytube could not find any audio-only streams.")
            return False

        # pytube downloads to a file with its own name, so we download and then rename if needed.
        # Or, we can specify the filename directly.
        output_dir = os.path.dirname(audio_path)
        file_name = os.path.basename(audio_path)

        audio_stream.download(output_path=output_dir, filename=file_name)
        logger.info("Audio download with pytube complete.")
        return True
    except PytubeError as e:
        logger.error(f"Pytube also failed to download audio: {e}")
        return False
    except Exception as e:
        logger.error(f"An unexpected error occurred with pytube: {e}")
        return False


def get_youtube_transcript(query: str, force_whisper: bool = False) -> dict:
    """
    Searches for a YouTube video, downloads it, and returns the transcript.
    It will try to get an official transcript first, unless force_whisper is True.
    If force_whisper is True, it will try whisper.cpp first, then fall back to Python whisper.
    """
    global _whisper_model
    logger.info(f"get_youtube_transcript called with query: '{query}', force_whisper: {force_whisper}")

    try:
        # --- 1. Search for the video and get its info (using yt-dlp for robust search) ---
        logger.info("Searching for video with yt-dlp to get info...")
        ydl_opts_info = {
            'format': 'bestaudio/best',
            'noplaylist': True,
            'default_search': 'ytsearch',
            'quiet': True,
            'noprogress': True,
            'logger': logger,
        }
        with yt_dlp.YoutubeDL(ydl_opts_info) as ydl:
            info = ydl.extract_info(query, download=False)
            video_info = info['entries'][0] if 'entries' in info else info

        video_url = video_info.get("webpage_url")
        video_title = video_info.get("title", "Unknown Title")
        video_id = video_info.get("id")  # Get video ID for transcript API
        logger.info(f"Found video: '{video_title}' (ID: {video_id}) at {video_url}")

        transcript_text = None
        transcript_source = "Not Available"

        # --- 2. Try to get the official transcript FIRST (unless forced to Whisper) ---
        if not force_whisper and video_id:
            logger.info("Attempting to fetch official YouTube transcript...")
            try:
                transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
                # Prioritize manually created transcripts, then auto-generated
                transcript_obj = None
                for t in transcript_list:
                    if t.is_generated:
                        # Keep auto-generated as a fallback, but prefer manual
                        if transcript_obj is None:
                            transcript_obj = t
                    else:
                        # Found a manual transcript, use it immediately
                        transcript_obj = t
                        break  # Found best option, exit loop

                if transcript_obj:
                    # Fetch the actual transcript parts
                    transcript_parts = transcript_obj.fetch()
                    transcript_text = " ".join([item.text for item in transcript_parts])
                    transcript_source = f"Official YouTube Captions ({'Generated' if transcript_obj.is_generated else 'Manual'})"
                    logger.info("Successfully fetched official YouTube transcript.")
                else:
                    logger.info("No official transcript found for this video.")

            except NoTranscriptFound:
                logger.info("No official transcript found for this video (NoTranscriptFound exception).")
            except TranscriptsDisabled:
                logger.info("Transcripts are disabled for this video.")
            except Exception as e:
                logger.warning(f"Error fetching official transcript: {e}", exc_info=True)

        if transcript_text:
            return {
                "status": "success",
                "title": video_title,
                "url": video_url,
                "source": transcript_source,
                "transcript": transcript_text
            }

        # --- 3. If no official transcript or force_whisper, use Whisper (audio download + transcribe) ---
        # This block only runs if transcript_text is still None after the official transcript attempt
        logger.info(
            "Proceeding to audio download and Whisper transcription as no official transcript was available or force_whisper is True.")
        output_dir = os.path.join(os.path.dirname(__file__), "testing", "audio_cache")
        os.makedirs(output_dir, exist_ok=True)
        audio_path = os.path.join(output_dir, f"{video_info.get('id', 'default_id')}.mp3")

        logger.info(f"Downloading audio to: {audio_path}")
        download_successful = _download_audio_with_fallbacks(video_url, audio_path)

        if not download_successful:
            message = "Failed to download audio using all available methods (yt-dlp, pytube)."
            logger.error(message)
            return {"status": "error", "message": message}

        # First try whisper.cpp if available
        if _check_whisper_cpp():
            transcript_text = _transcribe_with_whisper_cpp(audio_path)
            if transcript_text:
                transcript_source = "whisper.cpp (AI Generated)"
                logger.info("Successfully transcribed using whisper.cpp")

        # Fall back to Python whisper if whisper.cpp failed or isn't available
        if transcript_text is None:
            logger.info("Falling back to Python whisper...")
            transcript_source = "Python Whisper (AI Generated)"

            if _whisper_model is None:
                logger.info("Whisper model not loaded. Loading 'tiny' model now...")
                _whisper_model = whisper.load_model("tiny")
                logger.info("Whisper model 'tiny' loaded successfully.")

            logger.info("Starting Python Whisper transcription...")
            result = _whisper_model.transcribe(audio_path, fp16=False)
            transcript_text = result["text"]
            logger.info("Python Whisper transcription complete.")

        return {
            "status": "success",
            "title": video_title,
            "url": video_url,
            "source": transcript_source,
            "transcript": transcript_text
        }

    except DownloadError as e:
        logger.error(f"yt-dlp download error during info extraction: {e}", exc_info=True)
        return {"status": "error", "message": f"Could not find video info or initial download failed: {e}"}
    except Exception as e:
        logger.error(f"An unexpected error occurred in get_youtube_transcript: {e}", exc_info=True)
        return {"status": "error", "message": f"An unexpected error occurred: {str(e)}"}