import whisper
import logging
import threading
import numpy as np
from typing import Dict, Any, Optional
from youtube_mcp_server.config import Config

logger = logging.getLogger("youtube-mcp-server")


class WhisperService:
    """Thin wrapper around a Whisper speech-to-text model for the MCP server.

    The model is loaded once at construction time; a lock serializes
    transcription calls so only one runs at a time.
    """

    def __init__(self):
        self.device = Config.get_device()
        self.model_name = Config.WHISPER_MODEL_NAME
        logger.info(f"Loading Whisper model '{self.model_name}' on {self.device}...")
        self.model = whisper.load_model(self.model_name, device=self.device)
        # Guards the shared model instance against concurrent transcribe() calls.
        self.lock = threading.Lock()

    def load_audio(self, path: str) -> np.ndarray:
        """Load an audio file via ffmpeg as 16 kHz mono float32 samples."""
        return whisper.load_audio(path)
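
    # Illustrative sketch (not part of the original service): because load_audio
    # returns 16 kHz mono samples, a segment can be cut by start/end time in
    # seconds with simple index math. The helper name slice_segment is hypothetical.
    def slice_segment(self, audio: np.ndarray, start_s: float, end_s: float) -> np.ndarray:
        """Return the [start_s, end_s) portion of a 16 kHz mono audio array."""
        sr = whisper.audio.SAMPLE_RATE  # 16000 Hz, the rate load_audio resamples to
        return audio[int(start_s * sr):int(end_s * sr)]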

    def transcribe_segment(
        self,
        audio_segment: np.ndarray,
        language: Optional[str] = None,
        task: str = "transcribe",
    ) -> str:
        """Transcribe a single numpy audio segment.

        Args:
            audio_segment: Mono float32 audio sampled at 16 kHz.
            language: Language hint, or None / "auto" for auto-detection.
            task: Either "transcribe" or "translate".

        Returns:
            The transcribed (or translated) text, stripped of surrounding whitespace.
        """
        with self.lock:
            # Whisper expects language=None for auto-detection; upstream callers
            # may pass the sentinel "auto", so map it here.
            lang_arg = language if language != "auto" else None
            result = self.model.transcribe(audio_segment, language=lang_arg, task=task)
        return result.get("text", "").strip()
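

# Usage sketch (illustrative only): load a file, take its first 30 seconds, and
# transcribe that segment. "example.wav" is a hypothetical placeholder path.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    service = WhisperService()
    audio = service.load_audio("example.wav")
    first_30s = audio[: 30 * whisper.audio.SAMPLE_RATE]  # 30 s of 16 kHz samples
    print(service.transcribe_segment(first_30s, language="auto"))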