Speech MCP
by Kvadratni
- src
- speech_mcp
"""
Speech recognition module for speech-mcp.
This module provides centralized speech recognition functionality including:
- Model loading and initialization
- Audio transcription
- Fallback mechanisms
- Consistent error handling
It consolidates speech recognition code that was previously duplicated
across server.py and speech_ui.py.
"""
import os
import time
from typing import Optional, Tuple, Dict, Any, List, Union
# Import the centralized logger
from speech_mcp.utils.logger import get_logger
# Get a logger for this module
logger = get_logger(__name__, component="stt")
class SpeechRecognizer:
"""
Core speech recognition class that handles transcription of audio files.
This class provides a unified interface for speech recognition with fallback mechanisms.
It supports multiple speech recognition engines, with faster-whisper as the primary engine
and SpeechRecognition as a fallback.
"""
def __init__(self, model_name: str = "base", device: str = "cpu", compute_type: str = "int8"):
"""
Initialize the speech recognizer.
Args:
model_name: The name of the faster-whisper model to use (e.g., "base", "small", "medium")
device: The device to use for inference ("cpu" or "cuda")
compute_type: The compute type to use for inference ("int8", "float16", "float32")
"""
self.whisper_model = None
self.sr_recognizer = None
self.model_name = model_name
self.device = device
self.compute_type = compute_type
self.is_initialized = False
# Initialize the speech recognition models in the background
self._initialize_speech_recognition()
def _initialize_speech_recognition(self) -> bool:
"""
Initialize speech recognition models.
Returns:
bool: True if initialization was successful, False otherwise
"""
if self.is_initialized:
logger.info("Speech recognition already initialized")
return True
# Try to initialize faster-whisper first
try:
logger.info(f"Loading faster-whisper speech recognition model '{self.model_name}' on {self.device}...")
import faster_whisper
# Load the model with the specified parameters
self.whisper_model = faster_whisper.WhisperModel(
self.model_name,
device=self.device,
compute_type=self.compute_type
)
logger.info("faster-whisper model loaded successfully!")
self.is_initialized = True
return True
except ImportError as e:
logger.error(f"Failed to load faster-whisper: {e}")
logger.info("Trying to fall back to SpeechRecognition library...")
return self._initialize_speech_recognition_fallback()
except Exception as e:
logger.error(f"Error initializing faster-whisper: {e}")
logger.info("Trying to fall back to SpeechRecognition library...")
return self._initialize_speech_recognition_fallback()
def _initialize_speech_recognition_fallback(self) -> bool:
"""
Initialize fallback speech recognition using SpeechRecognition library.
Returns:
bool: True if initialization was successful, False otherwise
"""
try:
logger.info("Initializing SpeechRecognition fallback...")
import speech_recognition as sr
self.sr_recognizer = sr.Recognizer()
logger.info("SpeechRecognition library loaded successfully as fallback!")
self.is_initialized = True
return True
except ImportError as e:
logger.error(f"Failed to load SpeechRecognition: {e}")
logger.warning("Please install it with: pip install SpeechRecognition")
self.is_initialized = False
return False
except Exception as e:
logger.error(f"Error initializing SpeechRecognition: {e}")
self.is_initialized = False
return False
def transcribe(self, audio_file_path: str, language: str = "en") -> Tuple[str, Dict[str, Any]]:
"""
Transcribe an audio file using the available speech recognition engine.
Args:
audio_file_path: Path to the audio file to transcribe
language: Language code for transcription (default: "en" for English)
Returns:
Tuple containing:
- The transcribed text
- A dictionary with metadata about the transcription
"""
# Check if the file exists
if not os.path.exists(audio_file_path):
error_msg = f"Audio file not found: {audio_file_path}"
logger.error(error_msg)
return "", {"error": error_msg, "engine": "none"}
# Ensure speech recognition is initialized
if not self.is_initialized and not self._initialize_speech_recognition():
error_msg = "Failed to initialize speech recognition"
logger.error(error_msg)
return "", {"error": error_msg, "engine": "none"}
# Try faster-whisper first
if self.whisper_model is not None:
try:
logger.info(f"Transcribing audio with faster-whisper: {audio_file_path}")
transcription_start = time.time()
segments, info = self.whisper_model.transcribe(audio_file_path, beam_size=5)
# Collect all segments to form the complete transcription
transcription = ""
for segment in segments:
transcription += segment.text + " "
transcription = transcription.strip()
transcription_time = time.time() - transcription_start
logger.info(f"Transcription completed in {transcription_time:.2f}s: {transcription}")
logger.debug(f"Transcription info: {info}")
# Return the transcription and metadata
return transcription, {
"engine": "faster-whisper",
"model": self.model_name,
"time_taken": transcription_time,
"language": info.language,
"language_probability": info.language_probability,
"duration": info.duration
}
except Exception as e:
logger.error(f"Error transcribing with faster-whisper: {e}")
logger.info("Falling back to SpeechRecognition...")
# Fall back to SpeechRecognition if available
if self.sr_recognizer is not None:
try:
import speech_recognition as sr
logger.info(f"Transcribing audio with SpeechRecognition (fallback): {audio_file_path}")
transcription_start = time.time()
with sr.AudioFile(audio_file_path) as source:
audio_data = self.sr_recognizer.record(source)
transcription = self.sr_recognizer.recognize_google(audio_data, language=language)
transcription_time = time.time() - transcription_start
logger.info(f"Fallback transcription completed in {transcription_time:.2f}s: {transcription}")
# Return the transcription and metadata
return transcription, {
"engine": "speech_recognition",
"api": "google",
"time_taken": transcription_time
}
except Exception as e:
logger.error(f"Error transcribing with SpeechRecognition: {e}")
# If all methods fail, return an error
error_msg = "All speech recognition methods failed"
logger.error(error_msg)
return "", {"error": error_msg, "engine": "none"}
def cleanup_audio_file(self, audio_file_path: str) -> bool:
"""
Clean up a temporary audio file.
Args:
audio_file_path: Path to the audio file to clean up
Returns:
bool: True if the file was cleaned up successfully, False otherwise
"""
try:
if os.path.exists(audio_file_path):
logger.debug(f"Removing temporary audio file: {audio_file_path}")
os.unlink(audio_file_path)
return True
return False
except Exception as e:
logger.error(f"Error removing temporary audio file: {e}")
return False
def get_available_models(self) -> List[Dict[str, Any]]:
"""
Get a list of available speech recognition models.
Returns:
List of dictionaries containing model information
"""
models = []
# Add faster-whisper models if available
if self.whisper_model is not None:
models.extend([
{"name": "tiny", "engine": "faster-whisper", "description": "Fastest, least accurate"},
{"name": "base", "engine": "faster-whisper", "description": "Fast, good accuracy"},
{"name": "small", "engine": "faster-whisper", "description": "Balanced speed and accuracy"},
{"name": "medium", "engine": "faster-whisper", "description": "Good accuracy, slower"},
{"name": "large-v2", "engine": "faster-whisper", "description": "Best accuracy, slowest"}
])
# Add SpeechRecognition models if available
if self.sr_recognizer is not None:
models.append({
"name": "google",
"engine": "speech_recognition",
"description": "Google Speech-to-Text API (requires internet)"
})
return models
def get_current_model(self) -> Dict[str, Any]:
"""
Get information about the currently active model.
Returns:
Dictionary containing information about the current model
"""
if self.whisper_model is not None:
return {
"name": self.model_name,
"engine": "faster-whisper",
"device": self.device,
"compute_type": self.compute_type
}
elif self.sr_recognizer is not None:
return {
"name": "google",
"engine": "speech_recognition"
}
else:
return {
"name": "none",
"engine": "none",
"error": "No speech recognition model initialized"
}
def set_model(self, model_name: str, device: Optional[str] = None, compute_type: Optional[str] = None) -> bool:
"""
Set the speech recognition model to use.
Args:
model_name: The name of the model to use
device: The device to use for inference (optional)
compute_type: The compute type to use for inference (optional)
Returns:
bool: True if the model was set successfully, False otherwise
"""
# Update parameters if provided
if device is not None:
self.device = device
if compute_type is not None:
self.compute_type = compute_type
# If the model name is the same and already initialized, no need to reinitialize
if model_name == self.model_name and self.is_initialized and self.whisper_model is not None:
return True
# Update the model name
self.model_name = model_name
# Reset initialization state
self.is_initialized = False
self.whisper_model = None
# Reinitialize with the new model
return self._initialize_speech_recognition()
# Create a singleton instance for easy import
default_recognizer = SpeechRecognizer()
def transcribe_audio(audio_file_path: str, language: str = "en") -> str:
"""
Transcribe an audio file using the default speech recognizer.
This is a convenience function that uses the default recognizer instance.
Args:
audio_file_path: Path to the audio file to transcribe
language: Language code for transcription (default: "en" for English)
Returns:
The transcribed text
"""
transcription, _ = default_recognizer.transcribe(audio_file_path, language)
return transcription
def initialize_speech_recognition(
model_name: str = "base",
device: str = "cpu",
compute_type: str = "int8"
) -> bool:
"""
Initialize the default speech recognizer with the specified parameters.
Args:
model_name: The name of the faster-whisper model to use
device: The device to use for inference
compute_type: The compute type to use for inference
Returns:
bool: True if initialization was successful, False otherwise
"""
global default_recognizer
default_recognizer = SpeechRecognizer(model_name, device, compute_type)
return default_recognizer.is_initialized
def get_available_models() -> List[Dict[str, Any]]:
"""
Get a list of available speech recognition models.
Returns:
List of dictionaries containing model information
"""
return default_recognizer.get_available_models()
def get_current_model() -> Dict[str, Any]:
"""
Get information about the currently active model.
Returns:
Dictionary containing information about the current model
"""
return default_recognizer.get_current_model()