Skip to main content
Glama

speech_to_text

Convert speech to text from audio files using ASR models, with options for timestamps, word boosting, and custom output directories.

Instructions

Convert speech to text with a given model and save the output text file to a given directory. Directory is optional, if not provided, the output file will be saved to $HOME/Desktop.

⚠️ COST WARNING: This tool makes an API call to Whissle which may incur costs. Only use when explicitly requested by the user.

Args:
    audio_file_path (str): Path to the audio file to transcribe
    model_name (str, optional): The name of the ASR model to use. Defaults to "en-NER"
    timestamps (bool, optional): Whether to include word timestamps
    boosted_lm_words (List[str], optional): Words to boost in recognition
    boosted_lm_score (int, optional): Score for boosted words (0-100)
    output_directory (str, optional): Directory where files should be saved.
        Defaults to $HOME/Desktop if not provided.

Returns:
    TextContent with the transcription and path to the output file.

Input Schema

TableJSON Schema
Name | Required | Description | Default
audio_file_path | Yes | |
model_name | No | | en-NER
timestamps | No | |
boosted_lm_words | No | |
boosted_lm_score | No | |

Implementation Reference

  • The handler function that implements the speech_to_text tool logic. It performs input validation on the audio file (existence, size, format), calls the Whissle client's speech_to_text method with retries and error handling, processes the response to extract transcript, timestamps, etc., and returns a dictionary with the results.
    def speech_to_text(audio_file_path: str, model_name: str = "en-NER", timestamps: bool = True, boosted_lm_words: List[str] = None, boosted_lm_score: int = 80) -> Dict:
        """Convert speech to text using Whissle API.

        The audio file is validated locally first (exists, is a regular file,
        is non-empty, has a supported extension, is at most 25 MB) so that no
        API call is made for input that is certain to be rejected. The request
        is then sent through ``client.speech_to_text`` with up to three
        attempts; ``handle_api_error`` decides whether a failure is retried.

        Args:
            audio_file_path: Path to the audio file to transcribe.
            model_name: Name of the ASR model to use. An empty value falls
                back to "en-NER".
            timestamps: Whether to request word timestamps.
            boosted_lm_words: Words to boost in recognition, or None.
            boosted_lm_score: Score applied to the boosted words.

        Returns:
            On success, a dict with "transcript", "duration_seconds" and
            "language_code", plus "timestamps" and "diarize_output" when the
            response carries them. On any failure, a dict with a single
            "error" key; this function does not raise.
        """
        try:
            # Check if file exists
            if not os.path.exists(audio_file_path):
                logger.error(f"Audio file not found: {audio_file_path}")
                return {"error": f"Audio file not found: {audio_file_path}"}

            # A directory passes the existence check but can never be uploaded
            if not os.path.isfile(audio_file_path):
                logger.error(f"Audio path is not a file: {audio_file_path}")
                return {"error": f"Audio path is not a file: {audio_file_path}"}

            # Check file size
            file_size = os.path.getsize(audio_file_path)
            if file_size == 0:
                logger.error(f"Audio file is empty: {audio_file_path}")
                return {"error": f"Audio file is empty: {audio_file_path}"}

            # Check file format
            file_ext = os.path.splitext(audio_file_path)[1].lower()
            if file_ext not in ('.wav', '.mp3', '.ogg', '.flac', '.m4a'):
                logger.error(f"Unsupported audio format: {file_ext}")
                return {"error": f"Unsupported audio format: {file_ext}. Supported formats: wav, mp3, ogg, flac, m4a"}

            # Check file size limits
            max_size_mb = 25
            file_size_mb = file_size / (1024 * 1024)
            if file_size > max_size_mb * 1024 * 1024:
                logger.error(f"File too large: {file_size_mb:.2f} MB")
                return {"error": f"File too large ({file_size_mb:.2f} MB). Maximum size is {max_size_mb} MB."}

            # Log the request details
            logger.info(f"Transcribing audio file: {audio_file_path}")
            logger.info(f"File size: {file_size_mb:.2f} MB")
            logger.info(f"File format: {file_ext}")

            # Honor the caller's model choice; only fall back to the default
            # when no model name was supplied at all.
            try_model = model_name or "en-NER"
            max_retries = 2
            # Never None, so the substring check after the loop is always safe.
            last_error = "No transcription attempt was made"

            for retry_count in range(max_retries + 1):
                try:
                    logger.info(f"Attempting transcription with model: {try_model} (Attempt {retry_count+1}/{max_retries+1})")
                    response = client.speech_to_text(
                        audio_file_path=audio_file_path,
                        model_name=try_model,
                        timestamps=timestamps,
                        boosted_lm_words=boosted_lm_words,
                        boosted_lm_score=boosted_lm_score
                    )
                except Exception as api_error:
                    error_msg = str(api_error)
                    logger.error(f"Error with model {try_model}: {error_msg}")
                    last_error = error_msg

                    # None means "retry"; anything else is a final error value.
                    error_result = handle_api_error(error_msg, "transcription", retry_count, max_retries)
                    if error_result is None:
                        continue
                    if retry_count == max_retries:
                        # Retries exhausted: report via the summary below.
                        break
                    return {"error": error_result}

                if response and hasattr(response, 'transcript'):
                    logger.info(f"Transcription successful with model: {try_model}")

                    result = {
                        "transcript": response.transcript,
                        "duration_seconds": getattr(response, 'duration_seconds', 0),
                        "language_code": getattr(response, 'language_code', 'en')
                    }

                    if hasattr(response, 'timestamps'):
                        result["timestamps"] = response.timestamps

                    if hasattr(response, 'diarize_output') and response.diarize_output:
                        result["diarize_output"] = response.diarize_output

                    return result

                # A response without a transcript is not retried.
                last_error = "No transcription was returned from the API"
                logger.error(f"No transcription returned from API with model {try_model}")
                break

            if "HTTP 500" in last_error:
                logger.error(f"All transcription attempts failed with HTTP 500: {last_error}")
                return {"error": f"Server error during transcription. This might be a temporary issue with the Whissle API. Please try again later or contact Whissle support. Error: {last_error}"}

            logger.error(f"All transcription attempts failed: {last_error}")
            return {"error": f"Failed to transcribe audio: {last_error}"}

        except Exception as e:
            # Last-resort boundary: callers always receive a dict.
            logger.error(f"Unexpected error during transcription: {str(e)}")
            return {"error": f"Failed to transcribe audio: {str(e)}"}
  • The @mcp.tool decorator registers the speech_to_text tool with the FastMCP server and defines its description, input parameters schema, and usage instructions.
    @mcp.tool(
        description="""Convert speech to text with a given model and save the output text file to a given directory.
        Directory is optional, if not provided, the output file will be saved to $HOME/Desktop.
    
        ⚠️ COST WARNING: This tool makes an API call to Whissle which may incur costs. Only use when explicitly requested by the user.
    
        Args:
            audio_file_path (str): Path to the audio file to transcribe
            model_name (str, optional): The name of the ASR model to use. Defaults to "en-NER"
            timestamps (bool, optional): Whether to include word timestamps
            boosted_lm_words (List[str], optional): Words to boost in recognition
            boosted_lm_score (int, optional): Score for boosted words (0-100)
            output_directory (str, optional): Directory where files should be saved.
                Defaults to $HOME/Desktop if not provided.
    
        Returns:
            TextContent with the transcription and path to the output file.
        """
    )
  • Helper function used by the speech_to_text handler to manage API errors, implement retries for HTTP 500 errors with exponential backoff, and generate user-friendly error messages for various HTTP status codes.
    def handle_api_error(error_msg, operation_name, retry_count=0, max_retries=2):
        """Classify an API error message and decide between retrying and failing.

        Args:
            error_msg: Text of the error; matched against "HTTP <code>" substrings.
            operation_name: Name of the operation, used in log and error text.
            retry_count: Zero-based index of the attempt that just failed.
            max_retries: Number of retries allowed after the first attempt.

        Returns:
            None when the caller should retry (HTTP 500 with retries left; the
            backoff sleep has already happened before returning). Otherwise
            the value produced by ``make_error`` for a descriptive message.
        """
        logger.error(f"API error during {operation_name}: {error_msg}")

        # Everything except a server error is final on the first occurrence.
        if "HTTP 500" not in error_msg:
            if "HTTP 413" in error_msg:
                return make_error(f"File too large. Please try a smaller file. Error: {error_msg}")
            if "HTTP 415" in error_msg:
                return make_error(f"Unsupported file format. Please use a supported format. Error: {error_msg}")
            if "HTTP 401" in error_msg or "HTTP 403" in error_msg:
                return make_error(f"Authentication error. Please check your API token. Error: {error_msg}")
            return make_error(f"API error during {operation_name}: {error_msg}")

        if retry_count < max_retries:
            # Exponential backoff: waits 2 ** (retry_count + 1) seconds
            wait_time = 2 ** (retry_count + 1)
            logger.info(f"HTTP 500 error during {operation_name}. Retrying in {wait_time} seconds... (Attempt {retry_count+1}/{max_retries+1})")
            time.sleep(wait_time)
            return None  # Signal to retry

        # Retries exhausted; give upload failures a more detailed explanation.
        upload_failed = "uploading file" in error_msg.lower()
        if upload_failed:
            return make_error(
                f"Server error during {operation_name}. The file upload to the Whissle API failed. "
                f"This could be due to:\n"
                f"1. Temporary server issues\n"
                f"2. File format compatibility issues\n"
                f"3. Network connectivity problems\n"
                f"Please try again later or contact Whissle support. Error: {error_msg}"
            )
        return make_error(
            f"Server error during {operation_name}. This might be a temporary issue with the Whissle API. "
            f"Please try again later or contact Whissle support. Error: {error_msg}"
        )

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/WhissleAI/whissle-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server