Skip to main content
Glama

Resemble AI Voice Generation MCP Server

by obaid
resemble_ai_server.py (11 kB)
"""
Resemble AI Voice Generation MCP Server

This server implements an MCP interface for Resemble AI's voice generation API.
It allows LLMs like Claude Desktop to generate voice content and list available
voice models through natural language interactions.

Created: 2023-03-06
Updated: 2025-03-06 - Updated API endpoint to match latest Resemble AI documentation
Updated: 2025-03-06 - Fixed nested response structure handling
"""

import os
import json
import logging
import base64
from typing import Dict, List, Optional, Any, Union
from pathlib import Path

import requests
from dotenv import load_dotenv
from pydantic import BaseModel, Field

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s] %(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("resemble-mcp")

# Load environment variables
load_dotenv()

# Resemble AI API configuration
RESEMBLE_API_KEY = os.getenv("RESEMBLE_API_KEY")
# Updated API endpoint to match latest documentation
RESEMBLE_API_BASE_URL = "https://app.resemble.ai/api/v2"
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./output")
AUDIO_FORMAT = os.getenv("AUDIO_FORMAT", "mp3")
# Seconds to wait on any single HTTP call. Without a timeout, `requests`
# blocks forever on a stalled connection and the tool call never returns.
REQUEST_TIMEOUT = float(os.getenv("RESEMBLE_REQUEST_TIMEOUT", "120"))

# Create output directory if it doesn't exist
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Validate API key
if not RESEMBLE_API_KEY:
    logger.error("[Setup] Missing RESEMBLE_API_KEY in environment variables")
    raise ValueError("RESEMBLE_API_KEY environment variable is required")


# Define Pydantic models for request/response validation
class GenerateTTSRequest(BaseModel):
    """Request model for generate_tts tool."""

    text: str = Field(..., description="The text to convert to speech")
    voice_id: str = Field(..., description="The ID of the voice to use")
    return_type: str = Field("file", description="How to return the audio: 'file' or 'base64'")
    output_filename: Optional[str] = Field(None, description="Filename for the output (without extension)")


class ListVoicesResponse(BaseModel):
    """Response model for list_voices tool."""

    voices: List[Dict[str, Any]] = Field(..., description="List of available voices")


class GenerateTTSResponse(BaseModel):
    """Response model for generate_tts tool."""

    success: bool = Field(..., description="Whether the operation was successful")
    message: str = Field(..., description="Status message")
    audio_data: Optional[str] = Field(None, description="Base64-encoded audio data (if return_type is 'base64')")
    file_path: Optional[str] = Field(None, description="Path to the saved audio file (if return_type is 'file')")


# Helper function to extract audio URL from any response structure
def extract_audio_url(response: Dict[str, Any]) -> Optional[str]:
    """Extract audio URL from Resemble API response regardless of structure.

    Keys are tried in priority order ("audio_src", "url", "audio_url",
    "clip_url"); for each key the top-level response is checked before the
    nested "item" object. A key whose value is null/empty is skipped so that
    a usable URL elsewhere in the payload is still found.

    Args:
        response: Parsed JSON body returned by the API

    Returns:
        The first non-empty URL found, or None if there is none
    """
    if not isinstance(response, dict):
        return None

    item = response.get("item")
    nested = item if isinstance(item, dict) else {}

    for key in ("audio_src", "url", "audio_url", "clip_url"):
        for container in (response, nested):
            value = container.get(key)
            if value:
                return value

    # No audio URL found
    return None


def _safe_filename(name: str, fallback: str = "resemble_tts") -> str:
    """Reduce a caller-supplied name to a bare file name.

    Directory components (including "..") are dropped so the resulting file
    is always created directly inside OUTPUT_DIR.

    Args:
        name: Requested file name (without extension)
        fallback: Name to use when nothing usable remains

    Returns:
        A file name with no path separators
    """
    # Treat backslashes as separators too, so Windows-style paths are
    # stripped even when running on POSIX.
    candidate = Path(name.replace("\\", "/")).name
    if candidate in ("", ".", ".."):
        return fallback
    return candidate


# Resemble AI API Client
class ResembleClient:
    """Client for interacting with the Resemble AI API."""

    def __init__(self, api_key: str, base_url: str, timeout: float = REQUEST_TIMEOUT):
        """Initialize the Resemble AI client.

        Args:
            api_key: Resemble AI API key
            base_url: Base URL for the Resemble AI API
            timeout: Seconds to wait for each HTTP request
        """
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout
        # Updated header format based on documentation
        self.headers = {
            "Authorization": f"Token {self.api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        logger.info("[Setup] Initializing Resemble AI client")

    def get_voices(self) -> List[Dict[str, Any]]:
        """Get list of available voices.

        Returns:
            List of voice objects

        Raises:
            Exception: Any request or JSON decoding error is logged and re-raised
        """
        logger.info("[API] Fetching available voices")
        try:
            response = requests.get(
                f"{self.base_url}/voices",
                headers=self.headers,
                timeout=self.timeout
            )
            response.raise_for_status()

            # The payload is either {"items": [...]} or a bare list
            data = response.json()
            if isinstance(data, dict):
                if "items" not in data:
                    logger.warning("[API] Voices response has no 'items' field")
                voices = data.get("items", [])
            else:
                voices = data

            logger.info("[API] Successfully retrieved %d voices", len(voices))
            return voices
        except Exception as e:
            logger.error("[Error] Failed to fetch voices: %s", e)
            raise

    def generate_tts(self, text: str, voice_id: str) -> Dict[str, Any]:
        """Generate text-to-speech audio.

        The clip is created in the first project returned by the account's
        project listing.

        Args:
            text: Text to convert to speech
            voice_id: ID of the voice to use

        Returns:
            API response with audio data

        Raises:
            ValueError: If the account has no projects
            Exception: Any request or JSON decoding error is logged and re-raised
        """
        logger.info("[API] Generating TTS for text (length: %d)", len(text))
        try:
            # Get project UUID for the API call
            projects_response = requests.get(
                f"{self.base_url}/projects",
                headers=self.headers,
                timeout=self.timeout
            )
            projects_response.raise_for_status()
            projects_data = projects_response.json()

            # Use the first project
            if not projects_data.get('items'):
                raise ValueError("No projects found in your Resemble account")

            project_uuid = projects_data['items'][0]['uuid']
            logger.info("[API] Using project UUID: %s", project_uuid)

            # Generate the clip using the sync endpoint
            payload = {
                "body": text,
                "voice_uuid": voice_id,
                "output_format": AUDIO_FORMAT
            }

            response = requests.post(
                f"{self.base_url}/projects/{project_uuid}/clips/sync",
                headers=self.headers,
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            result = response.json()

            # Debug log the response structure
            logger.info("[Debug] TTS Response Keys: %s", list(result.keys()))

            audio_url = extract_audio_url(result)
            if audio_url:
                logger.info("[API] Successfully generated TTS, clip URL: %s", audio_url)
            else:
                logger.warning("[API] Generated TTS but could not find audio URL in response")

            return result
        except Exception as e:
            logger.error("[Error] Failed to generate TTS: %s", e)
            raise


# Initialize Resemble client
resemble_client = ResembleClient(RESEMBLE_API_KEY, RESEMBLE_API_BASE_URL)


# MCP Tools
def list_voices() -> ListVoicesResponse:
    """List available voice models from Resemble AI.

    Returns:
        ListVoicesResponse with available voices

    Raises:
        Exception: Any failure while fetching voices is logged and re-raised
    """
    logger.info("[Tool] Executing list_voices")
    try:
        voices = resemble_client.get_voices()

        # Format voice data for better readability
        formatted_voices = [
            {
                "id": voice.get("uuid"),
                "name": voice.get("name"),
                "gender": voice.get("gender", "Unknown"),
                "language": voice.get("language", "Unknown"),
                "accent": voice.get("accent", "Unknown"),
                "description": voice.get("description", "")
            }
            for voice in voices
        ]

        return ListVoicesResponse(voices=formatted_voices)
    except Exception as e:
        logger.error("[Error] list_voices failed: %s", e)
        raise


def generate_tts(text: str, voice_id: str, return_type: str = "file",
                 output_filename: Optional[str] = None) -> GenerateTTSResponse:
    """Generate voice audio from text.

    Failures are reported through the returned response (success=False)
    rather than raised.

    Args:
        text: Text to convert to speech
        voice_id: ID of the voice to use
        return_type: How to return the audio ('file' or 'base64'); any value
            other than 'base64' saves to a file
        output_filename: Filename for the output (without extension); any
            directory components are discarded

    Returns:
        GenerateTTSResponse with audio data or file path
    """
    logger.info("[Tool] Executing generate_tts with %d characters of text", len(text))
    try:
        # Generate the TTS
        result = resemble_client.generate_tts(text, voice_id)

        # Get the audio URL from the response using helper function
        audio_url = extract_audio_url(result)
        if not audio_url:
            return GenerateTTSResponse(
                success=False,
                message="No audio URL found in the response. Response structure may have changed."
            )

        # Download the audio file
        audio_response = requests.get(audio_url, timeout=REQUEST_TIMEOUT)
        audio_response.raise_for_status()
        audio_data = audio_response.content

        # Handle response based on return_type
        if return_type == "base64":
            # Convert to base64
            encoded_audio = base64.b64encode(audio_data).decode("utf-8")
            return GenerateTTSResponse(
                success=True,
                message="Audio generated successfully",
                audio_data=encoded_audio
            )

        # Save to file. Both the requested name and the default (derived from
        # voice_id) come from the caller, so strip any path components to keep
        # the write inside OUTPUT_DIR.
        requested_name = output_filename or f"resemble_tts_{voice_id.split('-')[0]}"
        safe_name = _safe_filename(requested_name)

        file_path = os.path.join(OUTPUT_DIR, f"{safe_name}.{AUDIO_FORMAT}")
        with open(file_path, "wb") as f:
            f.write(audio_data)

        return GenerateTTSResponse(
            success=True,
            message="Audio saved to file successfully",
            file_path=file_path
        )
    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising
        logger.error("[Error] generate_tts failed: %s", e)
        return GenerateTTSResponse(
            success=False,
            message=f"Error generating audio: {str(e)}"
        )


# Do not run the MCP server when imported, but register the necessary tools
if __name__ == "__main__":
    # Import MCP Server late to avoid circular imports
    from mcp.server import Server

    # NOTE(review): confirm that Server() without arguments, server.tool(fn)
    # and server.start() match the API of the installed MCP SDK version.

    # Create MCP Server
    server = Server()

    # Register tools
    server.tool(list_voices)
    server.tool(generate_tts)

    # Start the server
    logger.info("[Setup] Starting MCP Server for Resemble AI")
    server.start()

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/obaid/resemble-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.