# resemble_ai_server.py (11 kB)
"""
Resemble AI Voice Generation MCP Server
This server implements an MCP interface for Resemble AI's voice generation API.
It allows LLMs like Claude Desktop to generate voice content and list available
voice models through natural language interactions.
Created: 2023-03-06
Updated: 2025-03-06 - Updated API endpoint to match latest Resemble AI documentation
Updated: 2025-03-06 - Fixed nested response structure handling
"""
import os
import json
import logging
import base64
from typing import Dict, List, Optional, Any, Union
from pathlib import Path
import requests
from dotenv import load_dotenv
from pydantic import BaseModel, Field
# Logging setup: INFO and above, each record rendered as
# "[LEVEL] YYYY-MM-DD HH:MM:SS - message".
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="[%(levelname)s] %(asctime)s - %(message)s",
    level=logging.INFO,
)
# Logger used by the functions and classes in this module.
logger = logging.getLogger("resemble-mcp")
# Load environment variables before any of the settings below are read
load_dotenv()

# Resemble AI API configuration
RESEMBLE_API_KEY = os.environ.get("RESEMBLE_API_KEY")
# API endpoint (updated to match the latest documentation)
RESEMBLE_API_BASE_URL = "https://app.resemble.ai/api/v2"
# Directory for saved audio and the audio format / file extension,
# both overridable through the environment
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "./output")
AUDIO_FORMAT = os.environ.get("AUDIO_FORMAT", "mp3")

# Make sure the output directory exists (parents included)
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# The module cannot work without an API key: log and abort at import time
if RESEMBLE_API_KEY is None or RESEMBLE_API_KEY == "":
    logger.error("[Setup] Missing RESEMBLE_API_KEY in environment variables")
    raise ValueError("RESEMBLE_API_KEY environment variable is required")
# Define Pydantic models for request/response validation
class GenerateTTSRequest(BaseModel):
    """Input schema for the generate_tts tool.

    ``text`` and ``voice_id`` are required; ``return_type`` defaults to
    "file" and ``output_filename`` defaults to None.
    """

    # Required inputs
    text: str = Field(
        default=...,
        description="The text to convert to speech",
    )
    voice_id: str = Field(
        default=...,
        description="The ID of the voice to use",
    )
    # Optional inputs
    return_type: str = Field(
        default="file",
        description="How to return the audio: 'file' or 'base64'",
    )
    output_filename: Optional[str] = Field(
        default=None,
        description="Filename for the output (without extension)",
    )
class ListVoicesResponse(BaseModel):
    """Output schema for the list_voices tool: a single required list of voice dicts."""

    voices: List[Dict[str, Any]] = Field(
        default=...,
        description="List of available voices",
    )
class GenerateTTSResponse(BaseModel):
    """Output schema for the generate_tts tool.

    ``success`` and ``message`` are always required; ``audio_data`` and
    ``file_path`` both default to None.
    """

    # Outcome of the call
    success: bool = Field(
        default=...,
        description="Whether the operation was successful",
    )
    message: str = Field(
        default=...,
        description="Status message",
    )
    # Payload fields, None unless filled in by the caller
    audio_data: Optional[str] = Field(
        default=None,
        description="Base64-encoded audio data (if return_type is 'base64')",
    )
    file_path: Optional[str] = Field(
        default=None,
        description="Path to the saved audio file (if return_type is 'file')",
    )
# Helper function to extract audio URL from any response structure
def extract_audio_url(response: Dict[str, Any]) -> Optional[str]:
    """Extract the audio URL from a Resemble API response regardless of structure.

    The URL may sit at the top level of the response or inside its ``"item"``
    object, under one of several key names. Candidates are tried in this order:

    1. ``response["audio_src"]``
    2. ``response["item"]["audio_src"]``
    3. for each of ``"url"``, ``"audio_url"``, ``"clip_url"``: the top-level
       value first, then the value inside ``"item"``.

    A candidate is skipped when it is missing, empty or not a string, so a
    placeholder such as ``"audio_src": None`` does not hide a usable URL
    stored elsewhere in the response.

    Args:
        response: Decoded JSON body of an API response.

    Returns:
        The first non-empty URL string found, or None when there is none
        (including when ``response`` is not a dict).
    """
    if not isinstance(response, dict):
        return None

    # Search the top level first, then the nested "item" object if present.
    containers = [response]
    nested = response.get("item")
    if isinstance(nested, dict):
        containers.append(nested)

    # "audio_src" is the preferred key; the others are fallbacks.
    for key in ("audio_src", "url", "audio_url", "clip_url"):
        for container in containers:
            candidate = container.get(key)
            if isinstance(candidate, str) and candidate:
                return candidate

    # No audio URL found
    return None
# Resemble AI API Client
class ResembleClient:
    """Client for interacting with the Resemble AI API.

    Every request carries the token-based headers built in ``__init__`` and is
    bounded by ``timeout`` so that a stalled connection cannot block the caller
    indefinitely.
    """

    # Default per-request timeout in seconds (handed to ``requests`` as ``timeout``).
    DEFAULT_TIMEOUT = 60.0

    def __init__(self, api_key: str, base_url: str, timeout: float = DEFAULT_TIMEOUT):
        """Initialize the Resemble AI client.

        Args:
            api_key: Resemble AI API key
            base_url: Base URL for the Resemble AI API
            timeout: Seconds to wait on each HTTP request before giving up
        """
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout
        # Header format based on the Resemble AI documentation
        self.headers = {
            "Authorization": f"Token {self.api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        logger.info("[Setup] Initializing Resemble AI client")

    @staticmethod
    def _extract_items(data: Any) -> List[Dict[str, Any]]:
        """Return the list of objects carried by a decoded list-endpoint payload.

        Accepts either a wrapper dict holding the list under ``"items"`` or a
        bare list. Anything else (a dict without a list under ``"items"``, None,
        a scalar) yields an empty list, so callers can always iterate the result.
        """
        if isinstance(data, dict):
            items = data.get("items")
            return items if isinstance(items, list) else []
        if isinstance(data, list):
            return data
        return []

    def get_voices(self) -> List[Dict[str, Any]]:
        """Get list of available voices.

        Returns:
            List of voice objects (empty when the response carries none)

        Raises:
            Exception: Any request or decoding error is logged and re-raised.
        """
        logger.info("[API] Fetching available voices")
        try:
            # NOTE(review): no paging parameters are sent, so only what the API
            # returns by default is listed - confirm whether `page` is required.
            response = requests.get(
                f"{self.base_url}/voices",
                headers=self.headers,
                timeout=self.timeout
            )
            response.raise_for_status()
            data = response.json()
            if isinstance(data, dict) and "items" not in data:
                # Unexpected shape: report it instead of handing the dict to callers
                logger.warning("[API] Voices response has no 'items' field")
            voices = self._extract_items(data)
            logger.info(f"[API] Successfully retrieved {len(voices)} voices")
            return voices
        except Exception as e:
            logger.error(f"[Error] Failed to fetch voices: {str(e)}")
            raise

    def generate_tts(self, text: str, voice_id: str) -> Dict[str, Any]:
        """Generate text-to-speech audio.

        Looks up the account's projects, uses the first one, and posts the text
        to that project's synchronous clip endpoint.

        Args:
            text: Text to convert to speech
            voice_id: ID of the voice to use

        Returns:
            Decoded API response describing the generated clip

        Raises:
            ValueError: If the account has no projects.
            Exception: Any request or decoding error is logged and re-raised.
        """
        logger.info(f"[API] Generating TTS for text (length: {len(text)})")
        try:
            # Get project UUID for the API call
            projects_response = requests.get(
                f"{self.base_url}/projects",
                headers=self.headers,
                timeout=self.timeout
            )
            projects_response.raise_for_status()
            projects = self._extract_items(projects_response.json())
            # Use the first project
            if not projects:
                raise ValueError("No projects found in your Resemble account")
            project_uuid = projects[0]['uuid']
            logger.info(f"[API] Using project UUID: {project_uuid}")

            # Generate the clip using the sync endpoint
            payload = {
                "body": text,
                "voice_uuid": voice_id,
                "output_format": AUDIO_FORMAT
            }
            response = requests.post(
                f"{self.base_url}/projects/{project_uuid}/clips/sync",
                headers=self.headers,
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            result = response.json()

            # Debug log the response structure (only meaningful for dict payloads)
            if isinstance(result, dict):
                logger.info(f"[Debug] TTS Response Keys: {list(result.keys())}")
            audio_url = extract_audio_url(result)
            if audio_url:
                logger.info(f"[API] Successfully generated TTS, clip URL: {audio_url}")
            else:
                logger.warning("[API] Generated TTS but could not find audio URL in response")
            return result
        except Exception as e:
            logger.error(f"[Error] Failed to generate TTS: {str(e)}")
            raise
# Shared client instance used by the tool functions below, built from the
# module-level API key and base URL
resemble_client = ResembleClient(
    api_key=RESEMBLE_API_KEY,
    base_url=RESEMBLE_API_BASE_URL,
)
# MCP Tools
def list_voices() -> ListVoicesResponse:
    """List available voice models from Resemble AI.

    Each voice object returned by the client is reduced to the keys ``id``
    (taken from ``uuid``), ``name``, ``gender``, ``language``, ``accent`` and
    ``description``. Absent gender/language/accent become "Unknown" and an
    absent description becomes an empty string.

    Returns:
        ListVoicesResponse with available voices

    Raises:
        Exception: Any failure while fetching or formatting is logged and re-raised.
    """
    logger.info("[Tool] Executing list_voices")
    # output key -> (key in the API voice object, fallback when that key is absent)
    field_map = {
        "id": ("uuid", None),
        "name": ("name", None),
        "gender": ("gender", "Unknown"),
        "language": ("language", "Unknown"),
        "accent": ("accent", "Unknown"),
        "description": ("description", ""),
    }
    try:
        summaries = [
            {
                out_key: raw_voice.get(src_key, fallback)
                for out_key, (src_key, fallback) in field_map.items()
            }
            for raw_voice in resemble_client.get_voices()
        ]
        return ListVoicesResponse(voices=summaries)
    except Exception as err:
        logger.error(f"[Error] list_voices failed: {str(err)}")
        raise
def generate_tts(text: str, voice_id: str, return_type: str = "file",
                 output_filename: Optional[str] = None) -> GenerateTTSResponse:
    """Generate voice audio from text.

    Asks the Resemble client to synthesize ``text``, downloads the resulting
    audio, and returns it either base64-encoded or saved under ``OUTPUT_DIR``.
    Failures are reported through the response object rather than raised.

    Args:
        text: Text to convert to speech
        voice_id: ID of the voice to use
        return_type: How to return the audio ('base64' returns the encoded
            bytes; any other value saves a file)
        output_filename: Filename for the output (without extension). Only the
            final path component is used, so the file always lands directly
            inside ``OUTPUT_DIR``. Defaults to a name derived from ``voice_id``.

    Returns:
        GenerateTTSResponse with audio data or file path; ``success`` is False
        and ``message`` describes the problem when anything goes wrong.
    """
    logger.info(f"[Tool] Executing generate_tts with {len(text)} characters of text")
    # Seconds to wait on the audio download so a stalled transfer cannot hang the tool
    download_timeout = 60
    try:
        # Generate the TTS
        result = resemble_client.generate_tts(text, voice_id)

        # Get the audio URL from the response using helper function
        audio_url = extract_audio_url(result)
        if not audio_url:
            return GenerateTTSResponse(
                success=False,
                message="No audio URL found in the response. Response structure may have changed."
            )

        # Download the audio file
        audio_response = requests.get(audio_url, timeout=download_timeout)
        audio_response.raise_for_status()
        audio_data = audio_response.content

        # Handle response based on return_type
        if return_type == "base64":
            # Convert to base64
            encoded_audio = base64.b64encode(audio_data).decode("utf-8")
            return GenerateTTSResponse(
                success=True,
                message="Audio generated successfully",
                audio_data=encoded_audio
            )

        # Save to file
        if not output_filename:
            output_filename = f"resemble_tts_{voice_id.split('-')[0]}"
        # The name comes from the tool caller: keep only its last path component
        # (treating both separator styles alike) so it cannot escape OUTPUT_DIR.
        safe_name = os.path.basename(output_filename.replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            safe_name = "resemble_tts"
        # The directory is created at import time but may have been removed since
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        file_path = os.path.join(OUTPUT_DIR, f"{safe_name}.{AUDIO_FORMAT}")
        with open(file_path, "wb") as f:
            f.write(audio_data)
        return GenerateTTSResponse(
            success=True,
            message="Audio saved to file successfully",
            file_path=file_path
        )
    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising
        logger.error(f"[Error] generate_tts failed: {str(e)}")
        return GenerateTTSResponse(
            success=False,
            message=f"Error generating audio: {str(e)}"
        )
# Only start the MCP server when this file is run as a script, not on import
if __name__ == "__main__":
    # Imported here rather than at the top of the file to avoid circular imports
    from mcp.server import Server

    # NOTE(review): Server() / .tool() / .start() are called exactly as before;
    # confirm they match the API of the installed `mcp` package.
    server = Server()

    # Register both tool functions with the server
    for tool_fn in (list_voices, generate_tts):
        server.tool(tool_fn)

    # Start the server
    logger.info("[Setup] Starting MCP Server for Resemble AI")
    server.start()