# resemble_ai_sdk_server.py
"""
Resemble AI Voice Generation MCP Server (SDK Version)
This server implements an MCP interface for Resemble AI's voice generation API
using the official Resemble SDK. It allows LLMs like Claude Desktop to generate
voice content and list available voice models through natural language interactions.
Created: 2025-03-06
Updated: 2025-03-06 - Fixed nested response structure handling
"""
import os
import base64
import logging
import json
from typing import Dict, List, Optional, Any
from pathlib import Path
import requests
from dotenv import load_dotenv
from pydantic import BaseModel, Field
# Configure logging
# Root logging is configured once at import time; every message emitted through
# `logger` below uses the "[LEVEL] timestamp - message" layout.
logging.basicConfig(
    format='[%(levelname)s] %(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
logger = logging.getLogger("resemble-mcp-sdk")

# Load environment variables (python-dotenv; presumably from a local .env file)
load_dotenv()

# Resemble AI API configuration, read from the process environment.
# OUTPUT_DIR and AUDIO_FORMAT fall back to "./output" and "mp3" when unset.
RESEMBLE_API_KEY = os.environ.get("RESEMBLE_API_KEY")
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "./output")
AUDIO_FORMAT = os.environ.get("AUDIO_FORMAT", "mp3")

# Create the output directory (and any missing parents) if it doesn't exist.
# Note that this happens before the API key is validated.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Validate API key: without it the module refuses to import.
if not RESEMBLE_API_KEY:
    logger.error("[Setup] Missing RESEMBLE_API_KEY in environment variables")
    raise ValueError("RESEMBLE_API_KEY environment variable is required")

# Import and initialize the Resemble SDK. The import is guarded so that a
# missing package produces an actionable log message before the ImportError
# propagates to the caller.
try:
    from resemble import Resemble
    Resemble.api_key(RESEMBLE_API_KEY)
    logger.info("[Setup] Successfully initialized Resemble SDK")
except ImportError as import_error:
    logger.error("[Error] Failed to import Resemble SDK: %s", import_error)
    logger.error("[Error] Make sure to install the SDK with: pip install resemble")
    raise
# Define Pydantic models for request/response validation
class GenerateTTSRequest(BaseModel):
    """Request model for generate_tts tool."""
    # The four fields mirror the parameters of generate_tts() one-to-one.
    # No function in the visible part of this file instantiates this model.

    # Required: the text to synthesize.
    text: str = Field(..., description="The text to convert to speech")
    # Required: identifier of the Resemble voice to speak with.
    voice_id: str = Field(..., description="The ID of the voice to use")
    # Defaults to "file". Declared as a plain str, so the model itself does not
    # restrict the value to 'file'/'base64'.
    return_type: str = Field("file", description="How to return the audio: 'file' or 'base64'")
    # Optional base name for the saved file; defaults to None.
    output_filename: Optional[str] = Field(None, description="Filename for the output (without extension)")
class ListVoicesResponse(BaseModel):
    """Response model for list_voices tool."""
    # Required. list_voices() fills this with one dict per voice, using the
    # keys "id", "name", "gender", "language", "accent" and "description".
    voices: List[Dict[str, Any]] = Field(..., description="List of available voices")
class GenerateTTSResponse(BaseModel):
    """Response model for generate_tts tool."""
    # Required: True when audio was produced, False for every failure path.
    success: bool = Field(..., description="Whether the operation was successful")
    # Required: human-readable status or error text.
    message: str = Field(..., description="Status message")
    # Set by generate_tts() only on the base64 branch; otherwise stays None.
    audio_data: Optional[str] = Field(None, description="Base64-encoded audio data (if return_type is 'base64')")
    # Set by generate_tts() only when the audio was written to disk; otherwise None.
    file_path: Optional[str] = Field(None, description="Path to the saved audio file (if return_type is 'file')")
# Helper function to extract audio URL from any response structure
def extract_audio_url(response: Dict[str, Any]) -> Optional[str]:
    """Extract audio URL from Resemble API response regardless of structure.

    Candidate locations are probed in a fixed priority order and the first
    non-empty string wins:

    1. ``audio_src`` at the top level, then inside ``item``
    2. ``url``, ``audio_url``, ``clip_url`` -- each at the top level first,
       then inside ``item``

    A key that is present but holds ``None``, an empty/blank string, or a
    non-string value is skipped rather than returned, so a placeholder in one
    location cannot hide a real URL stored in another.

    Args:
        response: Decoded API response. Anything that is not a dict (including
            ``None``) yields ``None`` instead of raising.

    Returns:
        The URL string, or ``None`` when no usable URL is present.
    """
    if not isinstance(response, dict):
        return None

    # Containers to search, in priority order: the response itself, then the
    # nested "item" payload when it is a dict.
    containers = [response]
    nested_item = response.get("item")
    if isinstance(nested_item, dict):
        containers.append(nested_item)

    for key in ("audio_src", "url", "audio_url", "clip_url"):
        for container in containers:
            candidate = container.get(key)
            if isinstance(candidate, str) and candidate.strip():
                return candidate

    # No audio URL found
    return None
# MCP Tools
def list_voices() -> ListVoicesResponse:
    """List available voice models from Resemble AI.

    Voices are fetched page by page (10 per request). Paging continues while
    the response reports further pages through an integer ``num_pages`` field
    and the current page is non-empty. If ``num_pages`` is missing, only the
    first page is returned.

    Returns:
        ListVoicesResponse whose ``voices`` entries are dicts with the keys
        ``id``, ``name``, ``gender``, ``language``, ``accent`` and
        ``description``.

    Raises:
        RuntimeError: If the API answers with an explicit ``success: false``.
        Exception: Any error raised by the SDK is logged and re-raised.
    """
    logger.info("[Tool] Executing list_voices")
    page_size = 10
    try:
        voices: List[Dict[str, Any]] = []
        page = 1
        while True:
            response = Resemble.v2.voices.all(page, page_size)

            # Surface API-level failures instead of reporting "no voices".
            if response.get('success') is False:
                raise RuntimeError(
                    f"Resemble API error: {response.get('message', 'no error message provided')}"
                )

            # `or []` also covers an explicit `"items": null`.
            items = response.get('items') or []
            voices.extend(items)

            # Stop on an empty page, on the last page, or when the response
            # carries no usable page count.
            total_pages = response.get('num_pages')
            if not items or not isinstance(total_pages, int) or page >= total_pages:
                break
            page += 1

        logger.info(f"[API] Successfully retrieved {len(voices)} voices")

        # Format voice data for better readability
        formatted_voices = [
            {
                "id": voice.get("uuid"),
                "name": voice.get("name"),
                "gender": voice.get("gender", "Unknown"),
                "language": voice.get("language", "Unknown"),
                "accent": voice.get("accent", "Unknown"),
                "description": voice.get("description", ""),
            }
            for voice in voices
        ]
        return ListVoicesResponse(voices=formatted_voices)
    except Exception as e:
        logger.error(f"[Error] list_voices failed: {str(e)}")
        raise
def generate_tts(text: str, voice_id: str, return_type: str = "file",
                 output_filename: Optional[str] = None) -> GenerateTTSResponse:
    """Generate voice audio from text.

    The clip is created synchronously in the first project of the account,
    downloaded from the URL in the API response, and then either returned as
    base64 or written to ``OUTPUT_DIR``.

    Args:
        text: Text to convert to speech. Must be a non-empty string.
        voice_id: ID of the voice to use
        return_type: How to return the audio ('file' or 'base64'). Any value
            other than 'base64' is treated as 'file'.
        output_filename: Filename for the output (without extension). Only the
            final path component is used, so the file always lands directly
            inside ``OUTPUT_DIR``. Defaults to ``resemble_tts_<first
            dash-separated segment of voice_id>``.

    Returns:
        GenerateTTSResponse with audio data or file path. Failures are
        reported with ``success=False`` and an explanatory message rather
        than raised.
    """
    # Reject empty input up front instead of spending an API call on it.
    if not isinstance(text, str) or not text.strip():
        return GenerateTTSResponse(
            success=False,
            message="Text must be a non-empty string"
        )

    logger.info(f"[Tool] Executing generate_tts with {len(text)} characters of text")
    try:
        # Get the first project UUID
        projects = Resemble.v2.projects.all(1, 10)
        project_items = projects.get('items')
        if not project_items:
            return GenerateTTSResponse(
                success=False,
                message="No projects found in your Resemble account"
            )
        project_uuid = project_items[0]['uuid']
        logger.info(f"[API] Using project UUID: {project_uuid}")

        # Generate TTS synchronously
        result = Resemble.v2.clips.create_sync(
            project_uuid,
            voice_id,
            text,
            output_format=AUDIO_FORMAT
        )

        # Debug: Log the full response structure
        logger.info(f"[Debug] TTS Response Keys: {list(result.keys())}")

        # An explicit API failure gets its own message; otherwise it would be
        # misreported below as a changed response structure.
        if result.get("success") is False:
            return GenerateTTSResponse(
                success=False,
                message=f"Resemble API rejected the request: {result.get('message', 'no error message provided')}"
            )

        # Extract audio URL using helper function
        audio_url = extract_audio_url(result)
        if not audio_url:
            return GenerateTTSResponse(
                success=False,
                message="No audio URL found in the response. Response structure may have changed."
            )
        logger.info(f"[Debug] Found audio URL: {audio_url}")

        # Download the audio file. requests has no default timeout, so set
        # (connect, read) limits to keep a stalled download from blocking the
        # tool call indefinitely.
        audio_response = requests.get(audio_url, timeout=(10, 60))
        audio_response.raise_for_status()
        audio_data = audio_response.content

        # Handle response based on return_type
        if return_type == "base64":
            encoded_audio = base64.b64encode(audio_data).decode("utf-8")
            return GenerateTTSResponse(
                success=True,
                message="Audio generated successfully",
                audio_data=encoded_audio
            )

        # Save to file
        if not output_filename:
            output_filename = f"resemble_tts_{voice_id.split('-')[0]}"

        # Both output_filename and voice_id are caller-supplied. Keep only the
        # last path component (either separator style) so that values such as
        # "../../x" cannot escape OUTPUT_DIR.
        safe_stem = os.path.basename(output_filename.replace("\\", "/"))
        if safe_stem in ("", ".", ".."):
            return GenerateTTSResponse(
                success=False,
                message="Invalid output_filename: it must contain a file name"
            )

        file_path = os.path.join(OUTPUT_DIR, f"{safe_stem}.{AUDIO_FORMAT}")
        with open(file_path, "wb") as f:
            f.write(audio_data)
        return GenerateTTSResponse(
            success=True,
            message="Audio saved to file successfully",
            file_path=file_path
        )
    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        logger.error(f"[Error] generate_tts failed: {str(e)}")
        return GenerateTTSResponse(
            success=False,
            message=f"Error generating audio: {str(e)}"
        )
# Script entry point: nothing below runs on import. The tools are registered
# and the MCP server is started only when this file is executed directly.
if __name__ == "__main__":
    # Imported here so that importing this module to reuse the tool functions
    # does not require the MCP SDK.
    # NOTE(review): the previous code called `Server()`, `server.tool(fn)` and
    # `server.start()`. The MCP Python SDK's `mcp.server.Server` requires a
    # name and offers neither `tool(fn)` nor `start()`; FastMCP is the SDK's
    # documented interface for registering plain functions as tools. Confirm
    # against the installed `mcp` version.
    from mcp.server.fastmcp import FastMCP

    # Create MCP Server
    server = FastMCP("resemble-ai")

    # Register tools. `tool()` returns a decorator, applied here by hand.
    server.tool()(list_voices)
    server.tool()(generate_tts)

    # Start the server (FastMCP.run() uses the stdio transport by default)
    logger.info("[Setup] Starting MCP Server for Resemble AI (SDK Version)")
    server.run()