# resemble_stdio_server.py (11.9 kB)
"""
Resemble AI Voice Generation MCP Server using StdIO transport.
This server integrates with Resemble AI's voice generation API
using simple stdio communication for Claude/Cursor integration.
"""
import os
import json
import base64
import logging
import sys
from typing import Dict, List, Optional, Any, Union
from pathlib import Path
import requests
from dotenv import load_dotenv
from pydantic import BaseModel
# Logging goes to a file rather than stdout: stdout carries the stdio
# protocol messages, so log lines written there would interfere with them.
logging.basicConfig(
    filename='resemble_stdio.log',
    filemode='a',  # append to the existing log file
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='[%(levelname)s] %(asctime)s - %(message)s',
)

# Shared logger used by everything in this module.
logger = logging.getLogger("resemble-stdio")
# Read settings from a .env file (when present) into the process environment.
load_dotenv()

# Resemble AI API settings; OUTPUT_DIR and AUDIO_FORMAT fall back to defaults.
RESEMBLE_API_KEY = os.environ.get("RESEMBLE_API_KEY")
RESEMBLE_API_BASE_URL = "https://app.resemble.ai/api/v2"
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "./output")
AUDIO_FORMAT = os.environ.get("AUDIO_FORMAT", "mp3")

# Make sure the directory that receives generated audio exists.
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)

# The server cannot do anything useful without an API key, so fail at import.
if RESEMBLE_API_KEY in (None, ""):
    logger.error("[Setup] Missing RESEMBLE_API_KEY in environment variables")
    raise ValueError("RESEMBLE_API_KEY environment variable is required")
# Models
class VoiceModel(BaseModel):
    """Describe one voice.

    The field names match the keys of the per-voice dictionaries that
    ``list_voices`` builds.
    """

    # Required fields.
    uuid: str
    name: str
    is_public: bool
    is_cloned: bool

    # Optional free-text description; absent values are represented as None.
    description: Optional[str] = None
class ListVoicesResponse(BaseModel):
    """Shape of the list_voices result: a single ``voices`` list."""

    # One VoiceModel entry per voice.
    voices: List[VoiceModel]
class GenerateTTSResponse(BaseModel):
    """Shape of the generate_tts result.

    ``success`` and ``message`` are always present; ``file_path`` and
    ``base64_audio`` are optional and default to None.
    """

    # Outcome flag and human-readable status text.
    success: bool
    message: str

    # Location of the saved audio file, when one is reported.
    file_path: Optional[str] = None

    # Base64-encoded audio content, when one is reported.
    base64_audio: Optional[str] = None
# Client for Resemble AI API
class ResembleClient:
    """Client for interacting with Resemble AI API.

    Every HTTP call is made with an explicit timeout. ``requests`` applies no
    timeout by default, so without one a stalled connection would block the
    caller (here: the single-threaded stdio message loop) indefinitely.
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = RESEMBLE_API_BASE_URL,
        timeout: float = 60.0,
    ):
        """Initialize the client.

        Args:
            api_key: Resemble AI API token, sent in the Authorization header.
            base_url: Root URL that endpoint paths are appended to.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.api_key = api_key
        self.base_url = base_url
        self.timeout = timeout
        self.headers = {
            "Authorization": f"Token token={api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        logger.info("[Setup] Initializing Resemble AI client")

    def get_voices(self) -> Dict[str, Any]:
        """Get a list of available voices.

        Returns:
            The decoded JSON body of ``GET {base_url}/voices``.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status; the error is logged and re-raised.
        """
        logger.info("[API] Fetching available voices from Resemble AI")
        try:
            response = requests.get(
                f"{self.base_url}/voices",
                headers=self.headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logger.error(f"[Error] Failed to fetch voices: {str(e)}")
            raise

    def generate_audio(self, text: str, voice_uuid: str) -> Dict[str, Any]:
        """Generate audio from text using specified voice.

        Args:
            text: Text to synthesize; sent as the clip ``body``.
            voice_uuid: Identifier of the voice to use.

        Returns:
            The decoded JSON body of ``POST {base_url}/clips``.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status; the error is logged and re-raised.
        """
        logger.info(f"[API] Generating audio for voice {voice_uuid}")
        try:
            payload = {
                # The title is derived from the first 8 characters of the voice id.
                "title": f"generated_{voice_uuid[:8]}",
                "voice_uuid": voice_uuid,
                "body": text,
                "output_format": AUDIO_FORMAT
            }
            response = requests.post(
                f"{self.base_url}/clips",
                headers=self.headers,
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            logger.error(f"[Error] Failed to generate audio: {str(e)}")
            raise

    def download_audio(self, clip_id: str, output_path: str) -> str:
        """Download generated audio clip and save to file.

        Args:
            clip_id: Identifier of the clip to fetch.
            output_path: Destination file; overwritten if it already exists.

        Returns:
            ``output_path``, once the file has been written.

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status; the error is logged and re-raised.
            OSError: If the destination file cannot be written.
        """
        logger.info(f"[API] Downloading audio clip {clip_id}")
        try:
            response = requests.get(
                f"{self.base_url}/clips/{clip_id}/audio",
                headers=self.headers,
                timeout=self.timeout,
            )
            response.raise_for_status()
            # Save to file
            with open(output_path, 'wb') as f:
                f.write(response.content)
            logger.info(f"[Success] Saved audio to {output_path}")
            return output_path
        except requests.RequestException as e:
            logger.error(f"[Error] Failed to download audio: {str(e)}")
            raise
# Module-wide client instance shared by the tool implementations.
resemble_client = ResembleClient(api_key=RESEMBLE_API_KEY)
# Tool implementations
def list_voices() -> Dict[str, Any]:
    """Return the available voices as ``{"voices": [...]}``.

    Each entry holds the keys ``uuid``, ``name``, ``is_public``, ``is_cloned``
    and ``description``; keys missing from the API data fall back to an empty
    string or ``False``. Any failure is logged and re-raised.
    """
    try:
        api_payload = resemble_client.get_voices()
        # NOTE(review): assumes the API response lists voices under a "voices"
        # key; a response without it yields an empty list. Confirm against the API.
        return {
            "voices": [
                {
                    "uuid": entry.get("uuid", ""),
                    "name": entry.get("name", ""),
                    "is_public": entry.get("is_public", False),
                    "is_cloned": entry.get("is_cloned", False),
                    "description": entry.get("description", ""),
                }
                for entry in api_payload.get("voices", [])
            ]
        }
    except Exception as e:
        logger.error(f"[Error] Failed to list voices: {str(e)}")
        raise
def generate_tts(text: str, voice_id: str, return_type: str = "file", output_filename: Optional[str] = None) -> Dict[str, Any]:
    """Generate voice audio from text.

    The audio is always saved under ``OUTPUT_DIR``; ``return_type`` only
    selects what the result dictionary reports.

    Args:
        text: Text to convert to speech.
        voice_id: ID of the voice to use.
        return_type: ``"base64"`` to return the encoded audio; any other value
            returns the path of the saved file.
        output_filename: File name for the output, without extension and
            without directory components. Defaults to a name derived from
            the first 8 characters of ``voice_id``.

    Returns:
        A dictionary with ``success`` and ``message`` plus either
        ``base64_audio`` or ``file_path``.

    Raises:
        ValueError: If ``text`` or ``voice_id`` is empty, if
            ``output_filename`` is not a plain file name, or if the API
            response carries no clip id.
        Exception: Anything raised by the Resemble client or by file I/O is
            logged and re-raised.
    """
    try:
        # Validate inputs
        if not text:
            raise ValueError("Text is required")
        if not voice_id:
            raise ValueError("Voice ID is required")
        # The file name comes from the tool caller. Reject anything that could
        # escape OUTPUT_DIR (separators, absolute paths, "." / "..") and do it
        # before the API call so no clip is generated for a doomed request.
        if output_filename and (
            os.path.basename(output_filename) != output_filename
            or output_filename in (".", "..")
        ):
            raise ValueError(
                "output_filename must be a plain file name without directory components"
            )

        # Generate audio
        result = resemble_client.generate_audio(text, voice_id)
        # NOTE(review): assumes the clip identifier is returned under a
        # top-level "id" key - confirm against the API response format.
        clip_id = result.get("id")
        if not clip_id:
            raise ValueError("Failed to generate audio clip")

        # Set output filename
        if not output_filename:
            output_filename = f"generated_{voice_id[:8]}"

        # Set file path
        file_path = os.path.join(OUTPUT_DIR, f"{output_filename}.{AUDIO_FORMAT}")

        # Download audio
        resemble_client.download_audio(clip_id, file_path)

        # Return appropriate response based on return_type
        if return_type == "base64":
            with open(file_path, "rb") as audio_file:
                encoded_audio = base64.b64encode(audio_file.read()).decode("utf-8")
            return {
                "success": True,
                "message": "Successfully generated audio",
                "base64_audio": encoded_audio
            }
        return {
            "success": True,
            "message": "Successfully generated audio",
            "file_path": file_path
        }
    except Exception as e:
        logger.error(f"[Error] Failed to generate TTS: {str(e)}")
        raise
# Define tool schemas for registration
TOOL_SCHEMAS = [
    # list_voices: declares a single dummy string parameter.
    {
        "name": "list_voices",
        "description": "List available voice models from Resemble AI.",
        "parameters": {
            "type": "object",
            "properties": {
                "random_string": {
                    "type": "string",
                    "description": "Dummy parameter for no-parameter tools",
                },
            },
            "required": ["random_string"],
        },
    },
    # generate_tts: text and voice_id are required; the rest have defaults.
    {
        "name": "generate_tts",
        "description": (
            "Generate voice audio from text.\n \n Args:\n"
            " text: Text to convert to speech\n"
            " voice_id: ID of the voice to use\n"
            " return_type: How to return the audio: 'file' or 'base64'\n"
            " output_filename: Filename for the output (without extension)\n "
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "text": {"type": "string", "title": "Text"},
                "voice_id": {"type": "string", "title": "Voice Id"},
                "return_type": {
                    "type": "string",
                    "title": "Return Type",
                    "default": "file",
                },
                "output_filename": {
                    "anyOf": [{"type": "string"}, {"type": "null"}],
                    "title": "Output Filename",
                    "default": None,
                },
            },
            "required": ["text", "voice_id"],
            "title": "generate_ttsArguments",
        },
    },
]
def read_message():
    """Read the next message from stdin.

    One JSON document is expected per line. Blank lines are skipped, and
    lines that are not valid JSON are logged and skipped, so that a single
    bad line is not mistaken for the end of input.

    Returns:
        The decoded JSON value of the next valid line, or ``None`` once stdin
        reaches end of input.
    """
    while True:
        line = sys.stdin.readline()
        if not line:
            # readline() returns an empty string only at end of input.
            return None
        if not line.strip():
            # Whitespace-only line: nothing to decode, keep reading.
            continue
        try:
            return json.loads(line)
        except json.JSONDecodeError as e:
            # Report the bad line and move on to the next one.
            logger.error(f"[Error] Failed to parse message: {str(e)}")
def write_message(message_type, **kwargs):
    """Emit one JSON message on stdout, newline-terminated and flushed.

    The message is an object whose ``type`` entry is ``message_type``,
    followed by the given keyword arguments.
    """
    payload = dict(type=message_type)
    payload.update(kwargs)
    out = sys.stdout
    out.write(f"{json.dumps(payload)}\n")
    # Flush right away so the message is not left sitting in the buffer.
    out.flush()
def start_server():
    """Run the stdio message loop until stdin yields no more messages.

    A ``register`` message listing the tool schemas is written first. After
    that each incoming message is handled by type: ``invoke`` runs the named
    tool and answers with ``tool_result`` or ``tool_error``, ``ping`` is
    answered with ``pong``, and anything else is logged as a warning.
    """
    logger.info("Starting Resemble AI MCP Server with StdIO transport")

    # Announce the available tools before reading any input.
    write_message("register", tools=TOOL_SCHEMAS)

    # iter() with a sentinel calls read_message() until it returns None.
    for message in iter(read_message, None):
        logger.info(f"[Message] Received: {json.dumps(message)[:100]}...")
        try:
            kind = message.get("type", "")
            if kind == "ping":
                write_message("pong")
                continue
            if kind != "invoke":
                logger.warning(f"[Warning] Unknown message type: {kind}")
                continue

            tool = message.get("name", "")
            arguments = message.get("parameters", {})
            call_id = message.get("invoke_id", "")
            logger.info(f"[Invoke] Tool: {tool}, ID: {call_id}")
            try:
                if tool == "list_voices":
                    outcome = list_voices()
                elif tool == "generate_tts":
                    outcome = generate_tts(
                        text=arguments.get("text", ""),
                        voice_id=arguments.get("voice_id", ""),
                        return_type=arguments.get("return_type", "file"),
                        output_filename=arguments.get("output_filename"),
                    )
                else:
                    logger.error(f"[Error] Unknown tool: {tool}")
                    write_message("tool_error", invoke_id=call_id, error=f"Unknown tool: {tool}")
                    continue
                write_message("tool_result", invoke_id=call_id, result=outcome)
            except Exception as exc:
                # Tool failures are reported back to the client, not raised.
                logger.error(f"[Error] Failed to execute tool: {str(exc)}")
                write_message("tool_error", invoke_id=call_id, error=str(exc))
        except Exception as exc:
            # A message that cannot be handled must not stop the loop.
            logger.error(f"[Error] Failed to handle message: {str(exc)}")
            logger.exception(exc)

    logger.info("End of input, shutting down")
# Script entry point: run the server; on an unhandled error, log it with its
# traceback and exit with status 1.
if __name__ == "__main__":
    try:
        start_server()
    except Exception as crash:
        logger.error(f"[Error] Server crashed: {str(crash)}")
        logger.exception(crash)
        sys.exit(1)