ElevenLabs MCP Server
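
The server's audio backend is the single ElevenLabsAPI class below: it reads its API key, default voice, model, and voice settings from environment variables, lists voices via the /v1/voices endpoint, and renders multi-part scripts with request stitching before concatenating the resulting segments with pydub. A usage sketch follows the listing.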

import io
import logging
import os
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, TypedDict

import requests
from dotenv import load_dotenv
from pydub import AudioSegment
from tenacity import retry, stop_after_attempt, wait_exponential

load_dotenv()

# Configure logging from the environment, falling back to ERROR on an invalid level.
log_level = os.getenv("ELEVENLABS_LOG_LEVEL", "ERROR").upper()
valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
if log_level not in valid_levels:
    print(f"Invalid log level {log_level}. Using ERROR. Valid levels are: {', '.join(valid_levels)}")
    log_level = "ERROR"
logging.basicConfig(
    level=getattr(logging, log_level),
    format='%(asctime)s - %(levelname)s - %(message)s'
)


class VoiceData(TypedDict):
    voice_id: str
    name: str
    category: str
    labels: Dict[str, str]
    description: str
    preview_url: str
    high_quality_base_model_ids: List[str]


class ElevenLabsAPI:
    # Supported models as a class constant
    MODELS = {
        "eleven_multilingual_v2": {
            "description": "Our most lifelike model with rich emotional expression",
            "languages": "32",
            "supports_stitching": True,
            "supports_style": True,
            "wait_time": 0.1,
        },
        "eleven_flash_v2_5": {
            "description": "Ultra-fast model optimized for real-time use (~75ms†)",
            "languages": "32",
            "supports_stitching": False,
            "supports_style": False,
            "wait_time": 0.1,
        },
        "eleven_flash_v2": {
            "description": "Ultra-fast model optimized for real-time use (~75ms†)",
            "languages": "English",
            "supports_stitching": False,
            "supports_style": False,
            "wait_time": 0.1,
        },
    }

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def get_voices(self) -> List[VoiceData]:
        """Fetch available voices from the ElevenLabs API."""
        headers = {
            "Accept": "application/json",
            "xi-api-key": self.api_key
        }
        response = requests.get(
            f"{self.base_url}/voices",
            headers=headers
        )
        if response.status_code == 200:
            voices_data = response.json()["voices"]
            return [
                {
                    "voice_id": voice["voice_id"],
                    "name": voice["name"],
                    "category": voice.get("category", ""),
                    "labels": voice.get("labels", {}),
                    "description": voice.get("description", ""),
                    "preview_url": voice.get("preview_url", ""),
                    "high_quality_base_model_ids": voice.get("high_quality_base_model_ids", [])
                }
                for voice in voices_data
            ]
        else:
            raise Exception(f"Failed to fetch voices: {response.text}")

    def __init__(self):
        self.api_key = os.getenv("ELEVENLABS_API_KEY")
        if not self.api_key:
            logging.error("ELEVENLABS_API_KEY environment variable not set")
            raise ValueError("ELEVENLABS_API_KEY environment variable not set")
        self.voice_id = os.getenv("ELEVENLABS_VOICE_ID") or "iEw1wkYocsNy7I7pteSN"
        self.model_id = os.getenv("ELEVENLABS_MODEL_ID") or "eleven_multilingual_v2"
        logging.info(f"Initializing ElevenLabsAPI with model_id: {self.model_id}")
        # Validate model_id against the supported models
        if self.model_id not in self.MODELS:
            logging.error(f"Invalid model_id: {self.model_id}. Valid models: {list(self.MODELS.keys())}")
            raise ValueError(f"Invalid model_id: {self.model_id}. Must be one of {list(self.MODELS.keys())}")
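        # Voice-setting defaults: stability, similarity_boost, and style are read
        # from the environment and feed the voice_settings payload built in
        # generate_audio_segment() (style only for models that support it).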
        self.stability = float(os.getenv("ELEVENLABS_STABILITY", "0.5"))
        self.similarity_boost = float(os.getenv("ELEVENLABS_SIMILARITY_BOOST", "0.75"))
        self.style = float(os.getenv("ELEVENLABS_STYLE", "0.1"))
        self.base_url = "https://api.elevenlabs.io/v1"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def generate_audio_segment(self, text: str, voice_id: str,
                               output_file: Optional[str] = None,
                               previous_text: Optional[str] = None,
                               next_text: Optional[str] = None,
                               previous_request_ids: Optional[List[str]] = None,
                               debug_info: Optional[List[str]] = None) -> tuple[bytes, str]:
        """Generate audio using the specified voice with context conditioning."""
        headers = {
            "Accept": "application/json",
            "xi-api-key": self.api_key,
            "Content-Type": "application/json"
        }
        data = {
            "text": text,
            "model_id": self.model_id,
            "voice_settings": {
                "stability": self.stability,
                "similarity_boost": self.similarity_boost
            }
        }
        if self.MODELS[self.model_id]["supports_style"]:
            # style is passed via voice_settings for models that support it
            data["voice_settings"]["style"] = self.style
        # Add context conditioning if the model supports request stitching
        if self.MODELS[self.model_id]["supports_stitching"]:
            if previous_text is not None:
                data["previous_text"] = previous_text
            if next_text is not None:
                data["next_text"] = next_text
            if previous_request_ids:
                data["previous_request_ids"] = previous_request_ids[-3:]  # at most 3 previous IDs
        logging.info(f"Generating audio for text length: {len(text)} chars using voice_id: {voice_id}")
        logging.debug(f"Generation parameters: stability={self.stability}, similarity_boost={self.similarity_boost}, model={self.model_id}")
        try:
            response = requests.post(
                f"{self.base_url}/text-to-speech/{voice_id}",
                json=data,
                headers=headers
            )
            logging.debug(f"API response status: {response.status_code}")
            if response.status_code == 200:
                logging.info("Audio generation successful")
                if output_file:
                    with open(output_file, 'wb') as f:
                        f.write(response.content)
                return response.content, response.headers["request-id"]
            else:
                if debug_info is not None:
                    debug_info.append(response.text)
                error_message = f"Failed to generate audio: {response.text} \n\n{debug_info} \n\n{data}"
                logging.error(f"API error response: {response.status_code}")
                logging.error(f"API error details: {response.text}")
                logging.error(f"Request data: {data}")
                raise Exception(error_message)
        except requests.exceptions.RequestException as e:
            error_message = f"Network error during API call: {str(e)}"
            logging.error(error_message)
            raise Exception(error_message)

    def generate_full_audio(self, script_parts: List[Dict], output_dir: Path) -> tuple[str, List[str], int]:
        """Generate audio for multiple parts using request stitching.
        Returns tuple of (output_file_path, debug_info, completed_parts).
        """
        # Create output directory if it doesn't exist
        output_dir.mkdir(exist_ok=True)

        # Final output file path with a unique, timestamped name
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        output_file = output_dir / f"full_audio_{timestamp}.mp3"

        debug_info = []
        debug_info.append("ElevenLabsAPI - Starting generate_full_audio")
        debug_info.append(f"Input script_parts: {script_parts}")

        # Initialize segment list, request-ID history, and bookkeeping
        segments = []
        previous_request_ids = []
        failed_parts = []
        completed_parts = 0

        # Collect the raw text of every part so each request can receive the
        # surrounding text as context.
        debug_info.append("Processing all_texts")
        all_texts = []
        for part in script_parts:
            debug_info.append(f"Processing text from part: {part}")
            text = str(part.get('text', ''))
            debug_info.append(f"Extracted text: {text}")
            all_texts.append(text)
        debug_info.append(f"Final all_texts: {all_texts}")

        for i, part in enumerate(script_parts):
            debug_info.append(f"Processing part {i}: {part}")
            part_voice_id = part.get('voice_id')
            if not part_voice_id:
                part_voice_id = self.voice_id
            text = str(part.get('text', ''))
            if not text:
                continue
            debug_info.append(f"Using voice ID: {part_voice_id}")

            # Determine previous and next text for context
            is_first = i == 0
            is_last = i == len(script_parts) - 1
            previous_text = None if is_first else " ".join(all_texts[:i])
            next_text = None if is_last else " ".join(all_texts[i + 1:])

            try:
                logging.info(f"Processing part {i + 1}/{len(script_parts)}")
                logging.info(f"Text length: {len(text)} chars")
                logging.debug(f"Context - Previous text: {'Yes' if previous_text else 'No'}, Next text: {'Yes' if next_text else 'No'}")

                # Generate audio with context conditioning
                audio_content, request_id = self.generate_audio_segment(
                    text=text,
                    voice_id=part_voice_id,
                    previous_text=previous_text,
                    next_text=next_text,
                    previous_request_ids=previous_request_ids,
                    debug_info=debug_info
                )
                debug_info.append(f"Successfully generated audio for part {i}")
                completed_parts += 1

                # Add request ID to the stitching history
                previous_request_ids.append(request_id)

                # Convert audio content to an AudioSegment and collect it
                audio_segment = AudioSegment.from_mp3(io.BytesIO(audio_content))
                segments.append(audio_segment)

                # Wait for the model-specific pause between requests
                time.sleep(self.MODELS[self.model_id]["wait_time"])
            except Exception as e:
                debug_info.append(f"Error generating audio: {e}")
                failed_parts.append(part)
                continue

        # Combine all segments
        if segments:
            final_audio = segments[0]
            for segment in segments[1:]:
                final_audio = final_audio + segment

            # Export combined audio
            final_audio.export(output_file, format="mp3")

            if failed_parts:
                debug_info.append(f"Failed parts: {failed_parts}")
            else:
                logging.debug("All parts generated successfully")
                debug_info.append("All parts generated successfully")
            debug_info.append(f"Model: {self.model_id}")
            logging.debug(f"Model: {self.model_id}")
            return str(output_file), debug_info, completed_parts
        else:
            error_msg = "\n".join([
                "No audio segments were generated. Debug info:",
                *debug_info
            ])
            logging.error("No audio segments were generated. Debug info: %s", debug_info)
            raise Exception(error_msg)
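
A minimal usage sketch, assuming ELEVENLABS_API_KEY is set (directly or via a .env file), that ffmpeg is available for pydub's MP3 export, and that the script text and output directory shown are placeholders:

# Usage sketch (illustrative, not part of the server): exercises ElevenLabsAPI.
# Assumes ELEVENLABS_API_KEY is set and ffmpeg is installed for pydub export.
if __name__ == "__main__":
    api = ElevenLabsAPI()

    # List available voices and pick one as an example.
    voices = api.get_voices()
    print(f"{len(voices)} voices available, first: {voices[0]['name']}")

    # Two-part script; parts without a voice_id fall back to api.voice_id.
    script_parts = [
        {"text": "Welcome to the show.", "voice_id": voices[0]["voice_id"]},
        {"text": "Today we are testing request stitching."},
    ]

    output_file, debug_info, completed = api.generate_full_audio(
        script_parts, Path("output")
    )
    print(f"Wrote {output_file} ({completed}/{len(script_parts)} parts)")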