generate_audio_script
Convert text scripts into audio with multiple voices and actors using the ElevenLabs text-to-speech API. Supports plain text or a structured JSON format for voice assignments.
Instructions
Generate audio from a structured script with multiple voices and actors. Accepts either: (1) a plain text string, or (2) a JSON string with the format: { "script": [ { "text": "Text to speak", "voice_id": "optional-voice-id", "actor": "optional-actor-name" }, ... ] }
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| script | Yes | JSON string containing script array or plain text. For JSON format, provide an object with a 'script' array containing objects with 'text' (required), 'voice_id' (optional), and 'actor' (optional) fields. | |
Implementation Reference
- src/elevenlabs_mcp/server.py:414-478 (handler)Handler function for the 'generate_audio_script' tool within the call_tool method. Parses script input, creates and manages job in database, generates audio via API, and returns embedded audio resource.elif name == "generate_audio_script": script_json = arguments.get("script", "{}") script_parts, parse_debug_info = self.parse_script(script_json) debug_info.extend(parse_debug_info) # Create job record job_id = str(uuid.uuid4()) job = AudioJob( id=job_id, status="pending", script_parts=script_parts, total_parts=len(script_parts) ) await self.db.insert_job(job) debug_info.append(f"Created job record: {job_id}") try: job.status = "processing" await self.db.update_job(job) output_file, api_debug_info, completed_parts = self.api.generate_full_audio( script_parts, self.output_dir ) debug_info.extend(api_debug_info) job.status = "completed" job.output_file = str(output_file) job.completed_parts = completed_parts await self.db.update_job(job) except Exception as e: job.status = "failed" job.error = str(e) await self.db.update_job(job) raise # Read the generated audio file and encode it as base64 with open(output_file, 'rb') as f: audio_bytes = f.read() audio_base64 = base64.b64encode(audio_bytes).decode('utf-8') # Generate unique URI for the resource filename = Path(output_file).name resource_uri = f"audio://{filename}" # Return both a status message and the audio file content return [ types.TextContent( type="text", text="\n".join([ "Audio generation successful. Debug info:", *debug_info ]) ), types.EmbeddedResource( type="resource", resource=types.BlobResourceContents( uri=resource_uri, name=filename, blob=audio_base64, mimeType="audio/mpeg" ) ) ]
- src/elevenlabs_mcp/server.py:227-252 (registration)Registration of the 'generate_audio_script' tool in list_tools(), including name, description, and input schema definition.types.Tool( name="generate_audio_script", description="""Generate audio from a structured script with multiple voices and actors. Accepts either: 1. Plain text string 2. JSON string with format: { "script": [ { "text": "Text to speak", "voice_id": "optional-voice-id", "actor": "optional-actor-name" }, ... ] }""", inputSchema={ "type": "object", "properties": { "script": { "type": "string", "description": "JSON string containing script array or plain text. For JSON format, provide an object with a 'script' array containing objects with 'text' (required), 'voice_id' (optional), and 'actor' (optional) fields." } }, "required": ["script"] } ),
# src/elevenlabs_mcp/server.py:63-132 (helper)
# Helper method to parse the script input (JSON or plain text) into
# structured parts with text, voice_id, and actor.
def parse_script(self, script_json: str) -> tuple[list[dict], list[str]]:
    """
    Parse the input into a list of script parts and collect debug information.

    Accepts:
    1. A JSON array of dialogue parts (input starting with ``[``)
    2. A JSON object with a ``script`` array of dialogue parts (input
       starting with ``{``); a missing ``script`` key yields no parts
    3. Plain text (anything else), which becomes a single part

    Each dialogue part should have:
    - text (required): The text to speak
    - voice_id (optional): The voice to use
    - actor (optional): The actor/character name

    Array entries that are not objects are skipped and noted in the debug
    information.

    Args:
        script_json: Input text or JSON string

    Returns:
        tuple containing:
        - list of parsed script parts, each a dict with the keys
          ``text``, ``voice_id`` and ``actor``
        - list of debug information strings

    Raises:
        Exception: If the input looks like JSON but cannot be parsed
            ("Invalid JSON format"), if ``script`` is present but is not an
            array, or if a part has no non-empty string ``text``
            ("Missing required field 'text'").
    """
    debug_info = [f"Raw input: {script_json}"]

    # Remove any leading/trailing whitespace
    script_json = script_json.strip()

    if script_json.startswith(("[", "{")):
        # Input that looks like JSON must be valid JSON; it is never
        # silently downgraded to plain text.
        try:
            parsed = json.loads(script_json)
        except json.JSONDecodeError as e:
            # Chain the decoder error so the position/details survive.
            raise Exception("Invalid JSON format") from e

        if isinstance(parsed, dict):
            # Object with script array
            script_array = parsed.get("script", [])
        else:
            # Direct array of script parts
            script_array = parsed

        # Guard against e.g. {"script": null} or {"script": "text"}, which
        # would otherwise crash with TypeError or be iterated item-by-item
        # (characters / keys) and silently produce no parts.
        if not isinstance(script_array, list):
            raise Exception("'script' must be an array of script parts")
    else:
        # Treat as plain text if not JSON formatted
        debug_info.append("Input is plain text")
        script_array = [{"text": script_json}]

    script_parts = []
    for part in script_array:
        if not isinstance(part, dict):
            debug_info.append(f"Skipping non-dict part: {part}")
            continue

        # A null or non-string "text" counts as missing rather than
        # crashing on .strip().
        raw_text = part.get("text")
        text = raw_text.strip() if isinstance(raw_text, str) else ""
        if not text:
            raise Exception("Missing required field 'text'")

        new_part = {
            "text": text,
            "voice_id": part.get("voice_id"),
            "actor": part.get("actor"),
        }
        debug_info.append(f"Created part: {new_part}")
        script_parts.append(new_part)

    debug_info.append(f"Final script_parts: {script_parts}")
    return script_parts, debug_info