
ElevenLabs MCP Server

generate_audio_simple

Convert plain text into audio using ElevenLabs text-to-speech with default voice settings. Specify text and optionally choose a voice ID.

Instructions

Generate audio from plain text using default voice settings

Input Schema

Name        Required   Description                                  Default
text        Yes        Plain text to convert to audio
voice_id    No         Optional voice ID to use for generation
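
For illustration, here is a minimal sketch of invoking this tool from a Python MCP client over stdio. The launch command, server name, and sample text are assumptions; adjust them to however the server is installed locally.

    import asyncio

    from mcp import ClientSession, StdioServerParameters
    from mcp.client.stdio import stdio_client

    # Launch command is an assumption; use whatever starts your local elevenlabs-mcp-server.
    server_params = StdioServerParameters(command="uv", args=["run", "elevenlabs-mcp-server"])

    async def main() -> None:
        async with stdio_client(server_params) as (read, write):
            async with ClientSession(read, write) as session:
                await session.initialize()
                # Only "text" is required; omit "voice_id" to use the server's default voice.
                result = await session.call_tool(
                    "generate_audio_simple",
                    {"text": "Hello from the ElevenLabs MCP server."},
                )
                print(result.content)

    asyncio.run(main())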

Implementation Reference

  • Registration of the generate_audio_simple tool with its input schema and description in the list_tools handler.
    types.Tool(
        name="generate_audio_simple",
        description="Generate audio from plain text using default voice settings",
        inputSchema={
            "type": "object",
            "properties": {
                "text": {
                    "type": "string",
                    "description": "Plain text to convert to audio"
                },
                "voice_id": {
                    "type": "string",
                    "description": "Optional voice ID to use for generation"
                }
            },
            "required": ["text"]
        }
    ),
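    In the MCP Python SDK, a tool definition like the one above is typically returned from a decorated list_tools handler. A minimal sketch, assuming the low-level Server API; the server name and the generate_audio_simple_tool variable are placeholders, not the project's actual code:

    import mcp.types as types
    from mcp.server import Server

    server = Server("elevenlabs-mcp-server")  # server name is an assumption

    @server.list_tools()
    async def handle_list_tools() -> list[types.Tool]:
        # generate_audio_simple_tool stands in for the types.Tool(...) definition above,
        # returned alongside the server's other tools.
        return [generate_audio_simple_tool]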
  • Main handler logic for the generate_audio_simple tool: it validates the input, creates and updates a job record in the database, calls the ElevenLabsAPI to generate the audio, and returns a text status message plus the generated file as an embedded base64 audio resource.
    if name == "generate_audio_simple":
        debug_info.append(f"Processing simple audio request")
        debug_info.append(f"Arguments: {arguments}")
        
        text = arguments.get("text", "").strip()
        voice_id = arguments.get("voice_id")
        
        if not text:
            raise ValueError("Text cannot be empty")
        
        script_parts = [{
            "text": text,
            "voice_id": voice_id
        }]
        
        debug_info.append(f"Created script parts: {script_parts}")
        
        # Create job record
        job_id = str(uuid.uuid4())
        job = AudioJob(
            id=job_id,
            status="pending",
            script_parts=script_parts,
            total_parts=1
        )
        await self.db.insert_job(job)
        debug_info.append(f"Created job record: {job_id}")
    
        try:
            job.status = "processing"
            await self.db.update_job(job)
    
            # # Send progress notification
            # if hasattr(self.server, 'session'):
            #     await self.server.session.send_notification({
            #         "method": "notifications/progress",
            #         "params": {
            #             "progressToken": str(job.id),
            #             "progress": {
            #                 "kind": "begin",
            #                 "message": "Starting audio generation"
            #             }
            #         }
            #     })
    
            output_file, api_debug_info, completed_parts = self.api.generate_full_audio(
                script_parts,
                self.output_dir
            )
            debug_info.extend(api_debug_info)
    
            job.status = "completed"
            job.output_file = str(output_file)
            job.completed_parts = completed_parts
            await self.db.update_job(job)
    
            # # Send completion notification
            # if hasattr(self.server, 'session'):
            #     await self.server.session.send_notification({
            #         "method": "notifications/progress",
            #         "params": {
            #             "progressToken": str(job.id),
            #             "progress": {
            #                 "kind": "end",
            #                 "message": "Audio generation completed"
            #             }
            #         }
            #     })
        except Exception as e:
            job.status = "failed"
            job.error = str(e)
            await self.db.update_job(job)
            raise
        
        # Read the generated audio file and encode it as base64
        with open(output_file, 'rb') as f:
            audio_bytes = f.read()
            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
            
        # Generate unique URI for the resource
        filename = Path(output_file).name
        resource_uri = f"audio://{filename}"
            
        # Return both a status message and the audio file content
        return [
            types.TextContent(
                type="text",
                text="\n".join([
                    "Audio generation successful. Debug info:",
                    *debug_info
                ])
            ),
            types.EmbeddedResource(
                type="resource",
                resource=types.BlobResourceContents(
                    uri=resource_uri,
                    name=filename,
                    blob=audio_base64,
                    mimeType="audio/mpeg"
                )
            )
        ]
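    On the client side, the embedded resource can be pulled out of the call_tool result and written back to an MP3 file. A minimal sketch, assuming the tool was invoked as in the earlier client example; save_embedded_audio is an illustrative helper name:

    import base64
    from pathlib import Path

    import mcp.types as types

    def save_embedded_audio(result: types.CallToolResult, out_dir: Path) -> Path | None:
        """Write the first embedded audio resource in a call_tool result to disk."""
        for item in result.content:
            if isinstance(item, types.EmbeddedResource) and isinstance(item.resource, types.BlobResourceContents):
                # The handler uses URIs of the form audio://<filename>, so recover the filename from the URI.
                filename = str(item.resource.uri).split("://", 1)[-1] or "output.mp3"
                out_path = out_dir / filename
                out_path.write_bytes(base64.b64decode(item.resource.blob))
                return out_path
        return None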
  • Key helper function called by the tool handler: it generates an audio segment for each script part via the ElevenLabs TTS API with request stitching, combines the segments with pydub, and saves the full MP3 file.
    def generate_full_audio(self, script_parts: List[Dict], output_dir: Path) -> tuple[str, List[str], int]:
        """Generate audio for multiple parts using request stitching. Returns tuple of (output_file_path, debug_info, completed_parts)"""
        # Create output directory if it doesn't exist
        output_dir.mkdir(exist_ok=True)
        
        # Final output file path with unique file name
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        output_file = output_dir / f"full_audio_{timestamp}.mp3"
        
        debug_info = []
        debug_info.append("ElevenLabsAPI - Starting generate_full_audio")
        debug_info.append(f"Input script_parts: {script_parts}")
        
        # Initialize segments list and request IDs tracking
        segments = []
        previous_request_ids = []
        failed_parts = []
        completed_parts = 0
        
        debug_info.append("Processing all_texts")
        all_texts = []
        for part in script_parts:
            debug_info.append(f"Processing text from part: {part}")
            text = str(part.get('text', ''))
            debug_info.append(f"Extracted text: {text}")
            all_texts.append(text)
        debug_info.append(f"Final all_texts: {all_texts}")
        
        for i, part in enumerate(script_parts):
            debug_info.append(f"Processing part {i}: {part}")
            part_voice_id = part.get('voice_id')
            if not part_voice_id:
                part_voice_id = self.voice_id
            text = str(part.get('text', ''))
            if not text:
                continue
                
            debug_info.append(f"Using voice ID: {part_voice_id}")
            
            # Determine previous and next text for context
            is_first = i == 0
            is_last = i == len(script_parts) - 1
            
            previous_text = None if is_first else " ".join(all_texts[:i])
            next_text = None if is_last else " ".join(all_texts[i + 1:])
            
            try:
                logging.info(f"Processing part {i+1}/{len(script_parts)}")
                logging.info(f"Text length: {len(text)} chars")
                logging.debug(f"Context - Previous text: {'Yes' if previous_text else 'No'}, Next text: {'Yes' if next_text else 'No'}")
                
                # Generate audio with context conditioning
                audio_content, request_id = self.generate_audio_segment(
                    text=text,
                    voice_id=part_voice_id,
                    previous_text=previous_text,
                    next_text=next_text,
                    previous_request_ids=previous_request_ids,
                    debug_info=debug_info
                )
                
                debug_info.append(f"Successfully generated audio for part {i}")
                completed_parts += 1
                
                # Add request ID to history
                previous_request_ids.append(request_id)
                
                # Convert audio content to AudioSegment and add to segments
                audio_segment = AudioSegment.from_mp3(io.BytesIO(audio_content))
                segments.append(audio_segment)
    
                # Wait for the specified wait_time
                time.sleep(self.MODELS[self.model_id]["wait_time"])
            except Exception as e:
                debug_info.append(f"Error generating audio: {e}")
                failed_parts.append(part)
                continue
        
        # Combine all segments
        if segments:
            final_audio = segments[0]
            for segment in segments[1:]:
                final_audio = final_audio + segment
            
            # Export combined audio
            final_audio.export(output_file, format="mp3")
    
            if failed_parts:
                debug_info.append(f"Failed parts: {failed_parts}")
            else:
                logging.debug("All parts generated successfully")
                debug_info.append("All parts generated successfully")
            
            debug_info.append(f"Model: {self.model_id}")
            logging.debug(f"Model: {self.model_id}")
            
            return str(output_file), debug_info, completed_parts
        else:
            error_msg = "\n".join([
                "No audio segments were generated. Debug info:",
                *debug_info
            ])
            logging.error("No audio segments were generated. Debug info: %s", debug_info)
            raise Exception(error_msg)
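    For context, a rough sketch of what the generate_audio_segment call used above might look like when issued against the ElevenLabs HTTP API directly. This is an assumption for illustration: the project may use the official elevenlabs SDK instead, and the field names (previous_text, next_text, previous_request_ids) and the request-id response header follow ElevenLabs' request-stitching documentation rather than this repository's code.

    import requests

    def generate_audio_segment_sketch(
        api_key: str,
        text: str,
        voice_id: str,
        model_id: str,
        previous_text: str | None,
        next_text: str | None,
        previous_request_ids: list[str],
    ) -> tuple[bytes, str]:
        """Illustrative only: one TTS request with request-stitching context."""
        body = {"text": text, "model_id": model_id}
        if previous_text:
            body["previous_text"] = previous_text
        if next_text:
            body["next_text"] = next_text
        if previous_request_ids:
            # ElevenLabs caps how many prior request IDs are honoured (assumed 3 here).
            body["previous_request_ids"] = previous_request_ids[-3:]

        response = requests.post(
            f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
            headers={"xi-api-key": api_key},
            json=body,
            timeout=120,
        )
        response.raise_for_status()
        # The ID needed to stitch the next part is returned in a response header.
        return response.content, response.headers.get("request-id", "")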
