# tts_mcp_server.py (5.45 kB)
#!/usr/bin/env python3
"""
TTS MCP Server
A Model Context Protocol server that provides text-to-speech functionality
through OpenAI's TTS API with low-latency streaming audio playback.
"""
import sys
import os
import logging
from dotenv import load_dotenv
from fastmcp import FastMCP
from tts_engine import TTSEngine, TTSEngineError
# Pull settings from a local .env file into the process environment before
# anything below reads LOG_LEVEL or TTS_MODEL.
load_dotenv()

# Resolve the log level by name; an unrecognised LOG_LEVEL falls back to INFO.
_log_level = getattr(logging, os.environ.get('LOG_LEVEL', 'INFO').upper(), logging.INFO)

# Records go to stderr (stdout is left alone so logging does not interfere with
# the MCP protocol) and to a log file, opened relative to the current working
# directory, for debugging.
_log_handlers = [
    logging.StreamHandler(sys.stderr),
    logging.FileHandler('tts_mcp_server.log'),
]
logging.basicConfig(
    level=_log_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)

# The FastMCP application object that the tool below registers itself on.
logger.info("Initializing FastMCP server...")
mcp = FastMCP("TTS Server")

# The shared engine instance is constructed here, at import time.
# NOTE(review): the previous comment said "will be created on first use"; any
# such lazy setup would have to live inside TTSEngine itself - confirm.
logger.info("Initializing TTS engine...")
tts_engine = TTSEngine()

# Model name handed to the engine on every request; overridable via TTS_MODEL.
TTS_MODEL = os.environ.get("TTS_MODEL", "gpt-4o-mini-tts")
logger.info("Using TTS model: %s", TTS_MODEL)
# Voices accepted by the speak tool, in the order shown in the error message.
_VALID_VOICES = (
    "alloy", "ash", "ballad", "coral", "echo", "fable",
    "onyx", "nova", "sage", "shimmer", "verse",
)
# Number of leading characters of the text echoed into the log line.
_LOG_PREVIEW_CHARS = 50


@mcp.tool()
def speak(
    text: str,
    voice: str = "alloy",
    blocking: bool = False,
    instructions: str | None = None,
) -> str:
    """Convert text to speech and play it aloud to the user.

    This tool enables AI agents to communicate with users through voice instead of text,
    providing a more natural and accessible interaction experience. The audio is played
    immediately through the system's default audio output device.

    **When to use:**
    - When the user or system prompt instructs you to use voice to speak in specific situations and tasks
    - To provide important information that benefits from voice delivery
    - To add personality and warmth to AI interactions
    - When emphasizing key points or conclusions

    **Best practices:**
    - Keep text concise and conversational for natural speech
    - Use natural phrasing, punctuation, and contractions.
    - Use punctuation strategically to control pacing, tone, breathing and emphasis
    - Mimic spoken language rhythm, add fillers or discourse markers to simulate natural speech ("Well," / "So," / "You see," / "Actually," / "Right?", etc.)
    - Use instructions to guide the delivery of the text and speak with a specific character - this can be as detailed as you want.

    > TIP: Add commas, ellipses and exclamation marks even if grammatically optional if you want a pause for breath, emphasis or drama.

    Args:
        text: The text to convert to speech and play aloud. Should be clear,
            conversational text without special formatting. Maximum recommended
            length is ~500 characters for optimal user experience.
        voice: Voice personality to use for speech synthesis (matched
            case-insensitively, surrounding whitespace ignored). Each voice has
            distinct characteristics:
            - alloy: Neutral, balanced tone (default)
            - ash: warm, expressive; friendly support vibes.
            - ballad: smooth narrator; long-form storytelling.
            - coral: bright, upbeat; cheerful promos.
            - echo: Clear, professional tone
            - fable: Warm, storytelling tone
            - onyx: Deep, authoritative tone
            - nova: Bright, energetic tone
            - sage: calm, measured; helpful explainer.
            - shimmer: Soft, gentle tone
            - verse: dramatic, theatrical; trailer read.
        blocking: If True, wait for the audio to complete playing before returning.
            If False (default), return immediately after queueing the audio.
        instructions: Optional free-form guidance on how to deliver the text, by
            specifying delivery, character, speed of speech, tone, voice, emotion, etc.

    Returns:
        Status message confirming the text is being played as audio, or error details if
        the operation fails.
    """
    # Everything stays inside the try block so the tool always answers with a
    # string (status or error text) instead of raising to the MCP caller.
    try:
        # Reject missing or whitespace-only text before touching the engine.
        if not text or not text.strip():
            return "Error: Text parameter is required and cannot be empty"

        # Accept harmless variants such as "Alloy" or " nova "; non-string
        # values are left as-is and rejected by the membership test below.
        chosen_voice = voice.strip().lower() if isinstance(voice, str) else voice
        if chosen_voice not in _VALID_VOICES:
            return f"Error: Invalid voice '{voice}'. Must be one of: {list(_VALID_VOICES)}"

        # Only mark the preview with an ellipsis when it was actually cut short.
        preview = text[:_LOG_PREVIEW_CHARS]
        if len(text) > _LOG_PREVIEW_CHARS:
            preview += "..."
        logger.info(
            "Speaking text: '%s' with voice: %s (blocking: %s)",
            preview, chosen_voice, blocking,
        )

        # Execute TTS with the model configured from the environment.
        result = tts_engine.speak(
            text,
            voice=chosen_voice,
            model=TTS_MODEL,
            instructions=instructions,
            blocking=blocking,
        )
        return result["message"]
    except TTSEngineError as e:
        # Failure reported by the engine itself: the message is enough.
        logger.error("TTS engine error: %s", e)
        return f"TTS Error: {e}"
    except Exception as e:
        # Anything else is unexpected; keep the traceback in the log.
        logger.exception("Unexpected error in speak tool: %s", e)
        return f"Unexpected error: {e}"
if __name__ == "__main__":
    # Script entry point: run the MCP server until it stops or fails.
    # Exit status is 1 on any error; a Ctrl-C interruption is logged and the
    # script ends normally.
    try:
        logger.info("Starting TTS MCP server with FastMCP...")
        mcp.run()
    except TTSEngineError as e:
        # NOTE: the module-level TTSEngine() construction runs before this try
        # block, so a failure there is not caught here; this handler only sees
        # a TTSEngineError that propagates out of mcp.run().
        logger.error("TTS Engine initialization failed: %s", e)
        sys.exit(1)
    except KeyboardInterrupt:
        logger.info("Server interrupted by user")
    except Exception as e:
        # Log the full traceback; the message alone rarely identifies the cause.
        logger.exception("Unexpected error: %s", e)
        sys.exit(1)