server.py
import os
import re
from datetime import datetime
from enum import Enum
from pathlib import Path

import httpx
import sounddevice as sd
import soundfile as sf
from mcp.server.fastmcp import FastMCP
from pydantic import BaseModel, Field

API_HOST = os.environ.get("TYPECAST_API_HOST", "https://api.typecast.ai")
API_KEY = os.environ.get("TYPECAST_API_KEY")
OUTPUT_DIR = Path(os.environ.get("TYPECAST_OUTPUT_DIR", os.path.expanduser("~/Downloads/typecast_output")))

HTTP_HEADERS = {
    "X-API-KEY": API_KEY
}

app = FastMCP(
    "typecast-api-mcp-server",
    host="0.0.0.0",
    port=8000,
)


class TTSModel(str, Enum):
    SSFM_V21 = "ssfm-v21"


class EmotionEnum(str, Enum):
    NORMAL = "normal"
    SAD = "sad"
    HAPPY = "happy"
    ANGRY = "angry"
    REGRET = "regret"
    URGENT = "urgent"
    WHISPER = "whisper"
    SCREAM = "scream"
    SHOUT = "shout"
    TRUSTFUL = "trustful"
    SOFT = "soft"
    COLD = "cold"
    SARCASM = "sarcasm"
    INSPIRE = "inspire"
    CUTE = "cute"
    CHEER = "cheer"
    CASUAL = "casual"
    TUNELV1 = "tunelv1"
    TUNELV2 = "tunelv2"
    TONEMID = "tonemid"
    TONEUP = "toneup"
    TONEDOWN = "tonedown"


class Prompt(BaseModel):
    emotion_preset: EmotionEnum = Field(default=EmotionEnum.NORMAL, description="Emotion preset type")
    emotion_intensity: float = Field(default=1.0, description="Intensity of the emotion", ge=0.0, le=2.0)


class Output(BaseModel):
    volume: int = Field(default=100, description="Audio volume level", ge=0, le=200)
    audio_pitch: int = Field(default=0, description="Audio pitch adjustment", ge=-12, le=12)
    audio_tempo: float = Field(default=1.0, description="Audio playback speed", ge=0.5, le=2.0)
    audio_format: str = Field(default="wav", pattern="^(wav|mp3)$", description="Audio file format")


class Voice(BaseModel):
    voice_id: str = Field(description="Unique voice identifier")
    voice_name: str = Field(description="Display name of the voice")
    model: TTSModel = Field(description="TTS model type")
    emotions: list[EmotionEnum] = Field(description="List of supported emotions")


class TTSRequest(BaseModel):
    voice_id: str = Field(description="Voice identifier to use")
    text: str = Field(description="Text to convert to speech")
    model: TTSModel = Field(description="TTS model to use")
    language: str | None = Field(default=None, description="Language code based on ISO 639-3")
    prompt: Prompt | None = Field(default_factory=Prompt, description="Prompt configuration for speech generation")
    output: Output | None = Field(default_factory=Output, description="Output audio configuration")
    seed: int | None = Field(default=None, description="Random seed for consistent generation", ge=0, le=2147483647)


@app.tool("get_voices", "Get a list of available voices for text-to-speech")
async def get_voices(model: str = TTSModel.SSFM_V21.value) -> dict:
    """Get a list of available voices for text-to-speech

    Args:
        model: Optional filter for specific TTS models.

    Returns:
        List of available voices.
    """
    model = TTSModel(model)
    async with httpx.AsyncClient() as client:
        response = await client.get(
            f"{API_HOST}/v1/voices?model={model.value}",
            headers=HTTP_HEADERS,
        )
        if response.status_code != 200:
            raise Exception(f"Failed to get voices: {response.status_code}")
        return response.json()


@app.tool("text_to_speech", "Convert text to speech using the specified voice and parameters")
async def text_to_speech(
    voice_id: str,
    text: str,
    model: str,
    emotion_preset: str = EmotionEnum.NORMAL.value,
    emotion_intensity: float = 1.0,
    volume: int = 100,
    audio_pitch: int = 0,
    audio_tempo: float = 1.0,
    audio_format: str = "wav",
) -> str:
    """Convert text to speech using the specified voice and parameters

    Args:
        voice_id: ID of the voice to use
        text: Text to convert to speech
        model: TTS model to use
        emotion_preset: Emotion preset type (default: normal)
        emotion_intensity: Intensity of the emotion, between 0.0 and 2.0 (default: 1.0)
        volume: Audio volume level, between 0 and 200 (default: 100)
        audio_pitch: Audio pitch adjustment, between -12 and 12 (default: 0)
        audio_tempo: Audio playback speed, between 0.5 and 2.0 (default: 1.0)
        audio_format: Audio format, either 'wav' or 'mp3' (default: wav)

    Returns:
        Path to the saved audio file
    """
    if not OUTPUT_DIR.exists():
        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Automatic validation via the Pydantic models defined above
    prompt_model = Prompt(emotion_preset=emotion_preset, emotion_intensity=emotion_intensity)
    output_model = Output(volume=volume, audio_pitch=audio_pitch, audio_tempo=audio_tempo, audio_format=audio_format)
    # TTSModel validation of `model` is handled automatically by Pydantic
    request = TTSRequest(voice_id=voice_id, text=text, model=model, prompt=prompt_model, output=output_model)

    async with httpx.AsyncClient() as client:
        response = await client.post(
            f"{API_HOST}/v1/text-to-speech",
            json=request.model_dump(exclude_none=True),
            headers=HTTP_HEADERS,
        )
        if response.status_code != 200:
            raise Exception(f"Failed to generate speech: {response.status_code}, {response.text}")

        # Name the file after the timestamp, voice, a whitespace-free slug of the
        # text, and the requested audio format
        safe_text = re.sub(r'\s+', '', text[:10])
        output_path = OUTPUT_DIR / f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_{voice_id}_{safe_text}.{audio_format}"
        output_path.write_bytes(response.content)
        return str(output_path)


@app.tool("play_audio", "Play the generated audio file")
async def play_audio(file_path: str) -> str:
    """Play the audio file at the specified path

    Args:
        file_path: Path to the audio file to play

    Returns:
        Status message
    """
    try:
        data, samplerate = sf.read(file_path)

        # Get the current output device
        output_device = sd.default.device[1]  # [input, output]

        # Play on the current output device
        sd.play(data, samplerate, device=output_device)
        sd.wait()

        return f"Successfully played audio file: {file_path}"
    except Exception as e:
        return f"Failed to play audio file: {str(e)}"

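The module registers its tools but includes no entry point. Below is a minimal launch sketch, assuming the SSE transport implied by the host/port passed to the FastMCP constructor; the file layout and transport choice are assumptions, not part of the original source:

# main.py -- hypothetical entry point; assumes server.py sits alongside it
from server import app

if __name__ == "__main__":
    # SSE serves on the host/port configured in the FastMCP constructor;
    # use transport="stdio" when an MCP client spawns the server as a subprocess
    app.run(transport="sse")
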
MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/neosapience/typecast-api-mcp-server-sample'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.