Voice MCP

Overview Schema Related Servers Score Discussions

voice-mcp
src
voice_mcp

server.py•10.4 KiB

"""MCP server for voice tools (speech-to-text and text-to-speech).""" import asyncio from mcp.server import Server from mcp.server.stdio import stdio_server from mcp.types import Tool, TextContent from .tools import listen_and_confirm, listen_for_yes_no, speak_and_listen, speak_and_confirm from .tts import speak # Create MCP server server = Server("voice-mcp") @server.list_tools() async def list_tools() -> list[Tool]: """List available voice tools.""" return [ Tool( name="listen_and_confirm", description=( "Record audio from the user's microphone and transcribe it using Whisper. " "Use this when you need to hear the user explain something verbally. " "Recording stops automatically after silence is detected. " "After calling this, repeat the transcript back to the user so they can confirm or correct it." ), inputSchema={ "type": "object", "properties": { "timeout_seconds": { "type": "integer", "description": "Maximum recording duration in seconds", "default": 30, }, "silence_seconds": { "type": "number", "description": "Seconds of silence before auto-stop (min 2.0, default 2.5). Increase if user needs more pause time.", "default": 2.5, }, }, }, ), Tool( name="listen_for_yes_no", description=( "Record a short audio response and interpret it as yes or no. " "Use this for quick confirmations or binary decisions. " "Returns 'yes', 'no', or 'unclear' based on what the user says." ), inputSchema={ "type": "object", "properties": { "timeout_seconds": { "type": "integer", "description": "Maximum recording duration in seconds", "default": 10, }, "silence_seconds": { "type": "number", "description": "Seconds of silence before auto-stop (min 2.0, default 2.5)", "default": 2.5, }, }, }, ), Tool( name="speak", description=( "Speak text aloud to the user using text-to-speech. " "Use this to verbally communicate with the user instead of just displaying text. " "Good for announcing results, reading content, or having a voice conversation. 
" "Tips: Keep text concise, describe code/URLs instead of reading verbatim, " "summarize rather than recite syntax." ), inputSchema={ "type": "object", "properties": { "text": { "type": "string", "description": "The text to speak aloud", }, "voice": { "type": "string", "description": "Voice to use (default: M1)", "default": "M1", }, }, "required": ["text"], }, ), Tool( name="speak_and_listen", description=( "Speak text aloud then immediately listen for a full response. " "Combines speak and listen_and_confirm in one call to reduce round trips. " "Use this for conversational exchanges where you ask a question and wait for an answer. " "Tips: Keep text concise, describe code/URLs instead of reading verbatim." ), inputSchema={ "type": "object", "properties": { "text": { "type": "string", "description": "The text to speak aloud", }, "voice": { "type": "string", "description": "Voice to use (default: M1)", "default": "M1", }, "timeout_seconds": { "type": "integer", "description": "Maximum recording duration in seconds", "default": 30, }, "silence_seconds": { "type": "number", "description": "Seconds of silence before auto-stop (min 2.0, default 2.5). Increase if user needs more pause time.", "default": 2.5, }, }, "required": ["text"], }, ), Tool( name="speak_and_confirm", description=( "Speak text aloud then immediately listen for a yes/no response. " "Combines speak and listen_for_yes_no in one call to reduce round trips. " "Use this for confirmations like 'Should I proceed?' or 'Is that correct?' " "Tips: Keep questions short and clear." 
), inputSchema={ "type": "object", "properties": { "text": { "type": "string", "description": "The text to speak aloud", }, "voice": { "type": "string", "description": "Voice to use (default: M1)", "default": "M1", }, "timeout_seconds": { "type": "integer", "description": "Maximum recording duration in seconds", "default": 15, }, "silence_seconds": { "type": "number", "description": "Seconds of silence before auto-stop (min 2.0, default 2.5)", "default": 2.5, }, }, "required": ["text"], }, ), ] @server.call_tool() async def call_tool(name: str, arguments: dict) -> list[TextContent]: """Handle tool calls.""" if name == "listen_and_confirm": timeout = arguments.get("timeout_seconds", 30) silence = arguments.get("silence_seconds", 2.5) # Run in thread pool since audio recording is blocking result = await asyncio.get_event_loop().run_in_executor( None, lambda: listen_and_confirm(timeout, silence) ) if result["success"]: return [TextContent( type="text", text=f"Transcript: {result['transcript']}\nLanguage: {result['language']}" )] else: return [TextContent( type="text", text=f"Error: {result.get('error', 'Unknown error')}" )] elif name == "listen_for_yes_no": timeout = arguments.get("timeout_seconds", 10) silence = arguments.get("silence_seconds", 2.5) result = await asyncio.get_event_loop().run_in_executor( None, lambda: listen_for_yes_no(timeout, silence) ) if result["success"]: return [TextContent( type="text", text=f"Answer: {result['answer']}\nTranscript: {result['transcript']}" )] else: return [TextContent( type="text", text=f"Answer: unclear\nError: {result.get('error', 'Unknown error')}" )] elif name == "speak": text = arguments.get("text", "") voice = arguments.get("voice", "M1") result = await asyncio.get_event_loop().run_in_executor( None, lambda: speak(text, voice) ) if result["success"]: return [TextContent( type="text", text=f"Spoke: {text[:100]}{'...' 
if len(text) > 100 else ''}\nDuration: {result['duration']:.2f}s" )] else: return [TextContent( type="text", text=f"Error: {result.get('error', 'Unknown error')}" )] elif name == "speak_and_listen": text = arguments.get("text", "") voice = arguments.get("voice", "M1") timeout = arguments.get("timeout_seconds", 30) silence = arguments.get("silence_seconds", 2.5) result = await asyncio.get_event_loop().run_in_executor( None, lambda: speak_and_listen(text, voice, timeout, silence) ) if result["success"]: return [TextContent( type="text", text=f"Spoke: {text[:100]}{'...' if len(text) > 100 else ''}\nTranscript: {result['transcript']}\nLanguage: {result['language']}" )] else: return [TextContent( type="text", text=f"Spoke: {result['spoke']}\nError: {result.get('error', 'Unknown error')}" )] elif name == "speak_and_confirm": text = arguments.get("text", "") voice = arguments.get("voice", "M1") timeout = arguments.get("timeout_seconds", 15) silence = arguments.get("silence_seconds", 2.5) result = await asyncio.get_event_loop().run_in_executor( None, lambda: speak_and_confirm(text, voice, timeout, silence) ) if result["success"]: return [TextContent( type="text", text=f"Spoke: {text[:100]}{'...' if len(text) > 100 else ''}\nAnswer: {result['answer']}\nTranscript: {result['transcript']}" )] else: return [TextContent( type="text", text=f"Spoke: {result['spoke']}\nAnswer: unclear\nError: {result.get('error', 'Unknown error')}" )] else: return [TextContent(type="text", text=f"Unknown tool: {name}")] async def run_server(): """Run the MCP server.""" async with stdio_server() as (read_stream, write_stream): await server.run(read_stream, write_stream, server.create_initialization_options()) def main(): """Entry point.""" asyncio.run(run_server()) if __name__ == "__main__": main()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jochiang/voice-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

server.py•10.4 KiB