Voice MCP

Overview Schema Related Servers Score Discussions

voice-mcp
src
voice_mcp

tools.py•6.4 kB

"""MCP tool implementations.""" from .audio import AudioRecorder from .transcribe import transcribe from .tts import speak as tts_speak # Words/phrases that indicate "yes" YES_PATTERNS = { "yes", "yeah", "yep", "yup", "sure", "correct", "right", "affirmative", "absolutely", "definitely", "of course", "ok", "okay", "uh huh", "uh-huh", "go ahead", "do it", "proceed", "confirm", "confirmed", "that's right", "yes please", "please do", "sounds good", "go for it", } # Words/phrases that indicate "no" NO_PATTERNS = { "no", "nope", "nah", "negative", "wrong", "incorrect", "don't", "do not", "stop", "cancel", "abort", "wait", "hold on", "not yet", "no thanks", "no thank you", "i don't think so", "that's wrong", "that's not right", } def listen_and_confirm(timeout_seconds: int = 30) -> dict: """ Record and transcribe user speech for confirmation. Args: timeout_seconds: Maximum recording duration Returns: dict with 'transcript' and 'success' keys """ recorder = AudioRecorder() try: audio = recorder.record(timeout_seconds=float(timeout_seconds)) if len(audio) == 0: return { "transcript": "", "success": False, "error": "No audio recorded", } result = transcribe(audio) return { "transcript": result["text"], "language": result["language"], "success": True, } except Exception as e: return { "transcript": "", "success": False, "error": str(e), } def listen_for_yes_no(timeout_seconds: int = 10) -> dict: """ Record and interpret user speech as yes/no response. 
Args: timeout_seconds: Maximum recording duration Returns: dict with 'answer' (yes/no/unclear), 'transcript', and 'success' keys """ recorder = AudioRecorder() try: audio = recorder.record(timeout_seconds=float(timeout_seconds)) if len(audio) == 0: return { "answer": "unclear", "transcript": "", "success": False, "error": "No audio recorded", } result = transcribe(audio) transcript = result["text"].lower().strip() # Check for yes/no patterns answer = "unclear" # Check exact matches and common phrases for pattern in YES_PATTERNS: if pattern in transcript: answer = "yes" break if answer == "unclear": for pattern in NO_PATTERNS: if pattern in transcript: answer = "no" break return { "answer": answer, "transcript": result["text"], "language": result["language"], "success": True, } except Exception as e: return { "answer": "unclear", "transcript": "", "success": False, "error": str(e), } def speak_and_listen(text: str, voice: str = "M1", timeout_seconds: int = 30) -> dict: """ Speak text then listen for a full response. 
Args: text: The text to speak voice: Voice name (default: M1) timeout_seconds: Maximum recording duration Returns: dict with 'spoke', 'transcript', 'language', and 'success' keys """ # First speak speak_result = tts_speak(text, voice) if not speak_result["success"]: return { "spoke": False, "transcript": "", "success": False, "error": speak_result.get("error", "TTS failed"), } # Then listen recorder = AudioRecorder() try: audio = recorder.record(timeout_seconds=float(timeout_seconds)) if len(audio) == 0: return { "spoke": True, "transcript": "", "success": False, "error": "No audio recorded", } result = transcribe(audio) return { "spoke": True, "transcript": result["text"], "language": result["language"], "success": True, } except Exception as e: return { "spoke": True, "transcript": "", "success": False, "error": str(e), } def speak_and_confirm(text: str, voice: str = "M1", timeout_seconds: int = 15) -> dict: """ Speak text then listen for a yes/no response. Args: text: The text to speak voice: Voice name (default: M1) timeout_seconds: Maximum recording duration Returns: dict with 'spoke', 'answer', 'transcript', and 'success' keys """ # First speak speak_result = tts_speak(text, voice) if not speak_result["success"]: return { "spoke": False, "answer": "unclear", "transcript": "", "success": False, "error": speak_result.get("error", "TTS failed"), } # Then listen for yes/no recorder = AudioRecorder() try: audio = recorder.record(timeout_seconds=float(timeout_seconds)) if len(audio) == 0: return { "spoke": True, "answer": "unclear", "transcript": "", "success": False, "error": "No audio recorded", } result = transcribe(audio) transcript = result["text"].lower().strip() # Check for yes/no patterns answer = "unclear" for pattern in YES_PATTERNS: if pattern in transcript: answer = "yes" break if answer == "unclear": for pattern in NO_PATTERNS: if pattern in transcript: answer = "no" break return { "spoke": True, "answer": answer, "transcript": result["text"], 
"language": result["language"], "success": True, } except Exception as e: return { "spoke": True, "answer": "unclear", "transcript": "", "success": False, "error": str(e), }

Latest Blog Posts

What Is Context Bloat in MCP?
By Om-Shree-0709 on December 16, 2025.
mcp
Context Bloat
MCP Moves to the Linux Foundation: Neutral Stewardship for Agentic Infrastructure
By Om-Shree-0709 on December 15, 2025.
mcp
anthropic
Linux Foundation
Code Execution with MCP: Architecting Agentic Efficiency
By Om-Shree-0709 on December 14, 2025.
mcp
Token bloat

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jochiang/voice-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.