#!/usr/bin/env python3
"""
MCP Server for LocalVoiceMode
-----------------------------
Integrates voice mode with any AI assistant - runs invisibly in background.
Tools:
- speak: Speak text aloud (TTS)
- listen: Listen for speech (STT)
- converse: Speak and listen for response
- start_voice: Start voice chat (runs headlessly)
- stop_voice: Stop voice chat
- voice_status: Check if voice mode is running
- list_voices: List available character skills
"""
import os
import sys
import json
import signal
import subprocess
import re
from pathlib import Path
from typing import Optional
# Ensure we're in the right directory
SCRIPT_DIR = Path(__file__).parent.absolute()
os.chdir(SCRIPT_DIR)
sys.path.insert(0, str(SCRIPT_DIR))
from mcp.server.fastmcp import FastMCP
# Initialize the MCP server. The `instructions` text is surfaced to any
# connected AI assistant so it knows which tools exist and how to drive
# voice mode; it is runtime data consumed by clients, not a comment.
mcp = FastMCP(
    "localvoicemode",
    instructions="""LocalVoiceMode - hands-free voice chat integration.
Commands:
- speak: Speak text aloud (TTS)
- listen: Listen for speech (STT)
- converse: Speak and listen for response
- start_voice: Begin voice conversation (runs in background)
- stop_voice: End voice conversation
- voice_status: Check if voice is active
- list_voices: See available characters
- provider_status: Show available LLM providers
Supports auto-detection of:
- LM Studio (local, ports 1234/1235/1236/8080)
- OpenRouter (if OPENROUTER_API_KEY is set)
- OpenAI (if OPENAI_API_KEY is set)
Voice commands while running:
- Say "stop" or "goodbye" to end
- Say "change voice" to switch characters
""",
)
# Handle to the background voice-chat subprocess launched by start_voice();
# None when no session is active. Read/reset by stop_voice() and voice_status().
_voice_process: Optional[subprocess.Popen] = None
def _get_api_settings() -> tuple[str, Optional[str], Optional[str]]:
    """Auto-detect an LLM endpoint and return ``(api_url, api_key, model)``.

    Detection order:
      1. LM Studio / any OpenAI-compatible local server, probed on common
         localhost ports (no API key required).
      2. OpenRouter, if OPENROUTER_API_KEY is set.
      3. OpenAI, if OPENAI_API_KEY is set.
      4. Fallback: default LM Studio URL with no key and no model.

    Either of ``api_key`` / ``model`` may be None when not required/unknown.
    """
    import httpx

    # First: probe local servers. Short timeout keeps startup snappy when
    # nothing is listening.
    for port in [1234, 1235, 1236, 8080, 5000]:
        try:
            resp = httpx.get(f"http://localhost:{port}/v1/models", timeout=0.5)
            if resp.status_code == 200:
                api_url = f"http://localhost:{port}/v1"
                data = resp.json()
                model = None
                # Use the first advertised model id, if the server lists any.
                if data.get("data"):
                    model = data["data"][0].get("id")
                return api_url, None, model
        except (httpx.HTTPError, ValueError):
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Catch only transport errors
            # (httpx.HTTPError covers connect/timeout) and malformed JSON
            # (json.JSONDecodeError is a ValueError subclass).
            continue
    # Second: OpenRouter (hosted, key-based).
    openrouter_key = os.environ.get("OPENROUTER_API_KEY")
    if openrouter_key:
        return "https://openrouter.ai/api/v1", openrouter_key, "deepseek/deepseek-v3.2"
    # Third: OpenAI (hosted, key-based).
    openai_key = os.environ.get("OPENAI_API_KEY")
    if openai_key:
        return "https://api.openai.com/v1", openai_key, "gpt-4o"
    # Last resort: assume a local server will appear on the default port.
    return "http://localhost:1234/v1", None, None
@mcp.tool()
def list_voices() -> str:
    """List available voice characters/skills."""
    try:
        from voice_client import SkillLoader, Config
        config = Config()
        loader = SkillLoader(config.skills_dir, config.voice_refs_dir)
        entries = loader.list_skills()
        if not entries:
            return "No voice characters found."
        # One bullet per character: bold name, id in backticks, then a
        # truncated description line.
        body = "".join(
            f"• **{entry['name']}** (`{entry['id']}`)\n"
            f" {entry['description'][:80]}\n\n"
            for entry in entries
        )
        return "Available voice characters:\n\n" + body
    except Exception as e:
        return f"Error: {e}"
@mcp.tool()
def start_voice(skill: str = "assistant") -> str:
    """Start voice chat mode.
    Runs invisibly in the background. Speak naturally and get voice responses.
    """
    global _voice_process
    # Guard: a live (not-yet-exited) child means a session is already active.
    if _voice_process is not None and _voice_process.poll() is None:
        return "Voice mode is already running. Say 'stop' to end, or use stop_voice()."
    api_url, api_key, model = _get_api_settings()
    # Locate the venv interpreter: Windows layout first, then POSIX.
    interpreter = SCRIPT_DIR / ".venv" / "Scripts" / "python.exe"
    if not interpreter.exists():
        interpreter = SCRIPT_DIR / ".venv" / "bin" / "python"
    if not interpreter.exists():
        return "Error: Virtual environment not found. Run setup first."
    command = [
        str(interpreter),
        str(SCRIPT_DIR / "voice_client.py"),
        "--headless",
        "--skill",
        skill,
        "--api-url",
        api_url,
    ]
    if api_key:
        command += ["--api-key", api_key]
    if model:
        command += ["--model", model]
    try:
        # On Windows, suppress any console window for the child process so
        # it truly runs invisibly.
        startupinfo = None
        creationflags = 0
        if sys.platform == "win32":
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            startupinfo.wShowWindow = subprocess.SW_HIDE
            creationflags = subprocess.CREATE_NO_WINDOW
        _voice_process = subprocess.Popen(
            command,
            cwd=str(SCRIPT_DIR),
            startupinfo=startupinfo,
            creationflags=creationflags,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception as e:
        return f"Failed to start voice mode: {e}"
    return f"Voice mode started with **{skill}**."
@mcp.tool()
def stop_voice() -> str:
    """Stop voice chat mode.

    Signals the background voice process to shut down — SIGINT on POSIX so
    the client can clean up gracefully, terminate() on Windows where SIGINT
    cannot be delivered to a detached child — then reaps it.
    """
    global _voice_process
    if _voice_process is None:
        return "Voice mode is not running."
    try:
        if sys.platform == "win32":
            _voice_process.terminate()
        else:
            _voice_process.send_signal(signal.SIGINT)
        # Fix: previously the handle was dropped without wait(), leaving a
        # zombie process on POSIX. Reap it, escalating to kill() if the
        # graceful signal is ignored.
        try:
            _voice_process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            _voice_process.kill()
            _voice_process.wait(timeout=5)
        _voice_process = None
        return "Voice mode stopped."
    except Exception as e:
        # Best-effort: never raise out of the tool; drop the handle so a
        # new session can be started.
        _voice_process = None
        return f"Error stopping voice mode: {e}"
@mcp.tool()
def voice_status() -> str:
    """Report whether voice mode is running and list the voice commands.

    While a session is active, four modes are available (full_voice,
    tts_only, stt_only, silent); they are switched by saying the commands
    listed in the returned text ('stop listening', 'stop talking',
    'full voice', 'go silent').
    """
    global _voice_process
    proc = _voice_process
    if proc is None:
        return "Voice mode is **not running**.\n\nUse `start_voice()` to begin."
    # A non-None poll() means the child exited on its own; clear the handle.
    if proc.poll() is not None:
        code = proc.returncode
        _voice_process = None
        return f"Voice mode **ended** (exit code: {code}).\n\nUse `start_voice()` to begin again."
    # Session is live — include the endpoint currently in use.
    api_url, _, model = _get_api_settings()
    return f"""Voice mode is **running**.
**Voice Commands:**
- Say **"stop listening"** - TTS only mode (no mic)
- Say **"stop talking"** - STT only mode (no voice)
- Say **"full voice"** - Resume both
- Say **"go silent"** - Pause all
- Say **"stop"** or **"goodbye"** - End session
API: {api_url}
Model: {model or 'default'}"""
@mcp.tool()
def set_voice_mode(mode: str) -> str:
    """Set voice mode for mid-session control.

    Because the voice session runs in a separate subprocess, this tool does
    not switch modes directly; it returns the spoken commands that do
    (full_voice, tts_only, stt_only, silent — aliases 'full', 'tts', 'stt',
    'mute').

    Args:
        mode: Target mode (full_voice, tts_only, stt_only, silent)
    Returns:
        JSON status with mode guidance and voice commands
    """
    global _voice_process
    proc = _voice_process
    running = proc is not None and proc.poll() is None
    if not running:
        error_payload = {
            "error": "Voice mode is not running. Use start_voice() first.",
            "requested_mode": mode,
        }
        return json.dumps(error_payload)
    # The subprocess only accepts spoken commands; point the user at them.
    guidance = {
        "note": "Voice mode runs in a separate process. Use voice commands instead:",
        "commands": {
            "tts_only": "Say 'stop listening'",
            "stt_only": "Say 'stop talking' or 'be quiet'",
            "full_voice": "Say 'full voice' or 'unmute'",
            "silent": "Say 'go silent'",
        },
        "requested_mode": mode,
    }
    return json.dumps(guidance)
@mcp.tool()
def provider_status() -> str:
    """Show available LLM providers."""
    try:
        from voice_client import ProviderManager
        detected = ProviderManager.detect_all()
        if not detected:
            return "No LLM providers available."
        # Check mark for reachable providers, cross for unreachable ones.
        rows = [
            f"- [{'✓' if provider.available else '✗'}] **{provider.name}** ({provider.model or 'unknown'})"
            for provider in detected
        ]
        return "**Available LLM Providers:**\n\n" + "\n".join(rows) + "\n"
    except Exception as e:
        return f"Error: {e}"
# Lazily-created singleton engine instances shared by the speak/listen
# tools; populated on first use by _get_tts_engine()/_get_asr_engine().
_tts_engine = None
_asr_engine = None
def _get_tts_engine():
    """Return the process-wide TTS engine, creating it lazily on first use."""
    global _tts_engine
    if _tts_engine is not None:
        return _tts_engine
    # Deferred import: avoid loading the TTS stack until actually needed.
    from voice_client import TTSEngine
    _tts_engine = TTSEngine()
    return _tts_engine
def _get_asr_engine():
    """Return the process-wide ASR engine, creating it lazily on first use."""
    global _asr_engine
    if _asr_engine is not None:
        return _asr_engine
    # Deferred import: avoid loading the ASR stack until actually needed.
    from voice_client import ASREngine
    _asr_engine = ASREngine()
    return _asr_engine
@mcp.tool()
def speak(text: str, voice: str = "default") -> str:
    """Speak text aloud using Pocket TTS.
    Args:
        text: The text to speak aloud
        voice: Voice to use - 'default' or a custom voice
    """
    try:
        engine = _get_tts_engine()
        if voice == "default":
            engine.load_voice(voice_name="default")
        else:
            # Prefer a local reference WAV for the custom voice; fall back
            # to loading it by name only.
            reference = SCRIPT_DIR / "voice_references" / f"{voice}.wav"
            if reference.exists():
                engine.load_voice(reference, voice)
            else:
                engine.load_voice(voice_name=voice)
        engine.speak(text)
        # Truncate long text in the confirmation message.
        if len(text) > 50:
            return f"Spoke: {text[:50]}..."
        return f"Spoke: {text}"
    except Exception as e:
        return f"Error in speak: {e}"
@mcp.tool()
def listen(max_duration: float = 30.0) -> str:
    """Listen for speech and transcribe to text.
    Args:
        max_duration: Maximum duration to listen in seconds
    """
    try:
        from voice_client import AudioRecorder
        microphone = AudioRecorder()
        transcriber = _get_asr_engine()
        samples = microphone.record_vad(max_duration=max_duration)
        # Fewer than 1600 samples is treated as silence (~0.1 s at 16 kHz,
        # presumably — TODO confirm the recorder's sample rate).
        if len(samples) < 1600:
            return "[No speech detected]"
        transcript = transcriber.transcribe(samples)
        return transcript if transcript else "[Could not transcribe]"
    except Exception as e:
        return f"Error in listen: {e}"
@mcp.tool()
def converse(message: str, voice: str = "default", wait_for_response: bool = True) -> str:
    """Speak a message and optionally listen for a response.
    Args:
        message: The message to speak
        voice: Voice to use
        wait_for_response: Whether to listen for a response after speaking
    """
    try:
        spoken = speak(message, voice)
        if wait_for_response:
            heard = listen()
            return f"{spoken}\n\nUser response: {heard}"
        return spoken
    except Exception as e:
        return f"Error in converse: {e}"
@mcp.tool()
def service(service_name: str, action: str = "status") -> str:
    """Manage voice services (tts, stt, all).

    Args:
        service_name: Which service to report on - "tts", "stt", or "all".
        action: Accepted for interface compatibility; only "status" is
            currently implemented and the value is not otherwise used.

    Returns:
        One line per selected service stating whether its engine is loaded,
        or an error message for an unknown service name.
    """
    global _tts_engine, _asr_engine
    results = []
    if service_name in ("tts", "all"):
        results.append(f"TTS: {'Ready' if _tts_engine else 'Not loaded'}")
    if service_name in ("stt", "all"):
        results.append(f"STT: {'Ready' if _asr_engine else 'Not loaded'}")
    # Fix: an unrecognized service_name previously returned an empty string.
    if not results:
        return f"Unknown service: {service_name}. Use 'tts', 'stt', or 'all'."
    return "\n".join(results)
# Entry point: serve the MCP tools when this file is executed directly.
if __name__ == "__main__":
    mcp.run()