"""
Configuration and shared utilities for Voicemode Server.
This module contains all configuration constants, global state, initialization functions,
and shared utilities used across the voicemode server.
"""
import os
import logging
import asyncio
import subprocess
from pathlib import Path
from typing import Dict, Optional
from datetime import datetime
# ==================== ENVIRONMENT CONFIGURATION ====================
def find_voicemode_env_files() -> list[Path]:
"""
Find .voicemode.env files by walking up the directory tree.
Looks for (in order of priority - closest to current directory wins):
1. .voicemode.env in current or parent directories
2. .voicemode/voicemode.env in current or parent directories
3. ~/.voicemode/voicemode.env in user home (global config)
Returns:
List of Path objects in loading order (global first, then project-specific)
"""
config_files = []
# First add global config (lowest priority - loaded first)
global_config = Path.home() / ".voicemode" / "voicemode.env"
# Backwards compatibility: check for old filename
if not global_config.exists():
old_global = Path.home() / ".voicemode" / ".voicemode.env"
if old_global.exists():
global_config = old_global
if global_config.exists():
config_files.append(global_config)
# Then walk up directory tree for project-specific configs (higher priority)
current_dir = Path.cwd()
project_configs = []
while current_dir != current_dir.parent:
# Check for standalone .voicemode.env first
standalone_file = current_dir / ".voicemode.env"
if standalone_file.exists():
project_configs.append(standalone_file)
break # Stop at first found (closest wins)
# Then check .voicemode/voicemode.env
dir_file = current_dir / ".voicemode" / "voicemode.env"
# Skip if this is the global config file (already added)
if dir_file.exists() and dir_file != global_config:
project_configs.append(dir_file)
break # Stop at first found (closest wins)
current_dir = current_dir.parent
# Add project configs (they were collected closest-first, so add as-is)
config_files.extend(project_configs)
return config_files
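# Illustrative sketch of the resulting order (hypothetical paths): with a global
# ~/.voicemode/voicemode.env and a project file at /work/app/.voicemode.env, running
# from /work/app/src would return:
#
#   >>> find_voicemode_env_files()
#   [PosixPath('/home/user/.voicemode/voicemode.env'), PosixPath('/work/app/.voicemode.env')]
#
# The global file comes first (loaded first), the closest project file last.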
def load_voicemode_env():
"""Load configuration from voicemode.env files, with cascading from global to project-specific."""
config_files = find_voicemode_env_files()
# If no config files found, create default global config
if not config_files:
default_path = Path.home() / ".voicemode" / "voicemode.env"
default_path.parent.mkdir(parents=True, exist_ok=True)
default_config = '''# Voice Mode Configuration File
# This file is automatically generated and can be customized
# Environment variables always take precedence over this file
#############
# Core Configuration
#############
# Base directory for all voicemode data (default: ~/.voicemode)
# VOICEMODE_BASE_DIR=~/.voicemode
# Models directory (default: ~/.voicemode/models)
# VOICEMODE_MODELS_DIR=~/.voicemode/models
# Enable debug mode (true/false)
# VOICEMODE_DEBUG=false
# Enable VAD debug logging (true/false)
# VOICEMODE_VAD_DEBUG=false
# Save all audio and transcriptions (true/false)
# VOICEMODE_SAVE_ALL=false
# Save audio files (true/false)
# VOICEMODE_SAVE_AUDIO=false
# Save transcription files (true/false)
# VOICEMODE_SAVE_TRANSCRIPTIONS=false
# Skip TTS for faster text-only responses (true/false)
# VOICEMODE_SKIP_TTS=false
# Enable audio feedback chimes (true/false)
# VOICEMODE_AUDIO_FEEDBACK=true
# Enable sound fonts for tool use hooks (true/false)
# VOICEMODE_SOUNDFONTS_ENABLED=false
#############
# Tool Loading Configuration
#############
# Control which MCP tools are loaded to reduce token usage
# Whitelist mode - only load specified tools (most efficient)
# VOICEMODE_TOOLS_ENABLED=converse,service
# Blacklist mode - load all tools except specified ones
# VOICEMODE_TOOLS_DISABLED=pronunciation_add,pronunciation_remove
# Examples:
# Minimal (just voice conversation): VOICEMODE_TOOLS_ENABLED=converse
# Voice + config: VOICEMODE_TOOLS_ENABLED=converse,service,config_get,config_set
# Load all tools: VOICEMODE_TOOLS_DISABLED=
# All except pronunciation: VOICEMODE_TOOLS_DISABLED=pronunciation_add,pronunciation_remove,pronunciation_list
# Default: converse,service (basic voice interaction and service management)
#############
# Provider Configuration
#############
# Comma-separated list of TTS endpoints
# VOICEMODE_TTS_BASE_URLS=http://127.0.0.1:8880/v1,https://api.openai.com/v1
# Comma-separated list of STT endpoints
# VOICEMODE_STT_BASE_URLS=http://127.0.0.1:2022/v1,https://api.openai.com/v1
# Comma-separated list of preferred voices
# VOICEMODE_VOICES=af_sky,alloy
# Comma-separated list of preferred models
# VOICEMODE_TTS_MODELS=tts-1,tts-1-hd,gpt-4o-mini-tts
# Prefer local providers over cloud (true/false)
# VOICEMODE_PREFER_LOCAL=true
# Always attempt local providers (true/false)
# VOICEMODE_ALWAYS_TRY_LOCAL=true
# Auto-start Kokoro service (true/false)
# VOICEMODE_AUTO_START_KOKORO=false
#############
# Whisper Configuration
#############
# Whisper model to use (tiny, base, small, medium, large, large-v2, large-v3)
# VOICEMODE_WHISPER_MODEL=base
# Whisper server port (default: 2022)
# VOICEMODE_WHISPER_PORT=2022
# Language for transcription (auto, en, es, fr, de, it, pt, ru, zh, ja, ko, etc.)
# VOICEMODE_WHISPER_LANGUAGE=auto
# Path to Whisper models
# VOICEMODE_WHISPER_MODEL_PATH=~/.voicemode/services/whisper/models
#############
# Kokoro Configuration
#############
# Kokoro server port (default: 8880)
# VOICEMODE_KOKORO_PORT=8880
# Directory for Kokoro models
# VOICEMODE_KOKORO_MODELS_DIR=~/.voicemode/models/kokoro
# Directory for Kokoro cache
# VOICEMODE_KOKORO_CACHE_DIR=~/.voicemode/cache/kokoro
# Default Kokoro voice
# VOICEMODE_KOKORO_DEFAULT_VOICE=af_sky
#############
# LiveKit Configuration
#############
# LiveKit server port (default: 7880)
# VOICEMODE_LIVEKIT_PORT=7880
# Frontend server host (default: 127.0.0.1)
# VOICEMODE_FRONTEND_HOST=127.0.0.1
# Frontend server port (default: 3000)
# VOICEMODE_FRONTEND_PORT=3000
#############
# Recording & Voice Activity Detection
#############
# Default maximum listening duration in seconds (default: 120)
# VOICEMODE_DEFAULT_LISTEN_DURATION=120.0
# Disable silence detection for noisy environments (true/false)
# VOICEMODE_DISABLE_SILENCE_DETECTION=false
# VAD aggressiveness level 0-3, higher = more strict (default: 2)
# VOICEMODE_VAD_AGGRESSIVENESS=2
# Silence threshold in milliseconds before stopping (default: 1000)
# VOICEMODE_SILENCE_THRESHOLD_MS=1000
# Minimum recording duration in seconds (default: 0.5)
# VOICEMODE_MIN_RECORDING_DURATION=0.5
# Initial silence grace period before VAD starts (default: 1.0)
# VOICEMODE_INITIAL_SILENCE_GRACE_PERIOD=1.0
# Audio feedback chime timing
# Silence before chime in seconds - helps Bluetooth devices wake up (default: 0.1)
# VOICEMODE_CHIME_LEADING_SILENCE=0.1
# Silence after chime in seconds - prevents cutoff (default: 0.2)
# VOICEMODE_CHIME_TRAILING_SILENCE=0.2
#############
# Audio Format Configuration
#############
# Global audio format: pcm, opus, mp3, wav, flac, aac (default: pcm)
# VOICEMODE_AUDIO_FORMAT=pcm
# TTS-specific format override (default: pcm for optimal streaming)
# VOICEMODE_TTS_AUDIO_FORMAT=pcm
# STT-specific format override (default: mp3 if global format is pcm, otherwise uses global format)
# VOICEMODE_STT_AUDIO_FORMAT=mp3
# Format-specific quality settings
# VOICEMODE_OPUS_BITRATE=32000
# VOICEMODE_MP3_BITRATE=64k
# VOICEMODE_AAC_BITRATE=64k
#############
# Streaming Configuration
#############
# Enable streaming playback for lower latency (true/false, default: true)
# VOICEMODE_STREAMING_ENABLED=true
# Download chunk size in bytes (default: 4096)
# VOICEMODE_STREAM_CHUNK_SIZE=4096
# Initial buffer before playback starts in milliseconds (default: 150)
# VOICEMODE_STREAM_BUFFER_MS=150
# Maximum buffer size in seconds (default: 2.0)
# VOICEMODE_STREAM_MAX_BUFFER=2.0
#############
# Event Logging
#############
# Enable comprehensive event logging (true/false, default: true)
# VOICEMODE_EVENT_LOG_ENABLED=true
# Event log directory (default: ~/.voicemode/logs/events)
# VOICEMODE_EVENT_LOG_DIR=~/.voicemode/logs/events
# Log rotation policy (currently only 'daily' supported)
# VOICEMODE_EVENT_LOG_ROTATION=daily
#############
# Pronunciation System
#############
# Enable pronunciation rules (true/false, default: true)
# VOICEMODE_PRONUNCIATION_ENABLED=true
# Custom pronunciation config paths (comma-separated)
# VOICEMODE_PRONUNCIATION_CONFIG=
# Log pronunciation substitutions (true/false, default: false)
# VOICEMODE_PRONUNCIATION_LOG_SUBSTITUTIONS=false
# Privacy mode - hide pronunciations from LLM tool listings (true/false, default: false)
# VOICEMODE_PRONUNCIATION_PRIVATE_MODE=false
#############
# Think Out Loud Mode (Experimental)
#############
# Enable multi-voice thinking mode (true/false, default: false)
# VOICEMODE_THINK_OUT_LOUD=false
# Voice persona mappings for thinking roles (role:voice pairs, comma-separated)
# VOICEMODE_THINKING_VOICES=analytical:am_adam,creative:af_sarah,critical:af_bella,synthesis:af_nova
# Thinking presentation style: sequential, debate, or chorus (default: sequential)
# VOICEMODE_THINKING_STYLE=sequential
# Announce which voice is speaking (true/false, default: true)
# VOICEMODE_THINKING_ANNOUNCE_VOICE=true
#############
# Service Management
#############
# Auto-enable services after installation (true/false, default: true)
# VOICEMODE_SERVICE_AUTO_ENABLE=true
#############
# Advanced Configuration
#############
# Download progress style: auto, rich, simple (default: auto)
# VOICEMODE_PROGRESS_STYLE=auto
#############
# API Keys (set these in your environment for security)
#############
# OpenAI API key for cloud TTS/STT
# OPENAI_API_KEY=your-key-here
# LiveKit server URL
# LIVEKIT_URL=ws://127.0.0.1:7880
# LiveKit API credentials
# LIVEKIT_API_KEY=devkey
# LIVEKIT_API_SECRET=secret
'''
with open(default_path, 'w') as f:
f.write(default_config)
os.chmod(default_path, 0o600) # Secure permissions
config_files = [default_path]
    # Load configuration from all files in order (global first, project-specific last,
    # so the more specific file wins for any key both files define)
    keys_set_this_load = set()
    for config_path in config_files:
        if config_path.exists():
            with open(config_path, 'r') as f:
                for line in f:
                    line = line.strip()
                    # Skip comments and empty lines
                    if not line or line.startswith('#'):
                        continue
                    # Parse KEY=VALUE format
                    if '=' in line:
                        key, value = line.split('=', 1)
                        key = key.strip()
                        value = value.strip()
                        # Real environment variables always take precedence; keys set from an
                        # earlier config file during this load may be overridden by a later one
                        if key and (key not in os.environ or key in keys_set_this_load):
                            os.environ[key] = value
                            keys_set_this_load.add(key)
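# Worked example of the cascade (illustrative, hypothetical key): if the global file sets
# VOICEMODE_WHISPER_MODEL=base and the project file sets VOICEMODE_WHISPER_MODEL=small, the
# project value wins; if VOICEMODE_WHISPER_MODEL was already exported in the shell, that
# exported value is kept and neither file changes it.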
# Load configuration file before other configuration
load_voicemode_env()
# Helper function to parse boolean environment variables
def env_bool(env_var: str, default: bool = False) -> bool:
"""Parse boolean from environment variable."""
value = os.getenv(env_var, "").lower()
return value in ("true", "1", "yes", "on") if value else default
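# Illustrative doctest-style examples (not executed); the truthy set matches the inline
# parsing used elsewhere in this module. VOICEMODE_UNSET_EXAMPLE is a hypothetical name.
#
#   >>> os.environ["VOICEMODE_DEBUG"] = "Yes"
#   >>> env_bool("VOICEMODE_DEBUG")
#   True
#   >>> env_bool("VOICEMODE_UNSET_EXAMPLE", default=True)   # unset -> default
#   True
#   >>> os.environ["VOICEMODE_DEBUG"] = "off"
#   >>> env_bool("VOICEMODE_DEBUG")
#   False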
# Helper function to expand paths with tilde
def expand_path(path_str: str) -> Path:
"""Expand tilde and environment variables in path strings."""
# First expand any environment variables
expanded = os.path.expandvars(path_str)
# Then expand tilde
expanded = os.path.expanduser(expanded)
return Path(expanded)
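# Illustrative examples (hypothetical home directory): environment variables expand first,
# then the tilde.
#
#   >>> expand_path("~/.voicemode/models")
#   PosixPath('/home/user/.voicemode/models')
#   >>> expand_path("$HOME/.voicemode")
#   PosixPath('/home/user/.voicemode')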
# Base directory for all voicemode data
BASE_DIR = expand_path(os.getenv("VOICEMODE_BASE_DIR", str(Path.home() / ".voicemode")))
# Unified directory structure
AUDIO_DIR = BASE_DIR / "audio"
TRANSCRIPTIONS_DIR = BASE_DIR / "transcriptions"
LOGS_DIR = BASE_DIR / "logs"
# CONFIG_DIR = BASE_DIR / "config" # Removed - config stored in .voicemode.env file instead
MODELS_DIR = expand_path(os.getenv("VOICEMODE_MODELS_DIR", str(BASE_DIR / "models")))
# Debug configuration
DEBUG = os.getenv("VOICEMODE_DEBUG", "").lower() in ("true", "1", "yes", "on")
TRACE_DEBUG = os.getenv("VOICEMODE_DEBUG", "").lower() == "trace"
VAD_DEBUG = os.getenv("VOICEMODE_VAD_DEBUG", "").lower() in ("true", "1", "yes", "on")
DEBUG_DIR = LOGS_DIR / "debug" # Debug files now go under logs
# Master save-all configuration
SAVE_ALL = os.getenv("VOICEMODE_SAVE_ALL", "").lower() in ("true", "1", "yes", "on")
# Audio saving configuration
# Enable if SAVE_ALL is true, DEBUG is true, or individually enabled
SAVE_AUDIO = SAVE_ALL or DEBUG or os.getenv("VOICEMODE_SAVE_AUDIO", "").lower() in ("true", "1", "yes", "on")
SAVE_TRANSCRIPTIONS = SAVE_ALL or DEBUG or os.getenv("VOICEMODE_SAVE_TRANSCRIPTIONS", "").lower() in ("true", "1", "yes", "on")
# Audio feedback configuration
AUDIO_FEEDBACK_ENABLED = os.getenv("VOICEMODE_AUDIO_FEEDBACK", "true").lower() in ("true", "1", "yes", "on")
# Skip TTS configuration (skip text-to-speech for faster responses)
SKIP_TTS = os.getenv("VOICEMODE_SKIP_TTS", "false").lower() in ("true", "1", "yes", "on")
# Local provider preference configuration
PREFER_LOCAL = os.getenv("VOICEMODE_PREFER_LOCAL", "true").lower() in ("true", "1", "yes", "on")
# Always try local providers (don't mark them as permanently unavailable)
ALWAYS_TRY_LOCAL = os.getenv("VOICEMODE_ALWAYS_TRY_LOCAL", "true").lower() in ("true", "1", "yes", "on")
# Simple failover (without health checks) is now the only provider-selection mode;
# the configuration option that used to control this has been removed
# Auto-start configuration
AUTO_START_KOKORO = os.getenv("VOICEMODE_AUTO_START_KOKORO", "").lower() in ("true", "1", "yes", "on")
# ==================== SERVICE CONFIGURATION ====================
# OpenAI configuration
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Helper function to parse comma-separated lists
def parse_comma_list(env_var: str, fallback: str) -> list:
"""Parse comma-separated list from environment variable."""
value = os.getenv(env_var, fallback)
return [item.strip() for item in value.split(",") if item.strip()]
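# Illustrative example: whitespace around items is stripped and empty items are dropped.
#
#   >>> os.environ["VOICEMODE_VOICES"] = " af_sky, alloy ,"
#   >>> parse_comma_list("VOICEMODE_VOICES", "af_sky,alloy")
#   ['af_sky', 'alloy']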
# New provider endpoint lists configuration
TTS_BASE_URLS = parse_comma_list("VOICEMODE_TTS_BASE_URLS", "http://127.0.0.1:8880/v1,https://api.openai.com/v1")
STT_BASE_URLS = parse_comma_list("VOICEMODE_STT_BASE_URLS", "http://127.0.0.1:2022/v1,https://api.openai.com/v1")
TTS_VOICES = parse_comma_list("VOICEMODE_VOICES", "af_sky,alloy")
TTS_MODELS = parse_comma_list("VOICEMODE_TTS_MODELS", "tts-1,tts-1-hd,gpt-4o-mini-tts")
# Voice preferences cache
_cached_voice_preferences: Optional[list] = None
_voice_preferences_loaded = False
def get_voice_preferences() -> list[str]:
"""
Get voice preferences from configuration.
Uses the VOICEMODE_VOICES configuration which is loaded from:
1. Environment variables (highest priority)
2. Project-specific .voicemode.env files
3. Global ~/.voicemode/voicemode.env file
4. Built-in defaults
Returns:
List of voice names in preference order
"""
global _cached_voice_preferences, _voice_preferences_loaded
# Return cached preferences if already loaded
if _voice_preferences_loaded:
return _cached_voice_preferences or []
_voice_preferences_loaded = True
# Get voices from TTS_VOICES configuration
_cached_voice_preferences = TTS_VOICES.copy()
logger.info(f"Voice preferences loaded: {_cached_voice_preferences}")
return _cached_voice_preferences
def clear_voice_preferences_cache():
"""Clear the voice preferences cache, forcing a reload on next access."""
global _cached_voice_preferences, _voice_preferences_loaded
_cached_voice_preferences = None
_voice_preferences_loaded = False
logger.debug("Voice preferences cache cleared")
def reload_configuration():
"""Reload configuration from files and clear all caches."""
# Clear voice preferences cache
clear_voice_preferences_cache()
# Reload environment configuration
load_voicemode_env()
# Update global configuration variables
global TTS_VOICES, TTS_MODELS, TTS_BASE_URLS, STT_BASE_URLS
TTS_BASE_URLS = parse_comma_list("VOICEMODE_TTS_BASE_URLS", "http://127.0.0.1:8880/v1,https://api.openai.com/v1")
STT_BASE_URLS = parse_comma_list("VOICEMODE_STT_BASE_URLS", "http://127.0.0.1:2022/v1,https://api.openai.com/v1")
TTS_VOICES = parse_comma_list("VOICEMODE_VOICES", "af_sky,alloy")
TTS_MODELS = parse_comma_list("VOICEMODE_TTS_MODELS", "tts-1,tts-1-hd,gpt-4o-mini-tts")
logger.info("Configuration reloaded successfully")
# Legacy variables have been removed - use the new list-based configuration:
# - VOICEMODE_TTS_BASE_URLS (comma-separated list)
# - VOICEMODE_STT_BASE_URLS (comma-separated list)
# - VOICEMODE_VOICES (comma-separated list)
# - VOICEMODE_TTS_MODELS (comma-separated list)
# LiveKit connection settings (LIVEKIT_URL, LIVEKIT_API_KEY, LIVEKIT_API_SECRET) are
# defined once in the LIVEKIT CONFIGURATION section below.
# ==================== WHISPER CONFIGURATION ====================
# Default Whisper model for installation and runtime
DEFAULT_WHISPER_MODEL = "base"
# Whisper-specific configuration
WHISPER_MODEL = os.getenv("VOICEMODE_WHISPER_MODEL", DEFAULT_WHISPER_MODEL)
WHISPER_PORT = int(os.getenv("VOICEMODE_WHISPER_PORT", "2022"))
WHISPER_LANGUAGE = os.getenv("VOICEMODE_WHISPER_LANGUAGE", "auto")
WHISPER_MODEL_PATH = expand_path(os.getenv("VOICEMODE_WHISPER_MODEL_PATH", str(Path.home() / ".voicemode" / "services" / "whisper" / "models")))
# ==================== KOKORO CONFIGURATION ====================
# Kokoro-specific configuration
KOKORO_PORT = int(os.getenv("VOICEMODE_KOKORO_PORT", "8880"))
KOKORO_MODELS_DIR = expand_path(os.getenv("VOICEMODE_KOKORO_MODELS_DIR", str(BASE_DIR / "models" / "kokoro")))
KOKORO_CACHE_DIR = expand_path(os.getenv("VOICEMODE_KOKORO_CACHE_DIR", str(BASE_DIR / "cache" / "kokoro")))
KOKORO_DEFAULT_VOICE = os.getenv("VOICEMODE_KOKORO_DEFAULT_VOICE", "af_sky")
# ==================== LIVEKIT CONFIGURATION ====================
# LiveKit-specific configuration
LIVEKIT_PORT = int(os.getenv("VOICEMODE_LIVEKIT_PORT", "7880"))
LIVEKIT_URL = os.getenv("LIVEKIT_URL", f"ws://localhost:{LIVEKIT_PORT}")
LIVEKIT_API_KEY = os.getenv("LIVEKIT_API_KEY", "devkey")
LIVEKIT_API_SECRET = os.getenv("LIVEKIT_API_SECRET", "secret")
# LiveKit Frontend configuration
FRONTEND_HOST = os.getenv("VOICEMODE_FRONTEND_HOST", "127.0.0.1")
FRONTEND_PORT = int(os.getenv("VOICEMODE_FRONTEND_PORT", "3000"))
# ==================== SERVICE MANAGEMENT CONFIGURATION ====================
# Auto-enable services after installation
SERVICE_AUTO_ENABLE = env_bool("VOICEMODE_SERVICE_AUTO_ENABLE", True)
# ==================== SOUND FONTS CONFIGURATION ====================
# Sound fonts are disabled by default to avoid annoying users with unexpected sounds
SOUNDFONTS_ENABLED = env_bool("VOICEMODE_SOUNDFONTS_ENABLED", False)
# ==================== AUDIO CONFIGURATION ====================
# Audio parameters
SAMPLE_RATE = 24000 # Standard TTS sample rate for both OpenAI and Kokoro
CHANNELS = 1
# ==================== SILENCE DETECTION CONFIGURATION ====================
# Disable silence detection (useful for noisy environments)
# Silence detection is enabled by default
DISABLE_SILENCE_DETECTION = os.getenv("VOICEMODE_DISABLE_SILENCE_DETECTION", "false").lower() in ("true", "1", "yes", "on")
# VAD (Voice Activity Detection) configuration
VAD_AGGRESSIVENESS = int(os.getenv("VOICEMODE_VAD_AGGRESSIVENESS", "2")) # 0-3, higher = more aggressive
SILENCE_THRESHOLD_MS = int(os.getenv("VOICEMODE_SILENCE_THRESHOLD_MS", "1000")) # Stop after 1000ms (1 second) of silence
MIN_RECORDING_DURATION = float(os.getenv("VOICEMODE_MIN_RECORDING_DURATION", "0.5")) # Minimum 0.5s recording
VAD_CHUNK_DURATION_MS = 30 # VAD frame size (must be 10, 20, or 30ms)
INITIAL_SILENCE_GRACE_PERIOD = float(os.getenv("VOICEMODE_INITIAL_SILENCE_GRACE_PERIOD", "1"))  # Default 1.0s grace period before silence detection can end a recording
# Default listen duration for converse tool
DEFAULT_LISTEN_DURATION = float(os.getenv("VOICEMODE_DEFAULT_LISTEN_DURATION", "120.0")) # Default 120s listening time
# Audio feedback chime configuration
# Leading silence before chimes to allow Bluetooth devices to wake up
CHIME_LEADING_SILENCE = float(os.getenv("VOICEMODE_CHIME_LEADING_SILENCE", "0.1")) # Default 0.1s - minimal delay for Bluetooth
# Trailing silence after chimes to prevent cutoff
CHIME_TRAILING_SILENCE = float(os.getenv("VOICEMODE_CHIME_TRAILING_SILENCE", "0.2")) # Default 0.2s - reduced for responsiveness
# Audio format configuration
AUDIO_FORMAT = os.getenv("VOICEMODE_AUDIO_FORMAT", "pcm").lower()
TTS_AUDIO_FORMAT = os.getenv("VOICEMODE_TTS_AUDIO_FORMAT", "pcm").lower() # Default to PCM for optimal streaming
# STT requires a format supported by the STT provider - PCM is not supported by OpenAI Whisper
STT_AUDIO_FORMAT = os.getenv("VOICEMODE_STT_AUDIO_FORMAT", "mp3" if AUDIO_FORMAT == "pcm" else AUDIO_FORMAT).lower()
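# Worked example of the defaults above (illustrative): with no overrides set, AUDIO_FORMAT and
# TTS_AUDIO_FORMAT are both "pcm" (best for streaming), while STT_AUDIO_FORMAT falls back to
# "mp3" because PCM uploads are not accepted by OpenAI Whisper. Setting VOICEMODE_AUDIO_FORMAT=opus
# changes the STT default to "opus"; the TTS default stays "pcm" unless VOICEMODE_TTS_AUDIO_FORMAT
# is set explicitly.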
# Supported audio formats
SUPPORTED_AUDIO_FORMATS = ["pcm", "opus", "mp3", "wav", "flac", "aac"]
# Validate formats (validation messages will be logged after logger is initialized)
if AUDIO_FORMAT not in SUPPORTED_AUDIO_FORMATS:
_invalid_audio_format = AUDIO_FORMAT
AUDIO_FORMAT = "pcm"
if TTS_AUDIO_FORMAT not in SUPPORTED_AUDIO_FORMATS:
_invalid_tts_format = TTS_AUDIO_FORMAT
TTS_AUDIO_FORMAT = AUDIO_FORMAT
if STT_AUDIO_FORMAT not in SUPPORTED_AUDIO_FORMATS:
_invalid_stt_format = STT_AUDIO_FORMAT
STT_AUDIO_FORMAT = AUDIO_FORMAT
# Format-specific quality settings
OPUS_BITRATE = int(os.getenv("VOICEMODE_OPUS_BITRATE", "32000")) # Default 32kbps for voice
MP3_BITRATE = os.getenv("VOICEMODE_MP3_BITRATE", "64k") # Default 64kbps
AAC_BITRATE = os.getenv("VOICEMODE_AAC_BITRATE", "64k") # Default 64kbps
# ==================== STREAMING CONFIGURATION ====================
# Streaming playback configuration
STREAMING_ENABLED = os.getenv("VOICEMODE_STREAMING_ENABLED", "true").lower() in ("true", "1", "yes", "on")
STREAM_CHUNK_SIZE = int(os.getenv("VOICEMODE_STREAM_CHUNK_SIZE", "4096")) # Download chunk size
STREAM_BUFFER_MS = int(os.getenv("VOICEMODE_STREAM_BUFFER_MS", "150")) # Initial buffer before playback
STREAM_MAX_BUFFER = float(os.getenv("VOICEMODE_STREAM_MAX_BUFFER", "2.0")) # Max buffer in seconds
# ==================== EVENT LOGGING CONFIGURATION ====================
# Event logging is enabled by default; setting VOICEMODE_SAVE_ALL also forces it on
EVENT_LOG_ENABLED = SAVE_ALL or os.getenv("VOICEMODE_EVENT_LOG_ENABLED", "true").lower() in ("true", "1", "yes", "on")
EVENT_LOG_DIR = os.getenv("VOICEMODE_EVENT_LOG_DIR", str(LOGS_DIR / "events"))
EVENT_LOG_ROTATION = os.getenv("VOICEMODE_EVENT_LOG_ROTATION", "daily") # Currently only daily is supported
# ==================== GLOBAL STATE ====================
# Service management
service_processes: Dict[str, subprocess.Popen] = {}
# Concurrency control for audio operations
# This prevents multiple audio operations from interfering with stdio
audio_operation_lock = asyncio.Lock()
# Flag to track if startup initialization has run
_startup_initialized = False
# ==================== LOGGING CONFIGURATION ====================
def setup_logging() -> logging.Logger:
"""Configure logging for the voice-mode server.
Returns:
Logger instance configured for voice-mode
"""
log_level = logging.DEBUG if DEBUG else logging.INFO
logging.basicConfig(
level=log_level,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("voicemode")
# Trace logging setup
if TRACE_DEBUG:
import sys
from datetime import datetime
        # Create debug log directory (DEBUG_DIR honours a custom VOICEMODE_BASE_DIR)
        debug_log_dir = DEBUG_DIR
debug_log_dir.mkdir(parents=True, exist_ok=True)
# Create dated debug log file
debug_log_file = debug_log_dir / f"voicemode_debug_{datetime.now().strftime('%Y-%m-%d')}.log"
# Set up file handler for debug logs
debug_handler = logging.FileHandler(debug_log_file, mode='a')
debug_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
# Enable debug logging for httpx and openai
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.DEBUG)
httpx_logger.addHandler(debug_handler)
openai_logger = logging.getLogger("openai")
openai_logger.setLevel(logging.DEBUG)
openai_logger.addHandler(debug_handler)
# Also add to main logger
logger.addHandler(debug_handler)
logger.info(f"Trace debug logging enabled, writing to {debug_log_file}")
# Legacy trace file support
trace_file = Path.home() / "voicemode_trace.log"
trace_logger = logging.getLogger("voicemode.trace")
trace_handler = logging.FileHandler(trace_file, mode='a')
trace_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
trace_logger.addHandler(trace_handler)
trace_logger.setLevel(logging.DEBUG)
def trace_calls(frame, event, arg):
if event == 'call':
code = frame.f_code
if 'voicemode' in code.co_filename or 'voice_mode' in code.co_filename:
trace_logger.debug(f"Called {code.co_filename}:{frame.f_lineno} {code.co_name}")
elif event == 'exception':
trace_logger.debug(f"Exception: {arg}")
return trace_calls
sys.settrace(trace_calls)
logger.info(f"Trace debugging enabled, writing to: {trace_file}")
# Also log to file in debug mode
if DEBUG:
debug_log_file = Path.home() / "voicemode_debug.log"
file_handler = logging.FileHandler(debug_log_file, mode='a')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
logger.info(f"Debug logging to file: {debug_log_file}")
# Suppress verbose binary data in HTTP logs
if DEBUG:
# Keep our debug logs but reduce HTTP client verbosity
logging.getLogger("openai._base_client").setLevel(logging.INFO)
logging.getLogger("httpcore").setLevel(logging.INFO)
logging.getLogger("httpx").setLevel(logging.INFO)
return logger
# ==================== DIRECTORY INITIALIZATION ====================
def initialize_directories():
"""Create necessary directories for voicemode data storage."""
# Create base directory
BASE_DIR.mkdir(exist_ok=True)
# Create all subdirectories
AUDIO_DIR.mkdir(exist_ok=True)
TRANSCRIPTIONS_DIR.mkdir(exist_ok=True)
LOGS_DIR.mkdir(exist_ok=True)
# CONFIG_DIR.mkdir(exist_ok=True) # Removed - config stored in .voicemode.env file instead
# Create subdirectories for logs
if DEBUG:
DEBUG_DIR.mkdir(parents=True, exist_ok=True)
# Create events log directory
if EVENT_LOG_ENABLED:
Path(EVENT_LOG_DIR).mkdir(parents=True, exist_ok=True)
# Initialize sound fonts if not present
initialize_soundfonts()
# ==================== SOUND FONTS INITIALIZATION ====================
def initialize_soundfonts():
"""Install default sound fonts from package data if not present."""
import shutil
import importlib.resources
soundfonts_dir = BASE_DIR / "soundfonts"
default_soundfont_dir = soundfonts_dir / "default"
current_symlink = soundfonts_dir / "current"
# Skip if soundfonts already exist (user has customized them)
if default_soundfont_dir.exists():
# Ensure symlink exists if directory exists
if not current_symlink.exists():
try:
current_symlink.symlink_to(default_soundfont_dir.resolve())
except OSError:
# Symlinks might not work on all systems
pass
return
try:
# Create soundfonts directory
soundfonts_dir.mkdir(exist_ok=True)
# Copy default soundfonts from package data
try:
# For Python 3.9+
from importlib.resources import files
package_soundfonts = files("voice_mode.data.soundfonts.default")
if package_soundfonts.is_dir():
# Create the default directory
default_soundfont_dir.mkdir(exist_ok=True)
# Recursively copy all files from package data
def copy_tree(src, dst):
"""Recursively copy directory tree from package data."""
dst.mkdir(exist_ok=True)
for item in src.iterdir():
if item.is_file():
target = dst / item.name
target.write_bytes(item.read_bytes())
elif item.is_dir():
copy_tree(item, dst / item.name)
# Copy entire tree structure
copy_tree(package_soundfonts, default_soundfont_dir)
except ImportError:
# Fallback for older Python versions
import pkg_resources
# Create the default directory
default_soundfont_dir.mkdir(exist_ok=True)
# List all resources in the soundfonts directory
resource_dir = "data/soundfonts/default"
if pkg_resources.resource_exists("voice_mode", resource_dir):
                # Copying a resource tree with pkg_resources would require walking each entry
                # manually; this fallback is not implemented, so default soundfonts are simply
                # not installed on Python versions without importlib.resources.files()
                pass
# Create symlink to current soundfont (points to default)
if default_soundfont_dir.exists() and not current_symlink.exists():
try:
current_symlink.symlink_to(default_soundfont_dir.resolve())
except OSError:
# Symlinks might not work on all systems (e.g., Windows without admin)
pass
except Exception as e:
# Don't fail initialization if soundfonts can't be installed
# They're optional and disabled by default
if DEBUG:
import logging
logging.getLogger("voicemode").debug(f"Could not initialize soundfonts: {e}")
# ==================== UTILITY FUNCTIONS ====================
def get_debug_filename(prefix: str, extension: str) -> str:
"""Generate a timestamped filename for debug files.
Args:
prefix: Prefix for the filename (e.g., 'stt-input', 'tts-output')
extension: File extension (e.g., 'wav', 'mp3')
Returns:
Timestamped filename string
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
return f"{prefix}_{timestamp}.{extension}"
def get_project_path() -> str:
"""Get the current project path (git root or current working directory)."""
try:
# Try to get git root
result = subprocess.run(
["git", "rev-parse", "--show-toplevel"],
capture_output=True,
text=True,
cwd=os.getcwd()
)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
# Fall back to current working directory
return os.getcwd()
def save_transcription(text: str, prefix: str = "transcript", metadata: Optional[Dict] = None) -> Optional[Path]:
"""Save a transcription to the transcriptions directory.
Args:
text: The transcription text to save
prefix: Prefix for the filename (e.g., 'stt', 'conversation')
metadata: Optional metadata to include at the top of the file
Returns:
Path to the saved file or None if saving is disabled
"""
if not SAVE_TRANSCRIPTIONS:
return None
try:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
filename = f"{prefix}_{timestamp}.txt"
filepath = TRANSCRIPTIONS_DIR / filename
content = []
# Create metadata with project path
if metadata is None:
metadata = {}
metadata["project_path"] = get_project_path()
# Add metadata header
content.append("--- METADATA ---")
for key, value in metadata.items():
content.append(f"{key}: {value}")
content.append("--- TRANSCRIPT ---")
content.append("")
content.append(text)
filepath.write_text("\n".join(content), encoding="utf-8")
logger.debug(f"Transcription saved to: {filepath}")
return filepath
except Exception as e:
logger.error(f"Failed to save transcription: {e}")
return None
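# Illustrative layout of a saved transcription (hypothetical timestamp, path, and text),
# using the default prefix resolution described in the docstring:
#
#   ~/.voicemode/transcriptions/stt_20250101_120000_123.txt
#   --- METADATA ---
#   project_path: /work/app
#   --- TRANSCRIPT ---
#
#   Hello, voicemode.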
# ==================== SOUNDDEVICE WORKAROUND ====================
def disable_sounddevice_stderr_redirect():
"""Comprehensively disable sounddevice's stderr redirection.
This prevents sounddevice from redirecting stderr to /dev/null
which can interfere with audio playback in MCP server context.
"""
try:
import sounddevice as sd
import sys
import atexit
# Method 1: Override _ignore_stderr in various locations
if hasattr(sd, '_sounddevice'):
if hasattr(sd._sounddevice, '_ignore_stderr'):
sd._sounddevice._ignore_stderr = lambda: None
if hasattr(sd, '_ignore_stderr'):
sd._ignore_stderr = lambda: None
# Method 2: Override _check_error if it exists
if hasattr(sd, '_check'):
original_check = sd._check
def safe_check(*args, **kwargs):
# Prevent any stderr manipulation
return original_check(*args, **kwargs)
sd._check = safe_check
# Method 3: Protect file descriptors
original_stderr = sys.stderr
# Create a hook to prevent stderr replacement
def protect_stderr():
if sys.stderr != original_stderr:
sys.stderr = original_stderr
# Install protection
atexit.register(protect_stderr)
except Exception as e:
# Log but continue - audio might still work
        if DEBUG:
            import sys  # sys may not be bound if the 'import sounddevice' above failed
            # Can't use logger here as it's not initialized yet
            print(f"DEBUG: Could not fully disable sounddevice stderr redirect: {e}", file=sys.stderr)
# ==================== HTTP CLIENT CONFIGURATION ====================
# HTTP client configuration for OpenAI clients
HTTP_CLIENT_CONFIG = {
'timeout': {
'total': 30.0,
'connect': 5.0
},
'limits': {
'max_keepalive_connections': 5,
'max_connections': 10
}
}
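# Minimal sketch of how this dict might be consumed when building an httpx-backed OpenAI
# client (an assumption for illustration - the actual consumer lives outside this module):
#
#   import httpx
#   timeout = httpx.Timeout(HTTP_CLIENT_CONFIG['timeout']['total'],
#                           connect=HTTP_CLIENT_CONFIG['timeout']['connect'])
#   limits = httpx.Limits(**HTTP_CLIENT_CONFIG['limits'])
#   http_client = httpx.AsyncClient(timeout=timeout, limits=limits)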
# ==================== INITIALIZATION ====================
# Initialize directories on module import
initialize_directories()
# Apply sounddevice workaround on module import
disable_sounddevice_stderr_redirect()
# Set up logger
logger = setup_logging()
# Log any format validation warnings (the _invalid_* names are only bound above when a
# configured format failed validation)
if '_invalid_audio_format' in globals():
    logger.warning(f"Unsupported audio format '{_invalid_audio_format}', falling back to 'pcm'")
if '_invalid_tts_format' in globals():
    logger.warning(f"Unsupported TTS audio format '{_invalid_tts_format}', falling back to '{AUDIO_FORMAT}'")
if '_invalid_stt_format' in globals():
    logger.warning(f"Unsupported STT audio format '{_invalid_stt_format}', falling back to '{AUDIO_FORMAT}'")
# ==================== AUDIO FORMAT UTILITIES ====================
def get_provider_supported_formats(provider: str, operation: str = "tts") -> list:
"""Get list of audio formats supported by a provider.
Args:
provider: Provider name (e.g., 'openai', 'kokoro', 'whisper-local')
operation: 'tts' or 'stt'
Returns:
List of supported format strings
"""
# Provider format capabilities
# Based on API documentation and testing
provider_formats = {
# TTS providers
"openai": {
"tts": ["opus", "mp3", "aac", "flac", "wav", "pcm"],
"stt": ["mp3", "opus", "wav", "flac", "m4a", "webm"]
},
"kokoro": {
"tts": ["mp3", "opus", "flac", "wav", "pcm"], # AAC is not currently supported
"stt": [] # Kokoro is TTS only
},
# STT providers
"whisper-local": {
"tts": [], # Whisper is STT only
"stt": ["wav", "mp3", "opus", "flac", "m4a"]
},
"openai-whisper": {
"tts": [], # Whisper is STT only
"stt": ["mp3", "opus", "wav", "flac", "m4a", "webm"]
}
}
provider_info = provider_formats.get(provider, {})
return provider_info.get(operation, [])
def validate_audio_format(format: str, provider: str, operation: str = "tts") -> str:
"""Validate and potentially adjust audio format based on provider capabilities.
Args:
format: Requested audio format
provider: Provider name
operation: 'tts' or 'stt'
Returns:
Valid format for the provider (may differ from requested)
"""
supported = get_provider_supported_formats(provider, operation)
if not supported:
logger.warning(f"Provider '{provider}' does not support {operation} operation")
return format
if format in supported:
return format
# Fallback logic - prefer common formats
fallback_order = ["opus", "mp3", "wav"]
for fallback in fallback_order:
if fallback in supported:
logger.info(f"Format '{format}' not supported by {provider}, using '{fallback}' instead")
return fallback
# Last resort - use first supported format
first_supported = supported[0]
logger.warning(f"Using {provider}'s first supported format: {first_supported}")
return first_supported
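# Illustrative doctest-style examples of the fallback chain: a supported format is returned
# unchanged, otherwise the first of opus/mp3/wav that the provider supports is chosen.
#
#   >>> validate_audio_format("pcm", "openai", "tts")
#   'pcm'
#   >>> validate_audio_format("pcm", "openai", "stt")    # PCM not accepted for STT
#   'opus'
#   >>> validate_audio_format("aac", "kokoro", "tts")    # Kokoro has no AAC support
#   'opus'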
def get_audio_loader_for_format(format: str):
"""Get the appropriate AudioSegment loader for a format.
Args:
format: Audio format string
Returns:
AudioSegment method reference or None
"""
from pydub import AudioSegment
format_loaders = {
"mp3": AudioSegment.from_mp3,
"wav": AudioSegment.from_wav,
"opus": AudioSegment.from_ogg, # Opus uses OGG container
"flac": AudioSegment.from_file if not hasattr(AudioSegment, 'from_flac') else AudioSegment.from_flac,
"aac": AudioSegment.from_file, # Generic loader for AAC
"m4a": AudioSegment.from_file, # Generic loader for M4A
"webm": AudioSegment.from_file, # Generic loader for WebM
"ogg": AudioSegment.from_ogg,
"pcm": AudioSegment.from_raw # Requires additional parameters
}
return format_loaders.get(format)
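# Illustrative usage (assumes pydub and ffmpeg are available; filenames are hypothetical):
# most loaders take a path or file-like object directly, while the PCM loader needs the
# raw-audio parameters spelled out.
#
#   loader = get_audio_loader_for_format("mp3")
#   segment = loader("reply.mp3")
#
#   raw_loader = get_audio_loader_for_format("pcm")      # AudioSegment.from_raw
#   segment = raw_loader("reply.pcm", sample_width=2,    # 16-bit samples
#                        frame_rate=SAMPLE_RATE, channels=CHANNELS)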
def get_format_export_params(format: str) -> dict:
"""Get export parameters for a specific audio format.
Args:
format: Audio format string
Returns:
Dict with export parameters for pydub
"""
params = {
"format": format
}
if format == "mp3":
params["bitrate"] = MP3_BITRATE
elif format == "opus":
# Opus in OGG container
params["format"] = "opus" # pydub uses 'opus' for OGG/Opus
params["parameters"] = ["-b:a", str(OPUS_BITRATE)]
elif format == "aac":
params["bitrate"] = AAC_BITRATE
elif format == "flac":
# FLAC is lossless, no bitrate setting
pass
elif format == "wav":
# WAV is uncompressed, no bitrate setting
pass
return params
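# Illustrative usage with pydub (hypothetical segment and filename): the returned dict maps
# straight onto AudioSegment.export().
#
#   params = get_format_export_params("mp3")   # {'format': 'mp3', 'bitrate': '64k'}
#   segment.export("reply.mp3", **params)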
# ==================== THINK OUT LOUD CONFIGURATION ====================
# Enable Think Out Loud mode
THINK_OUT_LOUD_ENABLED = env_bool("VOICEMODE_THINK_OUT_LOUD", False)
# Voice persona mappings for thinking roles (role:voice pairs)
# Default: analytical:am_adam,creative:af_sarah,critical:af_bella,synthesis:af_nova
THINKING_VOICES_STR = os.getenv(
"VOICEMODE_THINKING_VOICES",
"analytical:am_adam,creative:af_sarah,critical:af_bella,synthesis:af_nova"
)
# Parse thinking voices into a dictionary
THINKING_VOICES = {}
for pair in THINKING_VOICES_STR.split(","):
if ":" in pair:
role, voice = pair.strip().split(":", 1)
THINKING_VOICES[role.strip()] = voice.strip()
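# Illustrative result of the parsing above with the default mapping string:
#
#   THINKING_VOICES == {
#       "analytical": "am_adam",
#       "creative": "af_sarah",
#       "critical": "af_bella",
#       "synthesis": "af_nova",
#   }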
# Thinking presentation style: sequential, debate, or chorus
THINKING_STYLE = os.getenv("VOICEMODE_THINKING_STYLE", "sequential")
# Whether to announce which voice is speaking
THINKING_ANNOUNCE_VOICE = env_bool("VOICEMODE_THINKING_ANNOUNCE_VOICE", True)