"""
Configuration and shared utilities for Voicemode Server.
This module contains all configuration constants, global state, initialization functions,
and shared utilities used across the voicemode server.
"""
import os
import logging
import asyncio
import subprocess
from pathlib import Path
from typing import Dict, Optional
from datetime import datetime
# ==================== ENVIRONMENT CONFIGURATION ====================
def find_voicemode_env_files() -> list[Path]:
"""
Find .voicemode.env files by walking up the directory tree.
Looks for (in order of priority - closest to current directory wins):
1. .voicemode.env in current or parent directories
2. .voicemode/voicemode.env in current or parent directories
3. ~/.voicemode/voicemode.env in user home (global config)
Returns:
List of Path objects in loading order (global first, then project-specific)
"""
config_files = []
# First add global config (lowest priority - loaded first)
global_config = Path.home() / ".voicemode" / "voicemode.env"
# Backwards compatibility: check for old filename
if not global_config.exists():
old_global = Path.home() / ".voicemode" / ".voicemode.env"
if old_global.exists():
global_config = old_global
if global_config.exists():
config_files.append(global_config)
# Then walk up directory tree for project-specific configs (higher priority)
current_dir = Path.cwd()
project_configs = []
while current_dir != current_dir.parent:
# Check for standalone .voicemode.env first
standalone_file = current_dir / ".voicemode.env"
if standalone_file.exists():
project_configs.append(standalone_file)
break # Stop at first found (closest wins)
# Then check .voicemode/voicemode.env
dir_file = current_dir / ".voicemode" / "voicemode.env"
# Skip if this is the global config file (already added)
if dir_file.exists() and dir_file != global_config:
project_configs.append(dir_file)
break # Stop at first found (closest wins)
current_dir = current_dir.parent
# Add project configs (they were collected closest-first, so add as-is)
config_files.extend(project_configs)
return config_files
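# Illustrative sketch of the resulting order (hypothetical paths): with a global
# ~/.voicemode/voicemode.env and a project file at /work/app/.voicemode.env, running
# from /work/app/src would return:
#
#   >>> find_voicemode_env_files()
#   [PosixPath('/home/user/.voicemode/voicemode.env'), PosixPath('/work/app/.voicemode.env')]
#
# The global file comes first (loaded first), the closest project file last.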
def load_voicemode_env():
"""Load configuration from voicemode.env files, with cascading from global to project-specific."""
config_files = find_voicemode_env_files()
# If no config files found, create default global config
if not config_files:
default_path = Path.home() / ".voicemode" / "voicemode.env"
default_path.parent.mkdir(parents=True, exist_ok=True)
default_config = '''# Voice Mode Configuration File
# This file is automatically generated and can be customized
# Environment variables always take precedence over this file
#############
# Core Configuration
#############
# Base directory for all voicemode data (default: ~/.voicemode)
# VOICEMODE_BASE_DIR=~/.voicemode
# Models directory (default: ~/.voicemode/models)
# VOICEMODE_MODELS_DIR=~/.voicemode/models
# Enable debug mode (true/false)
# VOICEMODE_DEBUG=false
# Enable VAD debug logging (true/false)
# VOICEMODE_VAD_DEBUG=false
# Save all audio and transcriptions (true/false)
# VOICEMODE_SAVE_ALL=false
# Save audio files (true/false)
# VOICEMODE_SAVE_AUDIO=false
# Save transcription files (true/false)
# VOICEMODE_SAVE_TRANSCRIPTIONS=false
# Skip TTS for faster text-only responses (true/false)
# VOICEMODE_SKIP_TTS=false
# Enable audio feedback chimes (true/false)
# VOICEMODE_AUDIO_FEEDBACK=true
# Enable sound fonts for tool use hooks (true/false)
# VOICEMODE_SOUNDFONTS_ENABLED=false
#############
# Tool Loading Configuration
#############
# Control which MCP tools are loaded to reduce token usage
# Whitelist mode - only load specified tools (most efficient)
# VOICEMODE_TOOLS_ENABLED=converse,service
# Blacklist mode - load all tools except specified ones
# VOICEMODE_TOOLS_DISABLED=pronunciation_add,pronunciation_remove
# Examples:
# Minimal (just voice conversation): VOICEMODE_TOOLS_ENABLED=converse
# Voice + config: VOICEMODE_TOOLS_ENABLED=converse,service,config_get,config_set
# Load all tools: VOICEMODE_TOOLS_DISABLED=
# All except pronunciation: VOICEMODE_TOOLS_DISABLED=pronunciation_add,pronunciation_remove,pronunciation_list
# Default: converse,service (basic voice interaction and service management)
#############
# Provider Configuration
#############
# Comma-separated list of TTS endpoints
# VOICEMODE_TTS_BASE_URLS=http://127.0.0.1:8880/v1,https://api.openai.com/v1
# Comma-separated list of STT endpoints
# VOICEMODE_STT_BASE_URLS=http://127.0.0.1:2022/v1,https://api.openai.com/v1
# Comma-separated list of preferred voices
# VOICEMODE_VOICES=af_sky,alloy
# Comma-separated list of preferred models
# VOICEMODE_TTS_MODELS=tts-1,tts-1-hd,gpt-4o-mini-tts
# Prefer local providers over cloud (true/false)
# VOICEMODE_PREFER_LOCAL=true
# Always attempt local providers (true/false)
# VOICEMODE_ALWAYS_TRY_LOCAL=true
# Auto-start Kokoro service (true/false)
# VOICEMODE_AUTO_START_KOKORO=false
#############
# Whisper Configuration
#############
# Whisper model to use (tiny, base, small, medium, large, large-v2, large-v3)
# VOICEMODE_WHISPER_MODEL=base
# Whisper server port (default: 2022)
# VOICEMODE_WHISPER_PORT=2022
# Language for transcription (auto, en, es, fr, de, it, pt, ru, zh, ja, ko, etc.)
# VOICEMODE_WHISPER_LANGUAGE=auto
# Path to Whisper models
# VOICEMODE_WHISPER_MODEL_PATH=~/.voicemode/services/whisper/models
#############
# Kokoro Configuration
#############
# Kokoro server port (default: 8880)
# VOICEMODE_KOKORO_PORT=8880
# Directory for Kokoro models
# VOICEMODE_KOKORO_MODELS_DIR=~/.voicemode/models/kokoro
# Directory for Kokoro cache
# VOICEMODE_KOKORO_CACHE_DIR=~/.voicemode/cache/kokoro
# Default Kokoro voice
# VOICEMODE_KOKORO_DEFAULT_VOICE=af_sky
#############
# LiveKit Configuration
#############
# LiveKit server port (default: 7880)
# VOICEMODE_LIVEKIT_PORT=7880
# Frontend server host (default: 127.0.0.1)
# VOICEMODE_FRONTEND_HOST=127.0.0.1
# Frontend server port (default: 3000)
# VOICEMODE_FRONTEND_PORT=3000
#############
# Recording & Voice Activity Detection
#############
# Default maximum listening duration in seconds (default: 120)
# VOICEMODE_DEFAULT_LISTEN_DURATION=120.0
# Disable silence detection for noisy environments (true/false)
# VOICEMODE_DISABLE_SILENCE_DETECTION=false
# VAD aggressiveness level 0-3, higher = more strict (default: 2)
# VOICEMODE_VAD_AGGRESSIVENESS=2
# Silence threshold in milliseconds before stopping (default: 1000)
# VOICEMODE_SILENCE_THRESHOLD_MS=1000
# Minimum recording duration in seconds (default: 0.5)
# VOICEMODE_MIN_RECORDING_DURATION=0.5
# Initial silence grace period before VAD starts (default: 1.0)
# VOICEMODE_INITIAL_SILENCE_GRACE_PERIOD=1.0
# Audio feedback chime timing
# Silence before chime in seconds - helps Bluetooth devices wake up (default: 0.1)
# VOICEMODE_CHIME_LEADING_SILENCE=0.1
# Silence after chime in seconds - prevents cutoff (default: 0.2)
# VOICEMODE_CHIME_TRAILING_SILENCE=0.2
#############
# Audio Format Configuration
#############
# Global audio format: pcm, opus, mp3, wav, flac, aac (default: pcm)
# VOICEMODE_AUDIO_FORMAT=pcm
# TTS-specific format override (default: pcm for optimal streaming)
# VOICEMODE_TTS_AUDIO_FORMAT=pcm
# STT-specific format override (default: mp3 if global format is pcm, otherwise uses global format)
# VOICEMODE_STT_AUDIO_FORMAT=mp3
# Format-specific quality settings
# VOICEMODE_OPUS_BITRATE=32000
# VOICEMODE_MP3_BITRATE=64k
# VOICEMODE_AAC_BITRATE=64k
#############
# Streaming Configuration
#############
# Enable streaming playback for lower latency (true/false, default: true)
# VOICEMODE_STREAMING_ENABLED=true
# Download chunk size in bytes (default: 4096)
# VOICEMODE_STREAM_CHUNK_SIZE=4096
# Initial buffer before playback starts in milliseconds (default: 150)
# VOICEMODE_STREAM_BUFFER_MS=150
# Maximum buffer size in seconds (default: 2.0)
# VOICEMODE_STREAM_MAX_BUFFER=2.0
#############
# Event Logging
#############
# Enable comprehensive event logging (true/false, default: true)
# VOICEMODE_EVENT_LOG_ENABLED=true
# Event log directory (default: ~/.voicemode/logs/events)
# VOICEMODE_EVENT_LOG_DIR=~/.voicemode/logs/events
# Log rotation policy (currently only 'daily' supported)
# VOICEMODE_EVENT_LOG_ROTATION=daily
#############
# Pronunciation System
#############
# Enable pronunciation rules (true/false, default: true)
# VOICEMODE_PRONUNCIATION_ENABLED=true
# Custom pronunciation config paths (comma-separated)
# VOICEMODE_PRONUNCIATION_CONFIG=
# Log pronunciation substitutions (true/false, default: false)
# VOICEMODE_PRONUNCIATION_LOG_SUBSTITUTIONS=false
# Privacy mode - hide pronunciations from LLM tool listings (true/false, default: false)
# VOICEMODE_PRONUNCIATION_PRIVATE_MODE=false
#############
# Think Out Loud Mode (Experimental)
#############
# Enable multi-voice thinking mode (true/false, default: false)
# VOICEMODE_THINK_OUT_LOUD=false
# Voice persona mappings for thinking roles (role:voice pairs, comma-separated)
# VOICEMODE_THINKING_VOICES=analytical:am_adam,creative:af_sarah,critical:af_bella,synthesis:af_nova
# Thinking presentation style: sequential, debate, or chorus (default: sequential)
# VOICEMODE_THINKING_STYLE=sequential
# Announce which voice is speaking (true/false, default: true)
# VOICEMODE_THINKING_ANNOUNCE_VOICE=true
#############
# Service Management
#############
# Auto-enable services after installation (true/false, default: true)
# VOICEMODE_SERVICE_AUTO_ENABLE=true
#############
# Advanced Configuration
#############
# Download progress style: auto, rich, simple (default: auto)
# VOICEMODE_PROGRESS_STYLE=auto
#############
# API Keys (set these in your environment for security)
#############
# OpenAI API key for cloud TTS/STT
# OPENAI_API_KEY=your-key-here
# LiveKit server URL
# LIVEKIT_URL=ws://127.0.0.1:7880
# LiveKit API credentials
# LIVEKIT_API_KEY=devkey
# LIVEKIT_API_SECRET=secret
'''
with open(default_path, 'w') as f:
f.write(default_config)
os.chmod(default_path, 0o600) # Secure permissions
config_files = [default_path]
    # Load configuration from all files in order (global first, project-specific last,
    # so the more specific file wins for any key both files define)
    keys_set_this_load = set()
    for config_path in config_files:
        if config_path.exists():
            with open(config_path, 'r') as f:
                for line in f:
                    line = line.strip()
                    # Skip comments and empty lines
                    if not line or line.startswith('#'):
                        continue
                    # Parse KEY=VALUE format
                    if '=' in line:
                        key, value = line.split('=', 1)
                        key = key.strip()
                        value = value.strip()
                        # Real environment variables always take precedence; keys set from an
                        # earlier config file during this load may be overridden by a later one
                        if key and (key not in os.environ or key in keys_set_this_load):
                            os.environ[key] = value
                            keys_set_this_load.add(key)
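# Worked example of the cascade (illustrative, hypothetical key): if the global file sets
# VOICEMODE_WHISPER_MODEL=base and the project file sets VOICEMODE_WHISPER_MODEL=small, the
# project value wins; if VOICEMODE_WHISPER_MODEL was already exported in the shell, that
# exported value is kept and neither file changes it.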
# Load configuration file before other configuration
load_voicemode_env()
# Helper function to parse boolean environment variables
def env_bool(env_var: str, default: bool = False) -> bool:
"""Parse boolean from environment variable."""
value = os.getenv(env_var, "").lower()
return value in ("true", "1", "yes", "on") if value else default
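# Illustrative doctest-style examples (not executed); the truthy set matches the inline
# parsing used elsewhere in this module. VOICEMODE_UNSET_EXAMPLE is a hypothetical name.
#
#   >>> os.environ["VOICEMODE_DEBUG"] = "Yes"
#   >>> env_bool("VOICEMODE_DEBUG")
#   True
#   >>> env_bool("VOICEMODE_UNSET_EXAMPLE", default=True)   # unset -> default
#   True
#   >>> os.environ["VOICEMODE_DEBUG"] = "off"
#   >>> env_bool("VOICEMODE_DEBUG")
#   False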
# Helper function to expand paths with tilde
def expand_path(path_str: str) -> Path:
"""Expand tilde and environment variables in path strings."""
# First expand any environment variables
expanded = os.path.expandvars(path_str)
# Then expand tilde
expanded = os.path.expanduser(expanded)
return Path(expanded)
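# Illustrative examples (hypothetical home directory): environment variables expand first,
# then the tilde.
#
#   >>> expand_path("~/.voicemode/models")
#   PosixPath('/home/user/.voicemode/models')
#   >>> expand_path("$HOME/.voicemode")
#   PosixPath('/home/user/.voicemode')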
# Base directory for all voicemode data
BASE_DIR = expand_path(os.getenv("VOICEMODE_BASE_DIR", str(Path.home() / ".voicemode")))
# Unified directory structure
AUDIO_DIR = BASE_DIR / "audio"
TRANSCRIPTIONS_DIR = BASE_DIR / "transcriptions"
LOGS_DIR = BASE_DIR / "logs"
# CONFIG_DIR = BASE_DIR / "config" # Removed - config stored in .voicemode.env file instead
MODELS_DIR = expand_path(os.getenv("VOICEMODE_MODELS_DIR", str(BASE_DIR / "models")))
# Debug configuration
DEBUG = os.getenv("VOICEMODE_DEBUG", "").lower() in ("true", "1", "yes", "on")
TRACE_DEBUG = os.getenv("VOICEMODE_DEBUG", "").lower() == "trace"
VAD_DEBUG = os.getenv("VOICEMODE_VAD_DEBUG", "").lower() in ("true", "1", "yes", "on")
DEBUG_DIR = LOGS_DIR / "debug" # Debug files now go under logs
# Master save-all configuration
SAVE_ALL = os.getenv("VOICEMODE_SAVE_ALL", "").lower() in ("true", "1", "yes", "on")
# Audio saving configuration
# Enable if SAVE_ALL is true, DEBUG is true, or individually enabled
SAVE_AUDIO = SAVE_ALL or DEBUG or os.getenv("VOICEMODE_SAVE_AUDIO", "").lower() in ("true", "1", "yes", "on")
SAVE_TRANSCRIPTIONS = SAVE_ALL or DEBUG or os.getenv("VOICEMODE_SAVE_TRANSCRIPTIONS", "").lower() in ("true", "1", "yes", "on")
# Audio feedback configuration
AUDIO_FEEDBACK_ENABLED = os.getenv("VOICEMODE_AUDIO_FEEDBACK", "true").lower() in ("true", "1", "yes", "on")
# Skip TTS configuration (skip text-to-speech for faster responses)
SKIP_TTS = os.getenv("VOICEMODE_SKIP_TTS", "false").lower() in ("true", "1", "yes", "on")
# Local provider preference configuration
PREFER_LOCAL = os.getenv("VOICEMODE_PREFER_LOCAL", "true").lower() in ("true", "1", "yes", "on")
# Always try local providers (don't mark them as permanently unavailable)
ALWAYS_TRY_LOCAL = os.getenv("VOICEMODE_ALWAYS_TRY_LOCAL", "true").lower() in ("true", "1", "yes", "on")
# Simple failover (without health checks) is now the only provider-selection mode;
# the configuration option that used to control this has been removed
# Auto-start configuration
AUTO_START_KOKORO = os.getenv("VOICEMODE_AUTO_START_KOKORO", "").lower() in ("true", "1", "yes", "on")
# ==================== SERVICE CONFIGURATION ====================
# OpenAI configuration
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Helper function to parse comma-separated lists
def parse_comma_list(env_var: str, fallback: str) -> list:
"""Parse comma-separated list from environment variable."""
value = os.getenv(env_var, fallback)
return [item.strip() for item in value.split(",") if item.strip()]
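# Illustrative example: whitespace around items is stripped and empty items are dropped.
#
#   >>> os.environ["VOICEMODE_VOICES"] = " af_sky, alloy ,"
#   >>> parse_comma_list("VOICEMODE_VOICES", "af_sky,alloy")
#   ['af_sky', 'alloy']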
# New provider endpoint lists configuration
TTS_BASE_URLS = parse_comma_list("VOICEMODE_TTS_BASE_URLS", "http://127.0.0.1:8880/v1,https://api.openai.com/v1")
STT_BASE_URLS = parse_comma_list("VOICEMODE_STT_BASE_URLS", "http://127.0.0.1:2022/v1,https://api.openai.com/v1")
TTS_VOICES = parse_comma_list("VOICEMODE_VOICES", "af_sky,alloy")
TTS_MODELS = parse_comma_list("VOICEMODE_TTS_MODELS", "tts-1,tts-1-hd,gpt-4o-mini-tts")
# Voice preferences cache
_cached_voice_preferences: Optional[list] = None
_voice_preferences_loaded = False
def get_voice_preferences() -> list[str]:
"""
Get voice preferences from configuration.
Uses the VOICEMODE_VOICES configuration which is loaded from:
1. Environment variables (highest priority)
2. Project-specific .voicemode.env files
3. Global ~/.voicemode/voicemode.env file
4. Built-in defaults
Returns:
List of voice names in preference order
"""
global _cached_voice_preferences, _voice_preferences_loaded
# Return cached preferences if already loaded
if _voice_preferences_loaded:
return _cached_voice_preferences or []
_voice_preferences_loaded = True
# Get voices from TTS_VOICES configuration
_cached_voice_preferences = TTS_VOICES.copy()
logger.info(f"Voice preferences loaded: {_cached_voice_preferences}")
return _cached_voice_preferences
def clear_voice_preferences_cache():
"""Clear the voice preferences cache, forcing a reload on next access."""
global _cached_voice_preferences, _voice_preferences_loaded
_cached_voice_preferences = None
_voice_preferences_loaded = False
logger.debug("Voice preferences cache cleared")
def reload_configuration():
"""Reload configuration from files and clear all caches."""
# Clear voice preferences cache
clear_voice_preferences_cache()
# Reload environment configuration
load_voicemode_env()
# Update global configuration variables
global TTS_VOICES, TTS_MODELS, TTS_BASE_URLS, STT_BASE_URLS
TTS_BASE_URLS = parse_comma_list("VOICEMODE_TTS_BASE_URLS", "http://127.0.0.1:8880/v1,https://api.openai.com/v1")
STT_BASE_URLS = parse_comma_list("VOICEMODE_STT_BASE_URLS", "http://127.0.0.1:2022/v1,https://api.openai.com/v1")
TTS_VOICES = parse_comma_list("VOICEMODE_VOICES", "af_sky,alloy")
TTS_MODELS = parse_comma_list("VOICEMODE_TTS_MODELS", "tts-1,tts-1-hd,gpt-4o-mini-tts")
logger.info("Configuration reloaded successfully")
# Legacy variables have been removed - use the new list-based configuration:
# - VOICEMODE_TTS_BASE_URLS (comma-separated list)
# - VOICEMODE_STT_BASE_URLS (comma-separated list)
# - VOICEMODE_VOICES (comma-separated list)
# - VOICEMODE_TTS_MODELS (comma-separated list)
# LiveKit connection settings (LIVEKIT_URL, LIVEKIT_API_KEY, LIVEKIT_API_SECRET) are
# defined once in the LIVEKIT CONFIGURATION section below.
# ==================== WHISPER CONFIGURATION ====================
# Default Whisper model for installation and runtime
DEFAULT_WHISPER_MODEL = "base"
# Whisper-specific configuration
WHISPER_MODEL = os.getenv("VOICEMODE_WHISPER_MODEL", DEFAULT_WHISPER_MODEL)
WHISPER_PORT = int(os.getenv("VOICEMODE_WHISPER_PORT", "2022"))
WHISPER_LANGUAGE = os.getenv("VOICEMODE_WHISPER_LANGUAGE", "auto")
WHISPER_MODEL_PATH = expand_path(os.getenv("VOICEMODE_WHISPER_MODEL_PATH", str(Path.home() / ".voicemode" / "services" / "whisper" / "models")))
# ==================== KOKORO CONFIGURATION ====================
# Kokoro-specific configuration
KOKORO_PORT = int(os.getenv("VOICEMODE_KOKORO_PORT", "8880"))
KOKORO_MODELS_DIR = expand_path(os.getenv("VOICEMODE_KOKORO_MODELS_DIR", str(BASE_DIR / "models" / "kokoro")))
KOKORO_CACHE_DIR = expand_path(os.getenv("VOICEMODE_KOKORO_CACHE_DIR", str(BASE_DIR / "cache" / "kokoro")))
KOKORO_DEFAULT_VOICE = os.getenv("VOICEMODE_KOKORO_DEFAULT_VOICE", "af_sky")
# ==================== LIVEKIT CONFIGURATION ====================
# LiveKit-specific configuration
LIVEKIT_PORT = int(os.getenv("VOICEMODE_LIVEKIT_PORT", "7880"))
LIVEKIT_URL = os.getenv("LIVEKIT_URL", f"ws://localhost:{LIVEKIT_PORT}")
LIVEKIT_API_KEY = os.getenv("LIVEKIT_API_KEY", "devkey")
LIVEKIT_API_SECRET = os.getenv("LIVEKIT_API_SECRET", "secret")
# LiveKit Frontend configuration
FRONTEND_HOST = os.getenv("VOICEMODE_FRONTEND_HOST", "127.0.0.1")
FRONTEND_PORT = int(os.getenv("VOICEMODE_FRONTEND_PORT", "3000"))
# ==================== SERVICE MANAGEMENT CONFIGURATION ====================
# Auto-enable services after installation
SERVICE_AUTO_ENABLE = env_bool("VOICEMODE_SERVICE_AUTO_ENABLE", True)
# ==================== SOUND FONTS CONFIGURATION ====================
# Sound fonts are disabled by default to avoid annoying users with unexpected sounds
SOUNDFONTS_ENABLED = env_bool("VOICEMODE_SOUNDFONTS_ENABLED", False)
# ==================== AUDIO CONFIGURATION ====================
# Audio parameters
SAMPLE_RATE = 24000 # Standard TTS sample rate for both OpenAI and Kokoro
CHANNELS = 1
# ==================== SILENCE DETECTION CONFIGURATION ====================
# Disable silence detection (useful for noisy environments)
# Silence detection is enabled by default
DISABLE_SILENCE_DETECTION = os.getenv("VOICEMODE_DISABLE_SILENCE_DETECTION", "false").lower() in ("true", "1", "yes", "on")
# VAD (Voice Activity Detection) configuration
VAD_AGGRESSIVENESS = int(os.getenv("VOICEMODE_VAD_AGGRESSIVENESS", "2")) # 0-3, higher = more aggressive
SILENCE_THRESHOLD_MS = int(os.getenv("VOICEMODE_SILENCE_THRESHOLD_MS", "1000")) # Stop after 1000ms (1 second) of silence
MIN_RECORDING_DURATION = float(os.getenv("VOICEMODE_MIN_RECORDING_DURATION", "0.5")) # Minimum 0.5s recording
VAD_CHUNK_DURATION_MS = 30 # VAD frame size (must be 10, 20, or 30ms)
INITIAL_SILENCE_GRACE_PERIOD = float(os.getenv("VOICEMODE_INITIAL_SILENCE_GRACE_PERIOD", "1"))  # Default 1.0s grace period before silence detection can end a recording
# Default listen duration for converse tool
DEFAULT_LISTEN_DURATION = float(os.getenv("VOICEMODE_DEFAULT_LISTEN_DURATION", "120.0")) # Default 120s listening time
# Audio feedback chime configuration
# Leading silence before chimes to allow Bluetooth devices to wake up
CHIME_LEADING_SILENCE = float(os.getenv("VOICEMODE_CHIME_LEADING_SILENCE", "0.1")) # Default 0.1s - minimal delay for Bluetooth
# Trailing silence after chimes to prevent cutoff
CHIME_TRAILING_SILENCE = float(os.getenv("VOICEMODE_CHIME_TRAILING_SILENCE", "0.2")) # Default 0.2s - reduced for responsiveness
# Audio format configuration
AUDIO_FORMAT = os.getenv("VOICEMODE_AUDIO_FORMAT", "pcm").lower()
TTS_AUDIO_FORMAT = os.getenv("VOICEMODE_TTS_AUDIO_FORMAT", "pcm").lower() # Default to PCM for optimal streaming
# STT requires a format supported by the STT provider - PCM is not supported by OpenAI Whisper
STT_AUDIO_FORMAT = os.getenv("VOICEMODE_STT_AUDIO_FORMAT", "mp3" if AUDIO_FORMAT == "pcm" else AUDIO_FORMAT).lower()
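# Worked example of the defaults above (illustrative): with no overrides set, AUDIO_FORMAT and
# TTS_AUDIO_FORMAT are both "pcm" (best for streaming), while STT_AUDIO_FORMAT falls back to
# "mp3" because PCM uploads are not accepted by OpenAI Whisper. Setting VOICEMODE_AUDIO_FORMAT=opus
# changes the STT default to "opus"; the TTS default stays "pcm" unless VOICEMODE_TTS_AUDIO_FORMAT
# is set explicitly.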
# Supported audio formats
SUPPORTED_AUDIO_FORMATS = ["pcm", "opus", "mp3", "wav", "flac", "aac"]
# Validate formats (validation messages will be logged after logger is initialized)
if AUDIO_FORMAT not in SUPPORTED_AUDIO_FORMATS:
_invalid_audio_format = AUDIO_FORMAT
AUDIO_FORMAT = "pcm"
if TTS_AUDIO_FORMAT not in SUPPORTED_AUDIO_FORMATS:
_invalid_tts_format = TTS_AUDIO_FORMAT
TTS_AUDIO_FORMAT = AUDIO_FORMAT
if STT_AUDIO_FORMAT not in SUPPORTED_AUDIO_FORMATS:
_invalid_stt_format = STT_AUDIO_FORMAT
STT_AUDIO_FORMAT = AUDIO_FORMAT
# Format-specific quality settings
OPUS_BITRATE = int(os.getenv("VOICEMODE_OPUS_BITRATE", "32000")) # Default 32kbps for voice
MP3_BITRATE = os.getenv("VOICEMODE_MP3_BITRATE", "64k") # Default 64kbps
AAC_BITRATE = os.getenv("VOICEMODE_AAC_BITRATE", "64k") # Default 64kbps
# ==================== STREAMING CONFIGURATION ====================
# Streaming playback configuration
STREAMING_ENABLED = os.getenv("VOICEMODE_STREAMING_ENABLED", "true").lower() in ("true", "1", "yes", "on")
STREAM_CHUNK_SIZE = int(os.getenv("VOICEMODE_STREAM_CHUNK_SIZE", "4096")) # Download chunk size
STREAM_BUFFER_MS = int(os.getenv("VOICEMODE_STREAM_BUFFER_MS", "150")) # Initial buffer before playback
STREAM_MAX_BUFFER = float(os.getenv("VOICEMODE_STREAM_MAX_BUFFER", "2.0")) # Max buffer in seconds
# ==================== EVENT LOGGING CONFIGURATION ====================
# Event logging is enabled by default; setting VOICEMODE_SAVE_ALL also forces it on
EVENT_LOG_ENABLED = SAVE_ALL or os.getenv("VOICEMODE_EVENT_LOG_ENABLED", "true").lower() in ("true", "1", "yes", "on")
EVENT_LOG_DIR = os.getenv("VOICEMODE_EVENT_LOG_DIR", str(LOGS_DIR / "events"))
EVENT_LOG_ROTATION = os.getenv("VOICEMODE_EVENT_LOG_ROTATION", "daily") # Currently only daily is supported
# ==================== GLOBAL STATE ====================
# Service management
service_processes: Dict[str, subprocess.Popen] = {}
# Concurrency control for audio operations
# This prevents multiple audio operations from interfering with stdio
audio_operation_lock = asyncio.Lock()
# Flag to track if startup initialization has run
_startup_initialized = False
# ==================== LOGGING CONFIGURATION ====================
def setup_logging() -> logging.Logger:
"""Configure logging for the voice-mode server.
Returns:
Logger instance configured for voice-mode
"""
log_level = logging.DEBUG if DEBUG else logging.INFO
logging.basicConfig(
level=log_level,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("voicemode")
# Trace logging setup
if TRACE_DEBUG:
import sys
from datetime import datetime
        # Create debug log directory (DEBUG_DIR honours a custom VOICEMODE_BASE_DIR)
        debug_log_dir = DEBUG_DIR
debug_log_dir.mkdir(parents=True, exist_ok=True)
# Create dated debug log file
debug_log_file = debug_log_dir / f"voicemode_debug_{datetime.now().strftime('%Y-%m-%d')}.log"
# Set up file handler for debug logs
debug_handler = logging.FileHandler(debug_log_file, mode='a')
debug_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
# Enable debug logging for httpx and openai
httpx_logger = logging.getLogger("httpx")
httpx_logger.setLevel(logging.DEBUG)
httpx_logger.addHandler(debug_handler)
openai_logger = logging.getLogger("openai")
openai_logger.setLevel(logging.DEBUG)
openai_logger.addHandler(debug_handler)
# Also add to main logger
logger.addHandler(debug_handler)
logger.info(f"Trace debug logging enabled, writing to {debug_log_file}")
# Legacy trace file support
trace_file = Path.home() / "voicemode_trace.log"
trace_logger = logging.getLogger("voicemode.trace")
trace_handler = logging.FileHandler(trace_file, mode='a')
trace_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
trace_logger.addHandler(trace_handler)
trace_logger.setLevel(logging.DEBUG)
def trace_calls(frame, event, arg):
if event == 'call':
code = frame.f_code
if 'voicemode' in code.co_filename or 'voice_mode' in code.co_filename:
trace_logger.debug(f"Called {code.co_filename}:{frame.f_lineno} {code.co_name}")
elif event == 'exception':
trace_logger.debug(f"Exception: {arg}")
return trace_calls
sys.settrace(trace_calls)
logger.info(f"Trace debugging enabled, writing to: {trace_file}")
# Also log to file in debug mode
if DEBUG:
debug_log_file = Path.home() / "voicemode_debug.log"
file_handler = logging.FileHandler(debug_log_file, mode='a')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)
logger.info(f"Debug logging to file: {debug_log_file}")
# Suppress verbose binary data in HTTP logs
if DEBUG:
# Keep our debug logs but reduce HTTP client verbosity
logging.getLogger("openai._base_client").setLevel(logging.INFO)
logging.getLogger("httpcore").setLevel(logging.INFO)
logging.getLogger("httpx").setLevel(logging.INFO)
return logger
# ==================== DIRECTORY INITIALIZATION ====================
def initialize_directories():
"""Create necessary directories for voicemode data storage."""
# Create base directory
BASE_DIR.mkdir(exist_ok=True)
# Create all subdirectories
AUDIO_DIR.mkdir(exist_ok=True)
TRANSCRIPTIONS_DIR.mkdir(exist_ok=True)
LOGS_DIR.mkdir(exist_ok=True)
# CONFIG_DIR.mkdir(exist_ok=True) # Removed - config stored in .voicemode.env file instead
# Create subdirectories for logs
if DEBUG:
DEBUG_DIR.mkdir(parents=True, exist_ok=True)
# Create events log directory
if EVENT_LOG_ENABLED:
Path(EVENT_LOG_DIR).mkdir(parents=True, exist_ok=True)
# Initialize sound fonts if not present
initialize_soundfonts()
# ==================== SOUND FONTS INITIALIZATION ====================
def initialize_soundfonts():
"""Install default sound fonts from package data if not present."""
import shutil
import importlib.resources
soundfonts_dir = BASE_DIR / "soundfonts"
default_soundfont_dir = soundfonts_dir / "default"
current_symlink = soundfonts_dir / "current"
# Skip if soundfonts already exist (user has customized them)
if default_soundfont_dir.exists():
# Ensure symlink exists if directory exists
if not current_symlink.exists():
try:
current_symlink.symlink_to(default_soundfont_dir.resolve())
except OSError:
# Symlinks might not work on all systems
pass
return
try:
# Create soundfonts directory
soundfonts_dir.mkdir(exist_ok=True)
# Copy default soundfonts from package data
try:
# For Python 3.9+
from importlib.resources import files
package_soundfonts = files("voice_mode.data.soundfonts.default")
if package_soundfonts.is_dir():
# Create the default directory
default_soundfont_dir.mkdir(exist_ok=True)
# Recursively copy all files from package data
def copy_tree(src, dst):
"""Recursively copy directory tree from package data."""
dst.mkdir(exist_ok=True)
for item in src.iterdir():
if item.is_file():
target = dst / item.name
target.write_bytes(item.read_bytes())
elif item.is_dir():
copy_tree(item, dst / item.name)
# Copy entire tree structure
copy_tree(package_soundfonts, default_soundfont_dir)
except ImportError:
# Fallback for older Python versions
import pkg_resources
# Create the default directory
default_soundfont_dir.mkdir(exist_ok=True)
# List all resources in the soundfonts directory
resource_dir = "data/soundfonts/default"
if pkg_resources.resource_exists("voice_mode", resource_dir):
                # Copying a resource tree with pkg_resources would require walking each entry
                # manually; this fallback is not implemented, so default soundfonts are simply
                # not installed on Python versions without importlib.resources.files()
                pass
# Create symlink to current soundfont (points to default)
if default_soundfont_dir.exists() and not current_symlink.exists():
try:
current_symlink.symlink_to(default_soundfont_dir.resolve())
except OSError:
# Symlinks might not work on all systems (e.g., Windows without admin)
pass
except Exception as e:
# Don't fail initialization if soundfonts can't be installed
# They're optional and disabled by default
if DEBUG:
import logging
logging.getLogger("voicemode").debug(f"Could not initialize soundfonts: {e}")
# ==================== UTILITY FUNCTIONS ====================
def get_debug_filename(prefix: str, extension: str) -> str:
"""Generate a timestamped filename for debug files.
Args:
prefix: Prefix for the filename (e.g., 'stt-input', 'tts-output')
extension: File extension (e.g., 'wav', 'mp3')
Returns:
Timestamped filename string
"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
return f"{prefix}_{timestamp}.{extension}"
def get_project_path() -> str:
"""Get the current project path (git root or current working directory)."""
try:
# Try to get git root
result = subprocess.run(
["git", "rev-parse", "--show-toplevel"],
capture_output=True,
text=True,
cwd=os.getcwd()
)
if result.returncode == 0:
return result.stdout.strip()
except Exception:
pass
# Fall back to current working directory
return os.getcwd()
def save_transcription(text: str, prefix: str = "transcript", metadata: Optional[Dict] = None) -> Optional[Path]:
"""Save a transcription to the transcriptions directory.
Args:
text: The transcription text to save
prefix: Prefix for the filename (e.g., 'stt', 'conversation')
metadata: Optional metadata to include at the top of the file
Returns:
Path to the saved file or None if saving is disabled
"""
if not SAVE_TRANSCRIPTIONS:
return None
try:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
filename = f"{prefix}_{timestamp}.txt"
filepath = TRANSCRIPTIONS_DIR / filename
content = []
# Create metadata with project path
if metadata is None:
metadata = {}
metadata["project_path"] = get_project_path()
# Add metadata header
content.append("--- METADATA ---")
for key, value in metadata.items():
content.append(f"{key}: {value}")
content.append("--- TRANSCRIPT ---")
content.append("")
content.append(text)
filepath.write_text("\n".join(content), encoding="utf-8")
logger.debug(f"Transcription saved to: {filepath}")
return filepath
except Exception as e:
logger.error(f"Failed to save transcription: {e}")
return None
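# Illustrative layout of a saved transcription (hypothetical timestamp, path, and text),
# using the default prefix resolution described in the docstring:
#
#   ~/.voicemode/transcriptions/stt_20250101_120000_123.txt
#   --- METADATA ---
#   project_path: /work/app
#   --- TRANSCRIPT ---
#
#   Hello, voicemode.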
# ==================== SOUNDDEVICE WORKAROUND ====================
def disable_sounddevice_stderr_redirect():
"""Comprehensively disable sounddevice's stderr redirection.
This prevents sounddevice from redirecting stderr to /dev/null
which can interfere with audio playback in MCP server context.
"""
try:
import sounddevice as sd
import sys
import atexit
# Method 1: Override _ignore_stderr in various locations
if hasattr(sd, '_sounddevice'):
if hasattr(sd._sounddevice, '_ignore_stderr'):
sd._sounddevice._ignore_stderr = lambda: None
if hasattr(sd, '_ignore_stderr'):
sd._ignore_stderr = lambda: None
# Method 2: Override _check_error if it exists
if hasattr(sd, '_check'):
original_check = sd._check
def safe_check(*args, **kwargs):
# Prevent any stderr manipulation
return original_check(*args, **kwargs)
sd._check = safe_check
# Method 3: Protect file descriptors
original_stderr = sys.stderr
# Create a hook to prevent stderr replacement
def protect_stderr():
if sys.stderr != original_stderr:
sys.stderr = original_stderr
# Install protection
atexit.register(protect_stderr)
except Exception as e:
# Log but continue - audio might still work
        if DEBUG:
            import sys  # sys may not be bound if the 'import sounddevice' above failed
            # Can't use logger here as it's not initialized yet
            print(f"DEBUG: Could not fully disable sounddevice stderr redirect: {e}", file=sys.stderr)
# ==================== HTTP CLIENT CONFIGURATION ====================
# HTTP client configuration for OpenAI clients
HTTP_CLIENT_CONFIG = {
'timeout': {
'total': 30.0,
'connect': 5.0
},
'limits': {
'max_keepalive_connections': 5,
'max_connections': 10
}
}
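# Minimal sketch of how this dict might be consumed when building an httpx-backed OpenAI
# client (an assumption for illustration - the actual consumer lives outside this module):
#
#   import httpx
#   timeout = httpx.Timeout(HTTP_CLIENT_CONFIG['timeout']['total'],
#                           connect=HTTP_CLIENT_CONFIG['timeout']['connect'])
#   limits = httpx.Limits(**HTTP_CLIENT_CONFIG['limits'])
#   http_client = httpx.AsyncClient(timeout=timeout, limits=limits)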
# ==================== INITIALIZATION ====================
# Initialize directories on module import
initialize_directories()
# Apply sounddevice workaround on module import
disable_sounddevice_stderr_redirect()
# Set up logger
logger = setup_logging()
# Log any format validation warnings (the _invalid_* names are only bound above when a
# configured format failed validation)
if '_invalid_audio_format' in globals():
    logger.warning(f"Unsupported audio format '{_invalid_audio_format}', falling back to 'pcm'")
if '_invalid_tts_format' in globals():
    logger.warning(f"Unsupported TTS audio format '{_invalid_tts_format}', falling back to '{AUDIO_FORMAT}'")
if '_invalid_stt_format' in globals():
    logger.warning(f"Unsupported STT audio format '{_invalid_stt_format}', falling back to '{AUDIO_FORMAT}'")
# ==================== AUDIO FORMAT UTILITIES ====================
def get_provider_supported_formats(provider: str, operation: str = "tts") -> list:
"""Get list of audio formats supported by a provider.
Args:
provider: Provider name (e.g., 'openai', 'kokoro', 'whisper-local')
operation: 'tts' or 'stt'
Returns:
List of supported format strings
"""
# Provider format capabilities
# Based on API documentation and testing
provider_formats = {
# TTS providers
"openai": {
"tts": ["opus", "mp3", "aac", "flac", "wav", "pcm"],
"stt": ["mp3", "opus", "wav", "flac", "m4a", "webm"]
},
"kokoro": {
"tts": ["mp3", "opus", "flac", "wav", "pcm"], # AAC is not currently supported
"stt": [] # Kokoro is TTS only
},
# STT providers
"whisper-local": {
"tts": [], # Whisper is STT only
"stt": ["wav", "mp3", "opus", "flac", "m4a"]
},
"openai-whisper": {
"tts": [], # Whisper is STT only
"stt": ["mp3", "opus", "wav", "flac", "m4a", "webm"]
}
}
provider_info = provider_formats.get(provider, {})
return provider_info.get(operation, [])
def validate_audio_format(format: str, provider: str, operation: str = "tts") -> str:
"""Validate and potentially adjust audio format based on provider capabilities.
Args:
format: Requested audio format
provider: Provider name
operation: 'tts' or 'stt'
Returns:
Valid format for the provider (may differ from requested)
"""
supported = get_provider_supported_formats(provider, operation)
if not supported:
logger.warning(f"Provider '{provider}' does not support {operation} operation")
return format
if format in supported:
return format
# Fallback logic - prefer common formats
fallback_order = ["opus", "mp3", "wav"]
for fallback in fallback_order:
if fallback in supported:
logger.info(f"Format '{format}' not supported by {provider}, using '{fallback}' instead")
return fallback
# Last resort - use first supported format
first_supported = supported[0]
logger.warning(f"Using {provider}'s first supported format: {first_supported}")
return first_supported
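# Illustrative doctest-style examples of the fallback chain: a supported format is returned
# unchanged, otherwise the first of opus/mp3/wav that the provider supports is chosen.
#
#   >>> validate_audio_format("pcm", "openai", "tts")
#   'pcm'
#   >>> validate_audio_format("pcm", "openai", "stt")    # PCM not accepted for STT
#   'opus'
#   >>> validate_audio_format("aac", "kokoro", "tts")    # Kokoro has no AAC support
#   'opus'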
def get_audio_loader_for_format(format: str):
"""Get the appropriate AudioSegment loader for a format.
Args:
format: Audio format string
Returns:
AudioSegment method reference or None
"""
from pydub import AudioSegment
format_loaders = {
"mp3": AudioSegment.from_mp3,
"wav": AudioSegment.from_wav,
"opus": AudioSegment.from_ogg, # Opus uses OGG container
"flac": AudioSegment.from_file if not hasattr(AudioSegment, 'from_flac') else AudioSegment.from_flac,
"aac": AudioSegment.from_file, # Generic loader for AAC
"m4a": AudioSegment.from_file, # Generic loader for M4A
"webm": AudioSegment.from_file, # Generic loader for WebM
"ogg": AudioSegment.from_ogg,
"pcm": AudioSegment.from_raw # Requires additional parameters
}
return format_loaders.get(format)
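# Illustrative usage (assumes pydub and ffmpeg are available; filenames are hypothetical):
# most loaders take a path or file-like object directly, while the PCM loader needs the
# raw-audio parameters spelled out.
#
#   loader = get_audio_loader_for_format("mp3")
#   segment = loader("reply.mp3")
#
#   raw_loader = get_audio_loader_for_format("pcm")      # AudioSegment.from_raw
#   segment = raw_loader("reply.pcm", sample_width=2,    # 16-bit samples
#                        frame_rate=SAMPLE_RATE, channels=CHANNELS)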
def get_format_export_params(format: str) -> dict:
"""Get export parameters for a specific audio format.
Args:
format: Audio format string
Returns:
Dict with export parameters for pydub
"""
params = {
"format": format
}
if format == "mp3":
params["bitrate"] = MP3_BITRATE
elif format == "opus":
# Opus in OGG container
params["format"] = "opus" # pydub uses 'opus' for OGG/Opus
params["parameters"] = ["-b:a", str(OPUS_BITRATE)]
elif format == "aac":
params["bitrate"] = AAC_BITRATE
elif format == "flac":
# FLAC is lossless, no bitrate setting
pass
elif format == "wav":
# WAV is uncompressed, no bitrate setting
pass
return params
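# Illustrative usage with pydub (hypothetical segment and filename): the returned dict maps
# straight onto AudioSegment.export().
#
#   params = get_format_export_params("mp3")   # {'format': 'mp3', 'bitrate': '64k'}
#   segment.export("reply.mp3", **params)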
# ==================== THINK OUT LOUD CONFIGURATION ====================
# Enable Think Out Loud mode
THINK_OUT_LOUD_ENABLED = env_bool("VOICEMODE_THINK_OUT_LOUD", False)
# Voice persona mappings for thinking roles (role:voice pairs)
# Default: analytical:am_adam,creative:af_sarah,critical:af_bella,synthesis:af_nova
THINKING_VOICES_STR = os.getenv(
"VOICEMODE_THINKING_VOICES",
"analytical:am_adam,creative:af_sarah,critical:af_bella,synthesis:af_nova"
)
# Parse thinking voices into a dictionary
THINKING_VOICES = {}
for pair in THINKING_VOICES_STR.split(","):
if ":" in pair:
role, voice = pair.strip().split(":", 1)
THINKING_VOICES[role.strip()] = voice.strip()
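# Illustrative result of the parsing above with the default mapping string:
#
#   THINKING_VOICES == {
#       "analytical": "am_adam",
#       "creative": "af_sarah",
#       "critical": "af_bella",
#       "synthesis": "af_nova",
#   }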
# Thinking presentation style: sequential, debate, or chorus
THINKING_STYLE = os.getenv("VOICEMODE_THINKING_STYLE", "sequential")
# Whether to announce which voice is speaking
THINKING_ANNOUNCE_VOICE = env_bool("VOICEMODE_THINKING_ANNOUNCE_VOICE", True)