"""
Configuration and shared utilities for Voicemode Server.
This module contains all configuration constants, global state, initialization functions,
and shared utilities used across the voicemode server.
"""
import os
import logging
import asyncio
import subprocess
from pathlib import Path
from typing import Dict, Optional
from datetime import datetime
# ==================== ENVIRONMENT CONFIGURATION ====================
def find_voicemode_env_files() -> list[Path]:
    """
    Locate the voicemode.env files that apply to the current working directory.

    Two sources are consulted:
    1. The global file ~/.voicemode/voicemode.env (falling back to the legacy
       name ~/.voicemode/.voicemode.env when only that one exists).
    2. The nearest project file found while walking from the current directory
       towards the filesystem root. In each directory a standalone
       ``.voicemode.env`` is preferred over ``.voicemode/voicemode.env``, and
       the walk stops at the first hit.

    Returns:
        Paths in loading order: the global file first (if present), followed by
        at most one project-specific file.
    """
    voicemode_home = Path.home() / ".voicemode"

    # Global config, with backwards compatibility for the old dotted filename
    global_config = voicemode_home / "voicemode.env"
    if not global_config.exists():
        legacy_global = voicemode_home / ".voicemode.env"
        if legacy_global.exists():
            global_config = legacy_global

    found = [global_config] if global_config.exists() else []

    # Walk upwards; the loop ends before the filesystem root itself is examined
    directory = Path.cwd()
    while directory != directory.parent:
        standalone = directory / ".voicemode.env"
        if standalone.exists():
            found.append(standalone)
            break

        nested = directory / ".voicemode" / "voicemode.env"
        # The global file can show up again when the walk passes through the
        # home directory; it must not be listed twice.
        if nested.exists() and nested != global_config:
            found.append(nested)
            break

        directory = directory.parent

    return found
# Keys whose current os.environ value was written by load_voicemode_env() from a
# config file, as opposed to being exported by the user. Only these keys may be
# overwritten by a later (higher-priority) file or by a later reload; variables
# that came from the real environment are never touched.
_file_loaded_keys: set[str] = set()


def _parse_env_lines(lines: list[str]) -> list[tuple[str, str]]:
    """Parse the lines of a voicemode.env file into (key, value) pairs.

    Blank lines and lines starting with '#' are skipped, as are lines without
    an '='. A value wrapped in matching quotes on one line has the quotes
    removed. A value that opens a quote without closing it on the same line is
    continued over the following lines until one ends with the same quote
    character; if no such line exists, the rest of the file becomes the value.

    Args:
        lines: Raw lines as returned by ``readlines()`` (newlines included).

    Returns:
        Pairs in file order. Entries with an empty key are dropped.
    """
    entries: list[tuple[str, str]] = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        # Skip comments and empty lines
        if not line or line.startswith('#'):
            i += 1
            continue
        # Parse KEY=VALUE format
        if '=' in line:
            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip()
            # Handle quoted values
            if value and value[0] in ('"', "'"):
                quote_char = value[0]
                if len(value) > 1 and value[-1] == quote_char:
                    # Single line quoted value - strip quotes
                    value = value[1:-1]
                else:
                    # Multiline quoted value - collect lines until closing quote
                    value_parts = [value[1:]]  # Start after opening quote
                    i += 1
                    while i < len(lines):
                        next_line = lines[i].rstrip('\n')
                        if next_line.endswith(quote_char):
                            # Found closing quote; i stays on this line so the
                            # outer increment below steps past it.
                            value_parts.append(next_line[:-1])
                            break
                        value_parts.append(next_line)
                        i += 1
                    value = '\n'.join(value_parts)
            if key:
                entries.append((key, value))
        i += 1
    return entries


def load_voicemode_env():
    """Load configuration from voicemode.env files, with cascading from global to project-specific.

    Files are applied in the order returned by find_voicemode_env_files()
    (global first, project-specific last). Precedence, highest first:

    1. Variables already present in the real process environment.
    2. The project-specific file.
    3. The global file.

    A later file therefore overrides a value set by an earlier file, and calling
    this function again picks up edited file values, while variables exported
    by the user always win. Keys removed from a file between two calls keep
    their previously loaded value.

    If no config file exists at all, a commented default template is written to
    ~/.voicemode/voicemode.env (mode 0600) and then loaded.
    """
    config_files = find_voicemode_env_files()
    # If no config files found, create default global config
    if not config_files:
        default_path = Path.home() / ".voicemode" / "voicemode.env"
        default_path.parent.mkdir(parents=True, exist_ok=True)
        default_config = '''# Voice Mode Configuration File
# This file is automatically generated and can be customized
# Environment variables always take precedence over this file
#############
# Core Configuration
#############
# Base directory for all voicemode data (default: ~/.voicemode)
# VOICEMODE_BASE_DIR=~/.voicemode
# Models directory (default: ~/.voicemode/models)
# VOICEMODE_MODELS_DIR=~/.voicemode/models
# Enable debug mode (true/false)
# VOICEMODE_DEBUG=false
# Enable VAD debug logging (true/false)
# VOICEMODE_VAD_DEBUG=false
# Save all audio and transcriptions (true/false)
# VOICEMODE_SAVE_ALL=false
# Save audio files (true/false)
# VOICEMODE_SAVE_AUDIO=false
# Save transcription files (true/false)
# VOICEMODE_SAVE_TRANSCRIPTIONS=false
# Skip TTS for faster text-only responses (true/false)
# VOICEMODE_SKIP_TTS=false
# Metrics output level in converse results (minimal/summary/verbose)
# - minimal: Just the response text, no timing (saves tokens)
# - summary: Response + compact timing string (default)
# - verbose: Response + detailed metrics breakdown
# VOICEMODE_METRICS_LEVEL=summary
# Enable audio feedback chimes (true/false)
# VOICEMODE_AUDIO_FEEDBACK=true
# Enable sound fonts for tool use hooks (true/false, default: true)
# VOICEMODE_SOUNDFONTS_ENABLED=true
#############
# Tool Loading Configuration
#############
# Control which MCP tools are loaded to reduce token usage
# Whitelist mode - only load specified tools (most efficient)
# VOICEMODE_TOOLS_ENABLED=converse,service
# Blacklist mode - load all tools except specified ones
# VOICEMODE_TOOLS_DISABLED=pronunciation_add,pronunciation_remove
# Examples:
# Minimal (just voice conversation): VOICEMODE_TOOLS_ENABLED=converse
# Voice + config: VOICEMODE_TOOLS_ENABLED=converse,service,config_get,config_set
# Load all tools: VOICEMODE_TOOLS_DISABLED=
# All except pronunciation: VOICEMODE_TOOLS_DISABLED=pronunciation_add,pronunciation_remove,pronunciation_list
# Default: converse,service (basic voice interaction and service management)
#############
# Provider Configuration
#############
# Comma-separated list of TTS endpoints
# VOICEMODE_TTS_BASE_URLS=http://127.0.0.1:8880/v1,https://api.openai.com/v1
# Comma-separated list of STT endpoints
# VOICEMODE_STT_BASE_URLS=http://127.0.0.1:2022/v1,https://api.openai.com/v1
# STT prompt for vocabulary biasing - helps Whisper recognize names and technical terms
# Use when specific words are consistently misrecognized
# Example: VOICEMODE_STT_PROMPT=tmux, Tali, kubectl, VoiceMode
# VOICEMODE_STT_PROMPT=
# Comma-separated list of preferred voices
# VOICEMODE_VOICES=af_sky,alloy
# Comma-separated list of preferred models
# VOICEMODE_TTS_MODELS=tts-1,tts-1-hd,gpt-4o-mini-tts
# Prefer local providers over cloud (true/false)
# VOICEMODE_PREFER_LOCAL=true
# Always attempt local providers (true/false)
# VOICEMODE_ALWAYS_TRY_LOCAL=true
# Auto-start Kokoro service (true/false)
# VOICEMODE_AUTO_START_KOKORO=false
#############
# Whisper Configuration
#############
# Whisper model to use (tiny, base, small, medium, large, large-v2, large-v3)
# VOICEMODE_WHISPER_MODEL=base
# Whisper server port (default: 2022)
# VOICEMODE_WHISPER_PORT=2022
# Number of threads for Whisper processing (auto-detected if not set)
# VOICEMODE_WHISPER_THREADS=
# Language for transcription (auto, en, es, fr, de, it, pt, ru, zh, ja, ko, etc.)
# VOICEMODE_WHISPER_LANGUAGE=auto
# Path to Whisper models
# VOICEMODE_WHISPER_MODEL_PATH=~/.voicemode/services/whisper/models
#############
# Kokoro Configuration
#############
# Kokoro server port (default: 8880)
# VOICEMODE_KOKORO_PORT=8880
# Directory for Kokoro models
# VOICEMODE_KOKORO_MODELS_DIR=~/.voicemode/models/kokoro
# Directory for Kokoro cache
# VOICEMODE_KOKORO_CACHE_DIR=~/.voicemode/cache/kokoro
# Default Kokoro voice
# VOICEMODE_KOKORO_DEFAULT_VOICE=af_sky
# Max requests before Kokoro worker restarts (mitigates memory leak)
# See: https://github.com/hexgrad/kokoro/issues/152
# VOICEMODE_KOKORO_MAX_REQUESTS=200
#############
# Recording & Voice Activity Detection
#############
# Default maximum listening duration in seconds (default: 120)
# VOICEMODE_DEFAULT_LISTEN_DURATION=120.0
# Disable silence detection for noisy environments (true/false)
# VOICEMODE_DISABLE_SILENCE_DETECTION=false
# VAD aggressiveness level 0-3, higher = more strict (default: 3)
# VOICEMODE_VAD_AGGRESSIVENESS=3
# Silence threshold in milliseconds before stopping (default: 1000)
# VOICEMODE_SILENCE_THRESHOLD_MS=1000
# Minimum recording duration in seconds (default: 0.5)
# VOICEMODE_MIN_RECORDING_DURATION=0.5
# Initial silence grace period before VAD starts (default: 1.0)
# VOICEMODE_INITIAL_SILENCE_GRACE_PERIOD=1.0
# Audio feedback chime timing
# Silence before chime in seconds - helps Bluetooth devices wake up (default: 0.1)
# VOICEMODE_CHIME_LEADING_SILENCE=0.1
# Silence after chime in seconds - prevents cutoff (default: 0.2)
# VOICEMODE_CHIME_TRAILING_SILENCE=0.2
#############
# Audio Format Configuration
#############
# Global audio format: pcm, opus, mp3, wav, flac, aac (default: pcm)
# VOICEMODE_AUDIO_FORMAT=pcm
# TTS-specific format override (default: pcm for optimal streaming)
# VOICEMODE_TTS_AUDIO_FORMAT=pcm
# STT-specific format override (default: mp3 if global format is pcm, otherwise uses global format)
# VOICEMODE_STT_AUDIO_FORMAT=mp3
# Format-specific quality settings
# VOICEMODE_OPUS_BITRATE=32000
# VOICEMODE_MP3_BITRATE=64k
# VOICEMODE_AAC_BITRATE=64k
#############
# Streaming Configuration
#############
# Enable streaming playback for lower latency (true/false, default: true)
# VOICEMODE_STREAMING_ENABLED=true
# Download chunk size in bytes (default: 4096)
# VOICEMODE_STREAM_CHUNK_SIZE=4096
# Initial buffer before playback starts in milliseconds (default: 150)
# VOICEMODE_STREAM_BUFFER_MS=150
# Maximum buffer size in seconds (default: 2.0)
# VOICEMODE_STREAM_MAX_BUFFER=2.0
#############
# Event Logging
#############
# Enable comprehensive event logging (true/false, default: true)
# VOICEMODE_EVENT_LOG_ENABLED=true
# Event log directory (default: ~/.voicemode/logs/events)
# VOICEMODE_EVENT_LOG_DIR=~/.voicemode/logs/events
# Log rotation policy (currently only 'daily' supported)
# VOICEMODE_EVENT_LOG_ROTATION=daily
#############
# Pronunciation System
#############
# Enable pronunciation middleware (true/false, default: true)
# VOICEMODE_PRONUNCIATION_ENABLED=true
# Default pronunciation rules - common technical terms
# Format: DIRECTION pattern replacement # description
# See docs for full format details
VOICEMODE_PRONOUNCE="
TTS \\bJSON\\b jason # JSON as jason
TTS \\bYAML\\b yammel # YAML as yammel
TTS \\bAPI\\b A P I # API as individual letters
"
# Add custom rules with VOICEMODE_PRONOUNCE_* variables
# VOICEMODE_PRONOUNCE_CUSTOM=
# Log pronunciation substitutions for debugging (true/false, default: false)
# VOICEMODE_PRONUNCIATION_LOG_SUBSTITUTIONS=false
#############
# Think Out Loud Mode (Experimental)
#############
# Enable multi-voice thinking mode (true/false, default: false)
# VOICEMODE_THINK_OUT_LOUD=false
# Voice persona mappings for thinking roles (role:voice pairs, comma-separated)
# VOICEMODE_THINKING_VOICES=analytical:am_adam,creative:af_sarah,critical:af_bella,synthesis:af_nova
# Thinking presentation style: sequential, debate, or chorus (default: sequential)
# VOICEMODE_THINKING_STYLE=sequential
# Announce which voice is speaking (true/false, default: true)
# VOICEMODE_THINKING_ANNOUNCE_VOICE=true
#############
# Service Management
#############
# Auto-enable services after installation (true/false, default: true)
# VOICEMODE_SERVICE_AUTO_ENABLE=true
#############
# HTTP Serve Configuration
#############
# Host/IP address to bind the server to (default: 127.0.0.1)
# VOICEMODE_SERVE_HOST=127.0.0.1
# Port to bind the server to (default: 8765)
# VOICEMODE_SERVE_PORT=8765
# Transport protocol: streamable-http or sse (default: streamable-http)
# VOICEMODE_SERVE_TRANSPORT=streamable-http
# Security: Allow connections from local/private IP ranges (default: true)
# VOICEMODE_SERVE_ALLOW_LOCAL=true
# Security: Allow connections from Anthropic IP ranges for Claude Cowork (default: false)
# VOICEMODE_SERVE_ALLOW_ANTHROPIC=false
# Security: Allow connections from Tailscale IP range 100.64.0.0/10 (default: false)
# VOICEMODE_SERVE_ALLOW_TAILSCALE=false
# Security: Additional allowed CIDR ranges (comma-separated)
# VOICEMODE_SERVE_ALLOWED_IPS=
# Authentication: URL secret path segment (e.g., /secret-path/mcp)
# VOICEMODE_SERVE_SECRET=
# Authentication: Bearer token for Authorization header
# VOICEMODE_SERVE_TOKEN=
#############
# Advanced Configuration
#############
# Download progress style: auto, rich, simple (default: auto)
# VOICEMODE_PROGRESS_STYLE=auto
#############
# Credential Storage
#############
# Where to store OAuth credentials (keyring or plaintext)
# keyring uses the OS keychain (macOS Keychain, Linux Secret Service)
# plaintext stores in ~/.voicemode/credentials (chmod 600)
# VOICEMODE_CREDENTIAL_STORE=keyring
#############
# API Keys (set these in your environment for security)
#############
# OpenAI API key for cloud TTS/STT
# OPENAI_API_KEY=your-key-here
'''
        with open(default_path, 'w') as f:
            f.write(default_config)
        os.chmod(default_path, 0o600)  # Secure permissions
        config_files = [default_path]

    # Load configuration from all files in order (global first, project-specific last)
    for config_path in config_files:
        if not config_path.exists():
            continue
        with open(config_path, 'r') as f:
            lines = f.readlines()
        for key, value in _parse_env_lines(lines):
            # Real environment variables take precedence over every file. A key
            # that an earlier file (or an earlier call) put into os.environ is
            # still file-owned, so a higher-priority file may replace it.
            if key not in os.environ or key in _file_loaded_keys:
                os.environ[key] = value
                _file_loaded_keys.add(key)
# Populate os.environ from the voicemode.env files at import time. This must run
# before the module-level constants below, which read their values via os.getenv.
load_voicemode_env()
# Helper function to parse boolean environment variables
def env_bool(env_var: str, default: bool = False) -> bool:
    """Interpret an environment variable as a boolean flag.

    Args:
        env_var: Name of the environment variable to read.
        default: Result when the variable is unset or set to an empty string.

    Returns:
        True when the lower-cased value is one of "true", "1", "yes" or "on";
        False for any other non-empty value; ``default`` otherwise.
    """
    raw = os.getenv(env_var, "").lower()
    if not raw:
        return default
    return raw in ("true", "1", "yes", "on")
# Helper function to expand paths with tilde
def expand_path(path_str: str) -> Path:
    """Turn a path string into a Path, resolving $VARIABLES and a leading tilde.

    Environment variables are substituted first, so a variable whose value
    starts with ``~`` is still tilde-expanded afterwards.
    """
    return Path(os.path.expanduser(os.path.expandvars(path_str)))
# Root directory for all voicemode data (override with VOICEMODE_BASE_DIR)
BASE_DIR = expand_path(os.environ.get("VOICEMODE_BASE_DIR", str(Path.home() / ".voicemode")))

# Unified directory layout beneath BASE_DIR.
# There is no CONFIG_DIR: configuration is stored in the voicemode.env file instead.
AUDIO_DIR = BASE_DIR / "audio"
TRANSCRIPTIONS_DIR = BASE_DIR / "transcriptions"
LOGS_DIR = BASE_DIR / "logs"
MODELS_DIR = expand_path(os.environ.get("VOICEMODE_MODELS_DIR", str(BASE_DIR / "models")))

# Debug configuration.
# VOICEMODE_DEBUG doubles as a mode switch: a truthy value enables DEBUG, the
# literal value "trace" enables TRACE_DEBUG instead (the two never overlap).
DEBUG = env_bool("VOICEMODE_DEBUG")
TRACE_DEBUG = os.environ.get("VOICEMODE_DEBUG", "").lower() == "trace"
VAD_DEBUG = env_bool("VOICEMODE_VAD_DEBUG")
DEBUG_DIR = LOGS_DIR / "debug"  # Debug files live under the logs directory

# Master switch that turns on every "save" option at once
SAVE_ALL = env_bool("VOICEMODE_SAVE_ALL")

# Audio/transcription saving: on when SAVE_ALL or DEBUG is set, or when the
# individual option is enabled
SAVE_AUDIO = SAVE_ALL or DEBUG or env_bool("VOICEMODE_SAVE_AUDIO")
SAVE_TRANSCRIPTIONS = SAVE_ALL or DEBUG or env_bool("VOICEMODE_SAVE_TRANSCRIPTIONS")

# Audio feedback chimes (on unless disabled).
# Written inline rather than via env_bool(..., True) because an explicitly
# empty value counts as "off" here, whereas env_bool would fall back to True.
AUDIO_FEEDBACK_ENABLED = os.environ.get("VOICEMODE_AUDIO_FEEDBACK", "true").lower() in ("true", "1", "yes", "on")

# Skip text-to-speech entirely for faster, text-only responses
SKIP_TTS = env_bool("VOICEMODE_SKIP_TTS")

# TTS speed (0.25-4.0); None means "use the provider's default"
TTS_SPEED = None
if os.environ.get("VOICEMODE_TTS_SPEED"):
    TTS_SPEED = float(os.environ["VOICEMODE_TTS_SPEED"])

# Metrics output level for converse results:
# - minimal: just the response text, no timing
# - summary: response + compact timing string (default)
# - verbose: response + detailed metrics breakdown
# Unrecognised values fall back to "summary".
_metrics_level = os.environ.get("VOICEMODE_METRICS_LEVEL", "summary").lower()
METRICS_LEVEL = "summary"
if _metrics_level in ("minimal", "summary", "verbose"):
    METRICS_LEVEL = _metrics_level

# Prefer local providers over cloud ones (on unless disabled; empty value = off)
PREFER_LOCAL = os.environ.get("VOICEMODE_PREFER_LOCAL", "true").lower() in ("true", "1", "yes", "on")

# Keep retrying local providers instead of marking them permanently unavailable
ALWAYS_TRY_LOCAL = os.environ.get("VOICEMODE_ALWAYS_TRY_LOCAL", "true").lower() in ("true", "1", "yes", "on")

# Simple failover (no health checks) is the only failover mode, so it has no
# configuration option of its own.

# Auto-start configuration
AUTO_START_KOKORO = env_bool("VOICEMODE_AUTO_START_KOKORO")

# ==================== CONCH CONFIGURATION ====================
# The conch is a coordination mechanism for multi-agent voice conversations:
# only the agent holding the conch may speak.

# Enable/disable the conch system entirely (on unless disabled; empty value = off)
CONCH_ENABLED = os.environ.get("VOICEMODE_CONCH_ENABLED", "true").lower() in ("true", "1", "yes", "on")

# Maximum time (seconds) to wait for the conch when wait_for_conch=true
CONCH_TIMEOUT = float(os.environ.get("VOICEMODE_CONCH_TIMEOUT", "60"))

# Polling interval (seconds) while waiting for the conch to become free
CONCH_CHECK_INTERVAL = float(os.environ.get("VOICEMODE_CONCH_CHECK_INTERVAL", "0.5"))

# Maximum age (seconds) before a lock is considered stale and may be forcibly
# released, so a stuck lock cannot block all voice interactions indefinitely.
# Should exceed a typical conversation turn (listen + TTS + buffer); the
# default of 300s (5 min) covers a 2 min listen plus long TTS. 0 disables expiry.
CONCH_LOCK_EXPIRY = float(os.environ.get("VOICEMODE_CONCH_LOCK_EXPIRY", "300"))

# ==================== SERVICE CONFIGURATION ====================
# OpenAI configuration (None when the key is not set)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Helper function to parse comma-separated lists
def parse_comma_list(env_var: str, fallback: str) -> list:
    """Read a comma-separated environment variable as a list of strings.

    Args:
        env_var: Name of the environment variable to read.
        fallback: Comma-separated string used when the variable is unset.

    Returns:
        The items with surrounding whitespace removed; empty items are dropped.
    """
    raw = os.getenv(env_var, fallback)
    trimmed = (piece.strip() for piece in raw.split(","))
    return [piece for piece in trimmed if piece]
# Provider endpoint, voice and model preference lists (comma-separated in the
# environment, exposed here as lists in preference order)
TTS_BASE_URLS = parse_comma_list(
    "VOICEMODE_TTS_BASE_URLS",
    "http://127.0.0.1:8880/v1,https://api.openai.com/v1",
)
STT_BASE_URLS = parse_comma_list(
    "VOICEMODE_STT_BASE_URLS",
    "http://127.0.0.1:2022/v1,https://api.openai.com/v1",
)
TTS_VOICES = parse_comma_list("VOICEMODE_VOICES", "af_sky,alloy")
TTS_MODELS = parse_comma_list(
    "VOICEMODE_TTS_MODELS",
    "tts-1,tts-1-hd,gpt-4o-mini-tts",
)

# STT prompt for vocabulary biasing (helps with specialized terminology)
# See: https://platform.openai.com/docs/guides/speech-to-text#prompting
STT_PROMPT = os.environ.get("VOICEMODE_STT_PROMPT", "")

# Voice preferences cache: the cached list plus a flag recording whether it has
# been populated yet
_cached_voice_preferences: Optional[list] = None
_voice_preferences_loaded = False
def get_voice_preferences() -> list[str]:
    """
    Return the preferred TTS voices, in preference order.

    The list is a copy of TTS_VOICES (the VOICEMODE_VOICES setting), which is
    resolved from, highest priority first:
    1. Environment variables
    2. Project-specific .voicemode.env files
    3. The global ~/.voicemode/voicemode.env file
    4. Built-in defaults

    The copy is taken on the first call and cached; later calls return the
    cached list until clear_voice_preferences_cache() is called.

    Returns:
        List of voice names in preference order
    """
    global _cached_voice_preferences, _voice_preferences_loaded

    if not _voice_preferences_loaded:
        # First access: snapshot the configured voices and remember that we did
        _voice_preferences_loaded = True
        _cached_voice_preferences = list(TTS_VOICES)
        logger.info(f"Voice preferences loaded: {_cached_voice_preferences}")
        return _cached_voice_preferences

    return _cached_voice_preferences or []
def clear_voice_preferences_cache():
    """Drop the cached voice preferences so the next lookup rebuilds them."""
    global _cached_voice_preferences, _voice_preferences_loaded
    _cached_voice_preferences, _voice_preferences_loaded = None, False
    logger.debug("Voice preferences cache cleared")
def reload_configuration():
    """Re-read the voicemode.env files and refresh the provider lists.

    Clears the voice preferences cache, runs load_voicemode_env() again and
    then recomputes TTS_BASE_URLS, STT_BASE_URLS, TTS_VOICES and TTS_MODELS
    from the environment. Other module-level settings are not recomputed here.
    """
    global TTS_VOICES, TTS_MODELS, TTS_BASE_URLS, STT_BASE_URLS

    # Cached voice preferences are derived from TTS_VOICES, so drop them first
    clear_voice_preferences_cache()

    # Pull the config files into os.environ again
    load_voicemode_env()

    # Rebuild the list-valued settings from the refreshed environment
    TTS_BASE_URLS = parse_comma_list(
        "VOICEMODE_TTS_BASE_URLS",
        "http://127.0.0.1:8880/v1,https://api.openai.com/v1",
    )
    STT_BASE_URLS = parse_comma_list(
        "VOICEMODE_STT_BASE_URLS",
        "http://127.0.0.1:2022/v1,https://api.openai.com/v1",
    )
    TTS_VOICES = parse_comma_list("VOICEMODE_VOICES", "af_sky,alloy")
    TTS_MODELS = parse_comma_list(
        "VOICEMODE_TTS_MODELS",
        "tts-1,tts-1-hd,gpt-4o-mini-tts",
    )

    logger.info("Configuration reloaded successfully")
# Legacy variables have been removed - use the new list-based configuration:
# - VOICEMODE_TTS_BASE_URLS (comma-separated list)
# - VOICEMODE_STT_BASE_URLS (comma-separated list)
# - VOICEMODE_VOICES (comma-separated list)
# - VOICEMODE_TTS_MODELS (comma-separated list)
# ==================== WHISPER CONFIGURATION ====================
# Whisper model used for installation and at runtime when none is configured
DEFAULT_WHISPER_MODEL = "base"

# Whisper-specific settings
WHISPER_MODEL = os.environ.get("VOICEMODE_WHISPER_MODEL", DEFAULT_WHISPER_MODEL)
WHISPER_PORT = int(os.environ.get("VOICEMODE_WHISPER_PORT", "2022"))
WHISPER_LANGUAGE = os.environ.get("VOICEMODE_WHISPER_LANGUAGE", "auto")
# Note: this default is anchored at ~/.voicemode, not at BASE_DIR
WHISPER_MODEL_PATH = expand_path(
    os.environ.get(
        "VOICEMODE_WHISPER_MODEL_PATH",
        str(Path.home().joinpath(".voicemode", "services", "whisper", "models")),
    )
)

# ==================== KOKORO CONFIGURATION ====================
# Kokoro-specific settings
KOKORO_PORT = int(os.environ.get("VOICEMODE_KOKORO_PORT", "8880"))
KOKORO_MODELS_DIR = expand_path(
    os.environ.get("VOICEMODE_KOKORO_MODELS_DIR", str(BASE_DIR / "models" / "kokoro"))
)
KOKORO_CACHE_DIR = expand_path(
    os.environ.get("VOICEMODE_KOKORO_CACHE_DIR", str(BASE_DIR / "cache" / "kokoro"))
)
KOKORO_DEFAULT_VOICE = os.environ.get("VOICEMODE_KOKORO_DEFAULT_VOICE", "af_sky")
KOKORO_MAX_REQUESTS = int(os.environ.get("VOICEMODE_KOKORO_MAX_REQUESTS", "200"))

# ==================== SERVICE MANAGEMENT CONFIGURATION ====================
# Auto-enable services after installation (on unless disabled)
SERVICE_AUTO_ENABLE = env_bool("VOICEMODE_SERVICE_AUTO_ENABLE", default=True)
# ==================== CONNECT CONFIGURATION ====================
# Enabled by VOICEMODE_CONNECT_ENABLED, or by the older VOICEMODE_CONNECT_AUTO
# name, which is still honoured for backward compatibility during migration.
CONNECT_ENABLED = env_bool("VOICEMODE_CONNECT_ENABLED", False) or env_bool("VOICEMODE_CONNECT_AUTO", False)
CONNECT_WS_URL = os.environ.get("VOICEMODE_CONNECT_WS_URL", "wss://voicemode.dev/ws")
CONNECT_USERS = parse_comma_list("VOICEMODE_CONNECT_USERS", "")
AGENT_NAME = os.environ.get("VOICEMODE_AGENT_NAME", "")
HOST_ALIAS = os.environ.get("VOICEMODE_HOST_ALIAS", "")

# Derived: effective hostname for addressing. An explicit alias wins; otherwise
# the first label of the machine's hostname is used.
import socket as _socket
if HOST_ALIAS:
    CONNECT_HOST = HOST_ALIAS
else:
    CONNECT_HOST = _socket.gethostname().partition('.')[0]
# ==================== SOUND FONTS CONFIGURATION ====================
# Sound fonts provide audio feedback during tool calls and are on by default.
# Set VOICEMODE_SOUNDFONTS_ENABLED=false to disable.
SOUNDFONTS_ENABLED = env_bool("VOICEMODE_SOUNDFONTS_ENABLED", default=True)

# ==================== AUDIO CONFIGURATION ====================
# Audio parameters
SAMPLE_RATE = 24000  # Standard TTS sample rate for both OpenAI and Kokoro
CHANNELS = 1

# ==================== SILENCE DETECTION CONFIGURATION ====================
# Silence detection is enabled by default; disabling it is useful in noisy
# environments.
DISABLE_SILENCE_DETECTION = env_bool("VOICEMODE_DISABLE_SILENCE_DETECTION")

# VAD (Voice Activity Detection) configuration
VAD_AGGRESSIVENESS = int(os.environ.get("VOICEMODE_VAD_AGGRESSIVENESS", "3"))  # 0-3, higher = more aggressive
SILENCE_THRESHOLD_MS = int(os.environ.get("VOICEMODE_SILENCE_THRESHOLD_MS", "1000"))  # Stop after this much silence (ms)
MIN_RECORDING_DURATION = float(os.environ.get("VOICEMODE_MIN_RECORDING_DURATION", "0.5"))  # Seconds
VAD_CHUNK_DURATION_MS = 30  # VAD frame size (must be 10, 20, or 30ms)
INITIAL_SILENCE_GRACE_PERIOD = float(os.environ.get("VOICEMODE_INITIAL_SILENCE_GRACE_PERIOD", "1"))  # Seconds; defaults to 1

# Default listen duration for the converse tool (seconds)
DEFAULT_LISTEN_DURATION = float(os.environ.get("VOICEMODE_DEFAULT_LISTEN_DURATION", "120.0"))

# Phrases that trigger an audio replay
REPEAT_PHRASES = parse_comma_list(
    "VOICEMODE_REPEAT_PHRASES",
    "repeat,say that again,pardon,what,come again",
)

# Phrases that pause the conversation, and how long the pause lasts (seconds)
WAIT_PHRASES = parse_comma_list("VOICEMODE_WAIT_PHRASES", "wait")
WAIT_DURATION = float(os.environ.get("VOICEMODE_WAIT_DURATION", "60.0"))

# Audio feedback chime timing (seconds).
# Leading silence lets Bluetooth devices wake up before the chime plays;
# trailing silence keeps the end of the chime from being cut off.
CHIME_LEADING_SILENCE = float(os.environ.get("VOICEMODE_CHIME_LEADING_SILENCE", "0.1"))
CHIME_TRAILING_SILENCE = float(os.environ.get("VOICEMODE_CHIME_TRAILING_SILENCE", "0.2"))

# Audio format configuration
AUDIO_FORMAT = os.environ.get("VOICEMODE_AUDIO_FORMAT", "pcm").lower()
TTS_AUDIO_FORMAT = os.environ.get("VOICEMODE_TTS_AUDIO_FORMAT", "pcm").lower()  # PCM by default for optimal streaming

# STT upload format - compressed for bandwidth efficiency.
# Supported: mp3, wav, flac, m4a, ogg (must be supported by the STT provider).
# Defaults to mp3 when the global format is pcm, otherwise to the global format.
STT_AUDIO_FORMAT = os.environ.get(
    "VOICEMODE_STT_AUDIO_FORMAT",
    "mp3" if AUDIO_FORMAT == "pcm" else AUDIO_FORMAT,
).lower()

# STT save format - used for saved recordings when SAVE_AUDIO is enabled.
# Supported: wav, mp3, flac (wav recommended for full-quality archival).
STT_SAVE_FORMAT = os.environ.get("VOICEMODE_STT_SAVE_FORMAT", "wav").lower()

# STT compression mode - controls when audio is compressed before upload:
#   auto   - compress for remote endpoints, skip for local ones (default).
#            Saves ~200-800ms of transcode time locally, where bandwidth is not
#            a bottleneck; remote uploads benefit from the smaller files
#            (MP3 is ~90% smaller than WAV).
#   always - always compress, regardless of endpoint type
#   never  - never compress, always send WAV (highest quality, larger files)
# Unrecognised values fall back to "auto".
STT_COMPRESS = os.environ.get("VOICEMODE_STT_COMPRESS", "auto").lower()
STT_COMPRESS = STT_COMPRESS if STT_COMPRESS in ("auto", "always", "never") else "auto"

# Supported audio formats
SUPPORTED_AUDIO_FORMATS = ["pcm", "opus", "mp3", "wav", "flac", "aac"]
SUPPORTED_SAVE_FORMATS = ["wav", "mp3", "flac"]  # Formats suitable for saving recordings

# Validate formats. The logger does not exist yet at this point, so each
# rejected value is stashed in an _invalid_* variable (defined only when a
# value was rejected) and the format is replaced by its fallback.
if AUDIO_FORMAT not in SUPPORTED_AUDIO_FORMATS:
    _invalid_audio_format, AUDIO_FORMAT = AUDIO_FORMAT, "pcm"
if TTS_AUDIO_FORMAT not in SUPPORTED_AUDIO_FORMATS:
    _invalid_tts_format, TTS_AUDIO_FORMAT = TTS_AUDIO_FORMAT, AUDIO_FORMAT
if STT_AUDIO_FORMAT not in SUPPORTED_AUDIO_FORMATS:
    _invalid_stt_format, STT_AUDIO_FORMAT = STT_AUDIO_FORMAT, AUDIO_FORMAT
if STT_SAVE_FORMAT not in SUPPORTED_SAVE_FORMATS:
    _invalid_stt_save_format, STT_SAVE_FORMAT = STT_SAVE_FORMAT, "wav"

# Format-specific quality settings
OPUS_BITRATE = int(os.environ.get("VOICEMODE_OPUS_BITRATE", "32000"))  # 32kbps for voice
MP3_BITRATE = os.environ.get("VOICEMODE_MP3_BITRATE", "32k")  # 32kbps (optimal for speech per Whisper research)
AAC_BITRATE = os.environ.get("VOICEMODE_AAC_BITRATE", "64k")  # 64kbps
# ==================== STREAMING CONFIGURATION ====================
# Streaming playback settings.
# Streaming is on unless disabled; an explicitly empty value counts as "off".
STREAMING_ENABLED = os.environ.get("VOICEMODE_STREAMING_ENABLED", "true").lower() in ("true", "1", "yes", "on")
STREAM_CHUNK_SIZE = int(os.environ.get("VOICEMODE_STREAM_CHUNK_SIZE", "4096"))  # Download chunk size
STREAM_BUFFER_MS = int(os.environ.get("VOICEMODE_STREAM_BUFFER_MS", "150"))  # Initial buffer before playback
STREAM_MAX_BUFFER = float(os.environ.get("VOICEMODE_STREAM_MAX_BUFFER", "2.0"))  # Max buffer in seconds
# ==================== EVENT LOGGING CONFIGURATION ====================
# Event logging configuration
# Event logs are enabled by default, or if SAVE_ALL is true
EVENT_LOG_ENABLED = SAVE_ALL or os.getenv("VOICEMODE_EVENT_LOG_ENABLED", "true").lower() in ("true", "1", "yes", "on")
# Expand "~" and $VARIABLES like the other directory settings do; the generated
# config template documents "~/.voicemode/logs/events", which would otherwise
# be used as a literal "~" directory. The value remains a str.
EVENT_LOG_DIR = str(expand_path(os.getenv("VOICEMODE_EVENT_LOG_DIR", str(LOGS_DIR / "events"))))
EVENT_LOG_ROTATION = os.getenv("VOICEMODE_EVENT_LOG_ROTATION", "daily")  # Currently only daily is supported
# ==================== GLOBAL STATE ====================
# Service management
# Maps a service name to its subprocess.Popen handle; starts out empty.
# NOTE(review): presumably filled in by whatever code launches the services — confirm against callers.
service_processes: Dict[str, subprocess.Popen] = {}
# Concurrency control for audio operations
# This prevents multiple audio operations from interfering with stdio
# The lock object is created once, when this module is imported.
audio_operation_lock = asyncio.Lock()
# Flag to track if startup initialization has run
_startup_initialized = False
# ==================== LOGGING CONFIGURATION ====================
def setup_logging() -> logging.Logger:
    """Configure logging for the voice-mode server.

    The root logger is configured at DEBUG level when DEBUG is set, INFO
    otherwise. When TRACE_DEBUG is set, httpx/openai debug output and the
    "voicemode" logger are additionally written to a dated file under
    ~/.voicemode/logs/debug, and function-call tracing for voicemode source
    files is written to ~/voicemode_trace.log. When DEBUG is set, the
    "voicemode" logger also writes to ~/voicemode_debug.log and the HTTP client
    loggers are capped at INFO.

    Returns:
        Logger instance configured for voice-mode
    """
    standard_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    logging.basicConfig(
        level=logging.DEBUG if DEBUG else logging.INFO,
        format=standard_format,
    )
    logger = logging.getLogger("voicemode")

    if TRACE_DEBUG:
        import sys
        from datetime import datetime

        # Dated debug log file under ~/.voicemode/logs/debug
        debug_log_dir = Path.home() / ".voicemode" / "logs" / "debug"
        debug_log_dir.mkdir(parents=True, exist_ok=True)
        debug_log_file = debug_log_dir / f"voicemode_debug_{datetime.now().strftime('%Y-%m-%d')}.log"

        debug_handler = logging.FileHandler(debug_log_file, mode='a')
        debug_handler.setFormatter(logging.Formatter(standard_format))

        # Capture HTTP client debug output in the same file
        for library_name in ("httpx", "openai"):
            library_logger = logging.getLogger(library_name)
            library_logger.setLevel(logging.DEBUG)
            library_logger.addHandler(debug_handler)

        # ...and the main logger's output as well
        logger.addHandler(debug_handler)
        logger.info(f"Trace debug logging enabled, writing to {debug_log_file}")

        # Legacy trace file: one line per traced call / exception
        trace_file = Path.home() / "voicemode_trace.log"
        trace_handler = logging.FileHandler(trace_file, mode='a')
        trace_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
        trace_logger = logging.getLogger("voicemode.trace")
        trace_logger.addHandler(trace_handler)
        trace_logger.setLevel(logging.DEBUG)

        def trace_calls(frame, event, arg):
            # Only calls into voicemode's own source files are recorded
            if event == 'call':
                code = frame.f_code
                filename = code.co_filename
                if any(marker in filename for marker in ('voicemode', 'voice_mode')):
                    trace_logger.debug(f"Called {code.co_filename}:{frame.f_lineno} {code.co_name}")
            elif event == 'exception':
                trace_logger.debug(f"Exception: {arg}")
            # Returning the tracer keeps tracing active inside the new scope
            return trace_calls

        sys.settrace(trace_calls)
        logger.info(f"Trace debugging enabled, writing to: {trace_file}")

    if DEBUG:
        # Mirror the main logger into a file in the home directory
        debug_log_file = Path.home() / "voicemode_debug.log"
        file_handler = logging.FileHandler(debug_log_file, mode='a')
        file_handler.setFormatter(logging.Formatter(standard_format))
        logger.addHandler(file_handler)
        logger.info(f"Debug logging to file: {debug_log_file}")

        # Keep our own debug logs but stop the HTTP clients from dumping
        # verbose binary payloads
        for quiet_name in ("openai._base_client", "httpcore", "httpx"):
            logging.getLogger(quiet_name).setLevel(logging.INFO)

    return logger
# ==================== DIRECTORY INITIALIZATION ====================
def initialize_directories():
    """Create necessary directories for voicemode data storage.

    Creates BASE_DIR and its audio, transcriptions and logs sub-directories,
    the debug log directory when DEBUG is set, and the event log directory when
    event logging is enabled, then installs the sound fonts.

    Missing parent directories are created as well, so a custom
    VOICEMODE_BASE_DIR pointing at a nested path that does not exist yet works
    instead of raising FileNotFoundError.
    """
    # Base directory first, then its standard sub-directories
    # (there is no config directory: config is stored in the voicemode.env file)
    for directory in (BASE_DIR, AUDIO_DIR, TRANSCRIPTIONS_DIR, LOGS_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    # Debug files go under logs, only when debugging is on
    if DEBUG:
        DEBUG_DIR.mkdir(parents=True, exist_ok=True)

    # EVENT_LOG_DIR is a string and may point outside BASE_DIR
    if EVENT_LOG_ENABLED:
        Path(EVENT_LOG_DIR).mkdir(parents=True, exist_ok=True)

    # Initialize sound fonts if not present
    initialize_soundfonts()
# ==================== SOUND FONTS INITIALIZATION ====================
# Entries of the packaged soundfont tree that are Python packaging artifacts
# rather than sound files; they are never copied to the user's directory.
_SOUNDFONT_SKIP_NAMES = frozenset({"__init__.py", "__pycache__"})

# Maximum directory depth followed while syncing (guard against runaway loops).
_SOUNDFONT_MAX_DEPTH = 10


def _sync_soundfont_tree(src, dst, depth=0):
    """Copy ``src`` into ``dst``, writing only files that are missing or differ.

    Args:
        src: Source directory; any object offering ``iterdir()`` whose entries
            provide ``name``, ``is_file()``, ``is_dir()`` and ``read_bytes()``.
        dst: Destination ``Path``; created if it does not exist.
        depth: Current recursion depth. Levels deeper than
            ``_SOUNDFONT_MAX_DEPTH`` are silently not synced.
    """
    if depth > _SOUNDFONT_MAX_DEPTH:
        return
    dst.mkdir(exist_ok=True)
    for item in src.iterdir():
        # Skip Python package artifacts.
        if item.name in _SOUNDFONT_SKIP_NAMES:
            continue
        if item.is_file():
            target = dst / item.name
            # Never write through a symlink in the destination.
            if target.is_symlink():
                continue
            new_content = item.read_bytes()
            needs_write = True
            if target.exists():
                try:
                    needs_write = target.read_bytes() != new_content
                except OSError:
                    pass  # Can't read the existing file: overwrite it.
            if needs_write:
                target.write_bytes(new_content)
        elif item.is_dir():
            target_dir = dst / item.name
            # Skip destination symlinks (possible cycle). is_symlink() alone
            # also covers dangling links, which would otherwise make mkdir()
            # fail and abort the whole sync.
            if target_dir.is_symlink():
                continue
            _sync_soundfont_tree(item, target_dir, depth + 1)


def initialize_soundfonts():
    """Install package sound fonts and set up soundfonts directory structure.

    Directory structure:
        ~/.voicemode/soundfonts/
            voicemode/            - Package-managed soundfonts (synced from package)
            current -> voicemode  - Relative symlink to active soundfont
            .version              - Package version that last synced soundfonts

    Users can create custom soundfont directories and point 'current' to them.
    The 'voicemode' directory is synced only when the version recorded in
    '.version' differs from the installed package version (or cannot be read).

    Steps, in order:
        1. Migrate a legacy 'default' directory to 'voicemode', repointing
           'current' if it referred to 'default'.
        2. Convert an absolute 'current' link that ends in 'voicemode' into a
           relative one.
        3. Return early if '.version' already matches the package version.
        4. Otherwise sync the packaged files, record the version and create
           'current' if nothing exists at that path.

    Failures never propagate: soundfonts are optional, so errors are swallowed
    and only logged (at debug level) when ``DEBUG`` is set.
    """
    from voice_mode.__version__ import __version__

    soundfonts_dir = BASE_DIR / "soundfonts"
    package_soundfont_dir = soundfonts_dir / "voicemode"
    current_symlink = soundfonts_dir / "current"
    version_file = soundfonts_dir / ".version"

    # Migration: rename old 'default' directory to 'voicemode'.
    old_default_dir = soundfonts_dir / "default"
    if old_default_dir.exists() and not package_soundfont_dir.exists():
        try:
            old_default_dir.rename(package_soundfont_dir)
            # Update the symlink only if it pointed at the old directory.
            # Compare the final path component exactly: a substring test
            # would also hijack custom links such as 'my-default-pack'.
            if current_symlink.is_symlink():
                if current_symlink.readlink().name == "default":
                    current_symlink.unlink()
                    # Use relative symlink
                    current_symlink.symlink_to("voicemode")
        except OSError:
            pass  # Migration failed, will recreate below

    # Fix absolute symlinks: convert to relative.
    if current_symlink.is_symlink():
        try:
            link_target = current_symlink.readlink()
            # If it's an absolute path pointing to voicemode, make it relative.
            if link_target.is_absolute() and link_target.name == "voicemode":
                current_symlink.unlink()
                current_symlink.symlink_to("voicemode")
        except OSError:
            pass

    # Check if sync is needed (version mismatch or missing/unreadable file).
    needs_sync = True
    if version_file.exists():
        try:
            needs_sync = version_file.read_text().strip() != __version__
        except OSError:
            pass  # Can't read version, sync needed

    if not needs_sync:
        return  # Skip sync, soundfonts already up to date

    try:
        # Create soundfonts directory
        soundfonts_dir.mkdir(exist_ok=True)

        # Sync package soundfonts to 'voicemode' directory incrementally;
        # only files that are missing or different are written.
        try:
            # For Python 3.9+
            from importlib.resources import files

            package_soundfonts = files("voice_mode.data.soundfonts.default")
            if package_soundfonts.is_dir():
                _sync_soundfont_tree(package_soundfonts, package_soundfont_dir)
                # Update version file after successful sync
                version_file.write_text(__version__)
        except ImportError:
            # Fallback for older Python versions.
            # NOTE: this path only creates the (empty) target directory;
            # copying resources via pkg_resources is not implemented.
            import pkg_resources

            package_soundfont_dir.mkdir(exist_ok=True)
            resource_dir = "data/soundfonts/default"
            if pkg_resources.resource_exists("voice_mode", resource_dir):
                pass

        # Create relative symlink to current soundfont (points to voicemode).
        # Only create it when nothing is there - the user may have customized
        # it. exists() follows links, so is_symlink() is also checked to leave
        # a dangling user link alone instead of failing on it.
        if (
            package_soundfont_dir.exists()
            and not current_symlink.exists()
            and not current_symlink.is_symlink()
        ):
            try:
                # Use relative path, not absolute
                current_symlink.symlink_to("voicemode")
            except OSError:
                # Symlinks might not work on all systems (e.g., Windows without admin)
                pass

    except Exception as e:
        # Don't fail initialization if soundfonts can't be installed.
        # They're optional and disabled by default.
        if DEBUG:
            logging.getLogger("voicemode").debug(f"Could not initialize soundfonts: {e}")
# ==================== UTILITY FUNCTIONS ====================
def get_debug_filename(prefix: str, extension: str) -> str:
    """Build a debug-file name of the form ``<prefix>_<timestamp>.<extension>``.

    The timestamp is the current local time rendered as
    ``YYYYMMDD_HHMMSS_mmm`` (millisecond precision).

    Args:
        prefix: Leading part of the filename (e.g., 'stt-input', 'tts-output')
        extension: File extension without the dot (e.g., 'wav', 'mp3')

    Returns:
        The assembled filename string
    """
    microsecond_stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    # %f renders six microsecond digits; dropping the last three leaves milliseconds.
    millisecond_stamp = microsecond_stamp[:-3]
    return "{}_{}.{}".format(prefix, millisecond_stamp, extension)
def get_project_path() -> str:
    """Return the git repository root, or the working directory as a fallback.

    Runs ``git rev-parse --show-toplevel`` in the current working directory.
    When the command exits with status 0 its stripped stdout is returned; when
    it exits non-zero or cannot be run at all, ``os.getcwd()`` is returned.
    """
    git_command = ["git", "rev-parse", "--show-toplevel"]
    try:
        completed = subprocess.run(
            git_command,
            capture_output=True,
            text=True,
            cwd=os.getcwd(),
        )
        git_root = completed.stdout.strip() if completed.returncode == 0 else None
    except Exception:
        # git missing, not runnable, etc. - use the fallback below.
        git_root = None

    if git_root is not None:
        return git_root
    # Fall back to current working directory
    return os.getcwd()
def save_transcription(text: str, prefix: str = "transcript", metadata: Optional[Dict] = None) -> Optional[Path]:
    """Save a transcription to the transcriptions directory.

    The file is named ``<prefix>_<YYYYMMDD_HHMMSS_mmm>.txt`` and consists of a
    metadata header (one ``key: value`` line per entry, always including
    ``project_path``) followed by the transcript text.

    Args:
        text: The transcription text to save
        prefix: Prefix for the filename (e.g., 'stt', 'conversation')
        metadata: Optional metadata to include at the top of the file. The
            passed dict is not modified.

    Returns:
        Path to the saved file, or None if saving is disabled or the write
        failed (failures are logged, not raised).
    """
    if not SAVE_TRANSCRIPTIONS:
        return None

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
        filepath = TRANSCRIPTIONS_DIR / f"{prefix}_{timestamp}.txt"

        # Work on a copy so adding project_path does not leak into the
        # caller's dict.
        header = dict(metadata) if metadata is not None else {}
        header["project_path"] = get_project_path()

        lines = ["--- METADATA ---"]
        lines.extend(f"{key}: {value}" for key, value in header.items())
        lines.extend(["--- TRANSCRIPT ---", "", text])

        filepath.write_text("\n".join(lines), encoding="utf-8")
        logger.debug(f"Transcription saved to: {filepath}")
        return filepath
    except Exception as e:
        # Saving is best-effort: report the problem and carry on.
        logger.error(f"Failed to save transcription: {e}")
        return None
# ==================== SOUNDDEVICE WORKAROUND ====================
def disable_sounddevice_stderr_redirect():
    """Comprehensively disable sounddevice's stderr redirection.

    This prevents sounddevice from redirecting stderr to /dev/null
    which can interfere with audio playback in MCP server context.

    What is done, when the corresponding attributes exist:
        1. ``_ignore_stderr`` on ``sd._sounddevice`` and on ``sd`` is replaced
           with a no-op.
        2. ``sd._check`` is wrapped; the wrapper currently just delegates to
           the original.
        3. A hook that restores the ``sys.stderr`` object seen at call time is
           registered with ``atexit`` (so it runs at interpreter exit).

    Never raises: if sounddevice cannot be imported or patched, the error is
    printed to stderr when ``DEBUG`` is set and otherwise ignored.
    """
    # Imported before the ``try`` on purpose: an import statement inside a
    # function binds a local name, so importing ``sys`` after the sounddevice
    # import would leave it unbound in the ``except`` handler whenever that
    # import fails, turning the debug print into an UnboundLocalError.
    import atexit
    import sys

    try:
        import sounddevice as sd

        # Method 1: Override _ignore_stderr in various locations
        if hasattr(sd, '_sounddevice'):
            if hasattr(sd._sounddevice, '_ignore_stderr'):
                sd._sounddevice._ignore_stderr = lambda: None
        if hasattr(sd, '_ignore_stderr'):
            sd._ignore_stderr = lambda: None

        # Method 2: Wrap _check if it exists
        if hasattr(sd, '_check'):
            original_check = sd._check

            def safe_check(*args, **kwargs):
                # Pass-through wrapper; adds no stderr manipulation of its own.
                return original_check(*args, **kwargs)

            sd._check = safe_check

        # Method 3: Protect file descriptors
        original_stderr = sys.stderr

        # Hook that puts the original stderr object back if it was replaced.
        def protect_stderr():
            if sys.stderr != original_stderr:
                sys.stderr = original_stderr

        # Install protection
        atexit.register(protect_stderr)
    except Exception as e:
        # Log but continue - audio might still work
        if DEBUG:
            # Can't use logger here as it's not initialized yet
            print(f"DEBUG: Could not fully disable sounddevice stderr redirect: {e}", file=sys.stderr)
# ==================== HTTP CLIENT CONFIGURATION ====================
# Settings for the HTTP client used by OpenAI clients:
# timeout values and connection-pool limits.
HTTP_CLIENT_CONFIG = dict(
    timeout=dict(
        total=30.0,
        connect=5.0,
    ),
    limits=dict(
        max_keepalive_connections=5,
        max_connections=10,
    ),
)
# ==================== INITIALIZATION ====================
# Initialize directories on module import
# (initialize_directories() also runs the soundfont initialization).
initialize_directories()
# Apply sounddevice workaround on module import
disable_sounddevice_stderr_redirect()
# Set up logger
# NOTE: `logger` is only bound from this statement on. The two calls above run
# before it exists, which is why the sounddevice workaround reports problems
# with print() instead of the logger.
logger = setup_logging()
# Log any format validation warnings
# These checks run at module scope, where locals() is the module namespace, so
# each test asks whether the named module-level variable has been defined.
# NOTE(review): the `_invalid_*` names are not assigned in this part of the
# file - presumably earlier format validation sets them when it rejects a
# configured value, before the logger was available; confirm against that code.
if 'AUDIO_FORMAT' in locals() and '_invalid_audio_format' in locals():
    logger.warning(f"Unsupported audio format '{_invalid_audio_format}', falling back to 'pcm'")
if 'TTS_AUDIO_FORMAT' in locals() and '_invalid_tts_format' in locals():
    logger.warning(f"Unsupported TTS audio format '{_invalid_tts_format}', falling back to '{AUDIO_FORMAT}'")
if 'STT_AUDIO_FORMAT' in locals() and '_invalid_stt_format' in locals():
    logger.warning(f"Unsupported STT audio format '{_invalid_stt_format}', falling back to '{AUDIO_FORMAT}'")
# ==================== AUDIO FORMAT UTILITIES ====================
def get_provider_supported_formats(provider: str, operation: str = "tts") -> list:
    """Look up the audio formats a provider accepts for an operation.

    Args:
        provider: Provider name (e.g., 'openai', 'kokoro', 'whisper-local')
        operation: 'tts' or 'stt'

    Returns:
        List of supported format strings; empty when the provider is unknown,
        the operation is unknown, or the provider does not offer the operation.
    """
    # Capability table keyed by (provider, operation).
    # Based on API documentation and testing.
    capabilities = {
        # TTS providers
        ("openai", "tts"): ["opus", "mp3", "aac", "flac", "wav", "pcm"],
        ("openai", "stt"): ["mp3", "opus", "wav", "flac", "m4a", "webm"],
        # AAC is not currently supported by Kokoro
        ("kokoro", "tts"): ["mp3", "opus", "flac", "wav", "pcm"],
        ("kokoro", "stt"): [],  # Kokoro is TTS only
        # STT providers
        ("whisper-local", "tts"): [],  # Whisper is STT only
        ("whisper-local", "stt"): ["wav", "mp3", "opus", "flac", "m4a"],
        ("openai-whisper", "tts"): [],  # Whisper is STT only
        ("openai-whisper", "stt"): ["mp3", "opus", "wav", "flac", "m4a", "webm"],
    }
    return capabilities.get((provider, operation), [])
def validate_audio_format(format: str, provider: str, operation: str = "tts") -> str:
    """Pick an audio format the provider can handle, preferring the requested one.

    Args:
        format: Requested audio format
        provider: Provider name
        operation: 'tts' or 'stt'

    Returns:
        The requested format when the provider supports it, or when the
        provider reports no formats for the operation (a warning is logged in
        that case). Otherwise the first of 'opus', 'mp3', 'wav' that the
        provider supports, or failing that the provider's first listed format.
    """
    supported = get_provider_supported_formats(provider, operation)

    # Nothing known for this provider/operation: warn and pass the request through.
    if not supported:
        logger.warning(f"Provider '{provider}' does not support {operation} operation")
        return format

    if format in supported:
        return format

    # Fallback logic - prefer common formats
    preferred = next(
        (candidate for candidate in ("opus", "mp3", "wav") if candidate in supported),
        None,
    )
    if preferred is not None:
        logger.info(f"Format '{format}' not supported by {provider}, using '{preferred}' instead")
        return preferred

    # Last resort - use first supported format
    first_supported = supported[0]
    logger.warning(f"Using {provider}'s first supported format: {first_supported}")
    return first_supported
def get_audio_loader_for_format(format: str):
    """Return the pydub ``AudioSegment`` loader that matches an audio format.

    Args:
        format: Audio format string

    Returns:
        AudioSegment method reference, or None when the format is not in the
        lookup table
    """
    from pydub import AudioSegment

    generic_loader = AudioSegment.from_file
    ogg_loader = AudioSegment.from_ogg

    loaders = dict(
        mp3=AudioSegment.from_mp3,
        wav=AudioSegment.from_wav,
        opus=ogg_loader,  # Opus uses OGG container
        # Use a dedicated FLAC loader when AudioSegment has one, else the generic loader
        flac=getattr(AudioSegment, 'from_flac', generic_loader),
        aac=generic_loader,  # Generic loader for AAC
        m4a=generic_loader,  # Generic loader for M4A
        webm=generic_loader,  # Generic loader for WebM
        ogg=ogg_loader,
        pcm=AudioSegment.from_raw,  # Requires additional parameters
    )
    return loaders.get(format)
def get_format_export_params(format: str) -> dict:
    """Assemble the pydub export keyword arguments for an audio format.

    Args:
        format: Audio format string

    Returns:
        Dict with export parameters for pydub. Always contains ``format``;
        'mp3' and 'aac' add ``bitrate``, 'opus' adds ``parameters`` carrying
        the ``-b:a`` bitrate option. Other formats (e.g. 'flac', 'wav') get no
        extra settings.
    """
    export_args = {"format": format}

    if format == "mp3":
        export_args["bitrate"] = MP3_BITRATE
    elif format == "aac":
        export_args["bitrate"] = AAC_BITRATE
    elif format == "opus":
        # Opus in OGG container; pydub uses 'opus' for OGG/Opus
        export_args.update(format="opus", parameters=["-b:a", str(OPUS_BITRATE)])
    # FLAC is lossless and WAV is uncompressed: neither takes a bitrate setting.

    return export_args
# ==================== SERVE COMMAND CONFIGURATION ====================
# Bind address for the server (default: 127.0.0.1)
SERVE_HOST = os.environ.get("VOICEMODE_SERVE_HOST", "127.0.0.1")
# Bind port for the server (default: 8765)
SERVE_PORT = int(os.environ.get("VOICEMODE_SERVE_PORT", "8765"))
# Accept connections from local/private IP ranges (default: true)
SERVE_ALLOW_LOCAL = env_bool("VOICEMODE_SERVE_ALLOW_LOCAL", True)
# Accept connections from Anthropic IP ranges for Claude Cowork (default: false)
SERVE_ALLOW_ANTHROPIC = env_bool("VOICEMODE_SERVE_ALLOW_ANTHROPIC", False)
# Accept connections from Tailscale IP range 100.64.0.0/10 (default: false)
SERVE_ALLOW_TAILSCALE = env_bool("VOICEMODE_SERVE_ALLOW_TAILSCALE", False)
# Extra allowed CIDR ranges, comma-separated (default: empty)
SERVE_ALLOWED_IPS = os.environ.get("VOICEMODE_SERVE_ALLOWED_IPS", "")
# Secret URL path segment used for authentication (default: empty/disabled)
SERVE_SECRET = os.environ.get("VOICEMODE_SERVE_SECRET", "")
# Bearer token used for authentication (default: empty/disabled)
SERVE_TOKEN = os.environ.get("VOICEMODE_SERVE_TOKEN", "")
# Transport protocol: streamable-http or sse (default: streamable-http)
SERVE_TRANSPORT = os.environ.get("VOICEMODE_SERVE_TRANSPORT", "streamable-http")
# ==================== THINK OUT LOUD CONFIGURATION ====================
# Turn Think Out Loud mode on or off (default: off)
THINK_OUT_LOUD_ENABLED = env_bool("VOICEMODE_THINK_OUT_LOUD", False)
# Thinking role -> voice persona mapping, given as comma-separated role:voice pairs
# Default: analytical:am_adam,creative:af_sarah,critical:af_bella,synthesis:af_nova
THINKING_VOICES_STR = os.environ.get(
    "VOICEMODE_THINKING_VOICES",
    "analytical:am_adam,creative:af_sarah,critical:af_bella,synthesis:af_nova",
)
# Turn the role:voice pairs into a dictionary; entries without a colon are ignored
THINKING_VOICES = {}
for pair in THINKING_VOICES_STR.split(","):
    if ":" not in pair:
        continue
    # Split on the first colon only, so a voice name may itself contain ':'
    role, voice = pair.strip().split(":", 1)
    THINKING_VOICES[role.strip()] = voice.strip()
# How thinking is presented: sequential, debate, or chorus
THINKING_STYLE = os.environ.get("VOICEMODE_THINKING_STYLE", "sequential")
# Announce which voice is speaking (default: true)
THINKING_ANNOUNCE_VOICE = env_bool("VOICEMODE_THINKING_ANNOUNCE_VOICE", True)