#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LocalVoiceMode - Voice Chat with Character Skills
-------------------------------------------------
Local voice interface with support for character-based roleplay skills.
Uses Pocket TTS (CPU) for speech output and Parakeet TDT for recognition (CPU or CUDA, see --device).
Usage:
python voice_client.py # Default assistant
python voice_client.py --skill hermione # Load Hermione character
python voice_client.py --list-skills # List available skills
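python voice_client.py --mode ptt # Push-to-talk instead of VAD
python voice_client.py --mode type # Keyboard input, spoken replies
python voice_client.py --headless # No UI (for MCP integration)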
"""
import argparse
import os
import sys
import time
import warnings
import io
import contextlib
# Completely disable torch dynamo/compile BEFORE any torch imports
# Triton is not available on Windows, causing verbose error spam
os.environ["TORCHDYNAMO_DISABLE"] = "1"
# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*torch_dtype.*")
warnings.filterwarnings("ignore", message=".*cache_implementation.*")
warnings.filterwarnings("ignore", message=".*sliding_window.*deprecated.*")
warnings.filterwarnings("ignore", message=".*TensorFloat32.*")
warnings.filterwarnings("ignore", message=".*WON'T CONVERT.*")
warnings.filterwarnings("ignore", message=".*triton.*")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# =============================================================================
# Import from modular package
# =============================================================================
from src.localvoicemode.audio import AudioRecorder
from src.localvoicemode.speech import (
ASREngine,
TTSEngine,
TTSFilter,
SileroVAD,
SmartTurnVAD,
get_smart_turn_vad,
get_tts_filter,
SpeechMode,
set_speech_mode,
OutputPipeline,
create_pipeline_for_skill,
InjectionDetector,
)
from src.localvoicemode.llm import (
LLMClient,
ProviderManager,
ProviderConfig,
ProviderType,
)
from src.localvoicemode.skills import SkillLoader, Skill
from src.localvoicemode.state import (
Config,
ConversationManager,
VoiceMode,
ModeStateMachine,
TurnStateMachine,
)
# Fix Windows console encoding for emojis (skip if running as MCP server)
if sys.platform == "win32" and hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
sys.stderr.reconfigure(encoding="utf-8", errors="replace")
except Exception:
pass # Skip if stdout/stderr doesn't support reconfigure (e.g., MCP mode)
from pathlib import Path
from dataclasses import dataclass, field
# Set up NVIDIA DLL paths BEFORE importing onnxruntime (Windows only).
# This must happen before onnxruntime is imported anywhere. We use
# os.add_dll_directory() and also prepend to PATH, because some DLLs
# resolve their own dependencies through PATH (see below).
if sys.platform == "win32":
venv_nvidia = Path(__file__).parent / ".venv" / "Lib" / "site-packages" / "nvidia"
if venv_nvidia.exists():
nvidia_paths = []
for pkg_dir in venv_nvidia.iterdir():
if pkg_dir.is_dir():
bin_dir = pkg_dir / "bin"
if bin_dir.exists():
abs_path = str(bin_dir.absolute())
nvidia_paths.append(abs_path)
if hasattr(os, "add_dll_directory"):
try:
os.add_dll_directory(abs_path)
except Exception:
pass
# Also add to PATH for DLLs that load other DLLs
if nvidia_paths:
os.environ["PATH"] = (
os.pathsep.join(nvidia_paths) + os.pathsep + os.environ.get("PATH", "")
)
from typing import Optional, Dict, Any, List
from enum import Enum
# Ensure we're in the right directory
SCRIPT_DIR = Path(__file__).parent.absolute()
os.chdir(SCRIPT_DIR)
# ============================================================================
# Lazy imports with auto-install
# ============================================================================
def ensure_package(package: str, import_name: Optional[str] = None):
"""Ensure a package is installed, install if missing."""
import_name = import_name or package
try:
with (
contextlib.redirect_stdout(io.StringIO()),
contextlib.redirect_stderr(io.StringIO()),
):
__import__(import_name)
except ImportError:
print(f"Installing {package}...")
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])
ensure_package("numpy")
ensure_package("sounddevice")
ensure_package("scipy")
ensure_package("httpx")
ensure_package("pynput")
ensure_package("pyyaml", "yaml")
import numpy as np
import sounddevice as sd
from rich.console import Console, Group
from rich.panel import Panel
from rich.text import Text
from rich.table import Table
from rich.live import Live
from rich.align import Align
from rich.columns import Columns
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
from rich.padding import Padding
from rich.style import Style
from rich import box
import threading
import scipy.io.wavfile as wavfile
import httpx
import yaml
from pynput import keyboard
# ============================================================================
# Configuration (from src/localvoicemode/state/config.py)
# ============================================================================
# Create config instance from modular Config class
config = Config.from_base_dir(SCRIPT_DIR)
# ============================================================================
# Theme & UI Colors
# ============================================================================
class Theme:
"""Consistent color palette for the UI - Green Theme."""
# Primary colors (green palette matching notification style)
PRIMARY = "#00875f" # Forest green - main accent
SECONDARY = "#5faf5f" # Medium green
ACCENT = "#87d787" # Light green highlight
# Status colors
SUCCESS = "#00ff5f" # Bright green
WARNING = "#d7af00" # Gold/amber
ERROR = "#ff5f5f" # Soft red
INFO = "#5fafaf" # Teal
# Text colors
MUTED = "dim"
BRIGHT = "bold white"
# UI elements
BORDER = "#00875f" # Forest green borders
PANEL_BG = ""
# Status indicators (all green-themed) - using ASCII-safe characters
STATUS_READY = ("#00ff5f", "*") # Bright green
STATUS_LISTENING = ("#5faf5f", "o") # Medium green
STATUS_RECORDING = ("#ff5f5f", "O") # Red (keep for recording)
STATUS_TRANSCRIBING = ("#d7af00", "~") # Gold
STATUS_THINKING = ("#87d787", ".") # Light green
STATUS_SPEAKING = ("#00875f", ">") # Forest green
def create_banner() -> Text:
"""Create ASCII art banner for LOCALVOICE in block style."""
banner_lines = [
"╔══════════════════════════════════════════════════════════════════════════════╗",
"║ ██╗ ██████╗ ██████╗ █████╗ ██╗ ██╗ ██╗ ██████╗ ██╗ ██████╗███████╗ ║",
"║ ██║ ██╔═══██╗██╔════╝██╔══██╗██║ ██║ ██║██╔═══██╗██║██╔════╝██╔════╝ ║",
"║ ██║ ██║ ██║██║ ███████║██║ ██║ ██║██║ ██║██║██║ █████╗ ║",
"║ ██║ ██║ ██║██║ ██╔══██║██║ ╚██╗ ██╔╝██║ ██║██║██║ ██╔══╝ ║",
"║ ██████╗╚██████╔╝╚██████╗██║ ██║██████╗╚████╔╝ ╚██████╔╝██║╚██████╗███████╗ ║",
"║ ╚═════╝ ╚═════╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚═══╝ ╚═════╝ ╚═╝ ╚═════╝╚══════╝ ║",
"╚══════════════════════════════════════════════════════════════════════════════╝",
]
text = Text(justify="center")
for line in banner_lines:
text.append(line + "\n", style=f"bold {Theme.PRIMARY}")
return text
def create_subtitle(features: Optional[list] = None) -> Text:
"""Create the feature subtitle line, centered."""
if features is None:
features = ["MULTI-CHARACTER", "SKILLS", "VAD", "PTT", "MCP"]
text = Text(justify="center")
for i, feature in enumerate(features):
if i > 0:
text.append(" • ", style=Theme.SECONDARY)
text.append(feature, style=f"bold {Theme.ACCENT}")
return text
def create_waveform(levels: list, width: int = 40, height: int = 5) -> Text:
"""Create a multi-line ASCII waveform visualization.
Creates a vertical bar chart using ASCII characters:
- Uses | and # characters for reliable rendering
- Multiple rows for better visual impact
"""
text = Text()
# Resample levels to fit width
if len(levels) > width:
step = len(levels) / width
resampled = [levels[int(i * step)] for i in range(width)]
elif len(levels) < width:
resampled = [0.0] * (width - len(levels)) + levels
else:
resampled = levels
# Build multi-line visualization (top to bottom)
for row in range(height, 0, -1):
threshold = row / height
for level in resampled:
if level >= threshold:
# Filled - color based on height
if row > height * 0.7:
style = f"bold {Theme.SUCCESS}" # Top = bright green
elif row > height * 0.4:
style = f"bold {Theme.ACCENT}" # Middle = light green
else:
style = f"bold {Theme.PRIMARY}" # Bottom = forest green
text.append("|", style=style)
else:
text.append(" ", style="dim")
text.append("\n")
# Bottom line (baseline)
for level in resampled:
if level > 0.05:
text.append("-", style=f"dim {Theme.PRIMARY}")
else:
text.append(".", style="dim #333333")
return text
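# Illustrative call (hypothetical levels): create_waveform([0.1, 0.9, 0.5],
# width=10, height=3) left-pads with zeros to width 10, then renders three
# rows of "|" bars over a "-"/"." baseline, colored green by height band.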
def create_waveform_compact(levels: list, width: int = 50) -> Text:
"""Create a compact single-line waveform using ASCII characters."""
chars = " ._-=+#@" # 8 levels of intensity
text = Text()
# Resample levels to fit width
if len(levels) > width:
step = len(levels) / width
resampled = [levels[int(i * step)] for i in range(width)]
elif len(levels) < width:
resampled = [0.0] * (width - len(levels)) + levels
else:
resampled = levels
for level in resampled:
idx = int(level * 7)
idx = max(0, min(7, idx))
char = chars[idx]
if level > 0.7:
style = f"bold {Theme.SUCCESS}"
elif level > 0.4:
style = f"bold {Theme.ACCENT}"
elif level > 0.1:
style = f"{Theme.PRIMARY}"
else:
style = f"dim {Theme.PRIMARY}"
text.append(char, style=style)
return text
def create_status_bar(
latency_ms: Optional[int] = None,
tts_speed: Optional[float] = None,
tokens_in: Optional[int] = None,
tokens_out: Optional[int] = None,
provider: Optional[str] = None,
status: str = "READY",
) -> Text:
"""Create the bottom HUD-style status bar with metrics.
The provider argument is accepted for call-site symmetry but is not
currently rendered in the bar.
"""
text = Text()
# Format values
latency_str = f"{latency_ms}ms" if latency_ms is not None else "--ms"
tts_str = f"{tts_speed:.1f}x" if tts_speed is not None else "--x"
tokens_in_str = str(tokens_in) if tokens_in is not None else "--"
tokens_out_str = str(tokens_out) if tokens_out is not None else "--"
# Build HUD-style status bar (matching screenshot design)
text.append(" LATENCY:", style=f"dim {Theme.SECONDARY}")
text.append(f" {latency_str:<7}", style=f"bold {Theme.ACCENT}")
text.append(" TTS:", style=f"dim {Theme.SECONDARY}")
text.append(f" {tts_str:<6}", style=f"bold {Theme.ACCENT}")
text.append(" TOKENS:", style=f"dim {Theme.SECONDARY}")
text.append(f" {tokens_in_str}", style=f"bold {Theme.ACCENT}")
text.append(" IN ", style="dim #666666")
text.append(f"{tokens_out_str}", style=f"bold {Theme.ACCENT}")
text.append(" OUT", style="dim #666666")
text.append(" │ ", style="dim #666666")
text.append(
f"{status}",
style=f"bold {Theme.SUCCESS}" if status == "READY" else f"bold {Theme.WARNING}",
)
return text
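# Illustrative output (hypothetical metrics): create_status_bar(latency_ms=230,
# tts_speed=1.2, tokens_in=42, tokens_out=128) renders roughly
# " LATENCY: 230ms TTS: 1.2x TOKENS: 42 IN 128 OUT │ READY".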
# ============================================================================
# Provider Management (from src/localvoicemode/llm/)
# Classes imported: ProviderType, ProviderConfig, ProviderManager
# ============================================================================
# ============================================================================
# Skill System (from src/localvoicemode/skills/)
# Classes imported: Skill, SkillLoader
# ============================================================================
# ============================================================================
# Voice Mode Controller with Rich UI
# ============================================================================
console = Console()
class VoiceModeController:
# Voice commands for mode switching (MODE-03)
# Maps command phrases to target mode names
VOICE_COMMANDS = {
# STT control - stop listening to microphone
"stop listening": "tts_only",
"stop recording": "tts_only",
"mute microphone": "tts_only",
"mute mic": "tts_only",
# TTS control - stop speaking
"stop talking": "stt_only",
"stop speaking": "stt_only",
"be quiet": "stt_only",
"shut up": "stt_only",
"mute": "stt_only",
# Full mode - both listening and speaking
"full voice": "full_voice",
"unmute": "full_voice",
"resume listening": "full_voice",
"start listening": "full_voice",
# Silent mode - neither listening nor speaking
"go silent": "silent",
"silent mode": "silent",
"all off": "silent",
}
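# Matching (see _handle_voice_command) is a longest-command-first substring
# search over lowercased, punctuation-stripped text, so e.g. "please unmute
# now" triggers "unmute" -> full_voice rather than the shorter "mute" -> silent.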
def __init__(self, provider_config: Optional[ProviderConfig] = None):
self.tts = TTSEngine()
self.asr = ASREngine()
self.recorder = AudioRecorder()
# Store provider info
self.provider_config = provider_config
# LLM client - auto-detect or use specified provider
if provider_config:
self.llm = LLMClient(provider_config=provider_config)
else:
# Legacy mode: use config values directly
self.llm = LLMClient(
base_url=config.api_url, api_key=config.api_key, model=config.model
)
# Skill system
self.skill_loader = SkillLoader(config.skills_dir, config.voice_refs_dir)
self.active_skill: Optional[Skill] = None
self.running = False
self.ptt_pressed = False
# UI state
self.status = "Ready"
self.audio_level = 0.0 # For audio level meter
self.level_history = [0.0] * 50 # Waveform history for visualization
self._live = None # Reference to Live display for real-time updates
# Conversation state (delegated to ConversationManager)
self.conversation = ConversationManager()
# Voice mode and turn-taking state machines (Phase 2)
self.mode_machine = ModeStateMachine()
self.turn_machine = TurnStateMachine()
# Wire turn state callbacks to control audio flow
self.turn_machine.set_on_enter_speaking(self._on_enter_speaking)
self.turn_machine.set_on_exit_speaking(self._on_exit_speaking)
# Phase 3: Output pipeline and injection detection
self.output_pipeline: Optional[OutputPipeline] = None
self.injection_detector = InjectionDetector()
# Additional metrics not in ConversationManager
self.tts_speed: Optional[float] = None
# Set up audio level callback for waveform visualization
self.recorder.set_level_callback(self._on_audio_level)
# =========================================================================
# ConversationManager property accessors (backward compatibility)
# =========================================================================
@property
def last_user_text(self) -> str:
"""Get most recent user message."""
return self.conversation.last_user_text
@property
def last_response(self) -> str:
"""Get most recent assistant message."""
return self.conversation.last_response
@property
def conversation_history(self) -> List[tuple]:
"""Get conversation history as list of (role, text) tuples."""
return self.conversation.history
@property
def tokens_in(self) -> int:
"""Get total input tokens."""
return self.conversation.tokens_in
@property
def tokens_out(self) -> int:
"""Get total output tokens."""
return self.conversation.tokens_out
@property
def latency_ms(self) -> Optional[int]:
"""Get last response latency in milliseconds."""
return self.conversation.latency_ms
# =========================================================================
# Turn-taking callbacks (VAD/TTS race prevention)
# =========================================================================
def _on_enter_speaking(self):
"""Called when TTS starts - suspend VAD to prevent self-feedback."""
# VAD suspension is handled by checking turn state in recording loop
pass
def _on_exit_speaking(self):
"""Called when TTS ends - resume VAD if in appropriate mode."""
pass
# =========================================================================
# Mode control methods (Phase 2)
# =========================================================================
def set_voice_mode(self, mode: str) -> dict:
"""
Set voice mode. MCP-callable with status response.
Args:
mode: One of 'full_voice', 'tts_only', 'stt_only', 'silent'
Aliases: 'full', 'tts', 'stt', 'mute'
Returns:
Dict with mode, stt_enabled, tts_enabled, turn_state, transition_ms
"""
start = time.perf_counter()
# Normalize mode aliases
mode_map = {
"full": "full_voice",
"full_voice": "full_voice",
"tts": "tts_only",
"tts_only": "tts_only",
"stt": "stt_only",
"stt_only": "stt_only",
"silent": "silent",
"mute": "silent",
}
normalized = mode_map.get(mode.lower())
if not normalized:
return {"error": f"Unknown mode: {mode}. Use: full_voice, tts_only, stt_only, silent"}
# Cancel any in-progress turn before mode switch
if not self.turn_machine.is_idle():
self.turn_machine.safe_send("cancel")
# Event names match state names with set_ prefix
event_name = f"set_{normalized}"
self.mode_machine.send(event_name)
elapsed_ms = (time.perf_counter() - start) * 1000
return {
"mode": self.mode_machine.mode_name,
"stt_enabled": self.mode_machine.stt_enabled,
"tts_enabled": self.mode_machine.tts_enabled,
"turn_state": self.turn_machine.state_name,
"transition_ms": round(elapsed_ms, 2),
}
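# Illustrative call (values are examples, not measured):
#   controller.set_voice_mode("tts")
#   -> {"mode": "tts_only", "stt_enabled": False, "tts_enabled": True,
#       "turn_state": "idle", "transition_ms": 0.12}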
def _should_process_audio(self) -> bool:
"""Check if STT should process audio (mode + turn state)."""
return (
self.mode_machine.stt_enabled and
self.turn_machine.current_state.id in ["idle", "listening"]
)
def _should_speak(self) -> bool:
"""Check if TTS should speak (mode check)."""
return self.mode_machine.tts_enabled
def _filter_for_tts(self, text: str) -> str:
"""Filter text through output pipeline before TTS.
Uses the configured output pipeline if filters are defined,
otherwise falls back to the default TTS filter.
Args:
text: Raw LLM response text
Returns:
Filtered text suitable for TTS
"""
if self.output_pipeline and len(self.output_pipeline) > 0:
return self.output_pipeline.process(text)
else:
return get_tts_filter().filter_for_speech(text, max_length=500)
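# Dispatch order: a non-empty skill-defined pipeline wins; otherwise the
# default filter caps spoken output via
# get_tts_filter().filter_for_speech(text, max_length=500).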
def _check_injection(self, text: str) -> bool:
"""Check if text contains prompt injection attempt.
Only checks if the active skill has input_validation.block_injection
enabled. When an injection is detected, logs the pattern.
Args:
text: User input text to check
Returns:
True if injection detected (should block), False if safe
"""
if not self.active_skill:
return False
if not self.active_skill.input_validation.get("block_injection", False):
return False
pattern = self.injection_detector.detect(text)
if pattern:
print(f"[Security] Blocked injection attempt: {pattern}")
return True
return False
def get_status(self) -> dict:
"""Get current status for MCP/headless mode."""
return {
"running": self.running,
"mode": self.mode_machine.mode_name,
"turn_state": self.turn_machine.state_name,
"stt_enabled": self.mode_machine.stt_enabled,
"tts_enabled": self.mode_machine.tts_enabled,
"conversation_turns": self.conversation.turn_count,
"skill": self.active_skill.id if self.active_skill else None,
}
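# Illustrative snapshot (hypothetical values):
#   {"running": True, "mode": "full_voice", "turn_state": "idle",
#    "stt_enabled": True, "tts_enabled": True,
#    "conversation_turns": 3, "skill": "hermione"}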
def _handle_voice_command(self, text: str) -> bool:
"""
Check if text is a voice command and handle it.
Voice commands are intercepted BEFORE sending to LLM to avoid
the feedback loop where LLM responds to commands as questions.
Args:
text: Transcribed speech text
Returns:
True if command was handled (don't send to LLM), False otherwise
"""
if not text:
return False
# Normalize for matching
import string
normalized = text.lower().strip()
# Remove punctuation for flexible matching
normalized = normalized.translate(str.maketrans("", "", string.punctuation))
# Check for command matches (longest commands first to avoid substring issues)
# e.g., "unmute" should match before "mute"
sorted_commands = sorted(self.VOICE_COMMANDS.items(), key=lambda x: len(x[0]), reverse=True)
for command, target_mode in sorted_commands:
if command in normalized:
# Found a command - switch mode
result = self.set_voice_mode(target_mode)
mode_name = result.get("mode", target_mode)
# Provide voice feedback if TTS is enabled in new mode
if result.get("tts_enabled"):
feedback_messages = {
"tts_only": "Okay, I'll stop listening.",
"stt_only": "Okay, I'll stop talking.",
"full_voice": "Full voice mode activated.",
"silent": "Going silent.",
}
feedback = feedback_messages.get(mode_name, f"Mode set to {mode_name}")
self.tts.speak(feedback)
self.status = f"Mode: {mode_name}"
return True
return False
# =========================================================================
# Audio callbacks
# =========================================================================
def _on_audio_level(self, level: float, history: list):
"""Callback for real-time audio level updates from microphone."""
self.audio_level = level
self.level_history = history.copy()
# Update UI if we have a live display
if self._live and self.status in ["Listening...", "Recording..."]:
self._live.update(self._build_ui())
def _on_tts_level(self, level: float):
"""Callback for real-time TTS audio level updates."""
self.audio_level = level
# Shift history and add new level
self.level_history.pop(0)
self.level_history.append(level)
# Don't update UI here - let Live's refresh handle it
def load_skill(self, skill_id: str) -> bool:
"""Load and activate a skill."""
skill = self.skill_loader.load_skill(skill_id)
if not skill:
return False
self.active_skill = skill
# Load voice if available
if skill.voice_file:
try:
self.tts.load_voice(skill.voice_file, skill.id)
except Exception as e:
console.print(
f"[yellow]Warning: Could not load custom voice: {e}[/yellow]"
)
self.tts.load_voice(voice_name="default")
else:
self.tts.load_voice(voice_name="default")
# Create output pipeline based on skill config (Phase 3)
self.output_pipeline = create_pipeline_for_skill(skill)
# Reset conversation
self.conversation.clear()
self.llm.reset()
self.llm.set_system_prompt(skill.system_prompt)
return True
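# Usage sketch (assumes a "hermione" skill directory exists under
# config.skills_dir):
#   if not controller.load_skill("hermione"):
#       console.print("Skill not found")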
def show_skill_menu(self) -> Optional[str]:
"""Show interactive skill selection menu with visual cards, full width."""
skills = self.skill_loader.list_skills()
if not skills:
console.print(f"[{Theme.ERROR}]No skills found![/{Theme.ERROR}]")
return None
console.clear()
# Full terminal width
total_width = console.width
# Header - full width panel
header_text = Text(
"Select a Character", style=f"bold {Theme.ACCENT}", justify="center"
)
header_panel = Panel(
Align.center(header_text),
border_style=Theme.PRIMARY,
box=box.ROUNDED,
padding=(1, 4),
expand=True,
)
# Build skill cards
cards = []
for i, skill in enumerate(skills, 1):
is_current = self.active_skill and skill["id"] == self.active_skill.id
border_style = Theme.SUCCESS if is_current else Theme.PRIMARY
content = Text()
content.append(f"{skill['display_name']}\n\n", style=f"bold {Theme.BRIGHT}")
desc = skill["description"]
# Approximate max desc based on half terminal width minus padding
max_desc = (total_width // 2) - 12
if len(desc) > max_desc:
desc = desc[: max_desc - 3] + "..."
content.append(f"{desc}\n\n", style=Theme.MUTED)
if is_current:
content.append("[*] Current", style=f"bold {Theme.SUCCESS}")
else:
content.append(f"[{i}] Select", style=f"{Theme.SECONDARY}")
card = Panel(
content,
border_style=border_style,
box=box.ROUNDED,
height=8,
padding=(1, 2),
expand=True,
)
cards.append(card)
# Arrange cards two per row in an expanding grid (Table.grid with equal column ratios)
row_table = Table.grid(padding=(0, 2), expand=True)
row_table.add_column(ratio=1)
row_table.add_column(ratio=1)
for i in range(0, len(cards), 2):
row_cards = cards[i : i + 2]
if len(row_cards) == 2:
row_table.add_row(row_cards[0], row_cards[1])
else:
row_table.add_row(row_cards[0], Text(""))
# Print full-width layout
console.print()
console.print(header_panel)
console.print()
console.print(row_table)
console.print()
console.print(Align.center(Text("[0] Cancel", style=Theme.MUTED)))
try:
# Manually center the input prompt
# We print this below the layout
console.print()
prompt_text = "Enter number: "
padding = max(0, (console.width - len(prompt_text)) // 2)
prompt = (
" " * padding
+ f"[bold {Theme.ACCENT}]{prompt_text}[/bold {Theme.ACCENT}]"
)
choice = console.input(prompt)
choice_num = int(choice)
if choice_num == 0:
return None
if 1 <= choice_num <= len(skills):
return skills[choice_num - 1]["id"]
except (ValueError, KeyboardInterrupt):
pass
return None
def show_mode_menu(self) -> Optional[str]:
"""Show interactive mode selection menu."""
console.clear()
modes = [
("vad", "Voice Activity", "Auto-detects when you speak"),
("ptt", "Push-to-Talk", "Hold SPACE or RIGHT SHIFT to talk"),
("type", "Type Mode", "Type messages, hear voice responses"),
]
# Header
header_text = Text(
"Select Input Mode", style=f"bold {Theme.ACCENT}", justify="center"
)
header_panel = Panel(
Align.center(header_text),
border_style=Theme.PRIMARY,
box=box.ROUNDED,
padding=(1, 4),
expand=True,
)
console.print()
console.print(header_panel)
console.print()
# Mode options
for i, (mode_id, name, desc) in enumerate(modes, 1):
is_current = config.mode == mode_id
style = f"bold {Theme.SUCCESS}" if is_current else Theme.BRIGHT
marker = "[*]" if is_current else f"[{i}]"
console.print(f" {marker} {name} - {desc}", style=style)
console.print()
console.print(f" [0] Cancel", style=Theme.MUTED)
console.print()
try:
choice = console.input(
f"[bold {Theme.ACCENT}]Enter number: [/bold {Theme.ACCENT}]"
)
choice_num = int(choice)
if choice_num == 0:
return None
if 1 <= choice_num <= len(modes):
return modes[choice_num - 1][0]
except (ValueError, KeyboardInterrupt):
pass
return None
def _build_ui(self) -> Group:
"""Build the main UI with retro ASCII art banner, perfectly centered."""
# Get provider info
provider_info = (
self.llm.get_provider_info()
if hasattr(self.llm, "get_provider_info")
else "LOCAL"
)
skill_name = (
self.active_skill.display_name if self.active_skill else "Voice Assistant"
)
# Full terminal width
ui_width = console.width
# Status mapping
status_styles = {
"Ready": Theme.STATUS_READY,
"Listening...": Theme.STATUS_LISTENING,
"Recording...": Theme.STATUS_RECORDING,
"Transcribing...": Theme.STATUS_TRANSCRIBING,
"Thinking...": Theme.STATUS_THINKING,
"Speaking...": Theme.STATUS_SPEAKING,
}
style_color, icon = status_styles.get(self.status, ("white", "*"))  # ASCII-safe fallback
# Master Table for horizontal centering
main_table = Table.grid(expand=True)
main_table.add_column(justify="center")
# 1. Header Section (Banner & Subtitle)
banner = create_banner()
main_table.add_row(banner)
features = ["MULTI-CHARACTER", "SKILLS", "VAD", "PTT", "MCP"]
platform = "WINDOWS" if sys.platform == "win32" else "LINUX"
subtitle_text = create_subtitle(features)
subtitle_text.append(
f" | RUNNING ON {platform}", style=f"dim {Theme.PRIMARY}"
)
main_table.add_row(subtitle_text)
main_table.add_row(Text("\n"))
# 2. Information Grid (Character & Status)
info_grid = Table.grid(padding=(0, 4))
info_grid.add_column(justify="right", width=20)
info_grid.add_column(justify="left", width=30)
info_grid.add_row(
Text("Character:", style=f"dim {Theme.SECONDARY}"),
Text(skill_name, style=f"bold {Theme.ACCENT}"),
)
info_grid.add_row(
Text("Status:", style=f"dim {Theme.SECONDARY}"),
Text(f"{icon} {self.status}", style=f"bold {style_color}"),
)
main_table.add_row(info_grid)
main_table.add_row(Text("\n"))
# 3. Audio Visualization Panel
if self.status in ["Listening...", "Recording...", "Speaking..."]:
waveform = create_waveform(self.level_history, width=60, height=6)
audio_panel = Panel(
Align.center(waveform),
title=f"[dim {Theme.SECONDARY}]Audio Level[/dim {Theme.SECONDARY}]",
border_style=f"dim {Theme.PRIMARY}",
box=box.ROUNDED,
expand=True,
padding=(0, 2),
)
main_table.add_row(audio_panel)
else:
# Show empty placeholder to maintain layout
main_table.add_row(Text("\n"))
# 4. Conversation History - Centered Panel
conv_list = Text()
recent = (
self.conversation_history[-5:]
if len(self.conversation_history) > 5
else self.conversation_history
)
for role, text_content in recent:
if role == "user":
role_text = Text("You: ", style=f"bold {Theme.SUCCESS}")
content_text = Text(
text_content[:75] + ("..." if len(text_content) > 75 else ""),
style=Theme.SUCCESS,
)
else:
role_text = Text("AI: ", style=f"bold {Theme.PRIMARY}")
content_text = Text(
text_content[:75] + ("..." if len(text_content) > 75 else ""),
style=Theme.SECONDARY,
)
conv_list.append_text(role_text)
conv_list.append_text(content_text)
conv_list.append("\n")
if not recent:
conv_list.append(
"Start speaking to begin...\n", style=f"dim {Theme.SECONDARY}"
)
conv_panel = Panel(
conv_list,
title="[dim]Conversation[/dim]",
border_style=f"dim {Theme.PRIMARY}",
box=box.ROUNDED,
expand=True,
padding=(1, 2),
)
main_table.add_row(conv_panel)
# 5. Shortcuts & Status Bar
shortcuts_text = Text()
shortcuts_text.append("[Q]", style=f"bold {Theme.SECONDARY}")
shortcuts_text.append(" Quit ", style="dim #666666")
shortcuts_text.append("[V]", style=f"bold {Theme.SECONDARY}")
shortcuts_text.append(" Voice ", style="dim #666666")
shortcuts_text.append("[C]", style=f"bold {Theme.SECONDARY}")
shortcuts_text.append(" Clear ", style="dim #666666")
shortcuts_text.append("[ESC]", style=f"bold {Theme.SECONDARY}")
shortcuts_text.append(" Stop", style="dim #666666")
main_table.add_row(shortcuts_text)
status_bar = create_status_bar(
latency_ms=self.latency_ms,
tts_speed=self.tts_speed,
tokens_in=self.tokens_in,
tokens_out=self.tokens_out,
provider=provider_info.split()[0] if provider_info else "LOCAL",
status=self.status.upper().replace(".", ""),
)
status_panel = Panel(
status_bar,
border_style=f"dim {Theme.PRIMARY}",
box=box.ROUNDED,
expand=True,
)
main_table.add_row(status_panel)
return main_table
def _update_status(self, status: str, live: Optional[Live] = None):
"""Update status and refresh UI."""
self.status = status
if live:
live.update(self._build_ui())
def _process_audio_with_ui(self, audio: np.ndarray, live: Live):
"""Process recorded audio with UI updates. Returns True to continue,
False to exit, or "change_voice" to trigger character selection."""
if len(audio) < config.sample_rate * 0.3:
# Return to idle if audio too short
self.turn_machine.safe_send("cancel")
return True
# Transcribe
self._update_status("Transcribing...", live)
text = self.asr.transcribe(audio)
if not text.strip():
self._update_status("Listening...", live)
# Return to idle if no text
self.turn_machine.safe_send("cancel")
return True
# Check for voice commands BEFORE sending to LLM (MODE-03)
if self._handle_voice_command(text):
self.turn_machine.safe_send("cancel")
self._update_status("Listening...", live)
return True # Command handled, don't send to LLM
# Check for injection attempts BEFORE sending to LLM (Phase 3)
if self._check_injection(text):
# Speak warning and return to listening
self._update_status("Speaking...", live)
self.turn_machine.safe_send("start_speaking")
if self._should_speak():
self.tts.speak(
InjectionDetector.WARNING_MESSAGE,
level_callback=self._on_tts_level
)
self.turn_machine.safe_send("finish_speaking")
self._update_status("Listening...", live)
self.turn_machine.safe_send("cancel")
return True # Blocked, don't send to LLM
# Add to conversation
self.conversation.add_turn("user", text)
self._update_status("Thinking...", live)
# Check for exit commands
if text.lower().strip() in ["exit", "quit", "goodbye", "bye", "stop"]:
self._update_status("Speaking...", live)
self.turn_machine.safe_send("start_speaking")
if self._should_speak():
self.tts.speak("Goodbye!", level_callback=self._on_tts_level)
self.turn_machine.safe_send("finish_speaking")
return False
# Check for voice change command
if "change voice" in text.lower() or "change character" in text.lower():
self.turn_machine.safe_send("cancel")
return "change_voice"
# Send to LLM with latency tracking
llm_start = time.time()
response = self.llm.send_message(text)
llm_end = time.time()
# Update metrics via ConversationManager
self.conversation.set_latency(int((llm_end - llm_start) * 1000))
# Add assistant response to conversation
self.conversation.add_turn("assistant", response)
# Filter and speak response (Phase 3 pipeline)
filtered_response = self._filter_for_tts(response)
self._update_status("Speaking...", live)
self.turn_machine.safe_send("start_speaking")
if self._should_speak() and filtered_response:
self.tts.speak(filtered_response, level_callback=self._on_tts_level)
self.turn_machine.safe_send("finish_speaking")
# Small delay to prevent echo pickup
time.sleep(0.3)
self._update_status("Listening...", live)
return True
def on_key_press(self, key):
"""Handle key press."""
try:
if hasattr(key, "char") and key.char:
if key.char.lower() == "q":
self.running = False
return False
elif key.char.lower() == "v":
self.running = "change_voice"
return False
elif key.char.lower() == "c":
self.conversation.clear()
self.llm.reset()
if self.active_skill:
self.llm.set_system_prompt(self.active_skill.system_prompt)
# PTT keys: space or right shift
if key == keyboard.Key.space or key == keyboard.Key.shift_r:
if not self.ptt_pressed:
self.ptt_pressed = True
self.recorder.start_recording()
if key == keyboard.Key.esc:
self.running = False
return False
except AttributeError:
pass
def on_key_release(self, key):
"""Handle key release."""
try:
# PTT keys: space or right shift
if key == keyboard.Key.space or key == keyboard.Key.shift_r:
if self.ptt_pressed:
self.ptt_pressed = False
return "process_audio"
except AttributeError:
pass
def run_vad_mode(self):
"""Run in voice activity detection mode with rich UI."""
console.clear()
self.running = True
# Start keyboard listener in background for hotkeys
def key_listener():
def on_press(key):
result = self.on_key_press(key)
if result is False:
return False
with keyboard.Listener(on_press=on_press) as listener:
listener.join()
key_thread = threading.Thread(target=key_listener, daemon=True)
key_thread.start()
# Greeting
if self.active_skill:
greeting = self.active_skill.metadata.get(
"greeting", f"Hello! {self.active_skill.name} here. How can I help?"
)
else:
greeting = "Voice mode active. How can I help you?"
with Live(
self._build_ui(), refresh_per_second=4, console=console, screen=True
) as live:
# Store reference for real-time waveform updates
self._live = live
self._update_status("Speaking...", live)
# Transition to speaking state for greeting
self.turn_machine.safe_send("start_speaking")
if self._should_speak():
self.tts.speak(greeting, level_callback=self._on_tts_level)
self.turn_machine.safe_send("finish_speaking")
time.sleep(0.3) # Prevent echo pickup
self._update_status("Listening...", live)
while self.running is True:
try:
# Check mode before recording
if not self._should_process_audio():
time.sleep(0.1) # Avoid busy loop when STT disabled
continue
# Transition to listening state
self.turn_machine.safe_send("start_listening")
audio = self.recorder.record_vad()
# Voice detected - transition to processing
self.turn_machine.safe_send("voice_detected")
result = self._process_audio_with_ui(audio, live)
if result == "change_voice":
self.running = "change_voice"
break
elif result is False:
break
except KeyboardInterrupt:
break
# Clear live reference
self._live = None
return self.running # Return state for main loop
def run_ptt_mode(self):
"""Run in push-to-talk mode with rich UI."""
console.clear()
self.running = True
with Live(
self._build_ui(), refresh_per_second=4, console=console, screen=True
) as live:
# Store reference for real-time waveform updates
self._live = live
self._update_status("Ready - Hold SPACE or RIGHT SHIFT to talk", live)
# Greeting
if self.active_skill:
greeting = self.active_skill.metadata.get(
"greeting", f"Hello! {self.active_skill.name} here."
)
else:
greeting = "Push-to-talk mode active."
self._update_status("Speaking...", live)
self.tts.speak(greeting, level_callback=self._on_tts_level)
self._update_status("Ready - Hold SPACE or RIGHT SHIFT to talk", live)
def on_press(key):
self.on_key_press(key)
if self.ptt_pressed:
self._update_status("Recording...", live)
def on_release(key):
result = self.on_key_release(key)
if result == "process_audio":
audio = self.recorder.stop_recording()
self._update_status("Processing...", live)
proc_result = self._process_audio_with_ui(audio, live)
if proc_result is False:
self.running = False
return False
elif proc_result == "change_voice":
self.running = "change_voice"
return False
self._update_status(
"Ready - Hold SPACE or RIGHT SHIFT to talk", live
)
if self.running is not True:
return False
with keyboard.Listener(
on_press=on_press, on_release=on_release
) as listener:
listener.join()
return self.running
def run_type_mode(self):
"""Run in type mode - keyboard input with voice response."""
console.clear()
self.running = True
# Build header info
if self.provider_config:
provider_info = f"{self.provider_config.name}"
if self.provider_config.model:
provider_info += f" ({self.provider_config.model})"
else:
provider_info = "Custom API"
skill_name = (
self.active_skill.name if self.active_skill else "Default Assistant"
)
# Print header
console.print(
Panel(
Text(
"LocalVoiceMode - Type Mode",
style=f"bold {Theme.SUCCESS}",
justify="center",
),
border_style=Theme.BORDER,
box=box.ROUNDED,
)
)
console.print(f"[dim]Provider:[/dim] [bold]{provider_info}[/bold]")
console.print(f"[dim]Character:[/dim] [bold]{skill_name}[/bold]")
console.print()
console.print(
f"[dim]Type your message and press Enter. The assistant will speak the response.[/dim]"
)
console.print(
f"[dim]Commands: 'quit' to exit, 'voice' to change character, 'clear' to reset[/dim]"
)
console.print()
# Greeting
if self.active_skill:
greeting = self.active_skill.metadata.get(
"greeting", f"Hello! {self.active_skill.name} here. How can I help?"
)
else:
greeting = "Type mode active. How can I help you?"
console.print(
f"[bold {Theme.PRIMARY}]{skill_name}:[/bold {Theme.PRIMARY}] {greeting}"
)
self.tts.speak(greeting, level_callback=self._on_tts_level)
console.print()
while self.running:
try:
# Get user input
user_input = console.input(
f"[bold {Theme.SUCCESS}]You:[/bold {Theme.SUCCESS}] "
)
if not user_input.strip():
continue
text = user_input.strip()
# Check for exit commands
if text.lower() in ["exit", "quit", "goodbye", "bye", "stop", "q"]:
console.print(
f"\n[bold {Theme.PRIMARY}]{skill_name}:[/bold {Theme.PRIMARY}] Goodbye!"
)
self.tts.speak("Goodbye!", level_callback=self._on_tts_level)
break
# Check for voice change command
if text.lower() in ["voice", "change voice", "change character", "v"]:
return "change_voice"
# Check for clear command
if text.lower() in ["clear", "reset", "c"]:
self.conversation.clear()
self.llm.reset()
if self.active_skill:
self.llm.set_system_prompt(self.active_skill.system_prompt)
console.print(f"[dim]Conversation cleared.[/dim]\n")
continue
# Check for injection attempts BEFORE sending to LLM (Phase 3)
if self._check_injection(text):
console.print(
f"\n[bold {Theme.ERROR}]{skill_name}:[/bold {Theme.ERROR}] "
f"{InjectionDetector.WARNING_MESSAGE}"
)
self.tts.speak(
InjectionDetector.WARNING_MESSAGE,
level_callback=self._on_tts_level
)
console.print()
continue
# Add to conversation and send to LLM
self.conversation.add_turn("user", text)
# Show thinking indicator
with console.status(f"[{Theme.ACCENT}]Thinking...", spinner="dots"):
llm_start = time.time()
response = self.llm.send_message(text)
llm_end = time.time()
# Update metrics via ConversationManager
self.conversation.set_latency(int((llm_end - llm_start) * 1000))
# Add assistant response
self.conversation.add_turn("assistant", response)
# Print and speak response (filter through pipeline for TTS)
console.print(
f"\n[bold {Theme.PRIMARY}]{skill_name}:[/bold {Theme.PRIMARY}] {response}"
)
console.print(f"[dim]({self.latency_ms}ms)[/dim]\n")
filtered_response = self._filter_for_tts(response)
if filtered_response:
self.tts.speak(filtered_response, level_callback=self._on_tts_level)
except KeyboardInterrupt:
console.print("\n[dim]Interrupted[/dim]")
break
except EOFError:
break
return self.running
def run_headless(self):
"""Run in headless mode for MCP integration - no UI, just voice."""
self.running = True
# Greeting
if self.active_skill:
greeting = self.active_skill.metadata.get(
"greeting", f"Hello! {self.active_skill.name} here. How can I help?"
)
else:
greeting = "Voice mode active."
self.tts.speak(greeting)
while self.running:
try:
audio = self.recorder.record_vad()
if len(audio) < config.sample_rate * 0.3:
continue
text = self.asr.transcribe(audio)
if not text.strip():
continue
# Check for voice commands BEFORE sending to LLM (MODE-03)
if self._handle_voice_command(text):
continue # Command handled, don't send to LLM
# Check for injection attempts BEFORE sending to LLM (Phase 3)
if self._check_injection(text):
self.tts.speak(InjectionDetector.WARNING_MESSAGE)
continue # Blocked, don't send to LLM
# Check for exit commands
lower_text = text.lower().strip()
if lower_text in [
"exit",
"quit",
"goodbye",
"bye",
"stop",
"stop voice",
"end voice",
]:
self.tts.speak("Goodbye!")
break
# Check for skill change
if "change voice" in lower_text or "change character" in lower_text:
# List available skills
skills = self.skill_loader.list_skills()
skill_names = ", ".join([s["name"] for s in skills])
self.tts.speak(
f"Available characters are: {skill_names}. Say the name of who you want to talk to."
)
continue
# Check if user is selecting a skill by name
switched = False
for skill in self.skill_loader.list_skills():
if (
skill["name"].lower() in lower_text
or skill["id"].lower() in lower_text
):
self.load_skill(skill["id"])
self.tts.speak(f"Switched to {skill['name']}.")
switched = True
break
if switched:
continue  # Skill switched; don't send the selection phrase to the LLM
# Normal conversation
self.conversation.add_turn("user", text)
response = self.llm.send_message(text)
self.conversation.add_turn("assistant", response)
# Filter response through pipeline before speaking (Phase 3)
filtered_response = self._filter_for_tts(response)
if filtered_response:
self.tts.speak(filtered_response)
except KeyboardInterrupt:
break
def show_startup_menu(self) -> bool:
"""Show startup menu with character and mode selection. Returns False to quit."""
console.clear()
# Show banner
console.print(create_banner())
console.print()
# Current settings
skill_name = (
self.active_skill.display_name if self.active_skill else "Voice Assistant"
)
mode_names = {
"vad": "Voice Activity",
"ptt": "Push-to-Talk",
"type": "Type Mode",
}
mode_name = mode_names.get(config.mode, config.mode)
console.print(
f" Character: [bold {Theme.ACCENT}]{skill_name}[/bold {Theme.ACCENT}]"
)
console.print(f" Mode: [bold {Theme.ACCENT}]{mode_name}[/bold {Theme.ACCENT}]")
console.print()
# Menu options
console.print(f" [1] Start Voice Chat", style=f"bold {Theme.SUCCESS}")
console.print(f" [2] Change Character", style=Theme.BRIGHT)
console.print(f" [3] Change Input Mode", style=Theme.BRIGHT)
console.print(f" [0] Quit", style=Theme.MUTED)
console.print()
try:
choice = console.input(
f"[bold {Theme.ACCENT}]Enter number: [/bold {Theme.ACCENT}]"
)
choice_num = int(choice)
if choice_num == 0:
return False
elif choice_num == 1:
return True # Start voice chat
elif choice_num == 2:
new_skill = self.show_skill_menu()
if new_skill:
self.load_skill(new_skill)
return None # Show menu again
elif choice_num == 3:
new_mode = self.show_mode_menu()
if new_mode:
config.mode = new_mode
return None # Show menu again
except (ValueError, KeyboardInterrupt):
return False
return None
def run(self):
"""Run voice mode with interactive menu."""
# Headless mode - no UI
if config.headless:
self.run_headless()
return
# Show startup menu
while True:
result = self.show_startup_menu()
if result is True:
break # Start voice chat
elif result is False:
console.clear()
console.print("\n[bold cyan]Goodbye![/bold cyan]\n")
return
# result is None means show menu again
while True:
# Run the appropriate mode
if config.mode == "ptt":
result = self.run_ptt_mode()
elif config.mode == "type":
result = self.run_type_mode()
else:
result = self.run_vad_mode()
# Check if we should change voice
if result == "change_voice":
new_skill = self.show_skill_menu()
if new_skill:
self.load_skill(new_skill)
console.clear()
continue
else:
break
console.clear()
console.print("\n[bold cyan]Thanks for using LocalVoiceMode![/bold cyan]\n")
# ============================================================================
# Main
# ============================================================================
def main():
parser = argparse.ArgumentParser(
description="LocalVoiceMode - Voice Chat with Character Skills"
)
parser.add_argument(
"--skill", "-s", help="Skill/character to load (e.g., 'hermione')"
)
parser.add_argument(
"--list-skills", "-l", action="store_true", help="List available skills"
)
parser.add_argument(
"--list-providers", action="store_true", help="List available LLM providers"
)
parser.add_argument(
"--mode",
"-m",
choices=["vad", "ptt", "type"],
default="vad",
help="Input mode: vad (voice activity), ptt (push-to-talk), or type (keyboard input with voice response)",
)
# Provider settings
parser.add_argument(
"--provider",
"-p",
choices=["lm_studio", "openrouter", "openai"],
help="Force specific provider (default: auto-detect)",
)
# LLM API settings (manual override)
parser.add_argument(
"--api-url", help="OpenAI-compatible API URL (overrides auto-detection)"
)
parser.add_argument("--api-key", help="API key for the LLM service")
parser.add_argument(
"--model",
help="Model name to use (e.g., 'gpt-4', 'claude-sonnet-4-20250514', 'llama-3')",
)
# ASR device settings
parser.add_argument(
"--device",
default="cuda",
choices=["cuda", "cpu"],
help="Device for ASR model (cuda for GPU with TensorRT/CUDA, cpu for CPU)",
)
# Headless mode for MCP integration
parser.add_argument(
"--headless", action="store_true", help="Run without UI (for MCP integration)"
)
args = parser.parse_args()
# Update config
config.mode = args.mode
config.device = args.device
config.headless = args.headless
# List providers
if args.list_providers:
console.print(
f"\n[bold {Theme.PRIMARY}]LLM Provider Status:[/bold {Theme.PRIMARY}]"
)
console.print("-" * 50)
console.print(ProviderManager.get_status_report())
console.print()
return
# List skills (non-interactive)
if args.list_skills:
loader = SkillLoader(config.skills_dir, config.voice_refs_dir)
skills = loader.list_skills()
console.print(
f"\n[bold {Theme.PRIMARY}]Available Skills:[/bold {Theme.PRIMARY}]"
)
console.print("-" * 50)
if not skills:
console.print(
f"[{Theme.ERROR}]No skills found in {config.skills_dir}[/{Theme.ERROR}]"
)
else:
for skill in skills:
console.print(f" {skill['display_name']}")
console.print(f" [{Theme.MUTED}]ID: {skill['id']}[/{Theme.MUTED}]")
console.print(
f" [{Theme.MUTED}]{skill['description'][:60]}...[/{Theme.MUTED}]"
)
console.print()
return
# Determine provider
provider_config = None
force_provider = args.provider or os.environ.get("VOICE_PROVIDER")
if args.api_url:
# Manual override - create a provider config from args
model = args.model
# If no model specified, try to detect from the API
if not model:
try:
api_url = args.api_url.rstrip("/")
resp = httpx.get(f"{api_url}/models", timeout=2.0)
if resp.status_code == 200:
data = resp.json()
if data.get("data"):
model = data["data"][0].get("id")
except Exception:
pass # Will use default model handling
provider_config = ProviderConfig(
type=ProviderType.LM_STUDIO, # Treat manual URL as generic OpenAI-compatible
name="Custom API",
api_url=args.api_url,
api_key=args.api_key,
model=model,
available=True,
)
else:
# Auto-detect provider (verbose to show what's happening)
provider_config = ProviderManager.get_best_provider(
force_provider, verbose=True
)
if not provider_config:
console.print(
f"\n[{Theme.ERROR}]No LLM provider available![/{Theme.ERROR}]"
)
console.print()
console.print(f"[{Theme.MUTED}]Options:[/{Theme.MUTED}]")
console.print(f" 1. Start LM Studio with local server enabled")
console.print(f" 2. Set OPENROUTER_API_KEY environment variable")
console.print(f" 3. Set OPENAI_API_KEY environment variable")
console.print(f" 4. Use --api-url to specify a custom endpoint")
console.print()
return
# Override model if specified
if args.model:
provider_config.model = args.model
# Headless mode - minimal output, for MCP integration
if config.headless:
controller = VoiceModeController(provider_config=provider_config)
skill = args.skill or "assistant"
controller.load_skill(skill)
try:
controller.run()
except KeyboardInterrupt:
pass
return
# Show startup banner
console.clear()
console.print()
banner_content = Text()
banner_content.append("LocalVoiceMode\n", style=f"bold {Theme.PRIMARY}")
banner_content.append(f"Provider: {provider_config.name}\n", style=Theme.MUTED)
if provider_config.model:
banner_content.append(f"Model: {provider_config.model}\n", style=Theme.MUTED)
asr_model = f"Parakeet TDT 0.6B v3 on {config.device.upper()}"
banner_content.append(f"ASR: {asr_model}", style=Theme.MUTED)
console.print(
Panel.fit(
banner_content,
border_style=Theme.BORDER,
title=f"[{Theme.BRIGHT}] Voice Mode [{Theme.BRIGHT}]",
)
)
console.print()
# Create controller with detected provider
controller = VoiceModeController(provider_config=provider_config)
# Load skill - either from args or show menu
if args.skill:
if not controller.load_skill(args.skill):
console.print(
f"[{Theme.ERROR}]Failed to load skill: {args.skill}[/{Theme.ERROR}]"
)
console.print(
f"[{Theme.MUTED}]Use --list-skills to see available skills[/{Theme.MUTED}]"
)
return
else:
# Show interactive skill selection
skill_id = controller.show_skill_menu()
if skill_id:
controller.load_skill(skill_id)
else:
# Default to assistant if cancelled
controller.load_skill("assistant")
# Run
try:
controller.run()
except KeyboardInterrupt:
console.clear()
console.print("\n[bold cyan]Goodbye![/bold cyan]\n")
if __name__ == "__main__":
main()