#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LocalVoiceMode - Voice Chat with Character Skills
-------------------------------------------------
Local voice interface with support for character-based roleplay skills.
Uses Pocket TTS (CPU) for speech output and Parakeet TDT for recognition (CPU or CUDA, see --device).
Usage:
python voice_client.py # Default assistant
python voice_client.py --skill hermione # Load Hermione character
python voice_client.py --list-skills # List available skills
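python voice_client.py --mode ptt # Push-to-talk instead of VAD
python voice_client.py --mode type # Keyboard input, spoken replies
python voice_client.py --headless # No UI (for MCP integration)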
"""
import argparse
import os
import sys
import time
import warnings
import io
import contextlib
# Completely disable torch dynamo/compile BEFORE any torch imports
# Triton is not available on Windows, causing verbose error spam
os.environ["TORCHDYNAMO_DISABLE"] = "1"
# Suppress specific warnings
warnings.filterwarnings("ignore", message=".*torch_dtype.*")
warnings.filterwarnings("ignore", message=".*cache_implementation.*")
warnings.filterwarnings("ignore", message=".*sliding_window.*deprecated.*")
warnings.filterwarnings("ignore", message=".*TensorFloat32.*")
warnings.filterwarnings("ignore", message=".*WON'T CONVERT.*")
warnings.filterwarnings("ignore", message=".*triton.*")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
# =============================================================================
# Import from modular package
# =============================================================================
from src.localvoicemode.audio import AudioRecorder
from src.localvoicemode.speech import (
ASREngine,
TTSEngine,
TTSFilter,
SileroVAD,
SmartTurnVAD,
get_smart_turn_vad,
get_tts_filter,
SpeechMode,
set_speech_mode,
OutputPipeline,
create_pipeline_for_skill,
InjectionDetector,
)
from src.localvoicemode.llm import (
LLMClient,
ProviderManager,
ProviderConfig,
ProviderType,
)
from src.localvoicemode.skills import SkillLoader, Skill
from src.localvoicemode.state import (
Config,
ConversationManager,
VoiceMode,
ModeStateMachine,
TurnStateMachine,
)
# Fix Windows console encoding for emojis (skip if running as MCP server)
if sys.platform == "win32" and hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
sys.stderr.reconfigure(encoding="utf-8", errors="replace")
except Exception:
pass # Skip if stdout/stderr doesn't support reconfigure (e.g., MCP mode)
from pathlib import Path
from dataclasses import dataclass, field
# Set up NVIDIA DLL paths BEFORE importing onnxruntime (Windows only).
# This must happen before onnxruntime is imported anywhere. We use
# os.add_dll_directory() and also prepend to PATH, because some DLLs
# resolve their own dependencies through PATH (see below).
if sys.platform == "win32":
venv_nvidia = Path(__file__).parent / ".venv" / "Lib" / "site-packages" / "nvidia"
if venv_nvidia.exists():
nvidia_paths = []
for pkg_dir in venv_nvidia.iterdir():
if pkg_dir.is_dir():
bin_dir = pkg_dir / "bin"
if bin_dir.exists():
abs_path = str(bin_dir.absolute())
nvidia_paths.append(abs_path)
if hasattr(os, "add_dll_directory"):
try:
os.add_dll_directory(abs_path)
except Exception:
pass
# Also add to PATH for DLLs that load other DLLs
if nvidia_paths:
os.environ["PATH"] = (
os.pathsep.join(nvidia_paths) + os.pathsep + os.environ.get("PATH", "")
)
from typing import Optional, Dict, Any, List
from enum import Enum
# Ensure we're in the right directory
SCRIPT_DIR = Path(__file__).parent.absolute()
os.chdir(SCRIPT_DIR)
# ============================================================================
# Lazy imports with auto-install
# ============================================================================
def ensure_package(package: str, import_name: Optional[str] = None):
"""Ensure a package is installed, install if missing."""
import_name = import_name or package
try:
with (
contextlib.redirect_stdout(io.StringIO()),
contextlib.redirect_stderr(io.StringIO()),
):
__import__(import_name)
except ImportError:
print(f"Installing {package}...")
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])
ensure_package("numpy")
ensure_package("sounddevice")
ensure_package("scipy")
ensure_package("httpx")
ensure_package("pynput")
ensure_package("pyyaml", "yaml")
import numpy as np
import sounddevice as sd
from rich.console import Console, Group
from rich.panel import Panel
from rich.text import Text
from rich.table import Table
from rich.live import Live
from rich.align import Align
from rich.columns import Columns
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
from rich.padding import Padding
from rich.style import Style
from rich import box
import threading
import scipy.io.wavfile as wavfile
import httpx
import yaml
from pynput import keyboard
# ============================================================================
# Configuration (from src/localvoicemode/state/config.py)
# ============================================================================
# Create config instance from modular Config class
config = Config.from_base_dir(SCRIPT_DIR)
# ============================================================================
# Theme & UI Colors
# ============================================================================
class Theme:
"""Consistent color palette for the UI - Green Theme."""
# Primary colors (green palette matching notification style)
PRIMARY = "#00875f" # Forest green - main accent
SECONDARY = "#5faf5f" # Medium green
ACCENT = "#87d787" # Light green highlight
# Status colors
SUCCESS = "#00ff5f" # Bright green
WARNING = "#d7af00" # Gold/amber
ERROR = "#ff5f5f" # Soft red
INFO = "#5fafaf" # Teal
# Text colors
MUTED = "dim"
BRIGHT = "bold white"
# UI elements
BORDER = "#00875f" # Forest green borders
PANEL_BG = ""
# Status indicators (all green-themed) - using ASCII-safe characters
STATUS_READY = ("#00ff5f", "*") # Bright green
STATUS_LISTENING = ("#5faf5f", "o") # Medium green
STATUS_RECORDING = ("#ff5f5f", "O") # Red (keep for recording)
STATUS_TRANSCRIBING = ("#d7af00", "~") # Gold
STATUS_THINKING = ("#87d787", ".") # Light green
STATUS_SPEAKING = ("#00875f", ">") # Forest green
def create_banner() -> Text:
"""Create ASCII art banner for LOCALVOICE in block style."""
banner_lines = [
"╔══════════════════════════════════════════════════════════════════════════════╗",
"║ ██╗ ██████╗ ██████╗ █████╗ ██╗ ██╗ ██╗ ██████╗ ██╗ ██████╗███████╗ ║",
"║ ██║ ██╔═══██╗██╔════╝██╔══██╗██║ ██║ ██║██╔═══██╗██║██╔════╝██╔════╝ ║",
"║ ██║ ██║ ██║██║ ███████║██║ ██║ ██║██║ ██║██║██║ █████╗ ║",
"║ ██║ ██║ ██║██║ ██╔══██║██║ ╚██╗ ██╔╝██║ ██║██║██║ ██╔══╝ ║",
"║ ██████╗╚██████╔╝╚██████╗██║ ██║██████╗╚████╔╝ ╚██████╔╝██║╚██████╗███████╗ ║",
"║ ╚═════╝ ╚═════╝ ╚═════╝╚═╝ ╚═╝╚═════╝ ╚═══╝ ╚═════╝ ╚═╝ ╚═════╝╚══════╝ ║",
"╚══════════════════════════════════════════════════════════════════════════════╝",
]
text = Text(justify="center")
for line in banner_lines:
text.append(line + "\n", style=f"bold {Theme.PRIMARY}")
return text
def create_subtitle(features: Optional[list] = None) -> Text:
"""Create the feature subtitle line, centered."""
if features is None:
features = ["MULTI-CHARACTER", "SKILLS", "VAD", "PTT", "MCP"]
text = Text(justify="center")
for i, feature in enumerate(features):
if i > 0:
text.append(" • ", style=Theme.SECONDARY)
text.append(feature, style=f"bold {Theme.ACCENT}")
return text
def create_waveform(levels: list, width: int = 40, height: int = 5) -> Text:
"""Create a multi-line ASCII waveform visualization.
Creates a vertical bar chart using ASCII characters:
- Uses | and # characters for reliable rendering
- Multiple rows for better visual impact
"""
text = Text()
# Resample levels to fit width
if len(levels) > width:
step = len(levels) / width
resampled = [levels[int(i * step)] for i in range(width)]
elif len(levels) < width:
resampled = [0.0] * (width - len(levels)) + levels
else:
resampled = levels
# Build multi-line visualization (top to bottom)
for row in range(height, 0, -1):
threshold = row / height
for level in resampled:
if level >= threshold:
# Filled - color based on height
if row > height * 0.7:
style = f"bold {Theme.SUCCESS}" # Top = bright green
elif row > height * 0.4:
style = f"bold {Theme.ACCENT}" # Middle = light green
else:
style = f"bold {Theme.PRIMARY}" # Bottom = forest green
text.append("|", style=style)
else:
text.append(" ", style="dim")
text.append("\n")
# Bottom line (baseline)
for level in resampled:
if level > 0.05:
text.append("-", style=f"dim {Theme.PRIMARY}")
else:
text.append(".", style="dim #333333")
return text
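# Illustrative call (hypothetical levels): create_waveform([0.1, 0.9, 0.5],
# width=10, height=3) left-pads with zeros to width 10, then renders three
# rows of "|" bars over a "-"/"." baseline, colored green by height band.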
def create_waveform_compact(levels: list, width: int = 50) -> Text:
"""Create a compact single-line waveform using ASCII characters."""
chars = " ._-=+#@" # 8 levels of intensity
text = Text()
# Resample levels to fit width
if len(levels) > width:
step = len(levels) / width
resampled = [levels[int(i * step)] for i in range(width)]
elif len(levels) < width:
resampled = [0.0] * (width - len(levels)) + levels
else:
resampled = levels
for level in resampled:
idx = int(level * 7)
idx = max(0, min(7, idx))
char = chars[idx]
if level > 0.7:
style = f"bold {Theme.SUCCESS}"
elif level > 0.4:
style = f"bold {Theme.ACCENT}"
elif level > 0.1:
style = f"{Theme.PRIMARY}"
else:
style = f"dim {Theme.PRIMARY}"
text.append(char, style=style)
return text
def create_status_bar(
latency_ms: Optional[int] = None,
tts_speed: Optional[float] = None,
tokens_in: Optional[int] = None,
tokens_out: Optional[int] = None,
provider: Optional[str] = None,
status: str = "READY",
) -> Text:
"""Create the bottom HUD-style status bar with metrics.
The provider argument is accepted for call-site symmetry but is not
currently rendered in the bar.
"""
text = Text()
# Format values
latency_str = f"{latency_ms}ms" if latency_ms is not None else "--ms"
tts_str = f"{tts_speed:.1f}x" if tts_speed is not None else "--x"
tokens_in_str = str(tokens_in) if tokens_in is not None else "--"
tokens_out_str = str(tokens_out) if tokens_out is not None else "--"
# Build HUD-style status bar (matching screenshot design)
text.append(" LATENCY:", style=f"dim {Theme.SECONDARY}")
text.append(f" {latency_str:<7}", style=f"bold {Theme.ACCENT}")
text.append(" TTS:", style=f"dim {Theme.SECONDARY}")
text.append(f" {tts_str:<6}", style=f"bold {Theme.ACCENT}")
text.append(" TOKENS:", style=f"dim {Theme.SECONDARY}")
text.append(f" {tokens_in_str}", style=f"bold {Theme.ACCENT}")
text.append(" IN ", style="dim #666666")
text.append(f"{tokens_out_str}", style=f"bold {Theme.ACCENT}")
text.append(" OUT", style="dim #666666")
text.append(" │ ", style="dim #666666")
text.append(
f"{status}",
style=f"bold {Theme.SUCCESS}" if status == "READY" else f"bold {Theme.WARNING}",
)
return text
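# Illustrative output (hypothetical metrics): create_status_bar(latency_ms=230,
# tts_speed=1.2, tokens_in=42, tokens_out=128) renders roughly
# " LATENCY: 230ms TTS: 1.2x TOKENS: 42 IN 128 OUT │ READY".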
# ============================================================================
# Provider Management (from src/localvoicemode/llm/)
# Classes imported: ProviderType, ProviderConfig, ProviderManager
# ============================================================================
# ============================================================================
# Skill System (from src/localvoicemode/skills/)
# Classes imported: Skill, SkillLoader
# ============================================================================
# ============================================================================
# Voice Mode Controller with Rich UI
# ============================================================================
console = Console()
class VoiceModeController:
# Voice commands for mode switching (MODE-03)
# Maps command phrases to target mode names
VOICE_COMMANDS = {
# STT control - stop listening to microphone
"stop listening": "tts_only",
"stop recording": "tts_only",
"mute microphone": "tts_only",
"mute mic": "tts_only",
# TTS control - stop speaking
"stop talking": "stt_only",
"stop speaking": "stt_only",
"be quiet": "stt_only",
"shut up": "stt_only",
"mute": "stt_only",
# Full mode - both listening and speaking
"full voice": "full_voice",
"unmute": "full_voice",
"resume listening": "full_voice",
"start listening": "full_voice",
# Silent mode - neither listening nor speaking
"go silent": "silent",
"silent mode": "silent",
"all off": "silent",
}
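# Matching (see _handle_voice_command) is a longest-command-first substring
# search over lowercased, punctuation-stripped text, so e.g. "please unmute
# now" triggers "unmute" -> full_voice rather than the shorter "mute" -> silent.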
def __init__(self, provider_config: Optional[ProviderConfig] = None):
self.tts = TTSEngine()
self.asr = ASREngine()
self.recorder = AudioRecorder()
# Store provider info
self.provider_config = provider_config
# LLM client - auto-detect or use specified provider
if provider_config:
self.llm = LLMClient(provider_config=provider_config)
else:
# Legacy mode: use config values directly
self.llm = LLMClient(
base_url=config.api_url, api_key=config.api_key, model=config.model
)
# Skill system
self.skill_loader = SkillLoader(config.skills_dir, config.voice_refs_dir)
self.active_skill: Optional[Skill] = None
self.running = False
self.ptt_pressed = False
# UI state
self.status = "Ready"
self.audio_level = 0.0 # For audio level meter
self.level_history = [0.0] * 50 # Waveform history for visualization
self._live = None # Reference to Live display for real-time updates
# Conversation state (delegated to ConversationManager)
self.conversation = ConversationManager()
# Voice mode and turn-taking state machines (Phase 2)
self.mode_machine = ModeStateMachine()
self.turn_machine = TurnStateMachine()
# Wire turn state callbacks to control audio flow
self.turn_machine.set_on_enter_speaking(self._on_enter_speaking)
self.turn_machine.set_on_exit_speaking(self._on_exit_speaking)
# Phase 3: Output pipeline and injection detection
self.output_pipeline: Optional[OutputPipeline] = None
self.injection_detector = InjectionDetector()
# Additional metrics not in ConversationManager
self.tts_speed: Optional[float] = None
# Set up audio level callback for waveform visualization
self.recorder.set_level_callback(self._on_audio_level)
# =========================================================================
# ConversationManager property accessors (backward compatibility)
# =========================================================================
@property
def last_user_text(self) -> str:
"""Get most recent user message."""
return self.conversation.last_user_text
@property
def last_response(self) -> str:
"""Get most recent assistant message."""
return self.conversation.last_response
@property
def conversation_history(self) -> List[tuple]:
"""Get conversation history as list of (role, text) tuples."""
return self.conversation.history
@property
def tokens_in(self) -> int:
"""Get total input tokens."""
return self.conversation.tokens_in
@property
def tokens_out(self) -> int:
"""Get total output tokens."""
return self.conversation.tokens_out
@property
def latency_ms(self) -> Optional[int]:
"""Get last response latency in milliseconds."""
return self.conversation.latency_ms
# =========================================================================
# Turn-taking callbacks (VAD/TTS race prevention)
# =========================================================================
def _on_enter_speaking(self):
"""Called when TTS starts - suspend VAD to prevent self-feedback."""
# VAD suspension is handled by checking turn state in recording loop
pass
def _on_exit_speaking(self):
"""Called when TTS ends - resume VAD if in appropriate mode."""
pass
# =========================================================================
# Mode control methods (Phase 2)
# =========================================================================
def set_voice_mode(self, mode: str) -> dict:
"""
Set voice mode. MCP-callable with status response.
Args:
mode: One of 'full_voice', 'tts_only', 'stt_only', 'silent'
Aliases: 'full', 'tts', 'stt', 'mute'
Returns:
Dict with mode, stt_enabled, tts_enabled, turn_state, transition_ms
"""
start = time.perf_counter()
# Normalize mode aliases
mode_map = {
"full": "full_voice",
"full_voice": "full_voice",
"tts": "tts_only",
"tts_only": "tts_only",
"stt": "stt_only",
"stt_only": "stt_only",
"silent": "silent",
"mute": "silent",
}
normalized = mode_map.get(mode.lower())
if not normalized:
return {"error": f"Unknown mode: {mode}. Use: full_voice, tts_only, stt_only, silent"}
# Cancel any in-progress turn before mode switch
if not self.turn_machine.is_idle():
self.turn_machine.safe_send("cancel")
# Event names match state names with set_ prefix
event_name = f"set_{normalized}"
self.mode_machine.send(event_name)
elapsed_ms = (time.perf_counter() - start) * 1000
return {
"mode": self.mode_machine.mode_name,
"stt_enabled": self.mode_machine.stt_enabled,
"tts_enabled": self.mode_machine.tts_enabled,
"turn_state": self.turn_machine.state_name,
"transition_ms": round(elapsed_ms, 2),
}
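# Illustrative call (values are examples, not measured):
#   controller.set_voice_mode("tts")
#   -> {"mode": "tts_only", "stt_enabled": False, "tts_enabled": True,
#       "turn_state": "idle", "transition_ms": 0.12}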
def _should_process_audio(self) -> bool:
"""Check if STT should process audio (mode + turn state)."""
return (
self.mode_machine.stt_enabled and
self.turn_machine.current_state.id in ["idle", "listening"]
)
def _should_speak(self) -> bool:
"""Check if TTS should speak (mode check)."""
return self.mode_machine.tts_enabled
def _filter_for_tts(self, text: str) -> str:
"""Filter text through output pipeline before TTS.
Uses the configured output pipeline if filters are defined,
otherwise falls back to the default TTS filter.
Args:
text: Raw LLM response text
Returns:
Filtered text suitable for TTS
"""
if self.output_pipeline and len(self.output_pipeline) > 0:
return self.output_pipeline.process(text)
else:
return get_tts_filter().filter_for_speech(text, max_length=500)
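# Dispatch order: a non-empty skill-defined pipeline wins; otherwise the
# default filter caps spoken output via
# get_tts_filter().filter_for_speech(text, max_length=500).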
def _check_injection(self, text: str) -> bool:
"""Check if text contains prompt injection attempt.
Only checks if the active skill has input_validation.block_injection
enabled. When an injection is detected, logs the pattern.
Args:
text: User input text to check
Returns:
True if injection detected (should block), False if safe
"""
if not self.active_skill:
return False
if not self.active_skill.input_validation.get("block_injection", False):
return False
pattern = self.injection_detector.detect(text)
if pattern:
print(f"[Security] Blocked injection attempt: {pattern}")
return True
return False
def get_status(self) -> dict:
"""Get current status for MCP/headless mode."""
return {
"running": self.running,
"mode": self.mode_machine.mode_name,
"turn_state": self.turn_machine.state_name,
"stt_enabled": self.mode_machine.stt_enabled,
"tts_enabled": self.mode_machine.tts_enabled,
"conversation_turns": self.conversation.turn_count,
"skill": self.active_skill.id if self.active_skill else None,
}
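# Illustrative snapshot (hypothetical values):
#   {"running": True, "mode": "full_voice", "turn_state": "idle",
#    "stt_enabled": True, "tts_enabled": True,
#    "conversation_turns": 3, "skill": "hermione"}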
def _handle_voice_command(self, text: str) -> bool:
"""
Check if text is a voice command and handle it.
Voice commands are intercepted BEFORE sending to LLM to avoid
the feedback loop where LLM responds to commands as questions.
Args:
text: Transcribed speech text
Returns:
True if command was handled (don't send to LLM), False otherwise
"""
if not text:
return False
# Normalize for matching
import string
normalized = text.lower().strip()
# Remove punctuation for flexible matching
normalized = normalized.translate(str.maketrans("", "", string.punctuation))
# Check for command matches (longest commands first to avoid substring issues)
# e.g., "unmute" should match before "mute"
sorted_commands = sorted(self.VOICE_COMMANDS.items(), key=lambda x: len(x[0]), reverse=True)
for command, target_mode in sorted_commands:
if command in normalized:
# Found a command - switch mode
result = self.set_voice_mode(target_mode)
mode_name = result.get("mode", target_mode)
# Provide voice feedback if TTS is enabled in new mode
if result.get("tts_enabled"):
feedback_messages = {
"tts_only": "Okay, I'll stop listening.",
"stt_only": "Okay, I'll stop talking.",
"full_voice": "Full voice mode activated.",
"silent": "Going silent.",
}
feedback = feedback_messages.get(mode_name, f"Mode set to {mode_name}")
self.tts.speak(feedback)
self.status = f"Mode: {mode_name}"
return True
return False
# =========================================================================
# Audio callbacks
# =========================================================================
def _on_audio_level(self, level: float, history: list):
"""Callback for real-time audio level updates from microphone."""
self.audio_level = level
self.level_history = history.copy()
# Update UI if we have a live display
if self._live and self.status in ["Listening...", "Recording..."]:
self._live.update(self._build_ui())
def _on_tts_level(self, level: float):
"""Callback for real-time TTS audio level updates."""
self.audio_level = level
# Shift history and add new level
self.level_history.pop(0)
self.level_history.append(level)
# Don't update UI here - let Live's refresh handle it
def load_skill(self, skill_id: str) -> bool:
"""Load and activate a skill."""
skill = self.skill_loader.load_skill(skill_id)
if not skill:
return False
self.active_skill = skill
# Load voice if available
if skill.voice_file:
try:
self.tts.load_voice(skill.voice_file, skill.id)
except Exception as e:
console.print(
f"[yellow]Warning: Could not load custom voice: {e}[/yellow]"
)
self.tts.load_voice(voice_name="default")
else:
self.tts.load_voice(voice_name="default")
# Create output pipeline based on skill config (Phase 3)
self.output_pipeline = create_pipeline_for_skill(skill)
# Reset conversation
self.conversation.clear()
self.llm.reset()
self.llm.set_system_prompt(skill.system_prompt)
return True
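# Usage sketch (assumes a "hermione" skill directory exists under
# config.skills_dir):
#   if not controller.load_skill("hermione"):
#       console.print("Skill not found")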
def show_skill_menu(self) -> Optional[str]:
"""Show interactive skill selection menu with visual cards, full width."""
skills = self.skill_loader.list_skills()
if not skills:
console.print(f"[{Theme.ERROR}]No skills found![/{Theme.ERROR}]")
return None
console.clear()
# Full terminal width
total_width = console.width
# Header - full width panel
header_text = Text(
"Select a Character", style=f"bold {Theme.ACCENT}", justify="center"
)
header_panel = Panel(
Align.center(header_text),
border_style=Theme.PRIMARY,
box=box.ROUNDED,
padding=(1, 4),
expand=True,
)
# Build skill cards
cards = []
for i, skill in enumerate(skills, 1):
is_current = self.active_skill and skill["id"] == self.active_skill.id
border_style = Theme.SUCCESS if is_current else Theme.PRIMARY
content = Text()
content.append(f"{skill['display_name']}\n\n", style=f"bold {Theme.BRIGHT}")
desc = skill["description"]
# Approximate max desc based on half terminal width minus padding
max_desc = (total_width // 2) - 12
if len(desc) > max_desc:
desc = desc[: max_desc - 3] + "..."
content.append(f"{desc}\n\n", style=Theme.MUTED)
if is_current:
content.append("[*] Current", style=f"bold {Theme.SUCCESS}")
else:
content.append(f"[{i}] Select", style=f"{Theme.SECONDARY}")
card = Panel(
content,
border_style=border_style,
box=box.ROUNDED,
height=8,
padding=(1, 2),
expand=True,
)
cards.append(card)
# Arrange cards two per row in an expanding grid (Table.grid with equal column ratios)
row_table = Table.grid(padding=(0, 2), expand=True)
row_table.add_column(ratio=1)
row_table.add_column(ratio=1)
for i in range(0, len(cards), 2):
row_cards = cards[i : i + 2]
if len(row_cards) == 2:
row_table.add_row(row_cards[0], row_cards[1])
else:
row_table.add_row(row_cards[0], Text(""))
# Print full-width layout
console.print()
console.print(header_panel)
console.print()
console.print(row_table)
console.print()
console.print(Align.center(Text("[0] Cancel", style=Theme.MUTED)))
try:
# Manually center the input prompt
# We print this below the layout
console.print()
prompt_text = "Enter number: "
padding = max(0, (console.width - len(prompt_text)) // 2)
prompt = (
" " * padding
+ f"[bold {Theme.ACCENT}]{prompt_text}[/bold {Theme.ACCENT}]"
)
choice = console.input(prompt)
choice_num = int(choice)
if choice_num == 0:
return None
if 1 <= choice_num <= len(skills):
return skills[choice_num - 1]["id"]
except (ValueError, KeyboardInterrupt):
pass
return None
def show_mode_menu(self) -> Optional[str]:
"""Show interactive mode selection menu."""
console.clear()
modes = [
("vad", "Voice Activity", "Auto-detects when you speak"),
("ptt", "Push-to-Talk", "Hold SPACE or RIGHT SHIFT to talk"),
("type", "Type Mode", "Type messages, hear voice responses"),
]
# Header
header_text = Text(
"Select Input Mode", style=f"bold {Theme.ACCENT}", justify="center"
)
header_panel = Panel(
Align.center(header_text),
border_style=Theme.PRIMARY,
box=box.ROUNDED,
padding=(1, 4),
expand=True,
)
console.print()
console.print(header_panel)
console.print()
# Mode options
for i, (mode_id, name, desc) in enumerate(modes, 1):
is_current = config.mode == mode_id
style = f"bold {Theme.SUCCESS}" if is_current else Theme.BRIGHT
marker = "[*]" if is_current else f"[{i}]"
console.print(f" {marker} {name} - {desc}", style=style)
console.print()
console.print(f" [0] Cancel", style=Theme.MUTED)
console.print()
try:
choice = console.input(
f"[bold {Theme.ACCENT}]Enter number: [/bold {Theme.ACCENT}]"
)
choice_num = int(choice)
if choice_num == 0:
return None
if 1 <= choice_num <= len(modes):
return modes[choice_num - 1][0]
except (ValueError, KeyboardInterrupt):
pass
return None
def _build_ui(self) -> Group:
"""Build the main UI with retro ASCII art banner, perfectly centered."""
# Get provider info
provider_info = (
self.llm.get_provider_info()
if hasattr(self.llm, "get_provider_info")
else "LOCAL"
)
skill_name = (
self.active_skill.display_name if self.active_skill else "Voice Assistant"
)
# Full terminal width
ui_width = console.width
# Status mapping
status_styles = {
"Ready": Theme.STATUS_READY,
"Listening...": Theme.STATUS_LISTENING,
"Recording...": Theme.STATUS_RECORDING,
"Transcribing...": Theme.STATUS_TRANSCRIBING,
"Thinking...": Theme.STATUS_THINKING,
"Speaking...": Theme.STATUS_SPEAKING,
}
style_color, icon = status_styles.get(self.status, ("white", "*"))  # ASCII-safe fallback
# Master Table for horizontal centering
main_table = Table.grid(expand=True)
main_table.add_column(justify="center")
# 1. Header Section (Banner & Subtitle)
banner = create_banner()
main_table.add_row(banner)
features = ["MULTI-CHARACTER", "SKILLS", "VAD", "PTT", "MCP"]
platform = "WINDOWS" if sys.platform == "win32" else "LINUX"
subtitle_text = create_subtitle(features)
subtitle_text.append(
f" | RUNNING ON {platform}", style=f"dim {Theme.PRIMARY}"
)
main_table.add_row(subtitle_text)
main_table.add_row(Text("\n"))
# 2. Information Grid (Character & Status)
info_grid = Table.grid(padding=(0, 4))
info_grid.add_column(justify="right", width=20)
info_grid.add_column(justify="left", width=30)
info_grid.add_row(
Text("Character:", style=f"dim {Theme.SECONDARY}"),
Text(skill_name, style=f"bold {Theme.ACCENT}"),
)
info_grid.add_row(
Text("Status:", style=f"dim {Theme.SECONDARY}"),
Text(f"{icon} {self.status}", style=f"bold {style_color}"),
)
main_table.add_row(info_grid)
main_table.add_row(Text("\n"))
# 3. Audio Visualization Panel
if self.status in ["Listening...", "Recording...", "Speaking..."]:
waveform = create_waveform(self.level_history, width=60, height=6)
audio_panel = Panel(
Align.center(waveform),
title=f"[dim {Theme.SECONDARY}]Audio Level[/dim {Theme.SECONDARY}]",
border_style=f"dim {Theme.PRIMARY}",
box=box.ROUNDED,
expand=True,
padding=(0, 2),
)
main_table.add_row(audio_panel)
else:
# Show empty placeholder to maintain layout
main_table.add_row(Text("\n"))
# 4. Conversation History - Centered Panel
conv_list = Text()
recent = (
self.conversation_history[-5:]
if len(self.conversation_history) > 5
else self.conversation_history
)
for role, text_content in recent:
if role == "user":
role_text = Text("You: ", style=f"bold {Theme.SUCCESS}")
content_text = Text(
text_content[:75] + ("..." if len(text_content) > 75 else ""),
style=Theme.SUCCESS,
)
else:
role_text = Text("AI: ", style=f"bold {Theme.PRIMARY}")
content_text = Text(
text_content[:75] + ("..." if len(text_content) > 75 else ""),
style=Theme.SECONDARY,
)
conv_list.append_text(role_text)
conv_list.append_text(content_text)
conv_list.append("\n")
if not recent:
conv_list.append(
"Start speaking to begin...\n", style=f"dim {Theme.SECONDARY}"
)
conv_panel = Panel(
conv_list,
title="[dim]Conversation[/dim]",
border_style=f"dim {Theme.PRIMARY}",
box=box.ROUNDED,
expand=True,
padding=(1, 2),
)
main_table.add_row(conv_panel)
# 5. Shortcuts & Status Bar
shortcuts_text = Text()
shortcuts_text.append("[Q]", style=f"bold {Theme.SECONDARY}")
shortcuts_text.append(" Quit ", style="dim #666666")
shortcuts_text.append("[V]", style=f"bold {Theme.SECONDARY}")
shortcuts_text.append(" Voice ", style="dim #666666")
shortcuts_text.append("[C]", style=f"bold {Theme.SECONDARY}")
shortcuts_text.append(" Clear ", style="dim #666666")
shortcuts_text.append("[ESC]", style=f"bold {Theme.SECONDARY}")
shortcuts_text.append(" Stop", style="dim #666666")
main_table.add_row(shortcuts_text)
status_bar = create_status_bar(
latency_ms=self.latency_ms,
tts_speed=self.tts_speed,
tokens_in=self.tokens_in,
tokens_out=self.tokens_out,
provider=provider_info.split()[0] if provider_info else "LOCAL",
status=self.status.upper().replace(".", ""),
)
status_panel = Panel(
status_bar,
border_style=f"dim {Theme.PRIMARY}",
box=box.ROUNDED,
expand=True,
)
main_table.add_row(status_panel)
return main_table
def _update_status(self, status: str, live: Optional[Live] = None):
"""Update status and refresh UI."""
self.status = status
if live:
live.update(self._build_ui())
def _process_audio_with_ui(self, audio: np.ndarray, live: Live):
"""Process recorded audio with UI updates. Returns True to continue,
False to exit, or "change_voice" to trigger character selection."""
if len(audio) < config.sample_rate * 0.3:
# Return to idle if audio too short
self.turn_machine.safe_send("cancel")
return True
# Transcribe
self._update_status("Transcribing...", live)
text = self.asr.transcribe(audio)
if not text.strip():
self._update_status("Listening...", live)
# Return to idle if no text
self.turn_machine.safe_send("cancel")
return True
# Check for voice commands BEFORE sending to LLM (MODE-03)
if self._handle_voice_command(text):
self.turn_machine.safe_send("cancel")
self._update_status("Listening...", live)
return True # Command handled, don't send to LLM
# Check for injection attempts BEFORE sending to LLM (Phase 3)
if self._check_injection(text):
# Speak warning and return to listening
self._update_status("Speaking...", live)
self.turn_machine.safe_send("start_speaking")
if self._should_speak():
self.tts.speak(
InjectionDetector.WARNING_MESSAGE,
level_callback=self._on_tts_level
)
self.turn_machine.safe_send("finish_speaking")
self._update_status("Listening...", live)
self.turn_machine.safe_send("cancel")
return True # Blocked, don't send to LLM
# Add to conversation
self.conversation.add_turn("user", text)
self._update_status("Thinking...", live)
# Check for exit commands
if text.lower().strip() in ["exit", "quit", "goodbye", "bye", "stop"]:
self._update_status("Speaking...", live)
self.turn_machine.safe_send("start_speaking")
if self._should_speak():
self.tts.speak("Goodbye!", level_callback=self._on_tts_level)
self.turn_machine.safe_send("finish_speaking")
return False
# Check for voice change command
if "change voice" in text.lower() or "change character" in text.lower():
self.turn_machine.safe_send("cancel")
return "change_voice"
# Send to LLM with latency tracking
llm_start = time.time()
response = self.llm.send_message(text)
llm_end = time.time()
# Update metrics via ConversationManager
self.conversation.set_latency(int((llm_end - llm_start) * 1000))
# Add assistant response to conversation
self.conversation.add_turn("assistant", response)
# Filter and speak response (Phase 3 pipeline)
filtered_response = self._filter_for_tts(response)
self._update_status("Speaking...", live)
self.turn_machine.safe_send("start_speaking")
if self._should_speak() and filtered_response:
self.tts.speak(filtered_response, level_callback=self._on_tts_level)
self.turn_machine.safe_send("finish_speaking")
# Small delay to prevent echo pickup
time.sleep(0.3)
self._update_status("Listening...", live)
return True
def on_key_press(self, key):
"""Handle key press."""
try:
if hasattr(key, "char") and key.char:
if key.char.lower() == "q":
self.running = False
return False
elif key.char.lower() == "v":
self.running = "change_voice"
return False
elif key.char.lower() == "c":
self.conversation.clear()
self.llm.reset()
if self.active_skill:
self.llm.set_system_prompt(self.active_skill.system_prompt)
# PTT keys: space or right shift
if key == keyboard.Key.space or key == keyboard.Key.shift_r:
if not self.ptt_pressed:
self.ptt_pressed = True
self.recorder.start_recording()
if key == keyboard.Key.esc:
self.running = False
return False
except AttributeError:
pass
def on_key_release(self, key):
"""Handle key release."""
try:
# PTT keys: space or right shift
if key == keyboard.Key.space or key == keyboard.Key.shift_r:
if self.ptt_pressed:
self.ptt_pressed = False
return "process_audio"
except AttributeError:
pass
def run_vad_mode(self):
"""Run in voice activity detection mode with rich UI."""
console.clear()
self.running = True
# Start keyboard listener in background for hotkeys
def key_listener():
def on_press(key):
result = self.on_key_press(key)
if result is False:
return False
with keyboard.Listener(on_press=on_press) as listener:
listener.join()
key_thread = threading.Thread(target=key_listener, daemon=True)
key_thread.start()
# Greeting
if self.active_skill:
greeting = self.active_skill.metadata.get(
"greeting", f"Hello! {self.active_skill.name} here. How can I help?"
)
else:
greeting = "Voice mode active. How can I help you?"
with Live(
self._build_ui(), refresh_per_second=4, console=console, screen=True
) as live:
# Store reference for real-time waveform updates
self._live = live
self._update_status("Speaking...", live)
# Transition to speaking state for greeting
self.turn_machine.safe_send("start_speaking")
if self._should_speak():
self.tts.speak(greeting, level_callback=self._on_tts_level)
self.turn_machine.safe_send("finish_speaking")
time.sleep(0.3) # Prevent echo pickup
self._update_status("Listening...", live)
while self.running is True:
try:
# Check mode before recording
if not self._should_process_audio():
time.sleep(0.1) # Avoid busy loop when STT disabled
continue
# Transition to listening state
self.turn_machine.safe_send("start_listening")
audio = self.recorder.record_vad()
# Voice detected - transition to processing
self.turn_machine.safe_send("voice_detected")
result = self._process_audio_with_ui(audio, live)
if result == "change_voice":
self.running = "change_voice"
break
elif result is False:
break
except KeyboardInterrupt:
break
# Clear live reference
self._live = None
return self.running # Return state for main loop
def run_ptt_mode(self):
"""Run in push-to-talk mode with rich UI."""
console.clear()
self.running = True
with Live(
self._build_ui(), refresh_per_second=4, console=console, screen=True
) as live:
# Store reference for real-time waveform updates
self._live = live
self._update_status("Ready - Hold SPACE or RIGHT SHIFT to talk", live)
# Greeting
if self.active_skill:
greeting = self.active_skill.metadata.get(
"greeting", f"Hello! {self.active_skill.name} here."
)
else:
greeting = "Push-to-talk mode active."
self._update_status("Speaking...", live)
self.tts.speak(greeting, level_callback=self._on_tts_level)
self._update_status("Ready - Hold SPACE or RIGHT SHIFT to talk", live)
def on_press(key):
self.on_key_press(key)
if self.ptt_pressed:
self._update_status("Recording...", live)
def on_release(key):
result = self.on_key_release(key)
if result == "process_audio":
audio = self.recorder.stop_recording()
self._update_status("Processing...", live)
proc_result = self._process_audio_with_ui(audio, live)
if proc_result is False:
self.running = False
return False
elif proc_result == "change_voice":
self.running = "change_voice"
return False
self._update_status(
"Ready - Hold SPACE or RIGHT SHIFT to talk", live
)
if self.running is not True:
return False
with keyboard.Listener(
on_press=on_press, on_release=on_release
) as listener:
listener.join()
return self.running
def run_type_mode(self):
"""Run in type mode - keyboard input with voice response."""
console.clear()
self.running = True
# Build header info
if self.provider_config:
provider_info = f"{self.provider_config.name}"
if self.provider_config.model:
provider_info += f" ({self.provider_config.model})"
else:
provider_info = "Custom API"
skill_name = (
self.active_skill.name if self.active_skill else "Default Assistant"
)
# Print header
console.print(
Panel(
Text(
"LocalVoiceMode - Type Mode",
style=f"bold {Theme.SUCCESS}",
justify="center",
),
border_style=Theme.BORDER,
box=box.ROUNDED,
)
)
console.print(f"[dim]Provider:[/dim] [bold]{provider_info}[/bold]")
console.print(f"[dim]Character:[/dim] [bold]{skill_name}[/bold]")
console.print()
console.print(
f"[dim]Type your message and press Enter. The assistant will speak the response.[/dim]"
)
console.print(
f"[dim]Commands: 'quit' to exit, 'voice' to change character, 'clear' to reset[/dim]"
)
console.print()
# Greeting
if self.active_skill:
greeting = self.active_skill.metadata.get(
"greeting", f"Hello! {self.active_skill.name} here. How can I help?"
)
else:
greeting = "Type mode active. How can I help you?"
console.print(
f"[bold {Theme.PRIMARY}]{skill_name}:[/bold {Theme.PRIMARY}] {greeting}"
)
self.tts.speak(greeting, level_callback=self._on_tts_level)
console.print()
while self.running:
try:
# Get user input
user_input = console.input(
f"[bold {Theme.SUCCESS}]You:[/bold {Theme.SUCCESS}] "
)
if not user_input.strip():
continue
text = user_input.strip()
# Check for exit commands
if text.lower() in ["exit", "quit", "goodbye", "bye", "stop", "q"]:
console.print(
f"\n[bold {Theme.PRIMARY}]{skill_name}:[/bold {Theme.PRIMARY}] Goodbye!"
)
self.tts.speak("Goodbye!", level_callback=self._on_tts_level)
break
# Check for voice change command
if text.lower() in ["voice", "change voice", "change character", "v"]:
return "change_voice"
# Check for clear command
if text.lower() in ["clear", "reset", "c"]:
self.conversation.clear()
self.llm.reset()
if self.active_skill:
self.llm.set_system_prompt(self.active_skill.system_prompt)
console.print(f"[dim]Conversation cleared.[/dim]\n")
continue
# Check for injection attempts BEFORE sending to LLM (Phase 3)
if self._check_injection(text):
console.print(
f"\n[bold {Theme.ERROR}]{skill_name}:[/bold {Theme.ERROR}] "
f"{InjectionDetector.WARNING_MESSAGE}"
)
self.tts.speak(
InjectionDetector.WARNING_MESSAGE,
level_callback=self._on_tts_level
)
console.print()
continue
# Add to conversation and send to LLM
self.conversation.add_turn("user", text)
# Show thinking indicator
with console.status(f"[{Theme.ACCENT}]Thinking...", spinner="dots"):
llm_start = time.time()
response = self.llm.send_message(text)
llm_end = time.time()
# Update metrics via ConversationManager
self.conversation.set_latency(int((llm_end - llm_start) * 1000))
# Add assistant response
self.conversation.add_turn("assistant", response)
# Print and speak response (filter through pipeline for TTS)
console.print(
f"\n[bold {Theme.PRIMARY}]{skill_name}:[/bold {Theme.PRIMARY}] {response}"
)
console.print(f"[dim]({self.latency_ms}ms)[/dim]\n")
filtered_response = self._filter_for_tts(response)
if filtered_response:
self.tts.speak(filtered_response, level_callback=self._on_tts_level)
except KeyboardInterrupt:
console.print("\n[dim]Interrupted[/dim]")
break
except EOFError:
break
return self.running
def run_headless(self):
"""Run in headless mode for MCP integration - no UI, just voice."""
self.running = True
# Greeting
if self.active_skill:
greeting = self.active_skill.metadata.get(
"greeting", f"Hello! {self.active_skill.name} here. How can I help?"
)
else:
greeting = "Voice mode active."
self.tts.speak(greeting)
while self.running:
try:
audio = self.recorder.record_vad()
if len(audio) < config.sample_rate * 0.3:
continue
text = self.asr.transcribe(audio)
if not text.strip():
continue
# Check for voice commands BEFORE sending to LLM (MODE-03)
if self._handle_voice_command(text):
continue # Command handled, don't send to LLM
# Check for injection attempts BEFORE sending to LLM (Phase 3)
if self._check_injection(text):
self.tts.speak(InjectionDetector.WARNING_MESSAGE)
continue # Blocked, don't send to LLM
# Check for exit commands
lower_text = text.lower().strip()
if lower_text in [
"exit",
"quit",
"goodbye",
"bye",
"stop",
"stop voice",
"end voice",
]:
self.tts.speak("Goodbye!")
break
# Check for skill change
if "change voice" in lower_text or "change character" in lower_text:
# List available skills
skills = self.skill_loader.list_skills()
skill_names = ", ".join([s["name"] for s in skills])
self.tts.speak(
f"Available characters are: {skill_names}. Say the name of who you want to talk to."
)
continue
# Check if user is selecting a skill by name
switched = False
for skill in self.skill_loader.list_skills():
if (
skill["name"].lower() in lower_text
or skill["id"].lower() in lower_text
):
self.load_skill(skill["id"])
self.tts.speak(f"Switched to {skill['name']}.")
switched = True
break
if switched:
continue  # Skill switched; don't send the selection phrase to the LLM
# Normal conversation
self.conversation.add_turn("user", text)
response = self.llm.send_message(text)
self.conversation.add_turn("assistant", response)
# Filter response through pipeline before speaking (Phase 3)
filtered_response = self._filter_for_tts(response)
if filtered_response:
self.tts.speak(filtered_response)
except KeyboardInterrupt:
break
def show_startup_menu(self) -> bool:
"""Show startup menu with character and mode selection. Returns False to quit."""
console.clear()
# Show banner
console.print(create_banner())
console.print()
# Current settings
skill_name = (
self.active_skill.display_name if self.active_skill else "Voice Assistant"
)
mode_names = {
"vad": "Voice Activity",
"ptt": "Push-to-Talk",
"type": "Type Mode",
}
mode_name = mode_names.get(config.mode, config.mode)
console.print(
f" Character: [bold {Theme.ACCENT}]{skill_name}[/bold {Theme.ACCENT}]"
)
console.print(f" Mode: [bold {Theme.ACCENT}]{mode_name}[/bold {Theme.ACCENT}]")
console.print()
# Menu options
console.print(f" [1] Start Voice Chat", style=f"bold {Theme.SUCCESS}")
console.print(f" [2] Change Character", style=Theme.BRIGHT)
console.print(f" [3] Change Input Mode", style=Theme.BRIGHT)
console.print(f" [0] Quit", style=Theme.MUTED)
console.print()
try:
choice = console.input(
f"[bold {Theme.ACCENT}]Enter number: [/bold {Theme.ACCENT}]"
)
choice_num = int(choice)
if choice_num == 0:
return False
elif choice_num == 1:
return True # Start voice chat
elif choice_num == 2:
new_skill = self.show_skill_menu()
if new_skill:
self.load_skill(new_skill)
return None # Show menu again
elif choice_num == 3:
new_mode = self.show_mode_menu()
if new_mode:
config.mode = new_mode
return None # Show menu again
except (ValueError, KeyboardInterrupt):
return False
return None
def run(self):
"""Run voice mode with interactive menu."""
# Headless mode - no UI
if config.headless:
self.run_headless()
return
# Show startup menu
while True:
result = self.show_startup_menu()
if result is True:
break # Start voice chat
elif result is False:
console.clear()
console.print("\n[bold cyan]Goodbye![/bold cyan]\n")
return
# result is None means show menu again
while True:
# Run the appropriate mode
if config.mode == "ptt":
result = self.run_ptt_mode()
elif config.mode == "type":
result = self.run_type_mode()
else:
result = self.run_vad_mode()
# Check if we should change voice
if result == "change_voice":
new_skill = self.show_skill_menu()
if new_skill:
self.load_skill(new_skill)
console.clear()
continue
else:
break
console.clear()
console.print("\n[bold cyan]Thanks for using LocalVoiceMode![/bold cyan]\n")
# ============================================================================
# Main
# ============================================================================
def main():
parser = argparse.ArgumentParser(
description="LocalVoiceMode - Voice Chat with Character Skills"
)
parser.add_argument(
"--skill", "-s", help="Skill/character to load (e.g., 'hermione')"
)
parser.add_argument(
"--list-skills", "-l", action="store_true", help="List available skills"
)
parser.add_argument(
"--list-providers", action="store_true", help="List available LLM providers"
)
parser.add_argument(
"--mode",
"-m",
choices=["vad", "ptt", "type"],
default="vad",
help="Input mode: vad (voice activity), ptt (push-to-talk), or type (keyboard input with voice response)",
)
# Provider settings
parser.add_argument(
"--provider",
"-p",
choices=["lm_studio", "openrouter", "openai"],
help="Force specific provider (default: auto-detect)",
)
# LLM API settings (manual override)
parser.add_argument(
"--api-url", help="OpenAI-compatible API URL (overrides auto-detection)"
)
parser.add_argument("--api-key", help="API key for the LLM service")
parser.add_argument(
"--model",
help="Model name to use (e.g., 'gpt-4', 'claude-sonnet-4-20250514', 'llama-3')",
)
# ASR device settings
parser.add_argument(
"--device",
default="cuda",
choices=["cuda", "cpu"],
help="Device for ASR model (cuda for GPU with TensorRT/CUDA, cpu for CPU)",
)
# Headless mode for MCP integration
parser.add_argument(
"--headless", action="store_true", help="Run without UI (for MCP integration)"
)
args = parser.parse_args()
# Update config
config.mode = args.mode
config.device = args.device
config.headless = args.headless
# List providers
if args.list_providers:
console.print(
f"\n[bold {Theme.PRIMARY}]LLM Provider Status:[/bold {Theme.PRIMARY}]"
)
console.print("-" * 50)
console.print(ProviderManager.get_status_report())
console.print()
return
# List skills (non-interactive)
if args.list_skills:
loader = SkillLoader(config.skills_dir, config.voice_refs_dir)
skills = loader.list_skills()
console.print(
f"\n[bold {Theme.PRIMARY}]Available Skills:[/bold {Theme.PRIMARY}]"
)
console.print("-" * 50)
if not skills:
console.print(
f"[{Theme.ERROR}]No skills found in {config.skills_dir}[/{Theme.ERROR}]"
)
else:
for skill in skills:
console.print(f" {skill['display_name']}")
console.print(f" [{Theme.MUTED}]ID: {skill['id']}[/{Theme.MUTED}]")
console.print(
f" [{Theme.MUTED}]{skill['description'][:60]}...[/{Theme.MUTED}]"
)
console.print()
return
# Determine provider
provider_config = None
force_provider = args.provider or os.environ.get("VOICE_PROVIDER")
if args.api_url:
# Manual override - create a provider config from args
model = args.model
# If no model specified, try to detect from the API
if not model:
try:
api_url = args.api_url.rstrip("/")
resp = httpx.get(f"{api_url}/models", timeout=2.0)
if resp.status_code == 200:
data = resp.json()
if data.get("data"):
model = data["data"][0].get("id")
except Exception:
pass # Will use default model handling
provider_config = ProviderConfig(
type=ProviderType.LM_STUDIO, # Treat manual URL as generic OpenAI-compatible
name="Custom API",
api_url=args.api_url,
api_key=args.api_key,
model=model,
available=True,
)
else:
# Auto-detect provider (verbose to show what's happening)
provider_config = ProviderManager.get_best_provider(
force_provider, verbose=True
)
if not provider_config:
console.print(
f"\n[{Theme.ERROR}]No LLM provider available![/{Theme.ERROR}]"
)
console.print()
console.print(f"[{Theme.MUTED}]Options:[/{Theme.MUTED}]")
console.print(f" 1. Start LM Studio with local server enabled")
console.print(f" 2. Set OPENROUTER_API_KEY environment variable")
console.print(f" 3. Set OPENAI_API_KEY environment variable")
console.print(f" 4. Use --api-url to specify a custom endpoint")
console.print()
return
# Override model if specified
if args.model:
provider_config.model = args.model
# Headless mode - minimal output, for MCP integration
if config.headless:
controller = VoiceModeController(provider_config=provider_config)
skill = args.skill or "assistant"
controller.load_skill(skill)
try:
controller.run()
except KeyboardInterrupt:
pass
return
# Show startup banner
console.clear()
console.print()
banner_content = Text()
banner_content.append("LocalVoiceMode\n", style=f"bold {Theme.PRIMARY}")
banner_content.append(f"Provider: {provider_config.name}\n", style=Theme.MUTED)
if provider_config.model:
banner_content.append(f"Model: {provider_config.model}\n", style=Theme.MUTED)
asr_model = f"Parakeet TDT 0.6B v3 on {config.device.upper()}"
banner_content.append(f"ASR: {asr_model}", style=Theme.MUTED)
console.print(
Panel.fit(
banner_content,
border_style=Theme.BORDER,
title=f"[{Theme.BRIGHT}] Voice Mode [{Theme.BRIGHT}]",
)
)
console.print()
# Create controller with detected provider
controller = VoiceModeController(provider_config=provider_config)
# Load skill - either from args or show menu
if args.skill:
if not controller.load_skill(args.skill):
console.print(
f"[{Theme.ERROR}]Failed to load skill: {args.skill}[/{Theme.ERROR}]"
)
console.print(
f"[{Theme.MUTED}]Use --list-skills to see available skills[/{Theme.MUTED}]"
)
return
else:
# Show interactive skill selection
skill_id = controller.show_skill_menu()
if skill_id:
controller.load_skill(skill_id)
else:
# Default to assistant if cancelled
controller.load_skill("assistant")
# Run
try:
controller.run()
except KeyboardInterrupt:
console.clear()
console.print("\n[bold cyan]Goodbye![/bold cyan]\n")
if __name__ == "__main__":
main()