# Speech MCP
# by Kvadratni
# - backup
# - ui
import os
import sys
import json
import time
import threading
import logging
import tempfile
import io
from queue import Queue
import sysconfig
# Derive PYTHONPATH and the Tcl/Tk library locations from the running
# interpreter's stdlib directory. These are set before `import tkinter` below.
# NOTE(review): presumably needed so a bundled/relocated interpreter can find
# Tcl/Tk 8.6 — confirm the "tcl8.6"/"tk8.6" layout on the target platforms.
os.environ["PYTHONPATH"] = os.path.join(sysconfig.get_paths()['stdlib'], 'site-packages')
os.environ["TCL_LIBRARY"] = os.path.join(sysconfig.get_paths()['stdlib'], '..', 'tcl8.6')
os.environ["TK_LIBRARY"] = os.path.join(sysconfig.get_paths()['stdlib'], '..', 'tk8.6')
import tkinter as tk
# Set up logging: one file next to the package plus stdout.
log_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "speech-mcp-ui.log")
logging.basicConfig(
    # NOTE(review): the level is not visible in this copy of the file; DEBUG
    # keeps the many logger.debug() calls in this module effective — confirm.
    level=logging.DEBUG,
    format='%(levelname)s: %(message)s',  # Very simple format for easier parsing
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout),  # Explicitly use stdout
    ],
)
logger = logging.getLogger(__name__)
# Path to audio notification files: <this file's directory>/../resources/audio
AUDIO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "resources", "audio")
# WAV played by start_listening() when recording begins
START_LISTENING_SOUND = os.path.join(AUDIO_DIR, "start_listening.wav")
# WAV played by stop_listening() after the input stream is closed
STOP_LISTENING_SOUND = os.path.join(AUDIO_DIR, "stop_listening.wav")
# Import other dependencies
import numpy as np
import wave
import pyaudio
# For playing notification sounds
def play_audio_file(file_path):
    """Play a WAV file using PyAudio, blocking until playback finishes.

    Playback is best-effort: a missing file or any playback error is logged
    and swallowed so a notification sound can never break the caller.

    Args:
        file_path: Path of the WAV file to play.
    """
    if not os.path.exists(file_path):
        logger.error(f"Audio file not found: {file_path}")
        return

    logger.debug(f"Playing audio notification: {file_path}")
    try:
        # Open the wave file
        with wave.open(file_path, 'rb') as wf:
            # Create PyAudio instance
            p = pyaudio.PyAudio()
            try:
                # Open an output stream matching the file's own format
                stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                                channels=wf.getnchannels(),
                                rate=wf.getframerate(),
                                output=True)
                try:
                    # Read data in chunks and play
                    chunk_size = 1024
                    data = wf.readframes(chunk_size)
                    while data:
                        stream.write(data)
                        data = wf.readframes(chunk_size)
                finally:
                    # Close the stream even if playback failed part-way
                    stream.stop_stream()
                    stream.close()
            finally:
                # Always release PortAudio resources
                p.terminate()
        logger.debug("Audio notification played successfully")
    except Exception as e:
        logger.error(f"Error playing audio notification: {e}")
# For text-to-speech
def _init_pyttsx3_fallback():
    """Initialize pyttsx3 as the fallback text-to-speech engine.

    Returns:
        A ``(engine, available)`` tuple: the pyttsx3 engine and True on
        success, or ``(None, False)`` when pyttsx3 is missing or fails to
        initialize (speech is then simulated by the caller).
    """
    try:
        import pyttsx3
        engine = pyttsx3.init()
        logger.info("pyttsx3 text-to-speech engine initialized as fallback")
        print("pyttsx3 text-to-speech engine initialized as fallback!")

        # Log available voices
        voices = engine.getProperty('voices')
        logger.debug(f"Available pyttsx3 voices: {len(voices)}")
        for i, voice in enumerate(voices):
            logger.debug(f"Voice {i}: {voice.id} - {voice.name}")
        return engine, True
    except ImportError as e:
        logger.warning(f"pyttsx3 not available: {e}. Text-to-speech will be simulated.")
        print("WARNING: pyttsx3 not available. Text-to-speech will be simulated.")
        return None, False
    except Exception as e:
        logger.error(f"Error initializing text-to-speech engine: {e}")
        print(f"WARNING: Error initializing text-to-speech: {e}. Text-to-speech will be simulated.")
        return None, False


# Defined up front so both names exist whichever branch below runs.
tts_engine = None
tts_available = False

# Always prioritize Kokoro as the primary TTS engine if available
try:
    print("Initializing Kokoro as primary TTS engine...")
    logger.info("Initializing Kokoro as primary TTS engine")

    # Import and initialize Kokoro adapter
    from speech_mcp.tts_adapters.kokoro_adapter import KokoroTTS

    try:
        # Initialize with Kokoro voice settings
        tts_engine = KokoroTTS(voice="af_heart", lang_code="a", speed=1.0)
        tts_available = True
        logger.info("Kokoro TTS adapter initialized successfully as primary TTS engine")
        print("Kokoro TTS adapter initialized successfully as primary TTS engine!")

        # Log available voices
        voices = tts_engine.get_available_voices()
        logger.debug(f"Available Kokoro TTS voices: {len(voices)}")
        for i, voice in enumerate(voices):
            logger.debug(f"Voice {i}: {voice}")
        print(f"Available Kokoro voices: {', '.join(voices[:5])}{' and more...' if len(voices) > 5 else ''}")
    except ImportError as e:
        # If the adapter is available but Kokoro itself is not installed
        logger.warning(f"Kokoro package not available: {e}. Falling back to pyttsx3.")
        print("WARNING: Kokoro package not available. Falling back to pyttsx3.")
        raise ImportError("Kokoro package not installed") from e
except ImportError as e:
    logger.warning(f"Kokoro adapter not available: {e}. Falling back to pyttsx3.")
    print("WARNING: Kokoro adapter not available. Falling back to pyttsx3.")
    # Fall back to pyttsx3
    tts_engine, tts_available = _init_pyttsx3_fallback()
except Exception as e:
    logger.error(f"Error initializing Kokoro TTS adapter: {e}")
    print(f"WARNING: Error initializing Kokoro TTS adapter: {e}. Falling back to pyttsx3.")
    # Fall back to pyttsx3
    tts_engine, tts_available = _init_pyttsx3_fallback()
# These will be imported later when needed
whisper_loaded = False
speech_recognition_loaded = False
# Path to save speech state - same as in server.py
STATE_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "speech_state.json")
TRANSCRIPTION_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "transcription.txt")
RESPONSE_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "response.txt")
COMMAND_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "ui_command.txt")
# Audio parameters
CHUNK = 1024
FORMAT = pyaudio.paInt16
# start_listening() references CHANNELS, which was not defined anywhere in
# this copy of the file.
# NOTE(review): 1 (mono) is the conventional value for 16 kHz speech capture — confirm.
CHANNELS = 1
RATE = 16000
# Import optional dependencies when needed
def load_whisper():
    """Import faster-whisper, falling back to SpeechRecognition if it is missing.

    Sets the module-level ``whisper_loaded`` flag and binds the imported
    module to the module-level name ``whisper`` on success.

    Returns:
        True if faster-whisper was imported; otherwise whatever
        ``load_speech_recognition()`` returns.
    """
    global whisper_loaded
    global whisper
    try:
        print("Loading faster-whisper speech recognition model... This may take a moment.")
        import faster_whisper
        # Bind the declared global; previously `global whisper` was declared
        # but never assigned.
        whisper = faster_whisper
        whisper_loaded = True
        logger.info("faster-whisper successfully loaded")
        print("faster-whisper speech recognition model loaded successfully!")
        return True
    except ImportError as e:
        logger.error(f"Failed to load faster-whisper: {e}")
        print(f"ERROR: Failed to load faster-whisper module: {e}")
        print("Trying to fall back to SpeechRecognition library...")
        return load_speech_recognition()
def load_speech_recognition():
    """Import the SpeechRecognition library as the module-level name ``sr``.

    Sets the module-level ``speech_recognition_loaded`` flag on success.

    Returns:
        True if the import succeeded, False otherwise.
    """
    global speech_recognition_loaded
    global sr
    try:
        import speech_recognition as sr
        speech_recognition_loaded = True
        logger.info("SpeechRecognition successfully loaded")
        print("SpeechRecognition library loaded successfully!")
        return True
    except ImportError as e:
        logger.error(f"Failed to load SpeechRecognition: {e}")
        print(f"ERROR: Failed to load SpeechRecognition module: {e}")
        print("Please install it with: pip install SpeechRecognition")
        return False
class SimpleSpeechProcessorUI:
"""A speech processor UI that shows status and audio waveform visualization"""
def __init__(self, root):
    """Build the window, select an input device and start the background workers.

    Args:
        root: The Tk root window that hosts the UI.
    """
    self.root = root
    self.root.title("Speech MCP - Voice Interface")
    self.root.geometry("500x500")  # Square shape for better circular visualization

    # Initialize basic components
    print("Initializing speech processor...")
    logger.info("Initializing speech processor UI")
    self.ui_active = True
    self.listening = False
    self.speaking = False
    self.last_transcript = ""
    self.last_response = ""
    self.should_update = True
    self.stream = None

    # Create initial command file to indicate UI is ready
    try:
        with open(COMMAND_FILE, 'w') as f:
            # NOTE(review): payload inferred from the log message below — confirm.
            f.write("UI_READY")
        logger.info("Created initial UI_READY command file")
    except Exception as e:
        logger.error(f"Error creating initial command file: {e}")

    # Audio visualization parameters
    self.waveform_data = []
    self.waveform_max_points = 100  # Number of points to display in waveform
    self.waveform_update_interval = 50  # Update interval in milliseconds

    # Defaults so later code sees defined attributes even if audio setup fails
    self.p = None
    self.selected_device_index = None
    self.selected_device_info = None

    # Initialize PyAudio with explicit device selection
    print("Initializing audio system...")
    logger.info("Initializing PyAudio system")
    try:
        self.p = pyaudio.PyAudio()

        # Log audio device information
        logger.info(f"PyAudio version: {pyaudio.get_portaudio_version()}")

        # Get all available audio devices
        info = self.p.get_host_api_info_by_index(0)
        numdevices = info.get('deviceCount')
        logger.info(f"Found {numdevices} audio devices:")

        # Find the best input device
        selected_device = None
        selected_device_index = None
        for i in range(numdevices):
            try:
                device_info = self.p.get_device_info_by_host_api_device_index(0, i)
                device_name = device_info.get('name')
                max_input_channels = device_info.get('maxInputChannels')
                logger.info(f"Device {i}: {device_name}")
                logger.info(f"  Max Input Channels: {max_input_channels}")
                logger.info(f"  Default Sample Rate: {device_info.get('defaultSampleRate')}")

                # Only consider input devices
                if max_input_channels > 0:
                    print(f"Found input device: {device_name}")
                    # Prefer non-default devices as they're often external mics
                    if not selected_device or 'default' not in device_name.lower():
                        selected_device = device_info
                        selected_device_index = i
            except Exception as e:
                logger.warning(f"Error checking device {i}: {e}")

        if not selected_device:
            raise Exception("No suitable input device found")

        logger.info(f"Selected input device: {selected_device['name']} (index {selected_device_index})")
        print(f"Using input device: {selected_device['name']}")

        # Store the selected device info for later use
        self.selected_device_index = selected_device_index
        self.selected_device_info = selected_device
    except Exception as e:
        logger.error(f"Error initializing PyAudio: {e}", exc_info=True)
        print(f"ERROR: Failed to initialize audio system: {e}")
        # Show error in UI. The text is built now because `e` is unbound once
        # this handler exits, while the callback only runs later in the Tk loop
        # (by which time status_label exists).
        audio_error_text = f"Audio Error: {str(e)[:30]}..."
        self.root.after(0, lambda: self.status_label.config(text=audio_error_text))

    # Create the UI components
    # Main frame
    self.main_frame = tk.Frame(self.root)
    self.main_frame.pack(expand=True, fill="both", padx=10, pady=10)

    # Status label
    # NOTE(review): parent and initial text were lost from this copy; reconstructed.
    self.status_label = tk.Label(
        self.main_frame,
        text="Initializing...",
        font=('Arial', 16)
    )
    self.status_label.pack(fill="x", pady=(0, 10))

    # Waveform canvas
    self.waveform_frame = tk.Frame(self.main_frame, bg="#f0f0f0")
    self.waveform_frame.pack(expand=True, fill="both")
    # NOTE(review): the canvas styling options were lost from this copy; only
    # the parent is passed here — restore bg/highlight options if known.
    self.waveform_canvas = tk.Canvas(self.waveform_frame)
    self.waveform_canvas.pack(expand=True, fill="both", padx=5, pady=5)

    # Load speech state
    self.load_speech_state()

    # Load whisper in a background thread
    print("Checking for speech recognition module...")
    threading.Thread(target=self.initialize_speech_recognition, daemon=True).start()

    # Start threads for monitoring state changes
    self.update_thread = threading.Thread(target=self.check_for_updates)
    self.update_thread.daemon = True
    self.update_thread.start()

    # Start thread for checking response file
    self.response_thread = threading.Thread(target=self.check_for_responses)
    self.response_thread.daemon = True
    self.response_thread.start()

    # Handle window close event
    root.protocol("WM_DELETE_WINDOW", self.on_close)

    # Initialize UI to a proper state
    self.root.after(100, self.update_ui_from_state)
    print("Speech processor initialization complete!")
    logger.info("Speech processor initialized successfully")

    # Update the waveform to show the initial state
    self.root.after(200, self.update_waveform)
def initialize_speech_recognition(self):
    """Initialize speech recognition; intended to run in a background thread.

    Imports the recognition backend via ``load_whisper()``, then loads the
    faster-whisper "base" model on CPU into ``self.whisper_model``. Progress
    and errors are shown on the status label via ``root.after``.
    """
    if not load_whisper():
        self.root.after(0, lambda: self.status_label.config(
            text="WARNING: Speech recognition not available"
        ))
        return

    # Load the faster-whisper model
    try:
        self.root.after(0, lambda: self.status_label.config(
            text="Loading faster-whisper model..."
        ))

        # Import here to avoid circular imports
        import faster_whisper

        # Using CPU as default for compatibility
        self.whisper_model = faster_whisper.WhisperModel("base", device="cpu", compute_type="int8")

        # NOTE(review): the success text was lost from this copy; "Ready" is a placeholder.
        self.root.after(0, lambda: self.status_label.config(
            text="Ready"
        ))
        logger.info("faster-whisper model loaded successfully")
    except Exception as e:
        logger.error(f"Error loading faster-whisper model: {e}")
        # Build the text now: `e` is unbound after this handler exits, but the
        # scheduled callback runs later.
        error_text = f"Error loading model: {e}"
        self.root.after(0, lambda: self.status_label.config(
            text=error_text
        ))
def load_speech_state(self):
    """Load the speech state from the file shared with the server.

    Reads STATE_FILE as JSON and copies its fields onto the instance. When
    the file does not exist, or reading/parsing fails (the error is logged),
    the default state is applied instead.
    """
    try:
        if os.path.exists(STATE_FILE):
            with open(STATE_FILE, 'r') as f:
                state = json.load(f)
            self.ui_active = state.get("ui_active", False)
            self.listening = state.get("listening", False)
            self.speaking = state.get("speaking", False)
            self.last_transcript = state.get("last_transcript", "")
            self.last_response = state.get("last_response", "")
            return
    except Exception as e:
        logger.error(f"Error loading speech state: {e}")

    # Default state if the file doesn't exist or could not be loaded
    self.ui_active = True
    self.listening = False
    self.speaking = False
    self.last_transcript = ""
    self.last_response = ""
def save_speech_state(self):
    """Save the speech state to the file shared with the server.

    Writes the current flags and last transcript/response to STATE_FILE as
    JSON. Errors are logged and not raised.
    """
    try:
        state = {
            "ui_active": self.ui_active,
            "listening": self.listening,
            "speaking": self.speaking,
            "last_transcript": self.last_transcript,
            "last_response": self.last_response,
        }
        with open(STATE_FILE, 'w') as f:
            json.dump(state, f)
    except Exception as e:
        logger.error(f"Error saving speech state: {e}")
def update_ui_from_state(self):
    """Update the UI to reflect the current speech state.

    Listening: refresh the visualization and open the microphone if no
    stream is open. Speaking: refresh the visualization. Otherwise (idle):
    refresh the visualization and close any stream that is still open.
    All work is scheduled on the Tk event loop via ``root.after``.
    """
    # NOTE(review): lines appear to be missing from each branch in this copy
    # (possibly status-label updates); only the visible actions are kept.
    stream_open = getattr(self, 'stream', None) is not None

    if self.listening:
        # Start visualization if not already running
        self.root.after(0, self.update_waveform)
        # Start listening if not already started
        if not stream_open:
            self.root.after(0, self.start_listening)
    elif self.speaking:
        # Start visualization for speaking
        self.root.after(0, self.update_waveform)
    else:
        # Update visualization to show idle state
        self.root.after(0, self.update_waveform)
        # Stop listening if still active
        if stream_open:
            self.root.after(0, self.stop_listening)

    # Always schedule another waveform update to ensure the UI stays responsive
    self.root.after(self.waveform_update_interval * 2, self.update_waveform)
def update_waveform(self):
    """Update the circular audio visualization on the canvas.

    Clears the canvas and draws a background circle. While listening or
    speaking it draws an amplitude-scaled circle (blue/green), a microphone
    or speaker icon and three pulsing rings, then reschedules itself every
    ``waveform_update_interval`` ms. When idle it draws a "ready" symbol.
    On error it logs and retries after twice the interval.

    NOTE(review): the canvas ``create_*`` call lines were missing from this
    copy of the file; the shape used for each coordinate group is inferred
    from the adjacent comments and keyword arguments.
    """
    try:
        canvas = self.waveform_canvas

        # Get canvas dimensions
        canvas_width = canvas.winfo_width()
        canvas_height = canvas.winfo_height()
        if canvas_width <= 1 or canvas_height <= 1:
            # Canvas not yet properly sized
            self.root.after(100, self.update_waveform)
            return

        # Clear the canvas
        canvas.delete("all")

        # Calculate center point
        center_x = canvas_width / 2
        center_y = canvas_height / 2

        # Draw background circle
        background_radius = min(canvas_width, canvas_height) * 0.4
        canvas.create_oval(
            center_x - background_radius, center_y - background_radius,
            center_x + background_radius, center_y + background_radius,
            outline="#e0e0e0", width=2, fill="#f8f8f8"
        )

        if self.listening or self.speaking:
            # Get current amplitude (most recent value, 0 when no data yet)
            current_amplitude = 0
            if hasattr(self, 'waveform_data') and len(self.waveform_data) > 0:
                current_amplitude = self.waveform_data[-1]

            # Scale the amplitude (typically 0-0.5) to a radius between 30%
            # and 90% of the background circle, clamped to that range.
            min_radius = background_radius * 0.3
            max_radius = background_radius * 0.9
            radius = min_radius + (current_amplitude * (max_radius - min_radius) * 4)
            radius = max(min_radius, min(radius, max_radius))

            # Draw the amplitude circle
            fill_color = "#4287f5" if self.listening else "#42f587"  # Blue for listening, green for speaking
            canvas.create_oval(
                center_x - radius, center_y - radius,
                center_x + radius, center_y + radius,
                outline="", fill=fill_color
            )

            # Draw inner circle (white)
            inner_radius = min_radius * 0.8
            canvas.create_oval(
                center_x - inner_radius, center_y - inner_radius,
                center_x + inner_radius, center_y + inner_radius,
                outline="", fill="white"
            )

            # Add icon based on state
            if self.listening:
                # Draw microphone icon (simple representation)
                mic_width = inner_radius * 0.6
                mic_height = inner_radius * 1.2
                # Microphone body
                canvas.create_rectangle(
                    center_x - mic_width/2, center_y - mic_height/2,
                    center_x + mic_width/2, center_y + mic_height/4,
                    fill="#555555", outline=""
                )
                # Microphone top (rounded)
                canvas.create_oval(
                    center_x - mic_width/2, center_y - mic_height/2 - mic_width/2,
                    center_x + mic_width/2, center_y - mic_height/2 + mic_width/2,
                    fill="#555555", outline=""
                )
                # Stand
                canvas.create_rectangle(
                    center_x - mic_width/6, center_y + mic_height/4,
                    center_x + mic_width/6, center_y + mic_height/2,
                    fill="#555555", outline=""
                )
                # Base
                canvas.create_rectangle(
                    center_x - mic_width/2, center_y + mic_height/2 - mic_width/6,
                    center_x + mic_width/2, center_y + mic_height/2 + mic_width/6,
                    fill="#555555", outline=""
                )
            else:
                # Draw speaker icon for speaking
                speaker_size = inner_radius * 0.7
                # Speaker body
                canvas.create_rectangle(
                    center_x - speaker_size/2, center_y - speaker_size/2,
                    center_x - speaker_size/6, center_y + speaker_size/2,
                    fill="#555555", outline=""
                )
                # Speaker cone
                points = [
                    center_x - speaker_size/6, center_y - speaker_size/2,  # Top left
                    center_x + speaker_size/2, center_y - speaker_size,  # Top right
                    center_x + speaker_size/2, center_y + speaker_size,  # Bottom right
                    center_x - speaker_size/6, center_y + speaker_size/2  # Bottom left
                ]
                canvas.create_polygon(points, fill="#555555", outline="")
                # Sound waves (3 arcs)
                for i in range(1, 4):
                    arc_size = speaker_size * (0.5 + i * 0.25)
                    canvas.create_arc(
                        center_x, center_y - arc_size/2,
                        center_x + arc_size, center_y + arc_size/2,
                        start=300, extent=120,
                        style="arc", outline="#555555", width=2
                    )

            # Draw pulsing rings: the counter starts at 0 and wraps back to 0
            # after reaching 100.
            self.pulse_count = (getattr(self, 'pulse_count', -1) + 1) % 101

            # Create 3 pulsing rings, each a third of a cycle apart
            for i in range(3):
                pulse_phase = (self.pulse_count + i * 33) % 100
                if pulse_phase < 70:  # Only show rings during part of the cycle
                    # Calculate ring size based on pulse phase
                    ring_size = background_radius * (0.5 + pulse_phase / 70)
                    # NOTE(review): this gray level goes from white (phase 0)
                    # towards black as the ring expands; on a light background
                    # that darkens rather than fades the ring — confirm intent.
                    opacity = int(255 * (1 - pulse_phase / 70))
                    ring_color = f"#{opacity:02x}{opacity:02x}{opacity:02x}"
                    canvas.create_oval(
                        center_x - ring_size, center_y - ring_size,
                        center_x + ring_size, center_y + ring_size,
                        outline=ring_color, width=1, fill=""
                    )
        else:
            # Draw a standby/ready icon in the center
            ready_radius = background_radius * 0.3
            canvas.create_oval(
                center_x - ready_radius, center_y - ready_radius,
                center_x + ready_radius, center_y + ready_radius,
                outline="#cccccc", width=2, fill="#f0f0f0"
            )
            # Draw a simple "ready" symbol (play button)
            triangle_size = ready_radius * 0.8
            points = [
                center_x - triangle_size/2, center_y - triangle_size,
                center_x - triangle_size/2, center_y + triangle_size,
                center_x + triangle_size, center_y
            ]
            canvas.create_polygon(points, fill="#cccccc", outline="")

        # Schedule the next update if listening or speaking
        if self.listening or self.speaking:
            self.root.after(self.waveform_update_interval, self.update_waveform)
    except Exception as e:
        logger.error(f"Error updating visualization: {e}", exc_info=True)
        # Try again after a delay
        self.root.after(self.waveform_update_interval * 2, self.update_waveform)
def process_audio_for_visualization(self, audio_data):
    """Append the mean absolute amplitude of an audio chunk to the waveform data.

    Args:
        audio_data: Raw bytes of 16-bit integer samples.

    Only the most recent ``waveform_max_points`` values are kept. Errors are
    logged and not raised.
    """
    try:
        # Convert to numpy array
        data = np.frombuffer(audio_data, dtype=np.int16)
        if data.size == 0:
            # Nothing to measure; the mean of an empty array would be NaN
            return

        # Normalize the data to range [-1, 1] and take the mean magnitude
        normalized = data.astype(float) / 32768.0
        amplitude = np.abs(normalized).mean()

        # Add to waveform data
        self.waveform_data.append(amplitude)

        # Keep only the most recent points
        if len(self.waveform_data) > self.waveform_max_points:
            self.waveform_data = self.waveform_data[-self.waveform_max_points:]
    except Exception as e:
        logger.error(f"Error processing audio for visualization: {e}", exc_info=True)
def start_listening(self):
    """Start listening for audio input.

    Plays the start sound, opens a callback-driven input stream on the
    selected device, checks that audio is arriving and starts the silence
    detector thread. On any failure the error is logged/printed, a
    half-opened stream is closed and ``self.listening`` is cleared.

    NOTE(review): the ``p.open(...)`` arguments, the frame append in the
    callback and the wait before the input test were missing from this copy
    of the file and are reconstructed from the surrounding log messages.
    """
    try:
        logger.info("Starting audio recording")

        # Play start listening notification sound
        threading.Thread(target=play_audio_file, args=(START_LISTENING_SOUND,), daemon=True).start()

        # Reset waveform data
        self.waveform_data = []

        def audio_callback(in_data, frame_count, time_info, status):
            try:
                # Log detailed timing information periodically
                if hasattr(self, 'callback_count'):
                    self.callback_count += 1
                    if self.callback_count % 50 == 0:  # Log every ~50 callbacks
                        logger.debug(f"Audio callback timing - input timestamp: {time_info.get('input_buffer_adc_time', 'N/A')}, "
                                     f"current time: {time_info.get('current_time', 'N/A')}")
                else:
                    self.callback_count = 1

                # Check for audio status flags
                if status:
                    status_flags = []
                    if status & pyaudio.paInputUnderflow:
                        status_flags.append("Input Underflow")
                    if status & pyaudio.paInputOverflow:
                        status_flags.append("Input Overflow")
                    if status & pyaudio.paOutputUnderflow:
                        status_flags.append("Output Underflow")
                    if status & pyaudio.paOutputOverflow:
                        status_flags.append("Output Overflow")
                    if status & pyaudio.paPrimingOutput:
                        status_flags.append("Priming Output")
                    if status_flags:
                        logger.warning(f"Audio callback status flags: {', '.join(status_flags)}")

                # Store audio data for processing
                if hasattr(self, 'audio_frames'):
                    self.audio_frames.append(in_data)

                    # Process audio for visualization
                    self.process_audio_for_visualization(in_data)

                    # Periodically log audio levels for debugging
                    if len(self.audio_frames) % 20 == 0:  # Log every ~1 second (20 chunks at 1024 samples)
                        try:
                            audio_data = np.frombuffer(in_data, dtype=np.int16)
                            normalized = audio_data.astype(float) / 32768.0
                            amplitude = np.abs(normalized).mean()
                            logger.debug(f"Current audio amplitude: {amplitude:.6f}")
                        except Exception as e:
                            logger.error(f"Error calculating audio level: {e}")

                return (in_data, pyaudio.paContinue)
            except Exception as e:
                logger.error(f"Error in audio callback: {e}", exc_info=True)
                return (in_data, pyaudio.paContinue)  # Try to continue despite errors

        # Initialize audio frames list
        self.audio_frames = []

        # Start the audio stream with the selected device
        logger.debug(f"Opening audio stream with FORMAT={FORMAT}, CHANNELS={CHANNELS}, RATE={RATE}, CHUNK={CHUNK}, DEVICE={self.selected_device_index}")
        self.stream = self.p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            input_device_index=self.selected_device_index,
            frames_per_buffer=CHUNK,
            stream_callback=audio_callback,
        )

        # Verify stream is active and receiving audio
        if not self.stream.is_active():
            logger.error("Stream created but not active")
            raise Exception("Audio stream is not active")

        # Test audio input
        logger.info("Testing audio input...")
        print("Testing audio input...")

        # Wait a moment and check if we're receiving audio
        time.sleep(0.5)
        if not hasattr(self, 'audio_frames') or len(self.audio_frames) == 0:
            logger.error("No audio data received in initial test")
            raise Exception("No audio data being received")

        # Check audio levels
        test_frame = self.audio_frames[-1]
        audio_data = np.frombuffer(test_frame, dtype=np.int16)
        normalized = audio_data.astype(float) / 32768.0
        level = np.abs(normalized).mean()
        logger.info(f"Initial audio level: {level:.6f}")
        print(f"Audio input level: {level:.6f}")
        if level < 0.0001:  # Very low level threshold
            logger.warning("Very low audio input level detected")
            print("Warning: Very low audio input level detected")

        logger.info("Audio stream initialized and receiving data")
        print("Microphone activated. Listening for speech...")

        # Start a thread to detect silence and stop recording
        threading.Thread(target=self.detect_silence, daemon=True).start()
    except Exception as e:
        logger.error(f"Error starting audio stream: {e}", exc_info=True)
        print(f"Error starting audio: {e}")
        # Do not leave a half-opened stream behind after a failed start
        stream, self.stream = getattr(self, 'stream', None), None
        if stream is not None:
            try:
                stream.close()
            except Exception as close_error:
                logger.warning(f"Error closing audio stream after failed start: {close_error}")
        self.listening = False
def detect_silence(self):
    """Detect when the user stops speaking and end recording.

    Polls the latest captured frame every ``check_interval`` seconds and
    tracks a moving average of its amplitude. Once the average stays below
    ``silence_threshold`` for ``max_silence`` seconds, the recording is
    processed and the microphone stopped. Errors are logged, not raised.

    NOTE(review): the waits, the history bookkeeping and the calls made after
    the loop were missing from this copy of the file; they are reconstructed
    from the comments and log messages. Window sizes (5 and 10) are guesses.
    """
    try:
        # Wait for initial audio to accumulate
        time.sleep(0.5)
        logger.info("Starting silence detection")

        # Adjusted silence detection parameters for longer pauses
        silence_threshold = 0.008  # Reduced threshold to be more sensitive to quiet speech (was 0.01)
        silence_duration = 0
        max_silence = 5.0  # Increased from 1.5s to 5.0s to allow for longer thinking pauses
        check_interval = 0.1  # Check every 100ms
        logger.debug(f"Silence detection parameters: threshold={silence_threshold}, max_silence={max_silence}s, check_interval={check_interval}s")

        # Track audio levels for debugging
        amplitude_history = []
        # Start each recording with a fresh moving-average window
        self.recent_amplitudes = []

        while self.listening and self.stream and silence_duration < max_silence:
            time.sleep(check_interval)
            if not hasattr(self, 'audio_frames') or len(self.audio_frames) < 2:
                continue

            # Get the latest audio frame
            latest_frame = self.audio_frames[-1]
            audio_data = np.frombuffer(latest_frame, dtype=np.int16)
            normalized = audio_data.astype(float) / 32768.0
            current_amplitude = np.abs(normalized).mean()

            amplitude_history.append(current_amplitude)
            del amplitude_history[:-10]

            # Use a moving average of recent amplitudes for more stable detection
            self.recent_amplitudes.append(current_amplitude)
            del self.recent_amplitudes[:-5]
            if hasattr(self, 'recent_amplitudes') and len(self.recent_amplitudes) > 0:
                avg_amplitude = sum(self.recent_amplitudes) / len(self.recent_amplitudes)
            else:
                avg_amplitude = current_amplitude

            if avg_amplitude < silence_threshold:
                silence_duration += check_interval
                # Log only when silence is detected
                if silence_duration >= 1.0 and silence_duration % 1.0 < check_interval:
                    logger.debug(f"Silence detected for {silence_duration:.1f}s, avg amplitude: {avg_amplitude:.6f}")
            else:
                if silence_duration > 0:
                    logger.debug(f"Speech resumed after {silence_duration:.1f}s of silence, amplitude: {avg_amplitude:.6f}")
                silence_duration = 0

        # If we exited because of silence detection
        if self.listening and self.stream:
            logger.info(f"Silence threshold reached after {silence_duration:.1f}s, stopping recording")
            logger.debug(f"Final amplitude history: {[f'{a:.6f}' for a in amplitude_history]}")
            self.root.after(0, lambda: self.status_label.config(text="Processing speech..."))
            print("Silence detected. Processing speech...")
            self.process_recording()
            self.stop_listening()
        else:
            if not self.listening:
                logger.info("Silence detection stopped because listening state changed")
            if not self.stream:
                logger.info("Silence detection stopped because audio stream was closed")
    except Exception as e:
        logger.error(f"Error in silence detection: {e}", exc_info=True)
def process_recording(self):
    """Process the recorded audio and generate a transcription using faster-whisper.

    Writes the captured frames to a temporary WAV file, transcribes it and
    writes the text to TRANSCRIPTION_FILE for the server to read. If the
    model is not loaded yet, or anything fails, an explanatory message is
    written to TRANSCRIPTION_FILE instead.

    NOTE(review): the WAV header calls, the file writes, the temp-file
    removal and the final state save were missing from this copy of the file
    and are reconstructed from the surrounding comments and log messages.
    """
    try:
        if not hasattr(self, 'audio_frames') or not self.audio_frames:
            logger.warning("No audio frames to process")
            return

        logger.info(f"Processing {len(self.audio_frames)} audio frames")

        # Check if we have enough audio data
        total_audio_time = len(self.audio_frames) * (CHUNK / RATE)
        logger.info(f"Total recorded audio: {total_audio_time:.2f} seconds")
        if total_audio_time < 0.5:  # Less than half a second of audio
            logger.warning(f"Audio recording too short ({total_audio_time:.2f}s), may not contain speech")

        if not hasattr(self, 'whisper_model') or self.whisper_model is None:
            logger.warning("faster-whisper model not loaded yet")
            self.last_transcript = "Sorry, speech recognition model is still loading. Please try again in a moment."
            with open(TRANSCRIPTION_FILE, 'w') as f:
                f.write(self.last_transcript)
            return

        # Save the recorded audio to a temporary WAV file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name

        try:
            # Create a WAV file from the recorded frames
            logger.debug(f"Creating WAV file at {temp_audio_path}")
            with wave.open(temp_audio_path, 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(self.p.get_sample_size(FORMAT))
                wf.setframerate(RATE)
                wf.writeframes(b''.join(self.audio_frames))

            # Get file size for logging
            file_size = os.path.getsize(temp_audio_path)
            logger.debug(f"WAV file created, size: {file_size} bytes")
            logger.info(f"Audio saved to temporary file: {temp_audio_path}")

            # Use faster-whisper to transcribe the audio
            logger.info("Transcribing audio with faster-whisper...")
            print("Transcribing audio with faster-whisper...")
            self.root.after(0, lambda: self.status_label.config(text="Transcribing audio..."))

            transcription_start = time.time()
            segments, info = self.whisper_model.transcribe(temp_audio_path, beam_size=5)

            # `segments` is a lazy generator: materialize it once so it can be
            # both joined and logged (a second pass over the generator would
            # yield nothing).
            segments = list(segments)

            # Collect all segments to form the complete transcription
            transcription = " ".join(segment.text for segment in segments).strip()
            transcription_time = time.time() - transcription_start

            logger.info(f"Transcription completed in {transcription_time:.2f}s: {transcription}")
            logger.debug(f"Transcription info: {info}")
            print(f"Transcription complete: \"{transcription}\"")

            # Log segments for debugging
            logger.debug("Transcription segments:")
            for i, segment in enumerate(segments):
                logger.debug(f"Segment {i}: {segment.start}-{segment.end}s: {segment.text}")
        finally:
            # Clean up the temporary file even when transcription failed
            try:
                logger.debug(f"Removing temporary WAV file: {temp_audio_path}")
                os.unlink(temp_audio_path)
            except Exception as e:
                logger.error(f"Error removing temporary file: {e}")

        # Update the state with the transcription
        self.last_transcript = transcription

        # Write the transcription to a file for the server to read
        try:
            logger.debug(f"Writing transcription to file: {TRANSCRIPTION_FILE}")
            with open(TRANSCRIPTION_FILE, 'w') as f:
                f.write(transcription)
            logger.debug("Transcription file written successfully")
        except Exception as e:
            logger.error(f"Error writing transcription to file: {e}", exc_info=True)
            raise

        # Update state
        self.save_speech_state()
    except Exception as e:
        logger.error(f"Error processing recording: {e}", exc_info=True)
        self.last_transcript = f"Error processing speech: {str(e)}"
        with open(TRANSCRIPTION_FILE, 'w') as f:
            f.write(self.last_transcript)
def stop_listening(self):
    """Stop listening for audio input.

    Stops and closes the input stream if one is open, plays the stop sound,
    clears the waveform data and clears ``self.listening``. Errors are
    logged/printed; ``self.listening`` is cleared even then.
    """
    try:
        logger.info("Stopping audio recording")
        if self.stream:
            logger.debug(f"Stopping audio stream, stream active: {self.stream.is_active()}")
            # Detach the stream first so self.stream is cleared even if
            # stopping/closing raises.
            stream, self.stream = self.stream, None
            try:
                stream.stop_stream()
            finally:
                stream.close()
            print("Microphone deactivated.")
            logger.info("Audio stream closed successfully")

            # Play stop listening notification sound
            threading.Thread(target=play_audio_file, args=(STOP_LISTENING_SOUND,), daemon=True).start()
        else:
            logger.debug("No active audio stream to close")

        # Clear waveform data
        self.waveform_data = []

        # Update state
        self.listening = False
    except Exception as e:
        logger.error(f"Error stopping audio stream: {e}", exc_info=True)
        print(f"Error stopping audio: {e}")
        # Make sure we update state even if there's an error
        self.listening = False
def check_for_updates(self):
    """Periodically check for updates to the speech state file and command file.

    Loops every 100 ms while ``self.should_update`` is true. A newer
    COMMAND_FILE is read and its command (LISTEN / SPEAK / IDLE) applied to
    the listening/speaking flags; a newer STATE_FILE triggers a state reload.
    UI work is scheduled on the Tk event loop via ``root.after``.
    """
    last_modified_state = 0
    last_modified_command = 0
    if os.path.exists(STATE_FILE):
        last_modified_state = os.path.getmtime(STATE_FILE)

    while self.should_update:
        try:
            # Check for command file first (higher priority)
            if os.path.exists(COMMAND_FILE):
                current_modified = os.path.getmtime(COMMAND_FILE)
                if current_modified > last_modified_command:
                    last_modified_command = current_modified
                    try:
                        # Read the command
                        with open(COMMAND_FILE, 'r') as f:
                            command = f.read().strip()
                        logger.debug(f"Received UI command: {command}")

                        # Process the command
                        if command == "LISTEN":
                            if not self.listening:
                                self.listening = True
                                self.speaking = False
                                self.root.after(0, self.start_listening)
                                self.root.after(0, self.update_ui_from_state)
                        elif command == "SPEAK":
                            if not self.speaking:
                                self.listening = False
                                self.speaking = True
                                self.root.after(0, self.update_ui_from_state)
                        elif command == "IDLE":
                            self.listening = False
                            self.speaking = False
                            self.root.after(0, self.update_ui_from_state)
                    except Exception as e:
                        logger.error(f"Error processing command: {e}")

            # Also check state file for other updates
            if os.path.exists(STATE_FILE):
                current_modified = os.path.getmtime(STATE_FILE)
                if current_modified > last_modified_state:
                    last_modified_state = current_modified
                    # NOTE(review): the reload call was not visible in this
                    # copy; without it a state-file change would refresh the
                    # UI from unchanged in-memory state — confirm.
                    self.load_speech_state()
                    self.root.after(0, self.update_ui_from_state)
        except Exception as e:
            logger.error(f"Error checking for updates: {e}")

        time.sleep(0.1)  # Check every 100ms for faster response
def check_for_responses(self):
"""Periodically check for new responses to speak"""
# Overview: polls for RESPONSE_FILE while self.should_update is true, reads its
# text, flips the UI into the speaking state, drives a synthetic waveform
# animation, and then logs/prints completion before resetting the state.
# NOTE(review): in this copy of the method the indentation is gone and several
# statements are not visible (the `except` clauses below have no visible `try:`,
# and some log lines describe actions whose calls are absent). The comments
# here describe only what the visible lines do; gaps are flagged individually.
# Add a lock to prevent multiple TTS instances from running simultaneously
self.tts_lock = threading.Lock()
# Keep polling until should_update is cleared (on_close sets it to False).
while self.should_update:
if os.path.exists(RESPONSE_FILE):
# Only proceed if we're not already speaking
# acquire(blocking=False) returns immediately instead of waiting for the lock.
if not self.speaking and self.tts_lock.acquire(blocking=False):
# Read the response
logger.debug(f"Found response file: {RESPONSE_FILE}")
with open(RESPONSE_FILE, 'r') as f:
response = f.read().strip()
# Log at most the first 100 characters, with an ellipsis when truncated.
logger.debug(f"Read response text ({len(response)} chars): {response[:100]}{'...' if len(response) > 100 else ''}")
except Exception as e:
logger.error(f"Error reading response file: {e}", exc_info=True)
# Delete the file immediately to prevent duplicate processing
# NOTE(review): the removal call itself is not visible in this extract.
logger.debug("Removing response file")
except Exception as e:
logger.warning(f"Error removing response file: {e}")
# Process the response
# An empty string (after strip) is skipped.
if response:
self.last_response = response
self.speaking = True
# Tk calls are scheduled through root.after rather than made directly.
self.root.after(0, self.update_ui_from_state)
# Create a simple speaking animation
def animate_speaking():
# NOTE(review): the statement guarded by this check is not visible here --
# presumably an early exit once speaking has stopped; confirm.
if not self.speaking:
# Generate a random amplitude for speaking animation
# Use a sine wave with noise for more natural movement
# Function-local import; `time` is also imported at module level.
import time
time_val = time.time() * 3 # Speed factor
# Sine term yields 0.0..0.2; the noise term adds a value in [0, 0.05).
base_amplitude = 0.1 + 0.1 * np.sin(time_val)
noise = 0.05 * np.random.random()
amplitude = base_amplitude + noise
# Add to waveform data
# NOTE(review): the statement that stores `amplitude` is not visible here.
# Keep only the most recent points
if len(self.waveform_data) > self.waveform_max_points:
self.waveform_data = self.waveform_data[-self.waveform_max_points:]
# Update the visualization
# NOTE(review): the redraw call is not visible in this extract.
# Schedule the next animation frame if still speaking
# Re-arms itself every 50 ms, so the animation stops once speaking is False.
if self.speaking:
self.root.after(50, animate_speaking)
# Start the speaking animation
self.root.after(0, animate_speaking)
logger.info(f"Speaking text ({len(response)} chars): {response[:100]}{'...' if len(response) > 100 else ''}")
# Unlike the log line above, the console output carries the full text.
print(f"Speaking: \"{response}\"")
# Use actual text-to-speech if available
if tts_available:
logger.debug("Using TTS engine for text-to-speech")
# If we're using our Kokoro adapter
# An engine exposing a `speak` attribute is treated as the Kokoro adapter.
if hasattr(tts_engine, 'speak'):
# Use the speak method directly
# NOTE(review): the speak call that these two timestamps bracket is not
# visible in this extract.
tts_start = time.time()
tts_duration = time.time() - tts_start
logger.info(f"Kokoro TTS completed in {tts_duration:.2f} seconds")
print("Speech completed.")
# Use pyttsx3 directly
# NOTE(review): presumably the alternative branch of the hasattr check above;
# the branch keyword is not visible here.
# Log TTS settings
rate = tts_engine.getProperty('rate')
volume = tts_engine.getProperty('volume')
voice = tts_engine.getProperty('voice')
logger.debug(f"TTS settings - Rate: {rate}, Volume: {volume}, Voice: {voice}")
# Speak the text
# NOTE(review): the engine calls that these two timestamps bracket are not
# visible in this extract.
tts_start = time.time()
tts_duration = time.time() - tts_start
logger.info(f"Speech completed in {tts_duration:.2f} seconds")
print("Speech completed.")
except Exception as e:
logger.error(f"Error using text-to-speech: {e}", exc_info=True)
print(f"Error using text-to-speech: {e}")
# Fall back to simulated speech
logger.info("Falling back to simulated speech")
speaking_duration = len(response) * 0.05 # 50ms per character
# Simulate speaking time if TTS not available
logger.debug("TTS not available, simulating speech timing")
speaking_duration = len(response) * 0.05 # 50ms per character
logger.debug(f"Simulating speech for {speaking_duration:.2f} seconds")
# NOTE(review): the wait that consumes speaking_duration is not visible here.
# Update state when done speaking
self.speaking = False
self.waveform_data = [] # Clear waveform data
self.root.after(0, self.update_ui_from_state)
print("Done speaking.")
logger.info("Done speaking")
# Release the lock when done
# NOTE(review): the release call is not visible in this extract.
except Exception as e:
logger.error(f"Error processing response: {e}", exc_info=True)
# Make sure we release the lock on error
self.speaking = False
# NOTE(review): presumably guards a tts_lock release that is not visible here.
except RuntimeError:
pass # Ignore if lock wasn't acquired
except Exception as e:
logger.error(f"Error checking for responses: {e}", exc_info=True)
# Make sure we're not stuck in speaking state
if self.speaking:
self.speaking = False
# Try to release the lock if we might have it
except RuntimeError:
pass # Ignore if lock wasn't acquired
time.sleep(0.1) # Check every 100ms for faster response
def on_close(self):
"""Handle window close event"""
# Overview: stops the background polling loops, logs the audio teardown steps,
# clears the UI state flags, opens COMMAND_FILE for writing, and deals with the
# speech_ui.lock file before reporting a successful shutdown.
# NOTE(review): in this copy the indentation is gone and several statements are
# not visible (the `except` clauses below have no visible `try:`, and the calls
# that stop the stream, terminate PyAudio, write the command and delete the lock
# file are absent). Comments describe only what the visible lines do.
logger.info("Shutting down speech processor")
print("\nShutting down speech processor...")
# Loops written as `while self.should_update` exit once this flag is cleared.
self.should_update = False
# Audio teardown is only attempted when a stream object is present.
if self.stream:
logger.debug("Stopping audio stream")
logger.debug("Audio stream closed successfully")
except Exception as e:
logger.error(f"Error closing audio stream: {e}")
logger.debug("Terminating PyAudio")
logger.debug("PyAudio terminated successfully")
except Exception as e:
logger.error(f"Error terminating PyAudio: {e}")
# Update state to indicate UI is closed
self.ui_active = False
self.listening = False
self.speaking = False
# Write a UI_CLOSED command to the command file
# Mode 'w' truncates any existing command file.
# NOTE(review): the write call inside this `with` is not visible here.
with open(COMMAND_FILE, 'w') as f:
logger.info("Created UI_CLOSED command file")
except Exception as e:
logger.error(f"Error creating command file: {e}")
# Remove the lock file
# The lock file lives one directory above this module's directory.
lock_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "speech_ui.lock")
if os.path.exists(lock_file):
logger.info("Removed lock file")
except Exception as e:
logger.error(f"Error removing lock file: {e}")
print("Speech processor shut down successfully.")
logger.info("Speech processor shut down successfully")
# Shutdown errors are logged with a traceback and echoed to the console.
except Exception as e:
logger.error(f"Error shutting down speech processor: {e}", exc_info=True)
print(f"Error during shutdown: {e}")
def main():
"""Main entry point for the speech processor"""
# Overview: logs platform details, inspects the speech_ui.lock file to detect an
# already-running UI process, opens a new lock file, logs audio-related
# environment variables, and builds the Tk root plus SimpleSpeechProcessorUI.
# NOTE(review): in this copy the indentation is gone and several statements are
# not visible (the `except` clauses below have no visible `try:`; the PID write,
# the main-loop call and the lock-file removal are absent). Comments describe
# only what the visible lines do.
logger.info("Starting Speech MCP Processor")
print("\n===== Speech MCP Processor =====")
print("Starting speech recognition system...")
# Log platform information
import platform
logger.info(f"Platform: {platform.platform()}")
logger.info(f"Python version: {platform.python_version()}")
# Check if another instance is already running
# psutil is imported at function scope; `os` is also imported at module level.
import psutil
import os
# Create a lock file to prevent multiple instances
# The lock file lives one directory above this module's directory.
lock_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "speech_ui.lock")
# Check if the lock file exists and if the process is still running
if os.path.exists(lock_file):
# The lock file content is parsed as an integer PID.
with open(lock_file, 'r') as f:
pid = int(f.read().strip())
if psutil.pid_exists(pid):
# Check if it's actually our UI process
process = psutil.Process(pid)
cmdline = process.cmdline()
# Requires at least three argv entries and 'speech_mcp.ui' somewhere in the
# joined command line before treating the PID as another UI instance.
if len(cmdline) >= 3 and 'speech_mcp.ui' in ' '.join(cmdline):
logger.warning(f"Another UI instance is already running with PID {pid}")
print(f"WARNING: Another Speech UI instance is already running with PID {pid}")
print("Only one instance of Speech UI can run at a time.")
# NOTE(review): what happens after this warning (e.g. an early exit) is not
# visible in this extract.
except (psutil.NoSuchProcess, psutil.AccessDenied):
# Process doesn't exist or can't be accessed, ignore the lock file
except Exception as e:
logger.error(f"Error checking lock file: {e}")
# Create a new lock file with our PID
# Mode 'w' truncates any stale lock file.
# NOTE(review): the write call inside this `with` is not visible here.
with open(lock_file, 'w') as f:
logger.info(f"Created lock file with PID {os.getpid()}")
except Exception as e:
logger.error(f"Error creating lock file: {e}")
# Log audio-related environment variables
# Selects variables whose name contains AUDIO, PULSE or ALSA (case-insensitive).
audio_env_vars = {k: v for k, v in os.environ.items() if 'AUDIO' in k.upper() or 'PULSE' in k.upper() or 'ALSA' in k.upper()}
if audio_env_vars:
logger.debug(f"Audio-related environment variables: {json.dumps(audio_env_vars)}")
# Start the UI
root = tk.Tk()
app = SimpleSpeechProcessorUI(root)
logger.info("Starting Tkinter main loop")
# NOTE(review): the call that runs the Tk main loop is not visible between
# these two log lines.
logger.info("Tkinter main loop exited")
# Clean up the lock file when we exit
if os.path.exists(lock_file):
logger.info("Removed lock file")
except Exception as e:
logger.error(f"Error removing lock file: {e}")
# Startup errors are logged with a traceback and echoed to the console.
except Exception as e:
logger.error(f"Error in speech processor main: {e}", exc_info=True)
print(f"\nERROR: Failed to start speech processor: {e}")
if __name__ == "__main__":