HomeAssistant MCP

import os import json import queue import threading import numpy as np import sounddevice as sd from openwakeword import Model from datetime import datetime import wave from faster_whisper import WhisperModel import requests import logging import time # Set up logging logging.basicConfig( level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Configuration SAMPLE_RATE = 16000 CHANNELS = 1 CHUNK_SIZE = 1024 BUFFER_DURATION = 10 # seconds to keep in buffer DETECTION_THRESHOLD = 0.5 CONTINUOUS_TRANSCRIPTION_INTERVAL = 3 # seconds between transcriptions MAX_MODEL_LOAD_RETRIES = 3 MODEL_LOAD_RETRY_DELAY = 5 # seconds MODEL_DOWNLOAD_TIMEOUT = 600 # 10 minutes timeout for model download # ALSA device configuration AUDIO_DEVICE = 'hw:0,0' # Use ALSA hardware device directly # Audio processing parameters NOISE_THRESHOLD = 0.08 # Increased threshold for better noise filtering MIN_SPEECH_DURATION = 2.0 # Longer minimum duration to avoid fragments SILENCE_DURATION = 1.0 # Longer silence duration MAX_REPETITIONS = 1 # More aggressive repetition filtering ECHO_THRESHOLD = 0.75 # More sensitive echo detection MIN_SEGMENT_DURATION = 1.0 # Longer minimum segment duration FEEDBACK_WINDOW = 5 # Window size for feedback detection in seconds # Feature flags from environment WAKE_WORD_ENABLED = os.environ.get('ENABLE_WAKE_WORD', 'false').lower() == 'true' SPEECH_ENABLED = os.environ.get('ENABLE_SPEECH_FEATURES', 'true').lower() == 'true' # Wake word models to use (only if wake word is enabled) WAKE_WORDS = ["hey_jarvis"] # Using hey_jarvis as it's more similar to "hey gaja" WAKE_WORD_ALIAS = "gaja" # What we print when wake word is detected # Home Assistant Configuration HASS_HOST = os.environ.get('HASS_HOST', 'http://homeassistant.local:8123') HASS_TOKEN = os.environ.get('HASS_TOKEN') def initialize_asr_model(): """Initialize the ASR model with retries and timeout""" model_path = os.environ.get('WHISPER_MODEL_PATH', '/models') model_name = os.environ.get('WHISPER_MODEL_TYPE', 'base') start_time = time.time() for attempt in range(MAX_MODEL_LOAD_RETRIES): try: if time.time() - start_time > MODEL_DOWNLOAD_TIMEOUT: logger.error("Model download timeout exceeded") raise TimeoutError("Model download took too long") logger.info(f"Loading ASR model (attempt {attempt + 1}/{MAX_MODEL_LOAD_RETRIES})") model = WhisperModel( model_size_or_path=model_name, device="cpu", compute_type="int8", download_root=model_path, num_workers=1 # Reduce concurrent downloads ) logger.info("ASR model loaded successfully") return model except Exception as e: logger.error(f"Failed to load ASR model (attempt {attempt + 1}): {e}") if attempt < MAX_MODEL_LOAD_RETRIES - 1: logger.info(f"Retrying in {MODEL_LOAD_RETRY_DELAY} seconds...") time.sleep(MODEL_LOAD_RETRY_DELAY) else: logger.error("Failed to load ASR model after all retries") raise # Initialize the ASR model with retries try: asr_model = initialize_asr_model() except Exception as e: logger.error(f"Critical error initializing ASR model: {e}") raise def send_command_to_hass(domain, service, entity_id): """Send command to Home Assistant""" if not HASS_TOKEN: logger.error("Error: HASS_TOKEN not set") return False headers = { "Authorization": f"Bearer {HASS_TOKEN}", "Content-Type": "application/json", } url = f"{HASS_HOST}/api/services/{domain}/{service}" data = {"entity_id": entity_id} try: response = requests.post(url, headers=headers, json=data) response.raise_for_status() logger.info(f"Command sent: {domain}.{service} for {entity_id}") return True except Exception as e: logger.error(f"Error sending command to Home Assistant: {e}") return False def is_speech(audio_data, threshold=NOISE_THRESHOLD): """Detect if audio segment contains speech based on amplitude and frequency content""" # Calculate RMS amplitude rms = np.sqrt(np.mean(np.square(audio_data))) # Calculate signal energy in speech frequency range (100-4000 Hz) fft = np.fft.fft(audio_data) freqs = np.fft.fftfreq(len(audio_data), 1/SAMPLE_RATE) speech_mask = (np.abs(freqs) >= 100) & (np.abs(freqs) <= 4000) speech_energy = np.sum(np.abs(fft[speech_mask])) / len(audio_data) # Enhanced echo detection # 1. Check for periodic patterns in the signal autocorr = np.correlate(audio_data, audio_data, mode='full') autocorr = autocorr[len(autocorr)//2:] # Use only positive lags peaks = np.where(autocorr > ECHO_THRESHOLD * np.max(autocorr))[0] peak_spacing = np.diff(peaks) has_periodic_echo = len(peak_spacing) > 2 and np.std(peak_spacing) < 0.1 * np.mean(peak_spacing) # 2. Check for sudden amplitude changes amplitude_envelope = np.abs(audio_data) amplitude_changes = np.diff(amplitude_envelope) has_feedback_spikes = np.any(np.abs(amplitude_changes) > threshold * 2) # 3. Check frequency distribution freq_magnitudes = np.abs(fft)[:len(fft)//2] peak_freqs = freqs[:len(fft)//2][np.argsort(freq_magnitudes)[-3:]] has_feedback_freqs = np.any((peak_freqs > 2000) & (peak_freqs < 4000)) # Combine all criteria is_valid_speech = ( rms > threshold and speech_energy > threshold and not has_periodic_echo and not has_feedback_spikes and not has_feedback_freqs ) return is_valid_speech def process_command(text): """Process the transcribed command and execute appropriate action""" text = text.lower().strip() # Skip if text is too short or contains numbers (likely noise) if len(text) < 5 or any(char.isdigit() for char in text): logger.debug("Text too short or contains numbers, skipping") return # Enhanced noise pattern detection noise_patterns = ["lei", "los", "und", "aber", "nicht mehr", "das das", "und und"] for pattern in noise_patterns: if text.count(pattern) > 1: # More aggressive pattern filtering logger.debug(f"Detected noise pattern '{pattern}', skipping") return # More aggressive repetition detection words = text.split() if len(words) >= 2: # Check for immediate word repetitions for i in range(len(words)-1): if words[i] == words[i+1]: logger.debug(f"Detected immediate word repetition: '{words[i]}', skipping") return # Check for phrase repetitions phrases = [' '.join(words[i:i+2]) for i in range(len(words)-1)] phrase_counts = {} for phrase in phrases: phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1 if phrase_counts[phrase] > MAX_REPETITIONS: logger.debug(f"Skipping due to excessive repetition: '{phrase}'") return # German command mappings commands = { "ausschalten": "turn_off", "einschalten": "turn_on", "an": "turn_on", "aus": "turn_off" } rooms = { "wohnzimmer": "living_room", "küche": "kitchen", "schlafzimmer": "bedroom", "bad": "bathroom" } # Detect room detected_room = None for german_room, english_room in rooms.items(): if german_room in text: detected_room = english_room break # Detect command detected_command = None for german_cmd, english_cmd in commands.items(): if german_cmd in text: detected_command = english_cmd break if detected_room and detected_command: # Construct entity ID (assuming light) entity_id = f"light.{detected_room}" # Send command to Home Assistant if send_command_to_hass("light", detected_command, entity_id): logger.info(f"Executed: {detected_command} for {entity_id}") else: logger.error("Failed to execute command") else: logger.debug(f"No command found in text: '{text}'") class AudioProcessor: def __init__(self): logger.info("Initializing AudioProcessor...") self.audio_buffer = queue.Queue() self.recording = False self.buffer = np.zeros(SAMPLE_RATE * BUFFER_DURATION) self.buffer_lock = threading.Lock() self.last_transcription_time = 0 try: logger.info(f"Opening audio device: {AUDIO_DEVICE}") self.stream = sd.InputStream( device=AUDIO_DEVICE, samplerate=SAMPLE_RATE, channels=CHANNELS, dtype=np.int16, blocksize=CHUNK_SIZE, callback=self._audio_callback ) logger.info("Audio stream initialized successfully") except Exception as e: logger.error(f"Failed to initialize audio stream: {e}") raise self.speech_detected = False self.silence_frames = 0 self.speech_frames = 0 # Initialize wake word detection only if enabled if WAKE_WORD_ENABLED: try: logger.info("Initializing wake word model...") self.wake_word_model = Model(vad_threshold=0.5) self.last_prediction = None logger.info("Wake word model initialized successfully") except Exception as e: logger.error(f"Failed to initialize wake word model: {e}") raise else: self.wake_word_model = None self.last_prediction = None logger.info("Wake word detection disabled") def should_transcribe(self): """Determine if we should transcribe based on mode and timing""" current_time = datetime.now().timestamp() if not WAKE_WORD_ENABLED: # Check if enough time has passed since last transcription time_since_last = current_time - self.last_transcription_time if time_since_last >= CONTINUOUS_TRANSCRIPTION_INTERVAL: # Only transcribe if we detect speech frames_per_chunk = CHUNK_SIZE min_speech_frames = int(MIN_SPEECH_DURATION * SAMPLE_RATE / frames_per_chunk) if self.speech_frames >= min_speech_frames: self.last_transcription_time = current_time self.speech_frames = 0 # Reset counter return True return False def _audio_callback(self, indata, frames, time, status): """Callback for audio input""" if status: logger.warning(f"Audio callback status: {status}") # Convert to mono if necessary if CHANNELS > 1: audio_data = np.mean(indata, axis=1) else: audio_data = indata.flatten() # Check for speech if is_speech(audio_data): self.speech_frames += 1 self.silence_frames = 0 else: self.silence_frames += 1 frames_per_chunk = CHUNK_SIZE silence_frames_threshold = int(SILENCE_DURATION * SAMPLE_RATE / frames_per_chunk) if self.silence_frames >= silence_frames_threshold: self.speech_frames = 0 # Update circular buffer with self.buffer_lock: self.buffer = np.roll(self.buffer, -len(audio_data)) self.buffer[-len(audio_data):] = audio_data if WAKE_WORD_ENABLED: # Process for wake word detection self.last_prediction = self.wake_word_model.predict(audio_data) # Check if wake word detected for wake_word in WAKE_WORDS: confidence = self.last_prediction[wake_word] if confidence > DETECTION_THRESHOLD: logger.info( f"Wake word: {WAKE_WORD_ALIAS} (confidence: {confidence:.2f})" ) self.process_audio() break else: # Continuous transcription mode if self.should_transcribe(): self.process_audio() def process_audio(self): """Process the current audio buffer (save and transcribe)""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"/audio/audio_segment_{timestamp}.wav" # Save the audio buffer to a WAV file with wave.open(filename, 'wb') as wf: wf.setnchannels(CHANNELS) wf.setsampwidth(2) # 16-bit audio wf.setframerate(SAMPLE_RATE) # Convert float32 to int16 audio_data = (self.buffer * 32767).astype(np.int16) wf.writeframes(audio_data.tobytes()) logger.info(f"Saved audio segment to {filename}") # Transcribe the audio with German language preference try: segments, info = asr_model.transcribe( filename, language="de", # Set German as preferred language beam_size=5, temperature=0 ) # Get the full transcribed text transcribed_text = " ".join(segment.text for segment in segments) logger.info(f"Transcribed text: {transcribed_text}") # Process the command process_command(transcribed_text) except Exception as e: logger.error(f"Error during transcription or processing: {e}") def start(self): """Start audio processing""" try: logger.info("Starting audio processor...") # Log configuration logger.debug(f"Sample Rate: {SAMPLE_RATE}") logger.debug(f"Channels: {CHANNELS}") logger.debug(f"Chunk Size: {CHUNK_SIZE}") logger.debug(f"Buffer Duration: {BUFFER_DURATION}") logger.debug(f"Wake Word Enabled: {WAKE_WORD_ENABLED}") logger.debug(f"Speech Enabled: {SPEECH_ENABLED}") logger.debug(f"ASR Model: {os.environ.get('ASR_MODEL')}") if WAKE_WORD_ENABLED: logger.info("Initializing wake word detection...") logger.info(f"Loaded wake words: {', '.join(WAKE_WORDS)}") else: logger.info("Starting continuous transcription mode...") interval = CONTINUOUS_TRANSCRIPTION_INTERVAL logger.info(f"Will transcribe every {interval} seconds") try: logger.debug("Setting up audio input stream...") with sd.InputStream( channels=CHANNELS, samplerate=SAMPLE_RATE, blocksize=CHUNK_SIZE, callback=self._audio_callback ): logger.info("Audio input stream started successfully") logger.info("Listening for audio input...") logger.info("Press Ctrl+C to stop") while True: sd.sleep(1000) # Sleep for 1 second except sd.PortAudioError as e: logger.error(f"Error setting up audio stream: {e}") logger.error("Check if microphone is connected and accessible") raise except Exception as e: logger.error(f"Unexpected error in audio stream: {e}") raise except KeyboardInterrupt: logger.info("\nStopping audio processing...") except Exception as e: logger.error("Critical error in audio processing", exc_info=True) raise if __name__ == "__main__": try: logger.info("Initializing AudioProcessor...") processor = AudioProcessor() processor.start() except Exception as e: logger.error("Failed to start AudioProcessor", exc_info=True) raise