vad_marine.rs
// VAD with Marine Algorithm - "Semper Fi to voice detection!" 🎖️
// Voice Activity Detection using MEM8's marine salience algorithm
// "Standing watch at the boundaries of speech!" - Hue

use anyhow::Result;
use std::collections::VecDeque;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::RwLock;

/// Voice Activity Detector using Marine algorithm
/// Detects when someone is speaking vs silence
pub struct MarineVAD {
    /// Marine detector state
    detector: Arc<RwLock<MarineDetectorState>>,
    /// Audio input monitoring
    audio_monitor: Arc<RwLock<AudioMonitor>>,
    /// VAD state
    is_voice_active: Arc<RwLock<bool>>,
    /// Callback for voice state changes
    state_callback: StateCallback,
}

type StateCallback = Arc<RwLock<Option<Box<dyn Fn(bool) + Send + Sync>>>>;

/// Marine detector state for VAD
struct MarineDetectorState {
    /// Clip threshold for voice detection (dB)
    voice_threshold: f64,
    /// Grid tick rate (Hz) - how often we evaluate
    tick_rate: f64,
    /// Peak history for voice pattern analysis
    peak_history: VecDeque<PeakEvent>,
    /// Period tracking for speech patterns
    period_ema: ExponentialMovingAverage,
    /// Amplitude tracking for voice energy
    amplitude_ema: ExponentialMovingAverage,
    /// Speech pattern detector
    speech_detector: SpeechPatternDetector,
    /// Current salience score (0.0 to 1.0)
    voice_salience: f64,
    /// Last evaluation time
    last_tick: Instant,
    /// Voice onset time
    voice_onset: Option<Instant>,
    /// Voice offset time
    voice_offset: Option<Instant>,
}

/// Peak event in audio signal
#[derive(Clone, Debug)]
struct PeakEvent {
    timestamp: Instant,
    amplitude: f64,
    frequency: f64,  // Estimated frequency
    is_voiced: bool, // Voiced vs unvoiced
}

/// Exponential moving average for smoothing
struct ExponentialMovingAverage {
    value: f64,
    alpha: f64, // Smoothing factor
}

impl ExponentialMovingAverage {
    fn new(alpha: f64) -> Self {
        Self { value: 0.0, alpha }
    }

    fn update(&mut self, sample: f64) -> f64 {
        self.value = self.alpha * sample + (1.0 - self.alpha) * self.value;
        self.value
    }

    fn jitter(&self, sample: f64) -> f64 {
        (sample - self.value).abs()
    }
}

/// Speech pattern detector
struct SpeechPatternDetector {
    /// Typical speech fundamental frequency range (Hz)
    f0_min: f64, // ~80 Hz for deep male voice
    f0_max: f64, // ~400 Hz for high female/child voice
    /// Formant tracking
    formant_tracker: FormantTracker,
    /// Syllable rate detector (2-7 Hz typical)
    syllable_detector: SyllableRateDetector,
    /// Voice quality metrics
    voice_quality: VoiceQuality,
}

/// Formant tracker for vowel detection
struct FormantTracker {
    f1_range: (f64, f64), // First formant range (200-1000 Hz)
    f2_range: (f64, f64), // Second formant range (500-2500 Hz)
    f3_range: (f64, f64), // Third formant range (1500-3500 Hz)
}

/// Syllable rate detector
struct SyllableRateDetector {
    energy_envelope: VecDeque<f64>,
    peak_times: VecDeque<Instant>,
    min_syllable_gap: Duration, // ~100ms minimum
    max_syllable_gap: Duration, // ~500ms maximum
}

/// Voice quality metrics
struct VoiceQuality {
    harmonicity: f64,        // Harmonic-to-noise ratio
    spectral_tilt: f64,      // High vs low frequency energy
    zero_crossing_rate: f64, // Voiced vs unvoiced
    energy_variance: f64,    // Speech dynamics
}

/// Audio input monitor
struct AudioMonitor {
    /// Current audio level (RMS)
    current_level: f64,
    /// Peak level in window
    peak_level: f64,
    /// Noise floor estimate
    noise_floor: f64,
    /// Signal-to-noise ratio
    snr: f64,
    /// Audio source (mic, line-in, etc)
    source: AudioSource,
}

#[derive(Clone, Debug)]
enum AudioSource {
    Microphone,
    LineIn,
    Virtual, // For testing
}

impl MarineVAD {
    /// Create new VAD with marine algorithm
    pub fn new() -> Result<Self> {
        Ok(Self {
            detector: Arc::new(RwLock::new(MarineDetectorState::new())),
            audio_monitor: Arc::new(RwLock::new(AudioMonitor::new())),
            is_voice_active: Arc::new(RwLock::new(false)),
            state_callback: Arc::new(RwLock::new(None)),
        })
    }

    /// Process audio samples
    pub async fn process_audio(&self, samples: &[f32], sample_rate: u32) -> Result<bool> {
        let mut detector = self.detector.write().await;
        let mut monitor = self.audio_monitor.write().await;

        // Update audio monitor
        monitor.update_levels(samples);

        // Check if we should evaluate (based on tick rate)
        let now = Instant::now();
        let tick_duration = Duration::from_secs_f64(1.0 / detector.tick_rate);
        if now.duration_since(detector.last_tick) < tick_duration {
            return Ok(*self.is_voice_active.read().await);
        }
        detector.last_tick = now;

        // Marine algorithm evaluation
        let voice_detected = detector.evaluate_voice(samples, sample_rate, monitor.snr);

        // Update state if changed
        let mut is_active = self.is_voice_active.write().await;
        if voice_detected != *is_active {
            *is_active = voice_detected;

            // Call state change callback
            if let Some(callback) = &*self.state_callback.read().await {
                callback(voice_detected);
            }

            // Log state change
            if voice_detected {
                println!("🎤 Voice detected - switching to minimal output mode");
                detector.voice_onset = Some(now);
            } else {
                println!("🔇 Voice ended - returning to normal output mode");
                detector.voice_offset = Some(now);
            }
        }

        Ok(voice_detected)
    }

    /// Set callback for voice state changes
    pub async fn set_state_callback<F>(&self, callback: F)
    where
        F: Fn(bool) + Send + Sync + 'static,
    {
        let mut cb = self.state_callback.write().await;
        *cb = Some(Box::new(callback));
    }

    /// Get current voice activity state
    pub async fn is_voice_active(&self) -> bool {
        *self.is_voice_active.read().await
    }

    /// Get voice salience score (0.0 to 1.0)
    pub async fn get_salience(&self) -> f64 {
        self.detector.read().await.voice_salience
    }

    /// Get voice quality metrics
    pub async fn get_voice_quality(&self) -> VoiceQualityReport {
        let detector = self.detector.read().await;
        VoiceQualityReport {
            salience: detector.voice_salience,
            harmonicity: detector.speech_detector.voice_quality.harmonicity,
            spectral_tilt: detector.speech_detector.voice_quality.spectral_tilt,
            zero_crossing_rate: detector.speech_detector.voice_quality.zero_crossing_rate,
            energy_variance: detector.speech_detector.voice_quality.energy_variance,
        }
    }
}

impl MarineDetectorState {
    fn new() -> Self {
        Self {
            voice_threshold: -40.0, // -40 dB threshold
            tick_rate: 100.0,       // 100 Hz evaluation rate
            peak_history: VecDeque::with_capacity(100),
            period_ema: ExponentialMovingAverage::new(0.1),
            amplitude_ema: ExponentialMovingAverage::new(0.05),
            speech_detector: SpeechPatternDetector::new(),
            voice_salience: 0.0,
            last_tick: Instant::now(),
            voice_onset: None,
            voice_offset: None,
        }
    }

    /// Evaluate voice presence using marine algorithm
    fn evaluate_voice(&mut self, samples: &[f32], sample_rate: u32, snr: f64) -> bool {
        // Calculate RMS energy
        let energy: f64 =
            samples.iter().map(|&s| (s as f64).powi(2)).sum::<f64>() / samples.len() as f64;
        let rms = energy.sqrt();
        let db = 20.0 * rms.log10();

        // Update amplitude tracking
        self.amplitude_ema.update(rms);

        // Check against threshold
        if db < self.voice_threshold {
            self.voice_salience *= 0.9; // Decay salience
            return false;
        }

        // Analyze for speech patterns
        let has_speech_pattern = self.speech_detector.analyze(samples, sample_rate);

        // Calculate salience score
        let mut salience = 0.0;

        // Energy contribution (30%)
        let energy_score = ((db - self.voice_threshold) / 20.0).clamp(0.0, 1.0);
        salience += energy_score * 0.3;

        // SNR contribution (20%)
        let snr_score = (snr / 20.0).clamp(0.0, 1.0);
        salience += snr_score * 0.2;

        // Speech pattern contribution (50%)
        if has_speech_pattern {
            salience += 0.5;
        }

        // Update salience with smoothing
        self.voice_salience = 0.7 * salience + 0.3 * self.voice_salience;

        // Voice detected if salience > 0.5
        self.voice_salience > 0.5
    }
}

impl SpeechPatternDetector {
    fn new() -> Self {
        Self {
            f0_min: 80.0,
            f0_max: 400.0,
            formant_tracker: FormantTracker {
                f1_range: (200.0, 1000.0),
                f2_range: (500.0, 2500.0),
                f3_range: (1500.0, 3500.0),
            },
            syllable_detector: SyllableRateDetector {
                energy_envelope: VecDeque::with_capacity(100),
                peak_times: VecDeque::with_capacity(20),
                min_syllable_gap: Duration::from_millis(100),
                max_syllable_gap: Duration::from_millis(500),
            },
            voice_quality: VoiceQuality {
                harmonicity: 0.0,
                spectral_tilt: 0.0,
                zero_crossing_rate: 0.0,
                energy_variance: 0.0,
            },
        }
    }

    fn analyze(&mut self, samples: &[f32], sample_rate: u32) -> bool {
        // Simple zero-crossing rate for voiced/unvoiced detection
        let mut zero_crossings = 0;
        for i in 1..samples.len() {
            if samples[i - 1] * samples[i] < 0.0 {
                zero_crossings += 1;
            }
        }
        let zcr = zero_crossings as f64 / samples.len() as f64;
        self.voice_quality.zero_crossing_rate = zcr;

        // Voiced speech has lower ZCR (< 0.3), unvoiced has higher
        let is_voiced = zcr < 0.3;

        // Check if in speech frequency range
        let estimated_freq = zcr * sample_rate as f64 / 2.0;
        let in_speech_range = estimated_freq >= self.f0_min && estimated_freq <= self.f0_max * 10.0;

        is_voiced && in_speech_range
    }
}

impl AudioMonitor {
    fn new() -> Self {
        Self {
            current_level: 0.0,
            peak_level: 0.0,
            noise_floor: -60.0, // Start with -60 dB assumption
            snr: 0.0,
            source: AudioSource::Microphone,
        }
    }

    fn update_levels(&mut self, samples: &[f32]) {
        // Calculate RMS
        let sum_squares: f32 = samples.iter().map(|&s| s * s).sum();
        let rms = (sum_squares / samples.len() as f32).sqrt();
        self.current_level = rms as f64;

        // Find peak
        let peak = samples.iter().map(|&s| s.abs()).fold(0.0f32, f32::max) as f64;
        self.peak_level = peak;

        // Update noise floor estimate (slow adaptation)
        if rms as f64 > 0.0 {
            let db = 20.0 * (rms as f64).log10();
            self.noise_floor = 0.99 * self.noise_floor + 0.01 * db;
            self.snr = db - self.noise_floor;
        }
    }
}

/// Voice quality report
#[derive(Debug, Clone)]
pub struct VoiceQualityReport {
    pub salience: f64,
    pub harmonicity: f64,
    pub spectral_tilt: f64,
    pub zero_crossing_rate: f64,
    pub energy_variance: f64,
}

/// Integration with rust_shell
impl super::rust_shell::RustShell {
    /// Enable VAD with marine algorithm
    pub async fn enable_marine_vad(&self) -> Result<()> {
        println!("🎖️ Enabling Marine VAD - Semper Fi to voice detection!");

        let vad = MarineVAD::new()?;

        // Set callback to adjust verbosity
        let output_mode = self.output_mode.clone();
        vad.set_state_callback(move |is_voice| {
            // This would be called when voice state changes
            let mode = output_mode.clone();
            tokio::spawn(async move {
                let mut m = mode.write().await;
                if is_voice {
                    m.verbosity = super::rust_shell::VerbosityLevel::Minimal;
                    m.format = super::rust_shell::OutputFormat::Voice;
                } else {
                    m.verbosity = super::rust_shell::VerbosityLevel::Normal;
                    m.format = super::rust_shell::OutputFormat::Text;
                }
            });
        })
        .await;

        // Store VAD instance (would need to add field to RustShell)
        // self.vad = Some(vad);

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_marine_vad_creation() {
        let vad = MarineVAD::new();
        assert!(vad.is_ok());
    }

    #[tokio::test]
    async fn test_voice_detection() {
        let vad = MarineVAD::new().unwrap();

        // Create test signal (sine wave at 200 Hz - typical voice F0)
        let sample_rate = 16000;
        let frequency = 200.0;
        let duration = 0.1; // 100ms
        let num_samples = (sample_rate as f64 * duration) as usize;
        let mut samples = vec![0.0f32; num_samples];
        for (i, sample) in samples.iter_mut().enumerate().take(num_samples) {
            let t = i as f64 / sample_rate as f64;
            *sample = (2.0 * std::f64::consts::PI * frequency * t).sin() as f32 * 0.5;
        }

        // Process audio
        let _is_voice = vad.process_audio(&samples, sample_rate).await.unwrap();

        // Should detect voice-like signal
        // (In real implementation would need proper training)
    }
}
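
The tests above exercise single frames, but not the tick-rate gating or the state callback. Below is a minimal usage sketch (not part of the original file) that drives the public API end to end: it alternates one second of a 200 Hz tone with one second of silence and watches the detector flip. The import path is hypothetical (adjust to wherever this module lives in your crate); the 10 ms frame size and the real-time `sleep` pacing are assumptions chosen to match the detector's 100 Hz wall-clock tick gate, since frames pushed in a tight loop would mostly be skipped by `process_audio`'s early return.

// Hypothetical driver - assumes `MarineVAD` is in scope, e.g.:
// use your_crate::vad_marine::MarineVAD;
use anyhow::Result;
use std::time::Duration;

#[tokio::main]
async fn main() -> Result<()> {
    let vad = MarineVAD::new()?;

    // Print transitions as the detector flips between speech and silence.
    vad.set_state_callback(|is_voice| println!("voice active: {is_voice}"))
        .await;

    let sample_rate = 16_000u32;
    let frame_len = (sample_rate / 100) as usize; // 10 ms frames, matching the 100 Hz tick rate
    let mut t = 0.0f64;

    for second in 0..4 {
        let speaking = second % 2 == 0; // alternate tone and silence, one second each
        for _ in 0..100 {
            let samples: Vec<f32> = (0..frame_len)
                .map(|_| {
                    t += 1.0 / sample_rate as f64;
                    if speaking {
                        // 200 Hz sine - a voice-like fundamental
                        (2.0 * std::f64::consts::PI * 200.0 * t).sin() as f32 * 0.5
                    } else {
                        0.0
                    }
                })
                .collect();
            vad.process_audio(&samples, sample_rate).await?;
            // Pace frames in real time: the tick gate compares wall-clock Instants.
            tokio::time::sleep(Duration::from_millis(10)).await;
        }
        println!("salience after second {}: {:.2}", second, vad.get_salience().await);
    }
    Ok(())
}

During the silent seconds the salience should decay (each below-threshold frame multiplies it by 0.9), so the callback fires on both onsets and offsets rather than just once.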
