// VAD with Marine Algorithm - "Semper Fi to voice detection!" 🎖️
// Voice Activity Detection using MEM8's marine salience algorithm
// "Standing watch at the boundaries of speech!" - Hue
use anyhow::Result;
use std::collections::VecDeque;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::RwLock;
/// Voice Activity Detector using Marine algorithm
/// Detects when someone is speaking vs silence
pub struct MarineVAD {
/// Marine detector state
detector: Arc<RwLock<MarineDetectorState>>,
/// Audio input monitoring
audio_monitor: Arc<RwLock<AudioMonitor>>,
/// VAD state
is_voice_active: Arc<RwLock<bool>>,
/// Callback for voice state changes
state_callback: StateCallback,
}
type StateCallback = Arc<RwLock<Option<Box<dyn Fn(bool) + Send + Sync>>>>;
/// Marine detector state for VAD
struct MarineDetectorState {
    /// Clip threshold for voice detection (dBFS; samples normalized to ±1.0)
voice_threshold: f64,
/// Grid tick rate (Hz) - how often we evaluate
tick_rate: f64,
    /// Peak history for voice pattern analysis (reserved; not yet populated)
peak_history: VecDeque<PeakEvent>,
/// Period tracking for speech patterns
period_ema: ExponentialMovingAverage,
/// Amplitude tracking for voice energy
amplitude_ema: ExponentialMovingAverage,
/// Speech pattern detector
speech_detector: SpeechPatternDetector,
/// Current salience score (0.0 to 1.0)
voice_salience: f64,
/// Last evaluation time
last_tick: Instant,
/// Voice onset time
voice_onset: Option<Instant>,
/// Voice offset time
voice_offset: Option<Instant>,
}
/// Peak event in audio signal
#[derive(Clone, Debug)]
struct PeakEvent {
timestamp: Instant,
amplitude: f64,
frequency: f64, // Estimated frequency
is_voiced: bool, // Voiced vs unvoiced
}
/// Exponential moving average for smoothing
struct ExponentialMovingAverage {
value: f64,
alpha: f64, // Smoothing factor
}
impl ExponentialMovingAverage {
fn new(alpha: f64) -> Self {
Self { value: 0.0, alpha }
}
fn update(&mut self, sample: f64) -> f64 {
self.value = self.alpha * sample + (1.0 - self.alpha) * self.value;
self.value
}
fn jitter(&self, sample: f64) -> f64 {
(sample - self.value).abs()
}
}
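// A minimal check of the smoothing behavior above, assuming a unit-step
// input: with alpha = 0.1 the average converges toward the input as
// 1 - 0.9^n, and jitter() shrinks accordingly.
#[cfg(test)]
mod ema_tests {
    use super::*;

    #[test]
    fn ema_converges_toward_constant_input() {
        let mut ema = ExponentialMovingAverage::new(0.1);
        for _ in 0..50 {
            ema.update(1.0);
        }
        // 1 - 0.9^50 ≈ 0.9948, within 1% of the input
        assert!((ema.value - 1.0).abs() < 0.01);
        // A sample matching the average shows near-zero jitter
        assert!(ema.jitter(1.0) < 0.01);
    }
}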
/// Speech pattern detector
struct SpeechPatternDetector {
/// Typical speech fundamental frequency range (Hz)
f0_min: f64, // ~80 Hz for deep male voice
f0_max: f64, // ~400 Hz for high female/child voice
/// Formant tracking
formant_tracker: FormantTracker,
/// Syllable rate detector (2-7 Hz typical)
syllable_detector: SyllableRateDetector,
/// Voice quality metrics
voice_quality: VoiceQuality,
}
/// Formant tracker for vowel detection
struct FormantTracker {
f1_range: (f64, f64), // First formant range (200-1000 Hz)
f2_range: (f64, f64), // Second formant range (500-2500 Hz)
f3_range: (f64, f64), // Third formant range (1500-3500 Hz)
}
/// Syllable rate detector
struct SyllableRateDetector {
energy_envelope: VecDeque<f64>,
peak_times: VecDeque<Instant>,
min_syllable_gap: Duration, // ~100ms minimum
max_syllable_gap: Duration, // ~500ms maximum
}
/// Voice quality metrics
struct VoiceQuality {
harmonicity: f64, // Harmonic-to-noise ratio
spectral_tilt: f64, // High vs low frequency energy
zero_crossing_rate: f64, // Voiced vs unvoiced
energy_variance: f64, // Speech dynamics
}
/// Audio input monitor
struct AudioMonitor {
/// Current audio level (RMS)
current_level: f64,
/// Peak level in window
peak_level: f64,
/// Noise floor estimate
noise_floor: f64,
/// Signal-to-noise ratio
snr: f64,
/// Audio source (mic, line-in, etc)
source: AudioSource,
}
#[derive(Clone, Debug)]
enum AudioSource {
Microphone,
LineIn,
Virtual, // For testing
}
impl MarineVAD {
/// Create new VAD with marine algorithm
pub fn new() -> Result<Self> {
Ok(Self {
detector: Arc::new(RwLock::new(MarineDetectorState::new())),
audio_monitor: Arc::new(RwLock::new(AudioMonitor::new())),
is_voice_active: Arc::new(RwLock::new(false)),
state_callback: Arc::new(RwLock::new(None)),
})
}
/// Process audio samples
pub async fn process_audio(&self, samples: &[f32], sample_rate: u32) -> Result<bool> {
let mut detector = self.detector.write().await;
let mut monitor = self.audio_monitor.write().await;
// Update audio monitor
monitor.update_levels(samples);
// Check if we should evaluate (based on tick rate)
let now = Instant::now();
let tick_duration = Duration::from_secs_f64(1.0 / detector.tick_rate);
if now.duration_since(detector.last_tick) < tick_duration {
return Ok(*self.is_voice_active.read().await);
}
detector.last_tick = now;
// Marine algorithm evaluation
let voice_detected = detector.evaluate_voice(samples, sample_rate, monitor.snr);
// Update state if changed
let mut is_active = self.is_voice_active.write().await;
if voice_detected != *is_active {
*is_active = voice_detected;
            // Call state-change callback. Note: it runs synchronously while
            // the state write lock is held, so long-running work should be
            // spawned (as the rust_shell integration below does)
if let Some(callback) = &*self.state_callback.read().await {
callback(voice_detected);
}
// Log state change
if voice_detected {
println!("🎤 Voice detected - switching to minimal output mode");
detector.voice_onset = Some(now);
} else {
println!("🔇 Voice ended - returning to normal output mode");
detector.voice_offset = Some(now);
}
}
Ok(voice_detected)
}
/// Set callback for voice state changes
pub async fn set_state_callback<F>(&self, callback: F)
where
F: Fn(bool) + Send + Sync + 'static,
{
let mut cb = self.state_callback.write().await;
*cb = Some(Box::new(callback));
}
/// Get current voice activity state
pub async fn is_voice_active(&self) -> bool {
*self.is_voice_active.read().await
}
/// Get voice salience score (0.0 to 1.0)
pub async fn get_salience(&self) -> f64 {
self.detector.read().await.voice_salience
}
/// Get voice quality metrics
pub async fn get_voice_quality(&self) -> VoiceQualityReport {
let detector = self.detector.read().await;
VoiceQualityReport {
salience: detector.voice_salience,
harmonicity: detector.speech_detector.voice_quality.harmonicity,
spectral_tilt: detector.speech_detector.voice_quality.spectral_tilt,
zero_crossing_rate: detector.speech_detector.voice_quality.zero_crossing_rate,
energy_variance: detector.speech_detector.voice_quality.energy_variance,
}
}
}
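// A minimal usage sketch (not part of the shipped API): feed fixed-size
// frames from a capture loop into the detector. `next_frame()` is a
// hypothetical stand-in for whatever backend supplies audio (cpal, ALSA,
// a ring buffer fed by an OS callback, ...).
//
//     let vad = MarineVAD::new()?;
//     vad.set_state_callback(|speaking| {
//         println!("voice active: {speaking}");
//     })
//     .await;
//     loop {
//         let frame: Vec<f32> = next_frame(); // hypothetical 10-20 ms frame
//         vad.process_audio(&frame, 16_000).await?;
//     }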
impl MarineDetectorState {
fn new() -> Self {
Self {
voice_threshold: -40.0, // -40 dB threshold
tick_rate: 100.0, // 100 Hz evaluation rate
peak_history: VecDeque::with_capacity(100),
period_ema: ExponentialMovingAverage::new(0.1),
amplitude_ema: ExponentialMovingAverage::new(0.05),
speech_detector: SpeechPatternDetector::new(),
voice_salience: 0.0,
last_tick: Instant::now(),
voice_onset: None,
voice_offset: None,
}
}
/// Evaluate voice presence using marine algorithm
    fn evaluate_voice(&mut self, samples: &[f32], sample_rate: u32, snr: f64) -> bool {
        // Empty frames would make the mean-power divide below yield NaN and
        // permanently poison the salience; keep the previous decision instead
        if samples.is_empty() {
            return self.voice_salience > 0.5;
        }
        // Mean power -> RMS -> dBFS (floor the RMS so log10 never sees zero)
        let mean_square: f64 =
            samples.iter().map(|&s| (s as f64).powi(2)).sum::<f64>() / samples.len() as f64;
        let rms = mean_square.sqrt().max(1e-10);
        let db = 20.0 * rms.log10();
// Update amplitude tracking
self.amplitude_ema.update(rms);
// Check against threshold
if db < self.voice_threshold {
self.voice_salience *= 0.9; // Decay salience
return false;
}
// Analyze for speech patterns
let has_speech_pattern = self.speech_detector.analyze(samples, sample_rate);
// Calculate salience score
let mut salience = 0.0;
// Energy contribution (30%)
let energy_score = ((db - self.voice_threshold) / 20.0).clamp(0.0, 1.0);
salience += energy_score * 0.3;
// SNR contribution (20%)
let snr_score = (snr / 20.0).clamp(0.0, 1.0);
salience += snr_score * 0.2;
// Speech pattern contribution (50%)
if has_speech_pattern {
salience += 0.5;
}
// Update salience with smoothing
self.voice_salience = 0.7 * salience + 0.3 * self.voice_salience;
// Voice detected if salience > 0.5
self.voice_salience > 0.5
}
}
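// A back-of-the-envelope check of the energy term in evaluate_voice,
// assuming a moderately loud frame: a 0.5-amplitude sine has RMS
// 0.5 / sqrt(2) ≈ 0.354, about -9 dBFS, which clears the -40 dB threshold
// and saturates the 0.3-weight energy contribution.
#[cfg(test)]
mod salience_math {
    #[test]
    fn energy_score_saturates_for_loud_frames() {
        let rms: f64 = 0.5 / 2.0f64.sqrt();
        let db = 20.0 * rms.log10();
        assert!(db > -10.0 && db < -8.0);
        // Same formula as evaluate_voice: clamps to 1.0 this far above -40 dB
        let energy_score = ((db - (-40.0)) / 20.0).clamp(0.0, 1.0);
        assert_eq!(energy_score, 1.0);
    }
}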
impl SpeechPatternDetector {
fn new() -> Self {
Self {
f0_min: 80.0,
f0_max: 400.0,
formant_tracker: FormantTracker {
f1_range: (200.0, 1000.0),
f2_range: (500.0, 2500.0),
f3_range: (1500.0, 3500.0),
},
syllable_detector: SyllableRateDetector {
energy_envelope: VecDeque::with_capacity(100),
peak_times: VecDeque::with_capacity(20),
min_syllable_gap: Duration::from_millis(100),
max_syllable_gap: Duration::from_millis(500),
},
voice_quality: VoiceQuality {
harmonicity: 0.0,
spectral_tilt: 0.0,
zero_crossing_rate: 0.0,
energy_variance: 0.0,
},
}
}
fn analyze(&mut self, samples: &[f32], sample_rate: u32) -> bool {
// Simple zero-crossing rate for voiced/unvoiced detection
let mut zero_crossings = 0;
for i in 1..samples.len() {
if samples[i - 1] * samples[i] < 0.0 {
zero_crossings += 1;
}
}
let zcr = zero_crossings as f64 / samples.len() as f64;
self.voice_quality.zero_crossing_rate = zcr;
// Voiced speech has lower ZCR (< 0.3), unvoiced has higher
let is_voiced = zcr < 0.3;
        // Rough pitch estimate from ZCR: a tone at f Hz crosses zero about
        // 2f times per second, so f ≈ zcr * sample_rate / 2
        let estimated_freq = zcr * sample_rate as f64 / 2.0;
        // Allow up to 10x the F0 ceiling so harmonic-rich voiced frames pass
        let in_speech_range = estimated_freq >= self.f0_min && estimated_freq <= self.f0_max * 10.0;
is_voiced && in_speech_range
}
}
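// Sanity check of the ZCR frequency estimate, assuming a clean sine:
// a 200 Hz tone crosses zero ~400 times per second, so at 16 kHz the
// per-sample ZCR is ~0.025 and the estimate recovers roughly 200 Hz.
#[cfg(test)]
mod zcr_estimate {
    #[test]
    fn sine_frequency_recovered_from_zcr() {
        let sample_rate = 16_000u32;
        let freq = 200.0f64;
        let samples: Vec<f32> = (0..1600)
            .map(|i| {
                (2.0 * std::f64::consts::PI * freq * i as f64 / sample_rate as f64).sin() as f32
            })
            .collect();
        let crossings = samples.windows(2).filter(|w| w[0] * w[1] < 0.0).count();
        let zcr = crossings as f64 / samples.len() as f64;
        let estimated = zcr * sample_rate as f64 / 2.0;
        assert!((estimated - freq).abs() < 20.0);
    }
}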
impl AudioMonitor {
fn new() -> Self {
Self {
current_level: 0.0,
peak_level: 0.0,
noise_floor: -60.0, // Start with -60 dB assumption
snr: 0.0,
source: AudioSource::Microphone,
}
}
    fn update_levels(&mut self, samples: &[f32]) {
        // Skip empty frames to avoid a divide-by-zero NaN in the RMS
        if samples.is_empty() {
            return;
        }
        // Calculate RMS
        let sum_squares: f32 = samples.iter().map(|&s| s * s).sum();
        let rms = (sum_squares / samples.len() as f32).sqrt();
        self.current_level = rms as f64;
// Find peak
let peak = samples.iter().map(|&s| s.abs()).fold(0.0f32, f32::max) as f64;
self.peak_level = peak;
        // Update noise floor estimate (slow 1% adaptation). Caveat: the floor
        // keeps adapting while speech is present, so long utterances slowly
        // raise it; a production VAD would freeze adaptation during voice
        if rms > 0.0 {
let db = 20.0 * (rms as f64).log10();
self.noise_floor = 0.99 * self.noise_floor + 0.01 * db;
self.snr = db - self.noise_floor;
}
}
}
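// A quick numeric check of the SNR update above: with the floor near its
// -60 dB starting point, a constant frame at -20 dBFS yields roughly
// 40 dB of SNR (the 1% adaptation barely moves the floor in one frame).
#[cfg(test)]
mod snr_math {
    use super::*;

    #[test]
    fn snr_reflects_level_above_floor() {
        let mut monitor = AudioMonitor::new();
        // Constant 0.1-amplitude frame: RMS = 0.1, i.e. -20 dBFS
        let samples = vec![0.1f32; 1600];
        monitor.update_levels(&samples);
        // Floor moves to 0.99 * -60 + 0.01 * -20 = -59.6, so SNR ≈ 39.6 dB
        assert!((monitor.snr - 40.0).abs() < 2.0);
    }
}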
/// Voice quality report
#[derive(Debug, Clone)]
pub struct VoiceQualityReport {
pub salience: f64,
pub harmonicity: f64,
pub spectral_tilt: f64,
pub zero_crossing_rate: f64,
pub energy_variance: f64,
}
/// Integration with rust_shell
impl super::rust_shell::RustShell {
/// Enable VAD with marine algorithm
pub async fn enable_marine_vad(&self) -> Result<()> {
println!("🎖️ Enabling Marine VAD - Semper Fi to voice detection!");
let vad = MarineVAD::new()?;
// Set callback to adjust verbosity
let output_mode = self.output_mode.clone();
vad.set_state_callback(move |is_voice| {
// This would be called when voice state changes
let mode = output_mode.clone();
tokio::spawn(async move {
let mut m = mode.write().await;
if is_voice {
m.verbosity = super::rust_shell::VerbosityLevel::Minimal;
m.format = super::rust_shell::OutputFormat::Voice;
} else {
m.verbosity = super::rust_shell::VerbosityLevel::Normal;
m.format = super::rust_shell::OutputFormat::Text;
}
});
})
.await;
        // Store the VAD so it outlives this call; without a field on
        // RustShell it is dropped here (see the sketch below)
        // self.vad = Some(vad);
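        // A sketch of the missing wiring, assuming RustShell grows an
        // optional field for the detector (hypothetical; not yet in the
        // struct definition):
        //
        //     pub struct RustShell {
        //         // ...existing fields...
        //         vad: tokio::sync::RwLock<Option<MarineVAD>>,
        //     }
        //
        // enable_marine_vad() would then end with
        //     *self.vad.write().await = Some(vad);
        // so the detector keeps running instead of being dropped here.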
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn test_marine_vad_creation() {
let vad = MarineVAD::new();
assert!(vad.is_ok());
}
#[tokio::test]
async fn test_voice_detection() {
let vad = MarineVAD::new().unwrap();
// Create test signal (sine wave at 200 Hz - typical voice F0)
let sample_rate = 16000;
let frequency = 200.0;
let duration = 0.1; // 100ms
let num_samples = (sample_rate as f64 * duration) as usize;
let mut samples = vec![0.0f32; num_samples];
        for (i, sample) in samples.iter_mut().enumerate() {
let t = i as f64 / sample_rate as f64;
*sample = (2.0 * std::f64::consts::PI * frequency * t).sin() as f32 * 0.5;
}
        // Process audio
        let _is_voice = vad.process_audio(&samples, sample_rate).await.unwrap();
        // No assertion: a single frame can land inside the 10 ms tick gate
        // and be skipped, so one call is not deterministic. A streamed,
        // multi-frame test would be needed to assert detection reliably.
}
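    #[tokio::test]
    async fn test_silence_rejection() {
        let vad = MarineVAD::new().unwrap();
        // All-zero frame: whether the tick gate skips it or evaluate_voice
        // runs, pure silence must never report active voice
        let samples = vec![0.0f32; 1600];
        let is_voice = vad.process_audio(&samples, 16000).await.unwrap();
        assert!(!is_voice);
    }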
}