MCP Gateway

kill_switch.rs•48.3 KiB

//! Production safety — kill switches and error budgets for backend servers. //! //! Provides three complementary mechanisms: //! //! - **Kill switch** (`KillSwitch`): operator-controlled, instant disable/re-enable of //! any backend by name. Changes take effect on the next `gateway_invoke` call. //! //! - **Backend error budget** (`ErrorBudget`): per-backend sliding-window error-rate //! tracker. When a backend exceeds its configured failure threshold it is automatically //! killed. The operator can revive it manually via `gateway_revive_server`. //! //! - **Per-capability error budget**: per-capability sliding-window error-rate tracker. //! When a single capability exceeds its threshold, only that capability is disabled — //! the rest of the backend remains healthy. Disabled capabilities auto-recover after //! a configurable cooldown period (default 5 min). The backend-level kill switch //! is a secondary safeguard that fires when cumulative errors exceed its threshold. //! //! Both the backend kill set and the disabled-capability map are backed by lock-free //! `DashMap`/`DashSet` structures so the read hot-path (every `gateway_invoke`) is //! contention-free. use std::collections::VecDeque; use std::sync::Arc; use std::time::{Duration, Instant}; use dashmap::DashMap; use dashmap::DashSet; use tracing::{info, warn}; // ============================================================================ // Kill switch // ============================================================================ /// Operator-controlled kill switch for backend servers with per-capability /// error budgets. /// /// Backed by lock-free `DashSet`/`DashMap` structures for contention-free /// reads on every `gateway_invoke` call. Writes (kill/revive) are rare. #[derive(Debug, Default)] pub struct KillSwitch { /// Set of backend server names that are currently disabled. killed: DashSet<String>, /// Per-backend error budgets (sliding window). 
budgets: DashMap<String, Arc<parking_lot::Mutex<BudgetWindow>>>, /// Per-capability error budgets. /// /// Key: `"{backend}:{capability}"`. Each entry tracks the sliding-window /// error rate for a single capability independently of its backend. capability_budgets: DashMap<String, Arc<parking_lot::Mutex<BudgetWindow>>>, /// Disabled capabilities with the `Instant` at which they were disabled. /// /// Key: `"{backend}:{capability}"`. Capabilities are re-enabled /// automatically once the cooldown period elapses. disabled_capabilities: DashMap<String, Instant>, } impl KillSwitch { /// Create a new kill switch with no servers or capabilities disabled. #[must_use] pub fn new() -> Self { Self { killed: DashSet::new(), budgets: DashMap::new(), capability_budgets: DashMap::new(), disabled_capabilities: DashMap::new(), } } // ── Operator control ────────────────────────────────────────────────────── /// Immediately disable routing to `server`. /// /// Idempotent — calling this on an already-killed server is a no-op. pub fn kill(&self, server: &str) { if self.killed.insert(server.to_string()) { warn!(server = server, "Kill switch engaged: server disabled"); } } /// Re-enable routing to `server`. /// /// Idempotent — calling this on an already-live server is a no-op. /// Also resets the error-budget window so the backend gets a clean slate. pub fn revive(&self, server: &str) { if self.killed.remove(server).is_some() { info!(server = server, "Kill switch released: server re-enabled"); } // Reset the budget window so the revived server starts fresh. if let Some(budget) = self.budgets.get(server) { budget.lock().reset(); } } /// Returns `true` when `server` is currently disabled. #[must_use] #[inline] pub fn is_killed(&self, server: &str) -> bool { self.killed.contains(server) } /// Returns the set of currently-killed server names (snapshot). 
#[must_use] pub fn killed_servers(&self) -> Vec<String> { self.killed.iter().map(|s| s.clone()).collect() } // ── Backend error budget ────────────────────────────────────────────────── /// Record a successful call for `server`. /// /// Only updates the budget window; does not change kill state. pub fn record_success(&self, server: &str, window_size: usize, window_duration: Duration) { self.get_or_create_budget(server, window_size, window_duration) .lock() .record(true); } /// Record a failed call for `server`, auto-killing when the budget is exhausted. /// /// The kill switch is **not** evaluated until the window contains at least /// `min_samples` calls. This prevents a single early failure from killing a /// backend before enough data has been collected. /// /// Returns `true` when this failure triggered a new auto-kill. pub fn record_failure( &self, server: &str, window_size: usize, window_duration: Duration, threshold: f64, min_samples: usize, ) -> bool { let budget = self.get_or_create_budget(server, window_size, window_duration); let mut window = budget.lock(); window.record(false); // Do not evaluate until we have enough data. let (successes, failures) = window.counts(); let total = successes + failures; if total < min_samples { return false; } let rate = window.error_rate(); let usage_fraction = rate / threshold; if (0.8..1.0).contains(&usage_fraction) { warn!( server = server, error_rate = rate, threshold = threshold, "Error budget at 80% — approaching auto-kill threshold" ); } if rate >= threshold && !self.is_killed(server) { warn!( server = server, error_rate = rate, threshold = threshold, "Error budget exhausted — auto-killing server" ); self.killed.insert(server.to_string()); return true; } false } /// Retrieve the current error rate (0.0–1.0) for `server`. /// /// Returns `0.0` when no calls have been recorded yet. 
#[must_use] pub fn error_rate(&self, server: &str) -> f64 { self.budgets .get(server) .map_or(0.0, |b| b.lock().error_rate()) } /// Retrieve the window call counts for `server` as `(successes, failures)`. #[must_use] pub fn window_counts(&self, server: &str) -> (usize, usize) { self.budgets .get(server) .map_or((0, 0), |b| b.lock().counts()) } // ── Per-capability error budget ─────────────────────────────────────────── /// Build the composite key used for per-capability budget and disabled maps. fn capability_key(backend: &str, capability: &str) -> String { format!("{backend}:{capability}") } /// Returns `true` when `capability` on `backend` is currently disabled. /// /// Does **not** perform cooldown-based auto-recovery. Use /// [`is_capability_disabled_with_cooldown`] on the invocation hot-path to /// trigger transparent recovery when the cooldown has elapsed. #[must_use] pub fn is_capability_disabled(&self, backend: &str, capability: &str) -> bool { let key = Self::capability_key(backend, capability); self.is_capability_key_disabled(&key, None) } /// Returns `true` when `capability` on `backend` is currently disabled, /// performing transparent auto-recovery if `cooldown` has elapsed. /// /// This is the preferred method for the hot-path invocation check because /// it combines the disabled test with the auto-recovery sweep in one call. #[must_use] pub fn is_capability_disabled_with_cooldown( &self, backend: &str, capability: &str, cooldown: Duration, ) -> bool { let key = Self::capability_key(backend, capability); self.is_capability_key_disabled(&key, Some(cooldown)) } /// Record a successful call for `(backend, capability)`. /// /// `cfg` supplies window sizing and cooldown configuration. /// Also triggers transparent auto-recovery if the capability's cooldown /// has elapsed since it was auto-disabled. 
pub fn record_capability_success( &self, backend: &str, capability: &str, cfg: &CapabilityErrorBudgetConfig, ) { let key = Self::capability_key(backend, capability); // Trigger auto-recovery check on success path too. self.is_capability_key_disabled(&key, Some(cfg.cooldown)); self.get_or_create_capability_budget(&key, cfg.window_size, cfg.window_duration) .lock() .record(true); } /// Record a failed call for `(backend, capability)`, auto-disabling the /// capability when its budget is exhausted. /// /// `cfg` supplies threshold, window sizing, min-samples, and cooldown. /// /// Returns `true` when this failure triggered a new auto-disable. /// The backend-level budget is unaffected — callers must still call /// [`record_failure`] separately to update the backend budget. pub fn record_capability_failure( &self, backend: &str, capability: &str, cfg: &CapabilityErrorBudgetConfig, ) -> bool { let key = Self::capability_key(backend, capability); // Check if it is already disabled (respecting cooldown). if self.is_capability_key_disabled(&key, Some(cfg.cooldown)) { // Record the failure in the window even while disabled so that // the error rate reflects reality when it recovers. 
self.get_or_create_capability_budget(&key, cfg.window_size, cfg.window_duration) .lock() .record(false); return false; } let budget = self.get_or_create_capability_budget(&key, cfg.window_size, cfg.window_duration); let mut window = budget.lock(); window.record(false); let (successes, failures) = window.counts(); let total = successes + failures; if total < cfg.min_samples { return false; } let rate = window.error_rate(); let usage_fraction = rate / cfg.threshold; if (0.8..1.0).contains(&usage_fraction) { warn!( capability = key, error_rate = rate, threshold = cfg.threshold, "Capability error budget at 80% — approaching auto-disable threshold" ); } if rate >= cfg.threshold { warn!( capability = key, error_rate = rate, threshold = cfg.threshold, "Capability error budget exhausted — auto-disabling capability" ); drop(window); self.disabled_capabilities.insert(key, Instant::now()); return true; } false } /// Revive a previously disabled capability and reset its budget window. pub fn revive_capability(&self, backend: &str, capability: &str) { let key = Self::capability_key(backend, capability); if self.disabled_capabilities.remove(&key).is_some() { info!( capability = key, "Capability revived — re-enabled by operator" ); } if let Some(budget) = self.capability_budgets.get(&key) { budget.lock().reset(); } } /// List all currently disabled capabilities as `"backend:capability"` keys. /// /// Entries whose cooldown has expired are transparently removed (auto-recovery) /// before the list is returned. Callers receive only genuinely disabled entries. #[must_use] pub fn disabled_capabilities(&self, cooldown: Duration) -> Vec<String> { // First, purge any expired entries. 
let expired: Vec<String> = self .disabled_capabilities .iter() .filter(|e| e.value().elapsed() >= cooldown) .map(|e| e.key().clone()) .collect(); for key in &expired { self.disabled_capabilities.remove(key); if let Some(budget) = self.capability_budgets.get(key) { budget.lock().reset(); } info!( capability = key, "Capability auto-recovered after cooldown (list sweep)" ); } self.disabled_capabilities .iter() .map(|e| e.key().clone()) .collect() } /// Retrieve the current error rate (0.0–1.0) for a capability. #[must_use] pub fn capability_error_rate(&self, backend: &str, capability: &str) -> f64 { let key = Self::capability_key(backend, capability); self.capability_budgets .get(&key) .map_or(0.0, |b| b.lock().error_rate()) } /// Retrieve the window call counts for a capability as `(successes, failures)`. #[must_use] pub fn capability_window_counts(&self, backend: &str, capability: &str) -> (usize, usize) { let key = Self::capability_key(backend, capability); self.capability_budgets .get(&key) .map_or((0, 0), |b| b.lock().counts()) } // ── Internal helpers ────────────────────────────────────────────────────── fn get_or_create_budget( &self, server: &str, window_size: usize, window_duration: Duration, ) -> Arc<parking_lot::Mutex<BudgetWindow>> { self.budgets .entry(server.to_string()) .or_insert_with(|| { Arc::new(parking_lot::Mutex::new(BudgetWindow::new( window_size, window_duration, ))) }) .clone() } fn get_or_create_capability_budget( &self, key: &str, window_size: usize, window_duration: Duration, ) -> Arc<parking_lot::Mutex<BudgetWindow>> { self.capability_budgets .entry(key.to_string()) .or_insert_with(|| { Arc::new(parking_lot::Mutex::new(BudgetWindow::new( window_size, window_duration, ))) }) .clone() } /// Check whether a capability key is disabled, optionally auto-recovering. 
fn is_capability_key_disabled(&self, key: &str, cooldown: Option<Duration>) -> bool { if let Some(entry) = self.disabled_capabilities.get(key) { if let Some(cd) = cooldown { if entry.elapsed() >= cd { // Cooldown elapsed — auto-recover. drop(entry); self.disabled_capabilities.remove(key); if let Some(budget) = self.capability_budgets.get(key) { budget.lock().reset(); } info!(capability = key, "Capability auto-recovered after cooldown"); return false; } } return true; } false } } // ============================================================================ // Sliding-window error budget // ============================================================================ /// Sliding-window call tracker for error-rate computation. /// /// Maintains up to `max_calls` entries OR entries younger than `max_age`. /// Old entries are evicted lazily on each `record` call. #[derive(Debug)] pub(crate) struct BudgetWindow { /// Ring buffer of `(timestamp, success)` pairs. entries: VecDeque<(Instant, bool)>, /// Maximum number of entries to retain. max_calls: usize, /// Maximum age of entries before they are evicted. max_age: Duration, } impl BudgetWindow { /// Create a new window. pub fn new(max_calls: usize, max_age: Duration) -> Self { Self { entries: VecDeque::with_capacity(max_calls.min(4096)), max_calls, max_age, } } /// Record a call outcome and evict expired entries. pub fn record(&mut self, success: bool) { self.evict_old(); self.entries.push_back((Instant::now(), success)); // Enforce size cap if self.entries.len() > self.max_calls { self.entries.pop_front(); } } /// Compute the error rate over all valid entries (0.0–1.0). pub fn error_rate(&mut self) -> f64 { self.evict_old(); let total = self.entries.len(); if total == 0 { return 0.0; } let failures = self.entries.iter().filter(|(_, ok)| !ok).count(); #[allow(clippy::cast_precision_loss)] let rate = failures as f64 / total as f64; rate } /// Return `(successes, failures)` counts after eviction. 
pub fn counts(&mut self) -> (usize, usize) { self.evict_old(); let failures = self.entries.iter().filter(|(_, ok)| !ok).count(); let successes = self.entries.len() - failures; (successes, failures) } /// Clear all entries (used on revive). pub fn reset(&mut self) { self.entries.clear(); } /// Remove entries older than `max_age`. fn evict_old(&mut self) { let now = Instant::now(); while let Some((ts, _)) = self.entries.front() { if now.duration_since(*ts) > self.max_age { self.entries.pop_front(); } else { break; } } } } // ============================================================================ // Error budget configuration // ============================================================================ /// Configuration for the per-backend error budget. #[derive(Debug, Clone)] pub struct ErrorBudgetConfig { /// Failure rate threshold that triggers auto-kill (0.0–1.0). /// /// Default: `0.8` (80% failure rate). A backend must sustain a very high /// error rate before being auto-killed, preventing single-capability /// failures on large backends (e.g. fulcrum's 234 tools) from killing the /// entire server. pub threshold: f64, /// Number of calls in the sliding window. /// /// Default: `100`. pub window_size: usize, /// Maximum age of calls in the sliding window. /// /// Default: 5 minutes. pub window_duration: Duration, /// Minimum number of calls in the window before the kill switch is /// evaluated. /// /// Default: `10`. Prevents a single early failure from triggering an /// auto-kill before enough samples have accumulated. pub min_samples: usize, } impl Default for ErrorBudgetConfig { fn default() -> Self { Self { threshold: 0.8, window_size: 100, window_duration: Duration::from_secs(5 * 60), min_samples: 10, } } } /// Configuration for the per-capability error budget. /// /// Capabilities operate in the same backend but are tracked independently. 
/// When a single capability exceeds its threshold only that capability is
/// disabled; the backend and its other capabilities remain healthy.
///
/// Auto-recovery: after `cooldown` has elapsed since the capability was
/// disabled, the next call to it automatically re-enables it and resets its
/// window — no operator action required.
#[derive(Debug, Clone)]
pub struct CapabilityErrorBudgetConfig {
    /// Failure rate threshold that triggers per-capability auto-disable (0.0–1.0).
    ///
    /// Default: `0.8` (80% failure rate). Matches the backend-level default.
    pub threshold: f64,
    /// Number of calls in the per-capability sliding window.
    ///
    /// Default: `50`. Smaller than the backend window to detect failing
    /// capabilities faster.
    pub window_size: usize,
    /// Maximum age of calls in the per-capability sliding window.
    ///
    /// Default: 5 minutes.
    pub window_duration: Duration,
    /// Minimum number of calls before the per-capability budget is evaluated.
    ///
    /// Default: `5`. Lower than the backend default since individual
    /// capabilities receive fewer calls.
    pub min_samples: usize,
    /// How long a disabled capability stays offline before auto-recovering.
    ///
    /// Default: 5 minutes. After this period the capability is transparently
    /// re-enabled on its next invocation so transient outages heal themselves.
    pub cooldown: Duration,
}

impl Default for CapabilityErrorBudgetConfig {
    /// Returns the documented defaults: threshold `0.8`, a 50-call window
    /// bounded to 5 minutes, `min_samples = 5`, and a 5-minute cooldown.
    fn default() -> Self {
        Self {
            threshold: 0.8,
            window_size: 50,
            window_duration: Duration::from_secs(5 * 60),
            min_samples: 5,
            cooldown: Duration::from_secs(5 * 60),
        }
    }
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    // ── KillSwitch::kill / revive / is_killed ────────────────────────────────

    #[test]
    fn kill_server_marks_it_as_killed() {
        // GIVEN: a fresh kill switch
        let ks = KillSwitch::new();
        // WHEN: a server is killed
        ks.kill("backend-a");
        // THEN: it reports as killed
        assert!(ks.is_killed("backend-a"));
    }

    #[test]
    fn revive_server_unmarks_it() {
        // GIVEN: a killed server
        let ks = KillSwitch::new();
        ks.kill("backend-a");
        // WHEN: it is revived
        ks.revive("backend-a");
        // THEN: it is no longer killed
        assert!(!ks.is_killed("backend-a"));
    }

    #[test]
    fn kill_is_idempotent() {
        let ks = KillSwitch::new();
        ks.kill("srv");
        ks.kill("srv"); // second call must not panic
        assert!(ks.is_killed("srv"));
    }

    #[test]
    fn revive_is_idempotent() {
        let ks = KillSwitch::new();
        ks.revive("srv"); // reviving a live server must not panic
        assert!(!ks.is_killed("srv"));
    }

    #[test]
    fn unknown_server_is_not_killed() {
        let ks = KillSwitch::new();
        assert!(!ks.is_killed("nonexistent"));
    }

    #[test]
    fn killed_servers_returns_snapshot() {
        let ks = KillSwitch::new();
        ks.kill("a");
        ks.kill("b");
        // Sort because the snapshot order is not relied upon by this test.
        let mut killed = ks.killed_servers();
        killed.sort();
        assert_eq!(killed, vec!["a", "b"]);
    }

    #[test]
    fn killed_servers_empty_when_none_killed() {
        let ks = KillSwitch::new();
        assert!(ks.killed_servers().is_empty());
    }

    // ── Error budget: auto-kill ──────────────────────────────────────────────

    /// Shared test helper: `min_samples = 1` lets tests exercise auto-kill
    /// without needing to accumulate a minimum window of calls first.
    const NO_MIN: usize = 1;

    // ── Capability budget test helpers ───────────────────────────────────────

    /// Build a `CapabilityErrorBudgetConfig` from raw test parameters.
    fn cap_cfg(
        window_size: usize,
        window_duration: Duration,
        threshold: f64,
        min_samples: usize,
        cooldown: Duration,
    ) -> CapabilityErrorBudgetConfig {
        CapabilityErrorBudgetConfig {
            threshold,
            window_size,
            window_duration,
            min_samples,
            cooldown,
        }
    }

    /// Build a capability budget config with `min_samples = 1` (no guard).
    fn cap_cfg_no_min(
        window_size: usize,
        window_duration: Duration,
        threshold: f64,
        cooldown: Duration,
    ) -> CapabilityErrorBudgetConfig {
        cap_cfg(window_size, window_duration, threshold, 1, cooldown)
    }

    #[test]
    fn auto_kill_triggers_at_threshold() {
        // GIVEN: window of 4 calls, threshold 0.5, min_samples=1
        let ks = KillSwitch::new();
        let (size, dur, thresh) = (4, Duration::from_secs(300), 0.5);

        // WHEN: 2 successes then 2 failures (50% error rate == threshold)
        ks.record_success("srv", size, dur);
        ks.record_success("srv", size, dur);
        let triggered1 = ks.record_failure("srv", size, dur, thresh, NO_MIN);
        let triggered2 = ks.record_failure("srv", size, dur, thresh, NO_MIN);

        // THEN: second failure tips rate to 50% → auto-kill; first does not
        assert!(!triggered1, "first failure should not yet trigger auto-kill");
        assert!(triggered2, "second failure should trigger auto-kill");
        assert!(ks.is_killed("srv"));
    }

    #[test]
    fn no_auto_kill_below_threshold() {
        // GIVEN: window of 10 calls, threshold 0.5, min_samples=1
        let ks = KillSwitch::new();
        let (size, dur, thresh) = (10, Duration::from_secs(300), 0.5);

        // WHEN: 6 successes + 4 failures (40% error rate < 50%)
        for _ in 0..6 {
            ks.record_success("srv", size, dur);
        }
        for _ in 0..4 {
            ks.record_failure("srv", size, dur, thresh, NO_MIN);
        }

        // THEN: server is NOT killed
        assert!(!ks.is_killed("srv"), "40% error rate should not trigger kill");
    }

    #[test]
    fn auto_kill_does_not_fire_twice() {
        // GIVEN: window of 2, threshold 0.5, min_samples=1
        let ks = KillSwitch::new();
        let (size, dur, thresh) = (2, Duration::from_secs(300), 0.5);

        // First failure: rate=100% >= 50% → auto-kills
        let triggered1 = ks.record_failure("srv", size, dur, thresh, NO_MIN);
        assert!(triggered1, "first failure should trigger auto-kill (100% error rate)");
        assert!(ks.is_killed("srv"));

        // Second failure: server already killed, must NOT re-trigger
        let triggered2 = ks.record_failure("srv", size, dur, thresh, NO_MIN);
        assert!(!triggered2, "already-killed server must not re-trigger");

        // Third failure: still must not re-trigger
        let triggered3 = ks.record_failure("srv", size, dur, thresh, NO_MIN);
        assert!(!triggered3, "already-killed server must not re-trigger on 3rd call");
    }

    #[test]
    fn revive_resets_error_budget() {
        // GIVEN: server auto-killed by budget (min_samples=1, threshold=0.5)
        let ks = KillSwitch::new();
        let thresh = 0.5;
        let (size, dur) = (4, Duration::from_secs(300));

        // Two failures → 100% error rate → auto-kill
        ks.record_failure("srv", size, dur, thresh, NO_MIN);
        ks.record_failure("srv", size, dur, thresh, NO_MIN);
        assert!(ks.is_killed("srv"), "should be auto-killed");

        // WHEN: revived
        ks.revive("srv");
        assert!(!ks.is_killed("srv"), "should be alive after revive");

        // THEN: 3 successes followed by 1 failure → 25% error rate < threshold
        ks.record_success("srv", size, dur);
        ks.record_success("srv", size, dur);
        ks.record_success("srv", size, dur);
        let triggered = ks.record_failure("srv", size, dur, thresh, NO_MIN);
        assert!(!triggered, "25% error rate after revive must not trigger auto-kill");
        assert!(!ks.is_killed("srv"), "server must remain alive");
    }

    // ── min_samples guard ────────────────────────────────────────────────────

    #[test]
    fn min_samples_prevents_kill_below_sample_count() {
        // GIVEN: 100% failure rate but only 9 calls (< min_samples=10)
        let ks = KillSwitch::new();
        let (size, dur, thresh, min) = (100, Duration::from_secs(300), 0.8, 10);

        for _ in 0..9 {
            let triggered = ks.record_failure("srv", size, dur, thresh, min);
            assert!(!triggered, "kill must not fire before min_samples reached");
        }

        // THEN: server is alive despite 100% error rate
        assert!(!ks.is_killed("srv"), "should not be killed before min_samples");
    }

    #[test]
    fn min_samples_allows_kill_once_sample_count_reached() {
        // GIVEN: 90% failure rate, min_samples=10
        let ks = KillSwitch::new();
        let (size, dur, thresh, min) = (100, Duration::from_secs(300), 0.8, 10);

        // 1 success + 9 failures → window has exactly 10 samples at 90% error rate
        ks.record_success("srv", size, dur);
        for i in 0..9usize {
            let triggered = ks.record_failure("srv", size, dur, thresh, min);
            if i < 8 {
                // Total samples still < 10 after first 8 failures (1 success + 8 failures = 9)
                assert!(
                    !triggered,
                    "kill must not fire before min_samples reached (iteration {i})"
                );
            } else {
                // 10th sample: 9/10 = 90% >= 80% threshold → auto-kill
                assert!(triggered, "kill must fire at min_samples when threshold exceeded");
            }
        }
        assert!(ks.is_killed("srv"));
    }

    #[test]
    fn min_samples_one_is_equivalent_to_no_guard() {
        // GIVEN: min_samples=1 — a single failure at 100% rate must auto-kill immediately
        let ks = KillSwitch::new();
        let (size, dur, thresh) = (100, Duration::from_secs(300), 0.5);

        let triggered = ks.record_failure("srv", size, dur, thresh, 1);
        assert!(triggered, "single failure with min_samples=1 must trigger kill");
        assert!(ks.is_killed("srv"));
    }

    // ── Default threshold is 0.8, not 0.5 ───────────────────────────────────

    #[test]
    fn default_threshold_does_not_kill_at_50_percent() {
        // GIVEN: default threshold (0.8) with min_samples=10
        let cfg = ErrorBudgetConfig::default();
        let ks = KillSwitch::new();

        // Fill window with exactly 50% failures (5 out of 10)
        for _ in 0..5 {
            ks.record_success("srv", cfg.window_size, cfg.window_duration);
        }
        for _ in 0..5 {
            ks.record_failure(
                "srv",
                cfg.window_size,
                cfg.window_duration,
                cfg.threshold,
                cfg.min_samples,
            );
        }

        // 50% error rate is below 80% default threshold
        assert!(
            !ks.is_killed("srv"),
            "50% error rate must not trigger kill at default 0.8 threshold"
        );
    }

    // ── Error budget: error_rate / window_counts ─────────────────────────────

    #[test]
    fn error_rate_zero_with_no_calls() {
        let ks = KillSwitch::new();
        assert!(ks.error_rate("unknown") < f64::EPSILON);
    }

    #[test]
    fn error_rate_computed_correctly() {
        let ks = KillSwitch::new();
        let (size, dur) = (10, Duration::from_secs(300));

        ks.record_success("srv", size, dur);
        ks.record_success("srv", size, dur);
        // threshold=1.0 ensures auto-kill can never trigger; min=1 is irrelevant here
        ks.record_failure("srv", size, dur, 1.0, 1);

        let rate = ks.error_rate("srv");
        assert!((rate - 1.0 / 3.0).abs() < 1e-10, "expected 33% error rate");
    }

    #[test]
    fn window_counts_returns_successes_and_failures() {
        let ks = KillSwitch::new();
        let (size, dur) = (100, Duration::from_secs(300));

        for _ in 0..3 {
            ks.record_success("srv", size, dur);
        }
        ks.record_failure("srv", size, dur, 1.0, 1);

        let (s, f) = ks.window_counts("srv");
        assert_eq!(s, 3);
        assert_eq!(f, 1);
    }

    // ── BudgetWindow ─────────────────────────────────────────────────────────

    #[test]
    fn budget_window_evicts_when_full() {
        // GIVEN: window of 3
        let mut w = BudgetWindow::new(3, Duration::from_secs(300));
        w.record(true);
        w.record(true);
        w.record(false);
        w.record(false); // this evicts the first entry (success)

        let (s, f) = w.counts();
        assert_eq!(s + f, 3, "window must not exceed max_calls");
    }

    #[test]
    fn budget_window_evicts_expired_entries() {
        // GIVEN: window with 1ms max_age
        let mut w = BudgetWindow::new(100, Duration::from_millis(1));
        w.record(false);

        // Wait for entry to expire
        std::thread::sleep(Duration::from_millis(5));

        w.record(true); // triggers eviction of the expired failure
        let (s, f) = w.counts();
        assert_eq!(f, 0, "expired failure must be evicted");
        assert_eq!(s, 1);
    }

    #[test]
    fn budget_window_reset_clears_all_entries() {
        let mut w = BudgetWindow::new(10, Duration::from_secs(60));
        w.record(false);
        w.record(false);
        w.reset();

        assert!(w.error_rate() < f64::EPSILON);
        let (s, f) = w.counts();
        assert_eq!(s, 0);
        assert_eq!(f, 0);
    }

    // ── ErrorBudgetConfig defaults ────────────────────────────────────────────

    #[test]
    fn error_budget_config_default_values() {
        let cfg = ErrorBudgetConfig::default();
        assert!((cfg.threshold - 0.8).abs() < 1e-10, "default threshold must be 0.8");
        assert_eq!(cfg.window_size, 100);
        assert_eq!(cfg.window_duration, Duration::from_secs(300));
        assert_eq!(cfg.min_samples, 10, "default min_samples must be 10");
    }

    // ── CapabilityErrorBudgetConfig defaults ─────────────────────────────────

    #[test]
    fn capability_error_budget_config_default_values() {
        let cfg = CapabilityErrorBudgetConfig::default();
        assert!((cfg.threshold - 0.8).abs() < 1e-10, "default threshold must be 0.8");
        assert_eq!(cfg.window_size, 50);
        assert_eq!(cfg.window_duration, Duration::from_secs(300));
        assert_eq!(cfg.min_samples, 5, "default min_samples must be 5");
        assert_eq!(cfg.cooldown, Duration::from_secs(300));
    }

    // ── Per-capability: is_capability_disabled ────────────────────────────────

    #[test]
    fn unknown_capability_is_not_disabled() {
        let ks = KillSwitch::new();
        assert!(!ks.is_capability_disabled("fulcrum", "calendar_get_event"));
    }

    #[test]
    fn capability_not_disabled_after_success_only() {
        // GIVEN: only success records
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(50, Duration::from_secs(300), 1.0, Duration::from_secs(300));

        for _ in 0..10 {
            ks.record_capability_success("fulcrum", "calendar_get", &cfg);
        }

        // THEN: capability is not disabled
        assert!(!ks.is_capability_disabled("fulcrum", "calendar_get"));
    }

    // ── Per-capability: single capability failure doesn't kill backend ────────

    #[test]
    fn single_capability_failure_does_not_kill_backend() {
        // GIVEN: a capability with 100% error rate but backend has other successes
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(50, Duration::from_secs(300), 0.8, Duration::from_secs(300));

        // Backend gets many successes from other capabilities (backend-level budget)
        for _ in 0..20 {
            ks.record_success("fulcrum", 100, Duration::from_secs(300));
        }

        // One bad capability fires repeatedly
        for _ in 0..10 {
            ks.record_capability_failure("fulcrum", "broken_tool", &cfg);
        }

        // THEN: backend is alive; only the capability is disabled
        assert!(
            !ks.is_killed("fulcrum"),
            "backend must NOT be killed by a single capability's failures"
        );
        assert!(
            ks.is_capability_disabled("fulcrum", "broken_tool"),
            "broken_tool capability must be disabled"
        );
        // Other capabilities are unaffected
        assert!(
            !ks.is_capability_disabled("fulcrum", "healthy_tool"),
            "unaffected capability must remain enabled"
        );
    }

    // ── Per-capability: auto-disable at threshold ─────────────────────────────

    #[test]
    fn capability_auto_disabled_when_threshold_exceeded() {
        // GIVEN: min_samples=1, threshold=0.5
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(10, Duration::from_secs(300), 0.5, Duration::from_secs(300));

        // 1 success + 1 failure → 50% error rate == threshold → auto-disable
        ks.record_capability_success("fulcrum", "tool_a", &cfg);
        let triggered = ks.record_capability_failure("fulcrum", "tool_a", &cfg);

        assert!(triggered, "50% error rate should trigger auto-disable");
        assert!(ks.is_capability_disabled("fulcrum", "tool_a"));
    }

    #[test]
    fn capability_not_disabled_below_threshold() {
        // GIVEN: 40% error rate < 50% threshold
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(10, Duration::from_secs(300), 0.5, Duration::from_secs(300));

        for _ in 0..6 {
            ks.record_capability_success("fulcrum", "tool_b", &cfg);
        }
        for _ in 0..4 {
            ks.record_capability_failure("fulcrum", "tool_b", &cfg);
        }

        assert!(
            !ks.is_capability_disabled("fulcrum", "tool_b"),
            "40% error rate must not disable capability"
        );
    }

    #[test]
    fn capability_auto_disable_does_not_fire_twice() {
        // GIVEN: already-disabled capability
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(2, Duration::from_secs(300), 0.5, Duration::from_secs(300));

        let first = ks.record_capability_failure("fulcrum", "tool_c", &cfg);
        assert!(first, "first failure (100% rate) should trigger auto-disable");

        let second = ks.record_capability_failure("fulcrum", "tool_c", &cfg);
        assert!(!second, "already-disabled capability must not re-trigger");
    }

    // ── Per-capability: min_samples guard ────────────────────────────────────

    #[test]
    fn capability_min_samples_prevents_disable_below_sample_count() {
        // GIVEN: 100% failure rate but only 4 calls < min_samples=5
        let ks = KillSwitch::new();
        let cfg = cap_cfg(50, Duration::from_secs(300), 0.8, 5, Duration::from_secs(300));

        for _ in 0..4 {
            let triggered = ks.record_capability_failure("fulcrum", "tool_d", &cfg);
            assert!(!triggered, "must not disable before min_samples reached");
        }
        assert!(!ks.is_capability_disabled("fulcrum", "tool_d"));
    }

    #[test]
    fn capability_min_samples_allows_disable_once_reached() {
        // GIVEN: 80% failure rate, min_samples=5
        let ks = KillSwitch::new();
        let cfg = cap_cfg(50, Duration::from_secs(300), 0.8, 5, Duration::from_secs(300));

        // 1 success + 4 failures = 5 samples, 80% error rate == threshold
        ks.record_capability_success("fulcrum", "tool_e", &cfg);
        for i in 0..4usize {
            let triggered = ks.record_capability_failure("fulcrum", "tool_e", &cfg);
            if i < 3 {
                assert!(!triggered, "must not trigger before 5th sample (iteration {i})");
            } else {
                // 5th sample: 4/5 = 80% >= threshold
                assert!(triggered, "must trigger at 5th sample when threshold met");
            }
        }
        assert!(ks.is_capability_disabled("fulcrum", "tool_e"));
    }

    // ── Per-capability: auto-recovery after cooldown ──────────────────────────

    #[test]
    fn capability_auto_recovers_after_cooldown() {
        // GIVEN: a disabled capability with a 10ms cooldown
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(2, Duration::from_secs(300), 0.5, Duration::from_millis(10));

        let triggered = ks.record_capability_failure("fulcrum", "tool_f", &cfg);
        assert!(triggered, "should be disabled immediately");
        // Without cooldown param: confirms it is in the disabled set (no recovery check)
        assert!(ks.is_capability_disabled("fulcrum", "tool_f"));

        // Wait for cooldown to elapse
        std::thread::sleep(Duration::from_millis(20));

        // THEN: capability auto-recovers when checked with the cooldown
        // (the hot-path uses is_capability_disabled_with_cooldown)
        assert!(
            !ks.is_capability_disabled_with_cooldown("fulcrum", "tool_f", cfg.cooldown),
            "capability must auto-recover after cooldown when checked with cooldown param"
        );
        // And now the no-cooldown check also shows it as enabled (entry was purged)
        assert!(
            !ks.is_capability_disabled("fulcrum", "tool_f"),
            "capability must be purged from disabled set after auto-recovery"
        );
    }

    #[test]
    fn capability_does_not_recover_before_cooldown_elapses() {
        // GIVEN: a disabled capability with a 60s cooldown
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(2, Duration::from_secs(300), 0.5, Duration::from_secs(60));

        ks.record_capability_failure("fulcrum", "tool_g", &cfg);
        assert!(ks.is_capability_disabled("fulcrum", "tool_g"));

        // WHEN: checked immediately through the cooldown-aware path. The plain
        // `is_capability_disabled` check performs no recovery, so only the
        // `_with_cooldown` variant can prove the cooldown is being honoured.
        assert!(
            ks.is_capability_disabled_with_cooldown("fulcrum", "tool_g", cfg.cooldown),
            "capability must not recover before cooldown elapses"
        );
        // THEN: the entry must still be present in the disabled set (not purged)
        assert!(
            ks.is_capability_disabled("fulcrum", "tool_g"),
            "capability must remain in the disabled set before cooldown elapses"
        );
    }

    // ── Per-capability: revive_capability ────────────────────────────────────

    #[test]
    fn revive_capability_re_enables_disabled_capability() {
        // GIVEN: a disabled capability
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(2, Duration::from_secs(300), 0.5, Duration::from_secs(300));

        ks.record_capability_failure("fulcrum", "tool_h", &cfg);
        assert!(ks.is_capability_disabled("fulcrum", "tool_h"));

        // WHEN: revived by operator
        ks.revive_capability("fulcrum", "tool_h");

        // THEN: capability is re-enabled
        assert!(
            !ks.is_capability_disabled("fulcrum", "tool_h"),
            "capability must be re-enabled after operator revive"
        );
    }

    #[test]
    fn revive_capability_resets_error_budget() {
        // GIVEN: a revived capability that receives successes
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(4, Duration::from_secs(300), 0.5, Duration::from_secs(300));

        // Disable it by recording 2 failures (100% error rate > 0.5 threshold)
        for _ in 0..2 {
            ks.record_capability_failure("fulcrum", "tool_i", &cfg);
        }
        ks.revive_capability("fulcrum", "tool_i");

        // After revive: 3 successes + 1 failure = 25% error rate (below threshold)
        for _ in 0..3 {
            ks.record_capability_success("fulcrum", "tool_i", &cfg);
        }
        let retrigger = ks.record_capability_failure("fulcrum", "tool_i", &cfg);
        assert!(!retrigger, "25% error rate after revive must not re-trigger");
        assert!(!ks.is_capability_disabled("fulcrum", "tool_i"));
    }

    #[test]
    fn revive_capability_is_idempotent_on_live_capability() {
        let ks = KillSwitch::new();
        // Reviving a capability that was never disabled must not panic
        ks.revive_capability("fulcrum", "never_disabled");
        assert!(!ks.is_capability_disabled("fulcrum", "never_disabled"));
    }

    // ── Per-capability: disabled_capabilities list ────────────────────────────

    #[test]
    fn disabled_capabilities_returns_empty_when_none_disabled() {
        let ks = KillSwitch::new();
        let list = ks.disabled_capabilities(Duration::from_secs(300));
        assert!(list.is_empty());
    }

    #[test]
    fn disabled_capabilities_lists_disabled_entries() {
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(2, Duration::from_secs(300), 0.5, Duration::from_secs(300));

        ks.record_capability_failure("fulcrum", "tool_j", &cfg);
        ks.record_capability_failure("fulcrum", "tool_k", &cfg);

        // Sort because the listing order is not relied upon by this test.
        let mut list = ks.disabled_capabilities(cfg.cooldown);
        list.sort();
        assert_eq!(list, vec!["fulcrum:tool_j", "fulcrum:tool_k"]);
    }

    #[test]
    fn disabled_capabilities_purges_expired_entries_on_list() {
        // GIVEN: two disabled capabilities with a 10ms cooldown
        let ks = KillSwitch::new();
        let cfg = cap_cfg_no_min(2, Duration::from_secs(300), 0.5, Duration::from_millis(10));

        ks.record_capability_failure("fulcrum", "tool_l", &cfg);
        ks.record_capability_failure("fulcrum", "tool_m", &cfg);

        // Both should be disabled immediately (use long cooldown for initial check)
        let before = ks.disabled_capabilities(Duration::from_secs(300));
        assert_eq!(before.len(), 2, "both capabilities should be listed as disabled");

        // Wait for both cooldowns to expire
        std::thread::sleep(Duration::from_millis(20));

        // WHEN: listing with the actual short cooldown
        let after = ks.disabled_capabilities(cfg.cooldown);

        // THEN: both entries are purged (expired), list is empty
        assert!(
            after.is_empty(),
            "expired entries must be purged from the disabled list"
        );
    }

    // ── Per-capability: error_rate / window_counts ────────────────────────────

    #[test]
    fn capability_error_rate_zero_with_no_calls() {
        let ks = KillSwitch::new();
        assert!(ks.capability_error_rate("fulcrum", "unknown") < f64::EPSILON);
    }

    #[test]
    fn capability_window_counts_initial_state() {
        let ks = KillSwitch::new();
        let (s, f) = ks.capability_window_counts("fulcrum", "unknown");
        assert_eq!(s, 0);
        assert_eq!(f, 0);
    }

    #[test]
    fn capability_error_rate_computed_correctly() {
        let ks = KillSwitch::new();
        // threshold=1.0 means auto-disable can never trigger
        let cfg = cap_cfg_no_min(10, Duration::from_secs(300), 1.0, Duration::from_secs(300));

        ks.record_capability_success("srv", "cap", &cfg);
        ks.record_capability_success("srv", "cap", &cfg);
        ks.record_capability_failure("srv", "cap", &cfg);

        let rate = ks.capability_error_rate("srv", "cap");
        assert!(
            (rate - 1.0 / 3.0).abs() < 1e-10,
            "expected 33% error rate, got {rate}"
        );
    }

    // ── Backend-level budget still works as fallback ──────────────────────────

    #[test]
    fn backend_level_budget_still_kills_when_all_capabilities_fail() {
        // GIVEN: many different capabilities all failing — backend threshold exceeded
        let ks = KillSwitch::new();
        let (backend_ws, backend_wd, thresh, min) = (20, Duration::from_secs(300), 0.8, 1_usize);
        let cap_cfg_val =
            cap_cfg_no_min(20, Duration::from_secs(300), 0.8, Duration::from_secs(300));

        // Flood the backend budget with failures (each represents a different
        // capability, so none individually dominates)
        for i in 0..20u32 {
            let cap = format!("tool_{i}");
            // Record on backend budget
            ks.record_failure("fulcrum", backend_ws, backend_wd, thresh, min);
            // Also record on per-capability budget
            ks.record_capability_failure("fulcrum", &cap, &cap_cfg_val);
        }

        // THEN: backend is killed because cumulative error rate exceeds threshold
        assert!(
            ks.is_killed("fulcrum"),
            "backend must be killed when cumulative error rate exceeds backend threshold"
        );
    }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/MikkoParkkola/mcp-gateway'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

kill_switch.rs•48.3 KiB