BioMCP

Overview Schema Related Servers Score Discussions

biomcp
src
cli
benchmark

run.rs•40.6 KiB

use std::collections::{BTreeMap, BTreeSet}; use std::ffi::OsString; use std::fs; use std::path::{Path, PathBuf}; use std::process::Stdio; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use anyhow::{Context, anyhow}; use semver::Version; use sha2::{Digest, Sha256}; use time::OffsetDateTime; use time::format_description::well_known::Rfc3339; use super::types::{ BENCHMARK_SCHEMA_VERSION, BenchmarkCaseKind, BenchmarkCaseStatus, BenchmarkCommandReport, BenchmarkEnvironment, BenchmarkMode, BenchmarkRegression, BenchmarkRunReport, BenchmarkSummary, BenchmarkTransientFailure, }; const SUITE_VERSION: &str = "2026-02-17"; const DEFAULT_LATENCY_THRESHOLD_PCT: f64 = 20.0; const DEFAULT_SIZE_THRESHOLD_PCT: f64 = 10.0; const DEFAULT_MAX_FAIL_FAST_MS: u64 = 1500; const DEFAULT_FULL_ITERATIONS: u32 = 3; const DEFAULT_QUICK_ITERATIONS: u32 = 2; const DEFAULT_FULL_TIMEOUT_MS: u64 = 45_000; const DEFAULT_QUICK_TIMEOUT_MS: u64 = 20_000; #[derive(Debug, Clone)] pub struct RunOptions { pub quick: bool, pub iterations: Option<u32>, pub baseline: Option<PathBuf>, pub fail_on_regression: bool, pub fail_on_transient: bool, pub latency_threshold_pct: f64, pub size_threshold_pct: f64, pub max_fail_fast_ms: u64, } impl Default for RunOptions { fn default() -> Self { Self { quick: false, iterations: None, baseline: None, fail_on_regression: false, fail_on_transient: false, latency_threshold_pct: DEFAULT_LATENCY_THRESHOLD_PCT, size_threshold_pct: DEFAULT_SIZE_THRESHOLD_PCT, max_fail_fast_ms: DEFAULT_MAX_FAIL_FAST_MS, } } } #[derive(Debug, Clone)] pub struct SaveBaselineOptions { pub quick: bool, pub iterations: Option<u32>, pub output: Option<PathBuf>, } impl Default for SaveBaselineOptions { fn default() -> Self { Self { quick: false, iterations: None, output: None, } } } #[derive(Debug, Clone, Copy)] struct RegressionThresholds { latency_pct: f64, size_pct: f64, max_fail_fast_ms: u64, } #[derive(Debug)] struct CommandExecution { latency_ms: f64, stdout_bytes: u64, stderr_excerpt: String, 
exit_code: i32, timed_out: bool, } #[derive(Debug, Clone, Copy)] struct CaseSpec { id: &'static str, kind: BenchmarkCaseKind, args: &'static [&'static str], tags: &'static [&'static str], } const FULL_SUITE: &[CaseSpec] = &[ CaseSpec { id: "get_gene_braf", kind: BenchmarkCaseKind::Success, args: &["get", "gene", "BRAF"], tags: &["core"], }, CaseSpec { id: "get_variant_braf_v600e", kind: BenchmarkCaseKind::Success, args: &["get", "variant", "BRAF V600E"], tags: &["core"], }, CaseSpec { id: "get_trial_nct02576665", kind: BenchmarkCaseKind::Success, args: &["get", "trial", "NCT02576665"], tags: &["core"], }, CaseSpec { id: "search_article_braf_limit_5", kind: BenchmarkCaseKind::Success, args: &["search", "article", "-g", "BRAF", "--limit", "5"], tags: &["core"], }, CaseSpec { id: "get_drug_imatinib", kind: BenchmarkCaseKind::Success, args: &["get", "drug", "imatinib"], tags: &["extended"], }, CaseSpec { id: "search_trial_melanoma_limit_5", kind: BenchmarkCaseKind::Success, args: &["search", "trial", "-c", "melanoma", "--limit", "5"], tags: &["extended"], }, CaseSpec { id: "get_pgx_cyp2d6", kind: BenchmarkCaseKind::Success, args: &["get", "pgx", "CYP2D6"], tags: &["extended"], }, CaseSpec { id: "search_variant_egfr_limit_5", kind: BenchmarkCaseKind::Success, args: &["search", "variant", "-g", "EGFR", "--limit", "5"], tags: &["extended"], }, CaseSpec { id: "get_pathway_r_hsa_5673001", kind: BenchmarkCaseKind::Success, args: &["get", "pathway", "R-HSA-5673001"], tags: &["extended"], }, CaseSpec { id: "get_disease_mondo_0005105", kind: BenchmarkCaseKind::Success, args: &["get", "disease", "MONDO:0005105"], tags: &["extended"], }, CaseSpec { id: "contract_invalid_article_since_2024_13_01", kind: BenchmarkCaseKind::ContractFailure, args: &[ "search", "article", "-g", "BRAF", "--since", "2024-13-01", "--limit", "1", ], tags: &["contract", "contract_core"], }, CaseSpec { id: "contract_invalid_trial_since_2024_02_30", kind: BenchmarkCaseKind::ContractFailure, args: &[ 
"search", "trial", "-c", "melanoma", "--since", "2024-02-30", "--limit", "1", ], tags: &["contract"], }, ]; pub async fn run_benchmark(opts: RunOptions, json_output: bool) -> anyhow::Result<String> { let mode = if opts.quick { BenchmarkMode::Quick } else { BenchmarkMode::Full }; let iterations = opts.iterations.unwrap_or_else(|| default_iterations(mode)); let timeout_ms = default_timeout(mode); let mut report = collect_report(mode, iterations, timeout_ms, opts.max_fail_fast_ms, None).await?; let baseline_path = if let Some(explicit) = opts.baseline.as_ref() { Some(explicit.clone()) } else { discover_latest_baseline_path() }; if let Some(path) = baseline_path { if path.exists() { let baseline = load_baseline(&path)?; compare_against_baseline( &mut report, &baseline, RegressionThresholds { latency_pct: opts.latency_threshold_pct, size_pct: opts.size_threshold_pct, max_fail_fast_ms: opts.max_fail_fast_ms, }, ); report.baseline_path = Some(path.display().to_string()); } } report.summary = build_summary(&report); let rendered = if json_output { crate::render::json::to_pretty(&report)? 
} else { render_human_report(&report) }; if opts.fail_on_regression && !report.regressions.is_empty() { return Err(anyhow!(format!( "benchmark regressions detected ({}).\n{}", report.regressions.len(), rendered ))); } if opts.fail_on_transient && !report.transient_failures.is_empty() { return Err(anyhow!(format!( "transient benchmark failures detected ({}).\n{}", report.transient_failures.len(), rendered ))); } Ok(rendered) } pub async fn save_baseline(opts: SaveBaselineOptions, json_output: bool) -> anyhow::Result<String> { let mode = if opts.quick { BenchmarkMode::Quick } else { BenchmarkMode::Full }; let iterations = opts.iterations.unwrap_or_else(|| default_iterations(mode)); let timeout_ms = default_timeout(mode); let report = collect_report(mode, iterations, timeout_ms, DEFAULT_MAX_FAIL_FAST_MS, None).await?; let output_path = opts.output.unwrap_or_else(default_baseline_path); if let Some(parent) = output_path.parent() { fs::create_dir_all(parent).with_context(|| { format!( "failed to create baseline directory {}", parent.to_string_lossy() ) })?; } let mut serialized = crate::render::json::to_pretty(&report)?; serialized.push('\n'); fs::write(&output_path, serialized).with_context(|| { format!( "failed to write baseline file {}", output_path.to_string_lossy() ) })?; if json_output { #[derive(serde::Serialize)] struct SaveBaselineResponse { path: String, report: BenchmarkRunReport, } return Ok(crate::render::json::to_pretty(&SaveBaselineResponse { path: output_path.display().to_string(), report, })?); } Ok(format!( "Saved benchmark baseline: {}\ncases: {} | ok: {} | failed: {} | transient: {}", output_path.display(), report.summary.total_cases, report.summary.ok_cases, report.summary.failed_cases, report.summary.transient_failures, )) } fn build_summary(report: &BenchmarkRunReport) -> BenchmarkSummary { let total_cases = report.commands.len(); let ok_cases = report .commands .iter() .filter(|case| case.status == BenchmarkCaseStatus::Ok) .count(); let 
transient_failures = report .commands .iter() .filter(|case| case.status == BenchmarkCaseStatus::TransientFailure) .count(); let failed_cases = total_cases.saturating_sub(ok_cases + transient_failures); BenchmarkSummary { total_cases, ok_cases, failed_cases, transient_failures, regression_count: report.regressions.len(), } } async fn collect_report( mode: BenchmarkMode, iterations: u32, timeout_ms: u64, max_fail_fast_ms: u64, baseline_path: Option<String>, ) -> anyhow::Result<BenchmarkRunReport> { let suite = select_suite(mode); let suite_hash = compute_suite_hash(&suite); let cache_root = create_temp_cache_root()?; let _cache_guard = TempDirGuard::new(cache_root.clone()); let exe = std::env::current_exe().context("failed to resolve biomcp executable path")?; let mut commands = Vec::with_capacity(suite.len()); for case in suite { let case_cache_root = cache_root.join(case.id); let report = match case.kind { BenchmarkCaseKind::Success => { run_success_case(case, iterations, timeout_ms, &exe, &case_cache_root).await? } BenchmarkCaseKind::ContractFailure => { run_contract_case(case, iterations, max_fail_fast_ms, &exe, &case_cache_root) .await? 
} }; commands.push(report); } commands.sort_by(|a, b| a.id.cmp(&b.id)); let mut report = BenchmarkRunReport { schema_version: BENCHMARK_SCHEMA_VERSION, suite_version: SUITE_VERSION.to_string(), suite_hash, cli_version: env!("CARGO_PKG_VERSION").to_string(), generated_at: now_rfc3339()?, environment: BenchmarkEnvironment { os: std::env::consts::OS.to_string(), arch: std::env::consts::ARCH.to_string(), hostname: std::env::var("HOSTNAME").ok(), }, mode, iterations, baseline_path, commands, regressions: Vec::new(), transient_failures: Vec::new(), summary: BenchmarkSummary { total_cases: 0, ok_cases: 0, failed_cases: 0, transient_failures: 0, regression_count: 0, }, }; report.summary = build_summary(&report); Ok(report) } async fn run_success_case( case: CaseSpec, iterations: u32, timeout_ms: u64, exe: &Path, case_cache_root: &Path, ) -> anyhow::Result<BenchmarkCommandReport> { let mut cold_samples = Vec::with_capacity(iterations as usize); let mut warm_samples = Vec::with_capacity(iterations as usize); let mut markdown_bytes = Vec::with_capacity(iterations as usize); let mut json_bytes = Vec::with_capacity(iterations as usize); let mut had_transient_failure = false; let mut had_non_transient_failure = false; let mut stderr_excerpt = None; let mut last_exit_code = None; for _ in 0..iterations { reset_case_cache(case_cache_root)?; let cold = execute_case_command(exe, case.args, false, case_cache_root, timeout_ms).await?; if cold.exit_code == 0 && !cold.timed_out { cold_samples.push(cold.latency_ms); markdown_bytes.push(cold.stdout_bytes); } else { record_failure( &cold, &mut had_transient_failure, &mut had_non_transient_failure, &mut stderr_excerpt, ); } let warm = execute_case_command(exe, case.args, false, case_cache_root, timeout_ms).await?; if warm.exit_code == 0 && !warm.timed_out { warm_samples.push(warm.latency_ms); } else { record_failure( &warm, &mut had_transient_failure, &mut had_non_transient_failure, &mut stderr_excerpt, ); } let json = 
execute_case_command(exe, case.args, true, case_cache_root, timeout_ms).await?; last_exit_code = Some(json.exit_code); if json.exit_code == 0 && !json.timed_out { json_bytes.push(json.stdout_bytes); } else { record_failure( &json, &mut had_transient_failure, &mut had_non_transient_failure, &mut stderr_excerpt, ); } } let status = if had_non_transient_failure { BenchmarkCaseStatus::Failed } else if had_transient_failure { BenchmarkCaseStatus::TransientFailure } else { BenchmarkCaseStatus::Ok }; Ok(BenchmarkCommandReport { id: case.id.to_string(), kind: BenchmarkCaseKind::Success, command: format_command(case.args), tags: case.tags.iter().map(|tag| (*tag).to_string()).collect(), status, iterations, cold_latency_ms: median_f64(&cold_samples), warm_latency_ms: median_f64(&warm_samples), markdown_bytes: median_u64(&markdown_bytes), json_bytes: median_u64(&json_bytes), fail_fast_latency_ms: None, exit_code: last_exit_code, stderr_excerpt, }) } async fn run_contract_case( case: CaseSpec, iterations: u32, max_fail_fast_ms: u64, exe: &Path, case_cache_root: &Path, ) -> anyhow::Result<BenchmarkCommandReport> { let timeout_ms = max_fail_fast_ms.saturating_mul(4).max(3000); let mut latencies = Vec::with_capacity(iterations as usize); let mut exit_codes = Vec::with_capacity(iterations as usize); let mut stderr_excerpt = None; let mut saw_success_exit = false; for _ in 0..iterations { reset_case_cache(case_cache_root)?; let exec = execute_case_command(exe, case.args, false, case_cache_root, timeout_ms).await?; latencies.push(exec.latency_ms); exit_codes.push(exec.exit_code); if exec.exit_code == 0 { saw_success_exit = true; if stderr_excerpt.is_none() { stderr_excerpt = Some(exec.stderr_excerpt.clone()); } } } let fail_fast_latency_ms = median_f64(&latencies); let status = if saw_success_exit { BenchmarkCaseStatus::Failed } else if fail_fast_latency_ms .map(|latency| latency > max_fail_fast_ms as f64) .unwrap_or(true) { BenchmarkCaseStatus::Failed } else { 
BenchmarkCaseStatus::Ok }; let exit_code = median_i32(&exit_codes); Ok(BenchmarkCommandReport { id: case.id.to_string(), kind: BenchmarkCaseKind::ContractFailure, command: format_command(case.args), tags: case.tags.iter().map(|tag| (*tag).to_string()).collect(), status, iterations, cold_latency_ms: None, warm_latency_ms: None, markdown_bytes: None, json_bytes: None, fail_fast_latency_ms, exit_code, stderr_excerpt, }) } fn record_failure( exec: &CommandExecution, had_transient_failure: &mut bool, had_non_transient_failure: &mut bool, stderr_excerpt: &mut Option<String>, ) { if is_transient_failure(exec) { *had_transient_failure = true; } else { *had_non_transient_failure = true; } if stderr_excerpt.is_none() { *stderr_excerpt = Some(exec.stderr_excerpt.clone()); } } async fn execute_case_command( exe: &Path, args: &[&str], as_json: bool, cache_home: &Path, timeout_ms: u64, ) -> anyhow::Result<CommandExecution> { let mut cmd = tokio::process::Command::new(exe); cmd.kill_on_drop(true) .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .env("XDG_CACHE_HOME", cache_home) .args(build_child_args(args, as_json)); let start = tokio::time::Instant::now(); let output = tokio::time::timeout(Duration::from_millis(timeout_ms), cmd.output()).await; match output { Ok(Ok(out)) => { let latency_ms = start.elapsed().as_secs_f64() * 1000.0; let stderr = String::from_utf8_lossy(&out.stderr).to_string(); Ok(CommandExecution { latency_ms, stdout_bytes: out.stdout.len() as u64, stderr_excerpt: trim_excerpt(&stderr), exit_code: out.status.code().unwrap_or(-1), timed_out: false, }) } Ok(Err(err)) => Err(err).context("failed to run benchmark command"), Err(_) => Ok(CommandExecution { latency_ms: timeout_ms as f64, stdout_bytes: 0, stderr_excerpt: format!("timed out after {}ms", timeout_ms), exit_code: -1, timed_out: true, }), } } fn compare_against_baseline( report: &mut BenchmarkRunReport, baseline: &BenchmarkRunReport, thresholds: RegressionThresholds, ) { let 
baseline_by_id = baseline .commands .iter() .map(|command| (command.id.as_str(), command)) .collect::<BTreeMap<_, _>>(); let mut regressions = Vec::new(); let mut transient = Vec::new(); let mut seen = BTreeSet::new(); for command in &report.commands { seen.insert(command.id.clone()); if command.status == BenchmarkCaseStatus::TransientFailure { transient.push(BenchmarkTransientFailure { command_id: command.id.clone(), message: command .stderr_excerpt .clone() .unwrap_or_else(|| "transient upstream failure".to_string()), }); continue; } let Some(base) = baseline_by_id.get(command.id.as_str()) else { continue; }; if base.kind != command.kind { regressions.push(BenchmarkRegression { command_id: command.id.clone(), metric: "kind".to_string(), baseline_value: format!("{:?}", base.kind), current_value: format!("{:?}", command.kind), delta_pct: None, message: "benchmark case kind changed".to_string(), }); continue; } if base.status == BenchmarkCaseStatus::Ok && command.status != BenchmarkCaseStatus::Ok { regressions.push(BenchmarkRegression { command_id: command.id.clone(), metric: "status".to_string(), baseline_value: "ok".to_string(), current_value: format!("{:?}", command.status), delta_pct: None, message: "case no longer succeeds".to_string(), }); } match command.kind { BenchmarkCaseKind::Success => compare_success_case( &mut regressions, base, command, thresholds.latency_pct, thresholds.size_pct, ), BenchmarkCaseKind::ContractFailure => { compare_contract_case(&mut regressions, base, command, thresholds.max_fail_fast_ms) } } } for command in &baseline.commands { if !seen.contains(&command.id) { regressions.push(BenchmarkRegression { command_id: command.id.clone(), metric: "missing_case".to_string(), baseline_value: "present".to_string(), current_value: "missing".to_string(), delta_pct: None, message: "command missing from current benchmark run".to_string(), }); } } regressions.sort_by(|a, b| { a.command_id .cmp(&b.command_id) .then_with(|| a.metric.cmp(&b.metric)) 
}); transient.sort_by(|a, b| a.command_id.cmp(&b.command_id)); report.regressions = regressions; report.transient_failures = transient; } fn compare_success_case( regressions: &mut Vec<BenchmarkRegression>, baseline: &BenchmarkCommandReport, current: &BenchmarkCommandReport, latency_threshold_pct: f64, size_threshold_pct: f64, ) { maybe_push_numeric_regression( regressions, current.id.as_str(), "warm_latency_ms", baseline.warm_latency_ms, current.warm_latency_ms, latency_threshold_pct, "warm latency", ); maybe_push_numeric_regression( regressions, current.id.as_str(), "cold_latency_ms", baseline.cold_latency_ms, current.cold_latency_ms, latency_threshold_pct, "cold latency", ); maybe_push_numeric_regression( regressions, current.id.as_str(), "markdown_bytes", baseline.markdown_bytes.map(|v| v as f64), current.markdown_bytes.map(|v| v as f64), size_threshold_pct, "markdown output size", ); maybe_push_numeric_regression( regressions, current.id.as_str(), "json_bytes", baseline.json_bytes.map(|v| v as f64), current.json_bytes.map(|v| v as f64), size_threshold_pct, "json output size", ); if let (Some(base_exit), Some(cur_exit)) = (baseline.exit_code, current.exit_code) && base_exit != cur_exit { regressions.push(BenchmarkRegression { command_id: current.id.clone(), metric: "exit_code".to_string(), baseline_value: base_exit.to_string(), current_value: cur_exit.to_string(), delta_pct: None, message: "exit code changed".to_string(), }); } } fn compare_contract_case( regressions: &mut Vec<BenchmarkRegression>, baseline: &BenchmarkCommandReport, current: &BenchmarkCommandReport, max_fail_fast_ms: u64, ) { let baseline_exit = baseline.exit_code.unwrap_or(1); let current_exit = current.exit_code.unwrap_or(0); if baseline_exit != 0 && current_exit == 0 { regressions.push(BenchmarkRegression { command_id: current.id.clone(), metric: "invalid_date_exit_code".to_string(), baseline_value: baseline_exit.to_string(), current_value: current_exit.to_string(), delta_pct: None, message: 
"invalid date case no longer fails".to_string(), }); } if let Some(latency) = current.fail_fast_latency_ms && latency > max_fail_fast_ms as f64 { regressions.push(BenchmarkRegression { command_id: current.id.clone(), metric: "fail_fast_latency_ms".to_string(), baseline_value: baseline .fail_fast_latency_ms .map(|value| format_float(value)) .unwrap_or_else(|| "n/a".to_string()), current_value: format_float(latency), delta_pct: None, message: format!("fail-fast latency exceeds {}ms limit", max_fail_fast_ms), }); } } fn maybe_push_numeric_regression( regressions: &mut Vec<BenchmarkRegression>, command_id: &str, metric: &str, baseline: Option<f64>, current: Option<f64>, threshold_pct: f64, label: &str, ) { let (Some(base), Some(cur)) = (baseline, current) else { return; }; if base <= 0.0 { if cur > 0.0 { regressions.push(BenchmarkRegression { command_id: command_id.to_string(), metric: metric.to_string(), baseline_value: format_float(base), current_value: format_float(cur), delta_pct: None, message: format!("{label} changed from zero baseline"), }); } return; } let delta_pct = ((cur - base) / base) * 100.0; if delta_pct > threshold_pct { regressions.push(BenchmarkRegression { command_id: command_id.to_string(), metric: metric.to_string(), baseline_value: format_float(base), current_value: format_float(cur), delta_pct: Some(delta_pct), message: format!( "{label} increased by {:.2}% (threshold {:.2}%)", delta_pct, threshold_pct ), }); } } fn is_transient_failure(exec: &CommandExecution) -> bool { if exec.timed_out { return true; } let msg = exec.stderr_excerpt.to_ascii_lowercase(); msg.contains("timed out") || msg.contains("timeout") || msg.contains("temporary") || msg.contains("connection") || msg.contains("dns") || msg.contains("http 429") || msg.contains("http 502") || msg.contains("http 503") || msg.contains("http 504") } fn render_human_report(report: &BenchmarkRunReport) -> String { let mut out = String::new(); out.push_str("# BioMCP Benchmark Report\n\n"); 
out.push_str(&format!( "- Mode: {}\n- Iterations: {}\n- Suite version: {}\n- Suite hash: {}\n- Generated: {}\n", mode_label(report.mode), report.iterations, report.suite_version, report.suite_hash, report.generated_at, )); if let Some(path) = &report.baseline_path { out.push_str(&format!("- Baseline: {}\n", path)); } out.push_str(&format!( "- Summary: total={} ok={} failed={} transient={} regressions={}\n", report.summary.total_cases, report.summary.ok_cases, report.summary.failed_cases, report.summary.transient_failures, report.summary.regression_count, )); out.push_str("\n## Command Metrics\n\n"); out.push_str( "| id | kind | status | cold_ms | warm_ms | md_bytes | json_bytes | fail_fast_ms |\n", ); out.push_str("|---|---|---|---:|---:|---:|---:|---:|\n"); for case in &report.commands { out.push_str(&format!( "| {} | {} | {} | {} | {} | {} | {} | {} |\n", case.id, kind_label(case.kind), status_label(case.status), fmt_opt_f64(case.cold_latency_ms), fmt_opt_f64(case.warm_latency_ms), fmt_opt_u64(case.markdown_bytes), fmt_opt_u64(case.json_bytes), fmt_opt_f64(case.fail_fast_latency_ms), )); } if !report.regressions.is_empty() { out.push_str("\n## Regressions\n\n"); out.push_str("| command_id | metric | baseline | current | delta_pct | message |\n"); out.push_str("|---|---|---:|---:|---:|---|\n"); for regression in &report.regressions { out.push_str(&format!( "| {} | {} | {} | {} | {} | {} |\n", regression.command_id, regression.metric, regression.baseline_value, regression.current_value, regression .delta_pct .map(format_float) .unwrap_or_else(|| "n/a".to_string()), regression.message, )); } } if !report.transient_failures.is_empty() { out.push_str("\n## Transient Failures\n\n"); for failure in &report.transient_failures { out.push_str(&format!("- {}: {}\n", failure.command_id, failure.message)); } } out } fn mode_label(mode: BenchmarkMode) -> &'static str { match mode { BenchmarkMode::Full => "full", BenchmarkMode::Quick => "quick", } } fn kind_label(kind: 
BenchmarkCaseKind) -> &'static str { match kind { BenchmarkCaseKind::Success => "success", BenchmarkCaseKind::ContractFailure => "contract_failure", } } fn status_label(status: BenchmarkCaseStatus) -> &'static str { match status { BenchmarkCaseStatus::Ok => "ok", BenchmarkCaseStatus::Failed => "failed", BenchmarkCaseStatus::TransientFailure => "transient_failure", } } fn trim_excerpt(text: &str) -> String { let compact = text.split_whitespace().collect::<Vec<_>>().join(" "); if compact.len() <= 240 { compact } else { format!("{}...", &compact[..240]) } } fn build_child_args(args: &[&str], as_json: bool) -> Vec<OsString> { let mut full = Vec::with_capacity(args.len() + usize::from(as_json)); if as_json { full.push(OsString::from("--json")); } for arg in args { full.push(OsString::from(arg)); } full } fn create_temp_cache_root() -> anyhow::Result<PathBuf> { let now_ms = SystemTime::now() .duration_since(UNIX_EPOCH) .context("failed to build benchmark temp cache timestamp")? .as_millis(); let pid = std::process::id(); let root = std::env::temp_dir().join(format!("biomcp-benchmark-{}-{}", pid, now_ms)); fs::create_dir_all(&root).with_context(|| { format!( "failed to create benchmark cache root {}", root.to_string_lossy() ) })?; Ok(root) } fn reset_case_cache(path: &Path) -> anyhow::Result<()> { if path.exists() { fs::remove_dir_all(path).with_context(|| { format!("failed to clear benchmark cache {}", path.to_string_lossy()) })?; } fs::create_dir_all(path) .with_context(|| format!("failed to create cache {}", path.to_string_lossy())) } fn now_rfc3339() -> anyhow::Result<String> { Ok(OffsetDateTime::now_utc() .format(&Rfc3339) .context("failed to format benchmark timestamp")?) 
} fn default_timeout(mode: BenchmarkMode) -> u64 { match mode { BenchmarkMode::Full => DEFAULT_FULL_TIMEOUT_MS, BenchmarkMode::Quick => DEFAULT_QUICK_TIMEOUT_MS, } } fn default_iterations(mode: BenchmarkMode) -> u32 { match mode { BenchmarkMode::Full => DEFAULT_FULL_ITERATIONS, BenchmarkMode::Quick => DEFAULT_QUICK_ITERATIONS, } } fn default_baseline_path() -> PathBuf { PathBuf::from("benchmarks").join(format!("v{}.json", env!("CARGO_PKG_VERSION"))) } fn discover_latest_baseline_path() -> Option<PathBuf> { let dir = Path::new("benchmarks"); let entries = fs::read_dir(dir).ok()?; let mut candidates = Vec::new(); for entry in entries.flatten() { let path = entry.path(); let name = path.file_name()?.to_str()?; if !name.starts_with('v') || !name.ends_with(".json") { continue; } let version_text = name .strip_prefix('v') .and_then(|v| v.strip_suffix(".json"))?; let version = Version::parse(version_text).ok()?; candidates.push((version, path)); } candidates.sort_by(|a, b| a.0.cmp(&b.0)); candidates.pop().map(|(_, path)| path) } fn load_baseline(path: &Path) -> anyhow::Result<BenchmarkRunReport> { let text = fs::read_to_string(path) .with_context(|| format!("failed to read baseline file {}", path.to_string_lossy()))?; let report = serde_json::from_str::<BenchmarkRunReport>(&text) .with_context(|| format!("failed to parse baseline file {}", path.to_string_lossy()))?; Ok(report) } fn select_suite(mode: BenchmarkMode) -> Vec<CaseSpec> { match mode { BenchmarkMode::Full => FULL_SUITE.to_vec(), BenchmarkMode::Quick => FULL_SUITE .iter() .copied() .filter(|case| case.tags.contains(&"core") || case.tags.contains(&"contract_core")) .collect(), } } fn compute_suite_hash(cases: &[CaseSpec]) -> String { let mut hasher = Sha256::new(); for case in cases { hasher.update(case.id.as_bytes()); hasher.update(b"\0"); for arg in case.args { hasher.update(arg.as_bytes()); hasher.update(b"\0"); } hasher.update(b"\n"); } let digest = hasher.finalize(); let mut hex = 
String::with_capacity(digest.len() * 2); for byte in digest { use std::fmt::Write as _; let _ = write!(&mut hex, "{byte:02x}"); } hex } fn format_command(args: &[&str]) -> String { let mut command = String::from("biomcp"); for arg in args { command.push(' '); if arg.contains(' ') { command.push('"'); command.push_str(arg); command.push('"'); } else { command.push_str(arg); } } command } fn median_f64(samples: &[f64]) -> Option<f64> { if samples.is_empty() { return None; } let mut sorted = samples.to_vec(); sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); let mid = sorted.len() / 2; if sorted.len() % 2 == 0 { Some((sorted[mid - 1] + sorted[mid]) / 2.0) } else { Some(sorted[mid]) } } fn median_u64(samples: &[u64]) -> Option<u64> { if samples.is_empty() { return None; } let mut sorted = samples.to_vec(); sorted.sort_unstable(); Some(sorted[sorted.len() / 2]) } fn median_i32(samples: &[i32]) -> Option<i32> { if samples.is_empty() { return None; } let mut sorted = samples.to_vec(); sorted.sort_unstable(); Some(sorted[sorted.len() / 2]) } fn fmt_opt_f64(value: Option<f64>) -> String { value.map(format_float).unwrap_or_else(|| "n/a".to_string()) } fn fmt_opt_u64(value: Option<u64>) -> String { value .map(|v| v.to_string()) .unwrap_or_else(|| "n/a".to_string()) } fn format_float(value: f64) -> String { format!("{value:.2}") } struct TempDirGuard { path: PathBuf, } impl TempDirGuard { fn new(path: PathBuf) -> Self { Self { path } } } impl Drop for TempDirGuard { fn drop(&mut self) { let _ = fs::remove_dir_all(&self.path); } } #[cfg(test)] mod tests { use super::*; fn success_case( id: &str, warm_ms: f64, cold_ms: f64, md_bytes: u64, json_bytes: u64, ) -> BenchmarkCommandReport { BenchmarkCommandReport { id: id.to_string(), kind: BenchmarkCaseKind::Success, command: "biomcp get gene BRAF".to_string(), tags: vec!["core".to_string()], status: BenchmarkCaseStatus::Ok, iterations: 3, cold_latency_ms: Some(cold_ms), warm_latency_ms: Some(warm_ms), 
markdown_bytes: Some(md_bytes), json_bytes: Some(json_bytes), fail_fast_latency_ms: None, exit_code: Some(0), stderr_excerpt: None, } } fn contract_case(id: &str, latency_ms: f64, exit_code: i32) -> BenchmarkCommandReport { BenchmarkCommandReport { id: id.to_string(), kind: BenchmarkCaseKind::ContractFailure, command: "biomcp search article -g BRAF --since 2024-13-01 --limit 1".to_string(), tags: vec!["contract".to_string()], status: BenchmarkCaseStatus::Ok, iterations: 3, cold_latency_ms: None, warm_latency_ms: None, markdown_bytes: None, json_bytes: None, fail_fast_latency_ms: Some(latency_ms), exit_code: Some(exit_code), stderr_excerpt: None, } } fn report(commands: Vec<BenchmarkCommandReport>) -> BenchmarkRunReport { BenchmarkRunReport { schema_version: BENCHMARK_SCHEMA_VERSION, suite_version: SUITE_VERSION.to_string(), suite_hash: "abc".to_string(), cli_version: "0.3.0".to_string(), generated_at: "2026-02-17T00:00:00Z".to_string(), environment: BenchmarkEnvironment { os: "linux".to_string(), arch: "x86_64".to_string(), hostname: None, }, mode: BenchmarkMode::Full, iterations: 3, baseline_path: None, commands, regressions: Vec::new(), transient_failures: Vec::new(), summary: BenchmarkSummary { total_cases: 0, ok_cases: 0, failed_cases: 0, transient_failures: 0, regression_count: 0, }, } } #[test] fn detects_latency_and_size_regressions_above_threshold() { let baseline = report(vec![success_case("case", 100.0, 120.0, 1000, 1500)]); let mut current = report(vec![success_case("case", 130.0, 160.0, 1200, 1700)]); compare_against_baseline( &mut current, &baseline, RegressionThresholds { latency_pct: 20.0, size_pct: 10.0, max_fail_fast_ms: 1500, }, ); let metrics = current .regressions .iter() .map(|r| r.metric.clone()) .collect::<BTreeSet<_>>(); assert!(metrics.contains("warm_latency_ms")); assert!(metrics.contains("cold_latency_ms")); assert!(metrics.contains("markdown_bytes")); assert!(metrics.contains("json_bytes")); } #[test] fn 
ignores_changes_below_thresholds() { let baseline = report(vec![success_case("case", 100.0, 120.0, 1000, 1500)]); let mut current = report(vec![success_case("case", 118.0, 140.0, 1080, 1600)]); compare_against_baseline( &mut current, &baseline, RegressionThresholds { latency_pct: 20.0, size_pct: 10.0, max_fail_fast_ms: 1500, }, ); assert!(current.regressions.is_empty()); } #[test] fn flags_invalid_date_contract_that_starts_succeeding() { let baseline = report(vec![contract_case("contract", 300.0, 1)]); let mut current = report(vec![contract_case("contract", 350.0, 0)]); compare_against_baseline( &mut current, &baseline, RegressionThresholds { latency_pct: 20.0, size_pct: 10.0, max_fail_fast_ms: 1500, }, ); assert!( current .regressions .iter() .any(|regression| regression.metric == "invalid_date_exit_code") ); } #[test] fn flags_fail_fast_latency_over_limit() { let baseline = report(vec![contract_case("contract", 300.0, 1)]); let mut current = report(vec![contract_case("contract", 2000.0, 1)]); compare_against_baseline( &mut current, &baseline, RegressionThresholds { latency_pct: 20.0, size_pct: 10.0, max_fail_fast_ms: 1500, }, ); assert!( current .regressions .iter() .any(|regression| regression.metric == "fail_fast_latency_ms") ); } #[test] fn quick_suite_keeps_core_and_one_contract_case() { let quick = select_suite(BenchmarkMode::Quick); let contract_count = quick .iter() .filter(|case| case.kind == BenchmarkCaseKind::ContractFailure) .count(); assert!(quick.len() >= 4); assert_eq!(contract_count, 1); assert!( quick .iter() .all(|case| case.tags.contains(&"core") || case.tags.contains(&"contract_core")) ); } #[test] fn baseline_discovery_picks_highest_semver() { let root = std::env::temp_dir().join(format!( "biomcp-benchmark-discovery-{}", SystemTime::now() .duration_since(UNIX_EPOCH) .expect("time") .as_nanos() )); let benchmarks_dir = root.join("benchmarks"); fs::create_dir_all(&benchmarks_dir).expect("mkdir"); fs::write(benchmarks_dir.join("v0.1.0.json"), 
"{}").expect("write"); fs::write(benchmarks_dir.join("v0.3.0.json"), "{}").expect("write"); fs::write(benchmarks_dir.join("v0.2.5.json"), "{}").expect("write"); let cwd = std::env::current_dir().expect("cwd"); std::env::set_current_dir(&root).expect("set cwd"); let selected = discover_latest_baseline_path(); std::env::set_current_dir(cwd).expect("restore cwd"); fs::remove_dir_all(&root).expect("cleanup"); let selected_name = selected .as_ref() .and_then(|path| path.file_name()) .and_then(|name| name.to_str()); assert_eq!(selected_name, Some("v0.3.0.json")); } }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/genomoncology/biomcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

run.rs•40.6 KiB