compare_windows
Detect drift or degradation by comparing baseline and recent episode arrays. Returns trend, delta metrics, and severity factors for model health tracking.
Instructions
Compare two windows of episodes to detect drift or degradation — no API call, no auth, no credits. Takes baseline and recent episode arrays from evaluate or fleet_session_round responses. Response: { comparison: { baseline: stats, recent: stats }, delta: { ci_mean, ema_mean, al_mean, ghost_delta, warn_delta, fault_delta }, trend: 'improving'|'stable'|'degrading', degraded: bool, severity_factors: [...] }. Use after multiple evaluate calls to track model health over time.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| baseline | Yes | Baseline episode array (e.g. last hour, known-good run) | |
| recent | Yes | Recent episode array to compare against baseline | |
Implementation Reference
- src/index.ts:948-1042 (registration) Registration of the compare_windows tool via a server.tool(...) call. Lines 948-1042 contain the full registration, including the tool name, description, schema definition, and handler.
// Registers the `compare_windows` tool: a purely local comparison of two
// episode windows (no outbound calls are made in this handler). The handler
// returns a single text content item whose body is pretty-printed JSON with
// per-window stats, deltas, a trend label, a `degraded` flag and a list of
// human-readable severity factors.
server.tool(
  "compare_windows",
  "Compare two windows of episodes to detect drift or degradation — no API call, no auth, no credits. Takes baseline and recent episode arrays from evaluate or fleet_session_round responses. Response: { comparison: { baseline: stats, recent: stats }, delta: { ci_mean, ema_mean, al_mean, ghost_delta, warn_delta, fault_delta }, trend: 'improving'|'stable'|'degrading', degraded: bool, severity_factors: [...] }. Use after multiple evaluate calls to track model health over time.",
  {
    baseline: z
      .array(z.record(z.string(), z.unknown()))
      .min(1)
      .describe("Baseline episode array (e.g. last hour, known-good run)"),
    recent: z
      .array(z.record(z.string(), z.unknown()))
      .min(1)
      .describe("Recent episode array to compare against baseline"),
  },
  async ({ baseline, recent }) => {
    // Divisor applied to ci_out and ci_ema_out before they are averaged.
    // NOTE(review): presumably the raw values are 16-bit fixed point (0..65535) — confirm.
    const Q = 65535;

    // Episode fields are validated only as `unknown`, so coerce defensively:
    // finite numbers and numeric strings are accepted, everything else counts
    // as 0. A bare cast would let a string reach `+` inside `mean` and
    // concatenate instead of add.
    const toNumber = (value: unknown): number => {
      const candidate =
        typeof value === "string" && value.trim() !== "" ? Number(value) : value;
      return typeof candidate === "number" && Number.isFinite(candidate) ? candidate : 0;
    };

    const mean = (values: number[]): number =>
      values.reduce((sum, v) => sum + v, 0) / values.length;
    // Reduce instead of Math.max(...values): spreading a very large window
    // into call arguments can exceed the engine's argument limit.
    const maxOf = (values: number[]): number =>
      values.reduce((acc, v) => Math.max(acc, v), -Infinity);
    const minOf = (values: number[]): number =>
      values.reduce((acc, v) => Math.min(acc, v), Infinity);

    // Summarises one window: normalised CI / EMA, raw AL, and flag counts.
    function windowStats(eps: ReadonlyArray<Record<string, unknown>>) {
      const cis = eps.map((e) => toNumber(e.ci_out) / Q);
      const emas = eps.map((e) => toNumber(e.ci_ema_out) / Q);
      const als = eps.map((e) => toNumber(e.al_out));
      // Flags are counted by truthiness, as absolute counts per window.
      const ghosts = eps.filter((e) => e.ghost_confirmed).length;
      const warns = eps.filter((e) => e.warn).length;
      const faults = eps.filter((e) => e.fault).length;
      return {
        episodes: eps.length,
        ci_mean: mean(cis),
        ci_max: maxOf(cis),
        ci_min: minOf(cis),
        ema_mean: mean(emas),
        al_mean: mean(als),
        al_max: maxOf(als),
        ghosts,
        warns,
        faults,
      };
    }

    // Maps a normalised CI value onto a named band (upper bounds inclusive).
    function classify(ci: number): string {
      if (ci <= 0.15) return "Stable";
      if (ci <= 0.45) return "Drift";
      if (ci <= 0.70) return "Flip";
      return "Collapse";
    }

    const base = windowStats(baseline);
    const curr = windowStats(recent);
    const baseClass = classify(base.ci_mean);
    const currClass = classify(curr.ci_mean);

    const ciDelta = curr.ci_mean - base.ci_mean;
    const emaDelta = curr.ema_mean - base.ema_mean;
    const alDelta = curr.al_mean - base.al_mean;

    // Trend direction: a shift of more than 0.03 in normalised mean CI either way.
    let trend: "improving" | "stable" | "degrading";
    if (ciDelta < -0.03) trend = "improving";
    else if (ciDelta > 0.03) trend = "degrading";
    else trend = "stable";

    // Severity assessment. Ghost/fault comparisons use absolute counts, so
    // windows of different lengths can trigger them by size alone.
    const severityFactors: string[] = [];
    if (ciDelta > 0.15) {
      severityFactors.push(`CI jumped significantly (+${(ciDelta * 100).toFixed(1)}%)`);
    }
    if (curr.ghosts > base.ghosts) {
      severityFactors.push(`Ghost count increased (${base.ghosts} → ${curr.ghosts})`);
    }
    if (curr.faults > base.faults) {
      severityFactors.push(`Fault count increased (${base.faults} → ${curr.faults})`);
    }
    if (curr.al_max > base.al_max) {
      severityFactors.push(`Max authority level rose (AL${base.al_max} → AL${curr.al_max})`);
    }
    if (currClass !== baseClass) {
      severityFactors.push(`Classification changed: ${baseClass} → ${currClass}`);
    }

    return {
      content: [
        {
          type: "text" as const,
          text: JSON.stringify(
            {
              comparison: {
                baseline: {
                  ...base,
                  ci_mean_pct: +(base.ci_mean * 100).toFixed(2),
                  classification: baseClass,
                },
                recent: {
                  ...curr,
                  ci_mean_pct: +(curr.ci_mean * 100).toFixed(2),
                  classification: currClass,
                },
              },
              // ci_mean / ema_mean deltas are scaled by 100; al_mean is the raw difference.
              delta: {
                ci_mean: +(ciDelta * 100).toFixed(2),
                ema_mean: +(emaDelta * 100).toFixed(2),
                al_mean: +alDelta.toFixed(2),
                ghost_delta: curr.ghosts - base.ghosts,
                warn_delta: curr.warns - base.warns,
                fault_delta: curr.faults - base.faults,
              },
              trend,
              degraded: trend === "degrading",
              severity_factors: severityFactors.length
                ? severityFactors
                : ["No significant changes detected"],
            },
            null,
            2
          ),
        },
      ],
    };
  }
);
- src/index.ts:955-1042 (handler)The async handler function for compare_windows. It takes baseline and recent episode arrays, computes per-window statistics (ci_mean, ema_mean, al_mean, ghosts, warns, faults), calculates deltas, determines trend direction (improving/stable/degrading), and assesses severity factors.
// Handler excerpt for compare_windows: summarises both windows, derives
// deltas and a trend label, and returns the result as pretty-printed JSON
// inside a single text content item.
async ({ baseline, recent }) => {
  // Divisor applied to ci_out and ci_ema_out before they are averaged.
  // NOTE(review): presumably the raw values are 16-bit fixed point (0..65535) — confirm.
  const Q = 65535;

  // Episode fields are validated only as `unknown`, so coerce defensively:
  // finite numbers and numeric strings are accepted, everything else counts
  // as 0. A bare cast would let a string reach `+` inside `mean` and
  // concatenate instead of add.
  const toNumber = (value: unknown): number => {
    const candidate =
      typeof value === "string" && value.trim() !== "" ? Number(value) : value;
    return typeof candidate === "number" && Number.isFinite(candidate) ? candidate : 0;
  };

  const mean = (values: number[]): number =>
    values.reduce((sum, v) => sum + v, 0) / values.length;
  // Reduce instead of Math.max(...values): spreading a very large window
  // into call arguments can exceed the engine's argument limit.
  const maxOf = (values: number[]): number =>
    values.reduce((acc, v) => Math.max(acc, v), -Infinity);
  const minOf = (values: number[]): number =>
    values.reduce((acc, v) => Math.min(acc, v), Infinity);

  // Summarises one window: normalised CI / EMA, raw AL, and flag counts.
  function windowStats(eps: ReadonlyArray<Record<string, unknown>>) {
    const cis = eps.map((e) => toNumber(e.ci_out) / Q);
    const emas = eps.map((e) => toNumber(e.ci_ema_out) / Q);
    const als = eps.map((e) => toNumber(e.al_out));
    // Flags are counted by truthiness, as absolute counts per window.
    const ghosts = eps.filter((e) => e.ghost_confirmed).length;
    const warns = eps.filter((e) => e.warn).length;
    const faults = eps.filter((e) => e.fault).length;
    return {
      episodes: eps.length,
      ci_mean: mean(cis),
      ci_max: maxOf(cis),
      ci_min: minOf(cis),
      ema_mean: mean(emas),
      al_mean: mean(als),
      al_max: maxOf(als),
      ghosts,
      warns,
      faults,
    };
  }

  // Maps a normalised CI value onto a named band (upper bounds inclusive).
  function classify(ci: number): string {
    if (ci <= 0.15) return "Stable";
    if (ci <= 0.45) return "Drift";
    if (ci <= 0.70) return "Flip";
    return "Collapse";
  }

  const base = windowStats(baseline);
  const curr = windowStats(recent);
  const baseClass = classify(base.ci_mean);
  const currClass = classify(curr.ci_mean);

  const ciDelta = curr.ci_mean - base.ci_mean;
  const emaDelta = curr.ema_mean - base.ema_mean;
  const alDelta = curr.al_mean - base.al_mean;

  // Trend direction: a shift of more than 0.03 in normalised mean CI either way.
  let trend: "improving" | "stable" | "degrading";
  if (ciDelta < -0.03) trend = "improving";
  else if (ciDelta > 0.03) trend = "degrading";
  else trend = "stable";

  // Severity assessment. Ghost/fault comparisons use absolute counts, so
  // windows of different lengths can trigger them by size alone.
  const severityFactors: string[] = [];
  if (ciDelta > 0.15) {
    severityFactors.push(`CI jumped significantly (+${(ciDelta * 100).toFixed(1)}%)`);
  }
  if (curr.ghosts > base.ghosts) {
    severityFactors.push(`Ghost count increased (${base.ghosts} → ${curr.ghosts})`);
  }
  if (curr.faults > base.faults) {
    severityFactors.push(`Fault count increased (${base.faults} → ${curr.faults})`);
  }
  if (curr.al_max > base.al_max) {
    severityFactors.push(`Max authority level rose (AL${base.al_max} → AL${curr.al_max})`);
  }
  if (currClass !== baseClass) {
    severityFactors.push(`Classification changed: ${baseClass} → ${currClass}`);
  }

  return {
    content: [
      {
        type: "text" as const,
        text: JSON.stringify(
          {
            comparison: {
              baseline: {
                ...base,
                ci_mean_pct: +(base.ci_mean * 100).toFixed(2),
                classification: baseClass,
              },
              recent: {
                ...curr,
                ci_mean_pct: +(curr.ci_mean * 100).toFixed(2),
                classification: currClass,
              },
            },
            // ci_mean / ema_mean deltas are scaled by 100; al_mean is the raw difference.
            delta: {
              ci_mean: +(ciDelta * 100).toFixed(2),
              ema_mean: +(emaDelta * 100).toFixed(2),
              al_mean: +alDelta.toFixed(2),
              ghost_delta: curr.ghosts - base.ghosts,
              warn_delta: curr.warns - base.warns,
              fault_delta: curr.faults - base.faults,
            },
            trend,
            degraded: trend === "degrading",
            severity_factors: severityFactors.length
              ? severityFactors
              : ["No significant changes detected"],
          },
          null,
          2
        ),
      },
    ],
  };
}
);
- src/index.ts:951-954 (schema)Input schema for compare_windows: baseline (array of episode objects) and recent (array of episode objects), both required with min 1 item.
// Input schema excerpt for compare_windows. `baseline` and `recent` are each
// an array with at least one element (`.min(1)`); every element is a
// string-keyed record whose values are left as `unknown`, so individual
// episode fields are not validated at this layer.
{ baseline: z.array(z.record(z.string(), z.unknown())).min(1).describe("Baseline episode array (e.g. last hour, known-good run)"), recent: z.array(z.record(z.string(), z.unknown())).min(1).describe("Recent episode array to compare against baseline"), },