detect_anomalies
Scan one or all monitored services for abnormal behavior using z-score analysis and log error-rate spike detection, and return the anomalies ranked by severity.
Instructions
Scan one or all monitored services for abnormal behavior and return the findings ranked by severity. When to use: the entry point for 'is anything wrong anywhere?' triage. Once a service is flagged, follow up with get_service_health for the verdict or query_metrics/query_logs for the raw evidence. Behavior: read-only, no side effects. Applies z-score analysis to metrics, detects log error-rate spikes, and correlates the two. Returns a list of anomalies, each with the affected service, metric/signal, severity, the deviation (e.g. σ and % change), and a short explanation. No anomalies yields an empty list, not an error. Related: get_service_health (single-service verdict), query_metrics (raw series behind a flagged metric).
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| service | No | Optional. Restrict the scan to one service (exact, case-sensitive name from `list_services`). Default: scan every monitored service. | |
| duration | No | Optional. Look-back window analyzed for anomalies, written as <number><unit> with unit s|m|h|d (e.g. '5m', '15m', '1h'). Default: '10m'. | |
| sensitivity | No | Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'. | |
Implementation Reference
- Main handler for the detect_anomalies tool. Iterates over services, queries metrics for KEY_METRICS (cpu, memory, error_rate, latency_p99, request_rate) via connectors, runs z-score anomaly detection using detectAnomaly(), checks logs for error spikes and critical patterns, performs cross-signal correlation, and ranks root causes via rankRootCause(). Returns anomalies, correlations, root cause, and a summary.
/**
 * Handler for the `detect_anomalies` tool.
 *
 * Scans `args.service`, or every service listed by the metrics connectors, and
 * for each service:
 *  - queries every metric in `KEY_METRICS` from each metrics connector and runs
 *    `detectAnomaly()` on the returned series;
 *  - fetches up to 500 log entries from each log connector and flags critical
 *    log patterns and error-rate spikes.
 * It then derives cross-signal correlations and a root-cause ranking.
 *
 * @param registry Source of the "metrics" and "logs" connectors.
 * @param args.service Restrict the scan to this service; omitted = all services.
 * @param args.duration Look-back window passed to the connectors (default "10m").
 * @param args.sensitivity Key into `SENSITIVITY_THRESHOLDS` (default "medium";
 *   unknown keys fall back to a threshold of 2.0).
 * @param _ctx Request context; not used by this handler.
 * @returns MCP text content whose text is pretty-printed JSON with
 *   `scannedServices`, `anomalies` (ordered high → medium → low severity),
 *   `correlations`, `rootCause` and `summary`.
 */
export async function detectAnomaliesHandler(
  registry: ConnectorRegistry,
  args: { service?: string; duration?: string; sensitivity?: string },
  _ctx: RequestContext = defaultContext()
) {
  // `||` rather than `??` on purpose: an empty string also selects the default.
  const duration = args.duration || "10m";
  const threshold = SENSITIVITY_THRESHOLDS[args.sensitivity || "medium"] || 2.0;

  const metricsConnectors = registry.getBySignal("metrics");
  const logConnectors = registry.getBySignal("logs");

  // Discover services to scan. A Set de-duplicates names reported by several
  // connectors while preserving first-seen order.
  const discovered = new Set<string>();
  if (args.service) {
    discovered.add(args.service);
  } else {
    for (const connector of metricsConnectors) {
      const services = await connector.listServices();
      for (const s of services) discovered.add(s.name);
    }
  }
  const serviceNames = [...discovered];

  const allAnomalies: AnomalyReport[] = [];
  const allCorrelations: string[] = [];

  for (const serviceName of serviceNames) {
    // Check metrics
    for (const connector of metricsConnectors) {
      if (!connector.queryMetrics) continue;
      for (const metric of KEY_METRICS) {
        try {
          const result = await connector.queryMetrics({ service: serviceName, metric, duration });
          const points = result.values.map((v) => ({ timestamp: v.timestamp, value: v.value }));
          const anomaly = detectAnomaly(points, {
            threshold,
            metricKind: classifyMetric(metric),
          });
          if (anomaly.isAnomaly) {
            // A zero baseline has no meaningful relative change; report 100%.
            const deviationPercent =
              anomaly.baselineValue === 0
                ? 100
                : Math.round(
                    ((anomaly.recentValue - anomaly.baselineValue) / anomaly.baselineValue) * 100
                  );
            const magnitude = Math.abs(anomaly.score);
            allAnomalies.push({
              metric,
              severity: magnitude >= 6 ? "high" : magnitude >= 4 ? "medium" : "low",
              description: `${metric}: ${anomaly.reason}`,
              currentValue: anomaly.recentValue,
              baselineValue: anomaly.baselineValue,
              deviationPercent,
              source: connector.name,
              service: serviceName,
            });
          }
        } catch {
          // Best effort: skip metrics that don't exist for this service
        }
      }
    }

    // Check logs for error spikes
    for (const connector of logConnectors) {
      if (!connector.queryLogs) continue;
      try {
        const logs = await connector.queryLogs({ service: serviceName, duration, limit: 500 });

        // Critical-pattern scan — independent of the error-ratio gate, so a
        // warn-level OOM/leak signal is not silently dropped.
        const criticalPattern = logs.summary.topPatterns.find((p) =>
          CRITICAL_LOG_PATTERN.test(p)
        );
        if (criticalPattern) {
          allAnomalies.push({
            metric: "log_critical_pattern",
            severity: "high",
            description: `Critical log pattern detected: "${criticalPattern}"`,
            currentValue: logs.summary.errorCount + logs.summary.warnCount,
            baselineValue: 0,
            deviationPercent: 100,
            source: connector.name,
            service: serviceName,
          });
        }

        // Error-rate spike: needs more than 5 errors AND more than 10% of the sample.
        if (logs.summary.errorCount > 5) {
          const errorRatio =
            logs.summary.total > 0 ? logs.summary.errorCount / logs.summary.total : 0;
          if (errorRatio > 0.1) {
            allAnomalies.push({
              metric: "log_error_rate",
              severity: errorRatio > 0.3 ? "high" : errorRatio > 0.15 ? "medium" : "low",
              description: `${Math.round(errorRatio * 100)}% of logs are errors (${logs.summary.errorCount}/${logs.summary.total}). Top: ${logs.summary.topPatterns[0] || "N/A"}`,
              currentValue: logs.summary.errorCount,
              baselineValue: 0,
              deviationPercent: 100,
              source: connector.name,
              service: serviceName,
            });
          }
        }
      } catch {
        // Best effort: skip if logs unavailable
      }
    }
  }

  // Cross-signal correlation
  const servicesWithAnomalies = [...new Set(allAnomalies.map((a) => a.service))];
  // Every log-derived finding uses a "log_" prefix; none of them is a metric
  // anomaly, so they must not be listed as one in the correlation message.
  const isLogSignal = (signal: string): boolean => signal.startsWith("log_");
  for (const svc of servicesWithAnomalies) {
    const svcAnomalies = allAnomalies.filter((a) => a.service === svc);
    const metricTypes = svcAnomalies.map((a) => a.metric).filter((m) => !isLogSignal(m));
    const hasLogAnomaly = svcAnomalies.some((a) => a.metric === "log_error_rate");

    if (metricTypes.length > 0 && hasLogAnomaly) {
      allCorrelations.push(
        `${svc}: metric anomalies (${metricTypes.join(", ")}) correlate with elevated log error rate`
      );
    }
    if (metricTypes.includes("cpu") && metricTypes.includes("latency_p99")) {
      allCorrelations.push(
        `${svc}: CPU spike and latency increase detected simultaneously — possible resource saturation`
      );
    }
  }

  // Dependency-aware root-cause ranking. The service graph / change markers
  // are empty here (no trace source wired yet); ranking then degrades to
  // severity-weighted ordering and still names the most likely culprit
  // instead of just listing "both signals bad".
  // Fed with the discovery-ordered list so its tie-breaking is unaffected by
  // the output ordering below.
  const rootCause =
    allAnomalies.length > 0
      ? rankRootCause(
          allAnomalies.map((a) => ({
            service: a.service,
            metric: a.metric,
            severity: a.severity,
          }))
        )
      : { ranked: [], summary: "" };

  // The tool contract promises findings ranked by severity. Array sort is
  // stable, so discovery order is kept within each severity level.
  const severityRank: Record<string, number> = { high: 0, medium: 1, low: 2 };
  const rankOf = (a: AnomalyReport): number => severityRank[a.severity] ?? 3;
  const rankedAnomalies = [...allAnomalies].sort((a, b) => rankOf(a) - rankOf(b));

  const result = {
    scannedServices: serviceNames.length,
    anomalies: rankedAnomalies,
    correlations: allCorrelations,
    rootCause,
    summary:
      allAnomalies.length === 0
        ? "All services healthy — no anomalies detected."
        : `${allAnomalies.length} anomal${allAnomalies.length === 1 ? "y" : "ies"} detected across ${servicesWithAnomalies.length} service(s).`,
  };

  return {
    content: [{ type: "text" as const, text: JSON.stringify(result, null, 2) }],
  };
}
// - Input schema definition for detect_anomalies tool: optional service (string), duration (string, default '10m'), sensitivity (enum: low/medium/high, default 'medium').
/**
 * Static tool definition for `detect_anomalies`: tool name, human-readable
 * description and a JSON-Schema-style description of the accepted input.
 * All three input properties are optional (no `required` list is declared).
 */
export const detectAnomaliesDefinition = {
  name: "detect_anomalies" as const,
  // Kept as sentence parts for readability; joins to one space-separated string.
  description: [
    "Scan for anomalies across all monitored services (or a specific service).",
    "Detects metric deviations using z-score analysis against recent baseline, checks log error spikes, and correlates signals across metrics and logs.",
    "Returns anomalies with severity ratings and cross-signal correlations.",
  ].join(" "),
  inputSchema: {
    type: "object" as const,
    properties: {
      // Service to scan; every service is scanned when absent.
      service: {
        type: "string",
        description: "Specific service to scan. If omitted, scans all services.",
      },
      // Look-back window.
      duration: {
        type: "string",
        description: "Time range to analyze (e.g. '5m', '15m', '1h'). Default: '10m'",
      },
      // Detection threshold preset.
      sensitivity: {
        type: "string",
        enum: ["low", "medium", "high"],
        description:
          "Detection sensitivity. 'low' = major deviations only (>3σ), " +
          "'medium' = moderate (>2σ), 'high' = subtle changes (>1.5σ). Default: 'medium'",
      },
    },
  },
};
// - mcp-server/src/index.ts:279-308 (registration) Registration of detect_anomalies as an MCP tool with Zod schema validation for input parameters (service, duration, sensitivity) and delegation to detectAnomaliesHandler with metrics instrumentation.
// Registers `detect_anomalies` on the MCP server: tool name, description,
// Zod input shape (all fields optional) and a callback that delegates to
// detectAnomaliesHandler wrapped in withToolMetrics.
mcpServer.tool(
  "detect_anomalies",
  "Scan one or all monitored services for abnormal behavior and return the findings ranked by severity." +
    " When to use: the entry point for 'is anything wrong anywhere?' triage. Once a service is flagged, follow up with `get_service_health` for the verdict or `query_metrics`/`query_logs` for the raw evidence." +
    " Behavior: read-only, no side effects. Applies z-score analysis to metrics, detects log error-rate spikes, and correlates the two. Returns a list of anomalies, each with the affected service, metric/signal, severity, the deviation (e.g. σ and % change), and a short explanation. No anomalies yields an empty list, not an error." +
    " Related: `get_service_health` (single-service verdict), `query_metrics` (raw series behind a flagged metric).",
  {
    service: z
      .string()
      .optional()
      .describe(
        "Optional. Restrict the scan to one service (exact, case-sensitive name from `list_services`). Default: scan every monitored service."
      ),
    duration: z
      .string()
      .optional()
      .describe(
        "Optional. Look-back window analyzed for anomalies, written as <number><unit> with unit s|m|h|d (e.g. '5m', '15m', '1h'). Default: '10m'."
      ),
    sensitivity: z
      .enum(["low", "medium", "high"])
      .optional()
      .describe(
        "Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."
      ),
  },
  async (args) => {
    // The scan itself; withToolMetrics invokes it under the tool's name.
    const runScan = () => detectAnomaliesHandler(registry, args, ctx);
    return withToolMetrics("detect_anomalies", runScan);
  }
);
// - detectAnomaly() orchestrator: prefers seasonality-aware detection (detectSeasonalAnomaly) when enough multi-period history exists, otherwise falls back to robust z-score detection (detectRobustAnomaly) using median/MAD for outlier resistance.
/**
 * Picks the anomaly detector for a series and normalises its verdict.
 *
 * `detectSeasonalAnomaly` is always tried first. If it reports the result as
 * `applicable`, that verdict is returned with `method: "seasonal"` and its
 * `expected` value exposed as `baselineValue`. Otherwise the raw values are
 * handed to `detectRobustAnomaly` and its verdict (including its own `method`)
 * is returned.
 *
 * @param points Time-stamped samples; only `value` is used by the fallback.
 * @param opts Options forwarded unchanged to both detectors.
 * @returns A detector-independent verdict for the series.
 */
export function detectAnomaly(
  points: SeasonalPoint[],
  opts: SeasonalAnomalyOptions & RobustAnomalyOptions = {}
): {
  isAnomaly: boolean;
  method: "seasonal" | "robust-z" | "trend" | "none";
  score: number;
  recentValue: number;
  baselineValue: number;
  direction: "above" | "below" | "flat";
  reason: string;
} {
  const seasonalVerdict = detectSeasonalAnomaly(points, opts);

  // Fallback path: the seasonal detector declined to judge this series.
  if (!seasonalVerdict.applicable) {
    const rawValues = points.map((point) => point.value);
    const { isAnomaly, method, score, recentValue, baselineValue, direction, reason } =
      detectRobustAnomaly(rawValues, opts);
    return { isAnomaly, method, score, recentValue, baselineValue, direction, reason };
  }

  const { isAnomaly, score, recentValue, expected, direction, reason } = seasonalVerdict;
  return {
    isAnomaly,
    method: "seasonal",
    score,
    recentValue,
    baselineValue: expected,
    direction,
    reason,
  };
}
// - rankRootCause() ranks anomalous services by dependency position (caller/callee graph), onset ordering, change markers, and severity weights to identify the likely root cause.
/**
 * Ranks the services that have anomalies by how likely each one is the root
 * cause of the incident.
 *
 * Score contributions per service:
 *  - +5 for every other anomalous service that (transitively) depends on it;
 *  - −3 for every anomalous service it (transitively) depends on;
 *  - +4 if its earliest onset is the overall earliest, +1 if within 60s of it;
 *  - +4 if a change marker for the service falls within the 15 minutes before
 *    its earliest onset (or if any marker exists when no onset is known);
 *  - +0.25 × the sum of `SEV_WEIGHT` over its anomalies as a tie-breaker.
 *
 * @param anomalies Anomalies to attribute; `onsetTs` is optional.
 * @param edges Dependency edges, `from` depends on `to`. Defaults to none.
 * @param changes Deploy/change markers. Defaults to none.
 * @returns Candidates sorted by descending score plus a one-line summary.
 *   Only the top candidate's confidence is derived from the score gap; all
 *   others keep "low".
 */
export function rankRootCause(
  anomalies: RankInputAnomaly[],
  edges: ServiceEdge[] = [],
  changes: ChangeMarker[] = []
): RootCauseResult {
  const services = [...new Set(anomalies.map((a) => a.service))];
  if (services.length === 0) {
    return { ranked: [], summary: "No anomalies to attribute." };
  }

  // "depends on": from -> set(to). A root cause is a service that other
  // anomalous services (transitively) depend on.
  const deps = new Map<string, Set<string>>();
  for (const e of edges) {
    let targets = deps.get(e.from);
    if (!targets) {
      targets = new Set();
      deps.set(e.from, targets);
    }
    targets.add(e.to);
  }

  // Depth-first reachability; `seen` guards against cycles in the graph.
  const dependsOn = (from: string, to: string, seen = new Set<string>()): boolean => {
    if (seen.has(from)) return false;
    seen.add(from);
    const direct = deps.get(from);
    if (!direct) return false;
    if (direct.has(to)) return true;
    for (const mid of direct) if (dependsOn(mid, to, seen)) return true;
    return false;
  };

  // Earliest finite onset in a list, or undefined when none is known. A plain
  // scan avoids `Math.min(...spread)` (Infinity on empty input, argument-count
  // limits on very large input) and the non-null assertions it required.
  const earliestOnset = (items: RankInputAnomaly[]): number | undefined => {
    let min: number | undefined;
    for (const a of items) {
      const ts = a.onsetTs;
      if (typeof ts === "number" && Number.isFinite(ts) && (min === undefined || ts < min)) {
        min = ts;
      }
    }
    return min;
  };

  const earliest = earliestOnset(anomalies);

  const candidates: RootCauseCandidate[] = services.map((svc) => {
    const svcAnoms = anomalies.filter((a) => a.service === svc);
    const reasons: string[] = [];
    let score = 0;

    // (1) Dependency position: how many *other* anomalous services depend on
    // this one. Each dependent is a downstream symptom this service explains.
    const dependents = services.filter((other) => other !== svc && dependsOn(other, svc));
    if (dependents.length > 0) {
      score += 5 * dependents.length;
      reasons.push(
        `${dependents.length} anomalous service(s) depend on it (${dependents.join(", ")}) — their symptoms are likely downstream`
      );
    }

    // Penalty: this service depends on another anomalous one → likely a victim.
    const upstreamCauses = services.filter((other) => other !== svc && dependsOn(svc, other));
    if (upstreamCauses.length > 0) {
      score -= 3 * upstreamCauses.length;
      reasons.push(`depends on anomalous ${upstreamCauses.join(", ")} — may be a downstream victim`);
    }

    // One onset per service — its earliest — shared by steps (2) and (3) so
    // both reason about the same moment.
    const myOnset = earliestOnset(svcAnoms);

    // (2) Onset ordering: started at/near the earliest onset.
    if (earliest !== undefined && myOnset !== undefined) {
      const lagSec = Math.round((myOnset - earliest) / 1000);
      if (lagSec <= 0) {
        score += 4;
        reasons.push("anomaly started first (earliest onset)");
      } else if (lagSec <= 60) {
        score += 1;
        reasons.push(`onset ${lagSec}s after the first signal`);
      } else {
        reasons.push(`onset ${lagSec}s after the first signal — likely reactive`);
      }
    }

    // (3) Deploy/change marker shortly before onset (15-minute window). With no
    // known onset, any marker on the service counts.
    const marker = changes
      .filter((c) => c.service === svc)
      .find((c) => myOnset === undefined || (c.ts <= myOnset && myOnset - c.ts <= 15 * 60_000));
    if (marker) {
      score += 4;
      // Explicit undefined check: an onset timestamp of 0 is still a real onset.
      const timing =
        myOnset !== undefined
          ? `${Math.round((myOnset - marker.ts) / 1000)}s before onset`
          : "near the incident";
      reasons.push(`${marker.kind || "change"} on this service ${timing}`);
    }

    // Tie-breaker: signal breadth × severity (small weight). An unrecognised
    // severity contributes 0 rather than turning the score into NaN.
    const breadth = svcAnoms.reduce((s, a) => s + (SEV_WEIGHT[a.severity] ?? 0), 0);
    score += 0.25 * breadth;

    return { service: svc, score, confidence: "low" as const, reasons };
  });

  candidates.sort((a, b) => b.score - a.score);

  // Confidence from the score gap between #1 and #2.
  const top = candidates[0];
  const gap = candidates.length > 1 ? top.score - candidates[1].score : top.score;
  top.confidence = gap >= 5 ? "high" : gap >= 2 ? "medium" : "low";

  const summary =
    candidates.length === 1
      ? `Single anomalous service: ${top.service}.`
      : `Likely root cause: ${top.service} (${top.confidence} confidence). ${
          top.reasons[0] || "ranked by severity"
        }. ${candidates.length - 1} other service(s) likely downstream.`;

  return { ranked: candidates, summary };
}