srt_run_watchdog
Run real health checks from the container to measure API, frontend, disk, memory, TLS, DB, and DNS. Returns actual values without AI-generated data.
Instructions
Run real health check probes from the MCP container (API health, frontend, disk, memory, TLS cert, DB, DNS). Returns actual measured values — never uses AI-provided data. Classification: INFORMATIONAL — read-only, no side effects.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| check_results | No | IGNORED — real probes are always used. This parameter exists for backward compatibility only. | — |
Implementation Reference
- src/mcp/tools/srt.ts:592-702 (handler) Main handler function 'registerSRTRunWatchdogTool' that registers the 'srt_run_watchdog' MCP tool. Runs real health probes (API, frontend, disk, memory, TLS cert, DB, DNS), classifies findings, creates incidents, and emits governance telemetry.
/**
 * Priority-ordered rules that map failed check IDs to a finding type.
 * The first rule with at least one failed check ID wins, so the order of
 * this table is significant.
 */
const SRT_FINDING_TYPE_RULES: ReadonlyArray<{ findingType: string; checkIds: readonly string[] }> = [
  { findingType: 'SERVICE_DOWN', checkIds: ['api-healthz', 'container-health-api'] },
  { findingType: 'DB_UNREACHABLE', checkIds: ['db-connectivity'] },
  { findingType: 'DISK_PRESSURE', checkIds: ['disk-usage'] },
  { findingType: 'MEMORY_PRESSURE', checkIds: ['memory-usage'] },
  { findingType: 'CPU_PRESSURE', checkIds: ['cpu-usage'] },
  { findingType: 'TLS_EXPIRING', checkIds: ['tls-expiry'] },
  { findingType: 'DNS_FAILURE', checkIds: ['dns-resolve'] },
  { findingType: 'CONFIG_INVALID', checkIds: ['nginx-process', 'nginx-config-test'] },
  { findingType: 'ERROR_RATE_SPIKE', checkIds: ['api-error-rate'] },
  // ── Security scanner finding types ──
  { findingType: 'SSH_HARDENING_REQUIRED', checkIds: ['ssh-root-login', 'ssh-password-auth', 'ssh-max-auth', 'ssh-x11', 'ssh-idle-timeout'] },
  { findingType: 'FIREWALL_INACTIVE', checkIds: ['ufw-inactive', 'firewall-missing', 'firewall-inactive'] },
  { findingType: 'SECURITY_PACKAGES_MISSING', checkIds: ['fail2ban-missing', 'packages-outdated', 'pending-updates'] },
  { findingType: 'DOCKER_HARDENING_REQUIRED', checkIds: ['docker-privileged', 'docker-root-user', 'docker-no-limits', 'docker-no-daemon-config'] },
];

/**
 * Picks the finding type for a set of failed checks.
 *
 * @param failedCheckIds - IDs of every check whose status was not 'PASS'.
 * @returns The finding type of the first matching rule, or 'HEALTH_DEGRADED'
 *          when no rule matches.
 */
function classifySRTFindingType(failedCheckIds: readonly string[]): string {
  const matched = SRT_FINDING_TYPE_RULES.find(rule =>
    rule.checkIds.some(id => failedCheckIds.includes(id)),
  );
  return matched?.findingType ?? 'HEALTH_DEGRADED';
}

/**
 * Registers the 'srt_run_watchdog' MCP tool on `server`.
 *
 * The handler ignores its input, runs the real health probes, and then either
 * reports HEALTHY or builds a finding, stores a new incident in `incidents`,
 * persists it, and returns the incident summary. Governance telemetry is
 * emitted on every path (healthy, incident created, failure).
 *
 * NOTE(review): the tool description advertises "read-only, no side effects",
 * but the failure path stores and persists an incident. The description string
 * is left untouched here; confirm which of the two is intended.
 *
 * @param server - MCP server the tool is registered on.
 * @param engine - Governance engine whose `telemetryService` receives the
 *                 probe-result and tool-call events.
 */
export function registerSRTRunWatchdogTool(server: McpServer, engine: GovernanceEngine): void {
  server.tool(
    'srt_run_watchdog',
    'Run real health check probes from the MCP container (API health, frontend, disk, memory, TLS cert, DB, DNS). Returns actual measured values — never uses AI-provided data. Classification: INFORMATIONAL — read-only, no side effects.',
    {
      check_results: z.array(z.object({
        check_id: z.string().describe('Check identifier (e.g. disk-usage, api-healthz)'),
        name: z.string().describe('Human-readable check name'),
        status: z.enum(['PASS', 'WARNING', 'CRITICAL', 'ERROR', 'TIMEOUT']).describe('Check result status'),
        value: z.number().describe('Measured value'),
        unit: z.string().describe('Unit (%, ms, days, count, bool)'),
        threshold: z.number().describe('Threshold that was compared against'),
        message: z.string().describe('Status message'),
      })).optional().describe('IGNORED — real probes are always used. This parameter exists for backward compatibility only.'),
    },
    // Cast kept from the original: the annotations object carries a `_meta` entry.
    {
      title: 'Run SRT Watchdog',
      readOnlyHint: false,
      idempotentHint: false,
      destructiveHint: false,
      openWorldHint: false,
      _meta: { ui: { resourceUri: 'ui://srt-health' } },
    } as any,
    async (_input) => {
      try {
        // Recovery now runs inside the try block so that a failure here is
        // reported as WATCHDOG_FAILED (with failure telemetry) instead of
        // escaping the handler as an unhandled rejection.
        await ensureSRTRecovery(); // One-time recovery from PostgreSQL

        // ALWAYS run real probes — never trust AI-provided check_results
        const results: SRTCheckResult[] = await runRealHealthChecks();
        const failures = results.filter(r => r.status !== 'PASS');

        if (failures.length === 0) {
          // Auto-emit governance telemetry — all probes passed
          engine.telemetryService.emitProbeResult('watchdog-healthy', 'health-check', true, results.length);
          engine.telemetryService.emitToolCall('srt_run_watchdog', 'watchdog-healthy', 'INFORMATIONAL', true);
          return {
            content: [{
              type: 'text' as const,
              text: JSON.stringify({
                status: 'HEALTHY',
                checksRun: results.length,
                checksFailed: 0,
                timestamp: new Date().toISOString(),
                message: 'All health checks passed. No action needed.',
              }, null, 2),
            }],
          };
        }

        // Build finding capsule. CRITICAL and ERROR statuses escalate to
        // CRITICAL; any other non-PASS status yields HIGH.
        const hasCritical = failures.some(r => r.status === 'CRITICAL' || r.status === 'ERROR');
        const severity = hasCritical ? 'CRITICAL' : 'HIGH';

        // Classify finding type
        const failedIds = failures.map(f => f.checkId);
        const findingType = classifySRTFindingType(failedIds);

        const signal = failedIds.join('+');
        const observations = failures.map(f =>
          `${f.status}: ${f.name} = ${f.value}${f.unit} (threshold: ${f.threshold}${f.unit})`
        );
        const metrics: Record<string, number> = {};
        for (const r of results) {
          metrics[r.checkId] = r.value;
        }

        // Read the clock once so the finding timestamp, the evidence refs and
        // the incident's createdAt/updatedAt all describe the same instant.
        const now = new Date();
        const isoNow = now.toISOString();
        const evidenceStamp = now.getTime().toString(36);

        const finding: SRTFinding = {
          findingId: genId('FND'),
          findingType,
          severity,
          signal,
          observations,
          metrics,
          // Severity is always CRITICAL or HIGH on this path, so the former
          // 'MONITOR' alternative could never be selected.
          recommendedNext: 'TRIGGER_DIAGNOSTICIAN',
          evidenceRefs: failures.map(f => `EVD-${f.checkId}-${evidenceStamp}`),
          checksRun: results.length,
          checksFailed: failures.length,
          timestamp: isoNow,
          ttl: 24, // NOTE(review): unit not visible here — presumably hours; confirm
        };

        // Create incident
        const incident: SRTIncident = {
          incidentId: genId('INC'),
          status: 'DETECTED',
          severity,
          finding,
          createdAt: isoNow,
          updatedAt: isoNow,
        };
        incidents.set(incident.incidentId, incident);
        persistIncident(incident); // Write-through to PostgreSQL

        // Auto-emit governance telemetry — probes found failures
        engine.telemetryService.emitProbeResult(incident.incidentId, findingType, false, results.length);
        engine.telemetryService.emitToolCall('srt_run_watchdog', incident.incidentId, 'INFORMATIONAL', true);

        return {
          content: [{
            type: 'text' as const,
            text: JSON.stringify({
              incidentCreated: true,
              incidentId: incident.incidentId,
              finding,
              activeIncidents: incidents.size,
            }, null, 2),
          }],
        };
      } catch (error: unknown) {
        // Any failure (recovery, probes, persistence, telemetry) is reported
        // to the caller as a tool error rather than thrown.
        engine.telemetryService.emitToolCall('srt_run_watchdog', `watchdog-err-${Date.now().toString(36)}`, 'INFORMATIONAL', false);
        return {
          content: [{
            type: 'text' as const,
            text: JSON.stringify({ error: 'WATCHDOG_FAILED', message: String(error) }),
          }],
          isError: true,
        };
      }
    }
  );
}
- src/mcp/tools/srt.ts:1141-1146 (registration)Convenience function 'registerSRTTools' that registers all 4 SRT tools including srt_run_watchdog via 'registerSRTRunWatchdogTool'.
/**
 * Registers all four SRT tools on the given MCP server, in a fixed order:
 * run-watchdog, diagnose, approve-repair, generate-postmortem.
 *
 * @param server - MCP server that receives the tool registrations.
 * @param engine - Governance engine handed to each tool's registrar.
 */
export function registerSRTTools(server: McpServer, engine: GovernanceEngine): void {
  const registrars = [
    registerSRTRunWatchdogTool,
    registerSRTDiagnoseTool,
    registerSRTApproveRepairTool,
    registerSRTGeneratePostmortemTool,
  ];
  // Iterate in declaration order so registration order matches the original.
  for (const register of registrars) {
    register(server, engine);
  }
}
- src/mcp/server.ts:43-43 (registration)Import of 'registerSRTTools' from './tools/srt.js' which includes srt_run_watchdog registration.
import { registerSRTTools } from './tools/srt.js'; - src/mcp/server.ts:121-121 (registration)Registration entry for SRT tools including srt_run_watchdog, listed as operator-tier tool.
{ tier: 'operator', register: registerSRTTools, description: 'srt (run_watchdog, diagnose, approve_repair, generate_postmortem)' }, - src/core/audit/telemetry.ts:182-182 (schema)Telemetry classification schema entry for srt_run_watchdog: toolClass='read', riskTier='low', maiDefault='INFORMATIONAL', requiresHumanApproval=false.
{ toolName: 'srt_run_watchdog', toolClass: 'read', riskTier: 'low', maiDefault: 'INFORMATIONAL', requiresHumanApproval: false, category: 'srt' },