DevOps AI Toolkit

remediate.test.ts•26.1 KiB

/**
 * Integration Test: Remediate Tool
 *
 * Tests the complete remediation workflow via REST API against a real test cluster.
 * Validates AI-powered investigation, root cause analysis, remediation execution,
 * and actual cluster state fixes.
 */

import { describe, test, expect, beforeAll } from 'vitest';
import { IntegrationTest } from '../helpers/test-base.js';

/**
 * Converts a Kubernetes memory quantity string into mebibytes (Mi).
 *
 * Supports binary suffixes (Ki, Mi, Gi, Ti), decimal suffixes (k, M, G, T) and
 * plain byte counts, with optional fractional values (e.g. "0.5Gi"). This lets
 * the test compare the remediated limit against the original 128Mi regardless
 * of which unit the AI chose when patching the deployment.
 *
 * @param quantity - Memory quantity as reported by the Kubernetes API (e.g. "256Mi", "1G").
 * @returns The quantity expressed in Mi.
 * @throws Error when the quantity uses a format this helper does not recognize.
 */
function memoryQuantityToMi(quantity: string): number {
  const match = /^(\d+(?:\.\d+)?)(Ki|Mi|Gi|Ti|k|M|G|T)?$/.exec(String(quantity).trim());
  if (!match) {
    throw new Error(`Unrecognized memory quantity: ${quantity}`);
  }

  const bytesPerUnit: Record<string, number> = {
    '': 1,
    Ki: 1024,
    Mi: 1024 ** 2,
    Gi: 1024 ** 3,
    Ti: 1024 ** 4,
    k: 1e3,
    M: 1e6,
    G: 1e9,
    T: 1e12
  };

  const factor = bytesPerUnit[match[2] ?? ''];
  if (factor === undefined) {
    throw new Error(`Unrecognized memory unit in quantity: ${quantity}`);
  }

  return (Number(match[1]) * factor) / 1024 ** 2;
}

describe.concurrent('Remediate Tool Integration', () => {
  const integrationTest = new IntegrationTest();
  const testNamespace = 'remediate-test';

  beforeAll(() => {
    // Verify we're using the test cluster
    const kubeconfig = process.env.KUBECONFIG;
    expect(kubeconfig).toContain('kubeconfig-test.yaml');
  });

  describe('Manual Mode Workflow', () => {
    test('should complete full workflow: setup broken deployment → investigate → execute → verify cluster fix', async () => {
      // SETUP: Create namespace
      await integrationTest.kubectl(`create namespace ${testNamespace}`);

      // SETUP: Create deployment with insufficient memory (will OOMKill)
      // Memory limit of 128Mi with stress requesting 250M causes intentional OOM crashes
      await integrationTest.kubectl(`apply -n ${testNamespace} -f - <<'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test-app
  namespace: ${testNamespace}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: test-app
  template:
    metadata:
      labels:
        app: test-app
    spec:
      containers:
      - name: stress
        image: polinux/stress:1.0.4
        command: ["stress"]
        args: ["--vm", "1", "--vm-bytes", "250M", "--vm-hang", "1"]
        resources:
          limits:
            memory: "128Mi"
          requests:
            memory: "64Mi"
EOF`);

      // Wait for pod to start and crash at least once (with retry loop)
      let podData: any;
      let restartCountInitial = 0;
      const maxWaitTime = 90000; // 90 seconds max
      const checkInterval = 5000; // Check every 5 seconds
      const startTime = Date.now();

      while (Date.now() - startTime < maxWaitTime) {
        const podsJson = await integrationTest.kubectl(
          `get pods -n ${testNamespace} -l app=test-app -o json`
        );

        // Skip if empty response (pods not ready yet)
        if (!podsJson || podsJson.trim() === '') {
          await new Promise(resolve => setTimeout(resolve, checkInterval));
          continue;
        }

        const podsData = JSON.parse(podsJson);
        if (podsData.items && podsData.items.length > 0) {
          podData = podsData.items[0];
          if (podData.status.containerStatuses && podData.status.containerStatuses[0]) {
            restartCountInitial = podData.status.containerStatuses[0].restartCount;
            if (restartCountInitial > 0) {
              break; // Pod has crashed and restarted
            }
          }
        }

        await new Promise(resolve => setTimeout(resolve, checkInterval));
      }

      // Fail with a clear assertion (instead of a TypeError) if no pod ever appeared
      expect(podData).toBeDefined();

      // Verify pod is in a problematic state (CrashLoopBackOff, Running but will crash, or Pending)
      expect(podData.status.phase).toMatch(/Running|Pending/);

      // Verify pod has restarted at least once (indicating OOM crashes)
      expect(restartCountInitial).toBeGreaterThan(0); // Should have crashed at least once

      // PHASE 1: AI Investigation
      const investigationResponse = await integrationTest.httpClient.post(
        '/api/v1/tools/remediate',
        {
          issue: `my app in ${testNamespace} namespace is crashing`,
          interaction_id: 'manual_analyze'
        }
      );

      // Validate investigation response (based on actual curl inspection)
      const expectedInvestigationResponse = {
        success: true,
        data: {
          result: {
            status: 'awaiting_user_approval',
            sessionId: expect.stringMatching(/^rem-\d+-[a-f0-9]{8}$/),
            investigation: {
              iterations: expect.any(Number),
              // Entries look like "<tool_name> (call <n>)"; parentheses must be escaped
              dataGathered: expect.arrayContaining([
                expect.stringMatching(/^kubectl_\w+ \(call \d+\)$/)
              ])
            },
            analysis: {
              rootCause: expect.any(String), // AI describes OOM/memory issue in various ways
              confidence: expect.any(Number),
              factors: expect.any(Array)
            },
            remediation: {
              summary: expect.stringContaining('memory'),
              actions: expect.arrayContaining([
                expect.objectContaining({
                  description: expect.any(String),
                  command: expect.stringContaining('kubectl'),
                  risk: expect.stringMatching(/^(low|medium|high)$/),
                  rationale: expect.any(String)
                })
              ]),
              risk: expect.stringMatching(/^(low|medium|high)$/)
            },
            validationIntent: expect.any(String),
            executed: false,
            mode: 'manual',
            guidance: expect.stringContaining('CRITICAL'),
            agentInstructions: expect.stringContaining('Show the user'),
            nextAction: 'remediate',
            message: expect.any(String),
            // PRD #320: Remediate tool returns visualizationUrl
            visualizationUrl: expect.stringMatching(/^https:\/\/dot-ai-ui\.test\.local\/v\/rem-\d+-[a-f0-9]+$/),
            executionChoices: [
              expect.objectContaining({
                id: 1,
                label: 'Execute automatically via MCP',
                description: expect.any(String),
                risk: expect.stringMatching(/^(low|medium|high)$/)
              }),
              expect.objectContaining({
                id: 2,
                label: 'Execute via agent',
                description: expect.any(String),
                risk: expect.stringMatching(/^(low|medium|high)$/)
              })
            ]
          },
          tool: 'remediate',
          executionTime: expect.any(Number)
        },
        meta: {
          timestamp: expect.stringMatching(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/),
          requestId: expect.any(String),
          version: 'v1'
        }
      };

      expect(investigationResponse).toMatchObject(expectedInvestigationResponse);

      // PRD #320: Verify visualization URL is present in response (not embedded in message)
      expect(investigationResponse.data.result.visualizationUrl).toBeTruthy();
      expect(investigationResponse.data.result.visualizationUrl).toMatch(/^https?:\/\//);

      // Extract sessionId for execution
      const sessionId = investigationResponse.data.result.sessionId;
      const remediationActions = investigationResponse.data.result.remediation.actions;

      // Verify AI found OOM issue and memory-related remediation
      expect(investigationResponse.data.result.analysis.rootCause.toLowerCase()).toMatch(/oom|memory/);
      expect(investigationResponse.data.result.analysis.confidence).toBeGreaterThan(0.8);
      expect(remediationActions.length).toBeGreaterThan(0);

      // SESSION RETRIEVAL: Test GET /api/v1/sessions/{sessionId} for URL sharing/refresh support
      const sessionStartTime = Date.now();
      const sessionResponse = await integrationTest.httpClient.get(`/api/v1/sessions/${sessionId}`);
      const sessionRetrievalTime = Date.now() - sessionStartTime;

      // Should be fast (no AI call - just file read)
      expect(sessionRetrievalTime).toBeLessThan(1000); // Under 1 second

      // Validate session response structure
      const expectedSessionResponse = {
        success: true,
        data: {
          sessionId: sessionId,
          createdAt: expect.any(String),
          updatedAt: expect.any(String),
          data: {
            toolName: 'remediate',
            issue: expect.stringContaining(testNamespace),
            mode: 'manual',
            status: 'analysis_complete',
            finalAnalysis: {
              status: 'awaiting_user_approval',
              sessionId: sessionId,
              analysis: {
                rootCause: expect.any(String),
                confidence: expect.any(Number),
                factors: expect.any(Array)
              },
              remediation: {
                summary: expect.stringContaining('memory'),
                actions: expect.any(Array),
                risk: expect.stringMatching(/^(low|medium|high)$/)
              }
            }
          }
        },
        meta: {
          timestamp: expect.any(String),
          requestId: expect.any(String),
          version: 'v1'
        }
      };

      expect(sessionResponse).toMatchObject(expectedSessionResponse);

      // NOTE: Visualization endpoint is tested in version.test.ts (fastest tool)

      // PHASE 2: Execute remediation via MCP (choice 1)
      const executionResponse = await integrationTest.httpClient.post(
        '/api/v1/tools/remediate',
        {
          executeChoice: 1,
          sessionId,
          mode: 'manual',
          interaction_id: 'manual_execute'
        }
      );

      // Execution response status is either 'success' or 'awaiting_user_approval'.
      // Cluster state is verified directly in Phase 3 regardless of status.
      const expectedExecutionResponse = {
        success: true,
        data: {
          result: {
            sessionId: sessionId,
            executed: true,
            results: expect.arrayContaining([
              expect.objectContaining({
                action: expect.any(String),
                success: true,
                timestamp: expect.any(String)
              })
            ]),
            executedCommands: expect.any(Array),
            analysis: expect.objectContaining({
              rootCause: expect.any(String),
              confidence: expect.any(Number)
            }),
            remediation: expect.objectContaining({
              summary: expect.any(String),
              actions: expect.any(Array),
              risk: expect.stringMatching(/^(low|medium|high)$/)
            }),
            investigation: expect.objectContaining({
              iterations: expect.any(Number)
            })
          },
          tool: 'remediate',
          executionTime: expect.any(Number)
        },
        meta: {
          timestamp: expect.stringMatching(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/),
          requestId: expect.any(String),
          version: 'v1'
        }
      };

      expect(executionResponse).toMatchObject(expectedExecutionResponse);

      // Status: 'success' when validation confirms fix, 'awaiting_user_approval' when AI wants more investigation
      expect(['success', 'awaiting_user_approval']).toContain(executionResponse.data.result.status);

      // Verify all remediation commands succeeded
      const results = executionResponse.data.result.results;
      results.forEach((result: any) => {
        expect(result.success).toBe(true);
      });

      // PHASE 3: Verify ACTUAL cluster remediation ✅ KEY VALIDATION
      // Wait for deployment to rollout new pods with updated memory
      await new Promise(resolve => setTimeout(resolve, 10000));

      // Get pod managed by deployment
      const afterPodsJson = await integrationTest.kubectl(
        `get pods -n ${testNamespace} -l app=test-app -o json`
      );
      const afterPodsData = JSON.parse(afterPodsJson);
      expect(afterPodsData.items.length).toBeGreaterThan(0);
      const afterPod = afterPodsData.items[0];

      // Verify pod is now running (not crashing)
      expect(afterPod.status.phase).toBe('Running');

      // Verify pod has not restarted since fix (restart count should be 0 for new pod)
      expect(afterPod.status.containerStatuses[0].restartCount).toBe(0);

      // Verify pod is actually healthy (Ready condition)
      const readyCondition = afterPod.status.conditions.find((c: any) => c.type === 'Ready');
      expect(readyCondition.status).toBe('True');

      // Verify deployment memory limit was increased (should be higher than original 128Mi)
      const deploymentJson = await integrationTest.kubectl(
        `get deployment test-app -n ${testNamespace} -o json`
      );
      const deploymentData = JSON.parse(deploymentJson);
      const memoryLimit = deploymentData.spec.template.spec.containers[0].resources.limits.memory;

      // Normalize the quantity to Mi so any valid Kubernetes unit (Mi, Gi, M, G, bytes) compares correctly
      const actualMi = memoryQuantityToMi(memoryLimit);
      expect(actualMi).toBeGreaterThan(128); // AI should have increased from 128Mi
    }, 1200000); // 20 minute timeout for AI investigation + execution + validation (accommodates slower AI models like Gemini)
  });

  describe('Automatic Mode Workflow', () => {
    const autoNamespace = 'remediate-auto-test';

    test('should auto-execute remediation when confidence and risk thresholds are met', async () => {
      // SETUP: Create namespace
      await integrationTest.kubectl(`create namespace ${autoNamespace}`);

      // SETUP: Create deployment with insufficient memory (OOM scenario for automatic mode)
      // Using Deployment instead of Pod because Pods have immutable container specs
      // Memory limit of 128Mi with stress requesting 250M causes intentional OOM crashes
      await integrationTest.kubectl(`apply -n ${autoNamespace} -f - <<'EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: auto-test-app
  namespace: ${autoNamespace}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: auto-test-app
  template:
    metadata:
      labels:
        app: auto-test-app
    spec:
      containers:
      - name: stress
        image: polinux/stress:1.0.4
        command: ["stress"]
        args: ["--vm", "1", "--vm-bytes", "250M", "--vm-hang", "1"]
        resources:
          limits:
            memory: "128Mi"
          requests:
            memory: "64Mi"
EOF`);

      // Wait for pod to start and crash (with retry loop)
      let restartCount = 0;
      const maxWaitTime = 90000; // 90 seconds max
      const checkInterval = 5000; // Check every 5 seconds
      const startTime = Date.now();

      while (Date.now() - startTime < maxWaitTime) {
        const podsJson = await integrationTest.kubectl(
          `get pods -n ${autoNamespace} -l app=auto-test-app -o json`
        );

        // Skip if empty response (pods not ready yet)
        if (!podsJson || podsJson.trim() === '') {
          await new Promise(resolve => setTimeout(resolve, checkInterval));
          continue;
        }

        const podsData = JSON.parse(podsJson);
        if (podsData.items && podsData.items.length > 0) {
          const podData = podsData.items[0];
          if (podData.status.containerStatuses && podData.status.containerStatuses[0]) {
            restartCount = podData.status.containerStatuses[0].restartCount;
            if (restartCount > 0) {
              break; // Pod has crashed and restarted
            }
          }
        }

        await new Promise(resolve => setTimeout(resolve, checkInterval));
      }

      // Verify pod has crashed at least once
      expect(restartCount).toBeGreaterThan(0);

      // PHASE 1: Call remediate with automatic mode (single call auto-executes everything)
      const autoResponse = await integrationTest.httpClient.post(
        '/api/v1/tools/remediate',
        {
          issue: `auto-test-app deployment in ${autoNamespace} namespace is crashing`,
          mode: 'automatic',
          confidenceThreshold: 0.1, // Very low threshold ensures auto-execution - we're testing the mechanism, not AI confidence
          maxRiskLevel: 'high', // Allow any risk level - we're testing auto-execution works when thresholds are met
          interaction_id: 'automatic_analyze_execute'
        }
      );

      // Validate automatic execution response
      const expectedAutoResponse = {
        success: true,
        data: {
          result: {
            status: 'success',
            executed: true, // KEY: Should auto-execute without user approval
            results: expect.arrayContaining([
              expect.objectContaining({
                success: true
              })
            ]),
            validation: {
              success: true // Validation should confirm the fix worked
            }
          }
        }
      };

      expect(autoResponse).toMatchObject(expectedAutoResponse);

      // Verify execution was automatic (no executionChoices)
      expect(autoResponse.data.result.executionChoices).toBeUndefined();

      // Verify all remediation commands succeeded
      const results = autoResponse.data.result.results;
      results.forEach((result: any) => {
        expect(result.success).toBe(true);
      });

      // PHASE 2: Verify ACTUAL cluster remediation - outcome-based validation
      await new Promise(resolve => setTimeout(resolve, 15000)); // Wait for new pods to stabilize

      // Get all pods in namespace - deployment controller will create new pods after patch
      const afterPodsJson = await integrationTest.kubectl(`get pods -n ${autoNamespace} -l app=auto-test-app -o json`);
      const afterPodsData = JSON.parse(afterPodsJson);

      // Should have at least one running stress workload pod
      const runningPods = afterPodsData.items.filter((pod: any) =>
        pod.status.phase === 'Running' &&
        pod.spec.containers.some((container: any) => container.image === 'polinux/stress:1.0.4')
      );
      expect(runningPods.length).toBeGreaterThan(0);

      // Should have no crashing pods (restart count = 0 means stable with new memory limits)
      const stablePod = runningPods[0];
      expect(stablePod.status.containerStatuses[0].restartCount).toBe(0);
    }, 1800000); // 30 minute timeout for automatic mode (accommodates slower AI models like OpenAI)
  });

  describe('Helm Release Remediation', () => {
    const helmNamespace = 'remediate-helm-test';

    test('should detect Helm release issues using Helm investigation tools and remediate', async () => {
      const { execSync } = await import('child_process');
      const kubeconfig = process.env.KUBECONFIG || './kubeconfig-test.yaml';

      // Runs a helm command and returns stdout; on failure returns whatever stdout was
      // captured (or '') instead of throwing, because the breaking upgrade is expected to fail.
      const runHelm = (cmd: string): string => {
        try {
          return execSync(`helm --kubeconfig=${kubeconfig} ${cmd}`, {
            encoding: 'utf8',
            stdio: ['pipe', 'pipe', 'pipe'],
            timeout: 180000
          });
        } catch (error: unknown) {
          return (error as { stdout?: string }).stdout || '';
        }
      };

      // SETUP: Create namespace and a test Helm chart
      await integrationTest.kubectl(`create namespace ${helmNamespace}`);
      execSync('rm -rf ./tmp/helm-remediate-test-chart');
      execSync('helm create ./tmp/helm-remediate-test-chart', { encoding: 'utf8', timeout: 30000 });

      // Install chart with known-good nginx image
      execSync(
        `helm --kubeconfig=${kubeconfig} install test-nginx ./tmp/helm-remediate-test-chart -n ${helmNamespace} --set image.tag=alpine --wait --timeout=120s`,
        { encoding: 'utf8', stdio: ['pipe', 'pipe', 'pipe'], timeout: 180000 }
      );

      // Verify initial deployment is healthy
      const initialPodsJson = await integrationTest.kubectl(
        `get pods -n ${helmNamespace} -l app.kubernetes.io/instance=test-nginx -o json`
      );
      const initialPodsData = JSON.parse(initialPodsJson);
      expect(initialPodsData.items.length).toBeGreaterThan(0);
      expect(initialPodsData.items[0].status.phase).toBe('Running');

      // BREAK: Upgrade with non-existent image (--wait fails, marking release as "failed")
      runHelm(
        `upgrade test-nginx ./tmp/helm-remediate-test-chart -n ${helmNamespace} --set image.repository=nonexistent-registry.invalid/nginx --set image.tag=doesnotexist --wait --timeout=60s`
      );

      // Wait for pods to be in ImagePullBackOff
      let podInErrorState = false;
      const maxWaitTime = 90000;
      const checkInterval = 5000;
      const startTime = Date.now();

      while (Date.now() - startTime < maxWaitTime) {
        const podsJson = await integrationTest.kubectl(`get pods -n ${helmNamespace} -o json`);
        if (podsJson && podsJson.trim() !== '') {
          const podsData = JSON.parse(podsJson);
          for (const pod of podsData.items) {
            for (const cs of (pod.status?.containerStatuses || [])) {
              const waitReason = cs.state?.waiting?.reason;
              if (waitReason === 'ImagePullBackOff' || waitReason === 'ErrImagePull') {
                podInErrorState = true;
                break;
              }
            }
            if (podInErrorState) break;
          }
        }
        if (podInErrorState) break;
        await new Promise(resolve => setTimeout(resolve, checkInterval));
      }

      expect(podInErrorState).toBe(true);

      // PHASE 1: AI Investigation (manual mode to inspect Helm tool usage)
      const investigationResponse = await integrationTest.httpClient.post(
        '/api/v1/tools/remediate',
        {
          issue: `helm release test-nginx in ${helmNamespace} namespace was upgraded and is now failing`,
          interaction_id: 'helm_investigate'
        }
      );

      expect(investigationResponse).toMatchObject({
        success: true,
        data: {
          result: {
            status: 'awaiting_user_approval',
            sessionId: expect.stringMatching(/^rem-\d+-[a-f0-9]{8}$/),
            investigation: {
              iterations: expect.any(Number),
              // Entries look like "<tool_name> (call <n>)"; parentheses must be escaped
              dataGathered: expect.arrayContaining([
                expect.stringMatching(/^(kubectl_|helm_)\w+ \(call \d+\)$/)
              ])
            },
            analysis: {
              rootCause: expect.any(String),
              confidence: expect.any(Number),
              factors: expect.arrayContaining([expect.any(String)])
            },
            remediation: {
              summary: expect.any(String),
              actions: expect.arrayContaining([
                expect.objectContaining({
                  description: expect.any(String),
                  command: expect.any(String),
                  risk: expect.stringMatching(/^(low|medium|high)$/),
                  rationale: expect.any(String)
                })
              ]),
              risk: expect.stringMatching(/^(low|medium|high)$/)
            },
            validationIntent: expect.any(String),
            executed: false,
            mode: 'manual',
            guidance: expect.stringContaining('CRITICAL'),
            agentInstructions: expect.stringContaining('Show the user'),
            nextAction: 'remediate',
            message: expect.any(String),
            visualizationUrl: expect.stringMatching(/^https:\/\/dot-ai-ui\.test\.local\/v\/rem-\d+-[a-f0-9]+$/),
            executionChoices: [
              {
                id: 1,
                label: 'Execute automatically via MCP',
                description: expect.any(String),
                risk: expect.stringMatching(/^(low|medium|high)$/)
              },
              {
                id: 2,
                label: 'Execute via agent',
                description: expect.any(String),
                risk: expect.stringMatching(/^(low|medium|high)$/)
              }
            ]
          },
          tool: 'remediate',
          executionTime: expect.any(Number)
        },
        meta: {
          timestamp: expect.stringMatching(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$/),
          requestId: expect.any(String),
          version: 'v1'
        }
      });

      // KEY VALIDATION: AI used at least one Helm investigation tool
      const dataGathered: string[] = investigationResponse.data.result.investigation.dataGathered;
      const helmToolCalls = dataGathered.filter((entry: string) => entry.startsWith('helm_'));
      expect(helmToolCalls.length).toBeGreaterThan(0);

      // Verify AI identified the issue with reasonable confidence
      expect(investigationResponse.data.result.analysis.confidence).toBeGreaterThanOrEqual(0.7);
      expect(investigationResponse.data.result.analysis.rootCause.toLowerCase()).toMatch(/image|pull|helm|upgrade|fail/);
      expect(investigationResponse.data.result.remediation.actions.length).toBeGreaterThan(0);

      // PHASE 2: Execute remediation via MCP (choice 1)
      const sessionId = investigationResponse.data.result.sessionId;
      const executionResponse = await integrationTest.httpClient.post(
        '/api/v1/tools/remediate',
        {
          executeChoice: 1,
          sessionId,
          mode: 'manual',
          interaction_id: 'helm_execute'
        }
      );

      expect(executionResponse).toMatchObject({
        success: true,
        data: {
          result: {
            status: 'success',
            sessionId: sessionId,
            executed: true,
            results: expect.arrayContaining([
              expect.objectContaining({
                action: expect.any(String),
                success: true,
                timestamp: expect.any(String)
              })
            ]),
            executedCommands: expect.arrayContaining([expect.any(String)]),
            guidance: expect.stringContaining('REMEDIATION COMPLETE'),
            message: expect.stringContaining('resolved')
          },
          tool: 'remediate',
          executionTime: expect.any(Number)
        }
      });

      const results = executionResponse.data.result.results;
      results.forEach((result: { success: boolean }) => {
        expect(result).toMatchObject({ success: true });
      });

      // PHASE 3: Verify cluster recovery
      await new Promise(resolve => setTimeout(resolve, 15000));

      const afterPodsJson = await integrationTest.kubectl(
        `get pods -n ${helmNamespace} -l app.kubernetes.io/instance=test-nginx -o json`
      );
      const afterPodsData = JSON.parse(afterPodsJson);
      expect(afterPodsData.items.length).toBeGreaterThan(0);

      // At least one pod should be running and ready (recovered from bad image)
      const healthyPods = afterPodsData.items.filter((pod: { status: { phase: string; containerStatuses?: Array<{ ready: boolean; restartCount: number }> } }) =>
        pod.status.phase === 'Running' &&
        pod.status.containerStatuses?.[0]?.ready === true
      );
      expect(healthyPods.length).toBeGreaterThan(0);

      // Recovered pod should have zero restarts (fresh pod after rollback/fix)
      expect(healthyPods[0].status.containerStatuses[0].restartCount).toBe(0);
    }, 1200000); // 20 minute timeout
  });
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/vfarcic/dot-ai'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

remediate.test.ts•26.1 KiB