"""GPU diagnostics tools: exec `nvidia-smi` inside pods and summarize DCGM
health metrics (XID errors, throttling, temperature) from Prometheus."""
import logging

from openshift_mcp_server.utils.oc import run_oc_command
from openshift_mcp_server.utils.prometheus import OCError, query_prometheus

logger = logging.getLogger("openshift-mcp-server")


async def inspect_gpu_pod(namespace: str, pod_name: str) -> str:
"""
Run 'nvidia-smi' inside a GPU-enabled pod to view real-time process and memory details.
Why:
- Debug OOM: See exact memory usage per process.
- Verify allocation: Confirm the pod actually sees the GPU.
- Check processes: Identify zombie processes or unexpected workloads.
Args:
namespace: Pod namespace
pod_name: Pod name
Returns:
Output of nvidia-smi from inside the pod.
"""
try:
        # `oc exec` only works against running pods, so check the phase first.
        status_cmd = ["get", "pod", pod_name, "-n", namespace, "-o", "jsonpath={.status.phase}"]
        phase = await run_oc_command(status_cmd)
        if phase.strip() != "Running":
            return f"❌ Pod {pod_name} is not in Running phase (current: {phase.strip() or 'unknown'}). Cannot exec."

# Run nvidia-smi
cmd = ["exec", "-n", namespace, pod_name, "--", "nvidia-smi"]
output = await run_oc_command(cmd)
return f"### GPU Status for `{namespace}/{pod_name}`\n\n```\n{output}\n```"
    except OCError as e:
        if "executable file not found" in str(e):
            return f"❌ `nvidia-smi` not found in pod {pod_name}. The container image likely lacks the NVIDIA drivers or CUDA toolkit binaries."
        logger.warning("nvidia-smi failed in %s/%s: %s", namespace, pod_name, e)
        return f"❌ Error running nvidia-smi: {e}"
async def check_gpu_health() -> str:
"""
Check for GPU hardware errors (XID) and throttling events across the cluster.
Why:
- Detect Hardware Failures: XID errors often indicate physical GPU faults.
- Explain Performance Issues: Thermal or Power throttling explains why a model is slow.
Returns:
Markdown report of GPU health issues.
"""
    queries = {
        "xid_errors": "DCGM_FI_DEV_XID_ERRORS",
        "temp": "DCGM_FI_DEV_GPU_TEMP",
        "throttle": "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS",
    }
try:
results = {}
for name, query in queries.items():
try:
                # An instant query is enough here: these are gauges that the
                # DCGM exporter refreshes on every scrape.
                res = await query_prometheus(query)
if res.get("status") == "success":
results[name] = res.get("data", {}).get("result", [])
            except OCError:
                logger.warning("Prometheus query failed for %s", query)
                results[name] = []
output = ["### GPU Health & Diagnostics"]
# 1. Check for XID Errors (Critical)
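        # XID errors are NVIDIA's asynchronous GPU error reports; the DCGM
        # gauge holds the most recent code, so any non-zero value is suspect.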
xid_errors = []
for r in results.get("xid_errors", []):
val = float(r.get("value", [0, "0"])[1])
if val > 0:
metric = r.get("metric", {})
node = metric.get("node", metric.get("instance", "unknown"))
gpu = metric.get("gpu", metric.get("GPU", "?"))
xid_errors.append(f"- 🚨 **Node {node} GPU {gpu}**: XID Error Code {int(val)}")
if xid_errors:
output.append("#### ❌ Hardware Errors Detected")
output.extend(xid_errors)
output.append(" - **Action**: Contact infrastructure team. This usually indicates hardware failure.")
else:
output.append("#### ✅ No Hardware (XID) Errors")
# 2. Check Throttling
throttled = []
for r in results.get("throttle", []):
val = int(float(r.get("value", [0, "0"])[1]))
if val > 0:
metric = r.get("metric", {})
node = metric.get("node", metric.get("instance", "unknown"))
gpu = metric.get("gpu", metric.get("GPU", "?"))
                # Decode the NVML clocksThrottleReasons bitmask that DCGM exposes.
                # Benign bits (0x01 GPU Idle, 0x02 Applications Clocks, 0x04 SW
                # Power Cap, 0x10 Sync Boost, 0x100 Display Clocks) are expected
                # during normal operation, so only slowdown bits are reported.
                critical_reasons = []
                if val & 0x08:
                    critical_reasons.append("HW Slowdown")
                if val & 0x20:
                    critical_reasons.append("🔥 SW Thermal Slowdown")
                if val & 0x40:
                    critical_reasons.append("🔥 HW Thermal Slowdown")
                if val & 0x80:
                    critical_reasons.append("HW Power Brake Slowdown")
                if critical_reasons:
                    throttled.append(f"- ⚠️ **Node {node} GPU {gpu}**: {', '.join(critical_reasons)}")
if throttled:
output.append("\n#### 🔥 Thermal/Power Throttling Events")
output.extend(throttled)
else:
output.append("\n#### ✅ No Critical Throttling")
# 3. High Temperature Check (> 80C)
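        # 80°C is a heuristic early-warning threshold; most datacenter GPUs
        # begin hardware slowdown somewhat above this, so sustained readings
        # here deserve attention even though they are not yet critical.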
hot_gpus = []
for r in results.get("temp", []):
val = float(r.get("value", [0, "0"])[1])
if val > 80:
metric = r.get("metric", {})
node = metric.get("node", metric.get("instance", "unknown"))
gpu = metric.get("gpu", metric.get("GPU", "?"))
hot_gpus.append(f"- **Node {node} GPU {gpu}**: {val:.1f}°C")
if hot_gpus:
output.append("\n#### 🌡️ High Temperatures (>80°C)")
output.extend(hot_gpus)
return "\n".join(output)
except OCError as e:
return f"Error checking GPU health: {e}"