Skip to main content
Glama
health_checks.py•20.4 kB
""" Health check functions for AWS GPU NIM RAG system validation. Provides structured health checks for all system components: - GPU availability and utilization - Docker GPU runtime - IRIS database connectivity and schema - NIM LLM service health and inference All functions return structured results with pass/fail status and diagnostic messages for troubleshooting. """ import subprocess import json import urllib.request import urllib.error from typing import Dict, List, Any, Optional from dataclasses import dataclass, asdict @dataclass class HealthCheckResult: """Structured result from a health check. Attributes: component: Component name being checked status: 'pass' or 'fail' message: Human-readable status message details: Optional additional diagnostic information """ component: str status: str # 'pass' or 'fail' message: str details: Optional[Dict[str, Any]] = None def to_dict(self) -> Dict[str, Any]: """Convert to dictionary representation.""" return asdict(self) def gpu_check() -> HealthCheckResult: """ Check GPU availability and basic functionality. 
Verifies: - nvidia-smi command is available - GPU device is detected - Driver and CUDA versions Returns: HealthCheckResult with GPU information or error details """ try: # Check if nvidia-smi exists result = subprocess.run( ['nvidia-smi', '--query-gpu=name,driver_version,memory.total', '--format=csv,noheader'], capture_output=True, text=True, timeout=10 ) if result.returncode != 0: return HealthCheckResult( component="GPU", status="fail", message="GPU not accessible", details={"error": result.stderr} ) # Parse GPU information gpu_info = result.stdout.strip().split(', ') if len(gpu_info) < 3: return HealthCheckResult( component="GPU", status="fail", message="Could not parse GPU information", details={"output": result.stdout} ) gpu_name = gpu_info[0] driver_version = gpu_info[1] memory_mb = gpu_info[2].split()[0] # Get CUDA version cuda_result = subprocess.run( ['nvidia-smi'], capture_output=True, text=True, timeout=10 ) cuda_version = "Unknown" for line in cuda_result.stdout.split('\n'): if "CUDA Version" in line: parts = line.split("CUDA Version:") if len(parts) > 1: cuda_version = parts[1].strip().split()[0] break return HealthCheckResult( component="GPU", status="pass", message=f"GPU detected: {gpu_name}", details={ "gpu_name": gpu_name, "driver_version": driver_version, "memory_mb": memory_mb, "cuda_version": cuda_version } ) except FileNotFoundError: return HealthCheckResult( component="GPU", status="fail", message="nvidia-smi not found - GPU drivers may not be installed", details={"suggestion": "Run: ./scripts/aws/install-gpu-drivers.sh"} ) except subprocess.TimeoutExpired: return HealthCheckResult( component="GPU", status="fail", message="nvidia-smi command timed out", details={} ) except Exception as e: return HealthCheckResult( component="GPU", status="fail", message=f"Unexpected error checking GPU: {str(e)}", details={"error_type": type(e).__name__} ) def gpu_utilization_check() -> HealthCheckResult: """ Check GPU utilization and memory usage. 
Provides real-time GPU metrics useful for monitoring workloads. Returns: HealthCheckResult with utilization metrics """ try: result = subprocess.run( ['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu', '--format=csv,noheader,nounits'], capture_output=True, text=True, timeout=10 ) if result.returncode != 0: return HealthCheckResult( component="GPU Utilization", status="fail", message="Could not query GPU utilization", details={"error": result.stderr} ) # Parse utilization info util_info = result.stdout.strip().split(', ') if len(util_info) < 4: return HealthCheckResult( component="GPU Utilization", status="fail", message="Could not parse GPU utilization", details={"output": result.stdout} ) gpu_util_pct = float(util_info[0]) memory_used_mb = float(util_info[1]) memory_total_mb = float(util_info[2]) temperature_c = float(util_info[3]) memory_util_pct = (memory_used_mb / memory_total_mb) * 100 if memory_total_mb > 0 else 0 return HealthCheckResult( component="GPU Utilization", status="pass", message=f"GPU utilization: {gpu_util_pct}%", details={ "gpu_utilization_pct": gpu_util_pct, "memory_used_mb": memory_used_mb, "memory_total_mb": memory_total_mb, "memory_utilization_pct": round(memory_util_pct, 1), "temperature_c": temperature_c } ) except Exception as e: return HealthCheckResult( component="GPU Utilization", status="fail", message=f"Error checking GPU utilization: {str(e)}", details={"error_type": type(e).__name__} ) def docker_gpu_check() -> HealthCheckResult: """ Check Docker GPU runtime configuration. 
Verifies: - Docker is installed - Docker can access GPU via --gpus flag - GPU is accessible inside containers Returns: HealthCheckResult with Docker GPU status """ try: # Test GPU access in container result = subprocess.run( ['docker', 'run', '--rm', '--gpus', 'all', 'nvidia/cuda:12.2.0-base-ubuntu22.04', 'nvidia-smi'], capture_output=True, text=True, timeout=60 ) if result.returncode != 0: return HealthCheckResult( component="Docker GPU Runtime", status="fail", message="Docker cannot access GPU", details={ "error": result.stderr, "suggestion": "Run: ./scripts/aws/setup-docker-gpu.sh" } ) return HealthCheckResult( component="Docker GPU Runtime", status="pass", message="Docker can access GPU", details={} ) except FileNotFoundError: return HealthCheckResult( component="Docker GPU Runtime", status="fail", message="Docker not found", details={"suggestion": "Install Docker first"} ) except subprocess.TimeoutExpired: return HealthCheckResult( component="Docker GPU Runtime", status="fail", message="Docker GPU test timed out", details={} ) except Exception as e: return HealthCheckResult( component="Docker GPU Runtime", status="fail", message=f"Error checking Docker GPU: {str(e)}", details={"error_type": type(e).__name__} ) def iris_connection_check(host: str = "localhost", port: int = 1972, namespace: str = "DEMO", username: str = "_SYSTEM", password: str = "SYS") -> HealthCheckResult: """ Check IRIS database connectivity. 
Args: host: IRIS host address port: IRIS SuperServer port namespace: IRIS namespace username: Database username password: Database password Returns: HealthCheckResult with connection status """ try: import iris # Attempt connection conn = iris.connect(host, port, namespace, username, password) cursor = conn.cursor() # Test with simple query cursor.execute("SELECT 1") result = cursor.fetchone() conn.close() if result and result[0] == 1: return HealthCheckResult( component="IRIS Connection", status="pass", message=f"Connected to IRIS at {host}:{port}/{namespace}", details={ "host": host, "port": port, "namespace": namespace } ) else: return HealthCheckResult( component="IRIS Connection", status="fail", message="Query test failed", details={"result": str(result)} ) except ImportError: return HealthCheckResult( component="IRIS Connection", status="fail", message="iris Python module not installed", details={"suggestion": "pip install intersystems-iris"} ) except Exception as e: return HealthCheckResult( component="IRIS Connection", status="fail", message=f"Connection failed: {str(e)}", details={ "error_type": type(e).__name__, "host": host, "port": port, "suggestion": "Check IRIS container is running: docker ps | grep iris" } ) def iris_tables_check(host: str = "localhost", port: int = 1972, namespace: str = "DEMO", username: str = "_SYSTEM", password: str = "SYS") -> HealthCheckResult: """ Check IRIS vector table schema. Verifies: - ClinicalNoteVectors table exists - MedicalImageVectors table exists Args: host: IRIS host address port: IRIS SuperServer port namespace: IRIS namespace username: Database username password: Database password Returns: HealthCheckResult with table existence status """ try: import iris conn = iris.connect(host, port, namespace, username, password) cursor = conn.cursor() # Check ClinicalNoteVectors table cursor.execute(""" SELECT COUNT(*) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA=? 
AND TABLE_NAME='ClinicalNoteVectors' """, (namespace,)) clinical_exists = cursor.fetchone()[0] > 0 # Check MedicalImageVectors table cursor.execute(""" SELECT COUNT(*) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA=? AND TABLE_NAME='MedicalImageVectors' """, (namespace,)) image_exists = cursor.fetchone()[0] > 0 conn.close() tables_found = [] tables_missing = [] if clinical_exists: tables_found.append("ClinicalNoteVectors") else: tables_missing.append("ClinicalNoteVectors") if image_exists: tables_found.append("MedicalImageVectors") else: tables_missing.append("MedicalImageVectors") if len(tables_found) == 2: return HealthCheckResult( component="IRIS Tables", status="pass", message="All vector tables exist", details={ "tables_found": tables_found, "namespace": namespace } ) elif len(tables_found) > 0: return HealthCheckResult( component="IRIS Tables", status="fail", message="Some vector tables missing", details={ "tables_found": tables_found, "tables_missing": tables_missing, "suggestion": "Run schema creation scripts" } ) else: return HealthCheckResult( component="IRIS Tables", status="fail", message="No vector tables found", details={ "tables_missing": tables_missing, "suggestion": "Run: python src/setup/create_text_vector_table.py" } ) except ImportError: return HealthCheckResult( component="IRIS Tables", status="fail", message="iris Python module not installed", details={"suggestion": "pip install intersystems-iris"} ) except Exception as e: return HealthCheckResult( component="IRIS Tables", status="fail", message=f"Table check failed: {str(e)}", details={"error_type": type(e).__name__} ) def nim_llm_health_check(host: str = "localhost", port: int = 8001) -> HealthCheckResult: """ Check NIM LLM service health endpoint. 
Args: host: NIM service host port: NIM service port Returns: HealthCheckResult with service health status """ try: url = f"http://{host}:{port}/health" req = urllib.request.Request(url) with urllib.request.urlopen(req, timeout=10) as response: if response.status == 200: return HealthCheckResult( component="NIM LLM Health", status="pass", message=f"NIM LLM service healthy at {host}:{port}", details={"endpoint": url} ) else: return HealthCheckResult( component="NIM LLM Health", status="fail", message=f"Health endpoint returned status {response.status}", details={"status_code": response.status} ) except urllib.error.URLError as e: return HealthCheckResult( component="NIM LLM Health", status="fail", message="Health endpoint not accessible", details={ "error": str(e), "endpoint": f"http://{host}:{port}/health", "suggestion": "NIM may still be initializing - check: docker logs nim-llm" } ) except Exception as e: return HealthCheckResult( component="NIM LLM Health", status="fail", message=f"Error checking health: {str(e)}", details={"error_type": type(e).__name__} ) def nim_llm_inference_test(host: str = "localhost", port: int = 8001) -> HealthCheckResult: """ Test NIM LLM inference with a simple query. Args: host: NIM service host port: NIM service port Returns: HealthCheckResult with inference test status and response """ try: url = f"http://{host}:{port}/v1/chat/completions" payload = { "model": "meta/llama-3.1-8b-instruct", "messages": [ {"role": "user", "content": "What is 2+2? 
Answer with just the number."} ], "max_tokens": 10 } data = json.dumps(payload).encode('utf-8') req = urllib.request.Request( url, data=data, headers={'Content-Type': 'application/json'} ) with urllib.request.urlopen(req, timeout=30) as response: if response.status == 200: response_data = json.loads(response.read().decode('utf-8')) if 'choices' in response_data and len(response_data['choices']) > 0: answer = response_data['choices'][0]['message']['content'] return HealthCheckResult( component="NIM LLM Inference", status="pass", message="Inference test successful", details={ "test_query": "What is 2+2?", "response": answer.strip(), "endpoint": url } ) else: return HealthCheckResult( component="NIM LLM Inference", status="fail", message="Response missing expected fields", details={"response": response_data} ) else: return HealthCheckResult( component="NIM LLM Inference", status="fail", message=f"Inference endpoint returned status {response.status}", details={"status_code": response.status} ) except urllib.error.URLError as e: return HealthCheckResult( component="NIM LLM Inference", status="fail", message="Inference endpoint not accessible", details={ "error": str(e), "endpoint": url, "suggestion": "Model may still be loading - check: docker logs nim-llm" } ) except json.JSONDecodeError as e: return HealthCheckResult( component="NIM LLM Inference", status="fail", message="Could not parse response JSON", details={"error": str(e)} ) except Exception as e: return HealthCheckResult( component="NIM LLM Inference", status="fail", message=f"Inference test failed: {str(e)}", details={"error_type": type(e).__name__} ) def run_all_checks(iris_host: str = "localhost", iris_port: int = 1972, nim_host: str = "localhost", nim_port: int = 8001, skip_gpu: bool = False, skip_docker: bool = False, skip_iris: bool = False, skip_nim: bool = False) -> List[HealthCheckResult]: """ Run all health checks and return results. 
Args: iris_host: IRIS database host iris_port: IRIS database port nim_host: NIM service host nim_port: NIM service port skip_gpu: Skip GPU checks skip_docker: Skip Docker checks skip_iris: Skip IRIS checks skip_nim: Skip NIM checks Returns: List of HealthCheckResult objects """ results = [] if not skip_gpu: results.append(gpu_check()) results.append(gpu_utilization_check()) if not skip_docker: results.append(docker_gpu_check()) if not skip_iris: results.append(iris_connection_check(iris_host, iris_port)) results.append(iris_tables_check(iris_host, iris_port)) if not skip_nim: results.append(nim_llm_health_check(nim_host, nim_port)) results.append(nim_llm_inference_test(nim_host, nim_port)) return results if __name__ == "__main__": """Run all health checks when executed as script.""" print("Running health checks...") print() results = run_all_checks() passed = sum(1 for r in results if r.status == "pass") failed = sum(1 for r in results if r.status == "fail") for result in results: status_symbol = "āœ“" if result.status == "pass" else "āœ—" print(f"{status_symbol} {result.component}: {result.message}") if result.details and result.status == "fail": if "suggestion" in result.details: print(f" Suggestion: {result.details['suggestion']}") if "error" in result.details: print(f" Error: {result.details['error']}") print() print(f"Results: {passed} passed, {failed} failed") # Exit with error code if any check failed exit(0 if failed == 0 else 1)

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/isc-tdyar/medical-graphrag-assistant'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.