"""
Health check functions for AWS GPU NIM RAG system validation.
Provides structured health checks for all system components:
- GPU availability and utilization
- Docker GPU runtime
- IRIS database connectivity and schema
- NIM LLM service health and inference
All functions return structured results with pass/fail status
and diagnostic messages for troubleshooting.
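Run directly as a script to print all results and exit non-zero if any check fails:
    python health_checks.py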
"""
import subprocess
import json
import urllib.request
import urllib.error
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict
@dataclass
class HealthCheckResult:
"""Structured result from a health check.
Attributes:
component: Component name being checked
status: 'pass' or 'fail'
message: Human-readable status message
details: Optional additional diagnostic information
"""
component: str
status: str # 'pass' or 'fail'
message: str
details: Optional[Dict[str, Any]] = None
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary representation."""
return asdict(self)
def gpu_check() -> HealthCheckResult:
"""
Check GPU availability and basic functionality.
Verifies:
- nvidia-smi command is available
- GPU device is detected
- Driver and CUDA versions
Returns:
HealthCheckResult with GPU information or error details
"""
try:
# Check if nvidia-smi exists
result = subprocess.run(
['nvidia-smi', '--query-gpu=name,driver_version,memory.total', '--format=csv,noheader'],
capture_output=True,
text=True,
timeout=10
)
if result.returncode != 0:
return HealthCheckResult(
component="GPU",
status="fail",
message="GPU not accessible",
details={"error": result.stderr}
)
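        # nvidia-smi emits one CSV line per GPU ("name, driver_version, memory.total");
        # only the first device is reported by this check.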
# Parse GPU information
        gpu_info = result.stdout.strip().split('\n')[0].split(', ')
if len(gpu_info) < 3:
return HealthCheckResult(
component="GPU",
status="fail",
message="Could not parse GPU information",
details={"output": result.stdout}
)
gpu_name = gpu_info[0]
driver_version = gpu_info[1]
memory_mb = gpu_info[2].split()[0]
# Get CUDA version
cuda_result = subprocess.run(
['nvidia-smi'],
capture_output=True,
text=True,
timeout=10
)
cuda_version = "Unknown"
for line in cuda_result.stdout.split('\n'):
if "CUDA Version" in line:
parts = line.split("CUDA Version:")
if len(parts) > 1:
cuda_version = parts[1].strip().split()[0]
break
return HealthCheckResult(
component="GPU",
status="pass",
message=f"GPU detected: {gpu_name}",
details={
"gpu_name": gpu_name,
"driver_version": driver_version,
"memory_mb": memory_mb,
"cuda_version": cuda_version
}
)
except FileNotFoundError:
return HealthCheckResult(
component="GPU",
status="fail",
message="nvidia-smi not found - GPU drivers may not be installed",
details={"suggestion": "Run: ./scripts/aws/install-gpu-drivers.sh"}
)
except subprocess.TimeoutExpired:
return HealthCheckResult(
component="GPU",
status="fail",
message="nvidia-smi command timed out",
details={}
)
except Exception as e:
return HealthCheckResult(
component="GPU",
status="fail",
message=f"Unexpected error checking GPU: {str(e)}",
details={"error_type": type(e).__name__}
)
def gpu_utilization_check() -> HealthCheckResult:
"""
Check GPU utilization and memory usage.
Provides real-time GPU metrics useful for monitoring workloads.
Returns:
HealthCheckResult with utilization metrics
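    Note: this check reports 'pass' whenever metrics can be read; it does not
    enforce any utilization or temperature thresholds.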
"""
try:
result = subprocess.run(
['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu',
'--format=csv,noheader,nounits'],
capture_output=True,
text=True,
timeout=10
)
if result.returncode != 0:
return HealthCheckResult(
component="GPU Utilization",
status="fail",
message="Could not query GPU utilization",
details={"error": result.stderr}
)
# Parse utilization info
        util_info = result.stdout.strip().split('\n')[0].split(', ')  # first GPU only
if len(util_info) < 4:
return HealthCheckResult(
component="GPU Utilization",
status="fail",
message="Could not parse GPU utilization",
details={"output": result.stdout}
)
gpu_util_pct = float(util_info[0])
memory_used_mb = float(util_info[1])
memory_total_mb = float(util_info[2])
temperature_c = float(util_info[3])
memory_util_pct = (memory_used_mb / memory_total_mb) * 100 if memory_total_mb > 0 else 0
return HealthCheckResult(
component="GPU Utilization",
status="pass",
message=f"GPU utilization: {gpu_util_pct}%",
details={
"gpu_utilization_pct": gpu_util_pct,
"memory_used_mb": memory_used_mb,
"memory_total_mb": memory_total_mb,
"memory_utilization_pct": round(memory_util_pct, 1),
"temperature_c": temperature_c
}
)
except Exception as e:
return HealthCheckResult(
component="GPU Utilization",
status="fail",
message=f"Error checking GPU utilization: {str(e)}",
details={"error_type": type(e).__name__}
)
def docker_gpu_check() -> HealthCheckResult:
"""
Check Docker GPU runtime configuration.
Verifies:
- Docker is installed
- Docker can access GPU via --gpus flag
- GPU is accessible inside containers
Returns:
HealthCheckResult with Docker GPU status
"""
try:
# Test GPU access in container
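        # The base CUDA image may need to be pulled on first run, hence the generous timeout.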
result = subprocess.run(
['docker', 'run', '--rm', '--gpus', 'all',
'nvidia/cuda:12.2.0-base-ubuntu22.04', 'nvidia-smi'],
capture_output=True,
text=True,
timeout=60
)
if result.returncode != 0:
return HealthCheckResult(
component="Docker GPU Runtime",
status="fail",
message="Docker cannot access GPU",
details={
"error": result.stderr,
"suggestion": "Run: ./scripts/aws/setup-docker-gpu.sh"
}
)
return HealthCheckResult(
component="Docker GPU Runtime",
status="pass",
message="Docker can access GPU",
details={}
)
except FileNotFoundError:
return HealthCheckResult(
component="Docker GPU Runtime",
status="fail",
message="Docker not found",
details={"suggestion": "Install Docker first"}
)
except subprocess.TimeoutExpired:
return HealthCheckResult(
component="Docker GPU Runtime",
status="fail",
message="Docker GPU test timed out",
details={}
)
except Exception as e:
return HealthCheckResult(
component="Docker GPU Runtime",
status="fail",
message=f"Error checking Docker GPU: {str(e)}",
details={"error_type": type(e).__name__}
)
def iris_connection_check(host: str = "localhost", port: int = 1972,
namespace: str = "DEMO", username: str = "_SYSTEM",
password: str = "SYS") -> HealthCheckResult:
"""
Check IRIS database connectivity.
Args:
host: IRIS host address
port: IRIS SuperServer port
namespace: IRIS namespace
username: Database username
password: Database password
Returns:
HealthCheckResult with connection status
"""
try:
import iris
# Attempt connection
conn = iris.connect(host, port, namespace, username, password)
cursor = conn.cursor()
# Test with simple query
cursor.execute("SELECT 1")
result = cursor.fetchone()
conn.close()
if result and result[0] == 1:
return HealthCheckResult(
component="IRIS Connection",
status="pass",
message=f"Connected to IRIS at {host}:{port}/{namespace}",
details={
"host": host,
"port": port,
"namespace": namespace
}
)
else:
return HealthCheckResult(
component="IRIS Connection",
status="fail",
message="Query test failed",
details={"result": str(result)}
)
except ImportError:
return HealthCheckResult(
component="IRIS Connection",
status="fail",
message="iris Python module not installed",
details={"suggestion": "pip install intersystems-iris"}
)
except Exception as e:
return HealthCheckResult(
component="IRIS Connection",
status="fail",
message=f"Connection failed: {str(e)}",
details={
"error_type": type(e).__name__,
"host": host,
"port": port,
"suggestion": "Check IRIS container is running: docker ps | grep iris"
}
)
def iris_tables_check(host: str = "localhost", port: int = 1972,
namespace: str = "DEMO", username: str = "_SYSTEM",
password: str = "SYS") -> HealthCheckResult:
"""
Check IRIS vector table schema.
Verifies:
- ClinicalNoteVectors table exists
- MedicalImageVectors table exists
Args:
host: IRIS host address
port: IRIS SuperServer port
namespace: IRIS namespace
username: Database username
password: Database password
Returns:
HealthCheckResult with table existence status
"""
try:
import iris
conn = iris.connect(host, port, namespace, username, password)
cursor = conn.cursor()
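        # Tables are looked up under a schema named after the namespace; if they were
        # created in IRIS's default SQLUser schema instead, adjust TABLE_SCHEMA accordingly.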
# Check ClinicalNoteVectors table
cursor.execute("""
SELECT COUNT(*)
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_SCHEMA=? AND TABLE_NAME='ClinicalNoteVectors'
""", (namespace,))
clinical_exists = cursor.fetchone()[0] > 0
# Check MedicalImageVectors table
cursor.execute("""
SELECT COUNT(*)
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_SCHEMA=? AND TABLE_NAME='MedicalImageVectors'
""", (namespace,))
image_exists = cursor.fetchone()[0] > 0
conn.close()
tables_found = []
tables_missing = []
if clinical_exists:
tables_found.append("ClinicalNoteVectors")
else:
tables_missing.append("ClinicalNoteVectors")
if image_exists:
tables_found.append("MedicalImageVectors")
else:
tables_missing.append("MedicalImageVectors")
if len(tables_found) == 2:
return HealthCheckResult(
component="IRIS Tables",
status="pass",
message="All vector tables exist",
details={
"tables_found": tables_found,
"namespace": namespace
}
)
elif len(tables_found) > 0:
return HealthCheckResult(
component="IRIS Tables",
status="fail",
message="Some vector tables missing",
details={
"tables_found": tables_found,
"tables_missing": tables_missing,
"suggestion": "Run schema creation scripts"
}
)
else:
return HealthCheckResult(
component="IRIS Tables",
status="fail",
message="No vector tables found",
details={
"tables_missing": tables_missing,
"suggestion": "Run: python src/setup/create_text_vector_table.py"
}
)
except ImportError:
return HealthCheckResult(
component="IRIS Tables",
status="fail",
message="iris Python module not installed",
details={"suggestion": "pip install intersystems-iris"}
)
except Exception as e:
return HealthCheckResult(
component="IRIS Tables",
status="fail",
message=f"Table check failed: {str(e)}",
details={"error_type": type(e).__name__}
)
def nim_llm_health_check(host: str = "localhost", port: int = 8001) -> HealthCheckResult:
"""
Check NIM LLM service health endpoint.
Args:
host: NIM service host
port: NIM service port
Returns:
HealthCheckResult with service health status
"""
try:
url = f"http://{host}:{port}/health"
req = urllib.request.Request(url)
with urllib.request.urlopen(req, timeout=10) as response:
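            # Non-2xx responses normally raise HTTPError (a URLError subclass),
            # so the explicit status check here is mostly a defensive fallback.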
if response.status == 200:
return HealthCheckResult(
component="NIM LLM Health",
status="pass",
message=f"NIM LLM service healthy at {host}:{port}",
details={"endpoint": url}
)
else:
return HealthCheckResult(
component="NIM LLM Health",
status="fail",
message=f"Health endpoint returned status {response.status}",
details={"status_code": response.status}
)
except urllib.error.URLError as e:
return HealthCheckResult(
component="NIM LLM Health",
status="fail",
message="Health endpoint not accessible",
details={
"error": str(e),
"endpoint": f"http://{host}:{port}/health",
"suggestion": "NIM may still be initializing - check: docker logs nim-llm"
}
)
except Exception as e:
return HealthCheckResult(
component="NIM LLM Health",
status="fail",
message=f"Error checking health: {str(e)}",
details={"error_type": type(e).__name__}
)
def nim_llm_inference_test(host: str = "localhost", port: int = 8001) -> HealthCheckResult:
"""
Test NIM LLM inference with a simple query.
Args:
host: NIM service host
port: NIM service port
Returns:
HealthCheckResult with inference test status and response
"""
try:
url = f"http://{host}:{port}/v1/chat/completions"
payload = {
"model": "meta/llama-3.1-8b-instruct",
"messages": [
{"role": "user", "content": "What is 2+2? Answer with just the number."}
],
"max_tokens": 10
}
data = json.dumps(payload).encode('utf-8')
req = urllib.request.Request(
url,
data=data,
headers={'Content-Type': 'application/json'}
)
with urllib.request.urlopen(req, timeout=30) as response:
if response.status == 200:
response_data = json.loads(response.read().decode('utf-8'))
if 'choices' in response_data and len(response_data['choices']) > 0:
answer = response_data['choices'][0]['message']['content']
return HealthCheckResult(
component="NIM LLM Inference",
status="pass",
message="Inference test successful",
details={
"test_query": "What is 2+2?",
"response": answer.strip(),
"endpoint": url
}
)
else:
return HealthCheckResult(
component="NIM LLM Inference",
status="fail",
message="Response missing expected fields",
details={"response": response_data}
)
else:
return HealthCheckResult(
component="NIM LLM Inference",
status="fail",
message=f"Inference endpoint returned status {response.status}",
details={"status_code": response.status}
)
except urllib.error.URLError as e:
return HealthCheckResult(
component="NIM LLM Inference",
status="fail",
message="Inference endpoint not accessible",
details={
"error": str(e),
"endpoint": url,
"suggestion": "Model may still be loading - check: docker logs nim-llm"
}
)
except json.JSONDecodeError as e:
return HealthCheckResult(
component="NIM LLM Inference",
status="fail",
message="Could not parse response JSON",
details={"error": str(e)}
)
except Exception as e:
return HealthCheckResult(
component="NIM LLM Inference",
status="fail",
message=f"Inference test failed: {str(e)}",
details={"error_type": type(e).__name__}
)
def run_all_checks(iris_host: str = "localhost", iris_port: int = 1972,
nim_host: str = "localhost", nim_port: int = 8001,
skip_gpu: bool = False, skip_docker: bool = False,
skip_iris: bool = False, skip_nim: bool = False) -> List[HealthCheckResult]:
"""
Run all health checks and return results.
Args:
iris_host: IRIS database host
iris_port: IRIS database port
nim_host: NIM service host
nim_port: NIM service port
skip_gpu: Skip GPU checks
skip_docker: Skip Docker checks
skip_iris: Skip IRIS checks
skip_nim: Skip NIM checks
Returns:
List of HealthCheckResult objects
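    Example (minimal programmatic sketch):
        results = run_all_checks(skip_nim=True)
        failures = [r.component for r in results if r.status == "fail"]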
"""
results = []
if not skip_gpu:
results.append(gpu_check())
results.append(gpu_utilization_check())
if not skip_docker:
results.append(docker_gpu_check())
if not skip_iris:
results.append(iris_connection_check(iris_host, iris_port))
results.append(iris_tables_check(iris_host, iris_port))
if not skip_nim:
results.append(nim_llm_health_check(nim_host, nim_port))
results.append(nim_llm_inference_test(nim_host, nim_port))
return results
if __name__ == "__main__":
"""Run all health checks when executed as script."""
print("Running health checks...")
print()
results = run_all_checks()
passed = sum(1 for r in results if r.status == "pass")
failed = sum(1 for r in results if r.status == "fail")
for result in results:
status_symbol = "ā" if result.status == "pass" else "ā"
print(f"{status_symbol} {result.component}: {result.message}")
if result.details and result.status == "fail":
if "suggestion" in result.details:
print(f" Suggestion: {result.details['suggestion']}")
if "error" in result.details:
print(f" Error: {result.details['error']}")
print()
print(f"Results: {passed} passed, {failed} failed")
# Exit with error code if any check failed
    raise SystemExit(0 if failed == 0 else 1)