Medical GraphRAG Assistant

health_checks.py•25.3 KiB

""" Health check functions for AWS GPU NIM RAG system validation. Provides structured health checks for all system components: - GPU availability and utilization - Docker GPU runtime - IRIS database connectivity and schema - NIM LLM service health and inference All functions return structured results with pass/fail status and diagnostic messages for troubleshooting. """ import subprocess import json import os import urllib.request import urllib.error from typing import Dict, List, Any, Optional from dataclasses import dataclass, asdict @dataclass class HealthCheckResult: """Structured result from a health check. Attributes: component: Component name being checked status: 'pass' or 'fail' message: Human-readable status message details: Optional additional diagnostic information """ component: str status: str # 'pass' or 'fail' message: str details: Optional[Dict[str, Any]] = None def to_dict(self) -> Dict[str, Any]: """Convert to dictionary representation.""" return asdict(self) def gpu_check() -> HealthCheckResult: """ Check GPU availability and basic functionality. Verifies: - nvidia-smi command is available - GPU device is detected - Driver and CUDA versions Returns: HealthCheckResult with GPU information or error details """ try: # Check if nvidia-smi exists result = subprocess.run( ['nvidia-smi', '--query-gpu=name,driver_version,memory.total', '--format=csv,noheader'], capture_output=True, text=True, timeout=10 ) if result.returncode != 0: return HealthCheckResult( component="GPU", status="fail", message="GPU not accessible", details={"error": result.stderr} ) # Parse GPU information gpu_info = result.stdout.strip().split(', ') if len(gpu_info) < 3: return HealthCheckResult( component="GPU", status="fail", message="Could not parse GPU information", details={"output": result.stdout} ) gpu_name = gpu_info[0] driver_version = gpu_info[1] memory_mb = gpu_info[2].split()[0] # Get CUDA version cuda_result = subprocess.run( ['nvidia-smi'], capture_output=True, text=True, timeout=10 ) cuda_version = "Unknown" for line in cuda_result.stdout.split('\n'): if "CUDA Version" in line: parts = line.split("CUDA Version:") if len(parts) > 1: cuda_version = parts[1].strip().split()[0] break return HealthCheckResult( component="GPU", status="pass", message=f"GPU detected: {gpu_name}", details={ "gpu_name": gpu_name, "driver_version": driver_version, "memory_mb": memory_mb, "cuda_version": cuda_version } ) except FileNotFoundError: return HealthCheckResult( component="GPU", status="fail", message="nvidia-smi not found - GPU drivers may not be installed", details={"suggestion": "Run: ./scripts/aws/install-gpu-drivers.sh"} ) except subprocess.TimeoutExpired: return HealthCheckResult( component="GPU", status="fail", message="nvidia-smi command timed out", details={} ) except Exception as e: return HealthCheckResult( component="GPU", status="fail", message=f"Unexpected error checking GPU: {str(e)}", details={"error_type": type(e).__name__} ) def gpu_utilization_check() -> HealthCheckResult: """ Check GPU utilization and memory usage. Provides real-time GPU metrics useful for monitoring workloads. Returns: HealthCheckResult with utilization metrics """ try: result = subprocess.run( ['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu', '--format=csv,noheader,nounits'], capture_output=True, text=True, timeout=10 ) if result.returncode != 0: return HealthCheckResult( component="GPU Utilization", status="fail", message="Could not query GPU utilization", details={"error": result.stderr} ) # Parse utilization info util_info = result.stdout.strip().split(', ') if len(util_info) < 4: return HealthCheckResult( component="GPU Utilization", status="fail", message="Could not parse GPU utilization", details={"output": result.stdout} ) gpu_util_pct = float(util_info[0].split('\n')[0]) memory_used_mb = float(util_info[1].split('\n')[0]) memory_total_mb = float(util_info[2].split('\n')[0]) temperature_c = float(util_info[3].split('\n')[0]) memory_util_pct = (memory_used_mb / memory_total_mb) * 100 if memory_total_mb > 0 else 0 return HealthCheckResult( component="GPU Utilization", status="pass", message=f"GPU utilization: {gpu_util_pct}%", details={ "gpu_utilization_pct": gpu_util_pct, "memory_used_mb": memory_used_mb, "memory_total_mb": memory_total_mb, "memory_utilization_pct": round(memory_util_pct, 1), "temperature_c": temperature_c } ) except Exception as e: return HealthCheckResult( component="GPU Utilization", status="fail", message=f"Error checking GPU utilization: {str(e)}", details={"error_type": type(e).__name__} ) def docker_gpu_check() -> HealthCheckResult: """ Check Docker GPU runtime configuration. Verifies: - Docker is installed - Docker can access GPU via --gpus flag - GPU is accessible inside containers Returns: HealthCheckResult with Docker GPU status """ try: # Test GPU access in container result = subprocess.run( ['docker', 'run', '--rm', '--gpus', 'all', 'nvidia/cuda:12.2.0-base-ubuntu22.04', 'nvidia-smi'], capture_output=True, text=True, timeout=60 ) if result.returncode != 0: return HealthCheckResult( component="Docker GPU Runtime", status="fail", message="Docker cannot access GPU", details={ "error": result.stderr, "suggestion": "Run: ./scripts/aws/setup-docker-gpu.sh" } ) return HealthCheckResult( component="Docker GPU Runtime", status="pass", message="Docker can access GPU", details={} ) except FileNotFoundError: return HealthCheckResult( component="Docker GPU Runtime", status="fail", message="Docker not found", details={"suggestion": "Install Docker first"} ) except subprocess.TimeoutExpired: return HealthCheckResult( component="Docker GPU Runtime", status="fail", message="Docker GPU test timed out", details={} ) except Exception as e: return HealthCheckResult( component="Docker GPU Runtime", status="fail", message=f"Error checking Docker GPU: {str(e)}", details={"error_type": type(e).__name__} ) from src.db.connection import DatabaseConnection def iris_connection_check(host: Optional[str] = None, port: Optional[int] = None, namespace: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None) -> HealthCheckResult: """ Check IRIS database connectivity. Args: host: IRIS host address (default: env) port: IRIS SuperServer port (default: env) namespace: IRIS namespace (default: env) username: Database username (default: env) password: Database password (default: env) Returns: HealthCheckResult with connection status """ # Resolve defaults for reporting db_config = DatabaseConnection.get_config() host = host or db_config['hostname'] port = port or db_config['port'] namespace = namespace or db_config['namespace'] try: # Attempt connection using centralized logic conn = DatabaseConnection.get_connection( hostname=host, port=port, namespace=namespace, username=username, password=password ) cursor = conn.cursor() # Test with simple query cursor.execute("SELECT 1") result = cursor.fetchone() conn.close() if result and result[0] == 1: return HealthCheckResult( component="IRIS Connection", status="pass", message=f"Connected to IRIS at {host}:{port}/{namespace}", details={ "host": host, "port": port, "namespace": namespace } ) else: return HealthCheckResult( component="IRIS Connection", status="fail", message="Query test failed", details={"result": str(result)} ) except ImportError: return HealthCheckResult( component="IRIS Connection", status="fail", message="iris Python module not installed", details={"suggestion": "pip install intersystems-irispython"} ) except Exception as e: return HealthCheckResult( component="IRIS Connection", status="fail", message=f"Connection failed: {str(e)}", details={ "error_type": type(e).__name__, "host": host, "port": port, "suggestion": "Check IRIS container is running: docker ps | grep iris" } ) def iris_tables_check(host: str = "localhost", port: int = 1972, namespace: str = "DEMO", username: str = "_SYSTEM", password: str = "SYS") -> HealthCheckResult: """ Check IRIS vector table schema. Verifies: - ClinicalNoteVectors table exists - MedicalImageVectors table exists Args: host: IRIS host address port: IRIS SuperServer port namespace: IRIS namespace username: Database username password: Database password Returns: HealthCheckResult with table existence status """ try: # Use centralized connection logic conn = DatabaseConnection.get_connection( hostname=host, port=port, namespace=namespace, username=username, password=password ) cursor = conn.cursor() # Check ClinicalNoteVectors table cursor.execute(""" SELECT COUNT(*) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA=? AND TABLE_NAME='ClinicalNoteVectors' """, (namespace,)) clinical_exists = cursor.fetchone()[0] > 0 # Check MedicalImageVectors table cursor.execute(""" SELECT COUNT(*) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA=? AND TABLE_NAME='MedicalImageVectors' """, (namespace,)) image_exists = cursor.fetchone()[0] > 0 conn.close() tables_found = [] tables_missing = [] if clinical_exists: tables_found.append("ClinicalNoteVectors") else: tables_missing.append("ClinicalNoteVectors") if image_exists: tables_found.append("MedicalImageVectors") else: tables_missing.append("MedicalImageVectors") if len(tables_found) == 2: return HealthCheckResult( component="IRIS Tables", status="pass", message="All vector tables exist", details={ "tables_found": tables_found, "namespace": namespace } ) elif len(tables_found) > 0: return HealthCheckResult( component="IRIS Tables", status="fail", message="Some vector tables missing", details={ "tables_found": tables_found, "tables_missing": tables_missing, "suggestion": "Run schema creation scripts" } ) else: return HealthCheckResult( component="IRIS Tables", status="fail", message="No vector tables found", details={ "tables_missing": tables_missing, "suggestion": "Run: python src/setup/create_text_vector_table.py" } ) except ImportError: return HealthCheckResult( component="IRIS Tables", status="fail", message="iris Python module not installed", details={"suggestion": "pip install intersystems-iris"} ) except Exception as e: return HealthCheckResult( component="IRIS Tables", status="fail", message=f"Table check failed: {str(e)}", details={"error_type": type(e).__name__} ) def nim_llm_health_check(host: str = "localhost", port: int = 8001) -> HealthCheckResult: """ Check NIM LLM service health endpoint. """ scheme = "https" if port == 443 else "http" base_url = f"{scheme}://{host}" if port in (80, 443) else f"{scheme}://{host}:{port}" endpoints = [ f"{base_url}/v1/models", f"{base_url}/v1/health/ready", f"{base_url}/health/ready", f"{base_url}/health" ] last_error = None for url in endpoints: try: req = urllib.request.Request(url) with urllib.request.urlopen(req, timeout=5) as response: if response.status == 200: return HealthCheckResult( component="NIM LLM Health", status="pass", message=f"NIM LLM service healthy at {host}:{port}", details={"endpoint": url} ) except Exception as e: last_error = e continue return HealthCheckResult( component="NIM LLM Health", status="fail", message="Health endpoint not accessible", details={ "error": str(last_error), "endpoints_checked": endpoints, "suggestion": "NIM may still be initializing - check: docker logs nim-llm" } ) def nim_llm_inference_test(host: str = "localhost", port: int = 8001) -> HealthCheckResult: """ Test NIM LLM inference with a simple query. """ scheme = "https" if port == 443 else "http" base_url = f"{scheme}://{host}" if port in (80, 443) else f"{scheme}://{host}:{port}" url = f"{base_url}/v1/chat/completions" try: # Use a model that exists on both local and Hosted API model_name = "meta/llama-3.1-8b-instruct" payload = { "model": model_name, "messages": [ {"role": "user", "content": "What is 2+2? Answer with just the number."} ], "max_tokens": 10 } api_key = os.getenv("NVIDIA_API_KEY") headers = {'Content-Type': 'application/json'} if api_key: headers['Authorization'] = f'Bearer {api_key}' data = json.dumps(payload).encode('utf-8') req = urllib.request.Request(url, data=data, headers=headers) with urllib.request.urlopen(req, timeout=60) as response: if response.status == 200: response_data = json.loads(response.read().decode('utf-8')) if 'choices' in response_data and len(response_data['choices']) > 0: answer = response_data['choices'][0]['message']['content'] return HealthCheckResult( component="NIM LLM Inference", status="pass", message="Inference test successful", details={ "test_query": "What is 2+2?", "response": answer.strip(), "endpoint": url } ) else: return HealthCheckResult( component="NIM LLM Inference", status="fail", message="Response missing expected fields", details={"response": response_data} ) else: return HealthCheckResult( component="NIM LLM Inference", status="fail", message=f"Inference endpoint returned status {response.status}", details={"status_code": response.status} ) except urllib.error.URLError as e: return HealthCheckResult( component="NIM LLM Inference", status="fail", message="Inference endpoint not accessible", details={ "error": str(e), "endpoint": url, "suggestion": "Model may still be loading - check: docker logs nim-llm" } ) except json.JSONDecodeError as e: return HealthCheckResult( component="NIM LLM Inference", status="fail", message="Could not parse response JSON", details={"error": str(e)} ) except Exception as e: return HealthCheckResult( component="NIM LLM Inference", status="fail", message=f"Inference test failed: {str(e)}", details={"error_type": type(e).__name__} ) def iris_schema_check(host: Optional[str] = None, port: Optional[int] = None, namespace: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None) -> HealthCheckResult: """ Comprehensive check of IRIS database schema for all required tables. Verifies existence of: - SQLUser.FHIRDocuments - SQLUser.Entities - SQLUser.EntityRelationships - VectorSearch.MIMICCXRImages Returns: HealthCheckResult with schema validation status """ try: # Use centralized connection logic conn = DatabaseConnection.get_connection( hostname=host, port=port, namespace=namespace, username=username, password=password ) cursor = conn.cursor() required_tables = [ ("SQLUser", "FHIRDocuments"), ("RAG", "Entities"), ("RAG", "EntityRelationships"), ("VectorSearch", "MIMICCXRImages") ] found = [] missing = [] for schema, table in required_tables: cursor.execute(""" SELECT COUNT(*) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA=? AND TABLE_NAME=? """, (schema, table)) if cursor.fetchone()[0] > 0: found.append(f"{schema}.{table}") else: missing.append(f"{schema}.{table}") conn.close() if not missing: return HealthCheckResult( component="IRIS Schema", status="pass", message="All required tables exist", details={"tables": found} ) else: return HealthCheckResult( component="IRIS Schema", status="fail", message=f"Missing tables: {', '.join(missing)}", details={ "found": found, "missing": missing, "suggestion": "Run setup scripts or recreate missing tables" } ) except Exception as e: return HealthCheckResult( component="IRIS Schema", status="fail", message=f"Schema check failed: {str(e)}" ) def fhir_auth_check(url: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None) -> HealthCheckResult: """ Check FHIR server authentication and availability. Args: url: FHIR base URL username: Auth username password: Auth password Returns: HealthCheckResult with auth status """ import os import base64 import requests fhir_url = url or os.getenv("FHIR_BASE_URL", "http://localhost:32783/csp/healthshare/demo/fhir/r4") user = username or os.getenv("FHIR_USERNAME", "_SYSTEM") pw = password or os.getenv("FHIR_PASSWORD", "SYS") try: auth = base64.b64encode(f"{user}:{pw}".encode()).decode() headers = { "Accept": "application/fhir+json", "Authorization": f"Basic {auth}" } # Test with metadata endpoint response = requests.get(f"{fhir_url}/metadata", headers=headers, timeout=10) if response.status_code == 200: return HealthCheckResult( component="FHIR Auth", status="pass", message=f"Authenticated successfully with {fhir_url}", details={"status_code": 200} ) elif response.status_code == 401: return HealthCheckResult( component="FHIR Auth", status="fail", message="401 Unauthorized - FHIR credentials invalid", details={ "status_code": 401, "suggestion": "Run 'python -m src.cli reset-security' to fix IRIS security configuration." } ) else: return HealthCheckResult( component="FHIR Auth", status="fail", message=f"Unexpected status code {response.status_code}", details={"status_code": response.status_code} ) except Exception as e: return HealthCheckResult( component="FHIR Auth", status="fail", message=f"FHIR connectivity check failed: {str(e)}", details={"error_type": type(e).__name__} ) def run_all_checks(iris_host: Optional[str] = None, iris_port: Optional[int] = None, nim_host: str = "localhost", nim_port: int = 8001, skip_gpu: bool = False, skip_docker: bool = False, skip_iris: bool = False, skip_nim: bool = False) -> List[HealthCheckResult]: """ Run all health checks and return results. Args: iris_host: IRIS database host (default: env or localhost) iris_port: IRIS database port (default: env or 1972) nim_host: NIM service host nim_port: NIM service port skip_gpu: Skip GPU checks skip_docker: Skip Docker checks skip_iris: Skip IRIS checks skip_nim: Skip NIM checks Returns: List of HealthCheckResult objects """ # Load defaults from DatabaseConnection if not provided db_config = DatabaseConnection.get_config() if iris_host is None: iris_host = db_config['hostname'] if iris_port is None: iris_port = db_config['port'] results = [] if not skip_gpu: results.append(gpu_check()) results.append(gpu_utilization_check()) if not skip_docker: results.append(docker_gpu_check()) if not skip_iris: results.append(iris_connection_check(iris_host, iris_port)) results.append(iris_schema_check(iris_host, iris_port)) results.append(fhir_auth_check()) if not skip_nim: results.append(nim_llm_health_check(nim_host, nim_port)) results.append(nim_llm_inference_test(nim_host, nim_port)) return results if __name__ == "__main__": """Run all health checks when executed as script.""" print("Running health checks...") print() results = run_all_checks() passed = sum(1 for r in results if r.status == "pass") failed = sum(1 for r in results if r.status == "fail") for result in results: status_symbol = "✓" if result.status == "pass" else "✗" print(f"{status_symbol} {result.component}: {result.message}") if result.details and result.status == "fail": if "suggestion" in result.details: print(f" Suggestion: {result.details['suggestion']}") if "error" in result.details: print(f" Error: {result.details['error']}") print() print(f"Results: {passed} passed, {failed} failed") # Exit with error code if any check failed exit(0 if failed == 0 else 1)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/isc-tdyar/medical-graphrag-assistant'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

health_checks.py•25.3 KiB