Medical GraphRAG Assistant

health_checks.py•19.9 KiB

"""
Health check functions for AWS GPU NIM RAG system validation.

Provides structured health checks for all system components:
- GPU availability and utilization
- Docker GPU runtime
- IRIS database connectivity and schema
- NIM LLM service health and inference

All functions return structured results with pass/fail status
and diagnostic messages for troubleshooting.
"""

import subprocess
import json
import urllib.request
import urllib.error
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict


@dataclass
class HealthCheckResult:
    """Structured result from a health check.

    Attributes:
        component: Component name being checked
        status: 'pass' or 'fail'
        message: Human-readable status message
        details: Optional additional diagnostic information
    """
    component: str
    status: str  # 'pass' or 'fail'
    message: str
    details: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation."""
        return asdict(self)


def gpu_check() -> HealthCheckResult:
    """
    Check GPU availability and basic functionality.

    Verifies:
    - nvidia-smi command is available
    - GPU device is detected
    - Driver and CUDA versions

    Returns:
        HealthCheckResult with GPU information or error details
    """
    try:
        # Check if nvidia-smi exists
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=name,driver_version,memory.total', '--format=csv,noheader'],
            capture_output=True,
            text=True,
            timeout=10
        )

        if result.returncode != 0:
            return HealthCheckResult(
                component="GPU",
                status="fail",
                message="GPU not accessible",
                details={"error": result.stderr}
            )

        # Parse GPU information
        gpu_info = result.stdout.strip().split(', ')

        if len(gpu_info) < 3:
            return HealthCheckResult(
                component="GPU",
                status="fail",
                message="Could not parse GPU information",
                details={"output": result.stdout}
            )

        gpu_name = gpu_info[0]
        driver_version = gpu_info[1]
        memory_mb = gpu_info[2].split()[0]

        # Get CUDA version
        cuda_result = subprocess.run(
            ['nvidia-smi'],
            capture_output=True,
            text=True,
            timeout=10
        )

        cuda_version = "Unknown"
        for line in cuda_result.stdout.split('\n'):
            if "CUDA Version" in line:
                parts = line.split("CUDA Version:")
                if len(parts) > 1:
                    cuda_version = parts[1].strip().split()[0]
                break

        return HealthCheckResult(
            component="GPU",
            status="pass",
            message=f"GPU detected: {gpu_name}",
            details={
                "gpu_name": gpu_name,
                "driver_version": driver_version,
                "memory_mb": memory_mb,
                "cuda_version": cuda_version
            }
        )

    except FileNotFoundError:
        return HealthCheckResult(
            component="GPU",
            status="fail",
            message="nvidia-smi not found - GPU drivers may not be installed",
            details={"suggestion": "Run: ./scripts/aws/install-gpu-drivers.sh"}
        )
    except subprocess.TimeoutExpired:
        return HealthCheckResult(
            component="GPU",
            status="fail",
            message="nvidia-smi command timed out",
            details={}
        )
    except Exception as e:
        return HealthCheckResult(
            component="GPU",
            status="fail",
            message=f"Unexpected error checking GPU: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def gpu_utilization_check() -> HealthCheckResult:
    """
    Check GPU utilization and memory usage.

    Provides real-time GPU metrics useful for monitoring workloads.

    Returns:
        HealthCheckResult with utilization metrics
    """
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu',
             '--format=csv,noheader,nounits'],
            capture_output=True,
            text=True,
            timeout=10
        )

        if result.returncode != 0:
            return HealthCheckResult(
                component="GPU Utilization",
                status="fail",
                message="Could not query GPU utilization",
                details={"error": result.stderr}
            )

        # Parse utilization info
        util_info = result.stdout.strip().split(', ')

        if len(util_info) < 4:
            return HealthCheckResult(
                component="GPU Utilization",
                status="fail",
                message="Could not parse GPU utilization",
                details={"output": result.stdout}
            )

        gpu_util_pct = float(util_info[0])
        memory_used_mb = float(util_info[1])
        memory_total_mb = float(util_info[2])
        temperature_c = float(util_info[3])

        memory_util_pct = (memory_used_mb / memory_total_mb) * 100 if memory_total_mb > 0 else 0

        return HealthCheckResult(
            component="GPU Utilization",
            status="pass",
            message=f"GPU utilization: {gpu_util_pct}%",
            details={
                "gpu_utilization_pct": gpu_util_pct,
                "memory_used_mb": memory_used_mb,
                "memory_total_mb": memory_total_mb,
                "memory_utilization_pct": round(memory_util_pct, 1),
                "temperature_c": temperature_c
            }
        )

    except Exception as e:
        return HealthCheckResult(
            component="GPU Utilization",
            status="fail",
            message=f"Error checking GPU utilization: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def docker_gpu_check() -> HealthCheckResult:
    """
    Check Docker GPU runtime configuration.

    Verifies:
    - Docker is installed
    - Docker can access GPU via --gpus flag
    - GPU is accessible inside containers

    Returns:
        HealthCheckResult with Docker GPU status
    """
    try:
        # Test GPU access in container
        result = subprocess.run(
            ['docker', 'run', '--rm', '--gpus', 'all',
             'nvidia/cuda:12.2.0-base-ubuntu22.04', 'nvidia-smi'],
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.returncode != 0:
            return HealthCheckResult(
                component="Docker GPU Runtime",
                status="fail",
                message="Docker cannot access GPU",
                details={
                    "error": result.stderr,
                    "suggestion": "Run: ./scripts/aws/setup-docker-gpu.sh"
                }
            )

        return HealthCheckResult(
            component="Docker GPU Runtime",
            status="pass",
            message="Docker can access GPU",
            details={}
        )

    except FileNotFoundError:
        return HealthCheckResult(
            component="Docker GPU Runtime",
            status="fail",
            message="Docker not found",
            details={"suggestion": "Install Docker first"}
        )
    except subprocess.TimeoutExpired:
        return HealthCheckResult(
            component="Docker GPU Runtime",
            status="fail",
            message="Docker GPU test timed out",
            details={}
        )
    except Exception as e:
        return HealthCheckResult(
            component="Docker GPU Runtime",
            status="fail",
            message=f"Error checking Docker GPU: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def iris_connection_check(host: str = "localhost", port: int = 1972,
                          namespace: str = "DEMO", username: str = "_SYSTEM",
                          password: str = "SYS") -> HealthCheckResult:
    """
    Check IRIS database connectivity.

    Args:
        host: IRIS host address
        port: IRIS SuperServer port
        namespace: IRIS namespace
        username: Database username
        password: Database password

    Returns:
        HealthCheckResult with connection status
    """
    try:
        import iris

        # Attempt connection
        conn = iris.connect(host, port, namespace, username, password)
        cursor = conn.cursor()

        # Test with simple query
        cursor.execute("SELECT 1")
        result = cursor.fetchone()

        conn.close()

        if result and result[0] == 1:
            return HealthCheckResult(
                component="IRIS Connection",
                status="pass",
                message=f"Connected to IRIS at {host}:{port}/{namespace}",
                details={
                    "host": host,
                    "port": port,
                    "namespace": namespace
                }
            )
        else:
            return HealthCheckResult(
                component="IRIS Connection",
                status="fail",
                message="Query test failed",
                details={"result": str(result)}
            )

    except ImportError:
        return HealthCheckResult(
            component="IRIS Connection",
            status="fail",
            message="iris Python module not installed",
            details={"suggestion": "pip install intersystems-iris"}
        )
    except Exception as e:
        return HealthCheckResult(
            component="IRIS Connection",
            status="fail",
            message=f"Connection failed: {str(e)}",
            details={
                "error_type": type(e).__name__,
                "host": host,
                "port": port,
                "suggestion": "Check IRIS container is running: docker ps | grep iris"
            }
        )


def iris_tables_check(host: str = "localhost", port: int = 1972,
                      namespace: str = "DEMO", username: str = "_SYSTEM",
                      password: str = "SYS") -> HealthCheckResult:
    """
    Check IRIS vector table schema.

    Verifies:
    - ClinicalNoteVectors table exists
    - MedicalImageVectors table exists

    Args:
        host: IRIS host address
        port: IRIS SuperServer port
        namespace: IRIS namespace
        username: Database username
        password: Database password

    Returns:
        HealthCheckResult with table existence status
    """
    try:
        import iris

        conn = iris.connect(host, port, namespace, username, password)
        cursor = conn.cursor()

        # Check ClinicalNoteVectors table
        cursor.execute("""
            SELECT COUNT(*)
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA=? AND TABLE_NAME='ClinicalNoteVectors'
        """, (namespace,))
        clinical_exists = cursor.fetchone()[0] > 0

        # Check MedicalImageVectors table
        cursor.execute("""
            SELECT COUNT(*)
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA=? AND TABLE_NAME='MedicalImageVectors'
        """, (namespace,))
        image_exists = cursor.fetchone()[0] > 0

        conn.close()

        tables_found = []
        tables_missing = []

        if clinical_exists:
            tables_found.append("ClinicalNoteVectors")
        else:
            tables_missing.append("ClinicalNoteVectors")

        if image_exists:
            tables_found.append("MedicalImageVectors")
        else:
            tables_missing.append("MedicalImageVectors")

        if len(tables_found) == 2:
            return HealthCheckResult(
                component="IRIS Tables",
                status="pass",
                message="All vector tables exist",
                details={
                    "tables_found": tables_found,
                    "namespace": namespace
                }
            )
        elif len(tables_found) > 0:
            return HealthCheckResult(
                component="IRIS Tables",
                status="fail",
                message="Some vector tables missing",
                details={
                    "tables_found": tables_found,
                    "tables_missing": tables_missing,
                    "suggestion": "Run schema creation scripts"
                }
            )
        else:
            return HealthCheckResult(
                component="IRIS Tables",
                status="fail",
                message="No vector tables found",
                details={
                    "tables_missing": tables_missing,
                    "suggestion": "Run: python src/setup/create_text_vector_table.py"
                }
            )

    except ImportError:
        return HealthCheckResult(
            component="IRIS Tables",
            status="fail",
            message="iris Python module not installed",
            details={"suggestion": "pip install intersystems-iris"}
        )
    except Exception as e:
        return HealthCheckResult(
            component="IRIS Tables",
            status="fail",
            message=f"Table check failed: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def nim_llm_health_check(host: str = "localhost", port: int = 8001) -> HealthCheckResult:
    """
    Check NIM LLM service health endpoint.

    Args:
        host: NIM service host
        port: NIM service port

    Returns:
        HealthCheckResult with service health status
    """
    try:
        url = f"http://{host}:{port}/health"

        req = urllib.request.Request(url)

        with urllib.request.urlopen(req, timeout=10) as response:
            if response.status == 200:
                return HealthCheckResult(
                    component="NIM LLM Health",
                    status="pass",
                    message=f"NIM LLM service healthy at {host}:{port}",
                    details={"endpoint": url}
                )
            else:
                return HealthCheckResult(
                    component="NIM LLM Health",
                    status="fail",
                    message=f"Health endpoint returned status {response.status}",
                    details={"status_code": response.status}
                )

    except urllib.error.URLError as e:
        return HealthCheckResult(
            component="NIM LLM Health",
            status="fail",
            message="Health endpoint not accessible",
            details={
                "error": str(e),
                "endpoint": f"http://{host}:{port}/health",
                "suggestion": "NIM may still be initializing - check: docker logs nim-llm"
            }
        )
    except Exception as e:
        return HealthCheckResult(
            component="NIM LLM Health",
            status="fail",
            message=f"Error checking health: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def nim_llm_inference_test(host: str = "localhost", port: int = 8001) -> HealthCheckResult:
    """
    Test NIM LLM inference with a simple query.

    Args:
        host: NIM service host
        port: NIM service port

    Returns:
        HealthCheckResult with inference test status and response
    """
    try:
        url = f"http://{host}:{port}/v1/chat/completions"

        payload = {
            "model": "meta/llama-3.1-8b-instruct",
            "messages": [
                {"role": "user", "content": "What is 2+2? Answer with just the number."}
            ],
            "max_tokens": 10
        }

        data = json.dumps(payload).encode('utf-8')
        req = urllib.request.Request(
            url,
            data=data,
            headers={'Content-Type': 'application/json'}
        )

        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status == 200:
                response_data = json.loads(response.read().decode('utf-8'))

                if 'choices' in response_data and len(response_data['choices']) > 0:
                    answer = response_data['choices'][0]['message']['content']

                    return HealthCheckResult(
                        component="NIM LLM Inference",
                        status="pass",
                        message="Inference test successful",
                        details={
                            "test_query": "What is 2+2?",
                            "response": answer.strip(),
                            "endpoint": url
                        }
                    )
                else:
                    return HealthCheckResult(
                        component="NIM LLM Inference",
                        status="fail",
                        message="Response missing expected fields",
                        details={"response": response_data}
                    )
            else:
                return HealthCheckResult(
                    component="NIM LLM Inference",
                    status="fail",
                    message=f"Inference endpoint returned status {response.status}",
                    details={"status_code": response.status}
                )

    except urllib.error.URLError as e:
        return HealthCheckResult(
            component="NIM LLM Inference",
            status="fail",
            message="Inference endpoint not accessible",
            details={
                "error": str(e),
                "endpoint": url,
                "suggestion": "Model may still be loading - check: docker logs nim-llm"
            }
        )
    except json.JSONDecodeError as e:
        return HealthCheckResult(
            component="NIM LLM Inference",
            status="fail",
            message="Could not parse response JSON",
            details={"error": str(e)}
        )
    except Exception as e:
        return HealthCheckResult(
            component="NIM LLM Inference",
            status="fail",
            message=f"Inference test failed: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def run_all_checks(iris_host: str = "localhost", iris_port: int = 1972,
                  nim_host: str = "localhost", nim_port: int = 8001,
                  skip_gpu: bool = False, skip_docker: bool = False,
                  skip_iris: bool = False, skip_nim: bool = False) -> List[HealthCheckResult]:
    """
    Run all health checks and return results.

    Args:
        iris_host: IRIS database host
        iris_port: IRIS database port
        nim_host: NIM service host
        nim_port: NIM service port
        skip_gpu: Skip GPU checks
        skip_docker: Skip Docker checks
        skip_iris: Skip IRIS checks
        skip_nim: Skip NIM checks

    Returns:
        List of HealthCheckResult objects
    """
    results = []

    if not skip_gpu:
        results.append(gpu_check())
        results.append(gpu_utilization_check())

    if not skip_docker:
        results.append(docker_gpu_check())

    if not skip_iris:
        results.append(iris_connection_check(iris_host, iris_port))
        results.append(iris_tables_check(iris_host, iris_port))

    if not skip_nim:
        results.append(nim_llm_health_check(nim_host, nim_port))
        results.append(nim_llm_inference_test(nim_host, nim_port))

    return results


if __name__ == "__main__":
    """Run all health checks when executed as script."""
    print("Running health checks...")
    print()

    results = run_all_checks()

    passed = sum(1 for r in results if r.status == "pass")
    failed = sum(1 for r in results if r.status == "fail")

    for result in results:
        status_symbol = "✓" if result.status == "pass" else "✗"
        print(f"{status_symbol} {result.component}: {result.message}")

        if result.details and result.status == "fail":
            if "suggestion" in result.details:
                print(f"   Suggestion: {result.details['suggestion']}")
            if "error" in result.details:
                print(f"   Error: {result.details['error']}")
        print()

    print(f"Results: {passed} passed, {failed} failed")

    # Exit with error code if any check failed
    exit(0 if failed == 0 else 1)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/isc-tdyar/medical-graphrag-assistant'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

health_checks.py•19.9 KiB

"""
Health check functions for AWS GPU NIM RAG system validation.

Provides structured health checks for all system components:
- GPU availability and utilization
- Docker GPU runtime
- IRIS database connectivity and schema
- NIM LLM service health and inference

All functions return structured results with pass/fail status
and diagnostic messages for troubleshooting.
"""

import subprocess
import json
import urllib.request
import urllib.error
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict


@dataclass
class HealthCheckResult:
    """Structured result from a health check.

    Attributes:
        component: Component name being checked
        status: 'pass' or 'fail'
        message: Human-readable status message
        details: Optional additional diagnostic information
    """
    component: str
    status: str  # 'pass' or 'fail'
    message: str
    details: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation."""
        return asdict(self)


def gpu_check() -> HealthCheckResult:
    """
    Check GPU availability and basic functionality.

    Verifies:
    - nvidia-smi command is available
    - GPU device is detected
    - Driver and CUDA versions

    Returns:
        HealthCheckResult with GPU information or error details
    """
    try:
        # Check if nvidia-smi exists
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=name,driver_version,memory.total', '--format=csv,noheader'],
            capture_output=True,
            text=True,
            timeout=10
        )

        if result.returncode != 0:
            return HealthCheckResult(
                component="GPU",
                status="fail",
                message="GPU not accessible",
                details={"error": result.stderr}
            )

        # Parse GPU information
        gpu_info = result.stdout.strip().split(', ')

        if len(gpu_info) < 3:
            return HealthCheckResult(
                component="GPU",
                status="fail",
                message="Could not parse GPU information",
                details={"output": result.stdout}
            )

        gpu_name = gpu_info[0]
        driver_version = gpu_info[1]
        memory_mb = gpu_info[2].split()[0]

        # Get CUDA version
        cuda_result = subprocess.run(
            ['nvidia-smi'],
            capture_output=True,
            text=True,
            timeout=10
        )

        cuda_version = "Unknown"
        for line in cuda_result.stdout.split('\n'):
            if "CUDA Version" in line:
                parts = line.split("CUDA Version:")
                if len(parts) > 1:
                    cuda_version = parts[1].strip().split()[0]
                break

        return HealthCheckResult(
            component="GPU",
            status="pass",
            message=f"GPU detected: {gpu_name}",
            details={
                "gpu_name": gpu_name,
                "driver_version": driver_version,
                "memory_mb": memory_mb,
                "cuda_version": cuda_version
            }
        )

    except FileNotFoundError:
        return HealthCheckResult(
            component="GPU",
            status="fail",
            message="nvidia-smi not found - GPU drivers may not be installed",
            details={"suggestion": "Run: ./scripts/aws/install-gpu-drivers.sh"}
        )
    except subprocess.TimeoutExpired:
        return HealthCheckResult(
            component="GPU",
            status="fail",
            message="nvidia-smi command timed out",
            details={}
        )
    except Exception as e:
        return HealthCheckResult(
            component="GPU",
            status="fail",
            message=f"Unexpected error checking GPU: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def gpu_utilization_check() -> HealthCheckResult:
    """
    Check GPU utilization and memory usage.

    Provides real-time GPU metrics useful for monitoring workloads.

    Returns:
        HealthCheckResult with utilization metrics
    """
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu',
             '--format=csv,noheader,nounits'],
            capture_output=True,
            text=True,
            timeout=10
        )

        if result.returncode != 0:
            return HealthCheckResult(
                component="GPU Utilization",
                status="fail",
                message="Could not query GPU utilization",
                details={"error": result.stderr}
            )

        # Parse utilization info
        util_info = result.stdout.strip().split(', ')

        if len(util_info) < 4:
            return HealthCheckResult(
                component="GPU Utilization",
                status="fail",
                message="Could not parse GPU utilization",
                details={"output": result.stdout}
            )

        gpu_util_pct = float(util_info[0])
        memory_used_mb = float(util_info[1])
        memory_total_mb = float(util_info[2])
        temperature_c = float(util_info[3])

        memory_util_pct = (memory_used_mb / memory_total_mb) * 100 if memory_total_mb > 0 else 0

        return HealthCheckResult(
            component="GPU Utilization",
            status="pass",
            message=f"GPU utilization: {gpu_util_pct}%",
            details={
                "gpu_utilization_pct": gpu_util_pct,
                "memory_used_mb": memory_used_mb,
                "memory_total_mb": memory_total_mb,
                "memory_utilization_pct": round(memory_util_pct, 1),
                "temperature_c": temperature_c
            }
        )

    except Exception as e:
        return HealthCheckResult(
            component="GPU Utilization",
            status="fail",
            message=f"Error checking GPU utilization: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def docker_gpu_check() -> HealthCheckResult:
    """
    Check Docker GPU runtime configuration.

    Verifies:
    - Docker is installed
    - Docker can access GPU via --gpus flag
    - GPU is accessible inside containers

    Returns:
        HealthCheckResult with Docker GPU status
    """
    try:
        # Test GPU access in container
        result = subprocess.run(
            ['docker', 'run', '--rm', '--gpus', 'all',
             'nvidia/cuda:12.2.0-base-ubuntu22.04', 'nvidia-smi'],
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.returncode != 0:
            return HealthCheckResult(
                component="Docker GPU Runtime",
                status="fail",
                message="Docker cannot access GPU",
                details={
                    "error": result.stderr,
                    "suggestion": "Run: ./scripts/aws/setup-docker-gpu.sh"
                }
            )

        return HealthCheckResult(
            component="Docker GPU Runtime",
            status="pass",
            message="Docker can access GPU",
            details={}
        )

    except FileNotFoundError:
        return HealthCheckResult(
            component="Docker GPU Runtime",
            status="fail",
            message="Docker not found",
            details={"suggestion": "Install Docker first"}
        )
    except subprocess.TimeoutExpired:
        return HealthCheckResult(
            component="Docker GPU Runtime",
            status="fail",
            message="Docker GPU test timed out",
            details={}
        )
    except Exception as e:
        return HealthCheckResult(
            component="Docker GPU Runtime",
            status="fail",
            message=f"Error checking Docker GPU: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def iris_connection_check(host: str = "localhost", port: int = 1972,
                          namespace: str = "DEMO", username: str = "_SYSTEM",
                          password: str = "SYS") -> HealthCheckResult:
    """
    Check IRIS database connectivity.

    Args:
        host: IRIS host address
        port: IRIS SuperServer port
        namespace: IRIS namespace
        username: Database username
        password: Database password

    Returns:
        HealthCheckResult with connection status
    """
    try:
        import iris

        # Attempt connection
        conn = iris.connect(host, port, namespace, username, password)
        cursor = conn.cursor()

        # Test with simple query
        cursor.execute("SELECT 1")
        result = cursor.fetchone()

        conn.close()

        if result and result[0] == 1:
            return HealthCheckResult(
                component="IRIS Connection",
                status="pass",
                message=f"Connected to IRIS at {host}:{port}/{namespace}",
                details={
                    "host": host,
                    "port": port,
                    "namespace": namespace
                }
            )
        else:
            return HealthCheckResult(
                component="IRIS Connection",
                status="fail",
                message="Query test failed",
                details={"result": str(result)}
            )

    except ImportError:
        return HealthCheckResult(
            component="IRIS Connection",
            status="fail",
            message="iris Python module not installed",
            details={"suggestion": "pip install intersystems-iris"}
        )
    except Exception as e:
        return HealthCheckResult(
            component="IRIS Connection",
            status="fail",
            message=f"Connection failed: {str(e)}",
            details={
                "error_type": type(e).__name__,
                "host": host,
                "port": port,
                "suggestion": "Check IRIS container is running: docker ps | grep iris"
            }
        )


def iris_tables_check(host: str = "localhost", port: int = 1972,
                      namespace: str = "DEMO", username: str = "_SYSTEM",
                      password: str = "SYS") -> HealthCheckResult:
    """
    Check IRIS vector table schema.

    Verifies:
    - ClinicalNoteVectors table exists
    - MedicalImageVectors table exists

    Args:
        host: IRIS host address
        port: IRIS SuperServer port
        namespace: IRIS namespace
        username: Database username
        password: Database password

    Returns:
        HealthCheckResult with table existence status
    """
    try:
        import iris

        conn = iris.connect(host, port, namespace, username, password)
        cursor = conn.cursor()

        # Check ClinicalNoteVectors table
        cursor.execute("""
            SELECT COUNT(*)
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA=? AND TABLE_NAME='ClinicalNoteVectors'
        """, (namespace,))
        clinical_exists = cursor.fetchone()[0] > 0

        # Check MedicalImageVectors table
        cursor.execute("""
            SELECT COUNT(*)
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA=? AND TABLE_NAME='MedicalImageVectors'
        """, (namespace,))
        image_exists = cursor.fetchone()[0] > 0

        conn.close()

        tables_found = []
        tables_missing = []

        if clinical_exists:
            tables_found.append("ClinicalNoteVectors")
        else:
            tables_missing.append("ClinicalNoteVectors")

        if image_exists:
            tables_found.append("MedicalImageVectors")
        else:
            tables_missing.append("MedicalImageVectors")

        if len(tables_found) == 2:
            return HealthCheckResult(
                component="IRIS Tables",
                status="pass",
                message="All vector tables exist",
                details={
                    "tables_found": tables_found,
                    "namespace": namespace
                }
            )
        elif len(tables_found) > 0:
            return HealthCheckResult(
                component="IRIS Tables",
                status="fail",
                message="Some vector tables missing",
                details={
                    "tables_found": tables_found,
                    "tables_missing": tables_missing,
                    "suggestion": "Run schema creation scripts"
                }
            )
        else:
            return HealthCheckResult(
                component="IRIS Tables",
                status="fail",
                message="No vector tables found",
                details={
                    "tables_missing": tables_missing,
                    "suggestion": "Run: python src/setup/create_text_vector_table.py"
                }
            )

    except ImportError:
        return HealthCheckResult(
            component="IRIS Tables",
            status="fail",
            message="iris Python module not installed",
            details={"suggestion": "pip install intersystems-iris"}
        )
    except Exception as e:
        return HealthCheckResult(
            component="IRIS Tables",
            status="fail",
            message=f"Table check failed: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def nim_llm_health_check(host: str = "localhost", port: int = 8001) -> HealthCheckResult:
    """
    Check NIM LLM service health endpoint.

    Args:
        host: NIM service host
        port: NIM service port

    Returns:
        HealthCheckResult with service health status
    """
    try:
        url = f"http://{host}:{port}/health"

        req = urllib.request.Request(url)

        with urllib.request.urlopen(req, timeout=10) as response:
            if response.status == 200:
                return HealthCheckResult(
                    component="NIM LLM Health",
                    status="pass",
                    message=f"NIM LLM service healthy at {host}:{port}",
                    details={"endpoint": url}
                )
            else:
                return HealthCheckResult(
                    component="NIM LLM Health",
                    status="fail",
                    message=f"Health endpoint returned status {response.status}",
                    details={"status_code": response.status}
                )

    except urllib.error.URLError as e:
        return HealthCheckResult(
            component="NIM LLM Health",
            status="fail",
            message="Health endpoint not accessible",
            details={
                "error": str(e),
                "endpoint": f"http://{host}:{port}/health",
                "suggestion": "NIM may still be initializing - check: docker logs nim-llm"
            }
        )
    except Exception as e:
        return HealthCheckResult(
            component="NIM LLM Health",
            status="fail",
            message=f"Error checking health: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def nim_llm_inference_test(host: str = "localhost", port: int = 8001) -> HealthCheckResult:
    """
    Test NIM LLM inference with a simple query.

    Args:
        host: NIM service host
        port: NIM service port

    Returns:
        HealthCheckResult with inference test status and response
    """
    try:
        url = f"http://{host}:{port}/v1/chat/completions"

        payload = {
            "model": "meta/llama-3.1-8b-instruct",
            "messages": [
                {"role": "user", "content": "What is 2+2? Answer with just the number."}
            ],
            "max_tokens": 10
        }

        data = json.dumps(payload).encode('utf-8')
        req = urllib.request.Request(
            url,
            data=data,
            headers={'Content-Type': 'application/json'}
        )

        with urllib.request.urlopen(req, timeout=30) as response:
            if response.status == 200:
                response_data = json.loads(response.read().decode('utf-8'))

                if 'choices' in response_data and len(response_data['choices']) > 0:
                    answer = response_data['choices'][0]['message']['content']

                    return HealthCheckResult(
                        component="NIM LLM Inference",
                        status="pass",
                        message="Inference test successful",
                        details={
                            "test_query": "What is 2+2?",
                            "response": answer.strip(),
                            "endpoint": url
                        }
                    )
                else:
                    return HealthCheckResult(
                        component="NIM LLM Inference",
                        status="fail",
                        message="Response missing expected fields",
                        details={"response": response_data}
                    )
            else:
                return HealthCheckResult(
                    component="NIM LLM Inference",
                    status="fail",
                    message=f"Inference endpoint returned status {response.status}",
                    details={"status_code": response.status}
                )

    except urllib.error.URLError as e:
        return HealthCheckResult(
            component="NIM LLM Inference",
            status="fail",
            message="Inference endpoint not accessible",
            details={
                "error": str(e),
                "endpoint": url,
                "suggestion": "Model may still be loading - check: docker logs nim-llm"
            }
        )
    except json.JSONDecodeError as e:
        return HealthCheckResult(
            component="NIM LLM Inference",
            status="fail",
            message="Could not parse response JSON",
            details={"error": str(e)}
        )
    except Exception as e:
        return HealthCheckResult(
            component="NIM LLM Inference",
            status="fail",
            message=f"Inference test failed: {str(e)}",
            details={"error_type": type(e).__name__}
        )


def run_all_checks(iris_host: str = "localhost", iris_port: int = 1972,
                  nim_host: str = "localhost", nim_port: int = 8001,
                  skip_gpu: bool = False, skip_docker: bool = False,
                  skip_iris: bool = False, skip_nim: bool = False) -> List[HealthCheckResult]:
    """
    Run all health checks and return results.

    Args:
        iris_host: IRIS database host
        iris_port: IRIS database port
        nim_host: NIM service host
        nim_port: NIM service port
        skip_gpu: Skip GPU checks
        skip_docker: Skip Docker checks
        skip_iris: Skip IRIS checks
        skip_nim: Skip NIM checks

    Returns:
        List of HealthCheckResult objects
    """
    results = []

    if not skip_gpu:
        results.append(gpu_check())
        results.append(gpu_utilization_check())

    if not skip_docker:
        results.append(docker_gpu_check())

    if not skip_iris:
        results.append(iris_connection_check(iris_host, iris_port))
        results.append(iris_tables_check(iris_host, iris_port))

    if not skip_nim:
        results.append(nim_llm_health_check(nim_host, nim_port))
        results.append(nim_llm_inference_test(nim_host, nim_port))

    return results


if __name__ == "__main__":
    """Run all health checks when executed as script."""
    print("Running health checks...")
    print()

    results = run_all_checks()

    passed = sum(1 for r in results if r.status == "pass")
    failed = sum(1 for r in results if r.status == "fail")

    for result in results:
        status_symbol = "✓" if result.status == "pass" else "✗"
        print(f"{status_symbol} {result.component}: {result.message}")

        if result.details and result.status == "fail":
            if "suggestion" in result.details:
                print(f"   Suggestion: {result.details['suggestion']}")
            if "error" in result.details:
                print(f"   Error: {result.details['error']}")
        print()

    print(f"Results: {passed} passed, {failed} failed")

    # Exit with error code if any check failed
    exit(0 if failed == 0 else 1)