from fastapi import APIRouter, Depends, HTTPException
from typing import Dict, Any
import asyncio
import time
import structlog
from app.core.models.mcp_protocol import HealthStatus, MonitoringMetrics
from app.config import settings
logger = structlog.get_logger()
router = APIRouter()
# Store start time for uptime calculation
start_time = time.time()
@router.get("/", response_model=HealthStatus)
async def health_check():
"""Comprehensive health check endpoint"""
try:
# Check all critical services
services_status = await check_all_services()
# Determine overall health
if all(services_status.values()):
status = "healthy"
elif any(services_status.values()):
status = "degraded"
else:
status = "unhealthy"
uptime = int(time.time() - start_time)
health_status = HealthStatus(
status=status,
services=services_status,
uptime_seconds=uptime,
version=settings.app_version
)
logger.info(
"Health check completed",
status=status,
services=services_status,
uptime=uptime
)
return health_status
except Exception as e:
logger.error("Health check failed", error=str(e))
raise HTTPException(status_code=500, detail="Health check failed")
@router.get("/ready")
async def readiness_check():
"""Readiness probe for Kubernetes"""
try:
# Check if the application is ready to serve traffic
services_ready = await check_readiness()
if all(services_ready.values()):
return {"status": "ready"}
else:
raise HTTPException(status_code=503, detail="Service not ready")
except HTTPException:
raise
except Exception as e:
logger.error("Readiness check failed", error=str(e))
raise HTTPException(status_code=503, detail="Readiness check failed")
@router.get("/live")
async def liveness_check():
"""Liveness probe for Kubernetes"""
try:
# Basic liveness check - just return OK if the process is running
return {"status": "alive", "timestamp": time.time()}
except Exception as e:
logger.error("Liveness check failed", error=str(e))
raise HTTPException(status_code=503, detail="Liveness check failed")
@router.get("/metrics", response_model=MonitoringMetrics)
async def get_metrics():
"""Get application metrics"""
try:
# This would typically collect real metrics from your monitoring system
metrics = MonitoringMetrics(
active_agents=3,
processing_tasks=2,
completed_tasks=156,
failed_tasks=3,
average_response_time=0.234,
requests_per_second=12.5,
error_rate=0.02
)
return metrics
except Exception as e:
logger.error("Failed to collect metrics", error=str(e))
raise HTTPException(status_code=500, detail="Failed to collect metrics")
@router.get("/version")
async def get_version():
"""Get application version information"""
return {
"name": settings.app_name,
"version": settings.app_version,
"build_time": "2024-01-01T00:00:00Z", # This would be set during build
"git_commit": "abc123", # This would be set during build
"python_version": "3.11+",
"environment": "production" if not settings.debug else "development"
}
async def check_all_services() -> Dict[str, bool]:
"""Check all external services"""
services = {}
# Check Redis
try:
# This would be an actual Redis ping
await asyncio.sleep(0.01) # Simulate Redis check
services["redis"] = True
except Exception as e:
logger.warning("Redis health check failed", error=str(e))
services["redis"] = False
# Check Database
try:
# This would be an actual database ping
await asyncio.sleep(0.02) # Simulate DB check
services["database"] = True
except Exception as e:
logger.warning("Database health check failed", error=str(e))
services["database"] = False
# Check External APIs
try:
# This would check external API connectivity
await asyncio.sleep(0.05) # Simulate API check
services["external_apis"] = True
except Exception as e:
logger.warning("External APIs health check failed", error=str(e))
services["external_apis"] = False
# Check LLM Providers
try:
# This would check LLM provider connectivity
await asyncio.sleep(0.03) # Simulate LLM check
services["llm_providers"] = True
except Exception as e:
logger.warning("LLM providers health check failed", error=str(e))
services["llm_providers"] = False
return services
async def check_readiness() -> Dict[str, bool]:
"""Check if application is ready to serve traffic"""
readiness = {}
# Check if critical services are ready
try:
# Simulate readiness checks
await asyncio.sleep(0.01)
readiness["database"] = True
readiness["cache"] = True
readiness["tools_loaded"] = True
except Exception as e:
logger.warning("Readiness check failed", error=str(e))
readiness["database"] = False
readiness["cache"] = False
readiness["tools_loaded"] = False
return readiness