# error_handling.py
"""
Error handling and health monitoring utilities.
Follows Clean Code principles:
- Single Responsibility: Each class handles one aspect of error management
- Open/Closed: Easy to extend with new error types and health checks
- Dependency Inversion: Health checks depend on abstractions
"""
import logging
import time
import traceback
from datetime import datetime
from enum import Enum
from typing import Dict, List, Optional, Any, Callable

from pydantic import BaseModel
# Module-level logger named after this module; ErrorHandler and HealthMonitor
# below emit all of their log records through it.
logger = logging.getLogger(__name__)
class ErrorSeverity(str, Enum):
    """Severity ladder attached to every reported error.

    Members mix in ``str``, so each one compares equal to its lowercase
    value. ``ErrorHandler.handle_error`` uses the severity to pick the log
    level of the emitted record.
    """

    # Listed from least to most urgent.
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
class ErrorType(str, Enum):
    """Coarse error categories used to bucket and count failures.

    Members mix in ``str``, so each one compares equal to its lowercase
    value. ``ErrorHandler`` keeps one counter per member.
    """

    VALIDATION = "validation"
    STORAGE = "storage"
    NETWORK = "network"
    AUTHENTICATION = "authentication"
    RATE_LIMIT = "rate_limit"
    INTERNAL = "internal"
class ErrorDetail(BaseModel):
    """Structured error information for logging and debugging."""

    # Category and urgency assigned by the caller.
    error_type: ErrorType
    severity: ErrorSeverity
    # Human-readable text; ``from_exception`` fills it with ``str(exc)``.
    message: str
    # Free-form context supplied by the caller.
    details: Optional[Dict[str, Any]] = None
    # Creation time; ``from_exception`` uses a naive UTC ``datetime.utcnow()``.
    timestamp: datetime
    # Formatted traceback text, when one is available.
    stack_trace: Optional[str] = None

    @classmethod
    def from_exception(
        cls,
        exc: Exception,
        error_type: ErrorType = ErrorType.INTERNAL,
        severity: ErrorSeverity = ErrorSeverity.MEDIUM,
        details: Optional[Dict[str, Any]] = None
    ) -> "ErrorDetail":
        """Create an ErrorDetail from a Python exception.

        Args:
            exc: The exception to describe.
            error_type: Category to record; defaults to ``INTERNAL``.
            severity: Severity to record; defaults to ``MEDIUM``.
            details: Optional context; a falsy value is stored as ``{}``.

        Returns:
            A new ErrorDetail whose ``stack_trace`` is rendered from ``exc``.
        """
        # Render the traceback from ``exc`` itself. ``traceback.format_exc()``
        # formats whatever exception is *currently being handled*, which is
        # "NoneType: None" outside an ``except`` block and the wrong exception
        # when this is called from a handler for a different error.
        stack_trace = "".join(
            traceback.format_exception(type(exc), exc, exc.__traceback__)
        )
        return cls(
            error_type=error_type,
            severity=severity,
            message=str(exc),
            details=details or {},
            # NOTE: kept as naive UTC so existing comparisons against other
            # naive timestamps keep working.
            timestamp=datetime.utcnow(),
            stack_trace=stack_trace,
        )
class HealthStatus(str, Enum):
    """Health verdict for a single check or for the system as a whole.

    Members mix in ``str``, so each one compares equal to its lowercase
    value.
    """

    # Listed from best to worst.
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
class HealthCheckResult(BaseModel):
    """Outcome reported by one health check."""

    # Identifier of the check that produced this result.
    name: str
    # Verdict of the check.
    status: HealthStatus
    # Short human-readable explanation of the verdict.
    message: str
    # Duration of the check in milliseconds; HealthMonitor fills this in
    # after running the check.
    response_time_ms: Optional[float] = None
    # Optional check-specific payload.
    details: Optional[Dict[str, Any]] = None
class SystemHealth(BaseModel):
    """Aggregated health report covering every registered check."""

    # Combined verdict across all checks.
    status: HealthStatus
    # Moment the report was assembled.
    timestamp: datetime
    # Individual results, in the order the checks were run.
    checks: List[HealthCheckResult]
    # Seconds elapsed since the monitor was created.
    uptime_seconds: float
    version: str = "1.0.0"

    def is_healthy(self) -> bool:
        """Return True only when the combined verdict is HEALTHY."""
        return HealthStatus.HEALTHY == self.status
class ErrorHandler:
    """
    Centralized error handling with structured logging.
    Follows SRP: Handles only error processing and logging.

    Keeps one in-memory counter per ``ErrorType``; counters start at zero
    and live for the lifetime of the instance.
    """

    def __init__(self, service_name: str = "codebuddy-mcp"):
        """Create a handler with zeroed counters.

        Args:
            service_name: Label stored on the instance as ``service_name``.
        """
        self.service_name = service_name
        self.error_counts: Dict[ErrorType, int] = {error_type: 0 for error_type in ErrorType}

    def handle_error(
        self,
        exc: Exception,
        error_type: ErrorType = ErrorType.INTERNAL,
        severity: ErrorSeverity = ErrorSeverity.MEDIUM,
        context: Optional[Dict[str, Any]] = None
    ) -> ErrorDetail:
        """
        Handle exception with structured logging and metrics.

        Args:
            exc: The exception being reported.
            error_type: Category used for the counter and the log prefix.
            severity: Drives the log level: HIGH/CRITICAL -> error (with
                traceback), MEDIUM -> warning, anything else -> info.
            context: Optional extra data; appended to the log line and
                stored as the ErrorDetail's ``details``.

        Returns:
            ErrorDetail for further processing if needed.
        """
        error_detail = ErrorDetail.from_exception(
            exc, error_type, severity, context
        )

        # Update metrics
        self.error_counts[error_type] += 1

        # Log based on severity
        log_message = f"[{error_type.value.upper()}] {error_detail.message}"
        if context:
            log_message += f" | Context: {context}"

        if severity in (ErrorSeverity.CRITICAL, ErrorSeverity.HIGH):
            # Pass the exception instance rather than ``True``: ``True`` makes
            # logging read sys.exc_info(), which is empty (or refers to a
            # different exception) when this method is not called from the
            # ``except`` block that caught ``exc``.
            logger.error(log_message, exc_info=exc)
        elif severity == ErrorSeverity.MEDIUM:
            logger.warning(log_message)
        else:
            logger.info(log_message)

        return error_detail

    def get_error_stats(self) -> Dict[str, int]:
        """Return a shallow copy of the per-type error counters."""
        return dict(self.error_counts)
class HealthMonitor:
    """
    System health monitoring with configurable checks.
    Follows Open/Closed Principle: Easy to add new health checks.

    A check is a zero-argument callable returning either a
    ``HealthCheckResult`` or a truthy/falsy value.
    """

    def __init__(self):
        """Record the creation time (naive UTC) and start with no checks."""
        self.start_time = datetime.utcnow()
        self.health_checks: List[Callable[[], Any]] = []

    def add_health_check(self, check_func: Callable[[], Any]) -> None:
        """Add a new health check function.

        Args:
            check_func: Zero-argument callable; checks run in the order
                they were added.
        """
        self.health_checks.append(check_func)

    def check_system_health(self) -> SystemHealth:
        """
        Perform all health checks and return overall status.
        Aggregates individual check results into system status.

        The overall status is the worst individual status: any UNHEALTHY
        check (or a check that raises) makes the system UNHEALTHY; otherwise
        any DEGRADED check makes it DEGRADED.

        Returns:
            SystemHealth with one result per registered check.
        """
        check_results: List[HealthCheckResult] = []
        overall_status = HealthStatus.HEALTHY

        for check_func in self.health_checks:
            # functools.partial objects and callable instances have no
            # __name__; fall back to repr so reporting cannot itself crash.
            check_name = getattr(check_func, "__name__", repr(check_func))
            try:
                # Monotonic clock: immune to wall-clock adjustments.
                started = time.perf_counter()
                outcome = check_func()
                elapsed_ms = (time.perf_counter() - started) * 1000

                if isinstance(outcome, HealthCheckResult):
                    # Add timing information
                    result = outcome
                    result.response_time_ms = elapsed_ms
                else:
                    # Handle simple boolean returns by wrapping them, so the
                    # status aggregation below always sees a
                    # HealthCheckResult (previously ``.status`` was read off
                    # the raw bool, which raised and recorded a bogus second,
                    # UNHEALTHY result).
                    result = HealthCheckResult(
                        name=check_name,
                        status=HealthStatus.HEALTHY if outcome else HealthStatus.UNHEALTHY,
                        message="OK" if outcome else "Failed",
                        response_time_ms=elapsed_ms,
                    )
            except Exception as e:
                logger.error(f"Health check {check_name} failed: {e}")
                result = HealthCheckResult(
                    name=check_name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Check failed: {str(e)}",
                )

            check_results.append(result)

            # Update overall status (only ever moves towards "worse").
            if result.status == HealthStatus.UNHEALTHY:
                overall_status = HealthStatus.UNHEALTHY
            elif result.status == HealthStatus.DEGRADED and overall_status == HealthStatus.HEALTHY:
                overall_status = HealthStatus.DEGRADED

        uptime = (datetime.utcnow() - self.start_time).total_seconds()
        return SystemHealth(
            status=overall_status,
            timestamp=datetime.utcnow(),
            checks=check_results,
            uptime_seconds=uptime,
        )
# Global instances for application use.
# Both are created at import time with default constructor arguments;
# setup_default_health_checks() registers its checks on `health_monitor`.
error_handler = ErrorHandler()
health_monitor = HealthMonitor()
def setup_default_health_checks(storage_instance) -> None:
    """
    Register the built-in storage and memory checks on the global monitor.

    Args:
        storage_instance: Storage object to monitor (a TaskStorage instance
            per the original documentation); its ``get_storage_stats()``
            method is called on every storage check.
    """

    def check_storage_health() -> HealthCheckResult:
        """Report storage as healthy when its stats can be fetched."""
        try:
            # Fetching the stats doubles as a cheap accessibility probe.
            storage_stats = storage_instance.get_storage_stats()
            task_total = storage_stats['total']
            healthy_result = HealthCheckResult(
                name="storage",
                status=HealthStatus.HEALTHY,
                message=f"Storage accessible with {task_total} tasks",
                details=storage_stats,
            )
        except Exception as e:
            return HealthCheckResult(
                name="storage",
                status=HealthStatus.UNHEALTHY,
                message=f"Storage check failed: {str(e)}",
            )
        return healthy_result

    def check_memory_usage() -> HealthCheckResult:
        """Classify system memory pressure using psutil, when installed."""
        try:
            import psutil

            snapshot = psutil.virtual_memory()
            used_percent = snapshot.percent

            # Above 90% is critical, above 75% is elevated, otherwise normal.
            if used_percent > 90:
                verdict = (HealthStatus.UNHEALTHY, "Memory usage critically high")
            elif used_percent > 75:
                verdict = (HealthStatus.DEGRADED, "Memory usage elevated")
            else:
                verdict = (HealthStatus.HEALTHY, "Memory usage normal")
            status, message = verdict

            available_mb = snapshot.available // (1024 * 1024)
            return HealthCheckResult(
                name="memory",
                status=status,
                message=message,
                details={"usage_percent": used_percent, "available_mb": available_mb},
            )
        except ImportError:
            # Without psutil there is nothing to measure; report healthy
            # rather than failing the whole system on a missing extra.
            return HealthCheckResult(
                name="memory",
                status=HealthStatus.HEALTHY,
                message="Memory monitoring not available",
            )
        except Exception as e:
            return HealthCheckResult(
                name="memory",
                status=HealthStatus.DEGRADED,
                message=f"Memory check failed: {str(e)}",
            )

    # Register health checks (storage first, then memory).
    for probe in (check_storage_health, check_memory_usage):
        health_monitor.add_health_check(probe)