Claude Code MCP - Agent Orchestration Platform

invariants.py•27.2 KiB

""" System State Invariant Validation This module provides comprehensive system state invariant checking for the Agent Orchestration Platform, ensuring consistent system state across all operations and detecting corruption or inconsistencies. Architecture Integration: - Design Patterns: Observer pattern for state monitoring, Template Method for validation sequences - Security Model: Continuous validation of security invariants with automatic recovery mechanisms - Performance Profile: O(n) validation where n = number of agents, with incremental validation support Technical Decisions: - Invariant Categories: Resource, security, state, communication, and persistence invariants - Validation Triggers: On-demand validation, scheduled validation, and event-driven validation - Recovery Actions: Automatic correction for minor issues, alerts for major inconsistencies - Audit Integration: All invariant violations logged with full context and recovery actions Dependencies & Integration: - External: None beyond standard library for maximum reliability - Internal: All core modules for comprehensive system state access Quality Assurance: - Test Coverage: Property-based testing for invariant validation with edge cases - Error Handling: Graceful handling of validation failures with detailed error reporting Author: Adder_4 | Created: 2025-06-26 | Last Modified: 2025-06-26 """ import asyncio import threading from collections import defaultdict from dataclasses import dataclass, field from datetime import datetime, timedelta from enum import Enum, auto from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set from src.boundaries.audit import AuditCategory, AuditLevel, get_audit_logger from src.models.agent import AgentState, AgentStatus from src.models.ids import AgentId, SessionId from src.models.session import SessionState from src.utils.contracts_shim import ensure, require class InvariantType(Enum): """Types of system invariants.""" RESOURCE = "resource" SECURITY = "security" STATE = "state" COMMUNICATION = "communication" PERSISTENCE = "persistence" PERFORMANCE = "performance" class InvariantSeverity(Enum): """Severity levels for invariant violations.""" INFO = auto() WARNING = auto() ERROR = auto() CRITICAL = auto() @dataclass(frozen=True) class InvariantViolation: """Details of a system invariant violation.""" invariant_name: str invariant_type: InvariantType severity: InvariantSeverity description: str affected_resources: List[str] current_value: Any expected_value: Any timestamp: datetime = field(default_factory=datetime.utcnow) recovery_action: Optional[str] = None def to_dict(self) -> Dict[str, Any]: """Convert violation to dictionary for logging.""" return { "invariant_name": self.invariant_name, "invariant_type": self.invariant_type.value, "severity": self.severity.name, "description": self.description, "affected_resources": self.affected_resources, "current_value": str(self.current_value), "expected_value": str(self.expected_value), "timestamp": self.timestamp.isoformat(), "recovery_action": self.recovery_action, } class InvariantResult(NamedTuple): """Result of invariant validation.""" passed: bool violations: List[InvariantViolation] validation_time: float recovery_actions_taken: List[str] class SystemStateSnapshot: """ Immutable snapshot of system state for invariant validation. Provides consistent view of system state across all validation checks to prevent race conditions and ensure accurate invariant checking. """ def __init__( self, agents: Dict[AgentId, AgentState], sessions: Dict[SessionId, SessionState] ): """Initialize state snapshot.""" self.timestamp = datetime.utcnow() self.agents = dict(agents) # Create immutable copy self.sessions = dict(sessions) # Derived state for efficient validation self.total_agents = len(self.agents) self.agents_by_session = self._group_agents_by_session() self.agents_by_status = self._group_agents_by_status() self.resource_usage = self._calculate_resource_usage() def _group_agents_by_session(self) -> Dict[SessionId, List[AgentState]]: """Group agents by their session.""" groups = defaultdict(list) for agent in self.agents.values(): groups[agent.session_id].append(agent) return dict(groups) def _group_agents_by_status(self) -> Dict[AgentStatus, List[AgentState]]: """Group agents by their status.""" groups = defaultdict(list) for agent in self.agents.values(): groups[agent.status].append(agent) return dict(groups) def _calculate_resource_usage(self) -> Dict[str, Any]: """Calculate total resource usage across all agents.""" total_cpu = sum( agent.resource_metrics.cpu_percent for agent in self.agents.values() ) total_memory = sum( agent.resource_metrics.memory_mb for agent in self.agents.values() ) total_file_descriptors = sum( agent.resource_metrics.file_descriptors for agent in self.agents.values() ) return { "total_cpu_percent": total_cpu, "total_memory_mb": total_memory, "total_file_descriptors": total_file_descriptors, "average_cpu_percent": total_cpu / max(1, self.total_agents), "average_memory_mb": total_memory / max(1, self.total_agents), } class SystemInvariantValidator: """ Comprehensive system invariant validator with automatic recovery. Validates all critical system invariants and provides automatic recovery mechanisms for correctable violations while alerting for critical issues. """ # System limits for invariant validation MAX_TOTAL_AGENTS = 32 MAX_AGENTS_PER_SESSION = 8 MAX_TOTAL_CPU_PERCENT = 200.0 # 200% across all cores MAX_TOTAL_MEMORY_MB = 8192 # 8GB MAX_AGENT_ERROR_COUNT = 10 MAX_AGENT_RESTART_COUNT = 5 MAX_HEARTBEAT_AGE_MINUTES = 5 def __init__(self): """Initialize invariant validator.""" self.validation_history: List[InvariantResult] = [] self.recovery_functions: Dict[str, Callable] = {} self.validation_lock = threading.Lock() self._register_recovery_functions() def _register_recovery_functions(self) -> None: """Register automatic recovery functions for correctable violations.""" self.recovery_functions = { "stale_heartbeat": self._recover_stale_heartbeat, "excessive_errors": self._recover_excessive_errors, "resource_cleanup": self._recover_resource_cleanup, "orphaned_processes": self._recover_orphaned_processes, } @require(lambda snapshot: snapshot is not None) @ensure(lambda result: isinstance(result, InvariantResult)) def validate_all_invariants(self, snapshot: SystemStateSnapshot) -> InvariantResult: """ Validate all system invariants against current state snapshot. Contracts: Preconditions: - State snapshot must be valid and current Postconditions: - All invariants validated - Violations identified and categorized - Recovery actions attempted where applicable Invariants: - Validation is atomic and consistent - No side effects on system state during validation - All violations properly logged and tracked """ start_time = datetime.utcnow() violations = [] recovery_actions = [] try: with self.validation_lock: # Resource invariants violations.extend(self._validate_resource_invariants(snapshot)) # Security invariants violations.extend(self._validate_security_invariants(snapshot)) # State consistency invariants violations.extend(self._validate_state_invariants(snapshot)) # Communication invariants violations.extend(self._validate_communication_invariants(snapshot)) # Performance invariants violations.extend(self._validate_performance_invariants(snapshot)) # Attempt automatic recovery for correctable violations for violation in violations: if ( violation.recovery_action and violation.recovery_action in self.recovery_functions ): try: recovery_func = self.recovery_functions[ violation.recovery_action ] recovery_result = recovery_func(violation, snapshot) if recovery_result: recovery_actions.append( f"Recovered: {violation.invariant_name}" ) except Exception as e: recovery_actions.append( f"Recovery failed for {violation.invariant_name}: {e}" ) # Calculate validation time validation_time = (datetime.utcnow() - start_time).total_seconds() # Create result result = InvariantResult( passed=len(violations) == 0, violations=violations, validation_time=validation_time, recovery_actions_taken=recovery_actions, ) # Store in history self.validation_history.append(result) # Keep only last 100 validations if len(self.validation_history) > 100: self.validation_history = self.validation_history[-100:] return result except Exception as e: # If validation itself fails, create critical violation critical_violation = InvariantViolation( invariant_name="validation_system_failure", invariant_type=InvariantType.STATE, severity=InvariantSeverity.CRITICAL, description=f"System invariant validation failed: {e}", affected_resources=["system"], current_value="failure", expected_value="success", ) return InvariantResult( passed=False, violations=[critical_violation], validation_time=(datetime.utcnow() - start_time).total_seconds(), recovery_actions_taken=[], ) def _validate_resource_invariants( self, snapshot: SystemStateSnapshot ) -> List[InvariantViolation]: """Validate resource-related invariants.""" violations = [] # Total agent count invariant if snapshot.total_agents > self.MAX_TOTAL_AGENTS: violations.append( InvariantViolation( invariant_name="max_total_agents", invariant_type=InvariantType.RESOURCE, severity=InvariantSeverity.CRITICAL, description=f"Total agent count exceeds maximum: {snapshot.total_agents} > {self.MAX_TOTAL_AGENTS}", affected_resources=["system"], current_value=snapshot.total_agents, expected_value=f"<= {self.MAX_TOTAL_AGENTS}", ) ) # Agents per session invariant for session_id, session_agents in snapshot.agents_by_session.items(): if len(session_agents) > self.MAX_AGENTS_PER_SESSION: violations.append( InvariantViolation( invariant_name="max_agents_per_session", invariant_type=InvariantType.RESOURCE, severity=InvariantSeverity.ERROR, description=f"Session {session_id} exceeds max agents: {len(session_agents)} > {self.MAX_AGENTS_PER_SESSION}", affected_resources=[str(session_id)], current_value=len(session_agents), expected_value=f"<= {self.MAX_AGENTS_PER_SESSION}", ) ) # Total CPU usage invariant total_cpu = snapshot.resource_usage["total_cpu_percent"] if total_cpu > self.MAX_TOTAL_CPU_PERCENT: violations.append( InvariantViolation( invariant_name="max_total_cpu", invariant_type=InvariantType.RESOURCE, severity=InvariantSeverity.WARNING, description=f"Total CPU usage exceeds limit: {total_cpu:.1f}% > {self.MAX_TOTAL_CPU_PERCENT}%", affected_resources=["system"], current_value=total_cpu, expected_value=f"<= {self.MAX_TOTAL_CPU_PERCENT}%", recovery_action="resource_cleanup", ) ) # Total memory usage invariant total_memory = snapshot.resource_usage["total_memory_mb"] if total_memory > self.MAX_TOTAL_MEMORY_MB: violations.append( InvariantViolation( invariant_name="max_total_memory", invariant_type=InvariantType.RESOURCE, severity=InvariantSeverity.WARNING, description=f"Total memory usage exceeds limit: {total_memory}MB > {self.MAX_TOTAL_MEMORY_MB}MB", affected_resources=["system"], current_value=total_memory, expected_value=f"<= {self.MAX_TOTAL_MEMORY_MB}MB", recovery_action="resource_cleanup", ) ) return violations def _validate_security_invariants( self, snapshot: SystemStateSnapshot ) -> List[InvariantViolation]: """Validate security-related invariants.""" violations = [] # Agent name uniqueness within sessions for session_id, session_agents in snapshot.agents_by_session.items(): agent_names = [agent.name for agent in session_agents] unique_names = set(agent_names) if len(agent_names) != len(unique_names): violations.append( InvariantViolation( invariant_name="agent_name_uniqueness", invariant_type=InvariantType.SECURITY, severity=InvariantSeverity.CRITICAL, description=f"Duplicate agent names in session {session_id}", affected_resources=[str(session_id)], current_value=f"{len(agent_names)} names, {len(unique_names)} unique", expected_value="All agent names unique within session", ) ) # Agent ID uniqueness globally agent_ids = list(snapshot.agents.keys()) unique_ids = set(agent_ids) if len(agent_ids) != len(unique_ids): violations.append( InvariantViolation( invariant_name="agent_id_uniqueness", invariant_type=InvariantType.SECURITY, severity=InvariantSeverity.CRITICAL, description="Duplicate agent IDs detected", affected_resources=["system"], current_value=f"{len(agent_ids)} IDs, {len(unique_ids)} unique", expected_value="All agent IDs unique globally", ) ) return violations def _validate_state_invariants( self, snapshot: SystemStateSnapshot ) -> List[InvariantViolation]: """Validate state consistency invariants.""" violations = [] # Agent error count invariant for agent in snapshot.agents.values(): if agent.error_count > self.MAX_AGENT_ERROR_COUNT: violations.append( InvariantViolation( invariant_name="max_agent_errors", invariant_type=InvariantType.STATE, severity=InvariantSeverity.WARNING, description=f"Agent {agent.name} has excessive errors: {agent.error_count}", affected_resources=[str(agent.agent_id)], current_value=agent.error_count, expected_value=f"<= {self.MAX_AGENT_ERROR_COUNT}", recovery_action="excessive_errors", ) ) # Agent restart count invariant for agent in snapshot.agents.values(): if agent.restart_count > self.MAX_AGENT_RESTART_COUNT: violations.append( InvariantViolation( invariant_name="max_agent_restarts", invariant_type=InvariantType.STATE, severity=InvariantSeverity.ERROR, description=f"Agent {agent.name} has excessive restarts: {agent.restart_count}", affected_resources=[str(agent.agent_id)], current_value=agent.restart_count, expected_value=f"<= {self.MAX_AGENT_RESTART_COUNT}", ) ) # Agent heartbeat freshness invariant max_heartbeat_age = timedelta(minutes=self.MAX_HEARTBEAT_AGE_MINUTES) current_time = snapshot.timestamp for agent in snapshot.agents.values(): if agent.status in { AgentStatus.ACTIVE, AgentStatus.WORKING, AgentStatus.IDLE, }: heartbeat_age = current_time - agent.last_heartbeat if heartbeat_age > max_heartbeat_age: violations.append( InvariantViolation( invariant_name="stale_agent_heartbeat", invariant_type=InvariantType.STATE, severity=InvariantSeverity.WARNING, description=f"Agent {agent.name} heartbeat is stale: {heartbeat_age}", affected_resources=[str(agent.agent_id)], current_value=str(heartbeat_age), expected_value=f"< {max_heartbeat_age}", recovery_action="stale_heartbeat", ) ) return violations def _validate_communication_invariants( self, snapshot: SystemStateSnapshot ) -> List[InvariantViolation]: """Validate communication-related invariants.""" violations = [] # Agent status coherence working_agents = snapshot.agents_by_status.get(AgentStatus.WORKING, []) if ( len(working_agents) > snapshot.total_agents * 0.8 ): # More than 80% working might indicate deadlock violations.append( InvariantViolation( invariant_name="excessive_working_agents", invariant_type=InvariantType.COMMUNICATION, severity=InvariantSeverity.WARNING, description=f"Too many agents in WORKING state: {len(working_agents)} / {snapshot.total_agents}", affected_resources=["system"], current_value=f"{len(working_agents)} working", expected_value=f"< {int(snapshot.total_agents * 0.8)} working", ) ) return violations def _validate_performance_invariants( self, snapshot: SystemStateSnapshot ) -> List[InvariantViolation]: """Validate performance-related invariants.""" violations = [] # Individual agent resource limits for agent in snapshot.agents.values(): # CPU usage per agent if agent.resource_metrics.cpu_percent > agent.claude_config.max_cpu_percent: violations.append( InvariantViolation( invariant_name="agent_cpu_limit", invariant_type=InvariantType.PERFORMANCE, severity=InvariantSeverity.WARNING, description=f"Agent {agent.name} CPU usage exceeds limit: {agent.resource_metrics.cpu_percent}%", affected_resources=[str(agent.agent_id)], current_value=f"{agent.resource_metrics.cpu_percent}%", expected_value=f"<= {agent.claude_config.max_cpu_percent}%", ) ) # Memory usage per agent if agent.resource_metrics.memory_mb > agent.claude_config.max_memory_mb: violations.append( InvariantViolation( invariant_name="agent_memory_limit", invariant_type=InvariantType.PERFORMANCE, severity=InvariantSeverity.WARNING, description=f"Agent {agent.name} memory usage exceeds limit: {agent.resource_metrics.memory_mb}MB", affected_resources=[str(agent.agent_id)], current_value=f"{agent.resource_metrics.memory_mb}MB", expected_value=f"<= {agent.claude_config.max_memory_mb}MB", ) ) return violations # Recovery functions def _recover_stale_heartbeat( self, violation: InvariantViolation, snapshot: SystemStateSnapshot ) -> bool: """Attempt to recover from stale heartbeat by requesting agent status update.""" # In real implementation, this would trigger a health check return True def _recover_excessive_errors( self, violation: InvariantViolation, snapshot: SystemStateSnapshot ) -> bool: """Attempt to recover from excessive errors by resetting error count.""" # In real implementation, this would reset error counters after investigation return True def _recover_resource_cleanup( self, violation: InvariantViolation, snapshot: SystemStateSnapshot ) -> bool: """Attempt to recover from resource issues by triggering cleanup.""" # In real implementation, this would trigger garbage collection and resource cleanup return True def _recover_orphaned_processes( self, violation: InvariantViolation, snapshot: SystemStateSnapshot ) -> bool: """Attempt to recover from orphaned processes by cleaning them up.""" # In real implementation, this would identify and clean up orphaned processes return True async def log_violations(self, result: InvariantResult) -> None: """Log invariant violations to audit system.""" if not result.violations: return try: audit_logger = get_audit_logger() for violation in result.violations: await audit_logger.log_event( level=( AuditLevel.CRITICAL if violation.severity == InvariantSeverity.CRITICAL else AuditLevel.WARNING ), category=AuditCategory.SYSTEM_HEALTH, operation="invariant_validation", resource_type="system_state", resource_id=violation.invariant_name, success=False, error_message=violation.description, metadata=violation.to_dict(), ) # Log overall validation result await audit_logger.log_event( level=AuditLevel.INFO, category=AuditCategory.SYSTEM_HEALTH, operation="system_validation", resource_type="system", resource_id="global", success=result.passed, metadata={ "total_violations": len(result.violations), "validation_time": result.validation_time, "recovery_actions": result.recovery_actions_taken, }, ) except Exception: # Don't fail invariant validation if audit logging fails pass def get_validation_summary(self) -> Dict[str, Any]: """Get summary of recent validation results.""" if not self.validation_history: return { "status": "no_validations", "message": "No validations performed yet", } recent_results = self.validation_history[-10:] # Last 10 validations total_validations = len(recent_results) passed_validations = sum(1 for result in recent_results if result.passed) total_violations = sum(len(result.violations) for result in recent_results) # Categorize violations by type violation_by_type = defaultdict(int) violation_by_severity = defaultdict(int) for result in recent_results: for violation in result.violations: violation_by_type[violation.invariant_type.value] += 1 violation_by_severity[violation.severity.name] += 1 return { "status": "passed" if recent_results[-1].passed else "failed", "total_validations": total_validations, "passed_validations": passed_validations, "success_rate": ( passed_validations / total_validations if total_validations > 0 else 0 ), "total_violations": total_violations, "violations_by_type": dict(violation_by_type), "violations_by_severity": dict(violation_by_severity), "last_validation": recent_results[-1].validation_time, "average_validation_time": sum(r.validation_time for r in recent_results) / total_validations, } # Global invariant validator instance _invariant_validator_instance: Optional[SystemInvariantValidator] = None def get_invariant_validator() -> SystemInvariantValidator: """Get global system invariant validator instance.""" global _invariant_validator_instance if _invariant_validator_instance is None: _invariant_validator_instance = SystemInvariantValidator() return _invariant_validator_instance # Convenience functions for invariant validation async def validate_system_invariants( agents: Dict[AgentId, AgentState], sessions: Dict[SessionId, SessionState] ) -> InvariantResult: """Validate all system invariants with current state.""" validator = get_invariant_validator() snapshot = SystemStateSnapshot(agents, sessions) result = validator.validate_all_invariants(snapshot) # Log violations if any await validator.log_violations(result) return result def get_system_health_summary() -> Dict[str, Any]: """Get summary of system health based on recent invariant validations.""" validator = get_invariant_validator() return validator.get_validation_summary()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Nexus-Digital-Automations/Claude_Code_MCP_2'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

invariants.py•27.2 KiB