Daemon-MCP

code_exec.py•7.55 KiB

""" Code Execution and Failure Classification for Reflexion Evaluator. Wraps SandboxExecutor to provide: 1. Structured code execution results with failure classification 2. CodeFailureType enum for categorizing sandbox errors 3. Retry logic for fixable errors (SYNTAX_ERROR, IMPORT_ERROR) 4. Graceful fallback for infrastructure errors (TIMEOUT, SANDBOX_ERROR) The classification determines Evaluator behavior: - SUCCESS: Code assertions passed -> positive verification signal - ASSERTION_FAILURE: Code assertions failed -> negative verification signal (claim is wrong) - SYNTAX_ERROR: Generated code has syntax errors -> fixable, may retry - IMPORT_ERROR: Code uses unavailable module -> fixable, may retry without import - TIMEOUT: Code exceeded time limit -> infrastructure issue, fall back to text-only - SANDBOX_ERROR: E2B infrastructure failure -> fall back to text-only Phase 14: Code-Augmented Reflexion """ from __future__ import annotations import logging from dataclasses import dataclass from enum import Enum from typing import Optional from ..agency.sandbox import SandboxExecutor, StructuredExecutionResult logger = logging.getLogger(__name__) class CodeFailureType(str, Enum): """Classification of code execution outcomes. 
Each type triggers different behavior in the Evaluator: - SUCCESS: Positive signal for quality scoring - ASSERTION_FAILURE: Negative signal (claim is factually wrong) - SYNTAX_ERROR: Fixable error, may retry with corrected code - IMPORT_ERROR: Fixable error, may retry without the import - TIMEOUT: Infrastructure issue, fall back to text-only - SANDBOX_ERROR: Infrastructure issue, fall back to text-only """ SUCCESS = "SUCCESS" SYNTAX_ERROR = "SYNTAX_ERROR" IMPORT_ERROR = "IMPORT_ERROR" ASSERTION_FAILURE = "ASSERTION_FAILURE" TIMEOUT = "TIMEOUT" SANDBOX_ERROR = "SANDBOX_ERROR" # Failure types that indicate the CODE is wrong (potentially fixable) FIXABLE_FAILURES = {CodeFailureType.SYNTAX_ERROR, CodeFailureType.IMPORT_ERROR} # Failure types that indicate INFRASTRUCTURE problems (fall back to text-only) INFRASTRUCTURE_FAILURES = {CodeFailureType.TIMEOUT, CodeFailureType.SANDBOX_ERROR} # Failure types that indicate the CLAIM is wrong (verification signal) VERIFICATION_FAILURES = {CodeFailureType.ASSERTION_FAILURE} @dataclass class CodeExecutionResult: """Result of executing verification code with classification. This is the structured result returned to the Evaluator node, containing both the execution output and the failure classification. 
""" failure_type: CodeFailureType output: str = "" error_message: Optional[str] = None error_traceback: Optional[str] = None execution_time_ms: int = 0 assertions_passed: bool = False @property def is_success(self) -> bool: """Whether the code executed successfully with all assertions passing.""" return self.failure_type == CodeFailureType.SUCCESS @property def is_fixable(self) -> bool: """Whether the failure is potentially fixable by regenerating code.""" return self.failure_type in FIXABLE_FAILURES @property def is_infrastructure_failure(self) -> bool: """Whether the failure is an infrastructure issue (should fall back).""" return self.failure_type in INFRASTRUCTURE_FAILURES @property def is_verification_failure(self) -> bool: """Whether the failure indicates the claim is wrong.""" return self.failure_type in VERIFICATION_FAILURES def to_dict(self) -> dict: """Convert to dict for ReflexionState storage.""" return { "failure_type": self.failure_type.value, "output": self.output, "error_message": self.error_message, "assertions_passed": self.assertions_passed, "execution_time_ms": self.execution_time_ms, } def classify_failure(result: StructuredExecutionResult) -> CodeFailureType: """Classify a sandbox execution result into a CodeFailureType. Uses the structured error_name from E2B's ExecutionError to determine the failure category. The error_name contains the Python exception class name (e.g., "SyntaxError", "AssertionError"). 
Args: result: StructuredExecutionResult from SandboxExecutor.execute_structured() Returns: CodeFailureType classification """ if result.success: return CodeFailureType.SUCCESS error_name = (result.error_name or "").lower() error_value = (result.error_value or "").lower() # Syntax errors -- generated code has Python syntax issues if "syntax" in error_name or "syntaxerror" in error_name: return CodeFailureType.SYNTAX_ERROR # Import/module errors -- code uses unavailable packages if "import" in error_name or "module" in error_name or "modulenotfound" in error_name: return CodeFailureType.IMPORT_ERROR # Assertion errors -- claim verification failed (the claim is wrong) if "assertion" in error_name or "assertionerror" in error_name: return CodeFailureType.ASSERTION_FAILURE # Timeout errors -- execution exceeded time limit if "timeout" in error_name or "timedout" in error_name: return CodeFailureType.TIMEOUT # Check error_value as fallback for edge cases if "timeout" in error_value or "timed out" in error_value: return CodeFailureType.TIMEOUT if "syntax" in error_value: return CodeFailureType.SYNTAX_ERROR # Anything else is a sandbox/infrastructure error return CodeFailureType.SANDBOX_ERROR async def execute_verification_code( code: str, executor: SandboxExecutor, ) -> CodeExecutionResult: """Execute verification code in the sandbox and classify the result. This is the main entry point for the Evaluator to run verification code. It handles sandbox availability checks, execution, and classification. 
Args: code: Python verification code to execute executor: SandboxExecutor instance Returns: CodeExecutionResult with classification and details """ # Check sandbox availability first if not executor.available: logger.info("Sandbox unavailable, returning SANDBOX_ERROR for graceful fallback") return CodeExecutionResult( failure_type=CodeFailureType.SANDBOX_ERROR, error_message="Sandbox not available (E2B_API_KEY not set or e2b-code-interpreter not installed)", ) # Execute in sandbox result = await executor.execute_structured(code) # Classify the result failure_type = classify_failure(result) # Check for assertion pass marker in output assertions_passed = ( failure_type == CodeFailureType.SUCCESS and "ALL_ASSERTIONS_PASSED" in result.output ) # Build error message error_message = None if not result.success: error_message = f"{result.error_name}: {result.error_value}" if result.error_name else result.error_value logger.info( f"Verification code execution: {failure_type.value}, " f"assertions_passed={assertions_passed}, " f"time={result.execution_time_ms}ms" ) return CodeExecutionResult( failure_type=failure_type, output=result.output, error_message=error_message, error_traceback=result.error_traceback, execution_time_ms=result.execution_time_ms, assertions_passed=assertions_passed, ) __all__ = [ "CodeFailureType", "CodeExecutionResult", "FIXABLE_FAILURES", "INFRASTRUCTURE_FAILURES", "VERIFICATION_FAILURES", "classify_failure", "execute_verification_code", ]

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/DasBluEyedDevil/Daemon-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

code_exec.py•7.55 KiB