We provide all the information about MCP servers via our MCP API.
curl -X GET 'https://glama.ai/api/mcp/v1/servers/89jobrien/mcp-joecc'
If you have feedback or need assistance with the MCP directory API, please join our Discord server.
"""Reward functions for OpenPipe ART training.
Provides reward signals for training based on:
1. Task completion metrics from todo status transitions
2. RULER-based LLM judging
3. Combined weighted rewards
"""
from __future__ import annotations
from typing import TYPE_CHECKING, Any
from loguru import logger
if TYPE_CHECKING:
import art
def task_completion_reward(
    _trajectory: art.Trajectory,
    completion_signals: dict[str, Any] | None = None,
) -> float:
    """Calculate reward based on task completion signals.

    Uses status transitions and completion metrics to determine
    how well the model performed the task. The result is a weighted
    average over whichever signals are present, so it always stays
    in [0, 1].

    Args:
        _trajectory: The trajectory (reserved for future metadata extraction).
        completion_signals: Optional dict with completion metrics:
            - task_completed: bool - whether task was marked done
            - status_progression: list - status transitions made
            - time_to_completion: float - time in seconds
            - error_count: int - number of errors encountered

    Returns:
        Reward score between 0.0 and 1.0.
    """
    if not completion_signals:
        # No completion signals - use neutral reward
        return 0.5
    reward = 0.0
    weights_sum = 0.0
    # Task completed is most important (half of the total weight)
    if "task_completed" in completion_signals:
        weight = 0.5
        reward += weight * (1.0 if completion_signals["task_completed"] else 0.0)
        weights_sum += weight
    # Proper status progression
    if "status_progression" in completion_signals:
        weight = 0.2
        progression = completion_signals["status_progression"]
        # Reward clean progressions (todo -> in_progress -> done) fully;
        # partial credit for reaching "done" or at least "in_progress".
        expected = ["todo", "in_progress", "done"]
        if progression == expected:
            reward += weight * 1.0
        elif "done" in progression:
            reward += weight * 0.7
        elif "in_progress" in progression:
            reward += weight * 0.3
        weights_sum += weight
    # Penalize errors
    if "error_count" in completion_signals:
        weight = 0.2
        # Clamp at 0 so a (buggy) negative count cannot inflate the reward.
        error_count = max(0, completion_signals["error_count"])
        # No errors = full reward; each error costs 0.2, floored at 0.
        error_penalty = max(0.0, 1.0 - (error_count * 0.2))
        reward += weight * error_penalty
        weights_sum += weight
    # Reward faster completion (if time metric available)
    if "time_to_completion" in completion_signals:
        weight = 0.1
        # Clamp at 0 so clock skew / negative durations cannot push the
        # time reward above 1.0 (the overall score is documented as [0, 1]).
        time_sec = max(0.0, completion_signals["time_to_completion"])
        # Linear decay over a 5-minute (300 s) baseline: instant completion
        # earns the full time reward, >= 300 s earns none.
        time_reward = max(0.0, 1.0 - (time_sec / 300.0))
        reward += weight * time_reward
        weights_sum += weight
    # Normalize by weights used; neutral 0.5 if no recognized signal was given.
    return reward / weights_sum if weights_sum > 0 else 0.5
async def ruler_reward(
    trajectory: art.Trajectory,
    ruler_model: str = "openrouter/openai/gpt-4o-mini",
    _rubric: dict[str, Any] | None = None,
) -> float:
    """Score a trajectory using RULER (LLM-as-judge).

    RULER compares multiple responses and ranks them based on
    how well they accomplish the task.

    Args:
        trajectory: The trajectory to score.
        ruler_model: Model to use for judging.
        _rubric: Optional custom rubric (reserved for future custom scoring).

    Returns:
        Reward score from RULER, or 0.0 if every attempt fails.
    """
    import art
    from art.rewards import ruler_score_group

    # RULER scores trajectory groups, so wrap the single trajectory.
    group = art.TrajectoryGroup(trajectories=[trajectory])
    max_retries = 5
    for attempt in range(1, max_retries + 1):
        try:
            scored = await ruler_score_group(group, ruler_model, debug=False)
        except Exception as exc:
            # Transient judge failures (rate limits, network) — retry.
            logger.warning(f"RULER scoring attempt {attempt} failed: {exc}")
            continue
        # An empty result is treated like a failure and retried.
        if scored and scored.trajectories:
            return scored.trajectories[0].reward
    logger.error("Failed to get RULER score after max retries")
    return 0.0
def combined_reward(
    trajectory: art.Trajectory,
    completion_signals: dict[str, Any] | None = None,
    ruler_score: float | None = None,
    weights: dict[str, float] | None = None,
) -> float:
    """Calculate a weighted combination of reward signals.

    Combines task completion metrics with RULER scoring for
    a comprehensive reward signal. Only components with a positive
    weight (and, for RULER, an available score) contribute; the
    result is normalized by the weight actually used.

    Args:
        trajectory: The trajectory to score.
        completion_signals: Task completion metrics.
        ruler_score: Pre-computed RULER score (if available).
        weights: Custom weights for combining rewards:
            - task_completion: weight for completion reward (default 0.3)
            - ruler: weight for RULER reward (default 0.7)

    Returns:
        Combined reward score between 0.0 and 1.0.
    """
    # Falsy (None or empty) weights fall back to the defaults.
    active_weights = weights or {"task_completion": 0.3, "ruler": 0.7}

    numerator = 0.0
    denominator = 0.0

    # Task completion component.
    tc_weight = active_weights.get("task_completion", 0)
    if tc_weight > 0:
        numerator += tc_weight * task_completion_reward(trajectory, completion_signals)
        denominator += tc_weight

    # RULER component — skipped when no score was computed.
    ruler_weight = active_weights.get("ruler", 0)
    if ruler_weight > 0 and ruler_score is not None:
        numerator += ruler_weight * ruler_score
        denominator += ruler_weight

    # Neutral reward if nothing contributed.
    return numerator / denominator if denominator > 0 else 0.5
def extract_completion_signals_from_todo(
    todo_before: dict[str, Any],
    todo_after: dict[str, Any],
) -> dict[str, Any]:
    """Extract completion signals from todo state changes.

    Analyzes the before/after state of a todo to generate
    completion signals for reward calculation.

    Args:
        todo_before: Todo state before agent action.
        todo_after: Todo state after agent action.

    Returns:
        Dict with completion signals (task_completed, status_progression,
        error_count).
    """
    # Missing statuses default to "todo" for the progression.
    before_status = todo_before.get("status", "todo")
    after_status = todo_after.get("status", "todo")
    return {
        # Task counts as completed only when the final status is "done".
        "task_completed": todo_after.get("status") == "done",
        "status_progression": [before_status, after_status],
        # Any truthy "error" field counts as a single error.
        "error_count": 1 if todo_after.get("error") else 0,
    }