from unittest.mock import MagicMock, patch

import pytest

from src.core.settings import EvaluationSettings, Settings
from src.libs.evaluator.base_evaluator import BaseEvaluator
from src.libs.evaluator.custom_evaluator import CustomEvaluator
from src.libs.evaluator.evaluator_factory import EvaluatorFactory
from src.observability.evaluation.composite_evaluator import CompositeEvaluator


def test_custom_evaluator_metrics():
    """Test Hit Rate and MRR calculation logic."""
    evaluator = CustomEvaluator()
    query = "test"

    # Case 1: Perfect match at rank 1
    # golden: A; retrieved: A, B, C
    metrics = evaluator.evaluate(query, ["A", "B", "C"], ["A"])
    assert metrics["hit_rate"] == 1.0
    assert metrics["mrr"] == 1.0

    # Case 2: Match at rank 2
    # golden: A; retrieved: B, A, C
    metrics = evaluator.evaluate(query, ["B", "A", "C"], ["A"])
    assert metrics["hit_rate"] == 1.0
    assert metrics["mrr"] == 0.5  # 1/2

    # Case 3: No match
    # golden: A; retrieved: B, C, D
    metrics = evaluator.evaluate(query, ["B", "C", "D"], ["A"])
    assert metrics["hit_rate"] == 0.0
    assert metrics["mrr"] == 0.0

    # Case 4: Multiple goldens
    # golden: A, Z; retrieved: B, A, C. First match is A at rank 2.
    metrics = evaluator.evaluate(query, ["B", "A", "C"], ["A", "Z"])
    assert metrics["hit_rate"] == 1.0
    assert metrics["mrr"] == 0.5

    # Case 5: Empty golden list (both metrics should be 0.0)
    metrics = evaluator.evaluate(query, ["A"], [])
    assert metrics["hit_rate"] == 0.0
    assert metrics["mrr"] == 0.0


def test_factory_creates_custom_evaluator():
    """Test factory creates CustomEvaluator when configured."""
    settings = MagicMock(spec=Settings)
    settings.evaluation = MagicMock(spec=EvaluationSettings)
    settings.evaluation.backends = ["custom"]

    evaluator = EvaluatorFactory.create(settings)

    assert isinstance(evaluator, CustomEvaluator)
    assert isinstance(evaluator, BaseEvaluator)


def test_factory_creates_composite_evaluator_mixed():
    """Test factory creates a CompositeEvaluator when both backends are configured."""
    settings = MagicMock(spec=Settings)
    settings.evaluation = MagicMock(spec=EvaluationSettings)
    settings.evaluation.backends = ["ragas", "custom"]
    settings.evaluation.metrics = []

    # Mock RagasEvaluator to avoid an import error during instantiation
    with patch("src.libs.evaluator.evaluator_factory.RagasEvaluator") as MockRagas:
        evaluator = EvaluatorFactory.create(settings)
        assert isinstance(evaluator, CompositeEvaluator)
        assert MockRagas.called


def test_factory_raises_error_on_unsupported():
    """Test factory raises an error when no supported backend is found."""
    settings = MagicMock(spec=Settings)
    settings.evaluation = MagicMock(spec=EvaluationSettings)
    settings.evaluation.backends = ["unknown_backend"]

    with pytest.raises(ValueError, match="No supported evaluator backend"):
        EvaluatorFactory.create(settings)
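

# A hedged extension of the case above: assuming any backend name outside the
# factory's supported set is rejected with the same ValueError (the names
# below are deliberately fake), the check can be parametrized.
@pytest.mark.parametrize(
    "backends",
    [["unknown_backend"], ["definitely_not_a_backend"], ["unknown_backend", "also_unknown"]],
)
def test_factory_raises_error_on_each_unsupported(backends):
    settings = MagicMock(spec=Settings)
    settings.evaluation = MagicMock(spec=EvaluationSettings)
    settings.evaluation.backends = backends

    with pytest.raises(ValueError, match="No supported evaluator backend"):
        EvaluatorFactory.create(settings)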


def test_custom_evaluator_boundaries():
    """Test CustomEvaluator boundary conditions."""
    evaluator = CustomEvaluator()
    query = "test"

    # 1. Empty retrieved list (both metrics should be 0.0)
    metrics = evaluator.evaluate(query, [], ["A"])
    assert metrics["hit_rate"] == 0.0
    assert metrics["mrr"] == 0.0

    # 2. Retrieved list with duplicates (MRR should count the first occurrence)
    # golden: A; retrieved: B, A, A. Rank = 2, so MRR = 0.5
    metrics = evaluator.evaluate(query, ["B", "A", "A"], ["A"])
    assert metrics["hit_rate"] == 1.0
    assert metrics["mrr"] == 0.5

    # 3. Case sensitivity (IDs are treated as case-sensitive strings)
    # golden: A; retrieved: a. Should be 0.0
    metrics = evaluator.evaluate(query, ["a"], ["A"])
    assert metrics["hit_rate"] == 0.0

    # 4. Empty golden list (also covered above; kept as a regression guard)
    metrics = evaluator.evaluate(query, ["A"], [])
    assert metrics["hit_rate"] == 0.0

    # 5. Large lists: golden "99" sits at rank 100
    retrieved = [str(i) for i in range(100)]
    golden = ["99"]
    metrics = evaluator.evaluate(query, retrieved, golden)
    assert metrics["hit_rate"] == 1.0
    # MRR should be 1/100 = 0.01
    assert abs(metrics["mrr"] - 0.01) < 1e-9