"""Tests for the pydantic-evals evaluator base classes and the run_evaluator helper."""
from __future__ import annotations as _annotations

import asyncio
from collections.abc import Sequence
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any

import pytest
from inline_snapshot import snapshot
from pydantic import TypeAdapter

from ..conftest import try_import
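
# Guard the pydantic-evals imports so test collection still succeeds when the
# optional package is not installed; the module-level skipif mark below handles skipping.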
with try_import() as imports_successful:
    from pydantic_evals.evaluators._run_evaluator import run_evaluator
    from pydantic_evals.evaluators.context import EvaluatorContext
    from pydantic_evals.evaluators.evaluator import (
        EvaluationReason,
        EvaluationResult,
        Evaluator,
        EvaluatorFailure,
    )
    from pydantic_evals.otel._errors import SpanTreeRecordingError

pytestmark = [
    pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'),
    pytest.mark.anyio,
]


def test_evaluation_reason():
    """Test EvaluationReason class."""
    # Test with value only
    reason = EvaluationReason(value=True)
    assert reason.value is True
    assert reason.reason is None

    # Test with value and reason
    reason = EvaluationReason(value=42, reason='Perfect score')
    assert reason.value == 42
    assert reason.reason == 'Perfect score'

    # Test with string value
    reason = EvaluationReason(value='pass', reason='Test passed')
    assert reason.value == 'pass'
    assert reason.reason == 'Test passed'


def test_evaluation_result():
    """Test EvaluationResult class."""

    @dataclass
    class DummyEvaluator(Evaluator[Any, Any, Any]):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            raise NotImplementedError

    evaluator = DummyEvaluator()

    # Test basic result
    result = EvaluationResult(name='test', value=True, reason='Success', source=evaluator.as_spec())
    assert result.name == 'test'
    assert result.value is True
    assert result.reason == 'Success'
    assert result.source == evaluator.as_spec()

    # Test downcast with matching type
    downcast = result.downcast(bool)
    assert downcast is not None
    assert downcast.value is True
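
    # bool is a subclass of int, so the checks below confirm that downcast()
    # treats a bool value as matching only bool, not int.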
    # Test downcast with non-matching type
    downcast = result.downcast(int)
    assert downcast is None

    # Test downcast with multiple types
    downcast = result.downcast(int, bool)
    assert downcast is not None
    assert downcast.value is True


def test_strict_abc_meta():
    """Test _StrictABCMeta metaclass."""
    # Test that abstract methods must be implemented
    with pytest.raises(TypeError) as exc_info:

        @dataclass
        class InvalidEvaluator(Evaluator[Any, Any, Any]):  # pyright: ignore[reportUnusedClass]
            pass

    assert 'must implement all abstract methods' in str(exc_info.value)
    assert 'evaluate' in str(exc_info.value)
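

# The helper evaluators below subclass Evaluator, so they can only be defined when
# the guarded imports above succeeded (TYPE_CHECKING keeps static type checkers happy).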
if TYPE_CHECKING or imports_successful():  # pragma: no branch

    @dataclass
    class SimpleEvaluator(Evaluator[Any, Any, Any]):
        value: Any = True
        reason: str | None = None

        def evaluate(self, ctx: EvaluatorContext) -> bool | EvaluationReason:
            if self.reason is not None:
                return EvaluationReason(value=self.value, reason=self.reason)
            return self.value

    @dataclass
    class AsyncEvaluator(Evaluator[Any, Any, Any]):
        value: Any = True
        delay: float = 0.1

        async def evaluate(self, ctx: EvaluatorContext) -> bool:
            await asyncio.sleep(self.delay)
            return self.value

    @dataclass
    class MultiEvaluator(Evaluator[Any, Any, Any]):
        def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool]:
            return {'test1': True, 'test2': False}


async def test_evaluator_sync():
    """Test synchronous evaluator execution."""
    ctx = EvaluatorContext(
        name='test',
        inputs={},
        metadata=None,
        expected_output=None,
        output={},
        duration=0.0,
        _span_tree=SpanTreeRecordingError('did not record spans'),
        attributes={},
        metrics={},
    )

    # Test simple boolean result
    evaluator = SimpleEvaluator()
    result = evaluator.evaluate_sync(ctx)
    assert result is True

    # Test with reason
    evaluator = SimpleEvaluator(value=False, reason='Failed')
    result = evaluator.evaluate_sync(ctx)
    assert isinstance(result, EvaluationReason)
    assert result.value is False
    assert result.reason == 'Failed'

    # Test with dictionary result
    evaluator = MultiEvaluator()
    result = evaluator.evaluate_sync(ctx)
    assert isinstance(result, dict)
    assert result['test1'] is True
    assert result['test2'] is False


async def test_evaluator_async():
    """Test asynchronous evaluator execution."""
    ctx = EvaluatorContext(
        name='test',
        inputs={},
        metadata=None,
        expected_output=None,
        output={},
        duration=0.0,
        _span_tree=SpanTreeRecordingError('did not record spans'),
        attributes={},
        metrics={},
    )

    # Test async evaluator
    evaluator = AsyncEvaluator()
    result = await evaluator.evaluate_async(ctx)
    assert result is True

    # Test sync evaluator with async execution
    evaluator = SimpleEvaluator()
    result = await evaluator.evaluate_async(ctx)
    assert result is True


async def test_evaluation_name():
    """Test the evaluator naming methods."""
    evaluator = SimpleEvaluator()
    assert evaluator.get_serialization_name() == 'SimpleEvaluator'
    assert evaluator.get_default_evaluation_name() == 'SimpleEvaluator'


async def test_evaluator_serialization():
    """Test evaluator serialization."""

    @dataclass
    class ExampleEvaluator(Evaluator[Any, Any, Any]):
        value: int = 42
        optional: str | None = None
        default_value: bool = False
        default_factory_value: list[int] = field(default_factory=list)

        def evaluate(self, ctx: EvaluatorContext) -> bool:
            raise NotImplementedError

    # Test with default values
    evaluator = ExampleEvaluator()
    adapter = TypeAdapter(Evaluator)
    assert adapter.dump_python(evaluator, context=None) == snapshot({'arguments': None, 'name': 'ExampleEvaluator'})
    assert adapter.dump_python(evaluator, context={'use_short_form': True}) == snapshot('ExampleEvaluator')

    # Test with a single non-default value
    evaluator = ExampleEvaluator(value=100)
    assert adapter.dump_python(evaluator, context=None) == snapshot({'arguments': [100], 'name': 'ExampleEvaluator'})
    assert adapter.dump_python(evaluator, context={'use_short_form': True}) == snapshot({'ExampleEvaluator': 100})

    # Test with multiple non-default values
    evaluator = ExampleEvaluator(value=100, optional='test', default_value=True)
    assert adapter.dump_python(evaluator, context=None) == snapshot(
        {'arguments': {'default_value': True, 'optional': 'test', 'value': 100}, 'name': 'ExampleEvaluator'}
    )
    assert adapter.dump_python(evaluator, context={'use_short_form': True}) == snapshot(
        {'ExampleEvaluator': {'default_value': True, 'optional': 'test', 'value': 100}}
    )

    # Test with no arguments
    @dataclass
    class NoArgsEvaluator(Evaluator[Any, Any, Any]):
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            raise NotImplementedError

    evaluator = NoArgsEvaluator()
    assert adapter.dump_python(evaluator, context=None) == snapshot({'arguments': None, 'name': 'NoArgsEvaluator'})
    assert adapter.dump_python(evaluator, context={'use_short_form': True}) == snapshot('NoArgsEvaluator')


async def test_run_evaluator():
    """Test run_evaluator function."""
    ctx = EvaluatorContext(
        name='test',
        inputs={},
        metadata=None,
        expected_output=None,
        output={},
        duration=0.0,
        _span_tree=SpanTreeRecordingError('did not record spans'),
        attributes={},
        metrics={},
    )

    # Test with simple boolean result
    evaluator = SimpleEvaluator()
    results = await run_evaluator(evaluator, ctx)
    adapter = TypeAdapter[Sequence[EvaluationResult] | EvaluatorFailure](Sequence[EvaluationResult] | EvaluatorFailure)
    assert adapter.dump_python(results) == snapshot(
        [
            {
                'name': 'SimpleEvaluator',
                'reason': None,
                'source': {'arguments': None, 'name': 'SimpleEvaluator'},
                'value': True,
            }
        ]
    )

    # Test with reason
    evaluator = SimpleEvaluator(value=False, reason='Failed')
    results = await run_evaluator(evaluator, ctx)
    assert adapter.dump_python(results) == snapshot(
        [
            {
                'name': 'SimpleEvaluator',
                'reason': 'Failed',
                'source': {'arguments': {'reason': 'Failed', 'value': False}, 'name': 'SimpleEvaluator'},
                'value': False,
            }
        ]
    )

    # Test with dictionary result
    evaluator = MultiEvaluator()
    results = await run_evaluator(evaluator, ctx)
    assert adapter.dump_python(results) == snapshot(
        [
            {'name': 'test1', 'reason': None, 'source': {'arguments': None, 'name': 'MultiEvaluator'}, 'value': True},
            {'name': 'test2', 'reason': None, 'source': {'arguments': None, 'name': 'MultiEvaluator'}, 'value': False},
        ]
    )

    # Test with async evaluator
    evaluator = AsyncEvaluator()
    results = await run_evaluator(evaluator, ctx)
    assert adapter.dump_python(results) == snapshot(
        [
            {
                'name': 'AsyncEvaluator',
                'reason': None,
                'source': {'arguments': None, 'name': 'AsyncEvaluator'},
                'value': True,
            }
        ]
    )