test_evaluator_common.py
from __future__ import annotations as _annotations

from datetime import timedelta
from typing import TYPE_CHECKING, Any

import pytest
from inline_snapshot import snapshot
from pydantic_core import to_jsonable_python
from pytest_mock import MockerFixture

from pydantic_ai.settings import ModelSettings

from ..conftest import try_import

with try_import() as imports_successful:
    import logfire
    from logfire.testing import CaptureLogfire

    from pydantic_evals.evaluators import EvaluationReason, EvaluatorContext
    from pydantic_evals.evaluators.common import (
        Contains,
        Equals,
        EqualsExpected,
        HasMatchingSpan,
        IsInstance,
        LLMJudge,
        MaxDuration,
        OutputConfig,
    )
    from pydantic_evals.otel._context_in_memory_span_exporter import context_subtree
    from pydantic_evals.otel._errors import SpanTreeRecordingError
    from pydantic_evals.otel.span_tree import SpanQuery

pytestmark = [
    pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'),
    pytest.mark.anyio,
]

if TYPE_CHECKING or imports_successful():

    class MockContext(EvaluatorContext[Any, Any, Any]):
        def __init__(self, output: Any = None, expected_output: Any = None, inputs: Any = None, duration: float = 0.0):
            self.output = output
            self.expected_output = expected_output
            self.inputs = inputs
            self.duration = duration

else:
    MockContext = object  # pragma: lax no cover


async def test_equals():
    """Test Equals evaluator."""
    evaluator = Equals(value=42)

    # Test equal values
    assert evaluator.evaluate(MockContext(output=42)) is True

    # Test unequal values
    assert evaluator.evaluate(MockContext(output=43)) is False


async def test_equals_expected():
    """Test EqualsExpected evaluator."""
    evaluator = EqualsExpected()

    # Test with matching output and expected output
    assert evaluator.evaluate(MockContext(output=42, expected_output=42)) is True

    # Test with non-matching output and expected output
    assert evaluator.evaluate(MockContext(output=42, expected_output=43)) is False

    # Test with no expected output
    assert evaluator.evaluate(MockContext(output=42, expected_output=None)) == {}


async def test_contains_string():
    """Test Contains evaluator with strings."""
    evaluator = Contains(value='test')

    # Test string containment
    assert evaluator.evaluate(MockContext(output='this is a test')).value is True

    # Test string non-containment
    assert evaluator.evaluate(MockContext(output='no match')) == snapshot(
        EvaluationReason(value=False, reason="Output string 'no match' does not contain expected string 'test'")
    )

    # Test case sensitivity
    evaluator_case_insensitive = Contains(value='TEST', case_sensitive=False)
    assert evaluator_case_insensitive.evaluate(MockContext(output='this is a test')) == snapshot(
        EvaluationReason(value=True)
    )


async def test_contains_dict():
    """Test Contains evaluator with dictionaries."""
    evaluator = Contains(value={'key': 'value'})

    # Test dictionary containment
    assert evaluator.evaluate(MockContext(output={'key': 'value', 'extra': 'data'})) == snapshot(
        EvaluationReason(value=True)
    )

    # Test dictionary key missing
    assert evaluator.evaluate(MockContext(output={'different': 'value'})) == snapshot(
        EvaluationReason(value=False, reason="Output dictionary does not contain expected key 'key'")
    )

    # Test dictionary value mismatch
    assert evaluator.evaluate(MockContext(output={'key': 'different'})) == snapshot(
        EvaluationReason(
            value=False,
            reason="Output dictionary has different value for key 'key': 'different' != 'value'",
        )
    )

    # Test non-dict value in dict
    evaluator_single = Contains(value='key')
    assert evaluator_single.evaluate(MockContext(output={'key': 'value'})) == snapshot(EvaluationReason(value=True))
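

# The remaining Contains tests cover non-dict containers: plain `in`
# membership for lists, string coercion via as_strings=True, and the failure
# reason produced when the output is not iterable at all.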


async def test_contains_list():
    """Test Contains evaluator with lists."""
    evaluator = Contains(value=42)

    # Test list containment
    assert evaluator.evaluate(MockContext(output=[1, 42, 3])) == snapshot(EvaluationReason(value=True))

    # Test list non-containment
    assert evaluator.evaluate(MockContext(output=[1, 2, 3])) == snapshot(
        EvaluationReason(value=False, reason='Output [1, 2, 3] does not contain provided value')
    )


async def test_contains_as_strings():
    """Test Contains evaluator with as_strings=True."""
    evaluator = Contains(value=42, as_strings=True)

    # Test string conversion
    assert evaluator.evaluate(MockContext(output='The answer is 42')).value is True

    # Test string conversion with non-string types
    assert evaluator.evaluate(MockContext(output=[1, 42, 3])).value is True


async def test_contains_invalid_type():
    """Test Contains evaluator with invalid types."""
    evaluator = Contains(value=42)

    # Test with unhashable type
    class Unhashable:
        __hash__ = None  # type: ignore

    result = evaluator.evaluate(MockContext(output=Unhashable()))
    assert result.value is False
    assert result.reason == "Containment check failed: argument of type 'Unhashable' is not iterable"


async def test_is_instance():
    """Test IsInstance evaluator."""
    evaluator = IsInstance(type_name='str')

    # Test matching type
    assert evaluator.evaluate(MockContext(output='test')).value is True

    # Test non-matching type
    result = evaluator.evaluate(MockContext(output=42))
    assert result.value is False
    assert result.reason == 'output is of type int'

    # Test with class having different qualname
    class OuterClass:
        class InnerClass:
            pass

    evaluator = IsInstance(type_name='InnerClass')
    result = evaluator.evaluate(MockContext(output=OuterClass.InnerClass()))
    assert result.value is True


async def test_max_duration():
    """Test MaxDuration evaluator."""
    # Test with float seconds
    evaluator = MaxDuration(seconds=1.0)
    assert evaluator.evaluate(MockContext(duration=0.5)) is True
    assert evaluator.evaluate(MockContext(duration=1.5)) is False

    # Test with timedelta
    evaluator = MaxDuration(seconds=timedelta(seconds=1))
    assert evaluator.evaluate(MockContext(duration=0.5)) is True
    assert evaluator.evaluate(MockContext(duration=1.5)) is False


@pytest.mark.anyio
async def test_llm_judge_evaluator(mocker: MockerFixture):
    """Test LLMJudge evaluator."""
    # Create a mock GradingOutput
    mock_grading_output = mocker.MagicMock()
    mock_grading_output.score = 1.0
    mock_grading_output.pass_ = True
    mock_grading_output.reason = 'Test passed'

    # Mock the judge_output function
    mock_judge_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output')
    mock_judge_output.return_value = mock_grading_output

    # Mock the judge_input_output function
    mock_judge_input_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_input_output')
    mock_judge_input_output.return_value = mock_grading_output

    # Mock the judge_input_output_expected function
    mock_judge_input_output_expected = mocker.patch(
        'pydantic_evals.evaluators.llm_as_a_judge.judge_input_output_expected'
    )
    mock_judge_input_output_expected.return_value = mock_grading_output

    # Mock the judge_output_expected function
    mock_judge_output_expected = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output_expected')
    mock_judge_output_expected.return_value = mock_grading_output

    ctx = EvaluatorContext(
        name='test',
        inputs={'prompt': 'Hello'},
        metadata=None,
        expected_output='Hello',
        output='Hello world',
        duration=0.0,
        _span_tree=SpanTreeRecordingError('spans were not recorded'),
        attributes={},
        metrics={},
    )
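
    # Each variant below checks both the serialized evaluation result and the
    # exact arguments forwarded to the corresponding patched judge_* helper.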

    # Test without input
    evaluator = LLMJudge(rubric='Content contains a greeting')
    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
        {'LLMJudge': {'value': True, 'reason': 'Test passed'}}
    )
    mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', None, None)

    # Test with input
    evaluator = LLMJudge(rubric='Output contains input', include_input=True, model='openai:gpt-4o')
    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
        {'LLMJudge': {'value': True, 'reason': 'Test passed'}}
    )
    mock_judge_input_output.assert_called_once_with(
        {'prompt': 'Hello'}, 'Hello world', 'Output contains input', 'openai:gpt-4o', None
    )

    # Test with input and expected output
    evaluator = LLMJudge(
        rubric='Output contains input', include_input=True, include_expected_output=True, model='openai:gpt-4o'
    )
    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
        {'LLMJudge': {'value': True, 'reason': 'Test passed'}}
    )
    mock_judge_input_output_expected.assert_called_once_with(
        {'prompt': 'Hello'}, 'Hello world', 'Hello', 'Output contains input', 'openai:gpt-4o', None
    )

    # Test with output and expected output
    evaluator = LLMJudge(
        rubric='Output contains input', include_input=False, include_expected_output=True, model='openai:gpt-4o'
    )
    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
        {'LLMJudge': {'value': True, 'reason': 'Test passed'}}
    )
    mock_judge_output_expected.assert_called_once_with(
        'Hello world', 'Hello', 'Output contains input', 'openai:gpt-4o', None
    )

    # Test with failing result
    mock_grading_output.score = 0.0
    mock_grading_output.pass_ = False
    mock_grading_output.reason = 'Test failed'
    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
        {'LLMJudge': {'value': False, 'reason': 'Test failed'}}
    )

    # Test with overridden configs
    evaluator = LLMJudge(rubric='Mock rubric', assertion=False)
    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot({})
    evaluator = LLMJudge(
        rubric='Mock rubric',
        score=OutputConfig(evaluation_name='my_score', include_reason=True),
        assertion=OutputConfig(evaluation_name='my_assertion'),
    )
    assert to_jsonable_python(await evaluator.evaluate(ctx)) == snapshot(
        {'my_assertion': False, 'my_score': {'reason': 'Test failed', 'value': 0.0}}
    )


@pytest.mark.anyio
async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
    """Test LLMJudge evaluator with specific model_settings."""
    mock_grading_output = mocker.MagicMock()
    mock_grading_output.pass_ = True
    mock_grading_output.reason = 'Test passed with settings'

    mock_judge_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output')
    mock_judge_output.return_value = mock_grading_output

    mock_judge_input_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_input_output')
    mock_judge_input_output.return_value = mock_grading_output

    mock_judge_input_output_expected = mocker.patch(
        'pydantic_evals.evaluators.llm_as_a_judge.judge_input_output_expected'
    )
    mock_judge_input_output_expected.return_value = mock_grading_output

    mock_judge_output_expected = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output_expected')
    mock_judge_output_expected.return_value = mock_grading_output

    custom_model_settings = ModelSettings(temperature=0.77)

    ctx = EvaluatorContext(
        name='test_custom_settings',
        inputs={'prompt': 'Hello Custom'},
        metadata=None,
        expected_output='Hello',
        output='Hello world custom settings',
        duration=0.0,
        _span_tree=SpanTreeRecordingError('spans were not recorded'),
        attributes={},
        metrics={},
    )
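
    # Same shape as test_llm_judge_evaluator above, but each assertion also
    # verifies that custom_model_settings is forwarded to the judge helper.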

    # Test without input, with custom model_settings
    evaluator_no_input = LLMJudge(rubric='Greeting with custom settings', model_settings=custom_model_settings)
    assert to_jsonable_python(await evaluator_no_input.evaluate(ctx)) == snapshot(
        {'LLMJudge': {'value': True, 'reason': 'Test passed with settings'}}
    )
    mock_judge_output.assert_called_once_with(
        'Hello world custom settings', 'Greeting with custom settings', None, custom_model_settings
    )

    # Test with input, with custom model_settings
    evaluator_with_input = LLMJudge(
        rubric='Output contains input with custom settings',
        include_input=True,
        model='openai:gpt-3.5-turbo',
        model_settings=custom_model_settings,
    )
    assert to_jsonable_python(await evaluator_with_input.evaluate(ctx)) == snapshot(
        {'LLMJudge': {'value': True, 'reason': 'Test passed with settings'}}
    )
    mock_judge_input_output.assert_called_once_with(
        {'prompt': 'Hello Custom'},
        'Hello world custom settings',
        'Output contains input with custom settings',
        'openai:gpt-3.5-turbo',
        custom_model_settings,
    )

    # Test with input and expected output, with custom model_settings
    evaluator_with_input_expected = LLMJudge(
        rubric='Output contains input with custom settings',
        include_input=True,
        include_expected_output=True,
        model='openai:gpt-3.5-turbo',
        model_settings=custom_model_settings,
    )
    assert to_jsonable_python(await evaluator_with_input_expected.evaluate(ctx)) == snapshot(
        {'LLMJudge': {'value': True, 'reason': 'Test passed with settings'}}
    )
    mock_judge_input_output_expected.assert_called_once_with(
        {'prompt': 'Hello Custom'},
        'Hello world custom settings',
        'Hello',
        'Output contains input with custom settings',
        'openai:gpt-3.5-turbo',
        custom_model_settings,
    )

    # Test with output and expected output
    evaluator_with_output_expected = LLMJudge(
        rubric='Output contains input with custom settings',
        include_input=False,
        include_expected_output=True,
        model='openai:gpt-3.5-turbo',
        model_settings=custom_model_settings,
    )
    assert to_jsonable_python(await evaluator_with_output_expected.evaluate(ctx)) == snapshot(
        {'LLMJudge': {'value': True, 'reason': 'Test passed with settings'}}
    )
    mock_judge_output_expected.assert_called_once_with(
        'Hello world custom settings',
        'Hello',
        'Output contains input with custom settings',
        'openai:gpt-3.5-turbo',
        custom_model_settings,
    )
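

# HasMatchingSpan needs a real span tree: context_subtree() captures the
# logfire spans recorded inside its `with` block into an in-memory tree that
# can then be queried by name, regex, or attributes.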


async def test_span_query_evaluator(capfire: CaptureLogfire):
    """Test HasMatchingSpan evaluator."""
    # Create a span tree with a known structure
    with context_subtree() as tree:
        with logfire.span('root'):
            with logfire.span('child1', key='value'):
                pass
            with logfire.span('child2'):
                with logfire.span('grandchild', nested=True):
                    pass

    ctx = EvaluatorContext(
        name='test',
        inputs={},
        metadata=None,
        expected_output=None,
        output={},
        duration=0.0,
        _span_tree=tree,
        attributes={},
        metrics={},
    )

    # Test matching by name
    evaluator = HasMatchingSpan(query=SpanQuery(name_equals='child1'))
    assert evaluator.evaluate(ctx) is True

    # Test matching by name pattern
    evaluator = HasMatchingSpan(query=SpanQuery(name_matches_regex='child.*'))
    assert evaluator.evaluate(ctx) is True

    # Test matching by attributes
    evaluator = HasMatchingSpan(query=SpanQuery(has_attributes={'key': 'value'}))
    assert evaluator.evaluate(ctx) is True

    # Test matching nested span
    evaluator = HasMatchingSpan(query=SpanQuery(name_equals='grandchild', has_attributes={'nested': True}))
    assert evaluator.evaluate(ctx) is True

    # Test non-matching query
    evaluator = HasMatchingSpan(query=SpanQuery(name_equals='nonexistent'))
    assert evaluator.evaluate(ctx) is False

    # Test non-matching attributes
    evaluator = HasMatchingSpan(query=SpanQuery(name_equals='child1', has_attributes={'wrong': 'value'}))
    assert evaluator.evaluate(ctx) is False
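

For context, here is a minimal sketch of how the evaluators exercised above are typically attached to a dataset in user code. It assumes the public pydantic_evals API (Case, Dataset, evaluate_sync); the task function and case values are illustrative and not part of the test file:

    from pydantic_evals import Case, Dataset
    from pydantic_evals.evaluators import EqualsExpected, IsInstance

    # One case plus two of the evaluators tested above; each evaluator is run
    # against every case's output when the dataset is evaluated.
    dataset = Dataset(
        cases=[Case(name='capital', inputs='What is the capital of France?', expected_output='Paris')],
        evaluators=[EqualsExpected(), IsInstance(type_name='str')],
    )

    async def answer(question: str) -> str:
        # Stand-in for a real model or agent call.
        return 'Paris'

    report = dataset.evaluate_sync(answer)
    report.print(include_input=True, include_output=True)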
