Skip to main content
Glama

@arizeai/phoenix-mcp

Official
by Arize-ai
test_wrappers.py (9.86 kB)
"""Tests comparing the ``SkEval`` metric wrappers against scikit-learn.

Each test builds a randomly generated data frame (with missing values mixed
in), evaluates a wrapped metric through ``EvaluationMetric`` and checks that
the result matches a direct call to the corresponding ``sklearn.metrics``
function on the rows where the inputs are not missing.
"""

import inspect
import math
from collections import ChainMap
from collections.abc import Callable, Iterable
from typing import Any, Optional, cast

import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_almost_equal
from pandas.core.dtypes.common import is_object_dtype
from sklearn import metrics as sk

from phoenix.config import SKLEARN_VERSION
from phoenix.core.model_schema import Column
from phoenix.metrics.mixins import EvaluationMetric, EvaluationMetricKeywordParameters
from phoenix.metrics.wrappers import Eval
from phoenix.metrics.wrappers import SkEval as ev

# Default number of rows and default fraction of missing values per column.
N = 32
NA_PCT = 0.25

# A single seeded generator drives all fixture data; the order of the calls
# below (and inside the helpers) therefore determines the generated values.
rng = np.random.default_rng(1234567890)

# Random hex strings are used as column names.
actual_label_column_name = hex(int(rng.random() * 1e9))
actual_score_column_name = hex(int(rng.random() * 1e9))
predicted_label_column_name = hex(int(rng.random() * 1e9))
predicted_score_column_name = hex(int(rng.random() * 1e9))
sample_weight_column_name = hex(int(rng.random() * 1e9))


def add_na(na_pct: float, series: "pd.Series[Any]") -> "pd.Series[Any]":
    """Return a shuffled copy of ``series`` with a fraction of values missing.

    ``int(len(series) * na_pct)`` entries are replaced, split in roughly equal
    thirds between ``None``, ``float("nan")`` and ``np.nan``. The result is
    shuffled again and given a fresh ``0..n-1`` index.
    """
    series = series.sample(frac=1, random_state=rng)
    n_miss = int(len(series) * na_pct)
    series[: n_miss // 3] = None
    series[n_miss // 3 : 2 * n_miss // 3] = float("nan")
    series[2 * n_miss // 3 : n_miss] = np.nan
    return series.sample(frac=1, ignore_index=True, random_state=rng)


def gen_series(
    n: int,
    na_pct: float,
    cats: Optional[Iterable[Any]] = None,
    f: Callable[[Any], Any] = lambda x: x,
) -> "pd.Series[Any]":
    """Generate a random series of length ``n`` with missing values.

    Args:
        n: Number of elements; ``0`` yields an empty object-dtype series.
        na_pct: Fraction of elements to replace with missing values.
        cats: Optional categories to sample from. When omitted, uniform
            random floats are generated instead.
        f: Transformation applied to the sampled category array before it is
            wrapped in a series (e.g. ``pd.Categorical``).
    """
    if n == 0:
        return pd.Series(dtype=object)
    if cats is None:
        return add_na(na_pct, pd.Series(rng.random(n)))
    cats = tuple(cats)
    # Linearly increasing weights make the categories unevenly represented.
    weights = np.arange(1, len(cats) + 1)
    return add_na(
        na_pct,
        pd.Series(f(rng.choice(cats, n, p=weights / weights.sum()))),
    )


def gen_df(
    n: int = N,
    na_pct: float = 0.1,
    cats: Iterable[Any] = "01",
    f: Callable[[Any], Any] = lambda x: x,
) -> pd.DataFrame:
    """Generate a data frame with label, score and sample-weight columns.

    Label columns are sampled from ``cats`` (optionally transformed by ``f``);
    score and weight columns hold random floats. Every column has
    ``int(n * na_pct)`` missing values and the frame gets a random float index.
    """
    df = pd.DataFrame(
        {
            actual_label_column_name: gen_series(n, na_pct, cats, f),
            actual_score_column_name: gen_series(n, na_pct),
            predicted_label_column_name: gen_series(n, na_pct, cats, f),
            predicted_score_column_name: gen_series(n, na_pct),
            sample_weight_column_name: gen_series(n, na_pct),
        },
    ).set_axis(rng.random(n), axis=0)
    # Sanity checks on the generated fixture itself.
    assert len(df) == n
    assert list(df.isna().sum()) == [int(n * na_pct)] * len(df.columns)
    return df


@pytest.mark.parametrize(
    "df",
    [
        pd.DataFrame(),
        gen_df(0),
        gen_df(N, 1),
        gen_df(N, NA_PCT),
    ],
)
@pytest.mark.parametrize(
    "metric,desired,parameters",
    [
        (ev.explained_variance_score, sk.explained_variance_score, {}),
        (ev.max_error, sk.max_error, {}),
        (ev.mean_absolute_error, sk.mean_absolute_error, {}),
        (ev.mean_absolute_percentage_error, sk.mean_absolute_percentage_error, {}),
        (ev.mean_gamma_deviance, sk.mean_gamma_deviance, {}),
        (ev.mean_pinball_loss, sk.mean_pinball_loss, {}),
        (ev.mean_poisson_deviance, sk.mean_poisson_deviance, {}),
        (ev.mean_squared_error, sk.mean_squared_error, {}),
        (ev.mean_squared_log_error, sk.mean_squared_log_error, {}),
        (ev.mean_tweedie_deviance, sk.mean_tweedie_deviance, {}),
        (ev.median_absolute_error, sk.median_absolute_error, {}),
        (ev.r2_score, sk.r2_score, {}),
        # The reference implementation for RMSE depends on the installed
        # scikit-learn version.
        (ev.root_mean_squared_error, sk.mean_squared_error, {"squared": False})
        if SKLEARN_VERSION < (1, 6)
        else (ev.root_mean_squared_error, sk.root_mean_squared_error, {}),
    ],
)
def test_regression(
    df: pd.DataFrame,
    metric: Eval,
    desired: Callable[..., float],
    parameters: dict[str, Any],
) -> None:
    """Check each regression metric wrapper against its scikit-learn reference."""
    run_test(
        df,
        actual_score_column_name,
        predicted_score_column_name,
        sample_weight_column_name,
        metric,
        desired,
        parameters,
    )


@pytest.mark.parametrize(
    "pos_label,df",
    [
        ("Y", pd.DataFrame()),
        ("Y", gen_df(0)),
        ("Y", gen_df(N, 1)),
        ("Y", gen_df(N, NA_PCT, "XY")),
        ("Y", gen_df(N, NA_PCT, "XY", f=pd.Categorical)),
        ("Y", gen_df(N, NA_PCT, "XYZ")),
        ("Y", gen_df(N, NA_PCT, "XYZ", f=pd.Categorical)),
        (10, gen_df(N, NA_PCT, [9, 10])),
        (10, gen_df(N, NA_PCT, [9, 10], f=pd.Categorical)),
        (10, gen_df(N, NA_PCT, [9, 10, 11])),
        (10, gen_df(N, NA_PCT, [9, 10, 11], f=pd.Categorical)),
        (True, gen_df(N, NA_PCT, [False, True])),
        (True, gen_df(N, NA_PCT, [False, True], f=pd.Categorical)),
    ],
)
@pytest.mark.parametrize(
    "metric,desired,parameters,is_binary,pred_is_score",
    [
        (ev.accuracy_score, sk.accuracy_score, {}, False, False),
        (ev.accuracy_score, sk.accuracy_score, {}, True, False),
        (ev.average_precision_score, sk.average_precision_score, {}, True, True),
        (ev.balanced_accuracy_score, sk.balanced_accuracy_score, {}, True, False),
        (ev.brier_score_loss, sk.brier_score_loss, {}, True, True),
        (ev.f1_score, sk.f1_score, {}, True, False),
        (ev.hamming_loss, sk.hamming_loss, {}, True, False),
        (ev.jaccard_score, sk.jaccard_score, {}, True, False),
        (ev.log_loss, sk.log_loss, {}, True, True),
        (ev.matthews_corrcoef, sk.matthews_corrcoef, {}, True, False),
        (ev.precision_score, sk.precision_score, {}, True, False),
        (ev.recall_score, sk.recall_score, {}, True, False),
        (ev.roc_auc_score, sk.roc_auc_score, {}, True, True),
        (ev.zero_one_loss, sk.zero_one_loss, {}, True, False),
    ],
)
def test_classification(
    pos_label: Any,
    df: pd.DataFrame,
    metric: Eval,
    desired: Callable[..., float],
    parameters: dict[str, Any],
    is_binary: bool,
    pred_is_score: bool,
) -> None:
    """Check each classification metric wrapper against scikit-learn.

    ``is_binary`` controls whether the actual labels are binarized against
    ``pos_label``; ``pred_is_score`` selects the predicted score column
    (left un-binarized) instead of the predicted label column.
    """
    run_test(
        df,
        actual_label_column_name,
        predicted_score_column_name if pred_is_score else predicted_label_column_name,
        sample_weight_column_name,
        metric,
        desired,
        parameters,
        actual_label=pos_label if is_binary else None,
        predicted_label=None if pred_is_score or not is_binary else pos_label,
    )


def _column_or_empty(df: pd.DataFrame, column_name: str) -> "pd.Series[Any]":
    """Return ``df[column_name]``, or an empty object series if it is absent."""
    try:
        return df.loc[:, column_name]
    except KeyError:
        return pd.Series(dtype=object)


def run_test(
    df: pd.DataFrame,
    actual_column_name: str,
    predicted_column_name: str,
    sample_weight_column_name: str,
    eval: Eval,
    desired: Callable[..., float],
    parameters: dict[str, Any],
    actual_label: Any = None,
    predicted_label: Any = None,
) -> None:
    """Compare a wrapped metric with its reference, without and with weights.

    Args:
        df: Input data frame; may be empty or lack the named columns.
        actual_column_name: Column holding the actual values.
        predicted_column_name: Column holding the predicted values.
        sample_weight_column_name: Column holding the sample weights.
        eval: Metric wrapper under test.
        desired: Reference function producing the expected value.
        parameters: Extra keyword arguments passed to ``desired``.
        actual_label: If not ``None``, actual values are binarized against it
            and it is passed to the wrapper as ``pos_label``.
        predicted_label: If not ``None``, predicted values are binarized
            against it before calling ``desired``.

    Raises:
        AssertionError: If the wrapper and the reference disagree, or the
            reference is NaN even though usable rows exist.
        ValueError, IndexError, ZeroDivisionError: Re-raised from ``desired``
            when it fails despite usable rows being present.
    """
    # calculation without sample weights
    actual_result = EvaluationMetric(
        eval=eval,
        actual=Column(actual_column_name),
        predicted=Column(predicted_column_name),
        parameters=EvaluationMetricKeywordParameters(
            pos_label=actual_label,
        ),
    )(df)
    actual = _column_or_empty(df, actual_column_name)
    predicted = _column_or_empty(df, predicted_column_name)
    both_not_na = ~actual.isna() & ~predicted.isna()
    try:
        desired_result = desired(
            _ensure_dtype(_binarize(actual_label, actual).loc[both_not_na]),
            _ensure_dtype(_binarize(predicted_label, predicted).loc[both_not_na]),
            **parameters,
        )
    except (ValueError, IndexError, ZeroDivisionError):
        # A failure is only acceptable when there was nothing to evaluate.
        if both_not_na.sum():
            raise
        desired_result = np.nan
    # Since we are using random numbers as input, we want to make sure that we
    # don't mess up something and inadvertently return NaN for comparisons.
    assert not math.isnan(desired_result) or both_not_na.sum() == 0
    assert_almost_equal(actual_result, desired_result)

    # calculation with sample weights
    actual_result = EvaluationMetric(
        eval=eval,
        actual=Column(actual_column_name),
        predicted=Column(predicted_column_name),
        parameters=EvaluationMetricKeywordParameters(
            sample_weight=Column(sample_weight_column_name),
            pos_label=actual_label,
        ),
    )(df)
    sample_weight = _column_or_empty(df, sample_weight_column_name)
    allowed_parameters = inspect.signature(desired).parameters.keys()
    # Rows with a missing weight are only dropped when the reference function
    # actually accepts a ``sample_weight`` argument.
    all_not_na = (
        both_not_na & ~sample_weight.isna()
        if "sample_weight" in allowed_parameters
        else both_not_na
    )
    try:
        desired_result = desired(
            _ensure_dtype(_binarize(actual_label, actual).loc[all_not_na]),
            _ensure_dtype(_binarize(predicted_label, predicted).loc[all_not_na]),
            **{
                k: v
                for k, v in ChainMap(
                    parameters,
                    {"sample_weight": sample_weight.loc[all_not_na]},
                ).items()
                if k in allowed_parameters
            },
        )
    except (ValueError, IndexError, ZeroDivisionError):
        if all_not_na.sum():
            raise
        desired_result = np.nan
    # Same NaN guard as above, but keyed on the rows actually used in the
    # weighted calculation so it agrees with the exception handling.
    assert not math.isnan(desired_result) or all_not_na.sum() == 0
    assert_almost_equal(actual_result, desired_result)


def _ensure_dtype(series: "pd.Series[Any]") -> "pd.Series[Any]":
    """Cast a non-empty object series to the type of its first element.

    Series whose first element is a ``str``, empty series and series that are
    not of object dtype are returned unchanged.
    """
    if is_object_dtype(series) and len(series) and not isinstance(series.iloc[0], str):
        return series.astype(type(series.iloc[0]))
    return series


def _binarize(
    pos_label: Any,
    series: "pd.Series[Any]",
) -> "pd.Series[bool]":
    """Return ``series == pos_label``, or ``series`` itself if no label is given."""
    return (
        series
        if pos_label is None
        else cast(
            "pd.Series[bool]",
            series == pos_label,
        )
    )

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Arize-ai/phoenix'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.