// getExperimentEvaluators.test.ts • 11.8 kB
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";
import { fromPhoenixLLMEvaluator } from "../../../src/experiments/helpers/fromPhoenixLLMEvaluator";
import { getExperimentEvaluators } from "../../../src/experiments/helpers/getExperimentEvaluators";
import { Evaluator } from "../../../src/types/experiments";
import { MockLanguageModelV2 } from "ai/test";
import { beforeEach, describe, expect, it, vi } from "vitest";
// Shared stub that individual tests program with the value the converter
// should hand back; `beforeEach` installs it as the implementation of the
// mocked module export.
const mockFromPhoenixEvaluator = vi.fn();

// Swap the real converter module for one that exposes a bare mock function
// under the same export name.
vi.mock("../../../src/experiments/helpers/fromPhoenixLLMEvaluator", () => {
  return { fromPhoenixLLMEvaluator: vi.fn() };
});
describe("getExperimentEvaluators", () => {
// Start every test from a clean slate: no recorded calls and no stubbed
// return values carried over from a previous test.
beforeEach(() => {
  // Clears recorded calls/results on every mock, including the module mock.
  vi.clearAllMocks();
  // `mockClear` would keep values installed through `mockReturnValue` /
  // `mockReturnValueOnce`, letting one test's stubbed converter result leak
  // into the next. `mockReset` drops those as well.
  mockFromPhoenixEvaluator.mockReset();
  // Route calls made to the mocked module export through the shared stub so
  // tests can program and inspect a single mock.
  vi.mocked(fromPhoenixLLMEvaluator).mockImplementation(
    mockFromPhoenixEvaluator
  );
});
describe("ClassificationEvaluator handling", () => {
  // Builds a mock language model whose generation result is a single text
  // part containing `responseText`.
  const buildModel = (responseText: string) =>
    new MockLanguageModelV2({
      doGenerate: async () => ({
        finishReason: "stop",
        usage: { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
        content: [
          {
            type: "text",
            text: responseText,
          },
        ],
        warnings: [],
      }),
    });

  // A single classification evaluator must be handed to the converter and the
  // converter's result must be what comes back.
  it("should convert a valid ClassificationEvaluator", () => {
    const classifier = createClassificationEvaluator({
      name: "test-classifier",
      model: buildModel(
        `{"label": "good", "explanation": "test explanation"}`
      ),
      promptTemplate: "Evaluate this: {{output}}",
      choices: { good: 1, bad: 0 },
    });
    const converted: Evaluator = {
      name: "test-classifier",
      kind: "LLM",
      evaluate: vi.fn().mockResolvedValue({ score: 1, label: "good" }),
    };
    mockFromPhoenixEvaluator.mockReturnValue(converted);

    const evaluators = getExperimentEvaluators([classifier]);

    expect(mockFromPhoenixEvaluator).toHaveBeenCalledWith(classifier);
    expect(evaluators).toEqual([converted]);
  });

  // Several classification evaluators are converted one by one, in input order.
  it("should handle multiple ClassificationEvaluators", () => {
    const correctnessClassifier = createClassificationEvaluator({
      name: "evaluator-1",
      model: buildModel(`{"label": "correct", "explanation": "test"}`),
      promptTemplate: "Evaluate: {{output}}",
      choices: { correct: 1, incorrect: 0 },
    });
    const relevanceClassifier = createClassificationEvaluator({
      name: "evaluator-2",
      model: buildModel(`{"label": "relevant", "explanation": "test"}`),
      promptTemplate: "Is this relevant: {{output}}",
      choices: { relevant: 1, irrelevant: 0 },
    });
    const firstConverted: Evaluator = {
      name: "evaluator-1",
      kind: "LLM",
      evaluate: vi.fn(),
    };
    const secondConverted: Evaluator = {
      name: "evaluator-2",
      kind: "LLM",
      evaluate: vi.fn(),
    };
    // Queue one distinct result per expected converter call.
    mockFromPhoenixEvaluator.mockReturnValueOnce(firstConverted);
    mockFromPhoenixEvaluator.mockReturnValueOnce(secondConverted);

    const evaluators = getExperimentEvaluators([
      correctnessClassifier,
      relevanceClassifier,
    ]);

    expect(mockFromPhoenixEvaluator).toHaveBeenCalledTimes(2);
    expect(mockFromPhoenixEvaluator).toHaveBeenNthCalledWith(
      1,
      correctnessClassifier
    );
    expect(mockFromPhoenixEvaluator).toHaveBeenNthCalledWith(
      2,
      relevanceClassifier
    );
    expect(evaluators).toEqual([firstConverted, secondConverted]);
  });
});
describe("Evaluator handling", () => {
  // Creates an experiment evaluator whose `evaluate` is a fresh mock function.
  const stubEvaluator = (name: string, kind: Evaluator["kind"]): Evaluator => {
    return { name, kind, evaluate: vi.fn() };
  };

  // An object that already is an experiment Evaluator must come back as-is,
  // without touching the converter.
  it("should pass through a valid Evaluator unchanged", () => {
    const passThrough: Evaluator = {
      name: "test-evaluator",
      kind: "CODE",
      evaluate: async ({ input: _input, output: _output }) => {
        return {
          score: 1.0,
          label: "pass",
        };
      },
    };

    const evaluators = getExperimentEvaluators([passThrough]);

    expect(mockFromPhoenixEvaluator).not.toHaveBeenCalled();
    expect(evaluators).toEqual([passThrough]);
  });

  // Both "CODE" and "LLM" experiment evaluators are returned in input order.
  it("should handle multiple Evaluators", () => {
    const codeKind = stubEvaluator("evaluator-1", "CODE");
    const llmKind = stubEvaluator("evaluator-2", "LLM");

    const evaluators = getExperimentEvaluators([codeKind, llmKind]);

    expect(mockFromPhoenixEvaluator).not.toHaveBeenCalled();
    expect(evaluators).toEqual([codeKind, llmKind]);
  });
});
describe("Mixed evaluator types", () => {
  // A classification evaluator is converted while a plain experiment
  // evaluator in the same list is passed through, and order is kept.
  it("should handle a mix of ClassificationEvaluator and Evaluator", () => {
    const model = new MockLanguageModelV2({
      doGenerate: async () => ({
        finishReason: "stop",
        usage: { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
        content: [
          {
            type: "text",
            text: `{"label": "good", "explanation": "test"}`,
          },
        ],
        warnings: [],
      }),
    });
    const classifier = createClassificationEvaluator({
      name: "classification-eval",
      model,
      promptTemplate: "Classify: {{output}}",
      choices: { good: 1, bad: 0 },
    });
    const plainEvaluator: Evaluator = {
      name: "experiment-eval",
      kind: "CODE",
      evaluate: vi.fn(),
    };
    const converted: Evaluator = {
      name: "classification-eval",
      kind: "LLM",
      evaluate: vi.fn(),
    };
    mockFromPhoenixEvaluator.mockReturnValue(converted);

    const evaluators = getExperimentEvaluators([classifier, plainEvaluator]);

    expect(mockFromPhoenixEvaluator).toHaveBeenCalledWith(classifier);
    expect(evaluators).toEqual([converted, plainEvaluator]);
  });
});
describe("Error handling", () => {
  // Expected error text for a rejected value: fixed prefix plus its JSON form.
  const unsupportedMessage = (value: unknown): string =>
    `Unsupported evaluator: ${JSON.stringify(value)}`;

  it("should throw an error for unsupported evaluator types", () => {
    // Only `name` is present; the other required properties are missing.
    const nameOnly = {
      name: "unsupported",
    };
    const attempt = () => getExperimentEvaluators([nameOnly]);
    expect(attempt).toThrow(unsupportedMessage(nameOnly));
  });

  it("should throw an error for null evaluator", () => {
    const attempt = () => getExperimentEvaluators([null]);
    expect(attempt).toThrow("Unsupported evaluator: null");
  });

  it("should throw an error for undefined evaluator", () => {
    const attempt = () => getExperimentEvaluators([undefined]);
    expect(attempt).toThrow("Unsupported evaluator: undefined");
  });

  it("should throw an error for primitive types", () => {
    // Each row pairs a primitive input with the message it must produce.
    const primitiveCases: ReadonlyArray<readonly [unknown, string]> = [
      ["string", 'Unsupported evaluator: "string"'],
      [123, "Unsupported evaluator: 123"],
      [true, "Unsupported evaluator: true"],
    ];
    for (const [value, message] of primitiveCases) {
      expect(() => getExperimentEvaluators([value])).toThrow(message);
    }
  });

  it("should throw an error for objects missing required properties", () => {
    const incompleteEvaluators = [
      // Missing evaluate, kind
      { name: "test" },
      // Missing name, kind
      { evaluate: vi.fn() },
      // Missing kind
      { name: "test", evaluate: vi.fn() },
    ];
    incompleteEvaluators.forEach((candidate) => {
      expect(() => getExperimentEvaluators([candidate])).toThrow(
        unsupportedMessage(candidate)
      );
    });
  });

  it("should treat object with CODE kind as Evaluator, not ClassificationEvaluator", () => {
    // The "CODE" kind is what makes this an Evaluator rather than a
    // ClassificationEvaluator.
    const codeKindEvaluator = {
      name: "test-classifier",
      kind: "CODE",
      evaluate: vi.fn(),
    };

    const evaluators = getExperimentEvaluators([codeKindEvaluator]);

    // Treated as an Evaluator, so the converter must stay untouched.
    expect(mockFromPhoenixEvaluator).not.toHaveBeenCalled();
    expect(evaluators).toEqual([codeKindEvaluator]);
  });
});
describe("Edge cases", () => {
  // No input evaluators means no output and no converter calls.
  it("should handle empty array", () => {
    const evaluators = getExperimentEvaluators([]);
    expect(evaluators).toEqual([]);
    expect(mockFromPhoenixEvaluator).not.toHaveBeenCalled();
  });

  // Properties beyond name/kind/evaluate must not stop the object from being
  // returned as given.
  it("should handle evaluator with extra properties", () => {
    type EvaluatorWithExtra = Evaluator & { extraProp: string };
    const decorated: EvaluatorWithExtra = {
      name: "test-evaluator",
      kind: "CODE",
      evaluate: vi.fn(),
      extraProp: "extra",
    };

    const evaluators = getExperimentEvaluators([decorated]);

    expect(evaluators).toEqual([decorated]);
  });

  describe("Type guard validation", () => {
    // Exercises the type guard logic indirectly through the public function.
    it("should correctly identify ClassificationEvaluator vs Evaluator", () => {
      const model = new MockLanguageModelV2({
        doGenerate: async () => ({
          finishReason: "stop",
          usage: { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
          content: [
            {
              type: "text",
              text: `{"label": "valid", "explanation": "test"}`,
            },
          ],
          warnings: [],
        }),
      });
      const classifier = createClassificationEvaluator({
        name: "classification-eval",
        model,
        promptTemplate: "Validate: {{output}}",
        choices: { valid: 1, invalid: 0 },
      });
      // A "CODE" kind should keep this from being seen as a
      // ClassificationEvaluator.
      const codeKind: Evaluator = {
        name: "code-eval",
        kind: "CODE",
        evaluate: vi.fn(),
      };
      const llmKind: Evaluator = {
        name: "llm-eval",
        kind: "LLM",
        evaluate: vi.fn(),
      };
      const converted: Evaluator = {
        name: "classification-eval",
        kind: "LLM",
        evaluate: vi.fn(),
      };
      mockFromPhoenixEvaluator.mockReturnValue(converted);

      const evaluators = getExperimentEvaluators([
        classifier,
        codeKind,
        llmKind,
      ]);

      // Only the ClassificationEvaluator should be converted.
      expect(mockFromPhoenixEvaluator).toHaveBeenCalledTimes(1);
      expect(mockFromPhoenixEvaluator).toHaveBeenCalledWith(classifier);
      expect(evaluators).toEqual([converted, codeKind, llmKind]);
    });
  });
});
});