// runExperiment.test.ts
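// Unit tests for runExperiment in dryRun mode. getDataset is mocked in every
// test, so no Phoenix server or network access is required.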
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";
import * as getDatasetModule from "../../src/datasets/getDataset";
import {
asEvaluator,
runExperiment,
} from "../../src/experiments/runExperiment";
import type { Example } from "../../src/types/datasets";
import type { EvaluatorParams } from "../../src/types/experiments";
import { MockLanguageModelV2 } from "ai/test";
import { beforeEach, describe, expect, it, vi } from "vitest";
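
// Two-example dataset returned by the mocked getDataset call in each test.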
const mockDataset = {
id: "dataset-1",
name: "mock-dataset",
description: "A mock dataset",
versionId: "v1",
metadata: {},
examples: [
{
id: "ex-1",
input: { name: "Alice" },
output: { text: "Hello, Alice!" },
metadata: {},
updatedAt: new Date(),
},
{
id: "ex-2",
input: { name: "Bob" },
output: { text: "Hello, Bob!" },
metadata: {},
updatedAt: new Date(),
},
],
};
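
// In these tests, dryRun: true runs the task and evaluators over every example
// and returns results on the experiment object; a numeric dryRun limits how
// many examples are used.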
describe("runExperiment (dryRun)", () => {
beforeEach(() => {
vi.clearAllMocks();
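// Stub getDataset so runExperiment resolves the in-memory mock dataset.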
vi.spyOn(getDatasetModule, "getDataset").mockResolvedValue(mockDataset);
});
it("runs the task and evaluators in dryRun mode", async () => {
const task = async (example: Example) => `Hello, ${example.input.name}!`;
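// CODE evaluator: exact-match comparison of the task output against the
// example's expected text.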
const matchesEvaluator = asEvaluator({
name: "matches",
kind: "CODE",
evaluate: async ({ output, expected }: EvaluatorParams) => {
const expectedText = (expected as { text?: string })?.text ?? "";
const outputStr = typeof output === "string" ? output : String(output);
return {
label: outputStr === expectedText ? "match" : "no match",
score: outputStr === expectedText ? 1 : 0,
explanation:
outputStr === expectedText ? "matches" : "does not match",
metadata: {},
};
},
});
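// CODE evaluator: checks whether the output contains the substring "Hello".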
const containsHelloEvaluator = asEvaluator({
name: "contains-hello",
kind: "CODE",
evaluate: async ({ output }: EvaluatorParams) => {
const outputStr = typeof output === "string" ? output : String(output);
return {
label: outputStr.includes("Hello")
? "contains hello"
: "does not contain hello",
score: outputStr.includes("Hello") ? 1 : 0,
explanation: outputStr.includes("Hello")
? "contains hello"
: "does not contain hello",
metadata: {},
};
},
});
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [matchesEvaluator, containsHelloEvaluator],
dryRun: true,
});
expect(experiment).toBeDefined();
expect(experiment.runs).toBeDefined();
expect(Object.keys(experiment.runs)).toHaveLength(2);
expect(experiment.evaluationRuns).toBeDefined();
if (experiment.evaluationRuns) {
expect(Array.isArray(experiment.evaluationRuns)).toBe(true);
// There should be 2 runs * 2 evaluators = 4 evaluation runs
expect(experiment.evaluationRuns.length).toBe(4);
// Check that the evaluation results are as expected
for (const evalRun of experiment.evaluationRuns) {
expect(evalRun.result).toHaveProperty("label");
expect(evalRun.result).toHaveProperty("score");
}
}
});
it("respects dryRun count", async () => {
const task = (example: Example) => `Hi, ${example.input.name}`;
const evaluator = asEvaluator({
name: "dummy",
kind: "CODE",
evaluate: async () => ({
label: "ok",
score: 1,
explanation: "",
metadata: {},
}),
});
// Only run 1 example
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [evaluator],
dryRun: 1,
});
expect(Object.keys(experiment.runs)).toHaveLength(1);
expect(experiment.evaluationRuns).toBeDefined();
if (experiment.evaluationRuns) {
expect(experiment.evaluationRuns.length).toBe(1);
}
});
it("runs experiments with repetitions", async () => {
const task = (example: Example) => `Hi, ${example.input.name}`;
const evaluator = asEvaluator({
name: "dummy",
kind: "CODE",
evaluate: async () => ({
label: "ok",
score: 1,
explanation: "",
metadata: {},
}),
});
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [evaluator],
dryRun: true,
repetitions: 3,
});
// Should have 2 examples * 3 repetitions = 6 runs
expect(Object.keys(experiment.runs)).toHaveLength(6);
expect(experiment.evaluationRuns).toBeDefined();
if (experiment.evaluationRuns) {
// Should have 6 runs * 1 evaluator = 6 evaluation runs
expect(experiment.evaluationRuns.length).toBe(6);
}
});
it("defaults to 1 repetition when not specified", async () => {
const task = (example: Example) => `Hi, ${example.input.name}`;
const evaluator = asEvaluator({
name: "dummy",
kind: "CODE",
evaluate: async () => ({
label: "ok",
score: 1,
explanation: "",
metadata: {},
}),
});
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [evaluator],
dryRun: true,
});
// Should have 2 examples * 1 repetition = 2 runs
expect(Object.keys(experiment.runs)).toHaveLength(2);
expect(experiment.evaluationRuns).toBeDefined();
if (experiment.evaluationRuns) {
expect(experiment.evaluationRuns.length).toBe(2);
}
});
it("should throw an error if repetitions is invalid", async () => {
await expect(
runExperiment({
dataset: { datasetId: mockDataset.id },
task: () => "",
dryRun: true,
repetitions: 0,
})
).rejects.toThrow("repetitions must be an integer greater than 0");
await expect(
runExperiment({
dataset: { datasetId: mockDataset.id },
task: () => "",
dryRun: true,
repetitions: -1,
})
).rejects.toThrow("repetitions must be an integer greater than 0");
});
it("should work with phoenix-evals evaluators", async () => {
const task = (example: Example) => `Hi, ${example.input.name}`;
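// LLM evaluator from phoenix-evals; MockLanguageModelV2 returns a canned JSON
// classification, so no real model call is made and the result is deterministic.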
const correctnessEvaluator = createClassificationEvaluator({
name: "correctness",
model: new MockLanguageModelV2({
doGenerate: async () => ({
finishReason: "stop",
usage: { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
content: [
{
type: "text",
text: `{"label": "correct", "explanation": "because" }`,
},
],
warnings: [],
}),
}),
promptTemplate: "Is the following text correct: {{output}}",
choices: { correct: 1, incorrect: 0 },
});
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [correctnessEvaluator],
dryRun: true,
});
expect(experiment).toBeDefined();
expect(experiment.runs).toBeDefined();
expect(Object.keys(experiment.runs)).toHaveLength(2);
expect(experiment.evaluationRuns).toHaveLength(2);
expect(experiment.evaluationRuns?.[0].annotatorKind).toBe("LLM");
expect(experiment.evaluationRuns?.[0].name).toBe("correctness");
expect(experiment.evaluationRuns?.[0].result).toBeDefined();
expect(experiment.evaluationRuns?.[0].result?.label).toBe("correct");
expect(experiment.evaluationRuns?.[0].result?.score).toBe(1);
expect(experiment.evaluationRuns?.[0].result?.explanation).toBe("because");
});
it("should work with mixed evaluators: manual asEvaluator and phoenix-evals", async () => {
const task = (example: Example) => `Hello, ${example.input.name}!`;
// Manual evaluator using asEvaluator
const manualEvaluator = asEvaluator({
name: "manual-length-check",
kind: "CODE",
evaluate: async ({ output }: EvaluatorParams) => {
const outputStr = typeof output === "string" ? output : String(output);
const isGoodLength = outputStr.length > 5 && outputStr.length < 50;
return {
label: isGoodLength ? "good_length" : "bad_length",
score: isGoodLength ? 1 : 0,
explanation: `Output length is ${outputStr.length} characters`,
metadata: { length: outputStr.length },
};
},
});
// LLM evaluator from phoenix-evals; the mock model always returns a canned
// "polite" classification.
const phoenixEvaluator = createClassificationEvaluator({
name: "politeness",
model: new MockLanguageModelV2({
doGenerate: async () => ({
finishReason: "stop",
usage: { inputTokens: 15, outputTokens: 25, totalTokens: 40 },
content: [
{
type: "text",
text: `{"label": "polite", "explanation": "greeting is polite"}`,
},
],
warnings: [],
}),
}),
promptTemplate: "Is this greeting polite: {{output}}",
choices: { polite: 1, rude: 0 },
});
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [manualEvaluator, phoenixEvaluator],
dryRun: true,
});
expect(experiment).toBeDefined();
expect(experiment.runs).toBeDefined();
expect(Object.keys(experiment.runs)).toHaveLength(2);
expect(experiment.evaluationRuns).toBeDefined();
if (experiment.evaluationRuns) {
expect(Array.isArray(experiment.evaluationRuns)).toBe(true);
// There should be 2 runs * 2 evaluators = 4 evaluation runs
expect(experiment.evaluationRuns.length).toBe(4);
// Check that both evaluators ran
const manualEvalRuns = experiment.evaluationRuns.filter(
(run) => run.name === "manual-length-check"
);
const phoenixEvalRuns = experiment.evaluationRuns.filter(
(run) => run.name === "politeness"
);
expect(manualEvalRuns).toHaveLength(2);
expect(phoenixEvalRuns).toHaveLength(2);
// Check manual evaluator results
for (const evalRun of manualEvalRuns) {
expect(evalRun.annotatorKind).toBe("CODE");
expect(evalRun.result).toHaveProperty("label");
expect(evalRun.result).toHaveProperty("score");
expect(evalRun.result).toHaveProperty("explanation");
expect(evalRun.result).toHaveProperty("metadata");
expect(evalRun.result?.label).toBe("good_length");
expect(evalRun.result?.score).toBe(1);
expect(evalRun.result?.metadata).toHaveProperty("length");
}
// Check phoenix-evals evaluator results
for (const evalRun of phoenixEvalRuns) {
expect(evalRun.annotatorKind).toBe("LLM");
expect(evalRun.result).toHaveProperty("label");
expect(evalRun.result).toHaveProperty("score");
expect(evalRun.result).toHaveProperty("explanation");
expect(evalRun.result?.label).toBe("polite");
expect(evalRun.result?.score).toBe(1);
expect(evalRun.result?.explanation).toBe("greeting is polite");
}
}
});
});