// runExperiment.test.ts
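// Unit tests for runExperiment's dryRun mode: getDataset is stubbed below, so
// these tests exercise task execution and evaluators without a real dataset fetch.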
import { describe, it, expect, vi, beforeEach } from "vitest";
import {
  runExperiment,
  asEvaluator,
} from "../../src/experiments/runExperiment";
import * as getDatasetModule from "../../src/datasets/getDataset";
import type { Example } from "../../src/types/datasets";
import type { EvaluatorParams } from "../../src/types/experiments";
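
// A fixed two-example dataset; the getDataset spy resolves with it in every test.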
const mockDataset = {
  id: "dataset-1",
  name: "mock-dataset",
  description: "A mock dataset",
  versionId: "v1",
  metadata: {},
  examples: [
    {
      id: "ex-1",
      input: { name: "Alice" },
      output: { text: "Hello, Alice!" },
      metadata: {},
      updatedAt: new Date(),
    },
    {
      id: "ex-2",
      input: { name: "Bob" },
      output: { text: "Hello, Bob!" },
      metadata: {},
      updatedAt: new Date(),
    },
  ],
};

describe("runExperiment (dryRun)", () => {
beforeEach(() => {
vi.clearAllMocks();
vi.spyOn(getDatasetModule, "getDataset").mockResolvedValue(mockDataset);
});
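
  // Happy path: two examples and two evaluators should yield two task runs
  // and four evaluation runs.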
it("runs the task and evaluators in dryRun mode", async () => {
const task = async (example: Example) => `Hello, ${example.input.name}!`;
const matchesEvaluator = asEvaluator({
name: "matches",
kind: "CODE",
evaluate: async ({ output, expected }: EvaluatorParams) => {
const expectedText = (expected as { text?: string })?.text ?? "";
const outputStr = typeof output === "string" ? output : String(output);
return {
label: outputStr === expectedText ? "match" : "no match",
score: outputStr === expectedText ? 1 : 0,
explanation:
outputStr === expectedText ? "matches" : "does not match",
metadata: {},
};
},
});
const containsHelloEvaluator = asEvaluator({
name: "contains-hello",
kind: "CODE",
evaluate: async ({ output }: EvaluatorParams) => {
const outputStr = typeof output === "string" ? output : String(output);
return {
label: outputStr.includes("Hello")
? "contains hello"
: "does not contain hello",
score: outputStr.includes("Hello") ? 1 : 0,
explanation: outputStr.includes("Hello")
? "contains hello"
: "does not contain hello",
metadata: {},
};
},
});
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [matchesEvaluator, containsHelloEvaluator],
dryRun: true,
});
expect(experiment).toBeDefined();
expect(experiment.runs).toBeDefined();
expect(Object.keys(experiment.runs)).toHaveLength(2);
expect(experiment.evaluationRuns).toBeDefined();
if (experiment.evaluationRuns) {
expect(Array.isArray(experiment.evaluationRuns)).toBe(true);
// There should be 2 runs * 2 evaluators = 4 evaluation runs
expect(experiment.evaluationRuns.length).toBe(4);
// Check that the evaluation results are as expected
for (const evalRun of experiment.evaluationRuns) {
expect(evalRun.result).toHaveProperty("label");
expect(evalRun.result).toHaveProperty("score");
}
}
});
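
  // dryRun also accepts a number, limiting how many examples are executed.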
it("respects dryRun count", async () => {
const task = (example: Example) => `Hi, ${example.input.name}`;
const evaluator = asEvaluator({
name: "dummy",
kind: "CODE",
evaluate: async () => ({
label: "ok",
score: 1,
explanation: "",
metadata: {},
}),
});
// Only run 1 example
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [evaluator],
dryRun: 1,
});
expect(Object.keys(experiment.runs)).toHaveLength(1);
if (experiment.evaluationRuns) {
expect(experiment.evaluationRuns.length).toBe(1);
}
});
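
  // repetitions re-runs every example the given number of times.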
it("runs experiments with repetitions", async () => {
const task = (example: Example) => `Hi, ${example.input.name}`;
const evaluator = asEvaluator({
name: "dummy",
kind: "CODE",
evaluate: async () => ({
label: "ok",
score: 1,
explanation: "",
metadata: {},
}),
});
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [evaluator],
dryRun: true,
repetitions: 3,
});
// Should have 2 examples * 3 repetitions = 6 runs
expect(Object.keys(experiment.runs)).toHaveLength(6);
if (experiment.evaluationRuns) {
// Should have 6 runs * 1 evaluator = 6 evaluation runs
expect(experiment.evaluationRuns.length).toBe(6);
}
});
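
  // When repetitions is omitted, each example runs exactly once.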
it("defaults to 1 repetition when not specified", async () => {
const task = (example: Example) => `Hi, ${example.input.name}`;
const evaluator = asEvaluator({
name: "dummy",
kind: "CODE",
evaluate: async () => ({
label: "ok",
score: 1,
explanation: "",
metadata: {},
}),
});
const experiment = await runExperiment({
dataset: { datasetId: mockDataset.id },
task,
evaluators: [evaluator],
dryRun: true,
});
// Should have 2 examples * 1 repetition = 2 runs
expect(Object.keys(experiment.runs)).toHaveLength(2);
if (experiment.evaluationRuns) {
expect(experiment.evaluationRuns.length).toBe(2);
}
});
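
  // Zero and negative repetition counts are rejected with a descriptive error.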
it("should throw an error if repetitions is invalid", async () => {
await expect(
runExperiment({
dataset: { datasetId: mockDataset.id },
task: () => "",
dryRun: true,
repetitions: 0,
})
).rejects.toThrow("repetitions must be an integer greater than 0");
await expect(
runExperiment({
dataset: { datasetId: mockDataset.id },
task: () => "",
dryRun: true,
repetitions: -1,
})
).rejects.toThrow("repetitions must be an integer greater than 0");
});
});