// resumeEvaluation.test.ts
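// Unit tests for resumeEvaluation: filling in missing evaluations on an
// existing experiment, covering pagination, empty results, evaluator failures,
// input validation, concurrency, and the stopOnFirstError flag.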
import { createClient, type PhoenixClient } from "../../src/client";
import * as getExperimentInfoModule from "../../src/experiments/getExperimentInfo";
import { asExperimentEvaluator } from "../../src/experiments/helpers";
import { resumeEvaluation } from "../../src/experiments/resumeEvaluation";
import type { EvaluatorParams } from "../../src/types/experiments";
import { beforeEach, describe, expect, it, vi, type Mock } from "vitest";
vi.mock("../../src/client");
vi.mock("@arizeai/phoenix-otel", () => ({
register: vi.fn(() => ({
getTracer: vi.fn(() => ({
startSpan: vi.fn(() => ({
end: vi.fn(),
setStatus: vi.fn(),
setAttribute: vi.fn(),
recordException: vi.fn(),
})),
startActiveSpan: vi.fn((name, fn) => {
// Execute the callback synchronously with a mock span
return fn({
end: vi.fn(),
setStatus: vi.fn(),
setAttribute: vi.fn(),
setAttributes: vi.fn(),
recordException: vi.fn(),
spanContext: vi.fn(() => ({
traceId: "mock-trace-id",
spanId: "mock-span-id",
})),
});
}),
})),
forceFlush: vi.fn(() => Promise.resolve()),
})),
trace: {
getTracer: vi.fn(() => ({
startSpan: vi.fn(() => ({
end: vi.fn(),
setStatus: vi.fn(),
setAttribute: vi.fn(),
recordException: vi.fn(),
})),
startActiveSpan: vi.fn((name, fn) => {
return fn({
end: vi.fn(),
setStatus: vi.fn(),
setAttribute: vi.fn(),
recordException: vi.fn(),
});
}),
})),
},
SpanStatusCode: {
OK: 1,
ERROR: 2,
},
objectAsAttributes: vi.fn((obj) => obj),
createNoOpProvider: vi.fn(),
NodeTracerProvider: vi.fn(),
Tracer: vi.fn(),
}));
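// Experiment fixture returned by the mocked getExperimentInfo: a two-example
// experiment whose runs have all completed successfully.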
const mockExperimentInfo = {
id: "exp-1",
datasetId: "dataset-1",
datasetVersionId: "v1",
repetitions: 1,
metadata: {},
projectName: "test-project",
createdAt: new Date().toISOString(),
updatedAt: new Date().toISOString(),
exampleCount: 2,
successfulRunCount: 2,
failedRunCount: 0,
missingRunCount: 0,
};
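// Fixture for the incomplete-evaluations endpoint: each entry pairs an
// experiment run with its dataset example and the evaluation names still
// missing. run-1 needs "correctness" and "relevance"; run-2 needs only
// "correctness".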
const mockIncompleteEvaluations = [
{
experiment_run: {
id: "run-1",
experiment_id: "exp-1",
dataset_example_id: "ex-1",
repetition_number: 1,
output: { text: "Hello, Alice!" },
start_time: new Date().toISOString(),
end_time: new Date().toISOString(),
error: null,
trace_id: null,
},
dataset_example: {
id: "ex-1",
input: { name: "Alice" },
output: { text: "Hello, Alice!" },
metadata: {},
},
evaluation_names: ["correctness", "relevance"],
},
{
experiment_run: {
id: "run-2",
experiment_id: "exp-1",
dataset_example_id: "ex-2",
repetition_number: 1,
output: { text: "Hi, Bob!" },
start_time: new Date().toISOString(),
end_time: new Date().toISOString(),
error: null,
trace_id: null,
},
dataset_example: {
id: "ex-2",
input: { name: "Bob" },
output: { text: "Hello, Bob!" },
metadata: {},
},
evaluation_names: ["correctness"],
},
];
describe("resumeEvaluation", () => {
let mockClient: PhoenixClient & { GET: Mock; POST: Mock };
beforeEach(() => {
vi.clearAllMocks();
// Mock getExperimentInfo
vi.spyOn(getExperimentInfoModule, "getExperimentInfo").mockResolvedValue(
mockExperimentInfo
);
// Create a partial mock client; cast through unknown so it satisfies
// PhoenixClient while still exposing vitest Mock helpers on GET and POST.
mockClient = {
GET: vi.fn(),
POST: vi.fn(),
config: {
baseUrl: "http://localhost:6006",
},
} as unknown as PhoenixClient & { GET: Mock; POST: Mock };
// Mock client.GET for incomplete evaluations
mockClient.GET.mockImplementation((url: string) => {
if (url.includes("incomplete-evaluations")) {
return Promise.resolve({
data: {
data: mockIncompleteEvaluations,
next_cursor: null,
},
});
}
return Promise.resolve({ data: {} });
});
// Mock client.POST for evaluation results
mockClient.POST.mockResolvedValue({
data: {
id: "eval-123",
},
});
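// Any client created internally via createClient should also resolve to this mock.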
vi.mocked(createClient).mockReturnValue(mockClient);
});
it("should resume incomplete evaluations with single-output evaluators", async () => {
const correctnessFn = vi.fn(
async ({ output, expected }: EvaluatorParams) => {
const expectedText = (expected as { text?: string })?.text ?? "";
const outputText = (output as { text?: string })?.text ?? "";
return {
score: outputText === expectedText ? 1 : 0,
label: outputText === expectedText ? "correct" : "incorrect",
};
}
);
const relevanceFn = vi.fn(async () => ({
score: 0.9,
label: "relevant",
}));
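// Wrap each raw evaluate function with a display name and annotator kind ("CODE" or "LLM").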
const correctnessEvaluator = asExperimentEvaluator({
name: "correctness",
kind: "CODE",
evaluate: correctnessFn,
});
const relevanceEvaluator = asExperimentEvaluator({
name: "relevance",
kind: "LLM",
evaluate: relevanceFn,
});
await resumeEvaluation({
experimentId: "exp-1",
evaluators: [correctnessEvaluator, relevanceEvaluator],
client: mockClient,
});
// Each evaluator should be called exactly once per matching incomplete evaluation
// correctness: 2 times (run-1 and run-2 both need it)
// relevance: 1 time (only run-1 needs it)
expect(correctnessFn).toHaveBeenCalledTimes(2);
expect(relevanceFn).toHaveBeenCalledTimes(1);
// Should fetch experiment info
expect(getExperimentInfoModule.getExperimentInfo).toHaveBeenCalledWith({
client: mockClient,
experimentId: "exp-1",
});
// Should fetch incomplete evaluations
expect(mockClient.GET).toHaveBeenCalledWith(
"/v1/experiments/{experiment_id}/incomplete-evaluations",
expect.objectContaining({
params: expect.objectContaining({
path: { experiment_id: "exp-1" },
}),
})
);
// Should submit evaluation results
// run-1 needs: correctness, relevance (2 evals)
// run-2 needs: correctness (1 eval)
// Total: 3 evaluations
expect(mockClient.POST).toHaveBeenCalledTimes(3);
expect(mockClient.POST).toHaveBeenCalledWith(
"/v1/experiment_evaluations",
expect.objectContaining({
body: expect.objectContaining({
experiment_run_id: expect.any(String),
name: expect.any(String),
annotator_kind: expect.any(String),
}),
})
);
});
it("should handle pagination of incomplete evaluations", async () => {
const evaluator = asExperimentEvaluator({
name: "correctness",
kind: "CODE",
evaluate: async () => ({ score: 1, label: "correct" }),
});
// Mock pagination: first call returns cursor, second returns no cursor
mockClient.GET.mockImplementation(
(url: string, options?: { params?: { query?: { cursor?: string } } }) => {
if (url.includes("incomplete-evaluations")) {
const cursor = options?.params?.query?.cursor;
if (!cursor) {
// First page
return Promise.resolve({
data: {
data: [mockIncompleteEvaluations[0]],
next_cursor: "cursor-1",
},
});
} else {
// Second page
return Promise.resolve({
data: {
data: [mockIncompleteEvaluations[1]],
next_cursor: null,
},
});
}
}
return Promise.resolve({ data: {} });
}
);
await resumeEvaluation({
experimentId: "exp-1",
evaluators: [evaluator],
client: mockClient,
});
// Should fetch incomplete evaluations twice (pagination)
const incompleteEvalsCalls = mockClient.GET.mock.calls.filter(
(call: unknown[]) =>
(call[0] as string).includes("incomplete-evaluations")
);
expect(incompleteEvalsCalls).toHaveLength(2);
// Second call should include cursor
expect(incompleteEvalsCalls[1][1]).toMatchObject({
params: {
query: expect.objectContaining({
cursor: "cursor-1",
}),
},
});
});
it("should handle empty incomplete evaluations", async () => {
const evaluator = asExperimentEvaluator({
name: "correctness",
kind: "CODE",
evaluate: async () => ({ score: 1, label: "correct" }),
});
// Mock no incomplete evaluations
mockClient.GET.mockImplementation((url: string) => {
if (url.includes("incomplete-evaluations")) {
return Promise.resolve({
data: {
data: [],
next_cursor: null,
},
});
}
return Promise.resolve({ data: {} });
});
await resumeEvaluation({
experimentId: "exp-1",
evaluators: [evaluator],
client: mockClient,
});
// Should not submit any evaluation results
expect(mockClient.POST).not.toHaveBeenCalled();
});
it("should handle evaluator failures gracefully", async () => {
const failingFn = vi.fn(async ({ output }: EvaluatorParams) => {
const outputText = (output as { text?: string })?.text ?? "";
if (outputText.includes("Alice")) {
throw new Error("Evaluator failed for Alice");
}
return { score: 1, label: "correct" };
});
const failingEvaluator = asExperimentEvaluator({
name: "correctness",
kind: "CODE",
evaluate: failingFn,
});
await resumeEvaluation({
experimentId: "exp-1",
evaluators: [failingEvaluator],
client: mockClient,
});
// Evaluator should be called exactly once per incomplete evaluation, even for failures
// Both runs need correctness evaluation, so 2 calls total (1 fails, 1 succeeds)
expect(failingFn).toHaveBeenCalledTimes(2);
// Should still attempt all evaluations even if some fail
expect(mockClient.POST).toHaveBeenCalled();
});
it("should validate inputs", async () => {
// Empty evaluators array
await expect(
resumeEvaluation({
experimentId: "exp-1",
evaluators: [],
client: mockClient,
})
).rejects.toThrow("Must specify at least one evaluator");
});
it("should respect custom concurrency", async () => {
const evaluator = asExperimentEvaluator({
name: "correctness",
kind: "CODE",
evaluate: async () => {
// Small delay to test concurrency
await new Promise((resolve) => setTimeout(resolve, 10));
return { score: 1, label: "correct" };
},
});
const startTime = Date.now();
await resumeEvaluation({
experimentId: "exp-1",
evaluators: [evaluator],
concurrency: 10,
client: mockClient,
});
const endTime = Date.now();
// With concurrency 10, both evaluations run in parallel, so the total time
// should stay well under 100 ms. This is a coarse timing check, not a strict
// guarantee.
expect(endTime - startTime).toBeLessThan(100);
});
describe("stopOnFirstError", () => {
// Test helper: creates an evaluator that fails for Alice
const createFailingEvaluator = (name = "correctness") => {
const evaluateFn = vi.fn(async ({ output }: EvaluatorParams) => {
const outputText = (output as { text?: string })?.text ?? "";
if (outputText.includes("Alice")) {
throw new Error("Evaluator failed for Alice");
}
return { score: 1, label: "correct" };
});
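// Expose both the wrapped evaluator and the raw spy so tests can assert call counts.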
return {
evaluator: asExperimentEvaluator({
name,
kind: "CODE" as const,
evaluate: evaluateFn,
}),
evaluateFn,
};
};
it("should stop on first error when stopOnFirstError is true", async () => {
const { evaluator, evaluateFn } = createFailingEvaluator();
await expect(
resumeEvaluation({
experimentId: "exp-1",
evaluators: [evaluator],
stopOnFirstError: true,
client: mockClient,
})
).rejects.toThrow("Evaluator failed for Alice");
expect(evaluateFn).toHaveBeenCalled();
});
it("should continue processing when stopOnFirstError is false (default)", async () => {
const { evaluator, evaluateFn } = createFailingEvaluator();
await resumeEvaluation({
experimentId: "exp-1",
evaluators: [evaluator],
stopOnFirstError: false,
client: mockClient,
});
expect(evaluateFn).toHaveBeenCalledTimes(2);
});
it("should stop fetching new pages when stopOnFirstError is triggered", async () => {
// Create more data to ensure pagination
const largeDataset = Array.from({ length: 100 }, (_, i) => ({
experiment_run: {
id: `run-${i}`,
experiment_id: "exp-1",
dataset_example_id: `ex-${i}`,
repetition_number: 1,
output: { text: i === 0 ? "Hello, Alice!" : "Hello, Bob!" },
start_time: new Date().toISOString(),
end_time: new Date().toISOString(),
error: null,
trace_id: null,
},
dataset_example: {
id: `ex-${i}`,
input: { name: i === 0 ? "Alice" : "Bob" },
output: { text: `Hello, ${i === 0 ? "Alice" : "Bob"}!` },
metadata: {},
updated_at: new Date().toISOString(),
},
evaluation_names: ["correctness"],
}));
// Mock pagination with multiple pages
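// Count page fetches so the test can assert that later pages are skipped after the failure.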
let pageCount = 0;
mockClient.GET.mockImplementation(
(
url: string,
options?: { params?: { query?: { cursor?: string; limit?: number } } }
) => {
if (url.includes("incomplete-evaluations")) {
pageCount++;
const limit = options?.params?.query?.limit ?? 50;
const cursor = options?.params?.query?.cursor;
const startIdx = cursor ? parseInt(cursor) : 0;
const endIdx = Math.min(startIdx + limit, largeDataset.length);
return Promise.resolve({
data: {
data: largeDataset.slice(startIdx, endIdx),
next_cursor:
endIdx < largeDataset.length ? String(endIdx) : null,
},
});
}
return Promise.resolve({ data: {} });
}
);
const { evaluator } = createFailingEvaluator();
await expect(
resumeEvaluation({
experimentId: "exp-1",
evaluators: [evaluator],
stopOnFirstError: true,
client: mockClient,
})
).rejects.toThrow("Evaluator failed for Alice");
expect(pageCount).toBeLessThan(3);
});
it("should record failed evaluations even when stopping early", async () => {
const { evaluator } = createFailingEvaluator();
await expect(
resumeEvaluation({
experimentId: "exp-1",
evaluators: [evaluator],
stopOnFirstError: true,
client: mockClient,
})
).rejects.toThrow();
expect(mockClient.POST).toHaveBeenCalledWith(
"/v1/experiment_evaluations",
expect.objectContaining({
body: expect.objectContaining({
error: "Evaluator failed for Alice",
}),
})
);
});
it("should stop all concurrent workers when one fails", async () => {
const evaluationOrder: string[] = [];
const failingFn = vi.fn(async ({ output }: EvaluatorParams) => {
const outputText = (output as { text?: string })?.text ?? "";
const runId = outputText.includes("Alice") ? "run-1" : "run-2";
evaluationOrder.push(runId);
// Add slight delay to ensure concurrency
await new Promise((resolve) => setTimeout(resolve, 5));
if (outputText.includes("Alice")) {
throw new Error("Evaluator failed for Alice");
}
return { score: 1, label: "correct" };
});
const failingEvaluator = asExperimentEvaluator({
name: "correctness",
kind: "CODE",
evaluate: failingFn,
});
try {
await resumeEvaluation({
experimentId: "exp-1",
evaluators: [failingEvaluator],
stopOnFirstError: true,
concurrency: 5,
client: mockClient,
});
} catch {
// Expected to throw
}
// With only two incomplete runs and a concurrency of 5, both may already be
// in flight when the failure occurs, so at most the two known runs should
// have been evaluated.
expect(evaluationOrder.length).toBeLessThanOrEqual(2);
});
it("should default to stopOnFirstError = false", async () => {
const { evaluator, evaluateFn } = createFailingEvaluator();
await resumeEvaluation({
experimentId: "exp-1",
evaluators: [evaluator],
client: mockClient,
});
expect(evaluateFn).toHaveBeenCalledTimes(2);
});
});
});