@arizeai/phoenix-mcp

Official

Overview Schema Related Servers Score Discussions

phoenix
app
tests

server-evaluators.spec.ts•19.6 KiB

import { randomUUID } from "crypto";

import { expect, test, type Page } from "@playwright/test";

/**
 * Opens the datasets list, follows the link for `datasetName`, and waits
 * until the browser URL matches the dataset's examples route.
 *
 * @param page - Playwright page driving the app under test.
 * @param datasetName - Accessible name of the dataset link to follow.
 */
async function openDataset(page: Page, datasetName: string): Promise<void> {
  await page.goto("/datasets");
  await page.getByRole("link", { name: datasetName }).click();
  await page.waitForURL("**/datasets/**/examples");
}

/**
 * Opens the dataset (see {@link openDataset}) and then switches to its
 * Evaluators tab, waiting for the URL to match the evaluators route.
 *
 * @param page - Playwright page driving the app under test.
 * @param datasetName - Accessible name of the dataset link to follow.
 */
async function openEvaluatorsTab(
  page: Page,
  datasetName: string
): Promise<void> {
  await openDataset(page, datasetName);
  await page.getByRole("tab", { name: /Evaluators/i }).click();
  await page.waitForURL("**/evaluators");
}

/**
 * From the evaluators table, opens the "Edit Evaluator" dialog for the row
 * whose cell is named exactly `evaluatorName`, and asserts that the dialog
 * heading is visible.
 *
 * @param page - Playwright page already showing the evaluators table.
 * @param evaluatorName - Exact accessible name of the evaluator's table cell.
 */
async function openEditEvaluatorDialog(
  page: Page,
  evaluatorName: string
): Promise<void> {
  const evaluatorRow = page.getByRole("row").filter({
    has: page.getByRole("cell", { name: evaluatorName, exact: true }),
  });

  // The last button in the row is the action menu (three dots)
  await evaluatorRow.getByRole("button").last().click();

  // Click "Edit" from the menu
  await page.getByRole("menuitem", { name: "Edit" }).click();

  // Verify the Edit Evaluator dialog opens
  await expect(
    page.getByRole("heading", { name: "Edit Evaluator" })
  ).toBeVisible();
}

/**
 * End-to-end coverage for dataset evaluators.
 *
 * The suite is serial because the tests build on one another: the first test
 * creates the dataset, later tests add evaluators to it, and the final tests
 * look those evaluators up by the names generated below.
 */
test.describe.serial("Server Evaluators", () => {
  const datasetName = `test-dataset-${randomUUID()}`;
  // Store the custom evaluator name for use across multiple tests
  const customEvaluatorName = `custom-eval-${randomUUID().slice(0, 8)}`;
  const updatedDescription = "Updated description for testing";
  // Store names for prebuilt evaluators
  const prebuiltLLMEvaluatorName = `correctness-${randomUUID().slice(0, 8)}`;
  const prebuiltCodeEvaluatorName = `exact-match-${randomUUID().slice(0, 8)}`;

  test("can create a dataset with an example", async ({ page }) => {
    await page.goto("/datasets");
    await page.waitForURL("**/datasets");

    // Click New Dataset button to open the create dataset dialog
    await page.getByRole("button", { name: "New Dataset" }).click();

    // Verify dialog opens with the correct heading
    await expect(
      page.getByRole("heading", { name: "Create Dataset" })
    ).toBeVisible();

    // Switch to the "From scratch" tab
    await page.getByRole("tab", { name: "From scratch" }).click();

    // Fill in dataset details in the dialog
    await page.getByLabel("Dataset Name").clear();
    await page.getByLabel("Dataset Name").fill(datasetName);
    await page.getByLabel("Description").fill("Test dataset for evaluators");

    // Create the dataset
    await page.getByRole("button", { name: "Create Dataset" }).click();

    // Wait for dialog to close and verify we're on the new dataset page
    await expect(page.getByTestId("dialog")).not.toBeVisible();

    // Wait for the dataset to appear in the table
    await expect(page.getByRole("link", { name: datasetName })).toBeVisible();

    // Navigate to the dataset to verify it was created
    await page.getByRole("link", { name: datasetName }).click();
    await page.waitForURL("**/datasets/**/examples");

    // Verify dataset was created
    await expect(
      page.getByRole("heading", { name: datasetName })
    ).toBeVisible();

    // Add an example to the dataset (required for playground to work)
    await page
      .getByRole("button", { name: "Add Dataset Example" })
      .or(page.getByRole("button", { name: "Example" }))
      .click();

    // Wait for the Add Example dialog to open
    await expect(page.getByRole("dialog")).toBeVisible();

    // Fill in the input field with valid JSON
    // JSONEditor renders a CodeMirror editor with .cm-content
    // Scope to the dialog to avoid picking up background editors
    const dialog = page.getByRole("dialog");
    const inputTextArea = dialog.locator(".cm-content").first();
    await expect(inputTextArea).toBeVisible();
    await inputTextArea.click();

    // Select all existing content and replace it
    await page.keyboard.press("ControlOrMeta+a");
    // Use insertText instead of type to bypass CodeMirror's bracket/quote
    // auto-closing which mangles character-by-character input
    await page.keyboard.insertText(
      '{"question": "What is 2+2?", "context": "Math"}'
    );

    // Uncheck the "Create more" checkbox so the dialog closes after adding.
    // Click the label text because React Aria's Checkbox has a hidden input
    // whose visual overlay intercepts pointer events on the native checkbox.
    await page.getByText("Create more", { exact: true }).click();

    // Click Add Example button to save
    await page.getByRole("button", { name: "Add Example" }).click();

    // Wait for dialog to close
    await expect(page.getByRole("dialog")).not.toBeVisible();

    // Verify the example appears in the table (or at least the table is no longer empty)
    await expect(page.getByRole("row")).toHaveCount(2); // header + 1 example
  });

  test("can navigate to evaluators tab", async ({ page }) => {
    // This test exercises the navigation itself, so the steps stay explicit
    // here rather than going through the shared helper.
    await page.goto("/datasets");
    await page.waitForURL("**/datasets");

    // Click on the dataset name
    await page.getByRole("link", { name: datasetName }).click();
    await page.waitForURL("**/datasets/**/examples");

    // Click on Evaluators tab
    await page.getByRole("tab", { name: /Evaluators/i }).click();
    await page.waitForURL("**/evaluators");

    // Verify we're on the evaluators tab
    await expect(
      page.getByRole("tab", { name: /Evaluators/i })
    ).toHaveAttribute("aria-selected", "true");

    // Verify the empty state shows prebuilt evaluator suggestions
    await expect(
      page.getByText("No evaluators added to this dataset")
    ).toBeVisible();
  });

  test("can add a prebuilt LLM evaluator (correctness)", async ({ page }) => {
    // Navigate to the dataset's evaluators tab
    await openEvaluatorsTab(page, datasetName);

    // Click Add evaluator button
    await page.getByRole("button", { name: "Add evaluator" }).click();

    // Hover over "Use LLM evaluator template" to open submenu
    await page
      .getByRole("menuitem", { name: "Use LLM evaluator template" })
      .hover();

    // Wait for submenu to appear and click "Correctness"
    await page
      .getByRole("menuitem", { name: /Correctness/i })
      .first()
      .click();

    // Verify the Create Evaluator dialog opens with prefilled template
    await expect(
      page.getByRole("heading", { name: "Create Evaluator" })
    ).toBeVisible();

    // Update the name to our unique test name
    const nameInput = page.getByRole("textbox", { name: "Name" }).first();
    await nameInput.clear();
    await nameInput.fill(prebuiltLLMEvaluatorName);

    // Click Create button
    await page.getByRole("button", { name: "Create" }).click();

    // Wait for dialog to close
    await expect(page.getByTestId("dialog")).not.toBeVisible();

    // Verify the evaluator appears in the table
    await expect(
      page.getByRole("cell", { name: prebuiltLLMEvaluatorName, exact: true })
    ).toBeVisible();
  });

  test("can add a prebuilt code evaluator (exact_match)", async ({ page }) => {
    // Navigate to the dataset's evaluators tab
    await openEvaluatorsTab(page, datasetName);

    // Click Add evaluator button
    await page.getByRole("button", { name: "Add evaluator" }).click();

    // Hover over "Use built-in code evaluator" to open submenu
    await page
      .getByRole("menuitem", { name: "Use built-in code evaluator" })
      .hover();

    // Wait for submenu to appear and click "exact_match"
    await page.getByRole("menuitem", { name: /exact_match/i }).click();

    // Verify the Create Evaluator dialog opens
    await expect(
      page.getByRole("heading", { name: "Create Evaluator" })
    ).toBeVisible();

    // Update the name to our unique test name
    const nameInput = page.getByRole("textbox", { name: "Name" }).first();
    await nameInput.clear();
    await nameInput.fill(prebuiltCodeEvaluatorName);

    // Fill in the required Expected field (allowsCustomValue ComboBox)
    await page
      .getByRole("combobox", { name: "Expected path mapping" })
      .fill("input.question");

    // Fill in the required Actual field (allowsCustomValue ComboBox)
    await page
      .getByRole("combobox", { name: "Actual path mapping" })
      .fill("input.context");

    // Click Create button
    await page.getByRole("button", { name: "Create" }).click();

    // Wait for dialog to close
    await expect(page.getByTestId("dialog")).not.toBeVisible();

    // Verify the evaluator appears in the table
    await expect(
      page.getByRole("cell", { name: prebuiltCodeEvaluatorName, exact: true })
    ).toBeVisible();
  });

  test("can configure input mapping for code evaluator", async ({ page }) => {
    // Navigate to the dataset's evaluators tab
    await openEvaluatorsTab(page, datasetName);

    // Open the Edit Evaluator dialog from the code evaluator's row
    await openEditEvaluatorDialog(page, prebuiltCodeEvaluatorName);

    // Find the Expected field's input mode selector and verify it exists
    // The SwitchableEvaluatorInput has a mode toggle (path vs literal)
    const expectedLabel = page.getByText("Expected", { exact: true });
    await expect(expectedLabel).toBeVisible();

    // Find the Actual field and verify it exists
    const actualLabel = page.getByText("Actual", { exact: true });
    await expect(actualLabel).toBeVisible();

    // Find and verify the case sensitive switch exists
    const caseSensitiveSwitch = page.getByRole("switch", {
      name: /Case sensitive/i,
    });
    await expect(caseSensitiveSwitch).toBeVisible();

    // Toggle the case sensitive switch off
    // Note: React Aria's Switch creates a hidden input with role="switch".
    // Clicking via getByRole targets this hidden input which can cause issues.
    // Click the label text instead which properly toggles the switch.
    await page.getByText("Case sensitive", { exact: true }).click();

    // Click Update button
    await page.getByRole("button", { name: "Update" }).click();

    // Wait for dialog to close
    await expect(page.getByTestId("dialog")).not.toBeVisible();

    // Verify the evaluator still appears in the table
    await expect(
      page.getByRole("cell", { name: prebuiltCodeEvaluatorName, exact: true })
    ).toBeVisible();
  });

  test("can create a custom LLM evaluator from scratch", async ({ page }) => {
    // Navigate to the dataset's evaluators tab
    await openEvaluatorsTab(page, datasetName);

    // Click Add evaluator button
    await page.getByRole("button", { name: "Add evaluator" }).click();

    // Select "Create new LLM evaluator" from the dropdown
    await page
      .getByRole("menuitem", { name: "Create new LLM evaluator" })
      .click();

    // Verify the Create Evaluator dialog opens
    await expect(
      page.getByRole("heading", { name: "Create Evaluator" })
    ).toBeVisible();

    // Fill in the evaluator name. Clear first, as the other create tests do,
    // so any prefilled value cannot leak into the final name.
    const nameInput = page.getByRole("textbox", { name: "Name" }).first();
    await nameInput.clear();
    await nameInput.fill(customEvaluatorName);

    // Fill in the description
    await page
      .getByRole("textbox", { name: /Description/i })
      .fill("Initial description for custom evaluator");

    // Fill in the System message - find the textbox within the System section
    const systemSection = page.locator(
      'button:has-text("System"):not([role="menuitem"])'
    );
    const systemTextbox = systemSection
      .locator("..")
      .locator("..")
      .getByRole("textbox");
    await systemTextbox.fill("You are an evaluator. Evaluate the output.");

    // Fill in the User message - find the textbox within the User section
    const userSection = page.locator(
      'button:has-text("User"):not([role="menuitem"])'
    );
    const userTextbox = userSection
      .locator("..")
      .locator("..")
      .getByRole("textbox")
      .first();
    await userTextbox.fill(
      "Please evaluate this output: {{output}}\n\nReference: {{reference}}"
    );

    // Click Create button
    await page.getByRole("button", { name: "Create" }).click();

    // Wait for dialog to close
    await expect(page.getByTestId("dialog")).not.toBeVisible();

    // Verify the evaluator appears in the table
    await expect(
      page.getByRole("cell", { name: customEvaluatorName, exact: true })
    ).toBeVisible();
  });

  test("can edit an LLM evaluator", async ({ page }) => {
    // Navigate to the dataset's evaluators tab
    await openEvaluatorsTab(page, datasetName);

    // Open the Edit Evaluator dialog from the custom evaluator's row
    await openEditEvaluatorDialog(page, customEvaluatorName);

    // Update the description
    const descriptionInput = page.getByRole("textbox", {
      name: /Description/i,
    });
    await descriptionInput.clear();
    await descriptionInput.fill(updatedDescription);

    // Click Update button
    await page.getByRole("button", { name: "Update" }).click();

    // Wait for dialog to close
    await expect(page.getByTestId("dialog")).not.toBeVisible();

    // Verify the evaluator still appears in the table
    await expect(
      page.getByRole("cell", { name: customEvaluatorName, exact: true })
    ).toBeVisible();
  });

  test("can verify evaluator edits were saved", async ({ page }) => {
    // Navigate to the dataset's evaluators tab
    await openEvaluatorsTab(page, datasetName);

    // Open the Edit Evaluator dialog from the custom evaluator's row
    await openEditEvaluatorDialog(page, customEvaluatorName);

    // Verify the updated description is present
    const descriptionInput = page.getByRole("textbox", {
      name: /Description/i,
    });
    await expect(descriptionInput).toHaveValue(updatedDescription);

    // Close the dialog
    await page.getByRole("button", { name: "Cancel" }).click();
    await expect(page.getByTestId("dialog")).not.toBeVisible();
  });

  test("evaluators are visible in playground when dataset is selected", async ({
    page,
  }) => {
    // First, navigate to the dataset to get its ID from the URL
    await openDataset(page, datasetName);

    // Extract the dataset ID from the URL
    const match = page.url().match(/datasets\/([^/]+)/);
    const datasetId = match?.[1] ?? "";
    expect(datasetId).toBeTruthy();

    // Navigate to the playground with the dataset selected
    await page.goto(`/playground?datasetId=${datasetId}`);

    // Wait for the playground to load with dataset mode
    await page.waitForURL(`**/playground?datasetId=${datasetId}`);

    // Check if the playground is showing the "No provider" message
    // If so, we error and fail the test
    const noProviderMessage = page.getByText(
      "The playground is not available until an LLM provider client is installed"
    );

    // Whichever of the two texts becomes visible first decides the outcome
    const readinessResult = await Promise.race([
      noProviderMessage.waitFor({ state: "visible" }).then(() => "no-provider"),
      page
        .getByText("Experiment", { exact: true })
        .waitFor({ state: "visible" })
        .then(() => "ready"),
    ]);

    if (readinessResult === "no-provider") {
      throw new Error(
        "Playground requires an LLM provider to be installed. Playwright test environment is not configured correctly."
      );
    }

    // Wait for the playground title to appear first
    await expect(
      page.getByRole("heading", { name: "Playground" })
    ).toBeVisible();

    // Wait for the "Experiment" text to appear, which indicates
    // the dataset section has loaded (this appears in PlaygroundExperimentToolbar)
    await expect(page.getByText("Experiment", { exact: true })).toBeVisible();

    // Find and click the Evaluators button to open the evaluators menu
    // Use the button inside the content area (not the tab)
    const evaluatorsButton = page
      .getByTestId("content")
      .getByRole("button", { name: /Evaluators/i });
    await expect(evaluatorsButton).toBeVisible();
    await evaluatorsButton.click();

    // Wait for the evaluators menu to appear - the GridList has aria-label="Select evaluators"
    // React Aria GridList renders with role="grid"
    const evaluatorsList = page.locator('[aria-label="Select evaluators"]');
    await expect(evaluatorsList).toBeVisible();

    // Verify that the prebuilt LLM evaluator (correctness) appears in the list
    // GridList items render as role="row"
    await expect(
      evaluatorsList.getByRole("row", {
        name: new RegExp(prebuiltLLMEvaluatorName),
      })
    ).toBeVisible();

    // Verify that the prebuilt code evaluator (exact match) appears in the list
    await expect(
      evaluatorsList.getByRole("row", {
        name: new RegExp(prebuiltCodeEvaluatorName),
      })
    ).toBeVisible();

    // Verify that the custom LLM evaluator appears in the list
    await expect(
      evaluatorsList.getByRole("row", { name: new RegExp(customEvaluatorName) })
    ).toBeVisible();
  });

  test("can navigate to evaluator details pages", async ({ page }) => {
    // Navigate to the dataset's evaluators tab
    await openEvaluatorsTab(page, datasetName);

    // Navigate to the LLM evaluator details page
    await page
      .getByRole("link", { name: prebuiltLLMEvaluatorName, exact: true })
      .click();
    await page.waitForURL("**/evaluators/**");

    // Verify the LLM evaluator details page loaded
    await expect(
      page.getByRole("heading", { name: prebuiltLLMEvaluatorName })
    ).toBeVisible();

    // Navigate back to the evaluators tab
    await openEvaluatorsTab(page, datasetName);

    // Navigate to the built-in code evaluator details page
    await page
      .getByRole("link", { name: prebuiltCodeEvaluatorName, exact: true })
      .click();
    await page.waitForURL("**/evaluators/**");

    // Verify the built-in evaluator details page loaded
    await expect(
      page.getByRole("heading", { name: prebuiltCodeEvaluatorName })
    ).toBeVisible();
  });
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Arize-ai/phoenix'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

server-evaluators.spec.ts•19.6 KiB