@arizeai/phoenix-mcp

Official

Overview Inspect Schema Related Servers Score Discussions

configure-experiments.ts•2.53 kB

import "dotenv/config";
import { openai } from "@ai-sdk/openai";
import { movieAgent } from "../mastra/agents/movie-agent";
import { createOrGetDataset } from "@arizeai/phoenix-client/datasets";
import type { Example } from "@arizeai/phoenix-client/types/datasets";
import { createClassificationEvaluator } from "@arizeai/phoenix-evals";

// ---------------------------------------------------------------------------
// Step 1: the task under evaluation — ask the movie agent a single question.
// ---------------------------------------------------------------------------

/**
 * Runs the movie agent against one dataset example.
 *
 * @param example - Dataset example; its `input.question` field is forwarded
 *   to the agent as the prompt (asserted to be a string, not checked at runtime).
 * @returns The `text` of the agent's result, or an empty string when that
 *   text is missing or empty.
 */
export async function task(example: Example): Promise<string> {
  const { text } = await movieAgent.generate(example.input.question as string);
  // Fall back to "" so callers always receive a string.
  return text ? text : "";
}

// ---------------------------------------------------------------------------
// Step 2: the questions that make up the dataset.
// ---------------------------------------------------------------------------

const DATASET = [
  "Which horror movie should I watch next?",
  "Give me a good comedy movie to watch tonight.",
  "Recommend a comedy that is also a musical",
  "Show me a popular movie that didn’t do well at the box office",
  "What horror movies are not too violent",
  "Name a feel-good holiday movie",
  "Recommend a musical with great songs",
  "Give me a classic drama from the 90s",
  "Name a movie that is a classic action movie",
  "Which Batman movie should I watch?",
];

/**
 * Result of `createOrGetDataset` for the "movie-rec-questions" dataset, with
 * one example per entry of `DATASET` (each stored under `input.question`).
 * Awaited at module top level, so importing this module triggers the call.
 */
export const dataset = await createOrGetDataset({
  name: "movie-rec-questions",
  description: "Questions to ask a movie recommendation agent",
  examples: DATASET.map((prompt) => ({ input: { question: prompt } })),
});

// ---------------------------------------------------------------------------
// Step 3: the evaluator that grades each recommendation.
// ---------------------------------------------------------------------------

// Prompt handed to the judge model. `{{input.question}}` and `{{output}}` are
// template placeholders; the text asks for a one-word `correct`/`incorrect` verdict.
const RECOMMENDATION_RELEVANCE = ` You are evaluating the relevance of movie recommendations provided by an LLM application. You will be given: 1. The user input that initiated the trace 2. The list of movie recommendations output by the system ## User Input: {{input.question}} Recommendations: {{output}} ## Respond with exactly one word: \`correct\` or \`incorrect\`. 1. \`correct\` → - All recommended movies match the requested genre or criteria in the user input. 
- The recommendations should be relevant to the user's request and shouldn't be repetitive. 2.\`incorrect\` → one or more recommendations do not match the requested genre or criteria. `;

// Numeric score attached to each label the judge may return.
const RELEVANCE_CHOICES = {
  correct: 1,
  incorrect: 0,
};

/**
 * Classification evaluator named "Relevance": uses `openai("gpt-5")` with the
 * `RECOMMENDATION_RELEVANCE` prompt and maps `correct` to 1 and `incorrect` to 0.
 */
export const recommendationRelevanceEvaluator = createClassificationEvaluator({
  name: "Relevance",
  model: openai("gpt-5"),
  promptTemplate: RECOMMENDATION_RELEVANCE,
  choices: RELEVANCE_CHOICES,
});

Latest Blog Posts

What Is Context Bloat in MCP?
By Om-Shree-0709 on December 16, 2025.
mcp
Context Bloat
MCP Moves to the Linux Foundation: Neutral Stewardship for Agentic Infrastructure
By Om-Shree-0709 on December 15, 2025.
mcp
anthropic
Linux Foundation
Code Execution with MCP: Architecting Agentic Efficiency
By Om-Shree-0709 on December 14, 2025.
mcp
Token bloat

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Arize-ai/phoenix'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.