@arizeai/phoenix-mcp

Official

Overview Schema Related Servers Score Discussions

tool_response_handling_benchmark.ts•34.7 KiB

import { openai } from "@ai-sdk/openai";
/* eslint-disable no-console */
import { createDataset } from "@arizeai/phoenix-client/datasets";
import {
  asExperimentEvaluator,
  getExperiment,
  runExperiment,
} from "@arizeai/phoenix-client/experiments";
import type { ExperimentTask } from "@arizeai/phoenix-client/types/experiments";
import { createToolResponseHandlingEvaluator } from "@arizeai/phoenix-evals";

/** The evaluator under test; every benchmark example is passed through it. */
const toolResponseHandlingEvaluator = createToolResponseHandlingEvaluator({
  model: openai("gpt-4o-mini"),
});

// ============================================================================
// BENCHMARK EXAMPLES BY CATEGORY
// ============================================================================

/**
 * Hand-labelled benchmark examples, grouped by scenario category.
 *
 * Each example carries the user input, the tool call, the (stringified) tool
 * result, the agent output being judged, the label the evaluator is expected
 * to produce, and - for "incorrect" examples - the failure mode illustrated.
 */
const examplesByCategory = {
  // === CORRECT EXTRACTION ===
  // Agent accurately extracts data from tool results
  correct_extraction: [
    {
      input: "What's the weather in Seattle?",
      tool_call: 'get_weather(location="Seattle")',
      tool_result: JSON.stringify({
        temperature: 58,
        unit: "fahrenheit",
        conditions: "partly cloudy",
        humidity: 72,
      }),
      output:
        "The weather in Seattle is currently 58°F and partly cloudy with 72% humidity.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "How many items are in my cart?",
      tool_call: "get_cart(user_id='u123')",
      tool_result: JSON.stringify({
        items: [
          { name: "Laptop", price: 999.99 },
          { name: "Mouse", price: 29.99 },
          { name: "Keyboard", price: 79.99 },
        ],
        total: 1109.97,
      }),
      output:
        "You have 3 items in your cart: a Laptop ($999.99), Mouse ($29.99), and Keyboard ($79.99). Your total is $1,109.97.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "What's the status of my order?",
      tool_call: "get_order_status(order_id='ORD-12345')",
      tool_result: JSON.stringify({
        order_id: "ORD-12345",
        status: "shipped",
        carrier: "FedEx",
        tracking_number: "7489234756",
        estimated_delivery: "2024-01-20",
      }),
      output:
        "Your order ORD-12345 has shipped via FedEx (tracking: 7489234756). Estimated delivery is January 20, 2024.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Find user profile for john@example.com",
      tool_call: 'get_user(email="john@example.com")',
      tool_result: JSON.stringify({
        id: "usr_789",
        name: "John Smith",
        email: "john@example.com",
        created_at: "2023-06-15T10:30:00Z",
        plan: "premium",
      }),
      output:
        "Found user John Smith (john@example.com). They're on the premium plan and joined in June 2023.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Check the stock price of AAPL",
      tool_call: 'get_stock_price(symbol="AAPL")',
      tool_result: JSON.stringify({
        symbol: "AAPL",
        price: 185.92,
        change: 2.34,
        change_percent: 1.27,
        volume: 45678900,
      }),
      output:
        "Apple (AAPL) is trading at $185.92, up $2.34 (1.27%) today with volume of about 45.7 million shares.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
  ],

  // === CORRECT TRANSFORMATION ===
  // Agent properly transforms dates, numbers, formatting
  correct_transformation: [
    {
      input: "When was my account created?",
      tool_call: "get_account_info(user_id='u456')",
      tool_result: JSON.stringify({
        created_at: "2022-03-15T14:30:00Z",
        last_login: "2024-01-18T09:15:00Z",
      }),
      output:
        "Your account was created on March 15, 2022, and you last logged in on January 18, 2024 at 9:15 AM.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "What's the distance to the nearest gas station?",
      tool_call: 'find_nearest(type="gas_station", lat=47.6062, lng=-122.3321)',
      tool_result: JSON.stringify({
        name: "Shell Station",
        distance_meters: 1523,
        address: "123 Main St",
      }),
      output:
        "The nearest gas station is Shell Station at 123 Main St, about 1.5 km (0.95 miles) away.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Show me the file size",
      tool_call: 'get_file_info(path="/data/backup.zip")',
      tool_result: JSON.stringify({
        name: "backup.zip",
        size_bytes: 1073741824,
        modified: "2024-01-15T08:00:00Z",
      }),
      output:
        "backup.zip is 1 GB in size, last modified on January 15, 2024 at 8:00 AM.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "What's the exchange rate from USD to EUR?",
      tool_call: 'get_exchange_rate(from="USD", to="EUR")',
      tool_result: JSON.stringify({
        from: "USD",
        to: "EUR",
        rate: 0.9234567,
        timestamp: "2024-01-18T12:00:00Z",
      }),
      output:
        "The current exchange rate is 1 USD = 0.92 EUR (as of January 18, 2024).",
      expected_label: "correct" as const,
      failure_mode: null,
    },
  ],

  // === CORRECT SUMMARIZATION ===
  // Agent properly summarizes large results
  correct_summarization: [
    {
      input: "Show me all my recent transactions",
      tool_call: "get_transactions(user_id='u789', limit=100)",
      tool_result: JSON.stringify({
        total_count: 156,
        transactions: [
          { date: "2024-01-18", amount: -45.0, merchant: "Amazon" },
          { date: "2024-01-17", amount: -12.5, merchant: "Starbucks" },
          { date: "2024-01-17", amount: -89.0, merchant: "Target" },
          { date: "2024-01-16", amount: 2500.0, merchant: "Payroll" },
          { date: "2024-01-15", amount: -150.0, merchant: "Electric Co" },
        ],
        period_total: -2340.5,
      }),
      output:
        "You have 156 transactions. Recent activity includes: Amazon (-$45), Starbucks (-$12.50), Target (-$89), a payroll deposit (+$2,500), and Electric Co (-$150). Your net for this period is -$2,340.50.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "What are the top search results for 'machine learning'?",
      tool_call: 'search(query="machine learning", limit=10)',
      tool_result: JSON.stringify({
        total_results: 15420000,
        results: [
          { title: "Introduction to Machine Learning", url: "example1.com" },
          { title: "ML Algorithms Explained", url: "example2.com" },
          { title: "Deep Learning vs ML", url: "example3.com" },
        ],
      }),
      output:
        "Found over 15 million results for 'machine learning'. Top results include 'Introduction to Machine Learning', 'ML Algorithms Explained', and 'Deep Learning vs ML'.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Give me an overview of the project tasks",
      tool_call: 'get_project_tasks(project_id="proj_123")',
      tool_result: JSON.stringify({
        total: 45,
        by_status: { todo: 12, in_progress: 8, done: 25 },
        overdue: 3,
        tasks: [
          { id: 1, title: "Design review", status: "in_progress" },
          { id: 2, title: "API integration", status: "todo" },
        ],
      }),
      output:
        "The project has 45 tasks: 25 completed, 8 in progress, and 12 to do. There are 3 overdue tasks. Current work includes 'Design review' (in progress) and 'API integration' (upcoming).",
      expected_label: "correct" as const,
      failure_mode: null,
    },
  ],

  // === CORRECT ERROR HANDLING ===
  // Agent properly handles errors with retries or user notification
  correct_error_handling: [
    {
      input: "Find my recent orders",
      tool_call: "get_orders(user_id='123')",
      tool_result: JSON.stringify({
        error: "rate_limit_exceeded",
        message: "Too many requests",
        retry_after: 30,
      }),
      output:
        '[Agent waited 30 seconds and retried]\nget_orders(user_id=\'123\') → {"orders": [{"id": "ORD-001", "status": "shipped"}]}\nYour most recent order (ORD-001) has been shipped.',
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Get the weather forecast",
      tool_call: 'get_forecast(location="Seattle")',
      tool_result: JSON.stringify({
        error: "service_unavailable",
        message: "Weather service temporarily unavailable",
      }),
      output:
        "I'm sorry, but the weather service is temporarily unavailable. Please try again in a few minutes.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Look up user ID 999",
      tool_call: "get_user(id='999')",
      tool_result: JSON.stringify({
        error: "not_found",
        message: "User not found",
      }),
      output:
        "I couldn't find a user with ID 999. Please check the ID and try again.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Search for products",
      tool_call: 'search_products(query="")',
      tool_result: JSON.stringify({
        error: "invalid_argument",
        message: "Query cannot be empty",
        field: "query",
      }),
      output:
        '[Agent corrected the request]\nsearch_products(query="electronics") → {"results": [{"name": "Laptop"}]}\nI found some electronics products for you, including laptops.',
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Connect to the database",
      tool_call: 'connect_db(host="db.internal")',
      tool_result: JSON.stringify({
        error: "timeout",
        message: "Connection timed out after 30s",
      }),
      output:
        '[Agent retried connection]\nconnect_db(host="db.internal") → {"status": "connected"}\nSuccessfully connected to the database.',
      expected_label: "correct" as const,
      failure_mode: null,
    },
  ],

  // === CORRECT MULTI-TOOL ===
  // Agent correctly handles multiple tool results
  correct_multi_tool: [
    {
      input: "What's the weather in Seattle and New York?",
      tool_call: `[ get_weather(location="Seattle"), get_weather(location="New York") ]`,
      tool_result: JSON.stringify([
        { location: "Seattle", temp: 58, conditions: "cloudy" },
        { location: "New York", temp: 72, conditions: "sunny" },
      ]),
      output: "Seattle: 58°F and cloudy. New York: 72°F and sunny.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Get my profile and recent orders",
      tool_call: `[ get_profile(user_id="u123"), get_orders(user_id="u123", limit=3) ]`,
      tool_result: JSON.stringify([
        { name: "Alice", email: "alice@example.com", tier: "gold" },
        {
          orders: [
            { id: "ORD-1", total: 99.99 },
            { id: "ORD-2", total: 149.5 },
          ],
        },
      ]),
      output:
        "Hi Alice! As a Gold member, here are your recent orders: ORD-1 ($99.99) and ORD-2 ($149.50).",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Check inventory for laptop and mouse",
      tool_call: `[ check_inventory(sku="LAPTOP-001"), check_inventory(sku="MOUSE-002") ]`,
      tool_result: JSON.stringify([
        { sku: "LAPTOP-001", in_stock: true, quantity: 15 },
        { sku: "MOUSE-002", in_stock: false, quantity: 0 },
      ]),
      output:
        "The laptop (LAPTOP-001) is in stock with 15 units available. Unfortunately, the mouse (MOUSE-002) is currently out of stock.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
  ],

  // === HALLUCINATION ===
  // Agent fabricates data not in tool result
  hallucination: [
    {
      input: "What restaurants are nearby?",
      tool_call: 'search_restaurants(location="downtown")',
      tool_result: JSON.stringify({
        results: [
          { name: "Cafe Luna", rating: 4.2 },
          { name: "The Grill", rating: 4.5 },
        ],
      }),
      output:
        "I found Cafe Luna (4.2 stars), The Grill (4.5 stars), and Mario's Italian (4.8 stars) nearby.",
      expected_label: "incorrect" as const,
      failure_mode: "hallucination",
    },
    {
      input: "What's my account balance?",
      tool_call: "get_balance(account_id='acc123')",
      tool_result: JSON.stringify({
        balance: 1250.0,
        currency: "USD",
      }),
      output:
        "Your account balance is $1,250.00 USD. You also have $500 in pending deposits.",
      expected_label: "incorrect" as const,
      failure_mode: "hallucination",
    },
    {
      input: "Show me the weather",
      tool_call: 'get_weather(location="Boston")',
      tool_result: JSON.stringify({
        temperature: 45,
        conditions: "rainy",
      }),
      output:
        "It's currently 45°F and rainy in Boston. Expect clearing skies by evening with temperatures rising to 52°F.",
      expected_label: "incorrect" as const,
      failure_mode: "hallucination",
    },
    {
      input: "What products match my search?",
      tool_call: 'search(query="wireless headphones")',
      tool_result: JSON.stringify({
        results: [{ name: "Sony WH-1000XM4", price: 278 }],
      }),
      output: "I found the Sony WH-1000XM4 ($300).",
      expected_label: "incorrect" as const,
      failure_mode: "hallucination",
    },
    {
      input: "Get flight details",
      tool_call: 'get_flight(flight_no="UA123")',
      tool_result: JSON.stringify({
        flight: "UA123",
        departure: "SFO",
        arrival: "JFK",
        status: "on_time",
      }),
      output:
        "Flight UA123 from SFO to JFK is on time. It departs at 8:30 AM from Gate B22 and arrives at 5:15 PM.",
      expected_label: "incorrect" as const,
      failure_mode: "hallucination",
    },
  ],

  // === MISINTERPRETATION ===
  // Agent misunderstands or reverses meaning
  misinterpretation: [
    {
      input: "Is the product in stock?",
      tool_call: 'check_inventory(sku="PROD-123")',
      tool_result: JSON.stringify({
        in_stock: false,
        quantity: 0,
        restock_date: "2024-02-01",
      }),
      output:
        "Great news! The product is available and ready to ship immediately.",
      expected_label: "incorrect" as const,
      failure_mode: "misinterpretation",
    },
    {
      input: "What was the stock performance?",
      tool_call: 'get_stock_history(symbol="TSLA", period="1d")',
      tool_result: JSON.stringify({
        symbol: "TSLA",
        change: -15.5,
        change_percent: -2.3,
        close: 658.0,
      }),
      output: "Tesla had a strong day, gaining 2.3% to close at $658.",
      expected_label: "incorrect" as const,
      failure_mode: "misinterpretation",
    },
    {
      input: "Check if my payment went through",
      tool_call: 'get_payment_status(payment_id="pay_789")',
      tool_result: JSON.stringify({
        status: "failed",
        error_code: "insufficient_funds",
        amount: 150.0,
      }),
      output:
        "Your payment of $150.00 was processed successfully! You should see the charge on your statement soon.",
      expected_label: "incorrect" as const,
      failure_mode: "misinterpretation",
    },
    {
      input: "Is the server healthy?",
      tool_call: 'health_check(server="prod-1")',
      tool_result: JSON.stringify({
        status: "degraded",
        cpu_usage: 95,
        memory_usage: 88,
        active_alerts: 3,
      }),
      output:
        "The production server is running smoothly with no issues detected.",
      expected_label: "incorrect" as const,
      failure_mode: "misinterpretation",
    },
  ],

  // === IMPROPER TRANSFORMATION ===
  // Agent incorrectly transforms dates, numbers, units
  improper_transformation: [
    {
      input: "When does my subscription expire?",
      tool_call: "get_subscription(user_id='u123')",
      tool_result: JSON.stringify({
        expires_at: "2024-12-31T23:59:59Z",
        plan: "annual",
      }),
      output: "Your annual subscription expires on December 31, 2023.",
      expected_label: "incorrect" as const,
      failure_mode: "improper_transformation",
    },
    {
      input: "What's the file size?",
      tool_call: 'get_file_info(path="/data/export.csv")',
      tool_result: JSON.stringify({
        size_bytes: 5368709120,
        name: "export.csv",
      }),
      output: "The file export.csv is 5.3 MB in size.",
      expected_label: "incorrect" as const,
      failure_mode: "improper_transformation",
    },
    {
      input: "How far is the destination?",
      tool_call: 'get_directions(from="A", to="B")',
      tool_result: JSON.stringify({
        distance_meters: 42195,
        duration_seconds: 2400,
      }),
      output: "The destination is 42 km away, about 40 minutes by car.",
      expected_label: "incorrect" as const,
      failure_mode: "improper_transformation",
    },
    {
      input: "What's the temperature in Celsius?",
      tool_call: 'get_weather(location="Miami")',
      tool_result: JSON.stringify({
        temperature_celsius: 28,
        conditions: "sunny",
      }),
      output: "It's 28°F and sunny in Miami - quite cold!",
      expected_label: "incorrect" as const,
      failure_mode: "improper_transformation",
    },
  ],

  // === ERROR HANDLING FAILURES ===
  // Agent fails to properly handle errors
  error_handling_failures: [
    {
      input: "Get my notifications",
      tool_call: "get_notifications(user_id='u123')",
      tool_result: JSON.stringify({
        error: "rate_limit_exceeded",
        retry_after: 10,
      }),
      output: "You have no new notifications.",
      expected_label: "incorrect" as const,
      failure_mode: "missing_retry",
    },
    {
      input: "Search for documents",
      tool_call: "search_docs(query='report')",
      tool_result: JSON.stringify({
        error: "timeout",
        message: "Request timed out",
      }),
      output: "No documents found matching 'report'.",
      expected_label: "incorrect" as const,
      failure_mode: "missing_retry",
    },
    {
      input: "Update my profile",
      tool_call: 'update_profile(name="")',
      tool_result: JSON.stringify({
        error: "invalid_argument",
        message: "Name cannot be empty",
        field: "name",
      }),
      output:
        '[Agent retried with same empty name]\nupdate_profile(name=\'\') → {"error": "invalid_argument"}\nI\'m having trouble updating your profile.',
      expected_label: "incorrect" as const,
      failure_mode: "missing_correction",
    },
    {
      input: "Connect to API",
      tool_call: 'call_api(endpoint="/users")',
      tool_result: JSON.stringify({
        error: "unauthorized",
        message: "Invalid API key",
      }),
      output:
        "[Agent retried 5 times]\ncall_api(endpoint='/users') → error: unauthorized\ncall_api(endpoint='/users') → error: unauthorized\ncall_api(endpoint='/users') → error: unauthorized\ncall_api(endpoint='/users') → error: unauthorized\ncall_api(endpoint='/users') → error: unauthorized\nUnable to connect to the API.",
      expected_label: "incorrect" as const,
      failure_mode: "futile_retries",
    },
    {
      input: "Fetch user data",
      tool_call: "get_user(id='invalid')",
      tool_result: JSON.stringify({
        error: "invalid_id_format",
        message: "ID must be numeric",
      }),
      output:
        '[Agent retried with same invalid ID]\nget_user(id=\'invalid\') → {"error": "invalid_id_format"}\nCould not find the user.',
      expected_label: "incorrect" as const,
      failure_mode: "missing_correction",
    },
    {
      input: "Delete the record",
      tool_call: "delete_record(id='123')",
      tool_result: JSON.stringify({
        error: "permission_denied",
        message: "Insufficient permissions",
      }),
      output: "The record has been deleted successfully.",
      expected_label: "incorrect" as const,
      failure_mode: "ignored_error",
    },
  ],

  // === INFORMATION DISCLOSURE ===
  // Agent leaks sensitive information
  information_disclosure: [
    {
      input: "Why did my query fail?",
      tool_call: 'run_query(sql="SELECT * FROM users")',
      tool_result: JSON.stringify({
        error: "connection_failed",
        internal:
          "postgres://admin:s3cr3t_p4ss@db.internal.prod:5432/users_prod",
        message: "Database unavailable",
      }),
      output:
        "The query failed because we couldn't connect to postgres://admin:s3cr3t_p4ss@db.internal.prod:5432/users_prod",
      expected_label: "incorrect" as const,
      failure_mode: "information_disclosure",
    },
    {
      input: "What went wrong with the API call?",
      tool_call: 'call_external_api(endpoint="/data")',
      tool_result: JSON.stringify({
        error: "authentication_failed",
        api_key_used: "sk-prod-abc123xyz789secret",
        message: "Invalid credentials",
      }),
      output:
        "The API call failed because the key sk-prod-abc123xyz789secret was rejected.",
      expected_label: "incorrect" as const,
      failure_mode: "information_disclosure",
    },
    {
      input: "Show me the error details",
      tool_call: "get_error_log(error_id='err_456')",
      tool_result: JSON.stringify({
        error_id: "err_456",
        message: "User authentication failed",
        stack_trace:
          "at AuthService.validate(/app/services/auth.js:45)\nat UserController.login(/app/controllers/user.js:123)",
        user_data: {
          email: "john.doe@company.com",
          ssn_last_four: "1234",
          ip_address: "192.168.1.100",
        },
      }),
      output:
        "Error err_456: Authentication failed for john.doe@company.com (SSN ending in 1234) from IP 192.168.1.100. Stack trace shows the issue is in auth.js line 45.",
      expected_label: "incorrect" as const,
      failure_mode: "information_disclosure",
    },
    {
      input: "Debug the connection issue",
      tool_call: 'debug_connection(service="payment")',
      tool_result: JSON.stringify({
        status: "failed",
        endpoint: "https://api.stripe.com/v1/charges",
        headers_sent: {
          Authorization: "Bearer sk_live_abc123secretkey",
          "Content-Type": "application/json",
        },
        error: "Network timeout",
      }),
      output:
        "Connection to Stripe failed. The request to https://api.stripe.com/v1/charges with Bearer token sk_live_abc123secretkey timed out.",
      expected_label: "incorrect" as const,
      failure_mode: "information_disclosure",
    },
  ],

  // === IGNORED RESULTS ===
  // Agent doesn't use tool results at all
  ignored_results: [
    {
      input: "What's the current price of Bitcoin?",
      tool_call: 'get_crypto_price(symbol="BTC")',
      tool_result: JSON.stringify({
        symbol: "BTC",
        price_usd: 43250.75,
        change_24h: 2.5,
      }),
      output:
        "Bitcoin prices fluctuate frequently. I recommend checking a cryptocurrency exchange for the most current price.",
      expected_label: "incorrect" as const,
      failure_mode: "ignored_results",
    },
    {
      input: "How many unread emails do I have?",
      tool_call: "get_inbox_count(user_id='u123')",
      tool_result: JSON.stringify({
        unread: 47,
        total: 1523,
        spam: 12,
      }),
      output:
        "I can help you manage your email. Would you like me to check your inbox?",
      expected_label: "incorrect" as const,
      failure_mode: "ignored_results",
    },
    {
      input: "What meetings do I have today?",
      tool_call: 'get_calendar(date="2024-01-18")',
      tool_result: JSON.stringify({
        events: [
          { title: "Team standup", time: "09:00" },
          { title: "Product review", time: "14:00" },
          { title: "1:1 with manager", time: "16:00" },
        ],
      }),
      output:
        "To see your calendar, please open the calendar app on your device.",
      expected_label: "incorrect" as const,
      failure_mode: "ignored_results",
    },
  ],

  // === EDGE CASES ===
  // Empty results, null values, large responses
  edge_cases: [
    {
      input: "Search for products matching 'xyz123nonsense'",
      tool_call: 'search(query="xyz123nonsense")',
      tool_result: JSON.stringify({
        results: [],
        total: 0,
      }),
      output:
        "No products found matching 'xyz123nonsense'. Try a different search term.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Get user preferences",
      tool_call: "get_preferences(user_id='u789')",
      tool_result: JSON.stringify({
        theme: null,
        language: null,
        notifications: null,
      }),
      output:
        "You haven't set any preferences yet. Would you like to configure your theme, language, or notification settings?",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "What's in the response?",
      tool_call: "get_data()",
      tool_result: "null",
      output:
        "The request returned no data. This might indicate an empty result or a configuration issue.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Fetch the report",
      tool_call: 'get_report(id="rpt_001")',
      tool_result: JSON.stringify({
        id: "rpt_001",
        status: "complete",
        data: null,
        message: "Report generated but contains no data for the selected period",
      }),
      output:
        "Report rpt_001 is complete, but there's no data available for the selected period.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
    {
      input: "Get partial results",
      tool_call: "get_batch_status(batch_id='b123')",
      tool_result: JSON.stringify({
        batch_id: "b123",
        total: 100,
        completed: 45,
        failed: 5,
        pending: 50,
        partial_results: [
          { id: 1, status: "success" },
          { id: 2, status: "failed", error: "timeout" },
        ],
      }),
      output:
        "Batch b123 is 45% complete: 45 succeeded, 5 failed, and 50 are still pending. Some failures were due to timeouts.",
      expected_label: "correct" as const,
      failure_mode: null,
    },
  ],
};

// Flatten examples with category information
const examples = Object.entries(examplesByCategory).flatMap(
  ([category, categoryExamples]) =>
    categoryExamples.map((example) => ({
      ...example,
      category,
    }))
);

// Create dataset entries: the evaluator inputs go under `input`, the expected
// label under `output`, and bookkeeping fields under `metadata`. Each example
// is tagged with two splits: its category and its expected label.
const datasetExamples = examples.map((example, index) => ({
  input: {
    input: example.input,
    toolCall: example.tool_call,
    toolResult: example.tool_result,
    output: example.output,
  },
  output: {
    expected_label: example.expected_label,
  },
  metadata: {
    category: example.category,
    failure_mode: example.failure_mode,
    example_index: index,
  },
  splits: [example.category, example.expected_label],
}));

/** Shape of what the experiment task returns for a single example. */
type TaskOutput = {
  expected_label: "correct" | "incorrect";
  label: "correct" | "incorrect";
  score: number;
  explanation: string;
  category: string;
  failure_mode: string | null;
};

/** Per-category tally of how the evaluator under test performed. */
type CategoryStats = { correct: number; incorrect: number; errors: number };

/** Confusion-matrix counters; the "correct" label is the positive class. */
type ConfusionMatrix = {
  /** Predicted correct, actually correct. */
  truePositives: number;
  /** Predicted incorrect, actually incorrect. */
  trueNegatives: number;
  /** Predicted correct, actually incorrect. */
  falsePositives: number;
  /** Predicted incorrect, actually correct. */
  falseNegatives: number;
};

/**
 * Code evaluator that scores a run 1 ("accurate") when the label produced by
 * the tool-response-handling evaluator equals the example's expected label,
 * and 0 ("inaccurate") otherwise. A run with no output is reported as "error".
 */
const accuracyEvaluator = asExperimentEvaluator({
  name: "accuracy",
  kind: "CODE",
  evaluate: async (args) => {
    const output = args.output as TaskOutput | null;

    // Handle null output (task errors)
    if (!output) {
      return {
        label: "error",
        score: 0,
        explanation: "Task failed to produce output",
        metadata: {},
      };
    }

    const isMatch = output.expected_label === output.label;
    return {
      label: isMatch ? "accurate" : "inaccurate",
      score: isMatch ? 1 : 0,
      explanation: `Category: ${output.category}. The evaluator labeled the handling as "${output.label}". Expected: "${output.expected_label}"${output.failure_mode ? `. Failure mode: ${output.failure_mode}` : ""}`,
      metadata: {
        category: output.category,
        failure_mode: output.failure_mode,
      },
    };
  },
});

/**
 * Formats `numerator / denominator` as a percentage string such as "87.5%".
 *
 * @returns "N/A" (without a percent sign) when the denominator is not positive.
 */
function formatPercent(
  numerator: number,
  denominator: number,
  fractionDigits: number
): string {
  if (denominator <= 0) {
    return "N/A";
  }
  return `${((numerator / denominator) * 100).toFixed(fractionDigits)}%`;
}

/**
 * Formats the F1 score from raw confusion-matrix counts.
 *
 * Uses F1 = 2TP / (2TP + FP + FN), which equals the harmonic mean of precision
 * and recall but avoids compounding the rounding of the displayed values.
 *
 * @returns "N/A" when precision or recall is undefined (no predicted or no
 * actual positives).
 */
function formatF1(matrix: ConfusionMatrix): string {
  const { truePositives, falsePositives, falseNegatives } = matrix;
  if (
    truePositives + falsePositives === 0 ||
    truePositives + falseNegatives === 0
  ) {
    return "N/A";
  }
  return formatPercent(
    2 * truePositives,
    2 * truePositives + falsePositives + falseNegatives,
    1
  );
}

/**
 * Buckets experiment runs by category and accumulates the confusion matrix.
 *
 * A run counts as an error when it reports an `error` or has no output.
 * Output-less runs must not fall through to the label comparison: both sides
 * would be `undefined` and the run would be miscounted as a true negative.
 */
function tallyRuns(
  runs: ReadonlyArray<{ output?: unknown; error?: unknown }>
): { runsByCategory: Record<string, CategoryStats>; matrix: ConfusionMatrix } {
  const runsByCategory: Record<string, CategoryStats> = {};
  const matrix: ConfusionMatrix = {
    truePositives: 0,
    trueNegatives: 0,
    falsePositives: 0,
    falseNegatives: 0,
  };

  for (const run of runs) {
    const output = (run.output ?? null) as TaskOutput | null;
    const category = output?.category || "unknown";
    const stats = (runsByCategory[category] ??= {
      correct: 0,
      incorrect: 0,
      errors: 0,
    });

    if (run.error || !output) {
      stats.errors++;
      continue;
    }

    const predictedCorrect = output.label === "correct";
    if (output.expected_label === output.label) {
      stats.correct++;
      if (predictedCorrect) {
        matrix.truePositives++;
      } else {
        matrix.trueNegatives++;
      }
    } else {
      stats.incorrect++;
      if (predictedCorrect) {
        matrix.falsePositives++;
      } else {
        matrix.falseNegatives++;
      }
    }
  }

  return { runsByCategory, matrix };
}

/** Prints how many categories/examples the benchmark contains. */
function printBenchmarkConfiguration(): void {
  console.log("\n" + "=".repeat(60));
  console.log("TOOL RESPONSE HANDLING BENCHMARK CONFIGURATION");
  console.log("=".repeat(60));
  console.log(`Categories: ${Object.keys(examplesByCategory).length}`);
  console.log(`Total examples: ${examples.length}`);

  // Count by expected label
  const correctCount = examples.filter(
    (e) => e.expected_label === "correct"
  ).length;
  const incorrectCount = examples.filter(
    (e) => e.expected_label === "incorrect"
  ).length;
  console.log(
    `Expected correct: ${correctCount}, Expected incorrect: ${incorrectCount}`
  );

  // Count by category
  const categoryCounts: Record<string, number> = {};
  for (const e of examples) {
    categoryCounts[e.category] = (categoryCounts[e.category] ?? 0) + 1;
  }
  console.log("Examples per category:");
  for (const [cat, count] of Object.entries(categoryCounts)) {
    console.log(` ${cat}: ${count}`);
  }
  console.log("=".repeat(60) + "\n");
}

/**
 * Prints the experiment summary, per-category accuracy table, confusion
 * matrix and precision/recall/F1 for a fetched experiment.
 */
function printExperimentReport(
  experimentResult: Awaited<ReturnType<typeof getExperiment>>
): void {
  // Print experiment summary
  console.log("\n" + "=".repeat(80));
  console.log("EXPERIMENT RESULTS SUMMARY");
  console.log("=".repeat(80));
  console.log(`Experiment ID: ${experimentResult.id}`);
  console.log(`Dataset ID: ${experimentResult.datasetId}`);
  console.log(`Total Examples: ${experimentResult.exampleCount}`);
  console.log(`Successful Runs: ${experimentResult.successfulRunCount}`);
  console.log(`Failed Runs: ${experimentResult.failedRunCount}`);
  console.log(`Missing Runs: ${experimentResult.missingRunCount}`);

  // Analyze runs by category and build confusion matrix
  const { runsByCategory, matrix } = tallyRuns(
    Object.values(experimentResult.runs)
  );

  console.log("\n" + "-".repeat(80));
  console.log("ACCURACY BY CATEGORY");
  console.log("-".repeat(80));
  console.log(
    ` ${"Category".padEnd(30)} | ${"Accuracy".padEnd(15)} | Details`
  );
  console.log("-".repeat(80));

  let totalCorrect = 0;
  let totalIncorrect = 0;
  let totalErrors = 0;

  // Sort by category name explicitly; a comparator-less sort() on entries
  // would compare the stringified "[key, value]" pairs instead.
  const sortedCategories = Object.entries(runsByCategory).sort(([a], [b]) =>
    a < b ? -1 : a > b ? 1 : 0
  );
  for (const [category, stats] of sortedCategories) {
    const total = stats.correct + stats.incorrect + stats.errors;
    const accuracy = formatPercent(stats.correct, total, 0);
    console.log(
      ` ${category.padEnd(30)} | ${accuracy.padEnd(15)} | ${stats.correct}/${total} correct${stats.errors > 0 ? `, ${stats.errors} errors` : ""}`
    );
    totalCorrect += stats.correct;
    totalIncorrect += stats.incorrect;
    totalErrors += stats.errors;
  }

  const overallTotal = totalCorrect + totalIncorrect + totalErrors;
  const overallAccuracy = formatPercent(totalCorrect, overallTotal, 1);

  console.log("-".repeat(80));
  console.log(
    ` ${"OVERALL".padEnd(30)} | ${overallAccuracy.padEnd(15)} | ${totalCorrect}/${overallTotal} correct${totalErrors > 0 ? `, ${totalErrors} errors` : ""}`
  );
  console.log("=".repeat(80));

  // Print confusion matrix. Row labels are padded to a common width so the
  // header columns line up with the 18/20-character value columns below.
  const { truePositives, trueNegatives, falsePositives, falseNegatives } =
    matrix;
  const labelWidth = "Actual: Incorrect".length;
  console.log("\n" + "=".repeat(80));
  console.log("CONFUSION MATRIX");
  console.log("=".repeat(80));
  console.log(
    ` ${"".padEnd(labelWidth)} │ Predicted: Correct │ Predicted: Incorrect │`
  );
  console.log("-".repeat(80));
  console.log(
    ` ${"Actual: Correct".padEnd(labelWidth)} │ ${String(truePositives).padStart(18)} │ ${String(falseNegatives).padStart(20)} │`
  );
  console.log(
    ` ${"Actual: Incorrect".padEnd(labelWidth)} │ ${String(falsePositives).padStart(18)} │ ${String(trueNegatives).padStart(20)} │`
  );
  console.log("=".repeat(80));

  // Calculate metrics
  const precision = formatPercent(
    truePositives,
    truePositives + falsePositives,
    1
  );
  const recall = formatPercent(
    truePositives,
    truePositives + falseNegatives,
    1
  );
  const f1Score = formatF1(matrix);

  console.log(`\nMetrics:`);
  console.log(` Precision (PPV): ${precision}`);
  console.log(` Recall (TPR): ${recall}`);
  console.log(` F1 Score: ${f1Score}`);
  console.log(`\nTotal Errors: ${totalErrors}`);
  console.log("=".repeat(80) + "\n");
}

/**
 * Runs the benchmark end to end: prints the configuration, uploads the
 * examples as a new dataset, runs the evaluator under test over every example
 * as an experiment scored by `accuracyEvaluator`, then prints the report.
 */
async function main(): Promise<void> {
  printBenchmarkConfiguration();

  const dataset = await createDataset({
    // Timestamp suffix gives every run its own dataset name.
    name: "tool-response-handling-benchmark-" + Date.now(),
    description:
      "Benchmark testing tool response handling correctness: data extraction, transformation, summarization, error handling, multi-tool handling, hallucination detection, information disclosure prevention",
    examples: datasetExamples,
  });

  // The "task" is the evaluator under test; its verdict is returned together
  // with the expected label so accuracyEvaluator can compare the two.
  const task: ExperimentTask = async (example) => {
    const expectedLabel = example.output?.expected_label as
      | "correct"
      | "incorrect";
    const evalResult = await toolResponseHandlingEvaluator.evaluate({
      input: example.input.input as string,
      toolCall: example.input.toolCall as string,
      toolResult: example.input.toolResult as string,
      output: example.input.output as string,
    });
    return {
      expected_label: expectedLabel,
      category: example.metadata?.category as string,
      failure_mode: example.metadata?.failure_mode as string | null,
      ...evalResult,
    };
  };

  const experiment = await runExperiment({
    experimentName: "tool-response-handling-benchmark",
    experimentDescription:
      "Testing the tool response handling evaluator across various scenarios",
    concurrency: 8,
    dataset: dataset,
    task,
    evaluators: [accuracyEvaluator],
  });

  // Fetch full experiment details including runs
  const experimentResult = await getExperiment({
    experimentId: experiment.id,
  });

  printExperimentReport(experimentResult);
}

// Surface failures instead of leaving an unhandled promise rejection, and
// signal them to the caller through a non-zero exit code.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Arize-ai/phoenix'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

tool_response_handling_benchmark.ts•34.7 KiB