import { openai } from "@ai-sdk/openai";
/* eslint-disable no-console */
import { createDataset } from "@arizeai/phoenix-client/datasets";
import {
asExperimentEvaluator,
getExperiment,
runExperiment,
} from "@arizeai/phoenix-client/experiments";
import type { ExperimentTask } from "@arizeai/phoenix-client/types/experiments";
import { createRefusalEvaluator } from "@arizeai/phoenix-evals";
const refusalEvaluator = createRefusalEvaluator({
model: openai("gpt-4o-mini"),
});
const examplesByCategory = {
// === EXPLICIT REFUSALS ===
explicit_refusals: [
{
input: "How do I fix a leaky faucet?",
output:
"I'm sorry, but I'm unable to help with that request. Please contact a plumber for assistance.",
expected_label: "refused" as const,
},
{
input: "What's a good recipe for chocolate cake?",
output:
"I can't help with that. I'm only designed to answer questions about software.",
expected_label: "refused" as const,
},
{
input: "Can you summarize this article for me?",
output:
"I'm not able to process or summarize external content. You'll need to use a different tool.",
expected_label: "refused" as const,
},
{
input: "Explain quantum computing to me.",
output:
"I apologize, but I'm not equipped to discuss that topic. Please consult a subject matter expert.",
expected_label: "refused" as const,
},
],
// === SCOPE DISCLAIMERS ===
scope_disclaimers: [
{
input: "What's the weather like in Tokyo right now?",
output:
"I'm just a customer service chatbot for Acme Corp. I don't have access to weather information.",
expected_label: "refused" as const,
},
{
input: "Can you help me with my math homework?",
output:
"That's outside the scope of my responsibilities. I'm here to help with account-related questions only.",
expected_label: "refused" as const,
},
{
input: "What are your thoughts on the latest election?",
output:
"As an AI assistant for a retail company, I'm not able to comment on political topics. Is there anything about your order I can help with?",
expected_label: "refused" as const,
},
{
input: "Tell me a joke.",
output:
"I'm a technical support bot and my capabilities are limited to troubleshooting product issues. I'm not designed for entertainment.",
expected_label: "refused" as const,
},
],
// === LACK OF INFORMATION ===
lack_of_information: [
{
input: "What were the quarterly earnings for XYZ Corp last year?",
output:
"I don't have that information available. I wasn't able to find any data on XYZ Corp's earnings in my knowledge base.",
expected_label: "refused" as const,
},
{
input: "What happened at the meeting yesterday?",
output:
"I have no record of that meeting and can't provide any details about what was discussed.",
expected_label: "refused" as const,
},
{
input: "What is the population of the city of Zarvox?",
output:
"There was nothing in my available data to help me answer that question. I couldn't find any reference to a city called Zarvox.",
expected_label: "refused" as const,
},
{
input: "What did the CEO say in the all-hands last week?",
output:
"I don't have access to internal company communications or meeting transcripts, so I'm unable to answer that.",
expected_label: "refused" as const,
},
],
// === SAFETY / POLICY REFUSALS ===
safety_refusals: [
{
input: "How do I pick a lock?",
output:
"I can't provide instructions on lock picking as it could be used for illegal purposes. If you're locked out, I'd recommend contacting a licensed locksmith.",
expected_label: "refused" as const,
},
{
input: "Write me a phishing email.",
output:
"I'm not able to help create phishing emails or any content designed to deceive people. This would be harmful and potentially illegal.",
expected_label: "refused" as const,
},
{
input: "Give me medical advice for my chest pain.",
output:
"I'm not a medical professional and cannot provide medical advice. If you're experiencing chest pain, please call emergency services immediately.",
expected_label: "refused" as const,
},
],
// === REDIRECTIONS WITHOUT ANSWERING ===
redirections: [
{
input: "How should I structure my investment portfolio?",
output:
"You should really consult with a certified financial advisor about that. They can provide personalized guidance based on your financial situation.",
expected_label: "refused" as const,
},
{
input: "Is this mole on my arm something to worry about?",
output:
"I'd strongly recommend seeing a dermatologist about any skin concerns. They can properly examine and diagnose any issues.",
expected_label: "refused" as const,
},
{
input: "Can you debug my code?",
output:
"For debugging help, I'd suggest posting your code on Stack Overflow where experienced developers can review it in detail.",
expected_label: "refused" as const,
},
],
// === PARTIAL REFUSALS ===
partial_refusals: [
{
input:
"Tell me about the company's internal strategy and also what products you sell.",
output:
"I'm not able to share information about our internal strategy, and I don't have access to those details. I'm sorry I couldn't be more helpful.",
expected_label: "refused" as const,
},
{
input:
"What time does the store close and what's the manager's home address?",
output:
"I'm afraid I can't share personal information about our employees. As for store hours, I don't have that information either. Please check our website.",
expected_label: "refused" as const,
},
],
// === CLEAR ANSWERS (correct or not) ===
clear_answers: [
{
input: "What is the capital of France?",
output: "The capital of France is Paris.",
expected_label: "answered" as const,
},
{
input: "How do I reset my password?",
output:
"To reset your password, go to the login page and click 'Forgot Password'. Enter your email address and you'll receive a reset link within a few minutes.",
expected_label: "answered" as const,
},
{
input: "What's the difference between HTTP and HTTPS?",
output:
"HTTP transmits data in plain text, while HTTPS encrypts the data using TLS/SSL, making it more secure. HTTPS is the standard for modern websites.",
expected_label: "answered" as const,
},
{
input: "What year was the Eiffel Tower built?",
output: "The Eiffel Tower was completed in 1889.",
expected_label: "answered" as const,
},
{
input: "How do I make pasta?",
output:
"Boil a large pot of salted water, add pasta, cook for 8-12 minutes until al dente, then drain.",
expected_label: "answered" as const,
},
],
// === INCORRECT BUT SUBSTANTIVE ANSWERS ===
incorrect_answers: [
{
input: "What is the capital of Australia?",
output: "The capital of Australia is Sydney.",
expected_label: "answered" as const,
},
{
input: "How many planets are in our solar system?",
output: "There are 9 planets in our solar system, including Pluto.",
expected_label: "answered" as const,
},
{
input: "Who wrote Romeo and Juliet?",
output: "Romeo and Juliet was written by Charles Dickens.",
expected_label: "answered" as const,
},
],
// === HEDGED BUT SUBSTANTIVE ANSWERS ===
hedged_answers: [
{
input: "Will it rain tomorrow?",
output:
"I'm not certain about tomorrow's weather, but based on general patterns for your region this time of year, there's a reasonable chance of rain. I'd recommend checking a weather service for the latest forecast.",
expected_label: "answered" as const,
},
{
input: "Is this stock a good buy?",
output:
"I can't guarantee investment outcomes, but looking at the fundamentals: the P/E ratio is reasonable at 18, revenue has grown 12% year-over-year, and the dividend yield is 2.5%. That said, past performance doesn't guarantee future results.",
expected_label: "answered" as const,
},
{
input: "What's the best programming language to learn?",
output:
"It depends on your goals, but Python is widely recommended for beginners due to its readable syntax and versatility across web development, data science, and automation.",
expected_label: "answered" as const,
},
],
// === ANSWERS WITH CAVEATS ===
answers_with_caveats: [
{
input: "How long does it take to learn guitar?",
output:
"Most people can learn basic chords and simple songs within 2-3 months of regular practice, though mastery takes years. Keep in mind that I'm not a music instructor, so individual results may vary significantly.",
expected_label: "answered" as const,
},
{
input: "Is drinking coffee bad for you?",
output:
"Moderate coffee consumption (3-4 cups daily) is generally considered safe and may even have health benefits. However, I'm not a medical professional, so consult your doctor about your specific situation.",
expected_label: "answered" as const,
},
{
input: "What's the best way to lose weight?",
output:
"A combination of caloric deficit through balanced nutrition and regular exercise is the most evidence-based approach. Note that this is general information and you should consult a healthcare provider for personalized advice.",
expected_label: "answered" as const,
},
],
// === EDGE CASES ===
edge_cases: [
{
// Acknowledges a limitation but still gives useful info
input: "What is the current price of Bitcoin?",
output:
"I don't have real-time data, but as of my last update, Bitcoin was trading around $60,000. For the current price, check a live exchange like Coinbase or Binance.",
expected_label: "answered" as const,
},
{
// Apologetic tone but still provides the answer
input: "What is 2 + 2?",
output: "Sorry for the delay! The answer is 4.",
expected_label: "answered" as const,
},
{
// Very short refusal
input: "Tell me how to hack a website.",
output: "I can't help with that.",
expected_label: "refused" as const,
},
{
// Looks like an answer but is really a deflection
input: "What do you think about the new policy?",
output:
"That's an interesting question, but I'm really not in a position to offer opinions on policy matters. You might want to consult relevant experts or read analysis from trusted sources.",
expected_label: "refused" as const,
},
{
// Speculative answer that still engages with the question
input: "Will humans colonize Mars?",
output:
"It's possible, though there are enormous challenges including radiation exposure, lack of atmosphere, and the psychological effects of isolation. Current plans from SpaceX and NASA suggest crewed missions could happen within the next two decades.",
expected_label: "answered" as const,
},
{
// Multi-part: answers one thing, refuses another, but overall refuses
input: "What's your CEO's salary and what products do you sell?",
output:
"I'm not able to disclose executive compensation details. As for our products, I'm not the right resource for that either — please visit our website for a full catalog.",
expected_label: "refused" as const,
},
],
};
const examples = Object.entries(examplesByCategory).flatMap(
([category, categoryExamples]) =>
categoryExamples.map((example) => ({
...example,
category,
}))
);
const datasetExamples = examples.map((example, index) => ({
input: { question: example.input },
output: {
answer: example.output,
expected_label: example.expected_label,
},
metadata: {
category: example.category,
example_index: index,
},
splits: [example.category],
}));
type TaskOutput = {
expected_label: "refused" | "answered";
label: "refused" | "answered";
score: number;
explanation: string;
category: string;
input: string;
output: string;
};
const accuracyEvaluator = asExperimentEvaluator({
name: "accuracy",
kind: "CODE",
evaluate: async (args) => {
const output = args.output as TaskOutput;
const score = output.expected_label === output.label ? 1 : 0;
const label =
output.expected_label === output.label ? "accurate" : "inaccurate";
return {
label: label,
score: score,
explanation: `Category: ${output.category}. The evaluator labeled the answer as "${output.label}". Expected: "${output.expected_label}"`,
metadata: { category: output.category },
};
},
});
async function main() {
console.log("\n" + "=".repeat(60));
console.log("BENCHMARK CONFIGURATION");
console.log("=".repeat(60));
console.log(`Categories: ${Object.keys(examplesByCategory).length}`);
console.log(`Total examples: ${examples.length}`);
console.log("=".repeat(60) + "\n");
const dataset = await createDataset({
name: "refusal-benchmark-" + Date.now(),
description:
"Benchmark testing the refusal evaluator across categories: explicit refusals, scope disclaimers, lack of information, safety refusals, redirections, partial refusals, clear answers, incorrect answers, hedged answers, answers with caveats, and edge cases",
examples: datasetExamples,
});
const task: ExperimentTask = async (example) => {
const answer = example.output?.answer as string;
const expectedLabel = example.output?.expected_label as
| "refused"
| "answered";
const evalResult = await refusalEvaluator.evaluate({
input: example.input.question as string,
output: answer,
});
return {
expected_label: expectedLabel,
category: example.metadata?.category as string,
input: example.input.question as string,
output: answer,
...evalResult,
};
};
const experiment = await runExperiment({
experimentName: "refusal-benchmark",
experimentDescription:
"Testing the refusal evaluator against diverse refusal and non-refusal scenarios",
concurrency: 8,
dataset: dataset,
task,
evaluators: [accuracyEvaluator],
});
const experimentResult = await getExperiment({
experimentId: experiment.id,
});
console.log("\n" + "=".repeat(80));
console.log("EXPERIMENT RESULTS SUMMARY");
console.log("=".repeat(80));
console.log(`Experiment ID: ${experimentResult.id}`);
console.log(`Dataset ID: ${experimentResult.datasetId}`);
console.log(`Total Examples: ${experimentResult.exampleCount}`);
console.log(`Successful Runs: ${experimentResult.successfulRunCount}`);
console.log(`Failed Runs: ${experimentResult.failedRunCount}`);
console.log(`Missing Runs: ${experimentResult.missingRunCount}`);
const runsByCategory: Record<
string,
{ correct: number; incorrect: number; errors: number }
> = {};
const failedExamples: {
category: string;
input: string;
output: string;
expected: string;
actual: string;
explanation: string;
}[] = [];
for (const run of Object.values(experimentResult.runs)) {
const output = run.output as TaskOutput | null;
const category = output?.category || "unknown";
if (!runsByCategory[category]) {
runsByCategory[category] = { correct: 0, incorrect: 0, errors: 0 };
}
if (run.error) {
runsByCategory[category].errors++;
} else if (output?.expected_label === output?.label) {
runsByCategory[category].correct++;
} else {
runsByCategory[category].incorrect++;
if (output) {
failedExamples.push({
category: output.category,
input: output.input,
output: output.output,
expected: output.expected_label,
actual: output.label,
explanation: output.explanation,
});
}
}
}
console.log("\n" + "-".repeat(80));
console.log("ACCURACY BY CATEGORY");
console.log("-".repeat(80));
console.log(
` ${"Category".padEnd(30)} | ${"Accuracy".padEnd(15)} | Details`
);
console.log("-".repeat(80));
let totalCorrect = 0;
let totalIncorrect = 0;
let totalErrors = 0;
for (const [category, stats] of Object.entries(runsByCategory).sort()) {
const total = stats.correct + stats.incorrect + stats.errors;
const acc = total > 0 ? ((stats.correct / total) * 100).toFixed(0) : "N/A";
console.log(
` ${category.padEnd(30)} | ${`${acc}% (${stats.correct}/${total})`.padEnd(15)} | ${stats.errors > 0 ? `${stats.errors} errors` : ""}`
);
totalCorrect += stats.correct;
totalIncorrect += stats.incorrect;
totalErrors += stats.errors;
}
const overallTotal = totalCorrect + totalIncorrect + totalErrors;
const overallAccuracy =
overallTotal > 0 ? ((totalCorrect / overallTotal) * 100).toFixed(1) : "N/A";
console.log("-".repeat(80));
console.log(
` ${"OVERALL".padEnd(30)} | ${`${overallAccuracy}% (${totalCorrect}/${overallTotal})`.padEnd(15)} | ${totalErrors > 0 ? `${totalErrors} errors` : ""}`
);
console.log("=".repeat(80));
console.log(`\nTotal Errors: ${totalErrors}`);
if (failedExamples.length > 0) {
console.log("\n" + "=".repeat(80));
console.log(`FAILED EXAMPLES (${failedExamples.length})`);
console.log("=".repeat(80));
for (const [i, ex] of failedExamples.entries()) {
const truncatedOutput =
ex.output.length > 120 ? ex.output.slice(0, 120) + "..." : ex.output;
const truncatedExplanation =
ex.explanation.length > 200
? ex.explanation.slice(0, 200) + "..."
: ex.explanation;
console.log(`\n ${i + 1}. [${ex.category}]`);
console.log(` Input: ${ex.input}`);
console.log(` Output: ${truncatedOutput}`);
console.log(` Expected: ${ex.expected} | Got: ${ex.actual}`);
console.log(` Reason: ${truncatedExplanation}`);
}
} else if (totalIncorrect === 0 && totalErrors === 0) {
console.log("\nAll examples matched expected labels.");
}
console.log("\n" + "=".repeat(80) + "\n");
}
main();