#!/usr/bin/env tsx
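/**
 * Evaluation runner for the Browserbase MCP Server.
 *
 * Splits the workflows in an mcpvals config file into one temporary
 * single-workflow config each, runs evaluate() on them concurrently, and
 * passes or fails the run based on the average score against a threshold
 * taken from the CLI, the environment, or the config file.
 */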
import { Command } from "commander";
import * as fs from "fs/promises";
import * as path from "path";
import { evaluate } from "mcpvals";
import os from "os";
import chalk from "chalk";
// Load environment variables from .env file
import { config } from "dotenv";
config();

// Types for evaluation results
interface EvaluationResult {
  workflowName: string;
  passed: boolean;
  overallScore: number;
  results: Array<{
    metric: string;
    passed: boolean;
    score: number;
    details: string;
    metadata?: Record<string, unknown>;
  }>;
}

interface EvaluationReport {
  config: Record<string, unknown>;
  evaluations: EvaluationResult[];
  passed: boolean;
  timestamp: Date;
}

interface TestResult {
  config: string;
  passed: boolean;
  score: number;
  duration: number;
  workflows: {
    name: string;
    passed: boolean;
    score: number;
  }[];
}

interface EvalConfig {
  workflows: Array<{ name?: string }>;
  passThreshold?: number;
  [key: string]: unknown;
}
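
// Illustrative config shape (values below are hypothetical; real mcpvals
// configs carry additional fields, such as the server under test, which the
// EvalConfig index signature passes through untouched):
//
// {
//   "workflows": [{ "name": "search_and_extract" }],
//   "passThreshold": 0.7
// }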

const program = new Command();

program
  .name("browserbase-mcp-evals")
  .description("Run evaluation tests for Browserbase MCP Server")
  .version("1.0.0");
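
// The `run` command: validate credentials, fan each workflow out to its own
// evaluate() call, aggregate the scores, and exit with a CI-friendly status.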
program
  .command("run")
  .description("Run evaluation tests")
  .option(
    "-c, --config <path>",
    "Config file path",
    "./evals/mcp-eval.config.json",
  )
  .option("-d, --debug", "Enable debug output")
  .option("-j, --json", "Output results as JSON")
  .option("-l, --llm", "Enable LLM judge")
  .option("-o, --output <path>", "Save results to file")
  .option(
    "-p, --pass-threshold <number>",
    "Minimum average score (0-1) required to pass. Can also be set via EVAL_PASS_THRESHOLD env var.",
  )
  .option("-t, --timeout <ms>", "Override timeout in milliseconds")
  .action(async (options) => {
    try {
      const startTime = Date.now();

      // Check for required environment variables
      const requiredEnvVars = [
        "BROWSERBASE_API_KEY",
        "BROWSERBASE_PROJECT_ID",
        "ANTHROPIC_API_KEY",
        "GEMINI_API_KEY",
      ];
      const missingVars = requiredEnvVars.filter((v) => !process.env[v]);
      if (missingVars.length > 0) {
        console.error(
          chalk.red(
            `Missing required environment variables: ${missingVars.join(", ")}`,
          ),
        );
        console.error(chalk.yellow("Please set them before running the tests."));
        console.error(chalk.yellow("Example:"));
        // Example export lines for each supported variable
        const exampleValues: Record<string, string> = {
          BROWSERBASE_API_KEY: "your_api_key_here",
          BROWSERBASE_PROJECT_ID: "your_project_id_here",
          ANTHROPIC_API_KEY: "sk-ant-your_key_here",
          GEMINI_API_KEY: "your_gemini_key_here",
        };
        for (const missingVar of missingVars) {
          console.error(
            chalk.yellow(` export ${missingVar}='${exampleValues[missingVar]}'`),
          );
        }
        process.exit(1);
      }

      // Check for LLM judge requirements
      if (options.llm && !process.env.OPENAI_API_KEY) {
        console.error(
          chalk.red("LLM judge requires OPENAI_API_KEY environment variable"),
        );
        process.exit(1);
      }

      // Resolve config path
      const configPath = path.resolve(options.config);

      // Load the config to get the workflow count for display. Named evalConfig
      // so it does not shadow dotenv's `config` import above.
      const configContent = await fs.readFile(configPath, "utf-8");
      const evalConfig: EvalConfig = JSON.parse(configContent);

      console.log(chalk.blue(`Running evaluation tests from: ${configPath}`));
      console.log(chalk.gray(`Workflows to test: ${evalConfig.workflows.length}`));

      // Prepare evaluation options
      const evalOptions = {
        debug: options.debug,
        reporter: (options.json ? "json" : "console") as
          | "json"
          | "console"
          | "junit"
          | undefined,
        llmJudge: options.llm,
        timeout: options.timeout ? parseInt(options.timeout, 10) : undefined,
      };

      console.log(
        chalk.yellow("Parallel mode: splitting workflows and running concurrently"),
      );
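
      // Fan-out: write each workflow into its own single-workflow config file
      // so that each one gets an independent evaluate() run.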
      const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "mcp-evals-"));
      const workflowFiles: string[] = [];
      for (let i = 0; i < evalConfig.workflows.length; i++) {
        const wf = evalConfig.workflows[i];
        const wfConfig = { ...evalConfig, workflows: [wf] };
        const wfPath = path.join(
          tmpDir,
          `workflow-${i}-${(wf.name || "unnamed").replace(/[^a-z0-9_-]/gi, "_")}.json`,
        );
        await fs.writeFile(wfPath, JSON.stringify(wfConfig, null, 2));
        workflowFiles.push(wfPath);
      }
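
      // Concurrency is unbounded here: every evaluate() call starts at once.
      // For large suites, a limiter (e.g. the p-limit package) could cap this.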
      const reports: EvaluationReport[] = await Promise.all(
        workflowFiles.map((wfPath) => evaluate(wfPath, evalOptions)),
      );

      // Aggregate results
      const allEvaluations = reports.flatMap((r) => r.evaluations);
      const duration = Date.now() - startTime;

      // Determine pass/fail from the average score rather than requiring
      // every workflow to pass.
      const avgScore =
        allEvaluations.length === 0
          ? 0
          : allEvaluations.reduce((sum, e) => sum + e.overallScore, 0) /
            allEvaluations.length;

      // Threshold precedence: CLI flag, then environment, then config file.
      const thresholdFromEnv =
        (process.env.EVAL_PASS_THRESHOLD || process.env.PASS_THRESHOLD) ?? "";
      const thresholdFromCli = options.passThreshold ?? "";
      const thresholdFromConfig =
        typeof evalConfig.passThreshold === "number"
          ? String(evalConfig.passThreshold)
          : "";
      const threshold = (() => {
        const raw = String(
          thresholdFromCli || thresholdFromEnv || thresholdFromConfig,
        ).trim();
        const parsed = Number.parseFloat(raw);
        if (!Number.isFinite(parsed)) return 0.6; // default when no threshold is provided
        return parsed;
      })();
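
      // Worked example: with `--pass-threshold 0.8`, EVAL_PASS_THRESHOLD=0.7,
      // and "passThreshold": 0.5 in the config, the CLI value wins and the run
      // passes only if the average score is >= 0.8. With none of the three
      // set, the default of 0.6 applies. Values outside 0-1 are not clamped.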
      const passed = avgScore >= threshold;

      const finalReport: EvaluationReport = {
        config: { parallel: true, source: configPath },
        evaluations: allEvaluations,
        passed,
        timestamp: new Date(),
      };

      const finalResult: TestResult = {
        config: configPath,
        passed,
        score: avgScore,
        duration,
        workflows: allEvaluations.map((e) => ({
          name: e.workflowName,
          passed: e.passed,
          score: e.overallScore,
        })),
      };

      // Best-effort cleanup of the temporary per-workflow config files
      try {
        await fs.rm(tmpDir, { recursive: true, force: true });
      } catch {
        // ignore cleanup errors
      }

      // Output results
      if (options.json) {
        console.log(JSON.stringify(finalResult, null, 2));
      } else {
        console.log(
          chalk.green(
            `\nTest execution completed in ${(finalResult.duration / 1000).toFixed(2)}s`,
          ),
        );
        console.log(
          chalk.gray(
            `Threshold for pass: ${threshold.toFixed(2)} | Average score: ${finalResult.score.toFixed(3)}`,
          ),
        );
        console.log(
          chalk[finalResult.passed ? "green" : "red"](
            `Overall result: ${finalResult.passed ? "PASSED" : "FAILED"} (${(finalResult.score * 100).toFixed(1)}%)`,
          ),
        );
      }

      // Save the full report to a file if requested
      if (options.output) {
        await fs.writeFile(options.output, JSON.stringify(finalReport, null, 2));
        console.log(chalk.gray(`Results saved to: ${options.output}`));
      }
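
      // Exit non-zero on failure so CI pipelines can gate on this script.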
      process.exit(finalResult.passed ? 0 : 1);
    } catch (error) {
      console.error("Error running evaluation tests:", error);
      process.exit(1);
    }
  });

program.parse();
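
// Illustrative invocations (assume the required *_API_KEY variables are set;
// --llm additionally needs OPENAI_API_KEY):
//   tsx run-evals.ts run
//   tsx run-evals.ts run --json --output results.json
//   EVAL_PASS_THRESHOLD=0.8 tsx run-evals.ts run --llm --timeout 60000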