#!/usr/bin/env tsx
import { Command } from "commander";
import * as fs from "fs/promises";
import * as path from "path";
import { evaluate } from "mcpvals";
import os from "os";
import chalk from "chalk";
// Load environment variables from .env file
import { config as loadEnv } from "dotenv";
loadEnv();
// Types for evaluation results
interface EvaluationResult {
  workflowName: string;
  passed: boolean;
  overallScore: number;
  results: Array<{
    metric: string;
    passed: boolean;
    score: number;
    details: string;
    metadata?: Record<string, unknown>;
  }>;
}
interface EvaluationReport {
  config: Record<string, unknown>;
  evaluations: EvaluationResult[];
  passed: boolean;
  timestamp: Date;
}
interface TestResult {
  config: string;
  passed: boolean;
  score: number;
  duration: number;
  workflows: {
    name: string;
    passed: boolean;
    score: number;
  }[];
}
interface EvalConfig {
  workflows: Array<{ name?: string }>;
  passThreshold?: number;
  [key: string]: unknown;
}
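// Illustrative config file shape (an assumption: only `workflows` and the optional
// `passThreshold` are read directly by this script; other keys pass through to mcpvals):
// {
//   "workflows": [{ "name": "example-workflow" }],
//   "passThreshold": 0.7
// }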
const program = new Command();
program
  .name("browserbase-mcp-evals")
  .description("Run evaluation tests for Browserbase MCP Server")
  .version("1.0.0");
program
  .command("run")
  .description("Run evaluation tests")
  .option(
    "-c, --config <path>",
    "Config file path",
    "./evals/mcp-eval.config.json",
  )
  .option("-d, --debug", "Enable debug output")
  .option("-j, --json", "Output results as JSON")
  .option("-l, --llm", "Enable LLM judge")
  .option("-o, --output <path>", "Save results to file")
  .option(
    "-p, --pass-threshold <number>",
    "Minimum average score (0-1) required to pass. Can also be set via EVAL_PASS_THRESHOLD env var.",
  )
  .option("-t, --timeout <ms>", "Override timeout in milliseconds")
  .action(async (options) => {
    try {
      const startTime = Date.now();
      // Check for required environment variables
      const requiredEnvVars = [
        "BROWSERBASE_API_KEY",
        "BROWSERBASE_PROJECT_ID",
        "ANTHROPIC_API_KEY",
        "GEMINI_API_KEY",
      ];
      const missingVars = requiredEnvVars.filter((v) => !process.env[v]);
      if (missingVars.length > 0) {
        console.error(
          chalk.red(
            `Missing required environment variables: ${missingVars.join(", ")}`,
          ),
        );
        console.error(
          chalk.yellow("Please set them before running the tests."),
        );
        console.error(chalk.yellow("Example:"));
        for (const missingVar of missingVars) {
          switch (missingVar) {
            case "BROWSERBASE_API_KEY":
              console.error(
                chalk.yellow(
                  "  export BROWSERBASE_API_KEY='your_api_key_here'",
                ),
              );
              break;
            case "BROWSERBASE_PROJECT_ID":
              console.error(
                chalk.yellow(
                  "  export BROWSERBASE_PROJECT_ID='your_project_id_here'",
                ),
              );
              break;
            case "ANTHROPIC_API_KEY":
              console.error(
                chalk.yellow(
                  "  export ANTHROPIC_API_KEY='sk-ant-your_key_here'",
                ),
              );
              break;
            case "GEMINI_API_KEY":
              console.error(
                chalk.yellow("  export GEMINI_API_KEY='your_gemini_key_here'"),
              );
              break;
          }
        }
        process.exit(1);
      }
      // Check for LLM judge requirements
      if (options.llm && !process.env.OPENAI_API_KEY) {
        console.error(
          chalk.red("LLM judge requires OPENAI_API_KEY environment variable"),
        );
        process.exit(1);
      }
      // Resolve config path
      const configPath = path.resolve(options.config);
      // Load config to get workflow count for display
      const configContent = await fs.readFile(configPath, "utf-8");
      const config: EvalConfig = JSON.parse(configContent);
      console.log(chalk.blue(`Running evaluation tests from: ${configPath}`));
      console.log(chalk.gray(`Workflows to test: ${config.workflows.length}`));
      // Prepare evaluation options
      const evalOptions = {
        debug: options.debug,
        reporter: (options.json ? "json" : "console") as
          | "json"
          | "console"
          | "junit"
          | undefined,
        llmJudge: options.llm,
        timeout: options.timeout ? Number.parseInt(options.timeout, 10) : undefined,
      };
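      // Forwarded to mcpvals' evaluate(); the reporter union in the cast above is
      // assumed to mirror the reporters mcpvals accepts ("console", "json", "junit").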
      console.log(
        chalk.yellow(
          "Parallel mode: splitting workflows and running concurrently",
        ),
      );
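      // Split the config: write each workflow into its own single-workflow config
      // file in a temp directory so the evaluations can run concurrently.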
      const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "mcp-evals-"));
      const workflowFiles: string[] = [];
      for (let i = 0; i < config.workflows.length; i++) {
        const wf = config.workflows[i];
        const wfConfig = { ...config, workflows: [wf] };
        const wfPath = path.join(
          tmpDir,
          `workflow-${i}-${(wf.name || "unnamed").replace(/[^a-z0-9_-]/gi, "_")}.json`,
        );
        await fs.writeFile(wfPath, JSON.stringify(wfConfig, null, 2));
        workflowFiles.push(wfPath);
      }
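      // Run one evaluate() call per workflow file, all in parallel, and collect the reports.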
      const reports: EvaluationReport[] = await Promise.all(
        workflowFiles.map((wfPath) => evaluate(wfPath, evalOptions)),
      );
      // Aggregate results
      const allEvaluations = reports.flatMap((r) => r.evaluations);
      const duration = Date.now() - startTime;
      // Determine pass/fail based on threshold instead of strict all-pass
      const avgScore =
        allEvaluations.length === 0
          ? 0
          : allEvaluations.reduce((sum, e) => sum + e.overallScore, 0) /
            allEvaluations.length;
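      // Resolve the pass threshold with precedence: CLI flag, then the
      // EVAL_PASS_THRESHOLD / PASS_THRESHOLD env vars, then the config file's
      // passThreshold, falling back to 0.6 when none parse as a finite number.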
      const thresholdFromEnv =
        (process.env.EVAL_PASS_THRESHOLD || process.env.PASS_THRESHOLD) ?? "";
      const thresholdFromCli = options.passThreshold ?? "";
      const thresholdFromConfig =
        typeof config.passThreshold === "number"
          ? String(config.passThreshold)
          : "";
      const threshold = (() => {
        const raw = String(
          thresholdFromCli || thresholdFromEnv || thresholdFromConfig,
        ).trim();
        const parsed = Number.parseFloat(raw);
        if (!Number.isFinite(parsed)) return 0.6; // default threshold when no valid value is provided
        return parsed;
      })();
      const passed = avgScore >= threshold;
      const finalReport: EvaluationReport = {
        config: { parallel: true, source: configPath },
        evaluations: allEvaluations,
        passed,
        timestamp: new Date(),
      };
      const finalResult: TestResult = {
        config: configPath,
        passed,
        score: avgScore,
        duration,
        workflows: allEvaluations.map((e) => ({
          name: e.workflowName,
          passed: e.passed,
          score: e.overallScore,
        })),
      };
      // Best-effort cleanup
      try {
        await Promise.all(workflowFiles.map((f) => fs.unlink(f)));
        await fs.rmdir(tmpDir);
      } catch {
        // ignore cleanup errors
      }
      // Output results
      if (options.json) {
        console.log(JSON.stringify(finalResult, null, 2));
      } else {
        console.log(
          chalk.green(
            `\nTest execution completed in ${(finalResult.duration / 1000).toFixed(2)}s`,
          ),
        );
        console.log(
          chalk.gray(
            `Threshold for pass: ${threshold.toFixed(2)} | Average score: ${finalResult.score.toFixed(3)}`,
          ),
        );
        console.log(
          chalk[finalResult.passed ? "green" : "red"](
            `Overall result: ${finalResult.passed ? "PASSED" : "FAILED"} (${(finalResult.score * 100).toFixed(1)}%)`,
          ),
        );
      }
      // Save to file if requested
      if (options.output) {
        await fs.writeFile(
          options.output,
          JSON.stringify(finalReport, null, 2),
        );
        console.log(chalk.gray(`Results saved to: ${options.output}`));
      }
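      // Exit non-zero when the average score falls below the threshold so CI can fail the run.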
      process.exit(finalResult.passed ? 0 : 1);
    } catch (error) {
      console.error("Error running evaluation tests:", error);
      process.exit(1);
    }
  });
program.parse();