
actors-mcp-server (Official, by apify)

run-evaluation.ts
#!/usr/bin/env tsx
/**
 * Main evaluation script for MCP tool calling (TypeScript version).
 */
import { createClient } from '@arizeai/phoenix-client';
// eslint-disable-next-line import/extensions
import { getDatasetInfo } from '@arizeai/phoenix-client/datasets';
// eslint-disable-next-line import/extensions
import { asEvaluator, runExperiment } from '@arizeai/phoenix-client/experiments';
import type { ExperimentEvaluationRun, ExperimentTask } from '@arizeai/phoenix-client/types/experiments';
import dotenv from 'dotenv';
import yargs from 'yargs';
// eslint-disable-next-line import/extensions
import { hideBin } from 'yargs/helpers';

import log from '@apify/log';

import { loadTools, createOpenRouterTask, createToolSelectionLLMEvaluator } from './evaluation-utils.js';
import {
    DATASET_NAME,
    MODELS_TO_EVALUATE,
    PASS_THRESHOLD,
    EVALUATOR_NAMES,
    type EvaluatorName,
    sanitizeHeaderValue,
    validateEnvVars,
} from './config.js';

type EvaluatorResult = {
    model: string;
    experimentName: string;
    experimentId: string;
    evaluatorName: EvaluatorName;
    accuracy: number;
    correct: number;
    total: number;
    passed: boolean;
    error?: string;
};

/**
 * Type for command line arguments
 */
type CliArgs = {
    datasetName?: string;
};

log.setLevel(log.LEVELS.DEBUG);

const RUN_LLM_EVALUATOR = true;
const RUN_TOOLS_EXACT_MATCH_EVALUATOR = true;

dotenv.config({ path: '.env' });

// Parse command line arguments using yargs
const argv = yargs(hideBin(process.argv))
    .wrap(null) // Disable automatic wrapping to avoid issues with long lines
    .usage('Usage: $0 [options]')
    .env()
    .option('dataset-name', {
        type: 'string',
        describe: 'Custom dataset name to evaluate (default: from config.ts)',
        example: 'my_custom_dataset',
    })
    .help('help')
    .alias('h', 'help')
    .version(false)
    .epilogue('Examples:')
    .epilogue('  $0                                            # Use default dataset from config')
    .epilogue('  $0 --dataset-name tmp-1                       # Evaluate custom dataset')
    .epilogue('  npm run evals:run -- --dataset-name custom_v1 # Via npm script')
    .parseSync() as CliArgs;

// Sanitize secrets early to avoid invalid header characters in CI
process.env.OPENROUTER_API_KEY = sanitizeHeaderValue(process.env.OPENROUTER_API_KEY);

// Tools match evaluator: returns score 1 if expected tool_calls match output list, 0 otherwise
const toolsExactMatch = asEvaluator({
    name: EVALUATOR_NAMES.TOOLS_EXACT_MATCH,
    kind: 'CODE',
    evaluate: async ({ output, expected }: any) => {
        log.info(`Evaluating tools match. Expected: ${JSON.stringify(expected)}, Output: ${JSON.stringify(output)}`);
        let expectedTools = expected?.expectedTools || [];
        if (typeof expectedTools === 'string') {
            expectedTools = expectedTools.split(', ');
        }
        if (!expectedTools || expectedTools.length === 0) {
            log.debug('Tools match: No expected tools provided');
            return {
                score: 1.0,
                explanation: 'No expected tools present in the test case, either not required or not provided',
            };
        }
        // Normalize tool names: treat call-actor with step="info" as equivalent to fetch-actor-details
        const normalizeToolName = (toolName: string): string => {
            // Normalize call-actor to fetch-actor-details (bidirectional equivalence)
            if (toolName === 'call-actor' || toolName === 'fetch-actor-details') {
                return 'fetch-actor-details';
            }
            return toolName;
        };
        const normalizeToolCall = (toolCall: any): string => {
            const toolName = toolCall.function?.name || '';
            // If it's call-actor with step="info", treat it as fetch-actor-details
            if (toolName === 'call-actor') {
                try {
                    const args = JSON.parse(toolCall.function?.arguments || '{}');
                    if (args.step === 'info') {
                        return 'fetch-actor-details';
                    }
                } catch (e) {
                    // If we can't parse arguments, just return the tool name
                }
            }
            return toolName;
        };
        // Normalize expected tools (both call-actor and fetch-actor-details → fetch-actor-details)
        const normalizedExpectedTools = [...expectedTools]
            .map(normalizeToolName)
            .sort();
        const outputToolsTmp = (output?.tool_calls || [])
            .map(normalizeToolCall)
            .sort();
        const outputToolsSet = Array.from(new Set(outputToolsTmp)).sort();
        // it is correct if outputTools includes multiple calls to the same tool
        const isCorrect = JSON.stringify(normalizedExpectedTools) === JSON.stringify(outputToolsSet);
        const score = isCorrect ? 1.0 : 0.0;
        const explanation = `Expected: ${JSON.stringify(normalizedExpectedTools)}, Got: ${JSON.stringify(outputToolsSet)}`;
        log.debug(`🤖 Tools exact match: score=${score}, output=${JSON.stringify(outputToolsSet)}, expected=${JSON.stringify(normalizedExpectedTools)}`);
        return {
            score,
            explanation,
        };
    },
});

function processEvaluatorResult(
    experiment: any,
    modelName: string,
    experimentName: string,
    evaluatorName: EvaluatorName,
): EvaluatorResult {
    const runsMap = experiment.runs ?? {};
    const evalRuns = experiment.evaluationRuns ?? [];
    const total = Object.keys(runsMap).length;
    const evaluatorRuns = evalRuns.filter((er: ExperimentEvaluationRun) => er.name === evaluatorName);
    const correct = evaluatorRuns.filter((er: ExperimentEvaluationRun) => (er.result?.score ?? 0) > 0.5).length;
    const accuracy = total > 0 ? correct / total : 0;
    return {
        model: modelName,
        experimentName,
        experimentId: experiment.id,
        evaluatorName,
        accuracy,
        correct,
        total,
        passed: accuracy >= PASS_THRESHOLD,
    };
}

function printResults(results: EvaluatorResult[]): void {
    log.info('📊 Results:');
    for (const result of results) {
        const { model, evaluatorName, accuracy, correct, total, passed, error } = result;
        if (error) {
            log.info(`${model}: ${evaluatorName} ❌ Error`);
        } else {
            const status = passed ? 'PASS' : 'FAIL';
            log.info(`${model}: ${evaluatorName} ${(accuracy * 100).toFixed(1)}% (${correct}/${total}) ${status}`);
        }
    }
    log.info(`\nPass threshold: ${(PASS_THRESHOLD * 100).toFixed(1)}%`);
    const allPassed = results.every((r) => !r.error && r.passed);
    if (allPassed) {
        log.info('All tests passed');
    } else {
        log.info('Some tests failed');
    }
}

async function main(datasetName: string): Promise<number> {
    log.info('Starting MCP tool calling evaluation');
    if (!validateEnvVars()) {
        return 1;
    }

    const tools = await loadTools();
    log.info(`Loaded ${tools.length} tools`);

    // Phoenix client init (options may be provided via env)
    const client = createClient({
        options: {
            baseUrl: process.env.PHOENIX_BASE_URL!,
            headers: { Authorization: `Bearer ${sanitizeHeaderValue(process.env.PHOENIX_API_KEY)}` },
        },
    });

    // Resolve dataset by name -> id
    let datasetId: string | undefined;
    try {
        const info = await getDatasetInfo({ client, dataset: { datasetName } });
        datasetId = info?.id as string | undefined;
    } catch (e) {
        log.error(`Error loading dataset: ${e}`);
        return 1;
    }
    if (!datasetId) throw new Error(`Dataset "${datasetName}" not found`);
    log.info(`Loaded dataset "${datasetName}" with ID: ${datasetId}`);

    const results: EvaluatorResult[] = [];

    // Create the LLM evaluator with loaded tools
    const toolSelectionLLMEvaluator = createToolSelectionLLMEvaluator(tools);

    for (const modelName of MODELS_TO_EVALUATE) {
        log.info(`\nEvaluating model: ${modelName}`);

        // OpenRouter task
        const taskFn = createOpenRouterTask(modelName, tools);

        // Get PR info for better tracking
        const prNumber = process.env.GITHUB_PR_NUMBER || 'local';
        const prLabel = prNumber === 'local' ? 'local run' : `PR #${prNumber}`;
        const experimentName = `MCP server: ${modelName}`;
        const experimentDescription = `${modelName}, ${prLabel}`;

        const evaluators = [];
        if (RUN_TOOLS_EXACT_MATCH_EVALUATOR) {
            evaluators.push(toolsExactMatch);
        }
        if (RUN_LLM_EVALUATOR) {
            evaluators.push(toolSelectionLLMEvaluator);
        }

        try {
            const experiment = await runExperiment({
                client,
                dataset: { datasetName },
                // Cast to satisfy ExperimentTask type
                task: taskFn as ExperimentTask,
                evaluators,
                experimentName,
                experimentDescription,
                concurrency: 10,
            });
            log.info(`Experiment run completed`);

            // Process each evaluator separately
            results.push(processEvaluatorResult(experiment, modelName, experimentName, EVALUATOR_NAMES.TOOLS_EXACT_MATCH));
            results.push(processEvaluatorResult(experiment, modelName, experimentName, EVALUATOR_NAMES.TOOL_SELECTION_LLM));
        } catch (e: unknown) {
            const err = e instanceof Error ? e : new Error(String(e));
            log.error(`Error evaluating ${modelName}:`, err);
            log.error(`Full error trace: ${err.stack ?? err.message}`);
            // Add error results for both evaluators
            Object.values(EVALUATOR_NAMES).forEach((evaluatorName) => {
                results.push({
                    model: modelName,
                    experimentName,
                    experimentId: '',
                    evaluatorName,
                    accuracy: 0,
                    correct: 0,
                    total: 0,
                    passed: false,
                    error: err.message,
                });
            });
        }
    }

    printResults(results);
    const allPassed = results.every((r) => !r.error && r.passed);
    return allPassed ? 0 : 1;
}

// Run
main(argv.datasetName || DATASET_NAME)
    .then((code) => process.exit(code))
    .catch((err) => {
        log.error('Unexpected error:', err);
        process.exit(1);
    });
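For reference, the shapes of the { output, expected } pair consumed by the toolsExactMatch evaluator can be inferred from the code above. The following is a minimal, hypothetical sketch of one dataset example; the field names expectedTools and tool_calls come from the evaluator, while the concrete tool names and values are illustrative, not taken from the real dataset:

// Hypothetical example of the { output, expected } pair the evaluator receives.
// Shapes are inferred from the evaluator code above; values are illustrative.
type ToolCall = { function?: { name?: string; arguments?: string } };

const expected = {
    // May arrive as an array or as a comma-separated string; the evaluator handles both.
    expectedTools: 'search-actors, fetch-actor-details',
};

const output: { tool_calls: ToolCall[] } = {
    tool_calls: [
        { function: { name: 'search-actors', arguments: '{}' } },
        // call-actor with step="info" is normalized to fetch-actor-details, so this matches.
        { function: { name: 'call-actor', arguments: '{"step":"info"}' } },
    ],
};

Because duplicate output calls are collapsed into a set before comparison, repeated calls to the same tool still count as a single match against the expected list.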

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/apify/actors-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.
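For a programmatic equivalent of the curl call above, a minimal TypeScript sketch might look like the following. It assumes a Node 18+ ES module context (for global fetch and top-level await) and that the endpoint returns JSON; the response shape is not documented here, so the snippet simply prints it:

// Fetch this server's metadata from the Glama MCP directory API (endpoint from the curl above).
// The JSON response shape is an assumption; inspect the printed output to see the actual fields.
const res = await fetch('https://glama.ai/api/mcp/v1/servers/apify/actors-mcp-server');
if (!res.ok) throw new Error(`Request failed: ${res.status}`);
const server = await res.json();
console.log(JSON.stringify(server, null, 2));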