#!/usr/bin/env node
/* eslint-disable no-console */
/* eslint-disable import/extensions */
/**
* Main CLI entry point for workflow evaluations
*
* Usage:
* npm run evals:workflow
* npm run evals:workflow -- --category basic
* npm run evals:workflow -- --id test-001
* npm run evals:workflow -- --verbose
* npm run evals:workflow -- --concurrency 10
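 * npm run evals:workflow -- --output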
*/
import path from 'node:path';
import pLimit from 'p-limit';
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';
import { DEFAULT_TOOL_TIMEOUT_SECONDS, MODELS } from './config.js';
import { executeConversation } from './conversation-executor.js';
import { LlmClient } from './llm-client.js';
import { McpClient } from './mcp-client.js';
import type { EvaluationResult } from './output-formatter.js';
import { formatDetailedResult, formatResultsTable } from './output-formatter.js';
import {
loadResultsDatabase,
saveResultsDatabase,
updateResultsWithEvaluations,
} from './results-writer.js';
import type { WorkflowTestCase } from './test-cases-loader.js';
import { filterTestCases, loadTestCases } from './test-cases-loader.js';
import { evaluateConversation } from './workflow-judge.js';
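/**
 * Parsed CLI arguments; mirrors the yargs options defined in main().
 */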
type CliArgs = {
category?: string;
id?: string;
verbose?: boolean;
testCasesPath?: string;
agentModel?: string;
judgeModel?: string;
toolTimeout?: number;
concurrency?: number;
output?: boolean;
};
/**
 * Log a message with the test ID as prefix, splitting multi-line messages
 * so every line carries the prefix.
 */
function logWithPrefix(testId: string, message: string): void {
const lines = message.split('\n');
for (const line of lines) {
console.log(`[${testId}] ${line}`);
}
}
/**
 * Run a single test case evaluation.
 *
 * A fresh McpClient is created per test for isolation; any error thrown during
 * execution is converted into a FAIL result so one failing test cannot abort the run.
 */
async function runSingleTest(
testCase: WorkflowTestCase,
index: number,
total: number,
argv: CliArgs,
llmClient: LlmClient,
apifyToken: string,
): Promise<EvaluationResult> {
const testId = testCase.id;
logWithPrefix(testId, `[${index + 1}/${total}] Running...`);
// Create FRESH MCP instance per test for isolation
const mcpClient = new McpClient(argv.toolTimeout);
const startTime = Date.now();
let result: EvaluationResult;
try {
// Start MCP server with test-specific tools (if configured)
await mcpClient.start(apifyToken, testCase.tools);
// Get server instructions (if provided)
const serverInstructions = mcpClient.getInstructions();
// Execute conversation (tools fetched dynamically inside)
const conversation = await executeConversation({
userPrompt: testCase.query,
mcpClient,
llmClient,
maxTurns: testCase.maxTurns,
model: argv.agentModel,
serverInstructions,
});
// Judge conversation
const judgeResult = await evaluateConversation(testCase, conversation, llmClient, argv.judgeModel);
const durationMs = Date.now() - startTime;
result = {
testCase,
conversation,
judgeResult,
durationMs,
};
logWithPrefix(testId, ` ${judgeResult.verdict === 'PASS' ? '✅' : '❌'} ${judgeResult.verdict} (${durationMs}ms)`);
} catch (error) {
const durationMs = Date.now() - startTime;
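        // Convert the failure into a FAIL result with an empty conversation so it still appears in the summary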
result = {
testCase,
conversation: {
userPrompt: testCase.query,
turns: [],
completed: false,
hitMaxTurns: false,
totalTurns: 0,
},
judgeResult: {
verdict: 'FAIL',
reason: 'Error during execution',
rawResponse: '',
},
durationMs,
error: error instanceof Error ? error.message : String(error),
};
logWithPrefix(testId, ` 🔥 ERROR (${durationMs}ms)`);
} finally {
// ALWAYS cleanup MCP client for this test
try {
await mcpClient.cleanup();
} catch (cleanupError) {
logWithPrefix(testId, ` ⚠️ Cleanup failed: ${cleanupError}`);
}
}
// Show detailed output if verbose
if (argv.verbose) {
logWithPrefix(testId, '');
logWithPrefix(testId, formatDetailedResult(result));
}
return result;
}
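/**
 * Parse CLI arguments, run the selected test cases with bounded concurrency,
 * and exit non-zero unless every test passes.
 */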
async function main() {
// Parse CLI arguments
const argv = await yargs(hideBin(process.argv))
.option('category', {
type: 'string',
description: 'Filter by test case category',
})
.option('id', {
type: 'string',
description: 'Run specific test case by ID',
})
.option('verbose', {
type: 'boolean',
description: 'Show detailed output for each test',
default: false,
})
.option('test-cases-path', {
type: 'string',
description: 'Path to test cases JSON file',
})
.option('agent-model', {
type: 'string',
description: `LLM model for the agent (default: ${MODELS.agent})`,
default: MODELS.agent,
})
.option('judge-model', {
type: 'string',
description: `LLM model for the judge (default: ${MODELS.judge})`,
default: MODELS.judge,
})
.option('tool-timeout', {
type: 'number',
description: `Tool call timeout in seconds (default: ${DEFAULT_TOOL_TIMEOUT_SECONDS})`,
default: DEFAULT_TOOL_TIMEOUT_SECONDS,
})
.option('concurrency', {
alias: 'c',
type: 'number',
description: 'Number of tests to run in parallel (default: 4)',
default: 4,
})
.option('output', {
alias: 'o',
type: 'boolean',
description: 'Save test results to JSON file (evals/workflows/results.json)',
default: false,
})
.help()
.argv as CliArgs;
console.log('='.repeat(100));
console.log('Workflow Evaluation Runner');
console.log('='.repeat(100));
console.log();
// Check environment variables
const apifyToken = process.env.APIFY_TOKEN;
const openrouterKey = process.env.OPENROUTER_API_KEY;
if (!apifyToken) {
console.error('❌ Error: APIFY_TOKEN environment variable is required');
process.exit(1);
}
if (!openrouterKey) {
console.error('❌ Error: OPENROUTER_API_KEY environment variable is required');
process.exit(1);
}
// Load and filter test cases
console.log('📂 Loading test cases...');
    let testCases: WorkflowTestCase[];
try {
testCases = loadTestCases(argv.testCasesPath);
} catch (error) {
console.error(`❌ Failed to load test cases: ${error}`);
process.exit(1);
}
const filteredTestCases = filterTestCases(testCases, {
id: argv.id,
category: argv.category,
});
if (filteredTestCases.length === 0) {
console.log('⚠️ No test cases found matching the filters.');
console.log('');
console.log('Available test cases:');
for (const tc of testCases) {
console.log(` - ${tc.id} (${tc.category}): ${tc.query}`);
}
process.exit(0);
}
console.log(`✅ Loaded ${filteredTestCases.length} test case(s)`);
console.log();
// Initialize LLM client (shared across all tests - stateless)
const llmClient = new LlmClient();
// Run evaluations
console.log(`▶️ Running ${filteredTestCases.length} evaluation(s) with concurrency ${argv.concurrency}...`);
console.log();
// Create concurrency limiter
const limit = pLimit(argv.concurrency!);
// Execute tests in parallel with concurrency control
    const resultPromises = filteredTestCases.map((testCase, index) => limit(
        () => runSingleTest(testCase, index, filteredTestCases.length, argv, llmClient, apifyToken),
    ));
// Wait for all tests to complete
const results = await Promise.all(resultPromises);
// Save results to file if --output flag is present
if (argv.output) {
const resultsPath = path.join(process.cwd(), 'evals/workflows/results.json');
try {
const database = loadResultsDatabase(resultsPath);
const updatedDatabase = updateResultsWithEvaluations(
database,
results,
argv.agentModel!,
argv.judgeModel!,
);
saveResultsDatabase(resultsPath, updatedDatabase);
console.log(`✅ Results saved to: ${resultsPath}`);
console.log();
} catch (error) {
console.error(`❌ Failed to save results: ${error}`);
console.error(' Results will still be displayed but not persisted.');
console.log();
}
}
// Display results
console.log(formatResultsTable(results));
// Exit with appropriate code - ALL tests must pass
const totalTests = results.length;
const passedTests = results.filter((r) => !r.error && r.judgeResult.verdict === 'PASS').length;
const errorTests = results.filter((r) => r.error).length;
// Exit 0 only if ALL tests passed with no errors
const allPassed = totalTests > 0 && passedTests === totalTests && errorTests === 0;
process.exit(allPassed ? 0 : 1);
}
main().catch((error) => {
    console.error(`❌ Unexpected error: ${error}`);
    process.exit(1);
});