Skip to main content
Glama

actors-mcp-server

Official
by apify
evaluation-utils.ts6.95 kB
/** * Shared evaluation utilities extracted from run-evaluation.ts */ import { readFileSync } from 'node:fs'; import { dirname as pathDirname, join } from 'node:path'; import { fileURLToPath } from 'node:url'; import OpenAI from 'openai'; import { createOpenAI } from '@ai-sdk/openai'; import { asEvaluator } from '@arizeai/phoenix-client/experiments'; import { createClassifierFn } from '@arizeai/phoenix-evals'; import log from '@apify/log'; import { ApifyClient } from '../src/apify-client.js'; import { getToolPublicFieldOnly, processParamsGetTools } from '../src/index-internals.js'; import type { ToolBase, ToolEntry } from '../src/types.js'; import { SYSTEM_PROMPT, TOOL_CALLING_BASE_TEMPLATE, TOOL_SELECTION_EVAL_MODEL, EVALUATOR_NAMES, TEMPERATURE, sanitizeHeaderValue } from './config.js'; type ExampleInputOnly = { input: Record<string, unknown>, metadata?: Record<string, unknown>, output?: never }; export type TestCase = { id: string; category: string; query: string; context?: string | string[]; expectedTools?: string[]; reference?: string; }; export type TestData = { version: string; testCases: TestCase[]; }; // eslint-disable-next-line consistent-return export function loadTestCases(filePath: string): TestData { const filename = fileURLToPath(import.meta.url); const dirname = pathDirname(filename); const testCasesPath = join(dirname, filePath); try { const fileContent = readFileSync(testCasesPath, 'utf-8'); return JSON.parse(fileContent) as TestData; } catch { log.error(`Error: Test cases file not found at ${testCasesPath}`); process.exit(1); } } export function filterByCategory(testCases: TestCase[], category: string): TestCase[] { // Convert wildcard pattern to regex const pattern = category.replace(/\*/g, '.*'); const regex = new RegExp(`^${pattern}$`); return testCases.filter((testCase) => regex.test(testCase.category)); } export function filterById(testCases: TestCase[], idPattern: string): TestCase[] { const regex = new RegExp(idPattern); return 
testCases.filter((testCase) => regex.test(testCase.id)); } export async function loadTools(): Promise<ToolBase[]> { const apifyClient = new ApifyClient({ token: process.env.APIFY_API_TOKEN || '' }); const urlTools = await processParamsGetTools('', apifyClient); return urlTools.map((t: ToolEntry) => getToolPublicFieldOnly(t)) as ToolBase[]; } export function transformToolsToOpenAIFormat(tools: ToolBase[]): OpenAI.Chat.Completions.ChatCompletionTool[] { return tools.map((tool) => ({ type: 'function', function: { name: tool.name, description: tool.description, parameters: tool.inputSchema as OpenAI.Chat.ChatCompletionTool['function']['parameters'], }, })); } export function createOpenRouterTask(modelName: string, tools: ToolBase[]) { const toolsOpenAI = transformToolsToOpenAIFormat(tools); return async (example: ExampleInputOnly): Promise<{ tool_calls: Array<{ function?: { name?: string } }>; llm_response: string; query: string; context: string; reference: string; }> => { const client = new OpenAI({ baseURL: process.env.OPENROUTER_BASE_URL, apiKey: sanitizeHeaderValue(process.env.OPENROUTER_API_KEY), }); log.info(`Input: ${JSON.stringify(example)}`); const context = JSON.stringify(example.input?.context ?? {}); const query = String(example.input?.query ?? 
''); const messages: OpenAI.Chat.Completions.ChatCompletionMessageParam[] = [ { role: 'system', content: SYSTEM_PROMPT }, ]; if (context) { messages.push({ role: 'user', content: `My previous interaction with the assistant: ${context}` }); } messages.push({ role: 'user', content: `${query}`, }); log.info(`Messages to model: ${JSON.stringify(messages)}`); const response = await client.chat.completions.create({ model: modelName, messages, tools: toolsOpenAI, temperature: TEMPERATURE, // Use configured temperature (0 = deterministic) }); log.info(`Model response: ${JSON.stringify(response.choices[0])}`); return { tool_calls: response.choices[0].message.tool_calls || [], llm_response: response.choices[0].message.content || '', query: String(example.input?.query ?? ''), context: String(JSON.stringify(example.input?.context ?? '{}')), reference: String(example.input?.reference ?? ''), }; }; } export function createClassifierEvaluator() { const openai = createOpenAI({ // custom settings, e.g. baseURL: process.env.OPENROUTER_BASE_URL, apiKey: process.env.OPENROUTER_API_KEY, }); return createClassifierFn({ model: openai(TOOL_SELECTION_EVAL_MODEL), choices: {correct: 1.0, incorrect: 0.0}, promptTemplate: TOOL_CALLING_BASE_TEMPLATE, }); } // LLM-based evaluator using Phoenix classifier - more robust than direct LLM calls export function createToolSelectionLLMEvaluator(tools: ToolBase[]) { const evaluator = createClassifierEvaluator(); return asEvaluator({ name: EVALUATOR_NAMES.TOOL_SELECTION_LLM, kind: 'LLM', evaluate: async ({ input, output, expected }: any) => { const evalInput = { query: input?.query || '', context: JSON.stringify(input?.context || {}), tool_calls: JSON.stringify(output?.tool_calls || []), llm_response: output?.llm_response || '', reference: expected?.reference || '', // tool_definitions: JSON.stringify(tools) }; log.info(`Evaluating tool selection. 
Input: query: ${input?.query}, context: ${JSON.stringify(input?.context || {})}, tool_calls: ${JSON.stringify(output?.tool_calls)}, llm_response: ${output?.llm_response}, tool definitions: ${JSON.stringify(tools.map((t) => t.name))}, reference: ${expected?.reference}`); try { const result = await evaluator(evalInput); log.info(`🕵 Tool selection: score: ${result.score}: ${JSON.stringify(result)}`); return { score: result.score || 0.0, explanation: result.explanation || 'No explanation returned by model' }; } catch (error) { log.info(`Tool selection evaluation failed: ${error}`); return { score: 0.0, explanation: `Evaluation failed: ${error}` }; } }, }); }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/apify/actors-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.