/**
* Configuration for Apify MCP Server evaluations.
*/
import { readFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
/**
 * Reads the `version` field from the `test-cases.json` file that sits next to this module.
 *
 * The parsed JSON is treated as `unknown` and checked before use, so a missing or malformed
 * `version` fails loudly instead of silently producing a value such as `undefined`.
 *
 * @returns The version as a string (a numeric version is converted with `String`).
 * @throws Error when the file cannot be read or parsed, or when `version` is neither a
 * non-empty string nor a finite number.
 */
function getTestCasesVersion(): string {
    // Resolve the JSON file relative to this module, not to the process working directory.
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    const testCasesPath = join(moduleDir, 'test-cases.json');

    const parsed: unknown = JSON.parse(readFileSync(testCasesPath, 'utf-8'));
    const version = typeof parsed === 'object' && parsed !== null
        ? (parsed as { version?: unknown }).version
        : undefined;

    if (typeof version === 'string' && version.length > 0) return version;
    if (typeof version === 'number' && Number.isFinite(version)) return String(version);

    throw new Error(`Missing or invalid "version" field in ${testCasesPath}`);
}
/**
 * Identifiers of the available evaluators, keyed by constant name.
 * Declared `as const` so that every value keeps its string-literal type.
 */
export const EVALUATOR_NAMES = {
    TOOLS_EXACT_MATCH: 'tool-exact-match',
    TOOL_SELECTION_LLM: 'tool-selection-llm',
} as const;

/** Union of the evaluator identifiers listed in `EVALUATOR_NAMES`. */
export type EvaluatorName = (typeof EVALUATOR_NAMES)[keyof typeof EVALUATOR_NAMES];
// Models to evaluate
// Models that were considered and left out of the list, with the reason where one was recorded:
// 'openai/gpt-4.1-mini', // DO NOT USE - it has much worse performance than gpt-4o-mini and other models
// 'openai/gpt-4o-mini', // Neither used in cursor nor copilot
// 'openai/gpt-4.1',
// NOTE(review): 'openai/gpt-4o-mini' is described above as left out, yet it is enabled as the
// last entry of the array below - confirm which of the two is intended.
/**
 * Model identifiers, in `provider/model` form, that the evaluation covers.
 * Entries commented out inside the array are currently disabled candidates.
 */
export const MODELS_TO_EVALUATE = [
'anthropic/claude-haiku-4.5',
// 'anthropic/claude-sonnet-4.5',
'google/gemini-2.5-flash',
// 'google/gemini-2.5-pro',
'openai/gpt-5',
// 'openai/gpt-5-mini',
'openai/gpt-4o-mini',
];
/**
 * Model identifier used for the tool-selection evaluation.
 * NOTE(review): presumably the judge model behind the `tool-selection-llm` evaluator - confirm against the caller.
 */
export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4.1';
/**
 * Pass threshold for evaluation results.
 * NOTE(review): presumably the minimum score/ratio (0-1) an evaluation must reach to pass - confirm against the caller.
 */
export const PASS_THRESHOLD = 0.7;
// LLM sampling parameters
// Temperature = 0 keeps responses focused and as repeatable as possible
// (strict determinism is up to the model provider).
export const TEMPERATURE = 0;
/**
 * Name of the evaluation dataset. The suffix is the version read from `test-cases.json`,
 * which is read synchronously when this module is loaded.
 */
export const DATASET_NAME = `mcp_server_dataset_v${getTestCasesVersion()}`;
// System prompt - the instructions are taken mainly from Cursor (Copilot uses very similar instructions)
// https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools/blob/main/Cursor%20Prompts/Agent%20Prompt%20v1.2.txt
// https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools/blob/main/VSCode%20Agent/Prompt.txt
/**
 * System prompt describing a tool-using assistant and three rules for making tool calls.
 * The literal starts and ends with a newline character; callers that need a tight string
 * have to trim it themselves.
 */
export const SYSTEM_PROMPT = `
You are a helpful assistant with a set of tools.
Follow these rules regarding tool calls:
1. ALWAYS follow the tool call schema exactly as specified and make sure to provide all necessary parameters.
2. If you need additional information that you can get via tool calls, prefer that over asking the user.
3. Only use the standard tool call format and the available tools.
`;
// Should TOOL DEFINITIONS be included in the prompt?
// Including tool definitions significantly increases the prompt size and can affect evaluation results.
// Changing a tool definition may not impact tool call correctness, but it can alter the evaluation outcome.
// This can lead to inconsistent or circular evaluation results.
//
// For reference, this is the wording the PROMPT would use WITH tool definitions:
//
// "incorrect" means that the chosen tool was not correct
// or that the tool signature includes parameter values that don't match
// the formats specified in the tool definitions below.
//
// You must not use any outside information or make assumptions.
// Base your decision solely on the information provided in [BEGIN DATA] ... [END DATA],
// the [Tool Definitions], and the [Reference instructions] (if provided).
/**
 * Prompt template for the evaluation assistant that judges whether the agent selected the
 * correct tool for the user's request. This version does not embed tool definitions.
 *
 * Placeholders written as `{{name}}` inside the template: `context`, `query`, `tool_calls`,
 * `llm_response` and `reference`. They are presumably substituted by the caller before the
 * prompt is sent - confirm against the evaluator code.
 *
 * NOTE(review): the template demands a single-word answer with no extra text, but its
 * "Output Format" section asks for a "Decision:" line plus an "Explanation:" line. Confirm
 * which format the response parser expects before editing either part.
 * NOTE(review): the data section labels the field "[REFERENCE INSTRUCTIONS]" while the rules
 * refer to it as "[Reference instructions]".
 */
export const TOOL_CALLING_BASE_TEMPLATE = `
You are an evaluation assistant responsible for assessing user queries and corresponding tool calls to
determine whether the correct tool was selected and if the tool choice appropriately matches the user's request
Tool calls are generated by a separate agent and chosen from a provided list of tools.
You must judge whether this agent made the correct selection.
[BEGIN DATA]
************
[User's previous interaction with the assistant]: {{context}}
[User query]: {{query}}
************
[LLM decided to call these tools]: {{tool_calls}}
[LLM response]: {{llm_response}}
************
[REFERENCE INSTRUCTIONS]: {{reference}}
[END DATA]
DECISION: [correct or incorrect]
EXPLANATION: [Super short explanation of why the tool choice was correct or incorrect]
Your answer must consist of a single word: "correct" or "incorrect".
No extra text, symbols, or formatting is allowed.
"correct" means the agent selected the correct tool, extracted the proper parameters from the query,
crafted a runnable and accurate tool call, and used only information present in the query or context.
"incorrect" means the selected tool was not appropriate, or if any tool parameters do not match the expected signature,
or if reference instructions were not properly followed.
Do not use external knowledge or make assumptions.
Make your decision strictly based on the information within [BEGIN DATA] and [END DATA].
If [Reference instructions] are included, they specify requirements for tool usage.
If the tool call does not conform, the answer must be "incorrect".
## Output Format
The response must be exactly:
Decision: either "correct" or "incorrect".
Explanation: brief explanation of the decision.
`
/**
 * Takes a snapshot of the environment variables the evaluations depend on.
 *
 * @returns An object keyed by variable name; a value is `undefined` when the variable is not set.
 */
export function getRequiredEnvVars(): Record<string, string | undefined> {
    const {
        PHOENIX_BASE_URL,
        PHOENIX_API_KEY,
        OPENROUTER_API_KEY,
        OPENROUTER_BASE_URL,
    } = process.env;

    return { PHOENIX_BASE_URL, PHOENIX_API_KEY, OPENROUTER_API_KEY, OPENROUTER_BASE_URL };
}
/**
 * Cleans a raw header value, e.g. an Authorization token read from a CI secret, which
 * sometimes carries trailing newlines or surrounding quotes.
 *
 * Steps, in order: drop every CR/LF character, trim surrounding whitespace, strip one double
 * quote from the start and/or the end, then trim again so that whitespace that was enclosed
 * by the quotes does not end up in the header.
 *
 * @param value - Raw value; may be omitted when the variable is not set.
 * @returns The cleaned value, or the input unchanged when it is `null`/`undefined`.
 */
export function sanitizeHeaderValue(value?: string): string | undefined {
    if (value == null) return value;
    return value
        .replace(/[\r\n]/g, '') // CR/LF must never reach a header value
        .trim()
        .replace(/^"|"$/g, '') // a quote is stripped even when it has no matching partner
        .trim(); // whitespace that sat inside the quotes
}
/**
 * Checks that every variable reported by `getRequiredEnvVars` has a non-empty value.
 * When some are missing, their names are written to stderr in a single message.
 *
 * @returns `true` when all variables are set, `false` otherwise.
 */
export function validateEnvVars(): boolean {
    const missing: string[] = [];
    for (const [name, value] of Object.entries(getRequiredEnvVars())) {
        // Unset and empty-string values both count as missing.
        if (!value) missing.push(name);
    }

    if (missing.length === 0) return true;

    // eslint-disable-next-line no-console
    console.error(`Missing required environment variables: ${missing.join(', ')}`);
    return false;
}