/**
* Configuration for Apify MCP Server evaluations.
*/
import { readFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
/**
 * Reads the `version` field from the `test-cases.json` file located next to this module.
 *
 * The parsed JSON is validated instead of being trusted: a missing or malformed
 * `version` would otherwise be returned as-is (e.g. `undefined`) and end up
 * stringified inside whatever name is built from it.
 *
 * @returns The version as a string; a numeric version is converted with `String()`.
 * @throws {Error} If the file cannot be read, does not contain valid JSON, or has no
 * usable `version` field (non-empty string or finite number).
 */
function getTestCasesVersion(): string {
    // ES modules have no __dirname, so the module directory is derived from import.meta.url.
    const moduleDir = dirname(fileURLToPath(import.meta.url));
    const testCasesPath = join(moduleDir, 'test-cases.json');
    const parsed: unknown = JSON.parse(readFileSync(testCasesPath, 'utf-8'));

    const version = typeof parsed === 'object' && parsed !== null
        ? (parsed as { version?: unknown }).version
        : undefined;

    if (typeof version === 'number' && Number.isFinite(version)) return String(version);
    if (typeof version === 'string' && version.trim() !== '') return version;

    throw new Error(`Invalid or missing "version" field in ${testCasesPath}`);
}
/**
 * String identifiers of the evaluators.
 *
 * Declared `as const` so the values keep their literal types, which is what
 * lets `EvaluatorName` below be a union of exactly these strings.
 *
 * NOTE(review): the key `TOOLS_EXACT_MATCH` is plural while its value
 * 'tool-exact-match' is singular - confirm the mismatch is intentional.
 */
export const EVALUATOR_NAMES = {
TOOLS_EXACT_MATCH: 'tool-exact-match',
TOOL_SELECTION_LLM: 'tool-selection-llm',
} as const;
/** Union of the string values of `EVALUATOR_NAMES`. */
export type EvaluatorName = typeof EVALUATOR_NAMES[keyof typeof EVALUATOR_NAMES];
// Models to evaluate, written as 'provider/model' identifiers.
//
// Entries kept for reference but not evaluated (commented out):
// 'openai/gpt-4.1-mini', // DO NOT USE - it has much worse performance than gpt-4o-mini and other models
// 'openai/gpt-4o-mini', // Used in neither Cursor nor Copilot
// 'openai/gpt-4.1',
//
// NOTE(review): 'openai/gpt-4o-mini' appears in the commented-out list above but is
// also an active entry in the array below - confirm which of the two is intended.
export const MODELS_TO_EVALUATE = [
'anthropic/claude-haiku-4.5',
// 'anthropic/claude-sonnet-4.5',
'google/gemini-2.5-flash',
// 'google/gemini-2.5-pro',
'openai/gpt-5',
// 'openai/gpt-5-mini',
'openai/gpt-4o-mini',
];
// Model identifier for the tool-selection evaluation.
// NOTE(review): presumably the judge model behind the 'tool-selection-llm' evaluator - confirm against its usage.
export const TOOL_SELECTION_EVAL_MODEL = 'openai/gpt-4.1';
// Pass threshold. How it is applied is decided by the code that imports it.
// NOTE(review): presumably the minimum fraction of passing results (0.7 = 70 %) - verify against the caller.
export const PASS_THRESHOLD = 0.7;
// LLM sampling parameters
// Temperature = 0 provides deterministic, focused responses
export const TEMPERATURE = 0;
// Dataset name, suffixed with the version that getTestCasesVersion() reads from test-cases.json.
// The template literal is evaluated once, when this module is loaded, so the file read happens at import time.
export const DATASET_NAME = `mcp_server_dataset_v${getTestCasesVersion()}`;
// System prompt. The instructions are taken mainly from Cursor's agent prompt;
// the Copilot (VS Code agent) prompt contains very similar instructions.
// Sources:
// https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools/blob/main/Cursor%20Prompts/Agent%20Prompt%20v1.2.txt
// https://github.com/x1xhlol/system-prompts-and-models-of-ai-tools/blob/main/VSCode%20Agent/Prompt.txt
// Note: the template literal starts and ends with a newline, which is part of the string.
export const SYSTEM_PROMPT = `
You are a helpful assistant with a set of tools.
Follow these rules regarding tool calls:
1. ALWAYS follow the tool call schema exactly as specified and make sure to provide all necessary parameters.
2. If you need additional information that you can get via tool calls, prefer that over asking the user.
3. Only use the standard tool call format and the available tools.
`;
// Should TOOL DEFINITIONS be included in the evaluation prompt?
// Including tool definitions significantly increases the prompt size and can affect evaluation results.
// A change to a tool definition may leave the correctness of a tool call untouched and still alter
// the evaluation outcome, which can lead to inconsistent or circular evaluation results.
// The template below has no placeholder for tool definitions.
//
// Wording kept for reference - PROMPT variant with tool definitions:
//
// "incorrect" means that the chosen tool was not correct
// or that the tool signature includes parameter values that don't match
// the formats specified in the tool definitions below.
//
// You must not use any outside information or make assumptions.
// Base your decision solely on the information provided in [BEGIN DATA] ... [END DATA],
// the [Tool Definitions], and the [Reference instructions] (if provided).
/**
 * Prompt template that asks a model to judge whether the tool calls made for a
 * user query were the right ones, answering "correct" or "incorrect".
 *
 * Placeholders (double curly braces) to be substituted before use:
 * {{context}}, {{query}}, {{tool_calls}}, {{llm_response}}, {{reference}}.
 *
 * NOTE(review): the template first demands a single-word answer with no extra text and
 * later, under "Output Format", a two-line "Decision: ... / Explanation: ..." response.
 * These instructions conflict - confirm which format the response parser expects
 * before changing the wording, since any edit to this text can shift evaluation results.
 * NOTE(review): the data section labels the field "[REFERENCE INSTRUCTIONS]" while the
 * rules refer to "[Reference instructions]" - confirm the casing difference is harmless.
 */
export const TOOL_CALLING_BASE_TEMPLATE = `
You are an evaluation assistant responsible for assessing user queries and corresponding tool calls to
determine whether the correct tool was selected and if the tool choice appropriately matches the user's request
Tool calls are generated by a separate agent and chosen from a provided list of tools.
You must judge whether this agent made the correct selection.
## Important tool context
**search-actors**: Searches the Apify Store to find scraping tools/Actors (NOT celebrity actors). This finds pre-built scraping solutions.
- Use when query mentions: "Actor", "tool", "scraper", or asks about finding/discovering scraping capabilities
- Example: "Find an Actor for Instagram" or "What tools scrape Twitter?"
**apify-slash-rag-web-browser**: Browses the web to get data immediately (one-time data retrieval).
- Use when query has time indicators ("today", "recent", "current", "latest") or asks for immediate data
- Example: "Get flight prices for tomorrow" or "What's the current weather?"
**call-actor**: Has a mandatory two-step workflow: step="info" first (gets Actor details), then step="call" (runs Actor).
- Calling with step="info" is CORRECT and required before execution
- Do NOT penalize the info step - it's part of the normal workflow
**fetch-actor-details**: Gets Actor documentation without running it. Overlaps with call-actor step="info".
- Both fetch-actor-details AND call-actor step="info" are valid for getting Actor parameters/details
**search-apify-docs**: Searches Apify documentation for general info about Apify platform/features.
- Use when query asks about Apify concepts, features, or how to use the platform
- Searches across all documentation to find relevant pages
- Example: "How to create an Apify Actor?" or "What is Apify Proxy?"
**get-actor-output**: Retrieves the output data (results) from a completed Actor run using its datasetId.
- Use when query asks to get/fetch/retrieve data from a previous Actor execution
- Returns the actual scraped data, not Actor documentation
- Example: "Get the data from my last Actor run" or "Show me the results from dataset abc123"
**fetch-apify-docs**: Fetches the full content of a specific Apify documentation page by its URL.
- Use when user provides a specific docs URL they want to read
- Different from search-apify-docs which searches across all documentation
- Example: "Fetch https://docs.apify.com/platform/actors/running" or "Show me the content of this docs page"
## Keyword Length Guidelines
- Short, specific keywords (1-20 chars) are ideal: "Instagram", "Twitter posts", "Amazon"
- Multiple specific searches are BETTER than one generic search (e.g., searching "Instagram", "Twitter", "TikTok" separately is better than "social media")
- Only penalize if keywords are >100 chars or clearly irrelevant/off-topic
- Do NOT penalize thoughtful additions like date filters or specific platforms
[BEGIN DATA]
************
[User's previous interaction with the assistant]: {{context}}
[User query]: {{query}}
************
[LLM decided to call these tools]: {{tool_calls}}
[LLM response]: {{llm_response}}
************
[REFERENCE INSTRUCTIONS]: {{reference}}
[END DATA]
DECISION: [correct or incorrect]
EXPLANATION: [Super short explanation of why the tool choice was correct or incorrect]
Your answer must consist of a single word: "correct" or "incorrect".
No extra text, symbols, or formatting is allowed.
"correct" means the agent selected the correct tool, extracted the proper parameters from the query,
crafted a runnable and accurate tool call, and used only information present in the query or context.
"incorrect" means the selected tool was not appropriate, or if any tool parameters do not match the expected signature,
or if reference instructions were not properly followed.
Do not use external knowledge or make assumptions.
Make your decision strictly based on the information within [BEGIN DATA] and [END DATA].
If [Reference instructions] are included, they specify requirements for tool usage.
If the tool call does not conform, the answer must be "incorrect".
## Output Format
The response must be exactly:
Decision: either "correct" or "incorrect".
Explanation: brief explanation of the decision.
`
/**
 * Takes a snapshot of the environment variables this module treats as required.
 *
 * @returns A map from variable name to its current value in `process.env`;
 * a variable that is not set is still present as a key, with value `undefined`.
 */
export function getRequiredEnvVars(): Record<string, string | undefined> {
    const requiredNames = [
        'PHOENIX_BASE_URL',
        'PHOENIX_API_KEY',
        'OPENROUTER_API_KEY',
        'OPENROUTER_BASE_URL',
    ];
    const snapshot: Record<string, string | undefined> = {};
    for (const name of requiredNames) {
        // Assign even when unset so every required name shows up as a key.
        snapshot[name] = process.env[name];
    }
    return snapshot;
}
/**
 * Cleans a string so it can be used as an HTTP header value (e.g. `Authorization`).
 *
 * CI secrets sometimes arrive with trailing newlines or wrapped in double quotes.
 * This removes every CR/LF character, trims surrounding whitespace, strips one
 * double quote from the start and/or the end, and trims again so that whitespace
 * which sat inside the quotes does not survive (`'" token "'` becomes `'token'`).
 *
 * @param value - Raw value; may be `undefined`.
 * @returns The cleaned value, or the input unchanged when it is `null`/`undefined`.
 */
export function sanitizeHeaderValue(value?: string): string | undefined {
    if (value == null) return value;
    return value
        .replace(/[\r\n]/g, '')
        .trim()
        // Each end is handled independently, so an unmatched quote is removed as well.
        .replace(/^"|"$/g, '')
        // Second trim: whitespace that was enclosed by the quotes is now at the edges.
        .trim();
}
/**
 * Checks that every variable reported by `getRequiredEnvVars()` has a non-empty value.
 *
 * When any are unset or empty, their names are written to `console.error`
 * in a single message.
 *
 * @returns `true` when all required variables are present, `false` otherwise.
 */
export function validateEnvVars(): boolean {
    const absent: string[] = [];
    for (const [name, current] of Object.entries(getRequiredEnvVars())) {
        // Falsy check on purpose: an empty string counts as missing, same as undefined.
        if (!current) absent.push(name);
    }
    if (absent.length === 0) return true;
    // eslint-disable-next-line no-console
    console.error(`Missing required environment variables: ${absent.join(', ')}`);
    return false;
}