# Model Configuration for MCP Eval Server
# Configure available judge models for evaluation
# Updated to align with agent_runtimes/langchain_agent environment variable patterns
---
models:
  openai:
    gpt-4:
      provider: "openai"
      model_name: "gpt-4"
      api_key_env: "OPENAI_API_KEY"
      organization_env: "OPENAI_ORGANIZATION"  # Changed from OPENAI_ORG_ID to match agent_runtimes
      base_url_env: "OPENAI_BASE_URL"  # Added for custom OpenAI endpoints
      default_temperature: 0.3
      max_tokens: 2000
      capabilities:
        supports_cot: true
        supports_pairwise: true
        supports_ranking: true
        supports_reference: true
        max_context_length: 8192
        optimal_temperature: 0.3
        consistency_level: "high"
    gpt-4o-mini:
      provider: "openai"
      model_name: "gpt-4o-mini"
      api_key_env: "OPENAI_API_KEY"
      organization_env: "OPENAI_ORGANIZATION"
      base_url_env: "OPENAI_BASE_URL"
      default_temperature: 0.3
      max_tokens: 2000
      capabilities:
        supports_cot: true
        supports_pairwise: true
        supports_ranking: true
        supports_reference: true
        max_context_length: 128000
        optimal_temperature: 0.3
        consistency_level: "high"
    gpt-4-turbo:
      provider: "openai"
      model_name: "gpt-4-turbo-preview"
      api_key_env: "OPENAI_API_KEY"
      organization_env: "OPENAI_ORGANIZATION"
      base_url_env: "OPENAI_BASE_URL"
      default_temperature: 0.3
      max_tokens: 4000
      capabilities:
        supports_cot: true
        supports_pairwise: true
        supports_ranking: true
        supports_reference: true
        max_context_length: 128000
        optimal_temperature: 0.3
        consistency_level: "high"
    gpt-3.5-turbo:
      provider: "openai"
      model_name: "gpt-3.5-turbo"
      api_key_env: "OPENAI_API_KEY"
      organization_env: "OPENAI_ORGANIZATION"
      base_url_env: "OPENAI_BASE_URL"
      default_temperature: 0.3
      max_tokens: 2000
      capabilities:
        supports_cot: true
        supports_pairwise: true
        supports_ranking: false  # Less reliable for complex ranking
        supports_reference: true
        max_context_length: 16384
        optimal_temperature: 0.2
        consistency_level: "medium"
azure:
gpt-5-chat:
provider: "azure"
deployment_name: "gpt-5-chat"
model_name: "gpt-4o"
api_base_env: "AZURE_OPENAI_ENDPOINT"
api_key_env: "AZURE_OPENAI_API_KEY"
api_version_env: "AZURE_OPENAI_API_VERSION"
deployment_name_env: "AZURE_DEPLOYMENT_NAME"
default_temperature: 0.3
max_tokens: 2000
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: true
supports_reference: true
max_context_length: 128000
optimal_temperature: 0.3
consistency_level: "high"
gpt-4-azure:
provider: "azure"
deployment_name: "gpt-4"
model_name: "gpt-4"
api_base_env: "AZURE_OPENAI_ENDPOINT"
api_key_env: "AZURE_OPENAI_API_KEY" # Changed to match agent_runtimes
api_version: "2024-02-15-preview" # Updated to match agent_runtimes
deployment_name_env: "AZURE_DEPLOYMENT_NAME" # Added for consistency
default_temperature: 0.3
max_tokens: 2000
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: true
supports_reference: true
max_context_length: 8192
optimal_temperature: 0.3
consistency_level: "high"
gpt-4-turbo-azure:
provider: "azure"
deployment_name: "gpt-4-turbo"
model_name: "gpt-4-turbo"
api_base_env: "AZURE_OPENAI_ENDPOINT"
api_key_env: "AZURE_OPENAI_API_KEY"
api_version: "2024-02-15-preview"
deployment_name_env: "AZURE_DEPLOYMENT_NAME"
default_temperature: 0.3
max_tokens: 4000
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: true
supports_reference: true
max_context_length: 128000
optimal_temperature: 0.3
consistency_level: "high"
gpt-35-turbo-azure:
provider: "azure"
deployment_name: "gpt-35-turbo"
model_name: "gpt-3.5-turbo"
api_base_env: "AZURE_OPENAI_ENDPOINT"
api_key_env: "AZURE_OPENAI_API_KEY"
api_version: "2024-02-15-preview"
deployment_name_env: "AZURE_DEPLOYMENT_NAME"
default_temperature: 0.3
max_tokens: 2000
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: false
supports_reference: true
max_context_length: 16384
optimal_temperature: 0.2
consistency_level: "medium"
anthropic:
claude-3-sonnet:
provider: "anthropic"
model_name: "claude-3-sonnet-20240229"
api_key_env: "ANTHROPIC_API_KEY"
default_temperature: 0.3
max_tokens: 2000
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: true
supports_reference: true
max_context_length: 200000
optimal_temperature: 0.3
consistency_level: "high"
claude-3-haiku:
provider: "anthropic"
model_name: "claude-3-haiku-20240307"
api_key_env: "ANTHROPIC_API_KEY"
default_temperature: 0.3
max_tokens: 2000
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: true
supports_reference: true
max_context_length: 200000
optimal_temperature: 0.3
consistency_level: "high"
claude-3-opus:
provider: "anthropic"
model_name: "claude-3-opus-20240229"
api_key_env: "ANTHROPIC_API_KEY"
default_temperature: 0.3
max_tokens: 2000
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: true
supports_reference: true
max_context_length: 200000
optimal_temperature: 0.3
consistency_level: "high"
bedrock:
claude-3-sonnet-bedrock:
provider: "bedrock"
model_id: "anthropic.claude-3-sonnet-20240229-v1:0"
model_name: "claude-3-sonnet"
aws_access_key_env: "AWS_ACCESS_KEY_ID"
aws_secret_key_env: "AWS_SECRET_ACCESS_KEY"
aws_region_env: "AWS_REGION"
default_temperature: 0.3
max_tokens: 2000
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: true
supports_reference: true
max_context_length: 200000
optimal_temperature: 0.3
consistency_level: "high"
claude-3-haiku-bedrock:
provider: "bedrock"
model_id: "anthropic.claude-3-haiku-20240307-v1:0"
model_name: "claude-3-haiku"
aws_access_key_env: "AWS_ACCESS_KEY_ID"
aws_secret_key_env: "AWS_SECRET_ACCESS_KEY"
aws_region_env: "AWS_REGION"
default_temperature: 0.3
max_tokens: 2000
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: true
supports_reference: true
max_context_length: 200000
optimal_temperature: 0.3
consistency_level: "high"
ollama:
llama2-7b:
provider: "ollama"
model_name: "llama2:7b"
base_url_env: "OLLAMA_BASE_URL"
default_temperature: 0.3
max_tokens: 2000
request_timeout: 60 # OLLAMA can be slower
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: false # May be less reliable
supports_reference: true
max_context_length: 4096
optimal_temperature: 0.3
consistency_level: "medium"
llama3-8b:
provider: "ollama"
model_name: "llama3:8b"
base_url_env: "OLLAMA_BASE_URL"
default_temperature: 0.3
max_tokens: 2000
request_timeout: 60
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: false
supports_reference: true
max_context_length: 8192
optimal_temperature: 0.3
consistency_level: "medium"
mistral-7b:
provider: "ollama"
model_name: "mistral:7b"
base_url_env: "OLLAMA_BASE_URL"
default_temperature: 0.3
max_tokens: 2000
request_timeout: 60
capabilities:
supports_cot: true
supports_pairwise: true
supports_ranking: false
supports_reference: true
max_context_length: 8192
optimal_temperature: 0.3
consistency_level: "medium"
# Default model selection preferences
defaults:
  primary_judge: "gpt-4o-mini"  # Updated to match agent_runtimes default
  fallback_judge: "gpt-3.5-turbo"
  fast_judge: "gpt-4o-mini"  # Fast and efficient
  consensus_judges: ["gpt-4", "claude-3-sonnet", "gpt-4-turbo"]  # Multi-provider consensus
# Model usage recommendations
recommendations:
  high_stakes_evaluation: ["gpt-4", "claude-3-opus", "gpt-4-turbo"]
  batch_processing: ["gpt-4o-mini", "gpt-3.5-turbo", "claude-3-haiku"]
  complex_reasoning: ["gpt-4-turbo", "claude-3-opus", "gpt-4"]
  cost_effective: ["gpt-4o-mini", "claude-3-haiku", "llama3-8b"]
  multilingual: ["gpt-4", "claude-3-sonnet", "gpt-4-turbo"]
  open_source: ["llama3-8b", "mistral-7b", "llama2-7b"]
  cloud_agnostic: ["claude-3-sonnet-bedrock", "gpt-4-azure"]