#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Location: ./mcp-servers/python/mcp_eval_server/validate_models.py
Copyright 2025
SPDX-License-Identifier: Apache-2.0
Authors: Mihai Criveti
Model validation and connectivity testing script.
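
This script checks which judge-provider environment variables are set, lists the
judge models that JudgeTools loads, and runs a small smoke-test evaluation against
up to three judges (always including the rule-based judge when available).

Usage:
    python validate_models.py

The process exits with status 0 if at least one judge is available and passes the
smoke test, and with status 1 otherwise.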
"""
# Standard
import asyncio
import logging
import os
import sys
from typing import Dict
# Add current directory to path for imports
sys.path.insert(0, ".")
# Third-Party
from mcp_eval_server.tools.judge_tools import JudgeTools # noqa: E402 # pylint: disable=wrong-import-position,no-name-in-module
# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", datefmt="%H:%M:%S")
logger = logging.getLogger(__name__)
def check_environment_variables() -> Dict[str, bool]:
"""Check for required environment variables.
Returns:
Dict[str, bool]: Dictionary mapping environment variable names to their presence status
"""
env_checks = {}
# OpenAI
env_checks["OPENAI_API_KEY"] = bool(os.getenv("OPENAI_API_KEY"))
env_checks["OPENAI_ORGANIZATION"] = bool(os.getenv("OPENAI_ORGANIZATION"))
env_checks["OPENAI_BASE_URL"] = bool(os.getenv("OPENAI_BASE_URL"))
# Azure OpenAI
env_checks["AZURE_OPENAI_API_KEY"] = bool(os.getenv("AZURE_OPENAI_API_KEY"))
env_checks["AZURE_OPENAI_ENDPOINT"] = bool(os.getenv("AZURE_OPENAI_ENDPOINT"))
env_checks["AZURE_DEPLOYMENT_NAME"] = bool(os.getenv("AZURE_DEPLOYMENT_NAME"))
# Anthropic
env_checks["ANTHROPIC_API_KEY"] = bool(os.getenv("ANTHROPIC_API_KEY"))
# AWS Bedrock
env_checks["AWS_ACCESS_KEY_ID"] = bool(os.getenv("AWS_ACCESS_KEY_ID"))
env_checks["AWS_SECRET_ACCESS_KEY"] = bool(os.getenv("AWS_SECRET_ACCESS_KEY"))
env_checks["AWS_REGION"] = bool(os.getenv("AWS_REGION"))
# Google Gemini
env_checks["GOOGLE_API_KEY"] = bool(os.getenv("GOOGLE_API_KEY"))
# IBM Watsonx.ai
env_checks["WATSONX_API_KEY"] = bool(os.getenv("WATSONX_API_KEY"))
env_checks["WATSONX_PROJECT_ID"] = bool(os.getenv("WATSONX_PROJECT_ID"))
env_checks["WATSONX_URL"] = bool(os.getenv("WATSONX_URL"))
# OLLAMA
env_checks["OLLAMA_BASE_URL"] = bool(os.getenv("OLLAMA_BASE_URL"))
return env_checks
async def test_judge_functionality(judge_tools: JudgeTools, judge_name: str) -> bool:
"""Test basic functionality of a judge.
Args:
judge_tools: Initialized JudgeTools instance
judge_name: Name of the judge to test
Returns:
bool: True if judge test passes, False otherwise
"""
try:
# Simple test evaluation
criteria = [{"name": "accuracy", "description": "Factual accuracy", "scale": "1-5", "weight": 1.0}]
rubric = {"criteria": [], "scale_description": {"1": "Wrong", "5": "Correct"}}
result = await judge_tools.evaluate_response(response="Paris is the capital of France.", criteria=criteria, rubric=rubric, judge_model=judge_name)
# Check if we got a valid result
return isinstance(result, dict) and "overall_score" in result and "scores" in result and result["overall_score"] > 0
except Exception as e:
logger.error(f" โ Test failed for {judge_name}: {e}")
return False
async def main():
"""Main validation function.
Returns:
bool: True if at least one judge is available and working, False otherwise
"""
logger.info("๐ MCP Eval Server - Model Validation & Connectivity Test")
logger.info("=" * 60)
# Check environment variables
logger.info("๐ Environment Variables Check:")
env_vars = check_environment_variables()
providers = {
"OpenAI": ["OPENAI_API_KEY"],
"Azure OpenAI": ["AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT"],
"Anthropic": ["ANTHROPIC_API_KEY"],
"AWS Bedrock": ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"],
"Google Gemini": ["GOOGLE_API_KEY"],
"IBM Watsonx.ai": ["WATSONX_API_KEY", "WATSONX_PROJECT_ID"],
"OLLAMA": ["OLLAMA_BASE_URL"],
}
for provider, required_vars in providers.items():
has_required = all(env_vars.get(var, False) for var in required_vars)
if provider == "OpenAI":
pass
elif provider == "Azure OpenAI":
pass
elif provider == "AWS Bedrock":
pass
status = "โ
" if has_required else "โ ๏ธ "
logger.info(f" {status} {provider}: {has_required}")
if not has_required:
missing = [var for var in required_vars if not env_vars.get(var, False)]
logger.info(f" Missing: {', '.join(missing)}")
logger.info("")
# Initialize judge tools
logger.info("โ๏ธ Initializing Judge Tools:")
judge_tools = JudgeTools()
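    # Assumption: JudgeTools discovers its judge configurations at construction time
    # (e.g. from the package's model config plus the environment variables checked above),
    # so judges may be listed even when their provider credentials are missing.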
available_judges = judge_tools.get_available_judges()
logger.info(f" Loaded {len(available_judges)} judges")
# Group judges by provider
judge_by_provider = {}
for judge_name in available_judges:
info = judge_tools.get_judge_info(judge_name)
provider = info.get("provider", "unknown")
if provider not in judge_by_provider:
judge_by_provider[provider] = []
judge_by_provider[provider].append(judge_name)
logger.info("")
logger.info("๐ Judge Models by Provider:")
for provider, judges in judge_by_provider.items():
logger.info(f" {provider.title()}: {len(judges)} judges")
for judge_name in judges:
info = judge_tools.get_judge_info(judge_name)
model_name = info.get("model_name", "N/A")
logger.info(f" โข {judge_name} โ {model_name}")
logger.info("")
# Test basic functionality
logger.info("๐งช Testing Basic Functionality:")
logger.info(" Testing with simple evaluation: 'Paris is the capital of France.'")
test_results = {}
# Always test rule-based judge first (no API key needed)
test_judges = []
if "rule-based" in available_judges:
test_judges.append("rule-based")
# Add first 2 other judges
other_judges = [j for j in available_judges if j != "rule-based"][:2]
test_judges.extend(other_judges)
for judge_name in test_judges:
logger.info(f" Testing {judge_name}...")
judge_success = await test_judge_functionality(judge_tools, judge_name)
test_results[judge_name] = judge_success
status = "โ
" if judge_success else "โ"
logger.info(f" {status} {'Passed' if judge_success else 'Failed'}")
logger.info("")
# Summary
logger.info("๐ Validation Summary:")
logger.info(f" ๐ Total judges available: {len(available_judges)}")
logger.info(f" ๐งช Judges tested: {len(test_results)}")
logger.info(f" โ
Tests passed: {sum(test_results.values())}")
logger.info(f" โ Tests failed: {len(test_results) - sum(test_results.values())}")
# Recommendations
logger.info("")
logger.info("๐ก Recommendations:")
# Check if we have at least one working judge
working_judges = [name for name, success in test_results.items() if success]
if working_judges:
logger.info(f" โ
Primary judge available: {working_judges[0]}")
else:
logger.info(" โ ๏ธ No working judges found - check API keys and connectivity")
# Provider-specific recommendations
if not env_vars.get("OPENAI_API_KEY"):
logger.info(" ๐ก For OpenAI models: export OPENAI_API_KEY='sk-...'")
if not env_vars.get("ANTHROPIC_API_KEY"):
logger.info(" ๐ก For Anthropic models: export ANTHROPIC_API_KEY='sk-ant-...'")
if not env_vars.get("AZURE_OPENAI_API_KEY"):
logger.info(" ๐ก For Azure OpenAI: export AZURE_OPENAI_API_KEY='...' and AZURE_OPENAI_ENDPOINT='...'")
# Rule-based judge always works
if "rule-based" in available_judges:
logger.info(" โ
Rule-based judge available (no API key required)")
logger.info("")
logger.info("๐ฏ Model validation completed!")
return len(available_judges) > 0 and any(test_results.values())
if __name__ == "__main__":
success = asyncio.run(main())
sys.exit(0 if success else 1)