"""
Test cases for model benchmarking functionality.
This module tests the benchmark handler, which compares multiple AI models by
sending them the same prompt and analyzing their responses.
"""
import pytest
from unittest.mock import Mock, patch, AsyncMock
from datetime import datetime, timezone
from src.openrouter_mcp.handlers.benchmark import (
BenchmarkHandler,
BenchmarkResult,
ModelComparison,
BenchmarkMetrics
)
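# NOTE: the async tests below use @pytest.mark.asyncio, which requires the
# pytest-asyncio plugin.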
class TestBenchmarkResult:
"""Test the BenchmarkResult data class."""
def test_benchmark_result_creation(self):
"""Test creating a benchmark result."""
result = BenchmarkResult(
model_id="openai/gpt-4",
prompt="Test prompt",
response="Test response",
response_time_ms=150.5,
tokens_used=25,
cost=0.0015,
timestamp=datetime.now(timezone.utc)
)
assert result.model_id == "openai/gpt-4"
assert result.prompt == "Test prompt"
assert result.response == "Test response"
assert result.response_time_ms == 150.5
assert result.tokens_used == 25
assert result.cost == 0.0015
assert isinstance(result.timestamp, datetime)
def test_benchmark_result_to_dict(self):
"""Test converting benchmark result to dictionary."""
timestamp = datetime.now(timezone.utc)
result = BenchmarkResult(
model_id="anthropic/claude-3-opus",
prompt="Explain quantum computing",
response="Quantum computing uses quantum bits...",
response_time_ms=200.0,
tokens_used=50,
cost=0.003,
timestamp=timestamp,
error=None
)
result_dict = result.to_dict()
assert result_dict["model_id"] == "anthropic/claude-3-opus"
assert result_dict["prompt"] == "Explain quantum computing"
assert result_dict["response"] == "Quantum computing uses quantum bits..."
assert result_dict["response_time_ms"] == 200.0
assert result_dict["tokens_used"] == 50
assert result_dict["cost"] == 0.003
assert result_dict["timestamp"] == timestamp.isoformat()
assert result_dict["error"] is None
def test_benchmark_result_with_error(self):
"""Test benchmark result with error."""
result = BenchmarkResult(
model_id="openai/gpt-3.5-turbo",
prompt="Test prompt",
response=None,
response_time_ms=0,
tokens_used=0,
cost=0,
timestamp=datetime.now(timezone.utc),
error="API rate limit exceeded"
)
assert result.error == "API rate limit exceeded"
assert result.response is None
assert result.tokens_used == 0
class TestBenchmarkMetrics:
"""Test the BenchmarkMetrics class."""
def test_metrics_calculation(self):
"""Test calculating metrics from benchmark results."""
results = [
BenchmarkResult(
model_id="model1",
prompt="test",
response="response1",
response_time_ms=100,
tokens_used=20,
cost=0.001,
timestamp=datetime.now(timezone.utc)
),
BenchmarkResult(
model_id="model1",
prompt="test",
response="response2",
response_time_ms=150,
tokens_used=25,
cost=0.0015,
timestamp=datetime.now(timezone.utc)
)
]
metrics = BenchmarkMetrics.from_results(results)
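        # Expected values, worked out from the two results above:
        #   avg_response_time_ms = (100 + 150) / 2 = 125.0
        #   avg_tokens_used      = (20 + 25) / 2 = 22.5
        #   avg_cost             = (0.001 + 0.0015) / 2 = 0.00125
        #   total_cost           = 0.001 + 0.0015 = 0.0025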
assert metrics.avg_response_time_ms == 125.0
assert metrics.avg_tokens_used == 22.5
assert metrics.avg_cost == 0.00125
assert metrics.total_cost == 0.0025
assert metrics.success_rate == 1.0
assert metrics.sample_count == 2
def test_metrics_with_errors(self):
"""Test metrics calculation with some errors."""
results = [
BenchmarkResult(
model_id="model1",
prompt="test",
response="response1",
response_time_ms=100,
tokens_used=20,
cost=0.001,
timestamp=datetime.now(timezone.utc)
),
BenchmarkResult(
model_id="model1",
prompt="test",
response=None,
response_time_ms=0,
tokens_used=0,
cost=0,
timestamp=datetime.now(timezone.utc),
error="API error"
)
]
metrics = BenchmarkMetrics.from_results(results)
assert metrics.success_rate == 0.5
assert metrics.sample_count == 2
# Averages should only include successful results
assert metrics.avg_response_time_ms == 100.0
assert metrics.avg_tokens_used == 20.0
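        # TODO(assumption): the all-failures edge case (success_rate == 0.0 and
        # undefined averages) is not pinned down by these tests.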
class TestModelComparison:
"""Test the ModelComparison class."""
def test_model_comparison_creation(self):
"""Test creating a model comparison."""
results = {
"openai/gpt-4": [
BenchmarkResult(
model_id="openai/gpt-4",
prompt="test",
response="response",
response_time_ms=200,
tokens_used=30,
cost=0.002,
timestamp=datetime.now(timezone.utc)
)
],
"anthropic/claude-3-opus": [
BenchmarkResult(
model_id="anthropic/claude-3-opus",
prompt="test",
response="response",
response_time_ms=150,
tokens_used=25,
cost=0.0015,
timestamp=datetime.now(timezone.utc)
)
]
}
comparison = ModelComparison(
prompt="test",
models=["openai/gpt-4", "anthropic/claude-3-opus"],
results=results,
timestamp=datetime.now(timezone.utc)
)
assert comparison.prompt == "test"
assert len(comparison.models) == 2
assert "openai/gpt-4" in comparison.models
assert len(comparison.results) == 2
def test_model_comparison_rankings(self):
"""Test getting model rankings from comparison."""
results = {
"openai/gpt-4": [
BenchmarkResult(
model_id="openai/gpt-4",
prompt="test",
response="response",
response_time_ms=200,
tokens_used=30,
cost=0.002,
timestamp=datetime.now(timezone.utc)
)
],
"anthropic/claude-3-opus": [
BenchmarkResult(
model_id="anthropic/claude-3-opus",
prompt="test",
response="response",
response_time_ms=150,
tokens_used=25,
cost=0.0015,
timestamp=datetime.now(timezone.utc)
)
]
}
comparison = ModelComparison(
prompt="test",
models=["openai/gpt-4", "anthropic/claude-3-opus"],
results=results,
timestamp=datetime.now(timezone.utc)
)
rankings = comparison.get_rankings()
# Claude should be ranked first (faster and cheaper)
assert rankings["speed"][0]["model"] == "anthropic/claude-3-opus"
assert rankings["cost"][0]["model"] == "anthropic/claude-3-opus"
assert rankings["speed"][0]["metric"] == 150
assert rankings["cost"][0]["metric"] == 0.0015
class TestBenchmarkHandler:
"""Test the BenchmarkHandler class."""
@pytest.fixture
def handler(self):
"""Create a benchmark handler instance."""
# Use a mock API key for testing
return BenchmarkHandler(api_key="test-api-key")
@pytest.fixture
def mock_client(self):
"""Create a mock OpenRouter client."""
client = Mock()
client.chat_completion = AsyncMock()
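        # AsyncMock makes chat_completion awaitable, matching the real async client.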
return client
@pytest.mark.asyncio
async def test_benchmark_single_model(self, handler, mock_client):
"""Test benchmarking a single model."""
# Mock the response
mock_client.chat_completion.return_value = {
"choices": [{"message": {"content": "Test response"}}],
"usage": {"total_tokens": 25}
}
handler.client = mock_client
result = await handler.benchmark_model(
model_id="openai/gpt-4",
prompt="Test prompt",
temperature=0.7,
max_tokens=100
)
assert result.model_id == "openai/gpt-4"
assert result.prompt == "Test prompt"
assert result.response == "Test response"
assert result.tokens_used == 25
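        # response_time_ms is measured by the handler itself (the client is
        # mocked), so we only assert that it is positive rather than pin a value.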
assert result.response_time_ms > 0
mock_client.chat_completion.assert_called_once()
@pytest.mark.asyncio
async def test_benchmark_multiple_models(self, handler, mock_client):
"""Test benchmarking multiple models."""
# Mock different responses for different models
mock_client.chat_completion.side_effect = [
{
"choices": [{"message": {"content": "GPT-4 response"}}],
"usage": {"total_tokens": 30}
},
{
"choices": [{"message": {"content": "Claude response"}}],
"usage": {"total_tokens": 25}
}
]
handler.client = mock_client
comparison = await handler.benchmark_models(
models=["openai/gpt-4", "anthropic/claude-3-opus"],
prompt="Compare these models",
temperature=0.7,
max_tokens=100,
runs_per_model=1
)
assert comparison.prompt == "Compare these models"
assert len(comparison.models) == 2
assert "openai/gpt-4" in comparison.results
assert "anthropic/claude-3-opus" in comparison.results
gpt4_results = comparison.results["openai/gpt-4"]
assert len(gpt4_results) == 1
assert gpt4_results[0].response == "GPT-4 response"
claude_results = comparison.results["anthropic/claude-3-opus"]
assert len(claude_results) == 1
assert claude_results[0].response == "Claude response"
@pytest.mark.asyncio
async def test_benchmark_with_multiple_runs(self, handler, mock_client):
"""Test benchmarking with multiple runs per model."""
# Mock multiple responses
mock_client.chat_completion.side_effect = [
{
"choices": [{"message": {"content": f"Response {i}"}}],
"usage": {"total_tokens": 20 + i}
}
for i in range(3)
]
handler.client = mock_client
comparison = await handler.benchmark_models(
models=["openai/gpt-4"],
prompt="Test multiple runs",
temperature=0.7,
max_tokens=100,
runs_per_model=3
)
assert len(comparison.results["openai/gpt-4"]) == 3
for i, result in enumerate(comparison.results["openai/gpt-4"]):
assert result.response == f"Response {i}"
assert result.tokens_used == 20 + i
@pytest.mark.asyncio
async def test_benchmark_with_error_handling(self, handler, mock_client):
"""Test benchmark with error handling."""
# Mock an error for one model
mock_client.chat_completion.side_effect = [
{
"choices": [{"message": {"content": "Success"}}],
"usage": {"total_tokens": 20}
},
Exception("API Error"),
]
handler.client = mock_client
comparison = await handler.benchmark_models(
models=["openai/gpt-4", "anthropic/claude-3-opus"],
prompt="Test error handling",
temperature=0.7,
max_tokens=100,
runs_per_model=1
)
# GPT-4 should succeed
assert comparison.results["openai/gpt-4"][0].error is None
assert comparison.results["openai/gpt-4"][0].response == "Success"
# Claude should have an error
assert comparison.results["anthropic/claude-3-opus"][0].error is not None
assert "API Error" in comparison.results["anthropic/claude-3-opus"][0].error
assert comparison.results["anthropic/claude-3-opus"][0].response is None
@pytest.mark.asyncio
async def test_save_and_load_comparison(self, handler, tmp_path):
"""Test saving and loading benchmark comparisons."""
# Create a comparison
comparison = ModelComparison(
prompt="Test prompt",
models=["model1", "model2"],
results={
"model1": [
BenchmarkResult(
model_id="model1",
prompt="Test prompt",
response="Response 1",
response_time_ms=100,
tokens_used=20,
cost=0.001,
timestamp=datetime.now(timezone.utc)
)
],
"model2": [
BenchmarkResult(
model_id="model2",
prompt="Test prompt",
response="Response 2",
response_time_ms=150,
tokens_used=25,
cost=0.0015,
timestamp=datetime.now(timezone.utc)
)
]
},
timestamp=datetime.now(timezone.utc)
)
# Save comparison
file_path = tmp_path / "benchmark_test.json"
handler.save_comparison(comparison, str(file_path))
# Load comparison
loaded = handler.load_comparison(str(file_path))
assert loaded.prompt == comparison.prompt
assert loaded.models == comparison.models
assert len(loaded.results) == len(comparison.results)
assert loaded.results["model1"][0].response == "Response 1"
assert loaded.results["model2"][0].response == "Response 2"
def test_format_comparison_report(self, handler):
"""Test formatting comparison report."""
comparison = ModelComparison(
prompt="Test prompt",
models=["openai/gpt-4", "anthropic/claude-3-opus"],
results={
"openai/gpt-4": [
BenchmarkResult(
model_id="openai/gpt-4",
prompt="Test prompt",
response="GPT-4 response",
response_time_ms=200,
tokens_used=30,
cost=0.002,
timestamp=datetime.now(timezone.utc)
)
],
"anthropic/claude-3-opus": [
BenchmarkResult(
model_id="anthropic/claude-3-opus",
prompt="Test prompt",
response="Claude response",
response_time_ms=150,
tokens_used=25,
cost=0.0015,
timestamp=datetime.now(timezone.utc)
)
]
},
timestamp=datetime.now(timezone.utc)
)
report = handler.format_comparison_report(comparison)
assert "Benchmark Comparison Report" in report
assert "Test prompt" in report
assert "openai/gpt-4" in report
assert "anthropic/claude-3-opus" in report
assert "Response Time" in report
assert "Cost" in report
assert "Rankings" in report
class TestAdvancedBenchmarkMetrics:
"""Test enhanced benchmark metrics functionality."""
def test_detailed_cost_calculation(self):
"""Test detailed token-based cost calculation."""
# Test with detailed token usage breakdown
result = BenchmarkResult(
model_id="openai/gpt-4",
prompt="Test prompt",
response="Test response",
response_time_ms=150.0,
tokens_used=100,
            cost=0.0045,  # 50 prompt tokens * 0.03/1k + 50 completion tokens * 0.06/1k = 0.0015 + 0.003
timestamp=datetime.now(timezone.utc),
prompt_tokens=50,
completion_tokens=50,
input_cost_per_1k_tokens=0.03,
output_cost_per_1k_tokens=0.06
)
# This test will fail initially - we need to add these fields
assert hasattr(result, 'prompt_tokens')
assert hasattr(result, 'completion_tokens')
assert hasattr(result, 'input_cost_per_1k_tokens')
assert hasattr(result, 'output_cost_per_1k_tokens')
assert result.prompt_tokens == 50
assert result.completion_tokens == 50
def test_response_quality_scoring(self):
"""Test response quality assessment."""
result = BenchmarkResult(
model_id="openai/gpt-4",
prompt="Explain quantum computing",
response="Quantum computing uses quantum bits or qubits...",
response_time_ms=200.0,
tokens_used=75,
cost=0.0045,
timestamp=datetime.now(timezone.utc),
quality_score=0.85, # Score based on completeness, accuracy, relevance
response_length=150,
contains_code_example=False,
language_coherence_score=0.9
)
# These will fail initially - we need to add quality assessment
assert hasattr(result, 'quality_score')
assert hasattr(result, 'response_length')
assert hasattr(result, 'contains_code_example')
assert hasattr(result, 'language_coherence_score')
assert result.quality_score == 0.85
assert result.response_length == 150
assert result.language_coherence_score == 0.9
def test_throughput_measurement(self):
"""Test throughput calculation (tokens per second)."""
result = BenchmarkResult(
model_id="anthropic/claude-3-opus",
prompt="Generate a story",
response="Once upon a time...",
response_time_ms=2000.0, # 2 seconds
tokens_used=200,
cost=0.01,
timestamp=datetime.now(timezone.utc),
throughput_tokens_per_second=100.0 # 200 tokens / 2 seconds
)
# This will fail initially - we need throughput calculation
assert hasattr(result, 'throughput_tokens_per_second')
assert result.throughput_tokens_per_second == 100.0
def test_enhanced_metrics_calculation(self):
"""Test enhanced metrics with new fields."""
results = [
BenchmarkResult(
model_id="model1",
prompt="test",
response="response1",
response_time_ms=1000,
tokens_used=100,
cost=0.005,
timestamp=datetime.now(timezone.utc),
prompt_tokens=40,
completion_tokens=60,
quality_score=0.8,
throughput_tokens_per_second=100.0
),
BenchmarkResult(
model_id="model1",
prompt="test",
response="response2",
response_time_ms=1500,
tokens_used=150,
cost=0.0075,
timestamp=datetime.now(timezone.utc),
prompt_tokens=50,
completion_tokens=100,
quality_score=0.9,
throughput_tokens_per_second=100.0
)
]
metrics = BenchmarkMetrics.from_results(results)
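        # Expected values from the two results above:
        #   avg_quality_score      = (0.8 + 0.9) / 2 = 0.85
        #   avg_prompt_tokens      = (40 + 50) / 2 = 45.0
        #   avg_completion_tokens  = (60 + 100) / 2 = 80.0
        #   avg_cost               = (0.005 + 0.0075) / 2 = 0.00625
        #   cost_per_quality_point = 0.00625 / 0.85 ~= 0.00735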
# Test enhanced metrics - these will fail initially
assert hasattr(metrics, 'avg_quality_score')
assert hasattr(metrics, 'avg_throughput')
assert hasattr(metrics, 'avg_prompt_tokens')
assert hasattr(metrics, 'avg_completion_tokens')
assert hasattr(metrics, 'cost_per_quality_point')
assert abs(metrics.avg_quality_score - 0.85) < 0.0001
assert metrics.avg_throughput == 100.0
assert metrics.avg_prompt_tokens == 45.0
assert metrics.avg_completion_tokens == 80.0
# Cost per quality point = avg_cost / avg_quality_score
assert abs(metrics.cost_per_quality_point - (0.00625 / 0.85)) < 0.0001
class TestAdvancedBenchmarkHandler:
"""Test enhanced benchmark handler functionality."""
@pytest.fixture
def enhanced_handler(self):
"""Create an enhanced benchmark handler."""
from src.openrouter_mcp.handlers.benchmark import EnhancedBenchmarkHandler
return EnhancedBenchmarkHandler(api_key="test-api-key")
@pytest.mark.asyncio
async def test_quality_assessment(self, enhanced_handler):
"""Test response quality assessment."""
# This will fail initially - we need to implement quality assessment
response_text = "Quantum computing is a revolutionary technology that uses quantum bits..."
quality_score = enhanced_handler.assess_response_quality(
prompt="Explain quantum computing",
response=response_text
)
assert 0 <= quality_score <= 1
assert quality_score > 0.5 # Should be decent quality
@pytest.mark.asyncio
async def test_detailed_cost_calculation_from_api(self, enhanced_handler):
"""Test detailed cost calculation from actual API response."""
mock_response = {
"choices": [{"message": {"content": "Test response"}}],
"usage": {
"prompt_tokens": 10,
"completion_tokens": 15,
"total_tokens": 25
},
"model": "openai/gpt-4"
}
# This will fail initially - we need enhanced cost calculation
cost_details = enhanced_handler.calculate_detailed_cost(
api_response=mock_response,
model_pricing={
"prompt": 0.03, # per 1k tokens
"completion": 0.06 # per 1k tokens
}
)
assert "input_cost" in cost_details
assert "output_cost" in cost_details
assert "total_cost" in cost_details
assert cost_details["input_cost"] == 10 * 0.03 / 1000
assert cost_details["output_cost"] == 15 * 0.06 / 1000
@pytest.mark.asyncio
async def test_parallel_benchmarking(self, enhanced_handler):
"""Test parallel model benchmarking for better performance."""
# This will fail initially - we need parallel execution
models = ["openai/gpt-4", "anthropic/claude-3-opus", "google/gemini-pro"]
        with patch.object(enhanced_handler, 'benchmark_model', new_callable=AsyncMock) as mock_benchmark:
mock_benchmark.return_value = BenchmarkResult(
model_id="test",
prompt="test",
response="test",
response_time_ms=100,
tokens_used=20,
cost=0.001,
timestamp=datetime.now(timezone.utc)
)
comparison = await enhanced_handler.benchmark_models_parallel(
models=models,
prompt="Test parallel execution",
max_concurrent=3
)
# Should have been called for all models
assert mock_benchmark.call_count == len(models)
assert len(comparison.models) == len(models)
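            # Assumption: benchmark_models_parallel bounds concurrency via
            # max_concurrent (e.g., an asyncio.Semaphore); only the call count and
            # result shape are verified here.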
class TestNewBenchmarkTools:
"""Test new MCP benchmark tools."""
@pytest.mark.asyncio
async def test_export_benchmark_report_tool(self):
"""Test exporting benchmark reports to different formats."""
        from src.openrouter_mcp.handlers.benchmark import BenchmarkReportExporter
        import tempfile
        import os
# Create a mock comparison
results = {
"openai/gpt-4": [
BenchmarkResult(
model_id="openai/gpt-4",
prompt="Test prompt",
response="Test response",
response_time_ms=200,
tokens_used=30,
cost=0.002,
timestamp=datetime.now(timezone.utc)
)
]
}
comparison = ModelComparison(
prompt="Test prompt",
models=["openai/gpt-4"],
results=results,
timestamp=datetime.now(timezone.utc)
)
# Test the exporter directly
handler = BenchmarkHandler(api_key="test-key") # Use test key
exporter = BenchmarkReportExporter(handler)
# Test CSV export
with tempfile.TemporaryDirectory() as temp_dir:
csv_path = os.path.join(temp_dir, "test_report.csv")
result_path = exporter.export_to_csv(comparison, csv_path)
assert os.path.exists(result_path)
assert result_path == csv_path
# Test markdown export
md_path = os.path.join(temp_dir, "test_report.md")
result_path = exporter.export_to_markdown(comparison, md_path)
assert os.path.exists(result_path)
assert result_path == md_path
# Test JSON export
json_path = os.path.join(temp_dir, "test_report.json")
result_path = exporter.export_to_json(comparison, json_path)
assert os.path.exists(result_path)
assert result_path == json_path
@pytest.mark.asyncio
async def test_compare_model_performance_tool(self):
"""Test advanced model performance comparison."""
        from src.openrouter_mcp.handlers.benchmark import ModelPerformanceAnalyzer
# Test the analyzer directly
handler = BenchmarkHandler(api_key="test-key")
analyzer = ModelPerformanceAnalyzer(handler)
# Create mock metrics
mock_metrics = {
"openai/gpt-4": BenchmarkMetrics(
avg_response_time_ms=200,
avg_tokens_used=30,
avg_cost=0.002,
total_cost=0.002,
success_rate=1.0,
sample_count=1,
avg_quality_score=0.8,
avg_throughput=150
),
"anthropic/claude-3-opus": BenchmarkMetrics(
avg_response_time_ms=150,
avg_tokens_used=25,
avg_cost=0.0015,
total_cost=0.0015,
success_rate=1.0,
sample_count=1,
avg_quality_score=0.9,
avg_throughput=167
)
}
weighting = {
"speed": 0.25,
"cost": 0.25,
"quality": 0.4,
"throughput": 0.1
}
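        # The weights sum to 1.0; assumed convention: compare_models_weighted
        # expects normalized weights.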
result = analyzer.compare_models_weighted(mock_metrics, weighting)
assert "overall_ranking" in result
assert "detailed_scores" in result
assert len(result["overall_ranking"]) == 2
assert all(metric in result["detailed_scores"] for metric in ["speed", "cost", "quality", "throughput"])
class TestBenchmarkIntegration:
"""Integration tests for enhanced benchmark system."""
@pytest.mark.asyncio
async def test_end_to_end_enhanced_benchmarking(self):
"""Test complete enhanced benchmarking workflow."""
        from src.openrouter_mcp.handlers.benchmark import (
            EnhancedBenchmarkHandler, BenchmarkReportExporter, ModelPerformanceAnalyzer
        )
        import tempfile
        import os
# Create enhanced handler
handler = EnhancedBenchmarkHandler(api_key="test-key")
# Mock client response
mock_response = {
"choices": [{"message": {"content": "Test response for quantum computing analysis"}}],
"usage": {
"prompt_tokens": 15,
"completion_tokens": 35,
"total_tokens": 50
}
}
handler.client.chat_completion = AsyncMock(return_value=mock_response)
handler.model_cache.get_model_info = AsyncMock(return_value={
"pricing": {"prompt": 0.03, "completion": 0.06}
})
# Test enhanced single model benchmark
result = await handler.benchmark_model(
model_id="openai/gpt-4",
prompt="Explain quantum computing",
temperature=0.7,
max_tokens=100
)
# Verify enhanced metrics were calculated
assert result.model_id == "openai/gpt-4"
assert result.prompt_tokens == 15
assert result.completion_tokens == 35
assert result.quality_score is not None
assert result.quality_score > 0
assert result.throughput_tokens_per_second is not None
assert result.response_length == len("Test response for quantum computing analysis")
# Test cost calculation
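        # With per-1k-token pricing (prompt 0.03, completion 0.06):
        #   expected_cost = (15 * 0.03 + 35 * 0.06) / 1000 = (0.45 + 2.1) / 1000 = 0.00255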
        expected_cost = (15 * 0.03 + 35 * 0.06) / 1000
assert abs(result.cost - expected_cost) < 0.0001
# Create comparison for testing export and analysis
comparison = ModelComparison(
prompt="Explain quantum computing",
models=["openai/gpt-4", "anthropic/claude-3-opus"],
results={
"openai/gpt-4": [result],
"anthropic/claude-3-opus": [result] # Using same result for simplicity
},
timestamp=datetime.now(timezone.utc)
)
# Test export functionality
exporter = BenchmarkReportExporter(handler)
with tempfile.TemporaryDirectory() as temp_dir:
# Test all export formats
csv_path = exporter.export_to_csv(comparison, os.path.join(temp_dir, "test.csv"))
assert os.path.exists(csv_path)
md_path = exporter.export_to_markdown(comparison, os.path.join(temp_dir, "test.md"))
assert os.path.exists(md_path)
json_path = exporter.export_to_json(comparison, os.path.join(temp_dir, "test.json"))
assert os.path.exists(json_path)
# Test performance analysis
analyzer = ModelPerformanceAnalyzer(handler)
metrics = comparison.get_metrics()
analysis_result = analyzer.compare_models_weighted(
metrics,
{"speed": 0.3, "cost": 0.3, "quality": 0.4}
)
assert "overall_ranking" in analysis_result
assert "detailed_scores" in analysis_result
assert len(analysis_result["overall_ranking"]) == 2
def test_quality_analyzer_comprehensive(self):
"""Test comprehensive quality analysis."""
from src.openrouter_mcp.handlers.benchmark import ResponseQualityAnalyzer
analyzer = ResponseQualityAnalyzer()
# Test high-quality response
high_quality_response = """
Quantum computing is a revolutionary technology that leverages quantum mechanics principles.
Unlike classical computers that use bits (0 or 1), quantum computers use quantum bits or qubits.
These qubits can exist in superposition, allowing them to be both 0 and 1 simultaneously.
This property enables quantum computers to process vast amounts of information in parallel.
"""
analysis = analyzer.analyze_response("Explain quantum computing", high_quality_response.strip())
assert analysis["quality_score"] > 0.7
assert analysis["response_length"] > 200
assert analysis["contains_code_example"] is False
assert analysis["language_coherence_score"] > 0.6
assert analysis["completeness_score"] > 0.7
assert analysis["relevance_score"] > 0.8
# Test code-containing response
code_response = """
Here's how to implement a simple quantum circuit:
```python
from qiskit import QuantumCircuit, execute, Aer
def create_bell_state():
qc = QuantumCircuit(2, 2)
qc.h(0) # Hadamard gate
qc.cx(0, 1) # CNOT gate
return qc
```
This creates a Bell state, demonstrating quantum entanglement.
"""
code_analysis = analyzer.analyze_response("Show me quantum code", code_response.strip())
assert code_analysis["contains_code_example"] is True
assert code_analysis["quality_score"] > 0.6
class TestBenchmarkMCPTool:
"""Test the MCP tool integration for benchmarking."""
@pytest.mark.asyncio
async def test_benchmark_models_tool_registration(self):
"""Test that benchmark_models_tool is properly registered."""
from src.openrouter_mcp.handlers.benchmark import benchmark_models_tool
# Check that the tool is registered (it will be a FunctionTool object)
assert benchmark_models_tool is not None
assert hasattr(benchmark_models_tool, 'name')
assert benchmark_models_tool.name == 'benchmark_models_tool'
@pytest.mark.asyncio
async def test_get_benchmark_history_tool_registration(self):
"""Test that get_benchmark_history_tool is properly registered."""
from src.openrouter_mcp.handlers.benchmark import get_benchmark_history_tool
# Check that the tool is registered
assert get_benchmark_history_tool is not None
assert hasattr(get_benchmark_history_tool, 'name')
assert get_benchmark_history_tool.name == 'get_benchmark_history_tool'
@pytest.mark.asyncio
async def test_compare_model_categories_tool_registration(self):
"""Test that compare_model_categories_tool is properly registered."""
from src.openrouter_mcp.handlers.benchmark import compare_model_categories_tool
# Check that the tool is registered
assert compare_model_categories_tool is not None
assert hasattr(compare_model_categories_tool, 'name')
assert compare_model_categories_tool.name == 'compare_model_categories_tool'