test_mcp_benchmark.py
#!/usr/bin/env python3
"""Tests for the MCP benchmark tools."""

import json
import os
import tempfile
import pytest
from unittest.mock import Mock, AsyncMock, patch, MagicMock
from datetime import datetime

from src.openrouter_mcp.handlers import mcp_benchmark
from src.openrouter_mcp.handlers.benchmark import (
    BenchmarkResult,
    BenchmarkMetrics,
    EnhancedBenchmarkResult,
    EnhancedBenchmarkMetrics,
    BenchmarkReportExporter,
    ModelPerformanceAnalyzer
)
from src.openrouter_mcp.handlers.mcp_benchmark import (
    get_benchmark_handler,
    benchmark_models,
    get_benchmark_history,
    compare_model_categories,
    export_benchmark_report,
    compare_model_performance
)


class TestMCPBenchmarkTools:
    """Test class for the MCP benchmark tools."""

    @pytest.fixture
    def mock_env(self):
        """Mock the environment variables."""
        with patch.dict(os.environ, {'OPENROUTER_API_KEY': 'test-api-key'}):
            yield

    @pytest.fixture
    def temp_dir(self):
        """Create a temporary directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            yield temp_dir

    @pytest.fixture
    def mock_benchmark_result(self):
        """Build a mocked benchmark result."""
        metrics = EnhancedBenchmarkMetrics(
            avg_response_time=1.5,  # in seconds
            avg_cost=0.001,
            quality_score=8.5,
            throughput=100.0,
            success_rate=1.0,
            avg_prompt_tokens=100.0,
            avg_completion_tokens=50.0,
            avg_total_tokens=150.0,
            speed_score=0.8,
            cost_score=0.9,
            throughput_score=0.85
        )
        result = EnhancedBenchmarkResult(
            model_id="test-model",
            success=True,
            response="테스트 응답입니다.",
            error_message=None,
            metrics=metrics,
            timestamp=datetime.now()
        )
        return result

    @pytest.mark.asyncio
    async def test_get_benchmark_handler(self, mock_env):
        """The benchmark handler should behave as a singleton."""
        with patch('src.openrouter_mcp.handlers.mcp_benchmark.EnhancedBenchmarkHandler') as mock_handler_class:
            with patch('src.openrouter_mcp.handlers.mcp_benchmark.ModelCache') as mock_cache_class:
                mock_handler = Mock()
                mock_handler_class.return_value = mock_handler
                mock_cache_class.return_value = Mock()

                # First call
                handler1 = await mcp_benchmark.get_benchmark_handler()
                # Second call (same instance, since the handler is a singleton)
                handler2 = await mcp_benchmark.get_benchmark_handler()

                assert handler1 is handler2
                mock_handler_class.assert_called_once()

    @pytest.mark.asyncio
    async def test_benchmark_models_success(self, mock_env, mock_benchmark_result):
        """Benchmarking models succeeds."""
        models = ["gpt-3.5-turbo", "claude-3-haiku"]
        prompt = "테스트 프롬프트"

        # Mocked handler results
        mock_results = {
            "gpt-3.5-turbo": mock_benchmark_result,
            "claude-3-haiku": mock_benchmark_result
        }

        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = AsyncMock()
            mock_handler.benchmark_models.return_value = mock_results
            mock_handler.save_results = AsyncMock()
            mock_get_handler.return_value = mock_handler

            with patch('src.openrouter_mcp.handlers.mcp_benchmark.ModelPerformanceAnalyzer') as mock_analyzer_class:
                mock_analyzer = Mock()
                mock_analyzer.rank_models.return_value = [(mock_benchmark_result, 0.85)]
                mock_analyzer_class.return_value = mock_analyzer

                result = await benchmark_models(
                    models=models,
                    prompt=prompt,
                    runs=2,
                    delay_seconds=0.5
                )

                # Verify the result
                assert "timestamp" in result
                assert "config" in result
                assert "results" in result
                assert "ranking" in result
                assert len(result["results"]) == 2
                assert result["config"]["models"] == models
                assert result["config"]["prompt"] == prompt

                # Verify the handler call
                mock_handler.benchmark_models.assert_called_once_with(
                    model_ids=models,
                    prompt=prompt,
                    runs=2,
                    delay_between_requests=0.5
                )

    @pytest.mark.asyncio
    async def test_benchmark_models_no_success(self, mock_env):
        """Benchmarking handles models that fail."""
        models = ["invalid-model"]

        # A failed result
        failed_result = BenchmarkResult(
            model_id="invalid-model",
            success=False,
            response=None,
            error_message="Model not found"
        )
        mock_results = {"invalid-model": failed_result}

        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = AsyncMock()
            mock_handler.benchmark_models.return_value = mock_results
            mock_get_handler.return_value = mock_handler

            result = await benchmark_models(models=models)

            assert "results" in result
            assert result["results"]["invalid-model"]["success"] is False
            assert "ranking" not in result  # no ranking, since there are no successful results

    @pytest.mark.asyncio
    async def test_get_benchmark_history_empty(self, mock_env, temp_dir):
        """An empty benchmark history is reported as such."""
        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = Mock()
            mock_handler.results_dir = temp_dir
            mock_get_handler.return_value = mock_handler

            result = await get_benchmark_history()

            assert result["history"] == []
            assert result["total_files"] == 0
            assert "벤치마크 기록이 없습니다" in result["message"]

    @pytest.mark.asyncio
    async def test_get_benchmark_history_with_files(self, mock_env, temp_dir):
        """History includes existing benchmark result files."""
        # Create a test result file
        test_data = {
            "timestamp": "2024-01-01T12:00:00",
            "config": {
                "models": ["gpt-3.5-turbo"],
                "prompt": "테스트"
            },
            "results": {
                "gpt-3.5-turbo": {
                    "success": True,
                    "metrics": {
                        "avg_response_time": 1.5,
                        "quality_score": 8.0
                    }
                }
            }
        }

        test_file = os.path.join(temp_dir, "test_benchmark.json")
        with open(test_file, 'w', encoding='utf-8') as f:
            json.dump(test_data, f)

        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = Mock()
            mock_handler.results_dir = temp_dir
            mock_get_handler.return_value = mock_handler

            result = await get_benchmark_history(limit=5)

            assert len(result["history"]) == 1
            assert result["total_files"] == 1
            assert result["history"][0]["models_tested"] == ["gpt-3.5-turbo"]
            assert result["history"][0]["success_rate"] == "1/1"

    @pytest.mark.asyncio
    async def test_compare_model_categories(self, mock_env):
        """Compare models across categories."""
        # Mocked model data
        mock_models = [
            {"id": "gpt-4", "category": "chat", "quality_score": 9.0},
            {"id": "claude-3", "category": "chat", "quality_score": 8.5},
            {"id": "codellama", "category": "code", "quality_score": 8.0},
            {"id": "dall-e", "category": "image", "quality_score": 7.5}
        ]

        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = AsyncMock()
            mock_cache = Mock()
            mock_cache.get_models.return_value = mock_models
            mock_handler.model_cache = mock_cache

            # Mock the benchmark results
            mock_handler.benchmark_models.return_value = {
                "gpt-4": BenchmarkResult(
                    "gpt-4", True, "Response", None,
                    BenchmarkMetrics(1.0, 1.0, 1.0, 100, 50, 150, 0.001, 0.001,
                                     0.001, 9.0, 150.0, 1.0, 0.9, 0.8, 0.85)
                )
            }
            mock_get_handler.return_value = mock_handler

            with patch('src.openrouter_mcp.handlers.mcp_benchmark.ModelPerformanceAnalyzer') as mock_analyzer_class:
                mock_analyzer = Mock()
                mock_analyzer.rank_models.return_value = [
                    (BenchmarkResult("gpt-4", True, "Response", None, None), 0.9)
                ]
                mock_analyzer_class.return_value = mock_analyzer

                result = await compare_model_categories(
                    categories=["chat"],
                    top_n=2
                )

                assert "config" in result
                assert "category_info" in result
                assert "results" in result
                assert result["config"]["categories"] == ["chat"]
                assert "chat" in result["category_info"]

    @pytest.mark.asyncio
    async def test_export_benchmark_report_not_found(self, mock_env, temp_dir):
        """Exporting a nonexistent benchmark file raises an error."""
        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = Mock()
            mock_handler.results_dir = temp_dir
            mock_get_handler.return_value = mock_handler

            with pytest.raises(Exception) as exc_info:
                await export_benchmark_report("nonexistent.json")

            assert "찾을 수 없습니다" in str(exc_info.value)

    @pytest.mark.asyncio
    async def test_export_benchmark_report_markdown(self, mock_env, temp_dir):
        """Export a benchmark report as Markdown."""
        # Create a test benchmark file
        benchmark_data = {
            "results": {
                "gpt-3.5-turbo": {
                    "success": True,
                    "metrics": {
                        "avg_response_time": 1.5,
                        "avg_cost": 0.001,
                        "quality_score": 8.0,
                        "throughput": 100.0
                    },
                    "response": "테스트 응답"
                }
            }
        }

        input_file = "test_benchmark.json"
        input_path = os.path.join(temp_dir, input_file)
        with open(input_path, 'w', encoding='utf-8') as f:
            json.dump(benchmark_data, f)

        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = Mock()
            mock_handler.results_dir = temp_dir
            mock_get_handler.return_value = mock_handler

            with patch('src.openrouter_mcp.handlers.mcp_benchmark.BenchmarkReportExporter') as mock_exporter_class:
                mock_exporter = AsyncMock()
                mock_exporter_class.return_value = mock_exporter

                result = await export_benchmark_report(
                    benchmark_file=input_file,
                    format="markdown"
                )

                assert result["format"] == "markdown"
                assert result["input_file"] == input_file
                assert "output_file" in result
                assert result["models_included"] == ["gpt-3.5-turbo"]

                # Verify the export method call
                mock_exporter.export_markdown.assert_called_once()

    @pytest.mark.asyncio
    async def test_compare_model_performance(self, mock_env, mock_benchmark_result):
        """Advanced model performance comparison."""
        models = ["gpt-4", "claude-3"]
        weights = {"speed": 0.3, "cost": 0.3, "quality": 0.4}

        mock_results = {
            "gpt-4": mock_benchmark_result,
            "claude-3": mock_benchmark_result
        }

        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = AsyncMock()
            mock_handler.benchmark_models.return_value = mock_results
            mock_get_handler.return_value = mock_handler

            with patch('src.openrouter_mcp.handlers.mcp_benchmark.ModelPerformanceAnalyzer') as mock_analyzer_class:
                mock_analyzer = Mock()
                mock_analyzer.rank_models_with_weights.return_value = [(mock_benchmark_result, 0.85)]
                mock_analyzer_class.return_value = mock_analyzer

                result = await compare_model_performance(
                    models=models,
                    weights=weights,
                    include_cost_analysis=True
                )

                assert "config" in result
                assert "ranking" in result
                assert "detailed_metrics" in result
                assert "analysis" in result
                assert "recommendations" in result

                # Verify weight normalization
                total_weight = sum(result["config"]["weights"].values())
                assert abs(total_weight - 1.0) < 0.001  # allow for floating-point error

    @pytest.mark.asyncio
    async def test_compare_model_performance_no_weights(self, mock_env, mock_benchmark_result):
        """Performance comparison without explicit weights."""
        models = ["gpt-4"]
        mock_results = {"gpt-4": mock_benchmark_result}

        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = AsyncMock()
            mock_handler.benchmark_models.return_value = mock_results
            mock_get_handler.return_value = mock_handler

            with patch('src.openrouter_mcp.handlers.mcp_benchmark.ModelPerformanceAnalyzer') as mock_analyzer_class:
                mock_analyzer = Mock()
                mock_analyzer.rank_models_with_weights.return_value = [(mock_benchmark_result, 0.85)]
                mock_analyzer_class.return_value = mock_analyzer

                result = await compare_model_performance(models=models)

                # Check that the default weights were used
                expected_weights = {"speed": 0.2, "cost": 0.3, "quality": 0.4, "throughput": 0.1}
                assert result["config"]["weights"] == expected_weights

    def test_utility_functions(self):
        """Tests for the utility functions."""
        from src.openrouter_mcp.handlers.mcp_benchmark import (
            _calculate_avg_response_time,
            _get_best_model,
            _get_category_prompt,
            _calculate_std
        )

        # _calculate_avg_response_time
        results = {
            "model1": {
                "success": True,
                "metrics": {"avg_response_time": 1.5}
            },
            "model2": {
                "success": True,
                "metrics": {"avg_response_time": 2.0}
            },
            "model3": {
                "success": False,
                "metrics": {"avg_response_time": 3.0}
            }
        }

        avg_time = _calculate_avg_response_time(results)
        assert avg_time == 1.75  # (1.5 + 2.0) / 2

        # _get_best_model
        results_with_quality = {
            "model1": {
                "success": True,
                "metrics": {"quality_score": 8.0}
            },
            "model2": {
                "success": True,
                "metrics": {"quality_score": 9.5}
            }
        }

        best_model = _get_best_model(results_with_quality)
        assert best_model == "model2"

        # _get_category_prompt
        chat_prompt = _get_category_prompt("chat")
        code_prompt = _get_category_prompt("code")
        unknown_prompt = _get_category_prompt("unknown_category")

        assert "안녕하세요" in chat_prompt
        assert "파이썬" in code_prompt
        assert chat_prompt == unknown_prompt  # default prompt

        # _calculate_std
        values = [1.0, 2.0, 3.0, 4.0, 5.0]
        std = _calculate_std(values)
        assert abs(std - 1.5811) < 0.001  # verify the standard deviation calculation

        # Standard deviation of a single value
        single_value = _calculate_std([1.0])
        assert single_value == 0

    @pytest.mark.asyncio
    async def test_error_handling(self, mock_env):
        """Error handling."""
        # When the API key is missing
        with patch.dict(os.environ, {}, clear=True):
            with pytest.raises(Exception) as exc_info:
                await get_benchmark_handler()

            assert "OPENROUTER_API_KEY" in str(exc_info.value)

        # Exception raised while running a benchmark
        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = AsyncMock()
            mock_handler.benchmark_models.side_effect = Exception("API Error")
            mock_get_handler.return_value = mock_handler

            with pytest.raises(Exception) as exc_info:
                await benchmark_models(models=["test-model"])

            assert "벤치마킹 실패" in str(exc_info.value)

    @pytest.mark.asyncio
    async def test_model_filter_in_history(self, mock_env, temp_dir):
        """Filter benchmark history by model."""
        # File containing GPT models
        gpt_data = {
            "results": {
                "gpt-4": {"success": True},
                "gpt-3.5-turbo": {"success": True}
            },
            "config": {}
        }

        # File containing Claude models
        claude_data = {
            "results": {
                "claude-3-opus": {"success": True},
                "claude-3-sonnet": {"success": True}
            },
            "config": {}
        }

        gpt_file = os.path.join(temp_dir, "gpt_benchmark.json")
        claude_file = os.path.join(temp_dir, "claude_benchmark.json")

        with open(gpt_file, 'w', encoding='utf-8') as f:
            json.dump(gpt_data, f)
        with open(claude_file, 'w', encoding='utf-8') as f:
            json.dump(claude_data, f)

        with patch('src.openrouter_mcp.handlers.mcp_benchmark.get_benchmark_handler') as mock_get_handler:
            mock_handler = Mock()
            mock_handler.results_dir = temp_dir
            mock_get_handler.return_value = mock_handler

            # GPT models only
            result = await get_benchmark_history(model_filter="gpt")

            assert result["filter_applied"] is True
            assert len(result["history"]) == 1
            assert "gpt-4" in result["history"][0]["models_tested"]

            # All models (no filter)
            result_all = await get_benchmark_history()

            assert result_all["filter_applied"] is False
            assert len(result_all["history"]) == 2


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
