
MaverickMCP

by wshobson
MIT License
llm_speed_demo.py (17.5 kB)
#!/usr/bin/env python3
"""
Focused LLM Speed Optimization Demonstration

This script demonstrates the core LLM optimization capabilities that provide
2-3x speed improvements, focusing on areas we can control directly.

Demonstrates:
- Adaptive model selection based on time constraints
- Fast model execution (Gemini 2.5 Flash)
- Token generation speed optimization
- Progressive timeout management
- Model performance comparison
"""

import asyncio
import os
import sys
import time
from datetime import datetime
from typing import Any

# Add the project root to Python path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from maverick_mcp.providers.openrouter_provider import OpenRouterProvider, TaskType
from maverick_mcp.utils.llm_optimization import AdaptiveModelSelector


class LLMSpeedDemonstrator:
    """Focused demonstration of LLM speed optimizations."""

    def __init__(self):
        """Initialize the demonstration."""
        api_key = os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            raise ValueError(
                "OPENROUTER_API_KEY environment variable is required. "
                "Please set it with your OpenRouter API key."
            )

        self.openrouter_provider = OpenRouterProvider(api_key=api_key)
        self.model_selector = AdaptiveModelSelector(self.openrouter_provider)

        # Test scenarios focused on different urgency levels
        self.test_scenarios = [
            {
                "name": "Emergency Analysis (Critical Speed)",
                "prompt": "Analyze NVIDIA's latest earnings impact on AI market sentiment. 2-3 key points only.",
                "time_budget": 15.0,
                "task_type": TaskType.QUICK_ANSWER,
                "expected_speed": ">100 tok/s",
            },
            {
                "name": "Technical Analysis (Fast Response)",
                "prompt": "Provide technical analysis of Apple stock including RSI, MACD, and support levels. Be concise.",
                "time_budget": 30.0,
                "task_type": TaskType.TECHNICAL_ANALYSIS,
                "expected_speed": ">80 tok/s",
            },
            {
                "name": "Market Research (Moderate Speed)",
                "prompt": "Analyze Federal Reserve interest rate policy impact on technology sector. Include risk assessment.",
                "time_budget": 45.0,
                "task_type": TaskType.MARKET_ANALYSIS,
                "expected_speed": ">60 tok/s",
            },
            {
                "name": "Complex Synthesis (Quality Balance)",
                "prompt": "Synthesize renewable energy investment opportunities for 2025, considering policy changes, technology advances, and market trends.",
                "time_budget": 60.0,
                "task_type": TaskType.RESULT_SYNTHESIS,
                "expected_speed": ">40 tok/s",
            },
        ]

    def print_header(self, title: str):
        """Print formatted header."""
        print("\n" + "=" * 80)
        print(f" {title}")
        print("=" * 80)

    def print_subheader(self, title: str):
        """Print formatted subheader."""
        print(f"\n--- {title} ---")

    async def validate_openrouter_connection(self) -> bool:
        """Validate OpenRouter API is accessible."""
        self.print_header("🔧 API VALIDATION")

        try:
            test_llm = self.openrouter_provider.get_llm(TaskType.GENERAL)

            from langchain_core.messages import HumanMessage

            test_response = await asyncio.wait_for(
                test_llm.ainvoke([HumanMessage(content="test connection")]),
                timeout=10.0,
            )

            print("✅ OpenRouter API: Connected successfully")
            print(f" Response length: {len(test_response.content)} chars")
            return True

        except Exception as e:
            print(f"❌ OpenRouter API: Failed - {e}")
            return False

    async def demonstrate_model_selection(self):
        """Show intelligent model selection for different scenarios."""
        self.print_header("🧠 ADAPTIVE MODEL SELECTION")

        for scenario in self.test_scenarios:
            print(f"\n📋 Scenario: {scenario['name']}")
            print(f" Time Budget: {scenario['time_budget']}s")
            print(f" Task Type: {scenario['task_type'].value}")
            print(f" Expected Speed: {scenario['expected_speed']}")

            # Calculate task complexity
            complexity = self.model_selector.calculate_task_complexity(
                content=scenario["prompt"],
                task_type=scenario["task_type"],
                focus_areas=["analysis"],
            )

            # Get optimal model for time budget
            model_config = self.model_selector.select_model_for_time_budget(
                task_type=scenario["task_type"],
                time_remaining_seconds=scenario["time_budget"],
                complexity_score=complexity,
                content_size_tokens=len(scenario["prompt"]) // 4,
            )

            print(f" 📊 Complexity Score: {complexity:.2f}")
            print(f" 🎯 Selected Model: {model_config.model_id}")
            print(f" ⏱️ Max Timeout: {model_config.timeout_seconds}s")
            print(f" 🌡️ Temperature: {model_config.temperature}")
            print(f" 📝 Max Tokens: {model_config.max_tokens}")

            # Check if speed-optimized
            is_speed_model = any(
                x in model_config.model_id.lower()
                for x in ["flash", "haiku", "4o-mini", "deepseek"]
            )
            print(f" 🚀 Speed Optimized: {'✅' if is_speed_model else '❌'}")

    async def run_speed_benchmarks(self):
        """Run actual speed benchmarks for each scenario."""
        self.print_header("⚡ LIVE SPEED BENCHMARKS")

        results = []
        baseline_time = 60.0  # Historical baseline from timeout issues

        for i, scenario in enumerate(self.test_scenarios, 1):
            print(f"\n🔍 Benchmark {i}/{len(self.test_scenarios)}: {scenario['name']}")
            print(f" Query: {scenario['prompt'][:60]}...")

            try:
                # Get optimal model configuration
                complexity = self.model_selector.calculate_task_complexity(
                    content=scenario["prompt"],
                    task_type=scenario["task_type"],
                )

                model_config = self.model_selector.select_model_for_time_budget(
                    task_type=scenario["task_type"],
                    time_remaining_seconds=scenario["time_budget"],
                    complexity_score=complexity,
                    content_size_tokens=len(scenario["prompt"]) // 4,
                )

                # Execute with timing
                llm = self.openrouter_provider.get_llm(
                    model_override=model_config.model_id,
                    temperature=model_config.temperature,
                    max_tokens=model_config.max_tokens,
                )

                start_time = time.time()

                from langchain_core.messages import HumanMessage

                response = await asyncio.wait_for(
                    llm.ainvoke([HumanMessage(content=scenario["prompt"])]),
                    timeout=model_config.timeout_seconds,
                )

                execution_time = time.time() - start_time

                # Calculate metrics
                response_length = len(response.content)
                estimated_tokens = response_length // 4
                tokens_per_second = (
                    estimated_tokens / execution_time if execution_time > 0 else 0
                )
                speed_improvement = (
                    baseline_time / execution_time if execution_time > 0 else 0
                )

                # Results
                result = {
                    "scenario": scenario["name"],
                    "model_used": model_config.model_id,
                    "execution_time": execution_time,
                    "time_budget": scenario["time_budget"],
                    "budget_used_pct": (execution_time / scenario["time_budget"]) * 100,
                    "tokens_per_second": tokens_per_second,
                    "response_length": response_length,
                    "speed_improvement": speed_improvement,
                    "target_achieved": execution_time <= scenario["time_budget"],
                    "response_preview": response.content[:150] + "..."
                    if len(response.content) > 150
                    else response.content,
                }

                results.append(result)

                # Print immediate results
                status_icon = "✅" if result["target_achieved"] else "⚠️"
                print(
                    f" {status_icon} Completed: {execution_time:.2f}s ({result['budget_used_pct']:.1f}% of budget)"
                )
                print(f" 🎯 Model: {model_config.model_id}")
                print(f" 🚀 Speed: {tokens_per_second:.0f} tok/s")
                print(
                    f" 📊 Improvement: {speed_improvement:.1f}x faster than baseline"
                )
                print(f" 💬 Preview: {result['response_preview']}")

                # Brief pause between tests
                await asyncio.sleep(1)

            except Exception as e:
                print(f" ❌ Failed: {str(e)}")
                results.append(
                    {
                        "scenario": scenario["name"],
                        "error": str(e),
                        "target_achieved": False,
                    }
                )

        return results

    def analyze_benchmark_results(self, results: list[dict[str, Any]]):
        """Analyze and report benchmark results."""
        self.print_header("📊 SPEED OPTIMIZATION ANALYSIS")

        successful_tests = [r for r in results if not r.get("error")]
        failed_tests = [r for r in results if r.get("error")]
        targets_achieved = [r for r in successful_tests if r.get("target_achieved")]

        print("📈 Overall Performance:")
        print(f" Total Tests: {len(results)}")
        print(f" Successful: {len(successful_tests)}")
        print(f" Failed: {len(failed_tests)}")
        print(f" Targets Hit: {len(targets_achieved)}/{len(results)}")
        print(f" Success Rate: {(len(targets_achieved) / len(results) * 100):.1f}%")

        if successful_tests:
            # Speed metrics
            avg_execution_time = sum(
                r["execution_time"] for r in successful_tests
            ) / len(successful_tests)
            max_execution_time = max(r["execution_time"] for r in successful_tests)
            avg_tokens_per_second = sum(
                r["tokens_per_second"] for r in successful_tests
            ) / len(successful_tests)
            avg_speed_improvement = sum(
                r["speed_improvement"] for r in successful_tests
            ) / len(successful_tests)

            print("\n⚡ Speed Metrics:")
            print(f" Average Execution Time: {avg_execution_time:.2f}s")
            print(f" Maximum Execution Time: {max_execution_time:.2f}s")
            print(f" Average Token Generation: {avg_tokens_per_second:.0f} tok/s")
            print(f" Average Speed Improvement: {avg_speed_improvement:.1f}x")

            # Historical comparison
            historical_baseline = 60.0  # Average timeout failure time
            if max_execution_time > 0:
                overall_improvement = historical_baseline / max_execution_time
                print("\n🎯 Speed Validation:")
                print(
                    f" Historical Baseline: {historical_baseline}s (timeout failures)"
                )
                print(f" Current Max Time: {max_execution_time:.2f}s")
                print(f" Overall Improvement: {overall_improvement:.1f}x")

                if overall_improvement >= 3.0:
                    print(
                        f" 🎉 EXCELLENT: {overall_improvement:.1f}x speed improvement!"
                    )
                elif overall_improvement >= 2.0:
                    print(
                        f" ✅ SUCCESS: {overall_improvement:.1f}x speed improvement achieved!"
                    )
                elif overall_improvement >= 1.5:
                    print(
                        f" 👍 GOOD: {overall_improvement:.1f}x improvement (target: 2x)"
                    )
                else:
                    print(
                        f" ⚠️ NEEDS WORK: Only {overall_improvement:.1f}x improvement"
                    )

        # Model performance breakdown
        self.print_subheader("🧠 MODEL PERFORMANCE BREAKDOWN")

        model_stats = {}
        for result in successful_tests:
            model = result["model_used"]
            if model not in model_stats:
                model_stats[model] = []
            model_stats[model].append(result)

        for model, model_results in model_stats.items():
            avg_speed = sum(r["tokens_per_second"] for r in model_results) / len(
                model_results
            )
            avg_time = sum(r["execution_time"] for r in model_results) / len(
                model_results
            )
            success_rate = (
                len([r for r in model_results if r["target_achieved"]])
                / len(model_results)
            ) * 100

            print(f" {model}:")
            print(f" Tests: {len(model_results)}")
            print(f" Avg Speed: {avg_speed:.0f} tok/s")
            print(f" Avg Time: {avg_time:.2f}s")
            print(f" Success Rate: {success_rate:.0f}%")

    async def run_comprehensive_demo(self):
        """Run the complete LLM speed demonstration."""
        print("🚀 MaverickMCP LLM Speed Optimization Demonstration")
        print(f"⏰ Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("🎯 Goal: Demonstrate 2-3x LLM speed improvements")

        # Step 1: Validate connection
        if not await self.validate_openrouter_connection():
            print("\n❌ Cannot proceed - API connection failed")
            return False

        # Step 2: Show model selection intelligence
        await self.demonstrate_model_selection()

        # Step 3: Run live speed benchmarks
        results = await self.run_speed_benchmarks()

        # Step 4: Analyze results
        self.analyze_benchmark_results(results)

        # Final summary
        self.print_header("🎉 DEMONSTRATION SUMMARY")

        successful_tests = [r for r in results if not r.get("error")]
        targets_achieved = [r for r in successful_tests if r.get("target_achieved")]

        print("✅ LLM Speed Optimization Results:")
        print(f" Tests Executed: {len(results)}")
        print(f" Successful: {len(successful_tests)}")
        print(f" Targets Achieved: {len(targets_achieved)}")
        print(f" Success Rate: {(len(targets_achieved) / len(results) * 100):.1f}%")

        if successful_tests:
            max_time = max(r["execution_time"] for r in successful_tests)
            avg_speed = sum(r["tokens_per_second"] for r in successful_tests) / len(
                successful_tests
            )
            speed_improvement = 60.0 / max_time if max_time > 0 else 0

            print(
                f" Fastest Response: {min(r['execution_time'] for r in successful_tests):.2f}s"
            )
            print(f" Average Token Speed: {avg_speed:.0f} tok/s")
            print(f" Speed Improvement: {speed_improvement:.1f}x faster")

        print("\n📊 Key Optimizations Demonstrated:")
        print(" ✅ Adaptive Model Selection (context-aware)")
        print(" ✅ Time-Budget Optimization")
        print(" ✅ Fast Model Utilization (Gemini Flash, Claude Haiku)")
        print(" ✅ Progressive Timeout Management")
        print(" ✅ Token Generation Speed Optimization")

        # Success criteria: at least 75% success rate and 2x improvement
        success_criteria = len(targets_achieved) >= len(results) * 0.75 and (
            successful_tests
            and 60.0 / max(r["execution_time"] for r in successful_tests) >= 2.0
        )

        return success_criteria


async def main():
    """Main demonstration entry point."""
    demo = LLMSpeedDemonstrator()

    try:
        success = await demo.run_comprehensive_demo()

        if success:
            print("\n🎉 LLM Speed Demonstration PASSED - Optimizations validated!")
            return 0
        else:
            print("\n⚠️ Demonstration had mixed results - review analysis above")
            return 1

    except KeyboardInterrupt:
        print("\n\n⏹️ Demonstration interrupted by user")
        return 130
    except Exception as e:
        print(f"\n💥 Demonstration failed with error: {e}")
        import traceback

        traceback.print_exc()
        return 1


if __name__ == "__main__":
    # Check required environment variables
    if not os.getenv("OPENROUTER_API_KEY"):
        print("❌ Missing OPENROUTER_API_KEY environment variable")
        print("Please check your .env file")
        sys.exit(1)

    # Run the demonstration
    exit_code = asyncio.run(main())
    sys.exit(exit_code)
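
Besides running the file directly with python llm_speed_demo.py, the demo can be driven from another script. The snippet below is a minimal sketch, not part of the repository: it assumes llm_speed_demo.py is importable (with the maverick_mcp package installed) and that OPENROUTER_API_KEY is already set, since the constructor raises a ValueError otherwise.

# run_demo.py - minimal sketch; assumes llm_speed_demo.py is on the import path
# and OPENROUTER_API_KEY is set in the environment (e.g. loaded from a .env file).
import asyncio
import sys

from llm_speed_demo import LLMSpeedDemonstrator  # class defined in the listing above


async def run() -> int:
    demo = LLMSpeedDemonstrator()  # raises ValueError if the API key is missing
    passed = await demo.run_comprehensive_demo()
    return 0 if passed else 1


if __name__ == "__main__":
    sys.exit(asyncio.run(run()))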

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/wshobson/maverick-mcp'
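
The same lookup can also be done from Python. This is a minimal sketch using only the standard library; it assumes the endpoint returns a JSON document (see the Glama API documentation for the exact response schema).

# Minimal sketch: fetch the MaverickMCP entry from the Glama MCP directory API.
# Assumes the endpoint returns JSON; consult the Glama API docs for the schema.
import json
from urllib.request import urlopen

URL = "https://glama.ai/api/mcp/v1/servers/wshobson/maverick-mcp"

with urlopen(URL, timeout=10) as resp:
    server_info = json.load(resp)

print(json.dumps(server_info, indent=2))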

If you have feedback or need assistance with the MCP directory API, please join our Discord server.