#!/usr/bin/env python3
"""
Focused LLM Speed Optimization Demonstration
This script demonstrates the core LLM optimization capabilities that provide
2-3x speed improvements, focusing on areas we can control directly.
Demonstrates:
- Adaptive model selection based on time constraints
- Fast model execution (Gemini 2.5 Flash)
- Token generation speed optimization
- Progressive timeout management
- Model performance comparison
"""
import asyncio
import os
import sys
import time
from datetime import datetime
from typing import Any
# Add the project root to Python path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from maverick_mcp.providers.openrouter_provider import OpenRouterProvider, TaskType
from maverick_mcp.utils.llm_optimization import AdaptiveModelSelector
class LLMSpeedDemonstrator:
"""Focused demonstration of LLM speed optimizations."""
def __init__(self):
"""Initialize the demonstration."""
api_key = os.getenv("OPENROUTER_API_KEY")
if not api_key:
raise ValueError(
"OPENROUTER_API_KEY environment variable is required. "
"Please set it with your OpenRouter API key."
)
self.openrouter_provider = OpenRouterProvider(api_key=api_key)
self.model_selector = AdaptiveModelSelector(self.openrouter_provider)
# Test scenarios focused on different urgency levels
self.test_scenarios = [
{
"name": "Emergency Analysis (Critical Speed)",
"prompt": "Analyze NVIDIA's latest earnings impact on AI market sentiment. 2-3 key points only.",
"time_budget": 15.0,
"task_type": TaskType.QUICK_ANSWER,
"expected_speed": ">100 tok/s",
},
{
"name": "Technical Analysis (Fast Response)",
"prompt": "Provide technical analysis of Apple stock including RSI, MACD, and support levels. Be concise.",
"time_budget": 30.0,
"task_type": TaskType.TECHNICAL_ANALYSIS,
"expected_speed": ">80 tok/s",
},
{
"name": "Market Research (Moderate Speed)",
"prompt": "Analyze Federal Reserve interest rate policy impact on technology sector. Include risk assessment.",
"time_budget": 45.0,
"task_type": TaskType.MARKET_ANALYSIS,
"expected_speed": ">60 tok/s",
},
{
"name": "Complex Synthesis (Quality Balance)",
"prompt": "Synthesize renewable energy investment opportunities for 2025, considering policy changes, technology advances, and market trends.",
"time_budget": 60.0,
"task_type": TaskType.RESULT_SYNTHESIS,
"expected_speed": ">40 tok/s",
},
]
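        # Note: the "expected_speed" values are informal targets printed for
        # reference only; the benchmark checks wall-clock time against
        # "time_budget", not token throughput.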
def print_header(self, title: str):
"""Print formatted header."""
print("\n" + "=" * 80)
print(f" {title}")
print("=" * 80)
def print_subheader(self, title: str):
"""Print formatted subheader."""
print(f"\n--- {title} ---")
async def validate_openrouter_connection(self) -> bool:
"""Validate OpenRouter API is accessible."""
self.print_header("π§ API VALIDATION")
try:
test_llm = self.openrouter_provider.get_llm(TaskType.GENERAL)
from langchain_core.messages import HumanMessage
test_response = await asyncio.wait_for(
test_llm.ainvoke([HumanMessage(content="test connection")]),
timeout=10.0,
)
print("β
OpenRouter API: Connected successfully")
print(f" Response length: {len(test_response.content)} chars")
return True
except Exception as e:
print(f"β OpenRouter API: Failed - {e}")
return False
async def demonstrate_model_selection(self):
"""Show intelligent model selection for different scenarios."""
self.print_header("π§ ADAPTIVE MODEL SELECTION")
for scenario in self.test_scenarios:
print(f"\nπ Scenario: {scenario['name']}")
print(f" Time Budget: {scenario['time_budget']}s")
print(f" Task Type: {scenario['task_type'].value}")
print(f" Expected Speed: {scenario['expected_speed']}")
# Calculate task complexity
complexity = self.model_selector.calculate_task_complexity(
content=scenario["prompt"],
task_type=scenario["task_type"],
focus_areas=["analysis"],
)
# Get optimal model for time budget
model_config = self.model_selector.select_model_for_time_budget(
task_type=scenario["task_type"],
time_remaining_seconds=scenario["time_budget"],
complexity_score=complexity,
content_size_tokens=len(scenario["prompt"]) // 4,
)
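            # content_size_tokens uses a rough ~4 characters-per-token estimate;
            # actual counts depend on the model's tokenizer.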
print(f" π Complexity Score: {complexity:.2f}")
print(f" π― Selected Model: {model_config.model_id}")
print(f" β±οΈ Max Timeout: {model_config.timeout_seconds}s")
print(f" π‘οΈ Temperature: {model_config.temperature}")
print(f" π Max Tokens: {model_config.max_tokens}")
# Check if speed-optimized
is_speed_model = any(
x in model_config.model_id.lower()
for x in ["flash", "haiku", "4o-mini", "deepseek"]
)
print(f" π Speed Optimized: {'β
' if is_speed_model else 'β'}")
async def run_speed_benchmarks(self):
"""Run actual speed benchmarks for each scenario."""
self.print_header("β‘ LIVE SPEED BENCHMARKS")
results = []
baseline_time = 60.0 # Historical baseline from timeout issues
for i, scenario in enumerate(self.test_scenarios, 1):
print(f"\nπ Benchmark {i}/{len(self.test_scenarios)}: {scenario['name']}")
print(f" Query: {scenario['prompt'][:60]}...")
try:
# Get optimal model configuration
complexity = self.model_selector.calculate_task_complexity(
content=scenario["prompt"],
task_type=scenario["task_type"],
)
model_config = self.model_selector.select_model_for_time_budget(
task_type=scenario["task_type"],
time_remaining_seconds=scenario["time_budget"],
complexity_score=complexity,
content_size_tokens=len(scenario["prompt"]) // 4,
)
# Execute with timing
llm = self.openrouter_provider.get_llm(
model_override=model_config.model_id,
temperature=model_config.temperature,
max_tokens=model_config.max_tokens,
)
start_time = time.time()
from langchain_core.messages import HumanMessage
response = await asyncio.wait_for(
llm.ainvoke([HumanMessage(content=scenario["prompt"])]),
timeout=model_config.timeout_seconds,
)
execution_time = time.time() - start_time
# Calculate metrics
response_length = len(response.content)
estimated_tokens = response_length // 4
tokens_per_second = (
estimated_tokens / execution_time if execution_time > 0 else 0
)
speed_improvement = (
baseline_time / execution_time if execution_time > 0 else 0
)
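                # Note: tokens_per_second is estimated from character count
                # (~4 chars per token) rather than the provider's reported token
                # usage, and speed_improvement compares wall-clock time against
                # the fixed 60 s historical baseline above, not a re-run of the
                # old configuration. Treat both as rough indicators.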
# Results
result = {
"scenario": scenario["name"],
"model_used": model_config.model_id,
"execution_time": execution_time,
"time_budget": scenario["time_budget"],
"budget_used_pct": (execution_time / scenario["time_budget"]) * 100,
"tokens_per_second": tokens_per_second,
"response_length": response_length,
"speed_improvement": speed_improvement,
"target_achieved": execution_time <= scenario["time_budget"],
"response_preview": response.content[:150] + "..."
if len(response.content) > 150
else response.content,
}
results.append(result)
# Print immediate results
                status_icon = "✅" if result["target_achieved"] else "⚠️"
print(
f" {status_icon} Completed: {execution_time:.2f}s ({result['budget_used_pct']:.1f}% of budget)"
)
print(f" π― Model: {model_config.model_id}")
print(f" π Speed: {tokens_per_second:.0f} tok/s")
print(
f" π Improvement: {speed_improvement:.1f}x faster than baseline"
)
print(f" π¬ Preview: {result['response_preview']}")
# Brief pause between tests
await asyncio.sleep(1)
except Exception as e:
print(f" β Failed: {str(e)}")
results.append(
{
"scenario": scenario["name"],
"error": str(e),
"target_achieved": False,
}
)
return results
def analyze_benchmark_results(self, results: list[dict[str, Any]]):
"""Analyze and report benchmark results."""
self.print_header("π SPEED OPTIMIZATION ANALYSIS")
successful_tests = [r for r in results if not r.get("error")]
failed_tests = [r for r in results if r.get("error")]
targets_achieved = [r for r in successful_tests if r.get("target_achieved")]
print("π Overall Performance:")
print(f" Total Tests: {len(results)}")
print(f" Successful: {len(successful_tests)}")
print(f" Failed: {len(failed_tests)}")
print(f" Targets Hit: {len(targets_achieved)}/{len(results)}")
print(f" Success Rate: {(len(targets_achieved) / len(results) * 100):.1f}%")
if successful_tests:
# Speed metrics
avg_execution_time = sum(
r["execution_time"] for r in successful_tests
) / len(successful_tests)
max_execution_time = max(r["execution_time"] for r in successful_tests)
avg_tokens_per_second = sum(
r["tokens_per_second"] for r in successful_tests
) / len(successful_tests)
avg_speed_improvement = sum(
r["speed_improvement"] for r in successful_tests
) / len(successful_tests)
print("\nβ‘ Speed Metrics:")
print(f" Average Execution Time: {avg_execution_time:.2f}s")
print(f" Maximum Execution Time: {max_execution_time:.2f}s")
print(f" Average Token Generation: {avg_tokens_per_second:.0f} tok/s")
print(f" Average Speed Improvement: {avg_speed_improvement:.1f}x")
# Historical comparison
historical_baseline = 60.0 # Average timeout failure time
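            # Assumed to match the 60 s baseline_time used in run_speed_benchmarks;
            # if that baseline assumption changes, update both values together.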
if max_execution_time > 0:
overall_improvement = historical_baseline / max_execution_time
print("\nπ― Speed Validation:")
print(
f" Historical Baseline: {historical_baseline}s (timeout failures)"
)
print(f" Current Max Time: {max_execution_time:.2f}s")
print(f" Overall Improvement: {overall_improvement:.1f}x")
                if overall_improvement >= 3.0:
                    print(
                        f" EXCELLENT: {overall_improvement:.1f}x speed improvement!"
                    )
                elif overall_improvement >= 2.0:
                    print(
                        f" ✅ SUCCESS: {overall_improvement:.1f}x speed improvement achieved!"
                    )
                elif overall_improvement >= 1.5:
                    print(
                        f" GOOD: {overall_improvement:.1f}x improvement (target: 2x)"
                    )
                else:
                    print(
                        f" ⚠️ NEEDS WORK: Only {overall_improvement:.1f}x improvement"
                    )
# Model performance breakdown
self.print_subheader("π§ MODEL PERFORMANCE BREAKDOWN")
model_stats = {}
for result in successful_tests:
model = result["model_used"]
if model not in model_stats:
model_stats[model] = []
model_stats[model].append(result)
for model, model_results in model_stats.items():
avg_speed = sum(r["tokens_per_second"] for r in model_results) / len(
model_results
)
avg_time = sum(r["execution_time"] for r in model_results) / len(
model_results
)
success_rate = (
len([r for r in model_results if r["target_achieved"]])
/ len(model_results)
) * 100
print(f" {model}:")
print(f" Tests: {len(model_results)}")
print(f" Avg Speed: {avg_speed:.0f} tok/s")
print(f" Avg Time: {avg_time:.2f}s")
print(f" Success Rate: {success_rate:.0f}%")
async def run_comprehensive_demo(self):
"""Run the complete LLM speed demonstration."""
print("π MaverickMCP LLM Speed Optimization Demonstration")
print(f"β° Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("π― Goal: Demonstrate 2-3x LLM speed improvements")
# Step 1: Validate connection
if not await self.validate_openrouter_connection():
print("\nβ Cannot proceed - API connection failed")
return False
# Step 2: Show model selection intelligence
await self.demonstrate_model_selection()
# Step 3: Run live speed benchmarks
results = await self.run_speed_benchmarks()
# Step 4: Analyze results
self.analyze_benchmark_results(results)
# Final summary
self.print_header("π DEMONSTRATION SUMMARY")
successful_tests = [r for r in results if not r.get("error")]
targets_achieved = [r for r in successful_tests if r.get("target_achieved")]
print("β
LLM Speed Optimization Results:")
print(f" Tests Executed: {len(results)}")
print(f" Successful: {len(successful_tests)}")
print(f" Targets Achieved: {len(targets_achieved)}")
print(f" Success Rate: {(len(targets_achieved) / len(results) * 100):.1f}%")
if successful_tests:
max_time = max(r["execution_time"] for r in successful_tests)
avg_speed = sum(r["tokens_per_second"] for r in successful_tests) / len(
successful_tests
)
speed_improvement = 60.0 / max_time if max_time > 0 else 0
print(
f" Fastest Response: {min(r['execution_time'] for r in successful_tests):.2f}s"
)
print(f" Average Token Speed: {avg_speed:.0f} tok/s")
print(f" Speed Improvement: {speed_improvement:.1f}x faster")
print("\nπ Key Optimizations Demonstrated:")
print(" β
Adaptive Model Selection (context-aware)")
print(" β
Time-Budget Optimization")
print(" β
Fast Model Utilization (Gemini Flash, Claude Haiku)")
print(" β
Progressive Timeout Management")
print(" β
Token Generation Speed Optimization")
# Success criteria: at least 75% success rate and 2x improvement
success_criteria = len(targets_achieved) >= len(results) * 0.75 and (
successful_tests
and 60.0 / max(r["execution_time"] for r in successful_tests) >= 2.0
)
return success_criteria
async def main():
"""Main demonstration entry point."""
demo = LLMSpeedDemonstrator()
try:
success = await demo.run_comprehensive_demo()
if success:
print("\nπ LLM Speed Demonstration PASSED - Optimizations validated!")
return 0
else:
print("\nβ οΈ Demonstration had mixed results - review analysis above")
return 1
except KeyboardInterrupt:
print("\n\nβΉοΈ Demonstration interrupted by user")
return 130
except Exception as e:
print(f"\nπ₯ Demonstration failed with error: {e}")
import traceback
traceback.print_exc()
return 1
if __name__ == "__main__":
# Check required environment variables
if not os.getenv("OPENROUTER_API_KEY"):
print("β Missing OPENROUTER_API_KEY environment variable")
print("Please check your .env file")
sys.exit(1)
# Run the demonstration
exit_code = asyncio.run(main())
sys.exit(exit_code)