#!/usr/bin/env python3
"""
Focused LLM Speed Optimization Demonstration
This script demonstrates the core LLM optimization capabilities that provide
2-3x speed improvements, focusing on areas we can control directly.
Demonstrates:
- Adaptive model selection based on time constraints
- Fast model execution (Gemini 2.5 Flash)
- Token generation speed optimization
- Progressive timeout management
- Model performance comparison
"""
import asyncio
import os
import sys
import time
from datetime import datetime
from typing import Any
# Add the project root to Python path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from maverick_mcp.providers.openrouter_provider import OpenRouterProvider, TaskType
from maverick_mcp.utils.llm_optimization import AdaptiveModelSelector
class LLMSpeedDemonstrator:
"""Focused demonstration of LLM speed optimizations."""
def __init__(self):
"""Initialize the demonstration."""
api_key = os.getenv("OPENROUTER_API_KEY")
if not api_key:
raise ValueError(
"OPENROUTER_API_KEY environment variable is required. "
"Please set it with your OpenRouter API key."
)
self.openrouter_provider = OpenRouterProvider(api_key=api_key)
self.model_selector = AdaptiveModelSelector(self.openrouter_provider)
# Test scenarios focused on different urgency levels
self.test_scenarios = [
{
"name": "Emergency Analysis (Critical Speed)",
"prompt": "Analyze NVIDIA's latest earnings impact on AI market sentiment. 2-3 key points only.",
"time_budget": 15.0,
"task_type": TaskType.QUICK_ANSWER,
"expected_speed": ">100 tok/s",
},
{
"name": "Technical Analysis (Fast Response)",
"prompt": "Provide technical analysis of Apple stock including RSI, MACD, and support levels. Be concise.",
"time_budget": 30.0,
"task_type": TaskType.TECHNICAL_ANALYSIS,
"expected_speed": ">80 tok/s",
},
{
"name": "Market Research (Moderate Speed)",
"prompt": "Analyze Federal Reserve interest rate policy impact on technology sector. Include risk assessment.",
"time_budget": 45.0,
"task_type": TaskType.MARKET_ANALYSIS,
"expected_speed": ">60 tok/s",
},
{
"name": "Complex Synthesis (Quality Balance)",
"prompt": "Synthesize renewable energy investment opportunities for 2025, considering policy changes, technology advances, and market trends.",
"time_budget": 60.0,
"task_type": TaskType.RESULT_SYNTHESIS,
"expected_speed": ">40 tok/s",
},
]
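        # Note: the "expected_speed" values are informal targets printed for
        # reference only; the benchmark checks wall-clock time against
        # "time_budget", not token throughput.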
def print_header(self, title: str):
"""Print formatted header."""
print("\n" + "=" * 80)
print(f" {title}")
print("=" * 80)
def print_subheader(self, title: str):
"""Print formatted subheader."""
print(f"\n--- {title} ---")
async def validate_openrouter_connection(self) -> bool:
"""Validate OpenRouter API is accessible."""
self.print_header("π§ API VALIDATION")
try:
test_llm = self.openrouter_provider.get_llm(TaskType.GENERAL)
from langchain_core.messages import HumanMessage
test_response = await asyncio.wait_for(
test_llm.ainvoke([HumanMessage(content="test connection")]),
timeout=10.0,
)
print("β
OpenRouter API: Connected successfully")
print(f" Response length: {len(test_response.content)} chars")
return True
except Exception as e:
print(f"β OpenRouter API: Failed - {e}")
return False
async def demonstrate_model_selection(self):
"""Show intelligent model selection for different scenarios."""
self.print_header("π§ ADAPTIVE MODEL SELECTION")
for scenario in self.test_scenarios:
print(f"\nπ Scenario: {scenario['name']}")
print(f" Time Budget: {scenario['time_budget']}s")
print(f" Task Type: {scenario['task_type'].value}")
print(f" Expected Speed: {scenario['expected_speed']}")
# Calculate task complexity
complexity = self.model_selector.calculate_task_complexity(
content=scenario["prompt"],
task_type=scenario["task_type"],
focus_areas=["analysis"],
)
# Get optimal model for time budget
model_config = self.model_selector.select_model_for_time_budget(
task_type=scenario["task_type"],
time_remaining_seconds=scenario["time_budget"],
complexity_score=complexity,
content_size_tokens=len(scenario["prompt"]) // 4,
)
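            # content_size_tokens uses a rough ~4 characters-per-token estimate;
            # actual counts depend on the model's tokenizer.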
print(f" π Complexity Score: {complexity:.2f}")
print(f" π― Selected Model: {model_config.model_id}")
print(f" β±οΈ Max Timeout: {model_config.timeout_seconds}s")
print(f" π‘οΈ Temperature: {model_config.temperature}")
print(f" π Max Tokens: {model_config.max_tokens}")
# Check if speed-optimized
is_speed_model = any(
x in model_config.model_id.lower()
for x in ["flash", "haiku", "4o-mini", "deepseek"]
)
print(f" π Speed Optimized: {'β
' if is_speed_model else 'β'}")
async def run_speed_benchmarks(self):
"""Run actual speed benchmarks for each scenario."""
self.print_header("β‘ LIVE SPEED BENCHMARKS")
results = []
baseline_time = 60.0 # Historical baseline from timeout issues
for i, scenario in enumerate(self.test_scenarios, 1):
print(f"\nπ Benchmark {i}/{len(self.test_scenarios)}: {scenario['name']}")
print(f" Query: {scenario['prompt'][:60]}...")
try:
# Get optimal model configuration
complexity = self.model_selector.calculate_task_complexity(
content=scenario["prompt"],
task_type=scenario["task_type"],
)
model_config = self.model_selector.select_model_for_time_budget(
task_type=scenario["task_type"],
time_remaining_seconds=scenario["time_budget"],
complexity_score=complexity,
content_size_tokens=len(scenario["prompt"]) // 4,
)
# Execute with timing
llm = self.openrouter_provider.get_llm(
model_override=model_config.model_id,
temperature=model_config.temperature,
max_tokens=model_config.max_tokens,
)
start_time = time.time()
from langchain_core.messages import HumanMessage
response = await asyncio.wait_for(
llm.ainvoke([HumanMessage(content=scenario["prompt"])]),
timeout=model_config.timeout_seconds,
)
execution_time = time.time() - start_time
# Calculate metrics
response_length = len(response.content)
estimated_tokens = response_length // 4
tokens_per_second = (
estimated_tokens / execution_time if execution_time > 0 else 0
)
speed_improvement = (
baseline_time / execution_time if execution_time > 0 else 0
)
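                # Note: tokens_per_second is estimated from character count
                # (~4 chars per token) rather than the provider's reported token
                # usage, and speed_improvement compares wall-clock time against
                # the fixed 60 s historical baseline above, not a re-run of the
                # old configuration. Treat both as rough indicators.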
# Results
result = {
"scenario": scenario["name"],
"model_used": model_config.model_id,
"execution_time": execution_time,
"time_budget": scenario["time_budget"],
"budget_used_pct": (execution_time / scenario["time_budget"]) * 100,
"tokens_per_second": tokens_per_second,
"response_length": response_length,
"speed_improvement": speed_improvement,
"target_achieved": execution_time <= scenario["time_budget"],
"response_preview": response.content[:150] + "..."
if len(response.content) > 150
else response.content,
}
results.append(result)
# Print immediate results
                status_icon = "✅" if result["target_achieved"] else "⚠️"
print(
f" {status_icon} Completed: {execution_time:.2f}s ({result['budget_used_pct']:.1f}% of budget)"
)
print(f" π― Model: {model_config.model_id}")
print(f" π Speed: {tokens_per_second:.0f} tok/s")
print(
f" π Improvement: {speed_improvement:.1f}x faster than baseline"
)
print(f" π¬ Preview: {result['response_preview']}")
# Brief pause between tests
await asyncio.sleep(1)
except Exception as e:
print(f" β Failed: {str(e)}")
results.append(
{
"scenario": scenario["name"],
"error": str(e),
"target_achieved": False,
}
)
return results
def analyze_benchmark_results(self, results: list[dict[str, Any]]):
"""Analyze and report benchmark results."""
self.print_header("π SPEED OPTIMIZATION ANALYSIS")
successful_tests = [r for r in results if not r.get("error")]
failed_tests = [r for r in results if r.get("error")]
targets_achieved = [r for r in successful_tests if r.get("target_achieved")]
print("π Overall Performance:")
print(f" Total Tests: {len(results)}")
print(f" Successful: {len(successful_tests)}")
print(f" Failed: {len(failed_tests)}")
print(f" Targets Hit: {len(targets_achieved)}/{len(results)}")
print(f" Success Rate: {(len(targets_achieved) / len(results) * 100):.1f}%")
if successful_tests:
# Speed metrics
avg_execution_time = sum(
r["execution_time"] for r in successful_tests
) / len(successful_tests)
max_execution_time = max(r["execution_time"] for r in successful_tests)
avg_tokens_per_second = sum(
r["tokens_per_second"] for r in successful_tests
) / len(successful_tests)
avg_speed_improvement = sum(
r["speed_improvement"] for r in successful_tests
) / len(successful_tests)
print("\nβ‘ Speed Metrics:")
print(f" Average Execution Time: {avg_execution_time:.2f}s")
print(f" Maximum Execution Time: {max_execution_time:.2f}s")
print(f" Average Token Generation: {avg_tokens_per_second:.0f} tok/s")
print(f" Average Speed Improvement: {avg_speed_improvement:.1f}x")
# Historical comparison
historical_baseline = 60.0 # Average timeout failure time
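            # Assumed to match the 60 s baseline_time used in run_speed_benchmarks;
            # if that baseline assumption changes, update both values together.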
if max_execution_time > 0:
overall_improvement = historical_baseline / max_execution_time
print("\nπ― Speed Validation:")
print(
f" Historical Baseline: {historical_baseline}s (timeout failures)"
)
print(f" Current Max Time: {max_execution_time:.2f}s")
print(f" Overall Improvement: {overall_improvement:.1f}x")
                if overall_improvement >= 3.0:
                    print(
                        f" EXCELLENT: {overall_improvement:.1f}x speed improvement!"
                    )
                elif overall_improvement >= 2.0:
                    print(
                        f" ✅ SUCCESS: {overall_improvement:.1f}x speed improvement achieved!"
                    )
                elif overall_improvement >= 1.5:
                    print(
                        f" GOOD: {overall_improvement:.1f}x improvement (target: 2x)"
                    )
                else:
                    print(
                        f" ⚠️ NEEDS WORK: Only {overall_improvement:.1f}x improvement"
                    )
# Model performance breakdown
self.print_subheader("π§ MODEL PERFORMANCE BREAKDOWN")
model_stats = {}
for result in successful_tests:
model = result["model_used"]
if model not in model_stats:
model_stats[model] = []
model_stats[model].append(result)
for model, model_results in model_stats.items():
avg_speed = sum(r["tokens_per_second"] for r in model_results) / len(
model_results
)
avg_time = sum(r["execution_time"] for r in model_results) / len(
model_results
)
success_rate = (
len([r for r in model_results if r["target_achieved"]])
/ len(model_results)
) * 100
print(f" {model}:")
print(f" Tests: {len(model_results)}")
print(f" Avg Speed: {avg_speed:.0f} tok/s")
print(f" Avg Time: {avg_time:.2f}s")
print(f" Success Rate: {success_rate:.0f}%")
async def run_comprehensive_demo(self):
"""Run the complete LLM speed demonstration."""
print("π MaverickMCP LLM Speed Optimization Demonstration")
print(f"β° Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("π― Goal: Demonstrate 2-3x LLM speed improvements")
# Step 1: Validate connection
if not await self.validate_openrouter_connection():
print("\nβ Cannot proceed - API connection failed")
return False
# Step 2: Show model selection intelligence
await self.demonstrate_model_selection()
# Step 3: Run live speed benchmarks
results = await self.run_speed_benchmarks()
# Step 4: Analyze results
self.analyze_benchmark_results(results)
# Final summary
self.print_header("π DEMONSTRATION SUMMARY")
successful_tests = [r for r in results if not r.get("error")]
targets_achieved = [r for r in successful_tests if r.get("target_achieved")]
print("β
LLM Speed Optimization Results:")
print(f" Tests Executed: {len(results)}")
print(f" Successful: {len(successful_tests)}")
print(f" Targets Achieved: {len(targets_achieved)}")
print(f" Success Rate: {(len(targets_achieved) / len(results) * 100):.1f}%")
if successful_tests:
max_time = max(r["execution_time"] for r in successful_tests)
avg_speed = sum(r["tokens_per_second"] for r in successful_tests) / len(
successful_tests
)
speed_improvement = 60.0 / max_time if max_time > 0 else 0
print(
f" Fastest Response: {min(r['execution_time'] for r in successful_tests):.2f}s"
)
print(f" Average Token Speed: {avg_speed:.0f} tok/s")
print(f" Speed Improvement: {speed_improvement:.1f}x faster")
print("\nπ Key Optimizations Demonstrated:")
print(" β
Adaptive Model Selection (context-aware)")
print(" β
Time-Budget Optimization")
print(" β
Fast Model Utilization (Gemini Flash, Claude Haiku)")
print(" β
Progressive Timeout Management")
print(" β
Token Generation Speed Optimization")
# Success criteria: at least 75% success rate and 2x improvement
success_criteria = len(targets_achieved) >= len(results) * 0.75 and (
successful_tests
and 60.0 / max(r["execution_time"] for r in successful_tests) >= 2.0
)
return success_criteria
async def main():
"""Main demonstration entry point."""
demo = LLMSpeedDemonstrator()
try:
success = await demo.run_comprehensive_demo()
if success:
print("\nπ LLM Speed Demonstration PASSED - Optimizations validated!")
return 0
else:
print("\nβ οΈ Demonstration had mixed results - review analysis above")
return 1
except KeyboardInterrupt:
print("\n\nβΉοΈ Demonstration interrupted by user")
return 130
except Exception as e:
print(f"\nπ₯ Demonstration failed with error: {e}")
import traceback
traceback.print_exc()
return 1
if __name__ == "__main__":
# Check required environment variables
if not os.getenv("OPENROUTER_API_KEY"):
print("β Missing OPENROUTER_API_KEY environment variable")
print("Please check your .env file")
sys.exit(1)
# Run the demonstration
exit_code = asyncio.run(main())
sys.exit(exit_code)