router.py
#!/usr/bin/env python3
"""
MCP Router - Intelligent Model Routing System
Routes queries to the best model based on query characteristics and context.
"""

import json
import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any
from enum import Enum
import logging

# Load environment variables
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class TaskType(Enum):
    """Types of tasks that models can handle."""
    REASONING = "reasoning"              # Complex reasoning, analysis, planning
    CODE_GENERATION = "code_generation"  # Writing code
    CODE_EDIT = "code_edit"              # Editing/refactoring code
    SUMMARIZATION = "summarization"      # Summarizing content
    QUESTION_ANSWERING = "qa"            # Answering questions
    TRANSLATION = "translation"          # Language translation
    CREATIVE = "creative"                # Creative writing, brainstorming
    STREAMING = "streaming"              # Real-time streaming responses
    MULTIMODAL = "multimodal"            # Image, audio, video processing
    EMBEDDING = "embedding"              # Generating embeddings


class Complexity(Enum):
    """Query complexity levels."""
    LOW = "low"              # Simple, straightforward tasks
    MEDIUM = "medium"        # Moderate complexity
    HIGH = "high"            # Complex, requires deep reasoning
    VERY_HIGH = "very_high"  # Extremely complex


@dataclass
class ModelCapabilities:
    """Capabilities and characteristics of a model."""
    name: str
    provider: str  # e.g., "openai", "anthropic", "google"
    model_id: str  # e.g., "gpt-4o", "claude-3-5-sonnet"

    # Capabilities
    supports_reasoning: bool = True
    supports_code: bool = True
    supports_streaming: bool = False
    supports_multimodal: bool = False
    supports_embeddings: bool = False

    # Performance characteristics
    max_tokens: int = 4096
    context_window: int = 128000
    cost_per_1k_tokens_input: float = 0.0
    cost_per_1k_tokens_output: float = 0.0
    avg_latency_ms: int = 500  # Average latency in milliseconds

    # Quality scores (0-1)
    reasoning_quality: float = 0.8
    code_quality: float = 0.8
    speed_score: float = 0.8

    # Task preferences
    preferred_tasks: List[TaskType] = field(default_factory=list)
    unsuitable_tasks: List[TaskType] = field(default_factory=list)

    # API configuration
    api_key_env_var: str = ""
    base_url: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            "name": self.name,
            "provider": self.provider,
            "model_id": self.model_id,
            "supports_reasoning": self.supports_reasoning,
            "supports_code": self.supports_code,
            "supports_streaming": self.supports_streaming,
            "supports_multimodal": self.supports_multimodal,
            "supports_embeddings": self.supports_embeddings,
            "max_tokens": self.max_tokens,
            "context_window": self.context_window,
            "cost_per_1k_tokens_input": self.cost_per_1k_tokens_input,
            "cost_per_1k_tokens_output": self.cost_per_1k_tokens_output,
            "avg_latency_ms": self.avg_latency_ms,
            "reasoning_quality": self.reasoning_quality,
            "code_quality": self.code_quality,
            "speed_score": self.speed_score,
            "preferred_tasks": [t.value for t in self.preferred_tasks],
            "unsuitable_tasks": [t.value for t in self.unsuitable_tasks],
        }


@dataclass
class QueryContext:
    """Context information about a query."""
    query: str
    task_type: Optional[TaskType] = None
    complexity: Optional[Complexity] = None
    estimated_tokens: int = 0
    requires_streaming: bool = False
    requires_multimodal: bool = False
    requires_embeddings: bool = False
    priority: str = "normal"          # "low", "normal", "high", "urgent"
    cost_sensitivity: str = "normal"  # "low", "normal", "high"
    latency_sensitivity: str = "normal"  # "low", "normal", "high"
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class RoutingDecision:
    """Result of routing a query."""
    selected_model: ModelCapabilities
    confidence: float  # 0-1 confidence score
    reasoning: str     # Explanation of why this model was chosen
    alternatives: List[ModelCapabilities] = field(default_factory=list)
    estimated_cost: float = 0.0
    estimated_latency_ms: int = 0


class QueryAnalyzer:
    """Analyzes queries to determine characteristics."""

    def __init__(self):
        self.task_keywords = {
            TaskType.REASONING: [
                "analyze", "explain", "why", "how", "reason", "plan", "design",
                "architecture", "strategy", "think", "consider", "evaluate"
            ],
            TaskType.CODE_GENERATION: [
                "write", "create", "generate", "implement", "build", "code",
                "function", "class", "script", "program"
            ],
            TaskType.CODE_EDIT: [
                "refactor", "fix", "improve", "optimize", "update", "modify",
                "change", "edit", "rewrite"
            ],
            TaskType.SUMMARIZATION: [
                "summarize", "summary", "brief", "overview", "condense"
            ],
            TaskType.QUESTION_ANSWERING: [
                "what", "when", "where", "who", "which", "question", "answer"
            ],
            TaskType.CREATIVE: [
                "creative", "imagine", "brainstorm", "story", "poem",
                "write creatively"
            ],
            TaskType.STREAMING: [
                "stream", "real-time", "live", "continuous"
            ],
            TaskType.MULTIMODAL: [
                "image", "picture", "photo", "video", "audio", "visual"
            ],
        }

    def analyze(self, query: str, metadata: Optional[Dict] = None) -> QueryContext:
        """Analyze a query and return context."""
        query_lower = query.lower()

        # Determine task type
        task_type = self._detect_task_type(query_lower)

        # Determine complexity
        complexity = self._detect_complexity(query, task_type)

        # Estimate tokens (rough approximation: ~4 chars per token)
        estimated_tokens = len(query) // 4

        # Check for special requirements
        requires_streaming = any(kw in query_lower for kw in ["stream", "real-time", "live"])
        requires_multimodal = any(kw in query_lower for kw in ["image", "picture", "photo", "video", "audio"])
        requires_embeddings = "embedding" in query_lower or "embed" in query_lower

        return QueryContext(
            query=query,
            task_type=task_type,
            complexity=complexity,
            estimated_tokens=estimated_tokens,
            requires_streaming=requires_streaming,
            requires_multimodal=requires_multimodal,
            requires_embeddings=requires_embeddings,
            metadata=metadata or {}
        )

    def _detect_task_type(self, query_lower: str) -> TaskType:
        """Detect the task type from query keywords."""
        scores = {task_type: 0 for task_type in TaskType}

        for task_type, keywords in self.task_keywords.items():
            for keyword in keywords:
                if keyword in query_lower:
                    scores[task_type] += 1

        # Return task type with highest score, default to QUESTION_ANSWERING
        if max(scores.values()) > 0:
            return max(scores.items(), key=lambda x: x[1])[0]
        return TaskType.QUESTION_ANSWERING

    def _detect_complexity(self, query: str, task_type: TaskType) -> Complexity:
        """Detect query complexity."""
        length = len(query)

        # Simple length-based heuristics
        if length < 50:
            return Complexity.LOW
        elif length < 200:
            return Complexity.MEDIUM
        elif length < 500:
            return Complexity.HIGH
        else:
            return Complexity.VERY_HIGH
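
# Illustrative QueryAnalyzer usage (not part of the original file): keyword
# matches drive task detection, query length drives complexity.
#
#   >>> ctx = QueryAnalyzer().analyze("Summarize this article")
#   >>> ctx.task_type, ctx.complexity
#   (<TaskType.SUMMARIZATION: 'summarization'>, <Complexity.LOW: 'low'>)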

class MCPRouter:
    """Main router class for intelligent model selection."""

    def __init__(self, config_path: Optional[str] = None):
        """Initialize the router."""
        self.models: Dict[str, ModelCapabilities] = {}
        self.analyzer = QueryAnalyzer()
        self.routing_history: List[Dict] = []

        # Load default models
        self._load_default_models()

        # Load custom config if provided
        if config_path:
            self.load_config(config_path)

    def _load_default_models(self):
        """Load default model configurations with latest 2025 industry models."""
        # ================================================================
        # TIER 1: FLAGSHIP MODELS (Complex Architecture & Bug Hunts)
        # ================================================================

        # OpenAI GPT-5.2 - Latest flagship with enhanced reasoning
        self.register_model(ModelCapabilities(
            name="GPT-5.2",
            provider="openai",
            model_id="gpt-5.2",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=32768,
            context_window=256000,
            cost_per_1k_tokens_input=5.00,
            cost_per_1k_tokens_output=15.00,
            avg_latency_ms=1200,
            reasoning_quality=0.99,
            code_quality=0.98,
            speed_score=0.80,
            preferred_tasks=[TaskType.REASONING, TaskType.CODE_GENERATION, TaskType.MULTIMODAL],
            api_key_env_var="OPENAI_API_KEY"
        ))

        # Claude 4.5 Opus - Gold standard for complex refactoring
        self.register_model(ModelCapabilities(
            name="Claude 4.5 Opus",
            provider="anthropic",
            model_id="claude-4.5-opus",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=32768,
            context_window=200000,
            cost_per_1k_tokens_input=25.00,
            cost_per_1k_tokens_output=75.00,
            avg_latency_ms=2000,
            reasoning_quality=0.99,
            code_quality=0.99,
            speed_score=0.60,
            preferred_tasks=[TaskType.REASONING, TaskType.CODE_GENERATION, TaskType.CREATIVE],
            api_key_env_var="ANTHROPIC_API_KEY"
        ))

        # Claude 4.5 Sonnet - Default for most Cursor users
        self.register_model(ModelCapabilities(
            name="Claude 4.5 Sonnet",
            provider="anthropic",
            model_id="claude-4.5-sonnet",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=16384,
            context_window=200000,
            cost_per_1k_tokens_input=5.00,
            cost_per_1k_tokens_output=25.00,
            avg_latency_ms=800,
            reasoning_quality=0.97,
            code_quality=0.98,
            speed_score=0.88,
            preferred_tasks=[TaskType.CODE_GENERATION, TaskType.CODE_EDIT, TaskType.REASONING],
            api_key_env_var="ANTHROPIC_API_KEY"
        ))

        # ================================================================
        # TIER 2: REASONING MODELS (Chain of Thought / Thinking Models)
        # ================================================================

        # OpenAI o3 - Advanced reasoning model
        self.register_model(ModelCapabilities(
            name="o3",
            provider="openai",
            model_id="o3",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=False,
            max_tokens=32768,
            context_window=200000,
            cost_per_1k_tokens_input=10.00,
            cost_per_1k_tokens_output=40.00,
            avg_latency_ms=3000,
            reasoning_quality=0.99,
            code_quality=0.95,
            speed_score=0.50,
            preferred_tasks=[TaskType.REASONING],
            api_key_env_var="OPENAI_API_KEY"
        ))

        # OpenAI o3-mini (High) - Faster reasoning model
        self.register_model(ModelCapabilities(
            name="o3-mini (High)",
            provider="openai",
            model_id="o3-mini-high",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=False,
            max_tokens=16384,
            context_window=128000,
            cost_per_1k_tokens_input=1.50,
            cost_per_1k_tokens_output=6.00,
            avg_latency_ms=1500,
            reasoning_quality=0.95,
            code_quality=0.92,
            speed_score=0.70,
            preferred_tasks=[TaskType.REASONING, TaskType.CODE_GENERATION],
            api_key_env_var="OPENAI_API_KEY"
        ))

        # Claude 3.7 Sonnet - Thinking model option
        self.register_model(ModelCapabilities(
            name="Claude 3.7 Sonnet",
            provider="anthropic",
            model_id="claude-3.7-sonnet",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=16384,
            context_window=200000,
            cost_per_1k_tokens_input=4.00,
            cost_per_1k_tokens_output=20.00,
            avg_latency_ms=1200,
            reasoning_quality=0.96,
            code_quality=0.96,
            speed_score=0.78,
            preferred_tasks=[TaskType.REASONING, TaskType.CODE_GENERATION],
            api_key_env_var="ANTHROPIC_API_KEY"
        ))

        # ================================================================
        # TIER 3: NATIVE & FAST MODELS
        # ================================================================

        # Cursor Composer 1 - Native model optimized for Composer
        self.register_model(ModelCapabilities(
            name="Composer 1",
            provider="cursor",
            model_id="composer-1",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=False,
            max_tokens=16384,
            context_window=128000,
            cost_per_1k_tokens_input=0.10,
            cost_per_1k_tokens_output=0.30,
            avg_latency_ms=200,
            reasoning_quality=0.88,
            code_quality=0.92,
            speed_score=0.98,
            preferred_tasks=[TaskType.CODE_GENERATION, TaskType.CODE_EDIT],
            api_key_env_var="CURSOR_API_KEY"
        ))

        # Gemini 3 Pro - Massive context window for large projects
        self.register_model(ModelCapabilities(
            name="Gemini 3 Pro",
            provider="google",
            model_id="gemini-3-pro",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=32768,
            context_window=2000000,  # 2M token context window
            cost_per_1k_tokens_input=2.00,
            cost_per_1k_tokens_output=8.00,
            avg_latency_ms=1500,
            reasoning_quality=0.96,
            code_quality=0.94,
            speed_score=0.72,
            preferred_tasks=[TaskType.REASONING, TaskType.MULTIMODAL, TaskType.CODE_GENERATION],
            api_key_env_var="GOOGLE_API_KEY"
        ))

        # Gemini 3 Flash - Fast with large context
        self.register_model(ModelCapabilities(
            name="Gemini 3 Flash",
            provider="google",
            model_id="gemini-3-flash",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=16384,
            context_window=1000000,  # 1M token context window
            cost_per_1k_tokens_input=0.10,
            cost_per_1k_tokens_output=0.40,
            avg_latency_ms=400,
            reasoning_quality=0.88,
            code_quality=0.90,
            speed_score=0.95,
            preferred_tasks=[TaskType.QUESTION_ANSWERING, TaskType.SUMMARIZATION, TaskType.MULTIMODAL],
            api_key_env_var="GOOGLE_API_KEY"
        ))

        # ================================================================
        # TIER 4: LEGACY/BUDGET MODELS (Still Available)
        # ================================================================

        # GPT-4o - Previous flagship, still excellent
        self.register_model(ModelCapabilities(
            name="GPT-4o",
            provider="openai",
            model_id="gpt-4o",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=16384,
            context_window=128000,
            cost_per_1k_tokens_input=2.50,
            cost_per_1k_tokens_output=10.00,
            avg_latency_ms=800,
            reasoning_quality=0.95,
            code_quality=0.95,
            speed_score=0.85,
            preferred_tasks=[TaskType.REASONING, TaskType.CODE_GENERATION, TaskType.MULTIMODAL],
            api_key_env_var="OPENAI_API_KEY"
        ))

        # GPT-4o-mini - Fast and cost-effective
        self.register_model(ModelCapabilities(
            name="GPT-4o-mini",
            provider="openai",
            model_id="gpt-4o-mini",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=16384,
            context_window=128000,
            cost_per_1k_tokens_input=0.15,
            cost_per_1k_tokens_output=0.60,
            avg_latency_ms=400,
            reasoning_quality=0.80,
            code_quality=0.85,
            speed_score=0.95,
            preferred_tasks=[TaskType.CODE_EDIT, TaskType.QUESTION_ANSWERING, TaskType.SUMMARIZATION],
            api_key_env_var="OPENAI_API_KEY"
        ))

        # Claude 3.5 Sonnet - Previous default, still great
        self.register_model(ModelCapabilities(
            name="Claude 3.5 Sonnet",
            provider="anthropic",
            model_id="claude-3-5-sonnet-20241022",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=8192,
            context_window=200000,
            cost_per_1k_tokens_input=3.00,
            cost_per_1k_tokens_output=15.00,
            avg_latency_ms=1000,
            reasoning_quality=0.96,
            code_quality=0.97,
            speed_score=0.80,
            preferred_tasks=[TaskType.REASONING, TaskType.CODE_GENERATION],
            api_key_env_var="ANTHROPIC_API_KEY"
        ))

        # Claude 3.5 Haiku - Fast budget option
        self.register_model(ModelCapabilities(
            name="Claude 3.5 Haiku",
            provider="anthropic",
            model_id="claude-3-5-haiku-20241022",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=False,
            max_tokens=8192,
            context_window=200000,
            cost_per_1k_tokens_input=0.80,
            cost_per_1k_tokens_output=4.00,
            avg_latency_ms=400,
            reasoning_quality=0.85,
            code_quality=0.88,
            speed_score=0.92,
            preferred_tasks=[TaskType.CODE_EDIT, TaskType.QUESTION_ANSWERING, TaskType.SUMMARIZATION],
            api_key_env_var="ANTHROPIC_API_KEY"
        ))

        # Gemini 2.0 Pro - Current stable Gemini
        self.register_model(ModelCapabilities(
            name="Gemini 2.0 Pro",
            provider="google",
            model_id="gemini-2.0-pro",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=8192,
            context_window=2000000,
            cost_per_1k_tokens_input=1.25,
            cost_per_1k_tokens_output=5.00,
            avg_latency_ms=1200,
            reasoning_quality=0.94,
            code_quality=0.92,
            speed_score=0.75,
            preferred_tasks=[TaskType.REASONING, TaskType.MULTIMODAL, TaskType.CODE_GENERATION],
            api_key_env_var="GOOGLE_API_KEY"
        ))

        # Gemini 2.0 Flash - Fast Gemini option
        self.register_model(ModelCapabilities(
            name="Gemini 2.0 Flash",
            provider="google",
            model_id="gemini-2.0-flash",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=True,
            max_tokens=8192,
            context_window=1000000,
            cost_per_1k_tokens_input=0.075,
            cost_per_1k_tokens_output=0.30,
            avg_latency_ms=500,
            reasoning_quality=0.82,
            code_quality=0.85,
            speed_score=0.90,
            preferred_tasks=[TaskType.QUESTION_ANSWERING, TaskType.SUMMARIZATION, TaskType.MULTIMODAL],
            api_key_env_var="GOOGLE_API_KEY"
        ))

        # DeepSeek V3 - Open source powerhouse
        self.register_model(ModelCapabilities(
            name="DeepSeek V3",
            provider="deepseek",
            model_id="deepseek-v3",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=False,
            max_tokens=8192,
            context_window=128000,
            cost_per_1k_tokens_input=0.14,
            cost_per_1k_tokens_output=0.28,
            avg_latency_ms=600,
            reasoning_quality=0.92,
            code_quality=0.94,
            speed_score=0.88,
            preferred_tasks=[TaskType.CODE_GENERATION, TaskType.REASONING],
            api_key_env_var="DEEPSEEK_API_KEY"
        ))

        # DeepSeek R1 - Reasoning-focused open source
        self.register_model(ModelCapabilities(
            name="DeepSeek R1",
            provider="deepseek",
            model_id="deepseek-r1",
            supports_reasoning=True,
            supports_code=True,
            supports_streaming=True,
            supports_multimodal=False,
            max_tokens=16384,
            context_window=128000,
            cost_per_1k_tokens_input=0.55,
            cost_per_1k_tokens_output=2.19,
            avg_latency_ms=1500,
            reasoning_quality=0.96,
            code_quality=0.92,
            speed_score=0.70,
            preferred_tasks=[TaskType.REASONING, TaskType.CODE_GENERATION],
            api_key_env_var="DEEPSEEK_API_KEY"
        ))

    def register_model(self, model: ModelCapabilities):
        """Register a model with the router."""
        self.models[model.model_id] = model
        logger.info(f"Registered model: {model.name} ({model.model_id})")

    def route(
        self,
        query: str,
        context: Optional[QueryContext] = None,
        strategy: str = "balanced"
    ) -> RoutingDecision:
        """
        Route a query to the best model.

        Args:
            query: The user query
            context: Optional pre-analyzed context
            strategy: Routing strategy ("balanced", "cost", "speed", "quality")

        Returns:
            RoutingDecision with selected model and reasoning
        """
        # Analyze query if context not provided
        if context is None:
            context = self.analyzer.analyze(query)

        # Filter compatible models
        compatible_models = self._filter_compatible_models(context)
        if not compatible_models:
            raise ValueError("No compatible models found for this query")

        # Score models based on strategy
        scored_models = self._score_models(compatible_models, context, strategy)

        # Select best model
        best_model, score = scored_models[0]

        # Get alternatives
        alternatives = [model for model, _ in scored_models[1:4]]  # Top 3 alternatives

        # Estimate cost and latency
        estimated_cost = self._estimate_cost(best_model, context)
        estimated_latency = best_model.avg_latency_ms

        # Generate reasoning
        reasoning = self._generate_reasoning(best_model, context, strategy, score)

        decision = RoutingDecision(
            selected_model=best_model,
            confidence=score,
            reasoning=reasoning,
            alternatives=alternatives,
            estimated_cost=estimated_cost,
            estimated_latency_ms=estimated_latency
        )

        # Log routing decision
        self.routing_history.append({
            "query": query,
            "decision": decision.selected_model.model_id,
            "confidence": decision.confidence,
            "timestamp": time.time()
        })

        return decision

    def _filter_compatible_models(self, context: QueryContext) -> List[ModelCapabilities]:
        """Filter models that are compatible with the query requirements."""
        compatible = []

        for model in self.models.values():
            # Note: API key check is optional for routing (only needed for execution).
            # This allows routing decisions even when API keys aren't configured.

            # Check streaming requirement
            if context.requires_streaming and not model.supports_streaming:
                continue

            # Check multimodal requirement
            if context.requires_multimodal and not model.supports_multimodal:
                continue

            # Check embeddings requirement
            if context.requires_embeddings and not model.supports_embeddings:
                continue

            # Check task compatibility
            if context.task_type and context.task_type in model.unsuitable_tasks:
                continue

            # Check context window
            if context.estimated_tokens > model.context_window:
                continue

            compatible.append(model)

        return compatible

    def _score_models(
        self,
        models: List[ModelCapabilities],
        context: QueryContext,
        strategy: str
    ) -> List[tuple]:
        """Score models based on strategy and context."""
        scored = []

        for model in models:
            score = 0.0

            # Base score from quality
            if context.task_type == TaskType.REASONING:
                score += model.reasoning_quality * 0.4
            elif context.task_type in [TaskType.CODE_GENERATION, TaskType.CODE_EDIT]:
                score += model.code_quality * 0.4
            else:
                score += (model.reasoning_quality + model.code_quality) / 2 * 0.3

            # Task preference bonus
            if context.task_type in model.preferred_tasks:
                score += 0.2

            # Strategy-based scoring
            if strategy == "cost":
                # Prefer lower cost
                cost_score = 1.0 / (1.0 + model.cost_per_1k_tokens_input / 10.0)
                score += cost_score * 0.3
            elif strategy == "speed":
                # Prefer faster models
                score += model.speed_score * 0.3
            elif strategy == "quality":
                # Prefer higher quality
                quality_score = (model.reasoning_quality + model.code_quality) / 2
                score += quality_score * 0.3
            else:  # balanced
                # Balance cost, speed, and quality
                cost_score = 1.0 / (1.0 + model.cost_per_1k_tokens_input / 10.0)
                score += (cost_score * 0.1 +
                          model.speed_score * 0.1 +
                          (model.reasoning_quality + model.code_quality) / 2 * 0.1)

            # Complexity matching
            if context.complexity == Complexity.VERY_HIGH:
                score += model.reasoning_quality * 0.1
            elif context.complexity == Complexity.LOW:
                score += model.speed_score * 0.1

            scored.append((model, score))

        # Sort by score (descending)
        scored.sort(key=lambda x: x[1], reverse=True)
        return scored

    def _estimate_cost(self, model: ModelCapabilities, context: QueryContext) -> float:
        """Estimate cost for the query."""
        input_cost = (context.estimated_tokens / 1000) * model.cost_per_1k_tokens_input

        # Assume output is ~50% of input for estimation
        output_tokens = context.estimated_tokens // 2
        output_cost = (output_tokens / 1000) * model.cost_per_1k_tokens_output

        return input_cost + output_cost

    def _generate_reasoning(
        self,
        model: ModelCapabilities,
        context: QueryContext,
        strategy: str,
        score: float
    ) -> str:
        """Generate human-readable reasoning for the selection."""
        reasons = []

        if context.task_type in model.preferred_tasks:
            reasons.append(f"Model is optimized for {context.task_type.value} tasks")

        if strategy == "cost":
            reasons.append("Selected for cost efficiency")
        elif strategy == "speed":
            reasons.append("Selected for low latency")
        elif strategy == "quality":
            reasons.append("Selected for highest quality")
        else:
            reasons.append("Selected for balanced performance")

        if context.complexity == Complexity.VERY_HIGH:
            reasons.append("High complexity requires strong reasoning capabilities")

        if context.requires_streaming:
            reasons.append("Streaming support required")

        if context.requires_multimodal:
            reasons.append("Multimodal capabilities required")

        return "; ".join(reasons) if reasons else f"Selected {model.name} with confidence {score:.2f}"
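
    # Illustrative config shape for load_config/save_config (comment added for
    # clarity, not in the original file). Task lists are serialized as TaskType
    # values, so they must be converted back to enums on load:
    #
    # {
    #   "models": [
    #     {"name": "GPT-4o", "provider": "openai", "model_id": "gpt-4o",
    #      "preferred_tasks": ["reasoning", "code_generation"], ...}
    #   ]
    # }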
    def load_config(self, config_path: str):
        """Load model configuration from JSON file."""
        with open(config_path, 'r') as f:
            config = json.load(f)

        for model_config in config.get("models", []):
            # Task lists are serialized as strings by to_dict(); convert them
            # back to TaskType so scoring and re-serialization keep working.
            for key in ("preferred_tasks", "unsuitable_tasks"):
                if key in model_config:
                    model_config[key] = [TaskType(v) for v in model_config[key]]
            model = ModelCapabilities(**model_config)
            self.register_model(model)

    def save_config(self, config_path: str):
        """Save current model configuration to JSON file."""
        config = {
            "models": [model.to_dict() for model in self.models.values()]
        }
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

    def get_routing_stats(self) -> Dict[str, Any]:
        """Get statistics about routing decisions."""
        if not self.routing_history:
            return {"total_routes": 0}

        model_counts = {}
        for entry in self.routing_history:
            model_id = entry["decision"]
            model_counts[model_id] = model_counts.get(model_id, 0) + 1

        return {
            "total_routes": len(self.routing_history),
            "model_usage": model_counts,
            "avg_confidence": sum(e["confidence"] for e in self.routing_history) / len(self.routing_history)
        }
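
if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): route one query
    # under each strategy and print the decision. The registered models and
    # prices above are illustrative defaults, not live provider data.
    router = MCPRouter()
    for strategy in ("balanced", "cost", "speed", "quality"):
        decision = router.route(
            "Explain the architecture trade-offs of this design",
            strategy=strategy
        )
        print(f"[{strategy}] {decision.selected_model.name} "
              f"(confidence={decision.confidence:.2f}, "
              f"~${decision.estimated_cost:.4f}, "
              f"~{decision.estimated_latency_ms} ms): {decision.reasoning}")
    print(json.dumps(router.get_routing_stats(), indent=2))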
