"""Gemini model provider implementation."""
import base64
import logging
import time
from typing import TYPE_CHECKING, Optional

from google import genai
from google.genai import types

from .base import ModelCapabilities, ModelProvider, ModelResponse, ProviderType, create_temperature_constraint

if TYPE_CHECKING:
    from tools.models import ToolModelCategory
logger = logging.getLogger(__name__)
class GeminiModelProvider(ModelProvider):
"""Google Gemini model provider implementation."""
# Model configurations using ModelCapabilities objects
SUPPORTED_MODELS = {
"gemini-2.5-pro": ModelCapabilities(
provider=ProviderType.GOOGLE,
model_name="gemini-2.5-pro",
friendly_name="Gemini (Pro 2.5)",
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
supports_extended_thinking=True,
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True,
supports_images=True, # Vision capability
max_image_size_mb=32.0, # Higher limit for Pro model
supports_temperature=True,
temperature_constraint=create_temperature_constraint("range"),
max_thinking_tokens=32768, # Max thinking tokens for Pro model
description="Deep reasoning + thinking mode (1M context) - Complex problems, architecture, deep analysis",
aliases=["pro", "gemini pro", "gemini-pro"],
),
"gemini-2.0-flash": ModelCapabilities(
provider=ProviderType.GOOGLE,
model_name="gemini-2.0-flash",
friendly_name="Gemini (Flash 2.0)",
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
supports_extended_thinking=True, # Experimental thinking mode
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True,
supports_images=True, # Vision capability
max_image_size_mb=20.0, # Conservative 20MB limit for reliability
supports_temperature=True,
temperature_constraint=create_temperature_constraint("range"),
max_thinking_tokens=24576, # Same as 2.5 flash for consistency
description="Gemini 2.0 Flash (1M context) - Latest fast model with experimental thinking, supports audio/video input",
aliases=["flash-2.0", "flash2"],
),
"gemini-2.0-flash-lite": ModelCapabilities(
provider=ProviderType.GOOGLE,
model_name="gemini-2.0-flash-lite",
friendly_name="Gemin (Flash Lite 2.0)",
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
            supports_extended_thinking=False, # Thinking mode not supported by this model
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True,
supports_images=False, # Does not support images
max_image_size_mb=0.0, # No image support
supports_temperature=True,
temperature_constraint=create_temperature_constraint("range"),
description="Gemini 2.0 Flash Lite (1M context) - Lightweight fast model, text-only",
aliases=["flashlite", "flash-lite"],
),
"gemini-2.5-flash": ModelCapabilities(
provider=ProviderType.GOOGLE,
model_name="gemini-2.5-flash",
friendly_name="Gemini (Flash 2.5)",
context_window=1_048_576, # 1M tokens
max_output_tokens=65_536,
supports_extended_thinking=True,
supports_system_prompts=True,
supports_streaming=True,
supports_function_calling=True,
supports_json_mode=True,
supports_images=True, # Vision capability
max_image_size_mb=20.0, # Conservative 20MB limit for reliability
supports_temperature=True,
temperature_constraint=create_temperature_constraint("range"),
max_thinking_tokens=24576, # Flash 2.5 thinking budget limit
description="Ultra-fast (1M context) - Quick analysis, simple queries, rapid iterations",
aliases=["flash", "flash2.5"],
),
}
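    # Shorthand aliases above are resolved by _resolve_model_name (inherited from
    # ModelProvider), e.g. "pro" -> "gemini-2.5-pro" and "flash" -> "gemini-2.5-flash".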
# Thinking mode configurations - percentages of model's max_thinking_tokens
# These percentages work across all models that support thinking
THINKING_BUDGETS = {
"minimal": 0.005, # 0.5% of max - minimal thinking for fast responses
"low": 0.08, # 8% of max - light reasoning tasks
"medium": 0.33, # 33% of max - balanced reasoning (default)
"high": 0.67, # 67% of max - complex analysis
"max": 1.0, # 100% of max - full thinking budget
}
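    # Worked example: "medium" on gemini-2.5-pro yields int(32_768 * 0.33) = 10_813
    # thinking tokens, while "high" on gemini-2.5-flash yields int(24_576 * 0.67) = 16_465.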
    # Model-specific thinking token limits (kept in sync with the max_thinking_tokens
    # values declared in SUPPORTED_MODELS above)
    MAX_THINKING_TOKENS = {
"gemini-2.0-flash": 24576, # Same as 2.5 flash for consistency
"gemini-2.0-flash-lite": 0, # No thinking support
"gemini-2.5-flash": 24576, # Flash 2.5 thinking budget limit
"gemini-2.5-pro": 32768, # Pro 2.5 thinking budget limit
}
def __init__(self, api_key: str, **kwargs):
"""Initialize Gemini provider with API key."""
super().__init__(api_key, **kwargs)
self._client = None
self._token_counters = {} # Cache for token counting
@property
def client(self):
"""Lazy initialization of Gemini client."""
if self._client is None:
self._client = genai.Client(api_key=self.api_key)
return self._client
def get_capabilities(self, model_name: str) -> ModelCapabilities:
"""Get capabilities for a specific Gemini model."""
# Resolve shorthand
resolved_name = self._resolve_model_name(model_name)
if resolved_name not in self.SUPPORTED_MODELS:
raise ValueError(f"Unsupported Gemini model: {model_name}")
# Check if model is allowed by restrictions
from utils.model_restrictions import get_restriction_service
restriction_service = get_restriction_service()
# IMPORTANT: Parameter order is (provider_type, model_name, original_name)
# resolved_name is the canonical model name, model_name is the user input
if not restriction_service.is_allowed(ProviderType.GOOGLE, resolved_name, model_name):
raise ValueError(f"Gemini model '{resolved_name}' is not allowed by restriction policy.")
# Return the ModelCapabilities object directly from SUPPORTED_MODELS
return self.SUPPORTED_MODELS[resolved_name]
def generate_content(
self,
prompt: str,
model_name: str,
system_prompt: Optional[str] = None,
temperature: float = 0.3,
max_output_tokens: Optional[int] = None,
thinking_mode: str = "medium",
images: Optional[list[str]] = None,
**kwargs,
) -> ModelResponse:
"""Generate content using Gemini model."""
# Validate parameters
resolved_name = self._resolve_model_name(model_name)
self.validate_parameters(model_name, temperature)
# Prepare content parts (text and potentially images)
parts = []
# Add system and user prompts as text
if system_prompt:
full_prompt = f"{system_prompt}\n\n{prompt}"
else:
full_prompt = prompt
parts.append({"text": full_prompt})
# Add images if provided and model supports vision
if images and self._supports_vision(resolved_name):
for image_path in images:
try:
image_part = self._process_image(image_path)
if image_part:
parts.append(image_part)
except Exception as e:
logger.warning(f"Failed to process image {image_path}: {e}")
# Continue with other images and text
continue
elif images and not self._supports_vision(resolved_name):
logger.warning(f"Model {resolved_name} does not support images, ignoring {len(images)} image(s)")
# Create contents structure
contents = [{"parts": parts}]
# Prepare generation config
generation_config = types.GenerateContentConfig(
temperature=temperature,
candidate_count=1,
)
# Add max output tokens if specified
if max_output_tokens:
generation_config.max_output_tokens = max_output_tokens
# Add thinking configuration for models that support it
capabilities = self.get_capabilities(model_name)
if capabilities.supports_extended_thinking and thinking_mode in self.THINKING_BUDGETS:
# Get model's max thinking tokens and calculate actual budget
model_config = self.SUPPORTED_MODELS.get(resolved_name)
if model_config and model_config.max_thinking_tokens > 0:
max_thinking_tokens = model_config.max_thinking_tokens
actual_thinking_budget = int(max_thinking_tokens * self.THINKING_BUDGETS[thinking_mode])
generation_config.thinking_config = types.ThinkingConfig(thinking_budget=actual_thinking_budget)
# Retry logic with progressive delays
max_retries = 4 # Total of 4 attempts
retry_delays = [1, 3, 5, 8] # Progressive delays: 1s, 3s, 5s, 8s
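        # With 4 attempts, at most the first three delays (1s, 3s, 5s) are actually slept;
        # the final failed attempt raises instead of waiting.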
last_exception = None
for attempt in range(max_retries):
try:
# Generate content
response = self.client.models.generate_content(
model=resolved_name,
contents=contents,
config=generation_config,
)
# Extract usage information if available
usage = self._extract_usage(response)
return ModelResponse(
content=response.text,
usage=usage,
model_name=resolved_name,
friendly_name="Gemini",
provider=ProviderType.GOOGLE,
metadata={
"thinking_mode": thinking_mode if capabilities.supports_extended_thinking else None,
"finish_reason": (
getattr(response.candidates[0], "finish_reason", "STOP") if response.candidates else "STOP"
),
},
)
except Exception as e:
last_exception = e
# Check if this is a retryable error using structured error codes
is_retryable = self._is_error_retryable(e)
# If this is the last attempt or not retryable, give up
if attempt == max_retries - 1 or not is_retryable:
break
# Get progressive delay
delay = retry_delays[attempt]
# Log retry attempt
logger.warning(
f"Gemini API error for model {resolved_name}, attempt {attempt + 1}/{max_retries}: {str(e)}. Retrying in {delay}s..."
)
time.sleep(delay)
# If we get here, all retries failed
actual_attempts = attempt + 1 # Convert from 0-based index to human-readable count
error_msg = f"Gemini API error for model {resolved_name} after {actual_attempts} attempt{'s' if actual_attempts > 1 else ''}: {str(last_exception)}"
raise RuntimeError(error_msg) from last_exception
def count_tokens(self, text: str, model_name: str) -> int:
"""Count tokens for the given text using Gemini's tokenizer."""
        # Resolve any alias for consistency; the estimate below is model-independent
        self._resolve_model_name(model_name)
# For now, use a simple estimation
# TODO: Use actual Gemini tokenizer when available in SDK
# Rough estimation: ~4 characters per token for English text
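        # e.g. a ~2,000-character English prompt is estimated at roughly 500 tokens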
return len(text) // 4
def get_provider_type(self) -> ProviderType:
"""Get the provider type."""
return ProviderType.GOOGLE
def validate_model_name(self, model_name: str) -> bool:
"""Validate if the model name is supported and allowed."""
resolved_name = self._resolve_model_name(model_name)
# First check if model is supported
if resolved_name not in self.SUPPORTED_MODELS:
return False
# Then check if model is allowed by restrictions
from utils.model_restrictions import get_restriction_service
restriction_service = get_restriction_service()
# IMPORTANT: Parameter order is (provider_type, model_name, original_name)
# resolved_name is the canonical model name, model_name is the user input
if not restriction_service.is_allowed(ProviderType.GOOGLE, resolved_name, model_name):
logger.debug(f"Gemini model '{model_name}' -> '{resolved_name}' blocked by restrictions")
return False
return True
def supports_thinking_mode(self, model_name: str) -> bool:
"""Check if the model supports extended thinking mode."""
capabilities = self.get_capabilities(model_name)
return capabilities.supports_extended_thinking
def get_thinking_budget(self, model_name: str, thinking_mode: str) -> int:
"""Get actual thinking token budget for a model and thinking mode."""
resolved_name = self._resolve_model_name(model_name)
model_config = self.SUPPORTED_MODELS.get(resolved_name)
if not model_config or not model_config.supports_extended_thinking:
return 0
if thinking_mode not in self.THINKING_BUDGETS:
return 0
max_thinking_tokens = model_config.max_thinking_tokens
if max_thinking_tokens == 0:
return 0
return int(max_thinking_tokens * self.THINKING_BUDGETS[thinking_mode])
def _extract_usage(self, response) -> dict[str, int]:
"""Extract token usage from Gemini response."""
usage = {}
# Try to extract usage metadata from response
# Note: The actual structure depends on the SDK version and response format
if hasattr(response, "usage_metadata"):
metadata = response.usage_metadata
# Extract token counts with explicit None checks
input_tokens = None
output_tokens = None
if hasattr(metadata, "prompt_token_count"):
value = metadata.prompt_token_count
if value is not None:
input_tokens = value
usage["input_tokens"] = value
if hasattr(metadata, "candidates_token_count"):
value = metadata.candidates_token_count
if value is not None:
output_tokens = value
usage["output_tokens"] = value
# Calculate total only if both values are available and valid
if input_tokens is not None and output_tokens is not None:
usage["total_tokens"] = input_tokens + output_tokens
return usage
def _supports_vision(self, model_name: str) -> bool:
"""Check if the model supports vision (image processing)."""
        # Vision-capable Gemini models (mirrors supports_images in SUPPORTED_MODELS;
        # the 1.5 entries cover legacy names that are not in SUPPORTED_MODELS)
        vision_models = {
"gemini-2.5-flash",
"gemini-2.5-pro",
"gemini-2.0-flash",
"gemini-1.5-pro",
"gemini-1.5-flash",
}
return model_name in vision_models
def _is_error_retryable(self, error: Exception) -> bool:
"""Determine if an error should be retried based on structured error codes.
Uses Gemini API error structure instead of text pattern matching for reliability.
Args:
error: Exception from Gemini API call
Returns:
True if error should be retried, False otherwise
"""
error_str = str(error).lower()
# Check for 429 errors first - these need special handling
if "429" in error_str or "quota" in error_str or "resource_exhausted" in error_str:
# For Gemini, check for specific non-retryable error indicators
# These typically indicate permanent failures or quota/size limits
non_retryable_indicators = [
"quota exceeded",
"resource exhausted",
"context length",
"token limit",
"request too large",
"invalid request",
"quota_exceeded",
"resource_exhausted",
]
# Also check if this is a structured error from Gemini SDK
try:
# Try to access error details if available
if hasattr(error, "details") or hasattr(error, "reason"):
# Gemini API errors may have structured details
error_details = getattr(error, "details", "") or getattr(error, "reason", "")
error_details_str = str(error_details).lower()
# Check for non-retryable error codes/reasons
if any(indicator in error_details_str for indicator in non_retryable_indicators):
logger.debug(f"Non-retryable Gemini error: {error_details}")
return False
except Exception:
pass
# Check main error string for non-retryable patterns
if any(indicator in error_str for indicator in non_retryable_indicators):
logger.debug(f"Non-retryable Gemini error based on message: {error_str[:200]}...")
return False
# If it's a 429/quota error but doesn't match non-retryable patterns, it might be retryable rate limiting
logger.debug(f"Retryable Gemini rate limiting error: {error_str[:100]}...")
return True
# For non-429 errors, check if they're retryable
retryable_indicators = [
"timeout",
"connection",
"network",
"temporary",
"unavailable",
"retry",
"internal error",
"408", # Request timeout
"500", # Internal server error
"502", # Bad gateway
"503", # Service unavailable
"504", # Gateway timeout
"ssl", # SSL errors
"handshake", # Handshake failures
]
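        # Illustrative: "503 Service Unavailable" or a dropped connection is retried,
        # while a 400 INVALID_ARGUMENT error matches neither list above and is not.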
return any(indicator in error_str for indicator in retryable_indicators)
def _process_image(self, image_path: str) -> Optional[dict]:
"""Process an image for Gemini API."""
try:
# Use base class validation
image_bytes, mime_type = self.validate_image(image_path)
# For data URLs, extract the base64 data directly
if image_path.startswith("data:"):
# Extract base64 data from data URL
_, data = image_path.split(",", 1)
return {"inline_data": {"mime_type": mime_type, "data": data}}
else:
# For file paths, encode the bytes
image_data = base64.b64encode(image_bytes).decode()
return {"inline_data": {"mime_type": mime_type, "data": image_data}}
except ValueError as e:
logger.warning(str(e))
return None
except Exception as e:
logger.error(f"Error processing image {image_path}: {e}")
return None
def get_preferred_model(self, category: "ToolModelCategory", allowed_models: list[str]) -> Optional[str]:
"""Get Gemini's preferred model for a given category from allowed models.
Args:
category: The tool category requiring a model
allowed_models: Pre-filtered list of models allowed by restrictions
Returns:
Preferred model name or None
"""
from tools.models import ToolModelCategory
if not allowed_models:
return None
# Helper to find best model from candidates
def find_best(candidates: list[str]) -> Optional[str]:
"""Return best model from candidates (sorted for consistency)."""
return sorted(candidates, reverse=True)[0] if candidates else None
if category == ToolModelCategory.EXTENDED_REASONING:
# For extended reasoning, prefer models with thinking support
# First try Pro models that support thinking
pro_thinking = [
m
for m in allowed_models
if "pro" in m and m in self.SUPPORTED_MODELS and self.SUPPORTED_MODELS[m].supports_extended_thinking
]
if pro_thinking:
return find_best(pro_thinking)
# Then any model that supports thinking
any_thinking = [
m
for m in allowed_models
if m in self.SUPPORTED_MODELS and self.SUPPORTED_MODELS[m].supports_extended_thinking
]
if any_thinking:
return find_best(any_thinking)
# Finally, just prefer Pro models even without thinking
pro_models = [m for m in allowed_models if "pro" in m]
if pro_models:
return find_best(pro_models)
elif category == ToolModelCategory.FAST_RESPONSE:
# Prefer Flash models for speed
flash_models = [m for m in allowed_models if "flash" in m]
if flash_models:
return find_best(flash_models)
# Default for BALANCED or as fallback
# Prefer Flash for balanced use, then Pro, then anything
flash_models = [m for m in allowed_models if "flash" in m]
if flash_models:
return find_best(flash_models)
pro_models = [m for m in allowed_models if "pro" in m]
if pro_models:
return find_best(pro_models)
# Ultimate fallback to best available model
return find_best(allowed_models)
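

# Illustrative usage sketch (an assumption for demonstration, not part of the provider API):
# these calls stay local because the Gemini client is created lazily, so no network request
# is made. The API key is a placeholder, and this assumes the base ModelProvider simply
# stores the key. Because of the relative import, run it as a module within its package
# (python -m <package>.gemini); the package path depends on the repo layout.
if __name__ == "__main__":
    provider = GeminiModelProvider(api_key="placeholder-key")
    # "high" allocates 67% of gemini-2.5-pro's 32_768-token thinking budget
    print(provider.get_thinking_budget("gemini-2.5-pro", "high"))  # 21954
    # Character-based estimate (~4 chars per token)
    print(provider.count_tokens("Explain the retry strategy in two sentences.", "gemini-2.5-flash"))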