# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
"""Media Generation Models Demo - Veo, TTS, Lyria, and Gemini Image.
This demo showcases all media generation capabilities in the Google GenAI plugin.
See README.md for detailed testing instructions and configuration options.
Key Concepts (ELI5):
┌─────────────────────┬────────────────────────────────────────────────────┐
│ Concept │ ELI5 Explanation │
├─────────────────────┼────────────────────────────────────────────────────┤
│ Veo │ Google's video generation AI. Describe a scene, │
│ │ get a video clip. Takes 30s to 5min. │
├─────────────────────┼────────────────────────────────────────────────────┤
│ TTS │ Text-To-Speech. AI reads text aloud with │
│ │ realistic human-like voices. │
├─────────────────────┼────────────────────────────────────────────────────┤
│ Lyria │ AI music generation. Describe a song and │
│ │ get an audio file back. │
├─────────────────────┼────────────────────────────────────────────────────┤
│ Gemini Image │ Image generation from Gemini. Describe what │
│ │ you want to see, AI draws it. │
├─────────────────────┼────────────────────────────────────────────────────┤
│ Background Model │ Long-running generation (like Veo). Start it, │
│ │ poll for status, get result when ready. │
├─────────────────────┼────────────────────────────────────────────────────┤
│ Standard Model │ Quick generation (TTS, Image). Get results │
│ │ directly in the response. │
└─────────────────────┴────────────────────────────────────────────────────┘
┌────────────────────────────────────────────────────────────────────────┐
│ Media Generation Models Overview │
├────────────────────────────────────────────────────────────────────────┤
│ │
│ Model Type │ Output │ API │ Latency │
│ ────────────────┼────────────────┼──────────────────┼────────────────│
│ Veo (Video) │ MP4 video URL │ Background Model │ 30s - 5min │
│ TTS (Speech) │ Audio (WAV) │ Standard Model │ ~1-5 seconds │
│ Lyria (Audio) │ Audio (WAV) │ Standard Model │ ~5-30 seconds │
│ Gemini Image │ Image │ Standard Model │ ~5-15 seconds │
│ │
└────────────────────────────────────────────────────────────────────────┘
Why Different Model Patterns?
=============================
- **Standard Models** (TTS, Lyria, Gemini Image): Return results directly.
Response time is typically under 30 seconds.
- **Background Models** (Veo): Long-running operations that can take minutes.
  They use a start/poll/complete pattern with Operation objects.
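A minimal sketch of the background pattern, mirroring `veo_video_generator`
below (names come from this file)::

    operation = await video_model.start(request)   # returns quickly with a job handle
    while not operation.done:
        await asyncio.sleep(3)                     # poll interval used by this demo
        operation = await video_model.check(operation)
    # operation.output now carries the generated media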
┌──────────────────────────────────────────────────────────────────────┐
│ Response Time Comparison │
├──────────────────────────────────────────────────────────────────────┤
│ │
│ Text: |█| ~100ms │
│ TTS: |████| ~1-5s │
│ Image: |████████| ~5-15s │
│ Audio: |████████████| ~5-30s │
│ Video: |████████████████████████████████████| 30s - 5min │
│ │
└──────────────────────────────────────────────────────────────────────┘
Available Flows
===============
- `tts_speech_generator` - Text-to-speech with voice selection
- `gemini_image_generator` - Image generation with Gemini
- `lyria_audio_generator` - Music/audio generation (Vertex AI)
- `veo_video_generator` - Video generation (background model)
- `media_models_overview` - Overview of which models are available in this environment
"""
import asyncio
import base64
import os
import time
import uuid
from typing import Any
from pydantic import BaseModel, Field
from rich.traceback import install as install_rich_traceback
from genkit.ai import Genkit
from genkit.blocks.background_model import lookup_background_action
from genkit.core.action import ActionRunContext
from genkit.core.typing import (
Error,
FinishReason,
GenerateRequest,
GenerateResponse,
Message,
ModelInfo,
Operation,
Part,
Role,
Supports,
TextPart,
)
# ============================================================================
# Rich Traceback - Beautiful, Rust-like error messages
# ============================================================================
install_rich_traceback(show_locals=True, width=120, extra_lines=3)
# ============================================================================
# Environment Detection
# ============================================================================
HAS_GEMINI_API_KEY = bool(os.getenv('GEMINI_API_KEY'))
HAS_GCP_PROJECT = bool(os.getenv('GOOGLE_CLOUD_PROJECT'))
# Initialize Genkit
if HAS_GEMINI_API_KEY:
from genkit.plugins.google_genai import GoogleAI
ai = Genkit(plugins=[GoogleAI()])
else:
ai = Genkit()
if HAS_GCP_PROJECT:
    # NOTE: No Vertex AI plugin is registered in this demo, yet
    # get_lyria_model() returns 'vertexai/lyria-002' whenever
    # GOOGLE_CLOUD_PROJECT is set. Register the appropriate Vertex AI plugin
    # here to serve real Lyria requests.
    pass
class TtsInput(BaseModel):
"""Input for TTS speech generation flow."""
text: str = Field(
default='Meow! The magnificent cat leaps gracefully across the rooftops at sunset!',
description='The text to convert to speech.',
)
voice: str = Field(
default='Kore',
description='The voice to use for speech generation. Options: Zephyr, Puck, Charon, Kore, etc.',
)
class ImageInput(BaseModel):
"""Input for image generation flow."""
prompt: str = Field(
default='A fluffy orange cat wearing a tiny crown, sitting on a velvet cushion',
description='Description of the image to generate.',
)
aspect_ratio: str = Field(
default='16:9',
description='Image aspect ratio (e.g., 16:9, 1:1, 9:16).',
)
class AudioInput(BaseModel):
"""Input for audio/music generation flow."""
prompt: str = Field(
default='Playful jazz music with soft piano and gentle purring cat sounds',
description='Description of the audio to generate.',
)
negative_prompt: str | None = Field(
default=None,
description='A description of what to avoid in the generated audio.',
)
class VideoInput(BaseModel):
"""Input for video generation flow."""
prompt: str = Field(
default='A cat chasing a butterfly through a sunlit garden, cinematic slow motion',
description='Description of the video to generate.',
)
aspect_ratio: str = Field(
default='16:9',
description='Video aspect ratio (e.g., 16:9, 9:16).',
)
duration_seconds: int = Field(
default=5,
description='Duration of the video in seconds.',
)
# ============================================================================
# Simulated Models (for demo without API keys)
# ============================================================================
_operations: dict[str, dict[str, Any]] = {}
class SimulatedTtsConfig(BaseModel):
"""Configuration for simulated TTS."""
voice_name: str = Field(default='Kore', description='Voice to use')
class SimulatedImageConfig(BaseModel):
"""Configuration for simulated image generation."""
aspect_ratio: str = Field(default='16:9', description='Image aspect ratio')
class SimulatedVeoConfig(BaseModel):
"""Configuration for simulated Veo."""
duration_seconds: int = Field(default=5, description='Video duration')
aspect_ratio: str = Field(default='16:9', description='Video aspect ratio')
class SimulatedLyriaConfig(BaseModel):
"""Configuration for simulated Lyria."""
sample_count: int = Field(default=1, description='Number of audio samples')
def _extract_prompt(request: GenerateRequest) -> str:
"""Extract text prompt from request."""
if request.messages:
for msg in request.messages:
for part in msg.content:
if hasattr(part.root, 'text') and part.root.text:
return str(part.root.text)
return ''
# --- Simulated TTS ---
async def simulated_tts_generate(
request: GenerateRequest,
ctx: ActionRunContext,
) -> GenerateResponse:
"""Simulate TTS audio generation."""
    await asyncio.sleep(1)  # Simulate processing
# Create a simple WAV header + silence (for demo)
# Real TTS would return actual audio
fake_audio = base64.b64encode(b'RIFF' + b'\x00' * 100).decode()
return GenerateResponse(
message=Message(
role=Role.MODEL,
content=[
Part.model_validate({
'media': {
'url': f'data:audio/wav;base64,{fake_audio}',
'contentType': 'audio/wav',
}
})
],
),
finish_reason=FinishReason.STOP,
)
# --- Simulated Image ---
async def simulated_image_generate(
request: GenerateRequest,
ctx: ActionRunContext,
) -> GenerateResponse:
"""Simulate image generation."""
    await asyncio.sleep(2)  # Simulate processing
# Create a tiny PNG (1x1 pixel)
fake_png = base64.b64encode(
b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde'
).decode()
return GenerateResponse(
message=Message(
role=Role.MODEL,
content=[
Part.model_validate({
'media': {
'url': f'data:image/png;base64,{fake_png}',
'contentType': 'image/png',
}
})
],
),
finish_reason=FinishReason.STOP,
)
# --- Simulated Lyria ---
async def simulated_lyria_generate(
request: GenerateRequest,
ctx: ActionRunContext,
) -> GenerateResponse:
"""Simulate audio generation."""
    await asyncio.sleep(3)  # Simulate processing
fake_audio = base64.b64encode(b'RIFF' + b'\x00' * 200).decode()
return GenerateResponse(
message=Message(
role=Role.MODEL,
content=[
Part.model_validate({
'media': {
'url': f'data:audio/wav;base64,{fake_audio}',
'contentType': 'audio/wav',
}
})
],
),
finish_reason=FinishReason.STOP,
)
# --- Simulated Veo (Background Model) ---
async def simulated_veo_start(
request: GenerateRequest,
ctx: ActionRunContext,
) -> Operation:
"""Start simulated video generation."""
prompt = _extract_prompt(request)
op_id = f'operations/veo-{uuid.uuid4().hex[:12]}'
_operations[op_id] = {
'prompt': prompt,
'start_time': time.time(),
'progress': 0,
}
return Operation(id=op_id, done=False, metadata={'progress': 0})
async def simulated_veo_check(operation: Operation) -> Operation:
"""Check simulated video generation status."""
op_data = _operations.get(operation.id)
if not op_data:
return Operation(id=operation.id, done=True, error=Error(message='Not found'))
elapsed = time.time() - op_data['start_time']
progress = min(100, int(elapsed * 10)) # 10 second generation
if progress >= 100:
video_url = f'https://storage.example.com/{operation.id.split("/")[-1]}.mp4'
return Operation(
id=operation.id,
done=True,
metadata={'progress': 100},
output={
'finishReason': 'stop',
'message': {
'role': 'model',
'content': [{'media': {'url': video_url, 'contentType': 'video/mp4'}}],
},
},
)
return Operation(
id=operation.id,
done=False,
metadata={'progress': progress, 'estimatedSeconds': max(0, 10 - elapsed)},
)
# Register simulated models if no API key
if not HAS_GEMINI_API_KEY:
ai.define_model(
name='simulated-tts',
fn=simulated_tts_generate, # type: ignore[arg-type]
config_schema=SimulatedTtsConfig,
info=ModelInfo(
label='Simulated TTS',
supports=Supports(multiturn=False, media=False, tools=False),
),
)
ai.define_model(
name='simulated-image',
fn=simulated_image_generate, # type: ignore[arg-type]
config_schema=SimulatedImageConfig,
info=ModelInfo(
label='Simulated Image',
supports=Supports(multiturn=False, media=True, tools=False),
),
)
ai.define_model(
name='simulated-lyria',
fn=simulated_lyria_generate, # type: ignore[arg-type]
config_schema=SimulatedLyriaConfig,
info=ModelInfo(
label='Simulated Lyria',
supports=Supports(multiturn=False, media=True, tools=False),
),
)
ai.define_background_model(
name='simulated-veo',
start=simulated_veo_start,
check=simulated_veo_check,
config_schema=SimulatedVeoConfig,
info=ModelInfo(
label='Simulated Veo',
supports=Supports(multiturn=False, media=True, tools=False, output=['media']),
),
)
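    # Whether real or simulated, a registered model is invoked the same way,
    # e.g. (sketch): `await ai.generate(model='simulated-tts', prompt='Hi!')`,
    # so every flow below runs with or without credentials.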
# ============================================================================
# Model Selection Helpers
# ============================================================================
def get_tts_model() -> str:
"""Get the TTS model name based on environment.
    TTS model names carry a '-preview-tts' suffix:
- gemini-2.5-flash-preview-tts (optimized for latency)
- gemini-2.5-pro-preview-tts (optimized for quality)
"""
if HAS_GEMINI_API_KEY:
return 'googleai/gemini-2.5-flash-preview-tts'
return 'simulated-tts'
def get_image_model() -> str:
"""Get the image model name based on environment.
Available image generation models:
- gemini-2.5-flash-image: Fast, efficient image generation (Nano Banana)
- gemini-3-pro-image-preview: Professional quality (Nano Banana Pro)
Note: Imagen models (imagen-4, etc.) are only available on Vertex AI.
"""
if HAS_GEMINI_API_KEY:
return 'googleai/gemini-2.5-flash-image'
return 'simulated-image'
def get_lyria_model() -> str:
"""Get the Lyria model name based on environment."""
if HAS_GCP_PROJECT:
return 'vertexai/lyria-002'
return 'simulated-lyria'
def get_veo_model() -> str:
"""Get the Veo model name based on environment."""
if HAS_GEMINI_API_KEY:
return 'googleai/veo-2.0-generate-001'
return 'simulated-veo'
# ============================================================================
# Demo Flows - TTS (Text-to-Speech)
# ============================================================================
@ai.flow(name='tts_speech_generator', description='Generate speech from text using TTS')
async def tts_speech_generator_flow(input: TtsInput) -> dict[str, Any]:
"""Generate speech audio from text.
Text-to-Speech (TTS) converts written text into natural-sounding speech.
Gemini TTS supports controllable generation via natural language prompts
for style, accent, pace, and tone.
Available Voices (30 prebuilt options)
======================================
┌─────────────────┬─────────────┬─────────────────┬─────────────┐
│ Voice │ Style │ Voice │ Style │
├─────────────────┼─────────────┼─────────────────┼─────────────┤
│ Zephyr │ Bright │ Puck │ Upbeat │
│ Charon │ Informative │ Kore │ Firm │
│ Fenrir │ Excitable │ Leda │ Youthful │
│ Orus │ Firm │ Aoede │ Breezy │
│ Callirrhoe │ Easy-going │ Autonoe │ Bright │
│ Enceladus │ Breathy │ Iapetus │ Clear │
│ Umbriel │ Easy-going │ Algieba │ Smooth │
│ Despina │ Smooth │ Erinome │ Clear │
│ Algenib │ Gravelly │ Rasalgethi │ Informative │
│ Laomedeia │ Upbeat │ Achernar │ Soft │
│ Alnilam │ Firm │ Schedar │ Even │
│ Gacrux │ Mature │ Pulcherrima │ Forward │
│ Achird │ Friendly │ Zubenelgenubi │ Casual │
│ Vindemiatrix │ Gentle │ Sadachbia │ Lively │
│ Sadaltager │ Knowledgeable│ Sulafat │ Warm │
└─────────────────┴─────────────┴─────────────────┴─────────────┘
Try voices at: https://aistudio.google.com/generate-speech
Args:
input: TtsInput containing the text to convert to speech.
Returns:
Dictionary with audio data (base64) or URL.
Example:
>>> result = await tts_speech_generator_flow(TtsInput(text='Good morning!'))
>>> print(result['audio_url'][:50])
data:audio/wav;base64,...
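        Style can also be steered with natural-language cues in the text
        itself (illustrative phrasing, not a fixed API):
        >>> await tts_speech_generator_flow(
        ...     TtsInput(text='Say cheerfully: Have a wonderful day!', voice='Puck')
        ... )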
"""
text = input.text
voice = input.voice
model = get_tts_model()
config: dict[str, Any] = {}
if HAS_GEMINI_API_KEY:
config = {'speech_config': {'voice_config': {'prebuilt_voice_config': {'voice_name': voice}}}}
try:
response = await ai.generate(model=model, prompt=text, config=config)
except Exception as e:
error_msg = str(e)
if 'RESOURCE_EXHAUSTED' in error_msg or 'quota' in error_msg.lower():
return {
'error': 'QUOTA_EXCEEDED',
'message': 'TTS requires GCP billing or quota reset.',
'details': 'Visit https://ai.google.dev/gemini-api/docs/rate-limits for info.',
'model': model,
}
if 'FAILED_PRECONDITION' in error_msg and 'billing' in error_msg.lower():
return {
'error': 'GCP_BILLING_REQUIRED',
'message': 'TTS requires GCP billing.',
'details': 'Visit https://console.cloud.google.com/billing to enable billing.',
'model': model,
}
raise
# Extract audio from response
audio_url = None
if response.message and response.message.content:
for part in response.message.content:
if hasattr(part.root, 'media') and part.root.media:
audio_url = getattr(part.root.media, 'url', None)
break
return {
'model': model,
'voice': voice,
'text': text,
'audio_url': audio_url,
# pyrefly: ignore[unbound-name] - HAS_GEMINI_API_KEY defined at module level
'using_real_model': HAS_GEMINI_API_KEY,
}
# ============================================================================
# Demo Flows - Gemini Image Generation
# ============================================================================
@ai.flow(name='gemini_image_generator', description='Generate images using Gemini')
async def gemini_image_generator_flow(input: ImageInput) -> dict[str, Any]:
"""Generate images using Imagen image generation.
Imagen models can generate high-quality images from text descriptions
with excellent prompt following and artifact-free output.
Args:
input: ImageInput containing the prompt description.
Returns:
Dictionary with image data or URL.
Example:
>>> result = await gemini_image_generator_flow(ImageInput(prompt='A cat astronaut'))
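        Aspect ratio is controlled through the input schema:
        >>> portrait = await gemini_image_generator_flow(
        ...     ImageInput(prompt='A cat astronaut', aspect_ratio='9:16')
        ... )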
"""
prompt = input.prompt
aspect_ratio = input.aspect_ratio
model = get_image_model()
config: dict[str, Any] = {}
if HAS_GEMINI_API_KEY:
config = {'image_config': {'aspect_ratio': aspect_ratio}}
try:
response = await ai.generate(model=model, prompt=prompt, config=config)
except Exception as e:
error_msg = str(e)
if 'RESOURCE_EXHAUSTED' in error_msg or 'quota' in error_msg.lower():
return {
'error': 'QUOTA_EXCEEDED',
'message': 'Image generation requires GCP billing or quota reset.',
                'details': 'Image generation on the Gemini API may require billing or a quota reset.',
'hint': 'Visit https://console.cloud.google.com/billing to enable billing.',
'model': model,
}
if 'FAILED_PRECONDITION' in error_msg and 'billing' in error_msg.lower():
return {
'error': 'GCP_BILLING_REQUIRED',
'message': 'Image generation requires GCP billing.',
'details': 'Visit https://console.cloud.google.com/billing to enable billing.',
'model': model,
}
raise
# Extract image from response
image_url = None
if response.message and response.message.content:
for part in response.message.content:
if hasattr(part.root, 'media') and part.root.media:
image_url = getattr(part.root.media, 'url', None)
break
return {
'model': model,
'prompt': prompt,
'aspect_ratio': aspect_ratio,
'image_url': image_url,
# pyrefly: ignore[unbound-name] - HAS_GEMINI_API_KEY defined at module level
'using_real_model': HAS_GEMINI_API_KEY,
}
# ============================================================================
# Demo Flows - Lyria Audio Generation
# ============================================================================
@ai.flow(name='lyria_audio_generator', description='Generate music/audio using Lyria')
async def lyria_audio_generator_flow(input: AudioInput) -> dict[str, Any]:
"""Generate audio/music using Lyria.
Lyria is Google's audio generation model available through Vertex AI.
It can generate music, ambient sounds, and other audio from text descriptions.
Note: Requires GOOGLE_CLOUD_PROJECT environment variable for real Lyria.
Args:
input: AudioInput containing the prompt description.
Returns:
Dictionary with audio data (base64 WAV).
Example:
>>> result = await lyria_audio_generator_flow(AudioInput(prompt='Dance music'))
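        An optional negative prompt steers output away from unwanted elements:
        >>> calm = await lyria_audio_generator_flow(
        ...     AudioInput(prompt='Calm solo piano', negative_prompt='drums, vocals')
        ... )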
"""
prompt = input.prompt
negative_prompt = input.negative_prompt
model = get_lyria_model()
    config: dict[str, Any] = {}
    if negative_prompt:
        config['negative_prompt'] = negative_prompt
try:
response = await ai.generate(model=model, prompt=prompt, config=config)
except Exception as e:
error_msg = str(e)
if 'RESOURCE_EXHAUSTED' in error_msg or 'quota' in error_msg.lower():
return {
'error': 'QUOTA_EXCEEDED',
'message': 'Lyria requires GCP billing or quota reset.',
'model': model,
}
if 'FAILED_PRECONDITION' in error_msg and 'billing' in error_msg.lower():
return {
'error': 'GCP_BILLING_REQUIRED',
'message': 'Lyria requires Vertex AI with GCP billing.',
'details': 'Visit https://console.cloud.google.com/billing to enable billing.',
'model': model,
}
raise
# Extract audio from response
audio_url = None
if response.message and response.message.content:
for part in response.message.content:
if hasattr(part.root, 'media') and part.root.media:
audio_url = getattr(part.root.media, 'url', None)
break
return {
'model': model,
'prompt': prompt,
'negative_prompt': negative_prompt,
'audio_url': audio_url,
'using_real_model': HAS_GCP_PROJECT,
}
# ============================================================================
# Demo Flows - Veo Video Generation (Background Model)
# ============================================================================
@ai.flow(name='veo_video_generator', description='Generate video using Veo (background model)')
async def veo_video_generator_flow(input: VideoInput) -> dict[str, Any]:
"""Generate video using Veo.
Veo uses the **background model** pattern because video generation
is a long-running operation (30 seconds to several minutes).
The flow:
1. Start the operation (returns immediately with job ID)
2. Poll for completion every few seconds
3. Return the video URL when done
Args:
input: VideoInput containing the prompt description.
Returns:
Dictionary with video URL and operation details.
Example:
>>> result = await veo_video_generator_flow(VideoInput(prompt='A sunset over the ocean'))
>>> print(result['video_url'])
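        Duration and aspect ratio come straight from the input schema:
        >>> vertical = await veo_video_generator_flow(
        ...     VideoInput(prompt='Waves at dawn', aspect_ratio='9:16', duration_seconds=8)
        ... )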
"""
prompt = input.prompt
aspect_ratio = input.aspect_ratio
duration_seconds = input.duration_seconds
model = get_veo_model()
# Get the background model using its action key
action_key = f'/background-model/{model}'
video_model = await lookup_background_action(ai.registry, action_key)
if video_model is None:
return {'error': f'Model {model} not found'}
# Build config
config: dict[str, Any] = {
'aspect_ratio': aspect_ratio,
'duration_seconds': duration_seconds,
}
# Start the operation
operation = await video_model.start(
GenerateRequest(
messages=[Message(role=Role.USER, content=[Part(root=TextPart(text=prompt))])],
config=config,
)
)
# Poll until complete (with timeout)
max_wait = 300 # 5 minutes
start_time = time.time()
while not operation.done:
if time.time() - start_time > max_wait:
return {
'operation_id': operation.id,
'status': 'timeout',
'message': 'Operation timed out after 5 minutes',
}
await asyncio.sleep(3)
operation = await video_model.check(operation)
# Extract video URL
video_url = None
if operation.output:
output = operation.output
if isinstance(output, dict):
message = output.get('message', {})
content = message.get('content', [])
if content:
media = content[0].get('media', {})
video_url = media.get('url')
if operation.error:
return {
'operation_id': operation.id,
'status': 'error',
'error': operation.error,
}
return {
'operation_id': operation.id,
'status': 'completed',
'video_url': video_url,
'prompt': prompt,
'aspect_ratio': aspect_ratio,
'duration_seconds': duration_seconds,
'model': model,
'using_real_model': HAS_GEMINI_API_KEY,
}
# ============================================================================
# Demo Flow - All Models Overview
# ============================================================================
@ai.flow(name='media_models_overview', description='Overview of all available media models')
async def media_models_overview_flow() -> dict[str, Any]:
"""Get an overview of all available media generation models.
Returns information about which models are available based on
the current environment configuration.
"""
return {
'tts': {
'model': get_tts_model(),
'real_available': HAS_GEMINI_API_KEY,
'description': 'Text-to-Speech - converts text to audio',
'voices': ['Zephyr', 'Puck', 'Charon', 'Kore', 'Fenrir', 'Leda', 'Orus'],
},
'image': {
'model': get_image_model(),
'real_available': HAS_GEMINI_API_KEY,
'description': 'Gemini Image - generates images from text',
'aspect_ratios': ['1:1', '16:9', '9:16', '4:3', '3:4'],
},
'lyria': {
'model': get_lyria_model(),
'real_available': HAS_GCP_PROJECT,
'description': 'Lyria - generates music/audio (Vertex AI)',
'note': 'Requires GOOGLE_CLOUD_PROJECT for real model',
},
'veo': {
'model': get_veo_model(),
'real_available': HAS_GEMINI_API_KEY,
'description': 'Veo - generates videos (background model)',
'versions': ['veo-2.0', 'veo-3.0', 'veo-3.0-fast', 'veo-3.1'],
},
'environment': {
'GEMINI_API_KEY': HAS_GEMINI_API_KEY,
'GOOGLE_CLOUD_PROJECT': HAS_GCP_PROJECT,
},
}
async def main() -> None:
"""Keep the server alive for the Dev UI."""
await asyncio.Event().wait()
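# Typically launched through the Genkit Dev UI; the command shape below is the
# standard Genkit CLI pattern (see README.md for the exact steps for this demo):
#   genkit start -- python <this_file>.py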
if __name__ == '__main__':
ai.run_main(main())