generate_image
Create custom images from text prompts with options for aspect ratio, resolution, character consistency, and factual accuracy using Google Search grounding.
Instructions
Generate image with specified prompt and optional parameters
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| prompt | Yes | The prompt for image generation (English recommended for optimal structured prompt enhancement) | |
| fileName | No | File name for the generated image (if not specified, an auto-named file is created in IMAGE_OUTPUT_DIR) | Auto-generated |
| inputImagePath | No | Absolute path to a source image for image-to-image generation. Use when generating variations, style transfers, or similar images based on an existing image | |
| blendImages | No | Enable multi-image blending for combining multiple visual elements naturally. Use when the prompt mentions multiple subjects or composite scenes | false |
| maintainCharacterConsistency | No | Maintain character appearance consistency. Enable when generating the same character in different poses or scenes | false |
| useWorldKnowledge | No | Use real-world knowledge for accurate context. Enable for historical figures, landmarks, or factual scenarios | false |
| useGoogleSearch | No | Enable Google Search grounding to access real-time web information for factually accurate image generation. Use when the prompt requires current or time-sensitive data that may have changed since the model's knowledge cutoff. Leave disabled for creative, fictional, historical, or timeless content | false |
| aspectRatio | No | Aspect ratio for the generated image. Supported values: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9 | 1:1 |
| imageSize | No | Image resolution for high-quality output. Specify "2K" or "4K" for higher-resolution images with better text rendering and fine details. Leave unspecified for standard quality | Standard |
| purpose | No | Intended use for the image (e.g., cookbook cover, social media post, presentation slide). Helps tailor visual style, quality level, and details to match the purpose | |
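
For illustration, the arguments of a `tools/call` request for this tool could look like the sketch below. Only `prompt` is required; the values shown are hypothetical and simply instantiate the optional parameters documented in the table above.

```typescript
// Hypothetical example arguments for a generate_image tools/call request.
// Only `prompt` is required; the remaining fields are optional parameters from the schema above.
const generateImageArgs = {
  prompt: 'A watercolor illustration of a lighthouse at dawn',
  fileName: 'lighthouse-dawn.png', // saved under IMAGE_OUTPUT_DIR
  aspectRatio: '16:9',             // one of the supported ratios
  imageSize: '2K',                 // omit for standard resolution
  purpose: 'presentation slide',   // tailors style and level of detail
  useGoogleSearch: false,          // enable only for current, time-sensitive factual prompts
}
```

An MCP client would pass an object of this shape as the `arguments` field when calling the `generate_image` tool.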
Implementation Reference
- src/server/mcpServer.ts:206-314 (handler): Main execution handler for the generate_image tool. Validates params, enhances the prompt, calls GeminiClient for image generation, saves the file, and builds the MCP response.

```typescript
private async handleGenerateImage(params: GenerateImageParams) {
  const result = await ErrorHandler.wrapWithResultType(async () => {
    // Validate input
    const validationResult = validateGenerateImageParams(params)
    if (!validationResult.success) {
      throw validationResult.error
    }

    // Get configuration
    const configResult = getConfig()
    if (!configResult.success) {
      throw configResult.error
    }

    // Initialize clients
    await this.initializeClients()

    // Handle input image if provided
    let inputImageData: string | undefined
    if (params.inputImagePath) {
      const imageBuffer = await fs.readFile(params.inputImagePath)
      inputImageData = imageBuffer.toString('base64')
    }

    // Generate structured prompt using Gemini 2.0 Flash (unless skipped)
    let structuredPrompt = params.prompt
    if (!configResult.data.skipPromptEnhancement && this.structuredPromptGenerator) {
      const features: FeatureFlags = {}
      if (params.maintainCharacterConsistency !== undefined) {
        features.maintainCharacterConsistency = params.maintainCharacterConsistency
      }
      if (params.blendImages !== undefined) {
        features.blendImages = params.blendImages
      }
      if (params.useWorldKnowledge !== undefined) {
        features.useWorldKnowledge = params.useWorldKnowledge
      }
      if (params.useGoogleSearch !== undefined) {
        features.useGoogleSearch = params.useGoogleSearch
      }

      const promptResult = await this.structuredPromptGenerator.generateStructuredPrompt(
        params.prompt,
        features,
        inputImageData, // Pass image data for context-aware prompt generation
        params.purpose // Pass intended use for purpose-aware prompt generation
      )

      if (promptResult.success) {
        structuredPrompt = promptResult.data.structuredPrompt
        this.logger.info('mcp-server', 'Structured prompt generated', {
          originalLength: params.prompt.length,
          structuredLength: structuredPrompt.length,
          selectedPractices: promptResult.data.selectedPractices,
        })
      } else {
        this.logger.warn('mcp-server', 'Using original prompt', {
          error: promptResult.error.message,
        })
      }
    } else if (configResult.data.skipPromptEnhancement) {
      this.logger.info('mcp-server', 'Prompt enhancement skipped (SKIP_PROMPT_ENHANCEMENT=true)')
    }

    // Generate image using Gemini 2.5 Flash Image Preview
    if (!this.geminiClient) {
      throw new Error('Gemini client not initialized')
    }

    const generationResult = await this.geminiClient.generateImage({
      prompt: structuredPrompt,
      ...(inputImageData && { inputImage: inputImageData }),
      ...(params.aspectRatio && { aspectRatio: params.aspectRatio }),
      ...(params.imageSize && { imageSize: params.imageSize }),
      ...(params.useGoogleSearch !== undefined && { useGoogleSearch: params.useGoogleSearch }),
    })

    if (!generationResult.success) {
      throw generationResult.error
    }

    // Save image file
    const fileName = params.fileName || this.fileManager.generateFileName()
    const outputPath = path.join(configResult.data.imageOutputDir, fileName)
    const sanitizedPath = this.securityManager.sanitizeFilePath(outputPath)
    if (!sanitizedPath.success) {
      throw sanitizedPath.error
    }

    const saveResult = await this.fileManager.saveImage(
      generationResult.data.imageData,
      sanitizedPath.data
    )
    if (!saveResult.success) {
      throw saveResult.error
    }

    // Build response
    return this.responseBuilder.buildSuccessResponse(generationResult.data, saveResult.data)
  }, 'image-generation')

  if (result.ok) {
    return result.value
  }
  return this.responseBuilder.buildErrorResponse(result.error)
}
```
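
The handler reads two configuration values referenced above, `imageOutputDir` and `skipPromptEnhancement`. Judging by the parameter docs and the log message, these presumably map to the IMAGE_OUTPUT_DIR and SKIP_PROMPT_ENHANCEMENT environment variables; a hedged sketch of that mapping (getConfig()'s real implementation is not shown in this reference):

```typescript
// Assumed mapping only; not the project's actual getConfig() implementation.
const configSketch = {
  // Directory where generated files are written (fileName is joined onto this path)
  imageOutputDir: process.env.IMAGE_OUTPUT_DIR,
  // When true, the structured-prompt enhancement step is skipped
  skipPromptEnhancement: process.env.SKIP_PROMPT_ENHANCEMENT === 'true',
}
```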
- src/server/mcpServer.ts:86-146 (registration): Tool registration in getToolsList(), defining the name 'generate_image', the description, and a detailed inputSchema matching GenerateImageParams.

```typescript
{
  name: 'generate_image',
  description: 'Generate image with specified prompt and optional parameters',
  inputSchema: {
    type: 'object' as const,
    properties: {
      prompt: {
        type: 'string' as const,
        description:
          'The prompt for image generation (English recommended for optimal structured prompt enhancement)',
      },
      fileName: {
        type: 'string' as const,
        description:
          'Optional file name for the generated image (if not specified, generates an auto-named file in IMAGE_OUTPUT_DIR)',
      },
      inputImagePath: {
        type: 'string' as const,
        description:
          'Optional absolute path to source image for image-to-image generation. Use when generating variations, style transfers, or similar images based on an existing image (must be an absolute path)',
      },
      blendImages: {
        type: 'boolean' as const,
        description:
          'Enable multi-image blending for combining multiple visual elements naturally. Use when prompt mentions multiple subjects or composite scenes',
      },
      maintainCharacterConsistency: {
        type: 'boolean' as const,
        description:
          'Maintain character appearance consistency. Enable when generating same character in different poses/scenes',
      },
      useWorldKnowledge: {
        type: 'boolean' as const,
        description:
          'Use real-world knowledge for accurate context. Enable for historical figures, landmarks, or factual scenarios',
      },
      useGoogleSearch: {
        type: 'boolean' as const,
        description:
          "Enable Google Search grounding to access real-time web information for factually accurate image generation. Use when prompt requires current or time-sensitive data that may have changed since the model's knowledge cutoff. Leave disabled for creative, fictional, historical, or timeless content.",
      },
      aspectRatio: {
        type: 'string' as const,
        description: 'Aspect ratio for the generated image',
        enum: ['1:1', '2:3', '3:2', '3:4', '4:3', '4:5', '5:4', '9:16', '16:9', '21:9'],
      },
      imageSize: {
        type: 'string' as const,
        description:
          'Image resolution for high-quality output. Specify "2K" or "4K" when you need higher resolution images with better text rendering and fine details. Leave unspecified for standard quality.',
        enum: ['2K', '4K'],
      },
      purpose: {
        type: 'string' as const,
        description:
          'Intended use for the image (e.g., cookbook cover, social media post, presentation slide). Helps tailor visual style, quality level, and details to match the purpose.',
      },
    },
    required: ['prompt'],
  },
},
```
- src/types/mcp.ts:33-58 (schema): TypeScript interface defining the structure and optional parameters for generate_image tool inputs.

```typescript
export interface GenerateImageParams {
  /** Prompt for image generation */
  prompt: string
  /** Optional file name for the generated image (if not specified, generates an auto-named file in IMAGE_OUTPUT_DIR) */
  fileName?: string
  /** Absolute path to input image for editing (optional) */
  inputImagePath?: string
  /** Base64 encoded input image data (optional) */
  inputImage?: string
  /** MIME type of the input image (optional, used with inputImage) */
  inputImageMimeType?: string
  /** Multi-image blending functionality (default: false) */
  blendImages?: boolean
  /** Maintain character consistency across generations (default: false) */
  maintainCharacterConsistency?: boolean
  /** Use world knowledge integration for more accurate context (default: false) */
  useWorldKnowledge?: boolean
  /** Enable Google Search grounding for real-time web information (default: false) */
  useGoogleSearch?: boolean
  /** Aspect ratio for generated image (default: "1:1") */
  aspectRatio?: AspectRatio
  /** Image resolution for high-quality output (e.g., "2K", "4K"). Leave unspecified for standard quality */
  imageSize?: ImageSize
  /** Intended use for the image (e.g., cookbook cover, social media post). Helps tailor visual style and quality */
  purpose?: string
}
```
- Input validation helper specifically for GenerateImageParams, called at the start of the handler.

```typescript
export function validateGenerateImageParams(
  params: GenerateImageParams
): Result<GenerateImageParams, InputValidationError> {
  // Validate prompt
  const promptResult = validatePrompt(params.prompt)
  if (!promptResult.success) {
    return Err(promptResult.error)
  }

  // Validate input image path if provided
  const imagePathResult = validateImagePath(params.inputImagePath)
  if (!imagePathResult.success) {
    return Err(imagePathResult.error)
  }

  // Validate blendImages parameter
  if (params.blendImages !== undefined && typeof params.blendImages !== 'boolean') {
    return Err(
      new InputValidationError(
        'blendImages must be a boolean value',
        'Use true or false for blendImages parameter to enable/disable multi-image blending'
      )
    )
  }

  // Validate maintainCharacterConsistency parameter
  if (
    params.maintainCharacterConsistency !== undefined &&
    typeof params.maintainCharacterConsistency !== 'boolean'
  ) {
    return Err(
      new InputValidationError(
        'maintainCharacterConsistency must be a boolean value',
        'Use true or false for maintainCharacterConsistency parameter to enable/disable character consistency'
      )
    )
  }

  // Validate useWorldKnowledge parameter
  if (params.useWorldKnowledge !== undefined && typeof params.useWorldKnowledge !== 'boolean') {
    return Err(
      new InputValidationError(
        'useWorldKnowledge must be a boolean value',
        'Use true or false for useWorldKnowledge parameter to enable/disable world knowledge integration'
      )
    )
  }

  // Validate input image data if provided
  if (params.inputImage || params.inputImageMimeType) {
    const imageResult = validateBase64Image(params.inputImage, params.inputImageMimeType)
    if (!imageResult.success) {
      return Err(imageResult.error)
    }
  }

  // Validate aspectRatio parameter
  if (params.aspectRatio && !SUPPORTED_ASPECT_RATIOS.includes(params.aspectRatio)) {
    return Err(
      new InputValidationError(
        `Invalid aspect ratio: ${params.aspectRatio}. Supported values: ${SUPPORTED_ASPECT_RATIOS.join(', ')}`,
        `Please use one of the supported aspect ratios: ${SUPPORTED_ASPECT_RATIOS.join(', ')}`
      )
    )
  }

  return Ok(params)
}
```
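
The validators and the Gemini client return a `Result` type consumed via a `success` flag, with `Ok`/`Err` constructors. Its definition is not reproduced in this reference; a minimal sketch of the assumed shape, inferred from how it is used above, is:

```typescript
// Minimal sketch of the assumed Result type (not the project's actual definition):
// a discriminated union checked via `.success`, carrying `.data` on success and `.error` on failure.
type Result<T, E> =
  | { success: true; data: T }
  | { success: false; error: E }

const Ok = <T>(data: T): { success: true; data: T } => ({ success: true, data })
const Err = <E>(error: E): { success: false; error: E } => ({ success: false, error })
```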
- src/api/geminiClient.ts:175-413 (helper): Core API integration helper that calls the Google Gemini API to generate the image, handles multimodal inputs, and parses the base64 response.

```typescript
async generateImage(
  params: GeminiApiParams
): Promise<Result<GeneratedImageResult, GeminiAPIError | NetworkError>> {
  try {
    // Prepare the request content with proper structure for multimodal input
    const requestContent: unknown[] = []

    // Structure the contents properly for image generation/editing
    if (params.inputImage) {
      // For image editing: provide image first, then text instructions
      requestContent.push({
        parts: [
          {
            inlineData: {
              data: params.inputImage,
              mimeType: 'image/jpeg', // TODO: Dynamic MIME type support
            },
          },
          {
            text: params.prompt,
          },
        ],
      })
    } else {
      // For text-to-image: provide only text prompt
      requestContent.push({
        parts: [
          {
            text: params.prompt,
          },
        ],
      })
    }

    // Construct config object for generateContent
    const imageConfig: Record<string, string> = {}
    if (params.aspectRatio) {
      imageConfig['aspectRatio'] = params.aspectRatio
    }
    if (params.imageSize) {
      imageConfig['imageSize'] = params.imageSize
    }

    const config =
      Object.keys(imageConfig).length > 0
        ? {
            imageConfig,
            responseModalities: ['IMAGE'],
          }
        : {
            responseModalities: ['IMAGE'],
          }

    // Construct tools array for Google Search grounding
    const tools = params.useGoogleSearch ? [{ googleSearch: {} }] : undefined

    // Generate content using Gemini API (@google/genai v1.17.0+)
    const rawResponse = await this.genai.models.generateContent({
      model: this.modelName,
      contents: requestContent,
      config,
      ...(tools && { tools }),
    })

    // Validate response structure with type guard
    if (!isGeminiResponse(rawResponse)) {
      const responseStructure = analyzeResponseStructure(rawResponse)

      // Check if it's an error response from Gemini
      const asRecord = rawResponse as Record<string, unknown>
      if (asRecord['error']) {
        const error = asRecord['error'] as Record<string, unknown>
        return Err(
          new GeminiAPIError(`Gemini API Error: ${error['message'] || 'Unknown error'}`, {
            code: error['code'],
            status: error['status'],
            details: error['details'] || responseStructure,
            stage: 'api_error',
          })
        )
      }

      return Err(
        new GeminiAPIError('Invalid response structure from Gemini API', {
          message: 'The API returned an unexpected response format',
          responseStructure: responseStructure,
          stage: 'response_validation',
          suggestion: 'Check if the API endpoint or model configuration is correct',
        })
      )
    }

    // Extract the actual response data (handle wrapped responses)
    const responseData = (rawResponse as Record<string, unknown>)['response']
      ? ((rawResponse as Record<string, unknown>)['response'] as GeminiResponse)
      : (rawResponse as GeminiResponse)

    // Check for prompt feedback (safety blocking)
    const responseAsRecord = responseData as Record<string, unknown>
    if (responseAsRecord['promptFeedback']) {
      const promptFeedback = responseAsRecord['promptFeedback'] as Record<string, unknown>
      if (promptFeedback['blockReason'] === 'SAFETY') {
        return Err(
          new GeminiAPIError('Image generation blocked for safety reasons', {
            stage: 'prompt_analysis',
            blockReason: promptFeedback['blockReason'],
            suggestion: 'Rephrase your prompt to avoid potentially sensitive content',
          })
        )
      }
      if (
        promptFeedback['blockReason'] === 'OTHER' ||
        promptFeedback['blockReason'] === 'PROHIBITED_CONTENT'
      ) {
        return Err(
          new GeminiAPIError('Image generation blocked due to prohibited content', {
            stage: 'prompt_analysis',
            blockReason: promptFeedback['blockReason'],
            suggestion: 'Remove any prohibited content from your prompt and try again',
          })
        )
      }
    }

    // Check for candidates
    if (!responseData.candidates || responseData.candidates.length === 0) {
      return Err(
        new GeminiAPIError('No image generated: Content may have been filtered', {
          stage: 'generation',
          candidatesCount: 0,
          suggestion: 'Try rephrasing your prompt to avoid potentially sensitive content',
        })
      )
    }

    const candidate = responseData.candidates[0]
    if (!candidate || !candidate.content || !candidate.content.parts) {
      return Err(
        new GeminiAPIError('No valid content in response', {
          stage: 'candidate_extraction',
          suggestion: 'The API response was incomplete. Please try again',
        })
      )
    }

    const parts = candidate.content.parts

    // Handle finish reason specific errors before checking parts
    if (candidate.finishReason) {
      const finishReason = candidate.finishReason

      if (finishReason === 'IMAGE_SAFETY') {
        return Err(
          new GeminiAPIError('Image generation stopped for safety reasons', {
            finishReason,
            stage: 'generation_stopped',
            suggestion: 'Modify your prompt to avoid potentially sensitive content',
            safetyRatings: (candidate as Record<string, unknown>)['safetyRatings']
              ? (
                  (candidate as Record<string, unknown>)['safetyRatings'] as Record<
                    string,
                    unknown
                  >[]
                )
                  ?.map((rating: Record<string, unknown>) => {
                    const category = (rating['category'] as string)
                      .replace('HARM_CATEGORY_', '')
                      .split('_')
                      .map((word: string) => word.charAt(0) + word.slice(1).toLowerCase())
                      .join(' ')
                    return `${category} (${rating['blocked'] ? 'BLOCKED' : 'ALLOWED'})`
                  })
                  .join(', ')
              : undefined,
          })
        )
      }

      if (finishReason === 'MAX_TOKENS') {
        return Err(
          new GeminiAPIError('Maximum token limit reached during generation', {
            finishReason,
            stage: 'generation_stopped',
            suggestion: 'Try using a shorter or simpler prompt',
          })
        )
      }
    }

    if (parts.length === 0) {
      return Err(
        new GeminiAPIError('No content parts in response', {
          stage: 'content_extraction',
          suggestion: 'The generation was incomplete. Please try again',
        })
      )
    }

    // Check if we got an image or text (error message)
    const imagePart = parts.find((part) => part.inlineData?.data)
    const textPart = parts.find((part) => part.text)

    if (!imagePart?.inlineData) {
      // If there's text, it's likely an error message from Gemini
      const errorMessage = textPart?.text || 'Image generation failed'
      return Err(
        new GeminiAPIError('Image generation failed due to content filtering', {
          reason: errorMessage,
          stage: 'image_extraction',
          suggestion:
            'The prompt was blocked by safety filters. Try rephrasing your prompt to avoid potentially sensitive content.',
        })
      )
    }

    // Convert base64 image data to Buffer
    const imageBuffer = Buffer.from(imagePart.inlineData.data, 'base64')
    const mimeType = imagePart.inlineData.mimeType || 'image/png'

    // Create metadata
    const metadata: GeminiGenerationMetadata = {
      model: this.modelName,
      prompt: params.prompt,
      mimeType,
      timestamp: new Date(),
      inputImageProvided: !!params.inputImage,
      ...(responseData.modelVersion && { modelVersion: responseData.modelVersion }),
      ...(responseData.responseId && { responseId: responseData.responseId }),
    }

    return Ok({
      imageData: imageBuffer,
      metadata,
    })
  } catch (error) {
    return this.handleError(error, params.prompt)
  }
}
```
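
The `GeneratedImageResult` and `GeminiGenerationMetadata` type definitions are not reproduced in this reference. Inferred from the return statement and metadata construction above, the success payload carries roughly the following shape; this is a sketch, not the project's actual type declarations.

```typescript
// Inferred sketch of the success payload returned by GeminiClient.generateImage.
// The real GeneratedImageResult / GeminiGenerationMetadata types live elsewhere in the repo.
interface GeneratedImageResultSketch {
  imageData: Buffer // decoded image bytes, later written to disk by FileManager.saveImage
  metadata: {
    model: string
    prompt: string
    mimeType: string // e.g. 'image/png'
    timestamp: Date
    inputImageProvided: boolean
    modelVersion?: string
    responseId?: string
  }
}
```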