generate_image
Create images from text prompts using Google's Gemini API, with options for image editing, character consistency, multi-image blending, and factual accuracy.
Instructions
Generate image with specified prompt and optional parameters
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| blendImages | No | Enable multi-image blending for combining multiple visual elements naturally. Use when prompt mentions multiple subjects or composite scenes | |
| fileName | No | Optional file name for the generated image (if not specified, generates an auto-named file in IMAGE_OUTPUT_DIR) | |
| inputImagePath | No | Optional absolute path to source image for image-to-image generation. Use when generating variations, style transfers, or similar images based on an existing image (must be an absolute path) | |
| maintainCharacterConsistency | No | Maintain character appearance consistency. Enable when generating same character in different poses/scenes | |
| prompt | Yes | The prompt for image generation (English recommended for optimal structured prompt enhancement) | |
| useWorldKnowledge | No | Use real-world knowledge for accurate context. Enable for historical figures, landmarks, or factual scenarios | |
Input Schema (JSON Schema)
{
"properties": {
"blendImages": {
"description": "Enable multi-image blending for combining multiple visual elements naturally. Use when prompt mentions multiple subjects or composite scenes",
"type": "boolean"
},
"fileName": {
"description": "Optional file name for the generated image (if not specified, generates an auto-named file in IMAGE_OUTPUT_DIR)",
"type": "string"
},
"inputImagePath": {
"description": "Optional absolute path to source image for image-to-image generation. Use when generating variations, style transfers, or similar images based on an existing image (must be an absolute path)",
"type": "string"
},
"maintainCharacterConsistency": {
"description": "Maintain character appearance consistency. Enable when generating same character in different poses/scenes",
"type": "boolean"
},
"prompt": {
"description": "The prompt for image generation (English recommended for optimal structured prompt enhancement)",
"type": "string"
},
"useWorldKnowledge": {
"description": "Use real-world knowledge for accurate context. Enable for historical figures, landmarks, or factual scenarios",
"type": "boolean"
}
},
"required": [
"prompt"
],
"type": "object"
}
Implementation Reference
/**
 * Primary handler for the 'generate_image' tool
 * (src/server/mcpServer.ts:206-314, handler).
 *
 * Orchestrates validation, prompt enhancement, the Gemini API call, file
 * saving, and response building. Every step runs inside
 * `ErrorHandler.wrapWithResultType`; a step that reports failure throws its
 * error object from the wrapped callback.
 *
 * @param params - Arguments of the `generate_image` tool call.
 * @returns The value of `responseBuilder.buildSuccessResponse(...)` when the
 *   wrapped result is `ok`, otherwise the value of
 *   `responseBuilder.buildErrorResponse(result.error)`.
 */
private async handleGenerateImage(params: GenerateImageParams) {
  const result = await ErrorHandler.wrapWithResultType(async () => {
    // Validate input
    const validationResult = validateGenerateImageParams(params)
    if (!validationResult.success) {
      throw validationResult.error
    }

    // Get configuration
    const configResult = getConfig()
    if (!configResult.success) {
      throw configResult.error
    }

    // Initialize clients
    await this.initializeClients()

    // Handle input image if provided: the file is read and base64-encoded.
    // NOTE(review): only `inputImagePath` is consumed here; `params.inputImage`
    // (base64) is never forwarded by this handler — confirm that is intended.
    let inputImageData: string | undefined
    if (params.inputImagePath) {
      const imageBuffer = await fs.readFile(params.inputImagePath)
      inputImageData = imageBuffer.toString('base64')
    }

    // Generate structured prompt using Gemini 2.0 Flash (unless skipped)
    let structuredPrompt = params.prompt
    if (!configResult.data.skipPromptEnhancement && this.structuredPromptGenerator) {
      // Copy only the flags the caller actually set, so unset flags stay absent.
      const features: FeatureFlags = {}
      if (params.maintainCharacterConsistency !== undefined) {
        features.maintainCharacterConsistency = params.maintainCharacterConsistency
      }
      if (params.blendImages !== undefined) {
        features.blendImages = params.blendImages
      }
      if (params.useWorldKnowledge !== undefined) {
        features.useWorldKnowledge = params.useWorldKnowledge
      }
      if (params.useGoogleSearch !== undefined) {
        features.useGoogleSearch = params.useGoogleSearch
      }

      const promptResult = await this.structuredPromptGenerator.generateStructuredPrompt(
        params.prompt,
        features,
        inputImageData, // Pass image data for context-aware prompt generation
        params.purpose // Pass intended use for purpose-aware prompt generation
      )

      if (promptResult.success) {
        structuredPrompt = promptResult.data.structuredPrompt
        this.logger.info('mcp-server', 'Structured prompt generated', {
          originalLength: params.prompt.length,
          structuredLength: structuredPrompt.length,
          selectedPractices: promptResult.data.selectedPractices,
        })
      } else {
        // Enhancement failure is non-fatal: fall back to the caller's prompt.
        this.logger.warn('mcp-server', 'Using original prompt', {
          error: promptResult.error.message,
        })
      }
    } else if (configResult.data.skipPromptEnhancement) {
      this.logger.info('mcp-server', 'Prompt enhancement skipped (SKIP_PROMPT_ENHANCEMENT=true)')
    }

    // Generate image using Gemini 2.5 Flash Image Preview
    if (!this.geminiClient) {
      throw new Error('Gemini client not initialized')
    }

    const generationResult = await this.geminiClient.generateImage({
      prompt: structuredPrompt,
      ...(inputImageData && { inputImage: inputImageData }),
      ...(params.aspectRatio && { aspectRatio: params.aspectRatio }),
      ...(params.imageSize && { imageSize: params.imageSize }),
      ...(params.useGoogleSearch !== undefined && { useGoogleSearch: params.useGoogleSearch }),
    })

    if (!generationResult.success) {
      throw generationResult.error
    }

    // Save image file: the joined output path goes through the security
    // manager before anything is written.
    const fileName = params.fileName || this.fileManager.generateFileName()
    const outputPath = path.join(configResult.data.imageOutputDir, fileName)
    const sanitizedPath = this.securityManager.sanitizeFilePath(outputPath)
    if (!sanitizedPath.success) {
      throw sanitizedPath.error
    }

    const saveResult = await this.fileManager.saveImage(
      generationResult.data.imageData,
      sanitizedPath.data
    )
    if (!saveResult.success) {
      throw saveResult.error
    }

    // Build response
    return this.responseBuilder.buildSuccessResponse(generationResult.data, saveResult.data)
  }, 'image-generation')

  if (result.ok) {
    return result.value
  }

  return this.responseBuilder.buildErrorResponse(result.error)
}
/**
 * Core low-level handler that makes the actual API call to Gemini for image
 * generation (src/api/geminiClient.ts:175-413, handler). Handles multimodal
 * inputs and extracts the base64 image data from the response.
 *
 * @param params - Prompt plus optional `inputImage` (base64), `aspectRatio`,
 *   `imageSize`, and `useGoogleSearch`.
 * @returns `Ok({ imageData, metadata })` when the response contains an inline
 *   image part; otherwise `Err(GeminiAPIError)` describing the stage that
 *   failed. Anything thrown inside the body is passed to `this.handleError`.
 */
async generateImage(
  params: GeminiApiParams
): Promise<Result<GeneratedImageResult, GeminiAPIError | NetworkError>> {
  try {
    // Prepare the request content with proper structure for multimodal input
    const requestContent: unknown[] = []

    // Structure the contents properly for image generation/editing
    if (params.inputImage) {
      // For image editing: provide image first, then text instructions
      requestContent.push({
        parts: [
          {
            inlineData: {
              data: params.inputImage,
              mimeType: 'image/jpeg', // TODO: Dynamic MIME type support
            },
          },
          {
            text: params.prompt,
          },
        ],
      })
    } else {
      // For text-to-image: provide only text prompt
      requestContent.push({
        parts: [
          {
            text: params.prompt,
          },
        ],
      })
    }

    // Construct config object for generateContent
    const imageConfig: Record<string, string> = {}
    if (params.aspectRatio) {
      imageConfig['aspectRatio'] = params.aspectRatio
    }
    if (params.imageSize) {
      imageConfig['imageSize'] = params.imageSize
    }

    // Google Search grounding is requested through `config.tools`. The SDK's
    // generateContent parameters are `model`, `contents`, and `config`, so a
    // `tools` key placed next to `config` would not reach the API.
    const config = {
      responseModalities: ['IMAGE'],
      ...(Object.keys(imageConfig).length > 0 && { imageConfig }),
      ...(params.useGoogleSearch && { tools: [{ googleSearch: {} }] }),
    }

    // Generate content using Gemini API (@google/genai v1.17.0+)
    const rawResponse = await this.genai.models.generateContent({
      model: this.modelName,
      contents: requestContent,
      config,
    })

    // Validate response structure with type guard
    if (!isGeminiResponse(rawResponse)) {
      const responseStructure = analyzeResponseStructure(rawResponse)

      // Check if it's an error response from Gemini
      const asRecord = rawResponse as Record<string, unknown>
      if (asRecord['error']) {
        const error = asRecord['error'] as Record<string, unknown>
        return Err(
          new GeminiAPIError(`Gemini API Error: ${error['message'] || 'Unknown error'}`, {
            code: error['code'],
            status: error['status'],
            details: error['details'] || responseStructure,
            stage: 'api_error',
          })
        )
      }

      return Err(
        new GeminiAPIError('Invalid response structure from Gemini API', {
          message: 'The API returned an unexpected response format',
          responseStructure: responseStructure,
          stage: 'response_validation',
          suggestion: 'Check if the API endpoint or model configuration is correct',
        })
      )
    }

    // Extract the actual response data (handle wrapped responses)
    const responseData = (rawResponse as Record<string, unknown>)['response']
      ? ((rawResponse as Record<string, unknown>)['response'] as GeminiResponse)
      : (rawResponse as GeminiResponse)

    // Check for prompt feedback (safety blocking)
    const responseAsRecord = responseData as Record<string, unknown>
    if (responseAsRecord['promptFeedback']) {
      const promptFeedback = responseAsRecord['promptFeedback'] as Record<string, unknown>
      if (promptFeedback['blockReason'] === 'SAFETY') {
        return Err(
          new GeminiAPIError('Image generation blocked for safety reasons', {
            stage: 'prompt_analysis',
            blockReason: promptFeedback['blockReason'],
            suggestion: 'Rephrase your prompt to avoid potentially sensitive content',
          })
        )
      }
      if (
        promptFeedback['blockReason'] === 'OTHER' ||
        promptFeedback['blockReason'] === 'PROHIBITED_CONTENT'
      ) {
        return Err(
          new GeminiAPIError('Image generation blocked due to prohibited content', {
            stage: 'prompt_analysis',
            blockReason: promptFeedback['blockReason'],
            suggestion: 'Remove any prohibited content from your prompt and try again',
          })
        )
      }
    }

    // Check for candidates
    if (!responseData.candidates || responseData.candidates.length === 0) {
      return Err(
        new GeminiAPIError('No image generated: Content may have been filtered', {
          stage: 'generation',
          candidatesCount: 0,
          suggestion: 'Try rephrasing your prompt to avoid potentially sensitive content',
        })
      )
    }

    // NOTE(review): a candidate without content/parts returns here, before the
    // finishReason checks below are reached — confirm this ordering is intended.
    const candidate = responseData.candidates[0]
    if (!candidate || !candidate.content || !candidate.content.parts) {
      return Err(
        new GeminiAPIError('No valid content in response', {
          stage: 'candidate_extraction',
          suggestion: 'The API response was incomplete. Please try again',
        })
      )
    }

    const parts = candidate.content.parts

    // Handle finish reason specific errors before checking parts
    if (candidate.finishReason) {
      const finishReason = candidate.finishReason
      if (finishReason === 'IMAGE_SAFETY') {
        return Err(
          new GeminiAPIError('Image generation stopped for safety reasons', {
            finishReason,
            stage: 'generation_stopped',
            suggestion: 'Modify your prompt to avoid potentially sensitive content',
            // Render each rating as "<Category Words> (BLOCKED|ALLOWED)", with
            // the HARM_CATEGORY_ prefix removed and underscores turned to spaces.
            safetyRatings: (candidate as Record<string, unknown>)['safetyRatings']
              ? (
                  (candidate as Record<string, unknown>)['safetyRatings'] as Record<
                    string,
                    unknown
                  >[]
                )
                  ?.map((rating: Record<string, unknown>) => {
                    const category = (rating['category'] as string)
                      .replace('HARM_CATEGORY_', '')
                      .split('_')
                      .map((word: string) => word.charAt(0) + word.slice(1).toLowerCase())
                      .join(' ')
                    return `${category} (${rating['blocked'] ? 'BLOCKED' : 'ALLOWED'})`
                  })
                  .join(', ')
              : undefined,
          })
        )
      }
      if (finishReason === 'MAX_TOKENS') {
        return Err(
          new GeminiAPIError('Maximum token limit reached during generation', {
            finishReason,
            stage: 'generation_stopped',
            suggestion: 'Try using a shorter or simpler prompt',
          })
        )
      }
    }

    if (parts.length === 0) {
      return Err(
        new GeminiAPIError('No content parts in response', {
          stage: 'content_extraction',
          suggestion: 'The generation was incomplete. Please try again',
        })
      )
    }

    // Check if we got an image or text (error message)
    const imagePart = parts.find((part) => part.inlineData?.data)
    const textPart = parts.find((part) => part.text)

    if (!imagePart?.inlineData) {
      // If there's text, it's likely an error message from Gemini
      const errorMessage = textPart?.text || 'Image generation failed'
      return Err(
        new GeminiAPIError('Image generation failed due to content filtering', {
          reason: errorMessage,
          stage: 'image_extraction',
          suggestion:
            'The prompt was blocked by safety filters. Try rephrasing your prompt to avoid potentially sensitive content.',
        })
      )
    }

    // Convert base64 image data to Buffer
    const imageBuffer = Buffer.from(imagePart.inlineData.data, 'base64')
    const mimeType = imagePart.inlineData.mimeType || 'image/png'

    // Create metadata; modelVersion/responseId are included only when present
    const metadata: GeminiGenerationMetadata = {
      model: this.modelName,
      prompt: params.prompt,
      mimeType,
      timestamp: new Date(),
      inputImageProvided: !!params.inputImage,
      ...(responseData.modelVersion && { modelVersion: responseData.modelVersion }),
      ...(responseData.responseId && { responseId: responseData.responseId }),
    }

    return Ok({
      imageData: imageBuffer,
      metadata,
    })
  } catch (error) {
    return this.handleError(error, params.prompt)
  }
}
- src/server/mcpServer.ts:86-146 (registration)MCP tool registration: defines name 'generate_image', description, and detailed input schema returned by ListTools.{ name: 'generate_image', description: 'Generate image with specified prompt and optional parameters', inputSchema: { type: 'object' as const, properties: { prompt: { type: 'string' as const, description: 'The prompt for image generation (English recommended for optimal structured prompt enhancement)', }, fileName: { type: 'string' as const, description: 'Optional file name for the generated image (if not specified, generates an auto-named file in IMAGE_OUTPUT_DIR)', }, inputImagePath: { type: 'string' as const, description: 'Optional absolute path to source image for image-to-image generation. Use when generating variations, style transfers, or similar images based on an existing image (must be an absolute path)', }, blendImages: { type: 'boolean' as const, description: 'Enable multi-image blending for combining multiple visual elements naturally. Use when prompt mentions multiple subjects or composite scenes', }, maintainCharacterConsistency: { type: 'boolean' as const, description: 'Maintain character appearance consistency. Enable when generating same character in different poses/scenes', }, useWorldKnowledge: { type: 'boolean' as const, description: 'Use real-world knowledge for accurate context. Enable for historical figures, landmarks, or factual scenarios', }, useGoogleSearch: { type: 'boolean' as const, description: "Enable Google Search grounding to access real-time web information for factually accurate image generation. Use when prompt requires current or time-sensitive data that may have changed since the model's knowledge cutoff. 
Leave disabled for creative, fictional, historical, or timeless content.", }, aspectRatio: { type: 'string' as const, description: 'Aspect ratio for the generated image', enum: ['1:1', '2:3', '3:2', '3:4', '4:3', '4:5', '5:4', '9:16', '16:9', '21:9'], }, imageSize: { type: 'string' as const, description: 'Image resolution for high-quality output. Specify "2K" or "4K" when you need higher resolution images with better text rendering and fine details. Leave unspecified for standard quality.', enum: ['2K', '4K'], }, purpose: { type: 'string' as const, description: 'Intended use for the image (e.g., cookbook cover, social media post, presentation slide). Helps tailor visual style, quality level, and details to match the purpose.', }, }, required: ['prompt'], }, },
/**
 * TypeScript interface defining the input parameters for the generate_image
 * tool (src/types/mcp.ts:33-58, schema).
 *
 * Only `prompt` is required; every other field is optional.
 */
export interface GenerateImageParams {
  /** Prompt for image generation */
  prompt: string
  /** Optional file name for the generated image (if not specified, generates an auto-named file in IMAGE_OUTPUT_DIR) */
  fileName?: string
  /** Absolute path to input image for editing (optional) */
  inputImagePath?: string
  /** Base64 encoded input image data (optional) */
  inputImage?: string
  /** MIME type of the input image (optional, used with inputImage) */
  inputImageMimeType?: string
  /** Multi-image blending functionality (default: false) */
  blendImages?: boolean
  /** Maintain character consistency across generations (default: false) */
  maintainCharacterConsistency?: boolean
  /** Use world knowledge integration for more accurate context (default: false) */
  useWorldKnowledge?: boolean
  /** Enable Google Search grounding for real-time web information (default: false) */
  useGoogleSearch?: boolean
  /** Aspect ratio for generated image (default: "1:1") */
  aspectRatio?: AspectRatio
  /** Image resolution for high-quality output (e.g., "2K", "4K"). Leave unspecified for standard quality */
  imageSize?: ImageSize
  /** Intended use for the image (e.g., cookbook cover, social media post). Helps tailor visual style and quality */
  purpose?: string
}
/**
 * Input validation helper specifically for GenerateImageParams.
 *
 * Checks run in a fixed order and stop at the first failure:
 * prompt (`validatePrompt`), input image path (`validateImagePath`), the
 * boolean feature flags, inline base64 image data (`validateBase64Image`,
 * only when `inputImage` or `inputImageMimeType` is set), and finally the
 * aspect ratio against `SUPPORTED_ASPECT_RATIOS`.
 *
 * @param params - Raw parameters of a `generate_image` call.
 * @returns `Ok(params)` with the same object when every check passes,
 *   otherwise `Err` carrying the first validation error encountered.
 */
export function validateGenerateImageParams(
  params: GenerateImageParams
): Result<GenerateImageParams, InputValidationError> {
  // Validate prompt
  const promptResult = validatePrompt(params.prompt)
  if (!promptResult.success) {
    return Err(promptResult.error)
  }

  // Validate input image path if provided
  const imagePathResult = validateImagePath(params.inputImagePath)
  if (!imagePathResult.success) {
    return Err(imagePathResult.error)
  }

  // Validate boolean feature flags. Arguments arrive from an untyped tool
  // call, so the declared `boolean` type is re-checked at runtime.
  // `useGoogleSearch` is checked the same way as the other three flags.
  const booleanFlagHints = [
    [
      'blendImages',
      'Use true or false for blendImages parameter to enable/disable multi-image blending',
    ],
    [
      'maintainCharacterConsistency',
      'Use true or false for maintainCharacterConsistency parameter to enable/disable character consistency',
    ],
    [
      'useWorldKnowledge',
      'Use true or false for useWorldKnowledge parameter to enable/disable world knowledge integration',
    ],
    [
      'useGoogleSearch',
      'Use true or false for useGoogleSearch parameter to enable/disable Google Search grounding',
    ],
  ] as const

  for (const [flag, hint] of booleanFlagHints) {
    const value: unknown = params[flag]
    if (value !== undefined && typeof value !== 'boolean') {
      return Err(new InputValidationError(`${flag} must be a boolean value`, hint))
    }
  }

  // Validate input image data if provided
  if (params.inputImage || params.inputImageMimeType) {
    const imageResult = validateBase64Image(params.inputImage, params.inputImageMimeType)
    if (!imageResult.success) {
      return Err(imageResult.error)
    }
  }

  // Validate aspectRatio parameter
  if (params.aspectRatio && !SUPPORTED_ASPECT_RATIOS.includes(params.aspectRatio)) {
    return Err(
      new InputValidationError(
        `Invalid aspect ratio: ${params.aspectRatio}. Supported values: ${SUPPORTED_ASPECT_RATIOS.join(', ')}`,
        `Please use one of the supported aspect ratios: ${SUPPORTED_ASPECT_RATIOS.join(', ')}`
      )
    )
  }

  return Ok(params)
}