gemini-analyze-image
Analyze images to extract summaries, objects, text, or detailed insights using Gemini's multimodal vision capabilities. Supports JPEG, PNG, WebP, and other formats with optional context for enhanced results.
Instructions
Analyze images using Gemini's multimodal vision capabilities (with learned user preferences)
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| analysis_type | No | Type of analysis to perform: "summary", "objects", "text", "detailed", or "custom" | summary |
| context | No | Optional context for intelligent enhancement (e.g., "medical", "architectural", "nature") | |
| file_path | Yes | Path to the image file to analyze (supports JPEG, PNG, WebP, HEIC, HEIF, BMP, GIF) | |
Implementation Reference
- src/tools/image-analysis.js:51-144 (handler)Executes the image analysis: validates input, loads and encodes image, generates/enhances prompt based on analysis type and context, calls Gemini service, learns from interaction, formats response.async execute(args) { const filePath = validateNonEmptyString(args.file_path, 'file_path'); const analysisType = args.analysis_type ? validateString(args.analysis_type, 'analysis_type', ['summary', 'objects', 'text', 'detailed', 'custom']) : 'summary'; const context = args.context ? validateString(args.context, 'context') : null; log(`Analyzing image file: "${filePath}" with analysis type: "${analysisType}" and context: ${context || 'general'}`, this.name); try { validateFileSize(filePath, config.MAX_IMAGE_SIZE_MB); const imageBuffer = readFileAsBuffer(filePath); const imageBase64 = imageBuffer.toString('base64'); const mimeType = getMimeType(filePath, config.SUPPORTED_IMAGE_MIMES); log(`Image file loaded: ${(imageBuffer.length / 1024).toFixed(2)}KB, MIME type: ${mimeType}`, this.name); let baseAnalysisPrompt; switch (analysisType) { case 'summary': baseAnalysisPrompt = 'Please provide a comprehensive summary of this image. Describe what you see, including objects, people, settings, colors, composition, and overall content.'; // eslint-disable-line max-len break; case 'objects': baseAnalysisPrompt = 'Please identify and describe all objects, people, text, and visual elements visible in this image. List them systematically with their locations and characteristics.'; // eslint-disable-line max-len break; case 'text': baseAnalysisPrompt = 'Please extract and transcribe all text visible in this image. 
Include any signs, labels, captions, or written content you can read.'; // eslint-disable-line max-len break; case 'detailed': baseAnalysisPrompt = 'Please provide a detailed analysis of this image including: visual description, objects and people present, text content, colors and composition, mood or atmosphere, and any notable details or artistic elements.'; // eslint-disable-line max-len break; case 'custom': baseAnalysisPrompt = context || 'Please analyze this image and describe what you observe.'; break; default: baseAnalysisPrompt = 'Please provide a summary of this image content.'; } let enhancedAnalysisPrompt = baseAnalysisPrompt; if (this.intelligenceSystem.initialized) { try { enhancedAnalysisPrompt = await this.intelligenceSystem.enhancePrompt(baseAnalysisPrompt, context, this.name); log('Applied Tool Intelligence enhancement', this.name); } catch (err) { log(`Tool Intelligence enhancement failed: ${err.message}`, this.name); } } let analysisPrompt = enhancedAnalysisPrompt; if (context && analysisType !== 'custom') { analysisPrompt += ` Additional context: ${context}`; } const analysisText = await this.geminiService.analyzeImage('IMAGE_ANALYSIS', analysisPrompt, imageBase64, mimeType); if (analysisText) { log('Image analysis completed successfully', this.name); if (this.intelligenceSystem.initialized) { try { const resultSummary = `Image analysis completed successfully: ${analysisText.length} characters, type: ${analysisType}`; // eslint-disable-line max-len await this.intelligenceSystem.learnFromInteraction(baseAnalysisPrompt, enhancedAnalysisPrompt, resultSummary, context, this.name); log('Tool Intelligence learned from interaction', this.name); } catch (err) { log(`Tool Intelligence learning failed: ${err.message}`, this.name); } } let finalResponse = `✓ Image file analyzed successfully:\n\n**File:** ${filePath}\n**Size:** ${(imageBuffer.length / 1024).toFixed(2)}KB\n**Format:** ${filePath.split('.').pop().toUpperCase()}\n**Analysis Type:** 
${analysisType}\n\n**Analysis:**\n${analysisText}`; // eslint-disable-line max-len if (context && this.intelligenceSystem.initialized) { finalResponse += `\n\n---\n_Enhancement applied based on context: ${context}_`; } return { content: [ { type: 'text', text: finalResponse, }, ], }; } log('No analysis text generated', this.name); return { content: [ { type: 'text', text: `Could not analyze image file: "${filePath}". The image may be corrupted, too complex, or in an unsupported format.`, }, ], }; } catch (error) { log(`Error analyzing image: ${error.message}`, this.name); throw new Error(`Error analyzing image: ${error.message}`); } }
- src/tools/image-analysis.js:20-37 (schema) JSON schema for tool inputs: required file_path, optional analysis_type (enum) and context.
  `type: 'object', properties: { file_path: { type: 'string', description: 'Path to the image file to analyze (supports JPEG, PNG, WebP, HEIC, HEIF, BMP, GIF)', }, analysis_type: { type: 'string', description: 'Type of analysis to perform: "summary", "objects", "text", "detailed", or "custom"', enum: ['summary', 'objects', 'text', 'detailed', 'custom'], }, context: { type: 'string', description: 'Optional context for intelligent enhancement (e.g., "medical", "architectural", "nature")', }, }, required: ['file_path'], },`
- src/tools/index.js:86-86 (registration) Registers the ImageAnalysisTool instance with the shared intelligenceSystem and geminiService in the central tool registry.
  `registerTool(new ImageAnalysisTool(intelligenceSystem, geminiService));`