Glama
AI Vision Debug MCP Server

#!/usr/bin/env node

const fs = require('fs');
const os = require('os');
const path = require('path');
const axios = require('axios');

// Configuration
// The API key is a secret and must not live in source control, so it is read
// from the environment. NOTE(review): any key that was previously committed in
// this file should be treated as compromised and rotated.
const GEMINI_API_KEY = process.env.GEMINI_API_KEY;
const GEMINI_API_URL = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent';

// Fail fast with an actionable message instead of sending an unauthenticated request.
if (!GEMINI_API_KEY) {
  console.error('GEMINI_API_KEY environment variable is not set');
  process.exit(1);
}

// Path to the test image.
// os.homedir() is used instead of process.env.HOME because HOME is not defined
// on every platform (e.g. Windows), and path.join() throws on an undefined segment.
const testImagePath = path.join(os.homedir(), 'Downloads', 'test_screenshot.png');

// Check if the test image exists
if (!fs.existsSync(testImagePath)) {
  console.error(`Test image not found at ${testImagePath}`);
  process.exit(1);
}

// Prompt sent with the screenshot. The parser below depends on the
// "DESCRIPTION:" / "UI ELEMENTS:" layout requested here.
const ANALYSIS_PROMPT = "Analyze this UI screenshot and provide a brief description followed by a list of the main UI elements with their coordinates. Format your response as plain text (not JSON) with the following structure:\n\nDESCRIPTION: [brief description of the screenshot]\n\nUI ELEMENTS:\n1. [Element Type] at x:[x], y:[y], width:[width], height:[height]\n2. [Element Type] at x:[x], y:[y], width:[width], height:[height]\n...\n\nFor each UI element, include only:\n1. A number\n2. The element type (Button, Text, Image, Menu, etc.)\n3. The coordinates as x, y, width, height\n\nDo not include the text content of elements to avoid parsing issues.";

// Image MIME types keyed by lower-case file extension.
const IMAGE_MIME_TYPES = {
  '.png': 'image/png',
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.webp': 'image/webp',
  '.heic': 'image/heic',
  '.heif': 'image/heif',
};

// Marker that separates the description from the element list in the reply.
const ELEMENTS_MARKER = 'UI ELEMENTS:';

// Picks the MIME type from the file extension; unknown extensions fall back to PNG.
function mimeTypeForImage(imagePath) {
  const extension = path.extname(imagePath).toLowerCase();
  return IMAGE_MIME_TYPES[extension] || 'image/png';
}

// Shortens text for logging, appending "..." only when something was cut off.
function truncateForLog(text, maxLength) {
  return text.length > maxLength ? `${text.substring(0, maxLength)}...` : text;
}

// Joins the text of every text part of the first candidate; returns null when there is none.
function extractResponseText(geminiResponse) {
  const candidates = geminiResponse && geminiResponse.candidates;
  if (!Array.isArray(candidates) || candidates.length === 0) {
    return null;
  }

  const content = candidates[0] && candidates[0].content;
  const parts = content && content.parts;
  if (!Array.isArray(parts)) {
    return null;
  }

  const texts = parts
    .filter((part) => part && typeof part.text === 'string')
    .map((part) => part.text);
  return texts.length > 0 ? texts.join('') : null;
}

// Parses the plain-text reply into { description, elements }.
function parseAnalysisText(rawText) {
  // Capture everything up to the element list (or end of text) so that
  // multi-line descriptions and replies without a trailing newline both work.
  const descriptionMatch = rawText.match(/DESCRIPTION:\s*([\s\S]*?)(?=\s*UI ELEMENTS:|$)/);
  const description = descriptionMatch ? descriptionMatch[1].trim() : 'No description found';

  // Everything after the first marker is treated as the element list.
  const markerIndex = rawText.indexOf(ELEMENTS_MARKER);
  const elementsText = markerIndex === -1 ? '' : rawText.slice(markerIndex + ELEMENTS_MARKER.length);

  // The type is matched lazily up to " at x:" so multi-word types such as
  // "Text Field" are kept; `.` does not cross newlines, so a match stays on one line.
  const elementRegex = /(\d+)\.\s+(.+?)\s+at\s+x:\s*(\d+),\s*y:\s*(\d+),\s*width:\s*(\d+),\s*height:\s*(\d+)/g;

  const elements = [];
  let match;
  while ((match = elementRegex.exec(elementsText)) !== null) {
    elements.push({
      id: Number.parseInt(match[1], 10),
      type: match[2].trim(),
      coordinates: {
        x: Number.parseInt(match[3], 10),
        y: Number.parseInt(match[4], 10),
        width: Number.parseInt(match[5], 10),
        height: Number.parseInt(match[6], 10),
      },
    });
  }

  return { description, elements };
}

/**
 * Sends an image to the Gemini generateContent endpoint and parses the
 * plain-text reply into a description plus a list of UI elements.
 *
 * Progress and a summary of the result are written to the console.
 *
 * @param {string} imagePath - Path of the image file to analyze. The MIME type
 *   is derived from the extension (PNG when the extension is not recognised).
 * @returns {Promise<{description: string, elements: Array<{id: number, type: string,
 *   coordinates: {x: number, y: number, width: number, height: number}}>}|null>}
 *   The parsed analysis, or null when the file cannot be read, the request
 *   fails, or the response contains no text. Errors are logged, never thrown.
 */
async function analyzeImage(imagePath) {
  try {
    console.log(`Reading image file: ${imagePath}`);
    const base64Image = fs.readFileSync(imagePath).toString('base64');

    console.log('Preparing request payload with simplified prompt...');
    const payload = {
      contents: [
        {
          parts: [
            { text: ANALYSIS_PROMPT },
            {
              inline_data: {
                mime_type: mimeTypeForImage(imagePath),
                data: base64Image,
              },
            },
          ],
        },
      ],
      generation_config: {
        temperature: 0.1, // Very low temperature for consistent output
        top_p: 1,
        top_k: 32,
        max_output_tokens: 2048,
      },
    };

    console.log('Sending request to Gemini API...');
    const response = await axios.post(
      `${GEMINI_API_URL}?key=${GEMINI_API_KEY}`,
      payload,
      {
        headers: {
          'Content-Type': 'application/json',
        },
      }
    );

    console.log('Processing API response...');
    const rawText = extractResponseText(response.data);
    if (rawText === null) {
      console.error('Unexpected response format from Gemini API');
      return null;
    }

    console.log('\n----------------- Raw Response -----------------');
    console.log(truncateForLog(rawText, 500));
    console.log('----------------------------------------------\n');

    // Parsing is pure string/regex work on a known string, so it needs no
    // error handling of its own; anything unexpected reaches the catch below.
    const analysisResult = parseAnalysisText(rawText);

    console.log('\n----------------- Analysis Result -----------------');
    console.log(`Description: ${truncateForLog(analysisResult.description, 150)}`);
    console.log(`Number of UI elements detected: ${analysisResult.elements.length}`);

    // Log a few elements as examples
    if (analysisResult.elements.length > 0) {
      console.log('\nSample UI Elements:');
      for (const element of analysisResult.elements.slice(0, 5)) {
        const { x, y, width, height } = element.coordinates;
        console.log(`${element.id}. ${element.type} at [${x}, ${y}, ${width}, ${height}]`);
      }
    }

    console.log('--------------------------------------------------\n');
    return analysisResult;
  } catch (error) {
    console.error('Error analyzing image:', error.message);
    if (error.response) {
      console.error('API error details:', error.response.data);
    }
    return null;
  }
}

/**
 * Script driver: analyzes the test image and prints a pass / partial / fail
 * summary. Exits the process with status 1 when no analysis result is
 * returned or when an unexpected error is thrown.
 */
async function main() {
  console.log('🔍 Starting direct test of Gemini Vision API for UI analysis');

  try {
    const analysis = await analyzeImage(testImagePath);

    // Guard clause: nothing usable came back.
    if (!analysis) {
      console.error('❌ Test failed: No valid analysis result received');
      process.exit(1);
      return;
    }

    console.log('✅ Test completed successfully!');

    // Detected elements mean full success; otherwise only the description was usable.
    const hasElements = analysis.elements && analysis.elements.length > 0;
    const summary = hasElements
      ? `✅ Full analysis successful with ${analysis.elements.length} UI elements detected`
      : '⚠️ Partial success - description extracted but no UI elements';
    console.log(summary);
  } catch (error) {
    console.error('❌ Test failed with error:', error);
    process.exit(1);
  }
}

// Start the test run. main() catches and reports its own failures, so the
// returned promise is deliberately left un-awaited.
void main();