#!/usr/bin/env node
// AI Vision Debug MCP Server
// by samihalawa
const fs = require('fs');
const os = require('os');
const path = require('path');
const axios = require('axios');
// Configuration
// The API key is read from the environment so that no credential lives in
// source control. Any key that was previously committed in this file should be
// treated as compromised and rotated.
const GEMINI_API_KEY = process.env.GEMINI_API_KEY;
// Full URL of the generateContent endpoint to call. It is model-specific, so
// no default is baked in here (presumably of the form
// `.../models/<model>:generateContent` - confirm against the Gemini API docs).
const GEMINI_API_URL = process.env.GEMINI_API_URL || '';
if (!GEMINI_API_KEY || !GEMINI_API_URL) {
  console.error('GEMINI_API_KEY and GEMINI_API_URL must be set in the environment');
  process.exit(1);
}
// Path to the test image. os.homedir() is used instead of process.env.HOME
// because HOME is not defined on every platform (e.g. Windows), in which case
// path.join() would throw on the undefined segment.
const testImagePath = path.join(os.homedir(), 'Downloads', 'test_screenshot.png');
// Check if the test image exists; without it there is nothing to analyze.
if (!fs.existsSync(testImagePath)) {
  console.error(`Test image not found at ${testImagePath}`);
  process.exit(1);
}
// Prompt sent alongside the screenshot. It asks for a plain-text (non-JSON)
// layout so the reply can be read back with the regular expressions in
// parseAnalysisText().
const UI_ANALYSIS_PROMPT = "Analyze this UI screenshot and provide a brief description followed by a list of the main UI elements with their coordinates. Format your response as plain text (not JSON) with the following structure:\n\nDESCRIPTION: [brief description of the screenshot]\n\nUI ELEMENTS:\n1. [Element Type] at x:[x], y:[y], width:[width], height:[height]\n2. [Element Type] at x:[x], y:[y], width:[width], height:[height]\n...\n\nFor each UI element, include only:\n1. A number\n2. The element type (Button, Text, Image, Menu, etc.)\n3. The coordinates as x, y, width, height\n\nDo not include the text content of elements to avoid parsing issues.";

/**
 * Shortens text for console output, appending "..." only when something was
 * actually cut off.
 *
 * @param {string} text - Text to print.
 * @param {number} maxLength - Maximum number of characters to keep.
 * @returns {string} The original text, or its first maxLength characters plus "...".
 */
function truncateForLog(text, maxLength) {
  return text.length > maxLength ? `${text.substring(0, maxLength)}...` : text;
}

/**
 * Pulls the text following "DESCRIPTION:" (up to the next newline or the end
 * of the reply) out of the model's plain-text answer.
 *
 * @param {string} rawText - Plain-text reply from the model.
 * @param {string} fallback - Value returned when no DESCRIPTION line exists.
 * @returns {string} The trimmed description, or fallback.
 */
function extractDescription(rawText, fallback) {
  // `$` is accepted as a terminator so a reply that ends right after the
  // description (no trailing newline) is still recognized.
  const match = rawText.match(/DESCRIPTION:\s*(.*?)(?:\n|$)/s);
  return match ? match[1].trim() : fallback;
}

/**
 * Converts the model's plain-text answer into a structured result.
 *
 * @param {string} rawText - Plain-text reply from the model.
 * @returns {{description: string, elements: Array<{id: number, type: string,
 *   coordinates: {x: number, y: number, width: number, height: number}}>}}
 *   The description and every "N. Type at x:.., y:.., width:.., height:.."
 *   entry found after the "UI ELEMENTS:" marker.
 */
function parseAnalysisText(rawText) {
  const description = extractDescription(rawText, 'No description found');

  // Only the text after the "UI ELEMENTS:" marker is scanned for elements.
  const elementsText = rawText.split('UI ELEMENTS:')[1] || '';
  // Optional whitespace is tolerated after each colon ("x: 10" as well as "x:10").
  const elementRegex = /(\d+)\.\s+(\w+)\s+at\s+x:\s*(\d+),\s*y:\s*(\d+),\s*width:\s*(\d+),\s*height:\s*(\d+)/g;
  const elements = [];
  let match;
  while ((match = elementRegex.exec(elementsText)) !== null) {
    elements.push({
      id: parseInt(match[1], 10),
      type: match[2],
      coordinates: {
        x: parseInt(match[3], 10),
        y: parseInt(match[4], 10),
        width: parseInt(match[5], 10),
        height: parseInt(match[6], 10),
      },
    });
  }

  return { description, elements };
}

/**
 * Sends an image to the Gemini Vision API and parses the plain-text answer
 * into a description plus a list of UI elements.
 *
 * Reads GEMINI_API_URL and GEMINI_API_KEY from the enclosing module. Every
 * failure (unreadable file, HTTP error, unexpected response shape) is logged
 * and reported as null rather than thrown.
 *
 * @param {string} imagePath - Path of the image file to upload (sent as image/png).
 * @returns {Promise<{description: string, elements: Array<object>}|null>}
 *   The parsed analysis, or null when the request or the response was unusable.
 */
async function analyzeImage(imagePath) {
  try {
    console.log(`Reading image file: ${imagePath}`);
    const base64Image = fs.readFileSync(imagePath).toString('base64');

    console.log('Preparing request payload with simplified prompt...');
    const payload = {
      contents: [
        {
          parts: [
            { text: UI_ANALYSIS_PROMPT },
            {
              inline_data: {
                mime_type: "image/png",
                data: base64Image,
              },
            },
          ],
        },
      ],
      generation_config: {
        temperature: 0.1, // Very low temperature for consistent output
        top_p: 1,
        top_k: 32,
        max_output_tokens: 2048,
      },
    };

    console.log('Sending request to Gemini API...');
    const response = await axios.post(GEMINI_API_URL, payload, {
      // Sent as the `key` query parameter; axios handles the URL encoding.
      params: { key: GEMINI_API_KEY },
      headers: {
        'Content-Type': 'application/json',
      },
    });

    // Parse the response
    console.log('Processing API response...');
    const geminiResponse = response.data;
    const candidates = geminiResponse && geminiResponse.candidates;
    const content = candidates && candidates.length > 0 ? candidates[0].content : undefined;
    const parts = content && content.parts;
    const rawText = parts && parts.length > 0 ? parts[0].text : undefined;
    if (typeof rawText !== 'string') {
      console.error('Unexpected response format from Gemini API');
      return null;
    }

    console.log('\n----------------- Raw Response -----------------');
    console.log(truncateForLog(rawText, 500));

    try {
      // Parse the plain text response
      const analysisResult = parseAnalysisText(rawText);

      console.log('\n----------------- Analysis Result -----------------');
      console.log(`Description: ${truncateForLog(analysisResult.description, 150)}`);
      console.log(`Number of UI elements detected: ${analysisResult.elements.length}`);

      // Log a few elements as examples
      if (analysisResult.elements.length > 0) {
        console.log('\nSample UI Elements:');
        const sampleSize = Math.min(5, analysisResult.elements.length);
        for (let i = 0; i < sampleSize; i++) {
          const element = analysisResult.elements[i];
          console.log(`${element.id}. ${element.type} at [${element.coordinates.x}, ${element.coordinates.y}, ${element.coordinates.width}, ${element.coordinates.height}]`);
        }
      }

      return analysisResult;
    } catch (error) {
      console.error('Error parsing response:', error);
      console.log('Raw response:', rawText);
      // Create a minimal valid result with just the description
      return {
        description: extractDescription(rawText, 'Failed to extract description'),
        elements: [],
      };
    }
  } catch (error) {
    console.error('Error analyzing image:', error.message);
    // axios attaches the HTTP response to errors raised for non-2xx statuses.
    if (error.response) {
      console.error('API error details:', error.response.data);
    }
    return null;
  }
}
/**
 * Main function: runs analyzeImage() on the configured test image and prints a
 * success, partial-success, or failure summary to the console.
 *
 * Errors are logged rather than rethrown, so the returned promise always
 * resolves.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🔍 Starting direct test of Gemini Vision API for UI analysis');
  try {
    const result = await analyzeImage(testImagePath);
    if (result) {
      console.log('✅ Test completed successfully!');
      // If we have elements, consider it a full success
      if (result.elements && result.elements.length > 0) {
        console.log(`✅ Full analysis successful with ${result.elements.length} UI elements detected`);
      } else {
        console.log('⚠️ Partial success - description extracted but no UI elements');
      }
    } else {
      console.error('❌ Test failed: No valid analysis result received');
    }
  } catch (error) {
    console.error('❌ Test failed with error:', error);
  }
}

// Run the main function
main();