AI Vision Debug MCP Server

  • src
#!/usr/bin/env node
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import {
  CallToolRequestSchema,
  ErrorCode,
  ListToolsRequestSchema,
  McpError,
} from '@modelcontextprotocol/sdk/types.js';
import axios from 'axios';
import * as fs from 'fs';
import * as fsPromises from 'fs/promises';
import * as path from 'path';
import * as os from 'os';
import { randomUUID } from 'crypto';
import { z } from 'zod';
import { chromium } from 'playwright';
import type { Browser, BrowserContext } from 'playwright';

// The Gemini API key is a secret and must never be committed to source control.
// It is read from the environment; analysis requests fail with a clear error
// when it is not set.
const GEMINI_API_KEY = process.env.GEMINI_API_KEY ?? '';
const GEMINI_API_URL =
  'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent';

// Directory that receives screenshots captured by the `screenshot_url` tool.
const TEMP_DIR = path.join(os.tmpdir(), 'ai-vision-debug');
const DOWNLOADS_DIR = path.join(os.homedir(), 'Downloads');

// Fixed fallback screenshot used by `analyze_screen` when no URL screenshot exists yet.
const TEST_SCREENSHOT_PATH = path.join(DOWNLOADS_DIR, 'test_screenshot.png');

// Log to a file instead of the console: stdout carries the MCP stdio protocol.
const logDir = path.join(os.tmpdir(), 'ai-vision-debug-logs');
try {
  if (!fs.existsSync(logDir)) {
    fs.mkdirSync(logDir, { recursive: true });
  }
} catch {
  // Best effort: logging is optional, so a missing log directory must not stop the server.
}
const logFile = path.join(logDir, 'ai-vision-debug.log');

/**
 * Appends a timestamped line to the log file.
 * Failures are deliberately ignored so that logging can never break a request.
 */
function logToFile(message: string): void {
  try {
    fs.appendFileSync(logFile, `${new Date().toISOString()} - ${message}\n`);
  } catch {
    // Best effort: nothing useful can be done if the log file is not writable.
  }
}

/** Extracts a human-readable message from an arbitrary thrown value. */
function describeError(error: unknown): string {
  if (error instanceof Error) {
    return error.message || 'Unknown error';
  }
  return typeof error === 'string' && error.length > 0 ? error : 'Unknown error';
}

/** Session state tracking the current debugging session. */
interface DebugSession {
  currentUrl: string | null;
  lastScreenshotPath: string | null;
  debugHistory: string[];
  elements: UIElement[];
}

const debugSession: DebugSession = {
  currentUrl: null,
  lastScreenshotPath: null,
  debugHistory: [],
  elements: [],
};

// Shape of the Gemini `generateContent` response as consumed below. Every level
// is optional because this is unvalidated external data and the code must cope
// with any of them being absent.
interface GeminiResponsePart {
  text?: string;
}

interface GeminiResponseContent {
  parts?: GeminiResponsePart[];
  role?: string;
}

interface GeminiResponseCandidate {
  content?: GeminiResponseContent;
  finishReason?: string;
}

interface GeminiResponse {
  candidates?: GeminiResponseCandidate[];
}

/** A UI element reported by the vision model, with its bounding box. */
interface UIElement {
  id: number;
  type: string;
  label?: string;
  coordinates: {
    x: number;
    y: number;
    width: number;
    height: number;
  };
  description?: string;
}

/** Parsed result of a screenshot analysis. */
interface AnalysisResult {
  description: string;
  elements: UIElement[];
}

/** Location of a screenshot on disk plus an identifier for it. */
interface ScreenshotResult {
  path: string;
  fileUuid: string;
}

/**
 * Result returned by every tool handler.
 * Declared as a type alias (not an interface) so it stays structurally
 * assignable to the SDK's open-ended result type.
 */
type ToolTextResult = { content: { type: 'text'; text: string }[] };

// Request schemas used to validate tool arguments at runtime.
const ReadFileRequestSchema = z.object({
  path: z.string().describe("Path to the file to read"),
  startLine: z.number().int().describe("Starting line number (1-indexed)"),
  endLine: z.number().int().describe("Ending line number (1-indexed)"),
});

const ModifyFileRequestSchema = z.object({
  path: z.string().describe("Path to the file to modify"),
  startLine: z.number().int().describe("Starting line number to replace (1-indexed)"),
  endLine: z.number().int().describe("Ending line number to replace (1-indexed)"),
  content: z.string().describe("New content to replace the specified lines"),
});

const GenerateReportRequestSchema = z.object({
  testUrl: z.string().describe("URL of the application being tested"),
  appName: z.string().optional().describe("Name of the application being analyzed"),
  date: z.string().optional().describe("Date of the analysis (YYYY-MM-DD)"),
  observations: z.record(z.any()).describe("Observations structured as components, data state, interactions, etc."),
});

const ScreenshotUrlRequestSchema = z.object({
  url: z.string().describe("URL to capture a screenshot of (e.g., http://localhost:4999, https://google.com)"),
  fullPage: z.boolean().optional().describe("Whether to capture full page or just viewport. Default: false"),
  waitForSelector: z.string().optional().describe("Optional CSS selector to wait for before taking screenshot"),
  waitTime: z.number().optional().describe("Time to wait in milliseconds before taking screenshot. Default: 1000")
});

/**
 * Validates raw tool arguments against a zod schema.
 *
 * @param schema - Schema describing the expected arguments.
 * @param args - Raw arguments from the tool call (may be undefined).
 * @param label - Message prefix used when validation fails.
 * @returns The validated, typed arguments.
 * @throws McpError with `InvalidParams` listing every failing field.
 */
function parseToolArguments<S extends z.ZodTypeAny>(
  schema: S,
  args: unknown,
  label: string
): z.infer<S> {
  const parsed = schema.safeParse(args ?? {});
  if (!parsed.success) {
    const details = parsed.error.issues
      .map((issue) => `${issue.path.join('.') || '(root)'}: ${issue.message}`)
      .join('; ');
    throw new McpError(ErrorCode.InvalidParams, `${label}: ${details}`);
  }
  return parsed.data;
}

/** Wraps a string in the MCP text-content envelope. */
function textResult(text: string): ToolTextResult {
  return { content: [{ type: 'text', text }] };
}

/** Parses a base-10 integer from a regex capture group. */
function toInt(value: string | undefined): number {
  return Number.parseInt(value ?? '', 10);
}

/**
 * Reads a UTF-8 file and splits it into lines on `\n`.
 *
 * @throws Error `File not found: <path>` when the file does not exist.
 */
async function readFileLines(filePath: string): Promise<string[]> {
  if (!fs.existsSync(filePath)) {
    throw new Error(`File not found: ${filePath}`);
  }
  const content = await fsPromises.readFile(filePath, 'utf8');
  return content.split('\n');
}

/** Builds the instruction prompt sent to Gemini together with the screenshot. */
function buildAnalysisPrompt(): string {
  const elementLine =
    '[Element Type] at x:[x], y:[y], width:[width], height:[height] - [Detailed description of what this element is, what it does, its state, and importance]';
  return [
    'Analyze this UI screenshot and provide a detailed description followed by a comprehensive list of UI elements with their coordinates and detailed descriptions. Format your response as plain text (not JSON) with the following structure:',
    '',
    'DESCRIPTION: [thorough description of the screenshot, including application, purpose, and context]',
    '',
    'UI ELEMENTS:',
    `1. ${elementLine}`,
    `2. ${elementLine}`,
    '...',
    '',
    'For each UI element, be highly descriptive and specific about:',
    '1. What the element represents (button, link, form field, etc.)',
    '2. Its current state (active, disabled, selected, etc.)',
    '3. Its purpose and function in the interface',
    '4. Any text content that helps identify the element',
    '5. The coordinates as x, y, width, height',
    '',
    'Be as detailed as possible in your descriptions to enable accurate identification.',
  ].join('\n');
}

/**
 * Parses the plain-text answer produced by Gemini for the analysis prompt.
 *
 * The description is everything between `DESCRIPTION:` and the `UI ELEMENTS:`
 * marker (or the end of the text), so multi-line descriptions are kept whole.
 * Element types may consist of several words (e.g. "Text Field").
 *
 * @param rawText - Text of the first response part.
 * @returns The description and every element line that matched the expected format.
 */
function parseAnalysisText(rawText: string): AnalysisResult {
  const descriptionMatch = rawText.match(/DESCRIPTION:\s*(.*?)(?=\n\s*UI ELEMENTS:|$)/s);
  const description = descriptionMatch?.[1]?.trim() ?? 'No description found';

  const marker = 'UI ELEMENTS:';
  const markerIndex = rawText.indexOf(marker);
  const elementsText = markerIndex === -1 ? '' : rawText.slice(markerIndex + marker.length);

  // A fresh regex per call: the `g` flag makes `exec` stateful via `lastIndex`.
  const elementRegex =
    /(\d+)\.\s+([^\n]+?)\s+at\s+x:\s*(\d+),\s*y:\s*(\d+),\s*width:\s*(\d+),\s*height:\s*(\d+)\s*-\s*(.*?)(?=\n\d+\.|\n\n|$)/gs;

  const elements: UIElement[] = [];
  let match: RegExpExecArray | null;
  while ((match = elementRegex.exec(elementsText)) !== null) {
    elements.push({
      id: toInt(match[1]),
      type: match[2] ?? '',
      coordinates: {
        x: toInt(match[3]),
        y: toInt(match[4]),
        width: toInt(match[5]),
        height: toInt(match[6]),
      },
      description: match[7]?.trim() || "No description available",
    });
  }

  return { description, elements };
}

/** Formats one element for the `analyze_screen` response. */
function formatElementLine(element: UIElement): string {
  const typePrefix = element.type ? element.type + ': ' : '';
  // The parser only fills `description`; `label` is used when a caller provides one.
  const caption = element.label ?? element.description ?? '';
  const { x, y, width, height } = element.coordinates;
  return `${element.id}. ${typePrefix}${caption} [${x}, ${y}, ${width}, ${height}]`;
}

/**
 * MCP server exposing screenshot capture, AI screenshot analysis, line-based
 * file access and report generation as tools over stdio.
 */
class AIVisionDebugServer {
  private readonly server: Server;
  private browserInstance: Browser | null = null;
  private browserContext: BrowserContext | null = null;
  private shuttingDown = false;

  constructor() {
    this.server = new Server(
      {
        name: 'ai-vision-debug',
        version: '0.1.0',
      },
      {
        capabilities: {
          tools: {
            analyze_screen: true,
            // Disable Playwright-dependent tools
            click_point: false,
            take_screenshot: false,
            click_and_screenshot: false,
            click_sequence: false,
            // Keep file-related tools
            read_file: true,
            modify_file: true,
            get_console_output: false,
            generate_report: true,
            screenshot_url: true
          },
        },
      }
    );

    this.setupToolHandlers();

    this.server.onerror = (error) => {
      logToFile(`MCP Server error: ${String(error)}`);
    };

    // Exactly one SIGINT handler: several handlers would race each other and
    // `process.exit` could run before the browser has been closed.
    process.on('SIGINT', () => {
      void this.shutdown();
    });
  }

  /** Creates the screenshot directory if it does not exist yet. */
  private async ensureTempDir(): Promise<void> {
    // `recursive: true` makes mkdir a no-op when the directory already exists.
    await fsPromises.mkdir(TEMP_DIR, { recursive: true });
  }

  /** Releases the browser (if one was launched) and closes the MCP server. */
  private async cleanup(): Promise<void> {
    const browser = this.browserInstance;
    this.browserInstance = null;
    this.browserContext = null;
    if (browser) {
      try {
        await browser.close();
      } catch (error: unknown) {
        logToFile(`Error closing browser: ${String(error)}`);
      }
    }
    await this.server.close();
  }

  /** Runs cleanup once and terminates the process. */
  private async shutdown(): Promise<void> {
    if (this.shuttingDown) {
      return;
    }
    this.shuttingDown = true;
    logToFile('Shutting down...');
    let exitCode = 0;
    try {
      await this.cleanup();
    } catch (error: unknown) {
      logToFile(`Error during shutdown: ${String(error)}`);
      exitCode = 1;
    }
    process.exit(exitCode);
  }

  /**
   * Returns the shared browser context, launching the browser and creating the
   * context on demand. Browser and context are checked separately so that a
   * failed context creation does not leave the server permanently unusable.
   */
  private async getBrowserContext(): Promise<BrowserContext> {
    let browser = this.browserInstance;
    if (browser && !browser.isConnected()) {
      logToFile('Browser disconnected; relaunching...');
      browser = null;
      this.browserInstance = null;
      this.browserContext = null;
    }

    if (!browser) {
      logToFile('Initializing browser...');
      browser = await chromium.launch({ headless: true });
      this.browserInstance = browser;
    }

    let context = this.browserContext;
    if (!context) {
      context = await browser.newContext({
        viewport: { width: 1280, height: 800 },
        userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
      });
      this.browserContext = context;
    }
    return context;
  }

  /**
   * Takes a screenshot of a URL using Playwright.
   *
   * @param url - Page to open.
   * @param fullPage - Capture the whole scrollable page instead of the viewport.
   * @param waitForSelector - Optional CSS selector that must appear before capturing.
   * @param waitTime - Milliseconds to wait after navigation before capturing.
   * @returns Path of the PNG written to the temp directory and its UUID.
   * @throws Error when navigation, waiting or capturing fails.
   */
  private async screenshotUrl(
    url: string,
    fullPage: boolean = false,
    waitForSelector?: string,
    waitTime: number = 1000
  ): Promise<ScreenshotResult> {
    try {
      logToFile(`Taking screenshot of URL: ${url}`);

      const context = await this.getBrowserContext();
      const page = await context.newPage();
      try {
        await page.goto(url, { waitUntil: 'networkidle' });
        await page.waitForTimeout(waitTime);
        if (waitForSelector) {
          await page.waitForSelector(waitForSelector, { timeout: 10000 });
        }

        await this.ensureTempDir();
        const fileUuid = randomUUID();
        const screenshotPath = path.join(TEMP_DIR, `screenshot_${fileUuid}.png`);
        await page.screenshot({ path: screenshotPath, fullPage });

        debugSession.currentUrl = url;
        debugSession.lastScreenshotPath = screenshotPath;
        debugSession.debugHistory.push(`Screenshot taken of ${url}`);
        logToFile(`Screenshot saved to ${screenshotPath}`);

        return { path: screenshotPath, fileUuid };
      } finally {
        // Always close the page (the browser stays open for later requests);
        // otherwise every failed navigation would leak a page.
        await page.close().catch((closeError: unknown) => {
          logToFile(`Error closing page: ${String(closeError)}`);
        });
      }
    } catch (error: unknown) {
      logToFile(`Error taking screenshot: ${String(error)}`);
      throw new Error(`Failed to take screenshot of URL ${url}: ${describeError(error)}`);
    }
  }

  /**
   * Picks the screenshot to analyze: the most recent URL screenshot when one
   * exists, otherwise the fixed test screenshot in the Downloads folder.
   *
   * @param _customName - Currently unused; kept for call compatibility.
   * @throws Error when falling back to the test screenshot and it does not exist.
   */
  private async getTestScreenshot(_customName?: string): Promise<ScreenshotResult> {
    if (debugSession.lastScreenshotPath) {
      return { path: debugSession.lastScreenshotPath, fileUuid: randomUUID() };
    }

    try {
      await fsPromises.access(TEST_SCREENSHOT_PATH);
    } catch {
      throw new Error(`Test screenshot not found at ${TEST_SCREENSHOT_PATH}`);
    }
    return { path: TEST_SCREENSHOT_PATH, fileUuid: randomUUID() };
  }

  /**
   * Reads an inclusive, 1-indexed range of lines from a file.
   *
   * @returns The joined lines and how many were read.
   * @throws Error when the file is missing or the range is outside the file.
   */
  private async readFile(
    filePath: string,
    startLine: number,
    endLine: number
  ): Promise<{ content: string; lineCount: number }> {
    try {
      logToFile(`Reading file ${filePath} from line ${startLine} to ${endLine}`);

      const lines = await readFileLines(filePath);

      if (startLine < 1 || startLine > lines.length) {
        throw new Error(`Invalid start line: ${startLine}. File has ${lines.length} lines.`);
      }
      if (endLine < startLine || endLine > lines.length) {
        throw new Error(`Invalid end line: ${endLine}. File has ${lines.length} lines.`);
      }

      // Convert the 1-indexed inclusive range to slice's 0-indexed half-open range.
      const extractedLines = lines.slice(startLine - 1, endLine);
      return {
        content: extractedLines.join('\n'),
        lineCount: extractedLines.length
      };
    } catch (error: unknown) {
      logToFile(`Error reading file: ${String(error)}`);
      throw new Error(`Failed to read file: ${describeError(error)}`);
    }
  }

  /**
   * Replaces an inclusive, 1-indexed range of lines in a file with new content.
   * `startLine = lines + 1` appends, and `endLine = startLine - 1` inserts
   * without removing anything.
   *
   * @returns Success flag plus the number of lines replaced and lines written.
   * @throws Error when the file is missing or the range is invalid.
   */
  private async modifyFile(
    filePath: string,
    startLine: number,
    endLine: number,
    content: string
  ): Promise<{ success: boolean; linesModified: number; linesAdded: number }> {
    try {
      logToFile(`Modifying file ${filePath} from line ${startLine} to ${endLine}`);

      const lines = await readFileLines(filePath);

      if (startLine < 1 || startLine > lines.length + 1) {
        throw new Error(`Invalid start line: ${startLine}. File has ${lines.length} lines.`);
      }
      if (endLine < startLine - 1 || endLine > lines.length) {
        throw new Error(`Invalid end line: ${endLine}. File has ${lines.length} lines.`);
      }

      const newContentLines = content.split('\n');
      const modifiedLines = [
        ...lines.slice(0, startLine - 1),
        ...newContentLines,
        ...lines.slice(endLine),
      ];
      await fsPromises.writeFile(filePath, modifiedLines.join('\n'));

      return {
        success: true,
        linesModified: endLine - startLine + 1,
        linesAdded: newContentLines.length
      };
    } catch (error: unknown) {
      logToFile(`Error modifying file: ${String(error)}`);
      throw new Error(`Failed to modify file: ${describeError(error)}`);
    }
  }

  /**
   * Sends a PNG screenshot to the Gemini vision model and parses its answer.
   *
   * @param filepath - Path of the PNG image to analyze.
   * @returns The description and the UI elements found in the answer.
   * @throws Error when the API key is missing, the request fails, or the
   *   response contains no text part.
   */
  private async analyzeWithGemini(filepath: string): Promise<AnalysisResult> {
    try {
      if (!GEMINI_API_KEY) {
        throw new Error('GEMINI_API_KEY environment variable is not set');
      }

      const imageData = await fsPromises.readFile(filepath);
      const base64Image = imageData.toString('base64');

      const payload = {
        contents: [
          {
            parts: [
              { text: buildAnalysisPrompt() },
              {
                inline_data: {
                  mime_type: "image/png",
                  data: base64Image
                }
              }
            ]
          }
        ],
        generation_config: {
          temperature: 0.1,
          top_p: 1,
          top_k: 32,
          max_output_tokens: 8192, // Large enough for detailed per-element descriptions
        }
      };

      const response = await axios.post<GeminiResponse>(
        `${GEMINI_API_URL}?key=${GEMINI_API_KEY}`,
        payload,
        {
          headers: {
            'Content-Type': 'application/json'
          }
        }
      );

      const rawText = response.data?.candidates?.[0]?.content?.parts?.[0]?.text;
      if (typeof rawText !== 'string') {
        throw new Error('Failed to parse Gemini API response');
      }
      return parseAnalysisText(rawText);
    } catch (error: unknown) {
      logToFile(`Error analyzing with Gemini: ${String(error)}`);
      throw new Error(`Failed to analyze image: ${describeError(error)}`);
    }
  }

  /**
   * Builds a UI/UX report and writes it as JSON into the current working directory.
   *
   * @param appName - Application name; an empty string omits it from the title.
   * @param testUrl - URL of the application under test.
   * @param date - Report date; an empty string means today (UTC, YYYY-MM-DD).
   * @param observations - Free-form observations embedded in the report.
   * @returns The report object and the name of the file it was written to.
   */
  private async generateUIUXReport(
    appName: string,
    testUrl: string,
    date: string,
    observations: Record<string, unknown>
  ): Promise<{ success: boolean; report: Record<string, unknown>; reportFile: string }> {
    try {
      logToFile(`Generating UI/UX report for ${testUrl}`);

      const report = {
        title: `UI/UX Analysis Report${appName ? ` for ${appName}` : ''}`,
        date: date || new Date().toISOString().split('T')[0],
        testUrl,
        summary: "UI Analysis performed using Gemini Vision API",
        observations
      };

      const reportFileName = `uiux_report_${Date.now()}.json`;
      await fsPromises.writeFile(reportFileName, JSON.stringify(report, null, 2));

      return {
        success: true,
        report,
        reportFile: reportFileName
      };
    } catch (error: unknown) {
      logToFile(`Error generating report: ${String(error)}`);
      throw new Error(`Failed to generate report: ${describeError(error)}`);
    }
  }

  /**
   * Runs a tool handler and converts unexpected failures into
   * `McpError(InternalError)`. An `McpError` raised by the handler (for example
   * `InvalidParams`) is passed through unchanged so its error code is kept.
   */
  private async runTool(
    failureLabel: string,
    action: () => Promise<ToolTextResult>
  ): Promise<ToolTextResult> {
    try {
      return await action();
    } catch (error: unknown) {
      if (error instanceof McpError) {
        throw error;
      }
      throw new McpError(
        ErrorCode.InternalError,
        `${failureLabel}: ${describeError(error)}`
      );
    }
  }

  /** Handles `screenshot_url`: validates the URL and captures a screenshot. */
  private async handleScreenshotUrl(rawArgs: unknown): Promise<ToolTextResult> {
    const args = parseToolArguments(
      ScreenshotUrlRequestSchema,
      rawArgs,
      'Invalid screenshot parameters provided'
    );

    if (!args.url) {
      throw new McpError(ErrorCode.InvalidParams, 'URL is required');
    }
    try {
      new URL(args.url);
    } catch {
      throw new McpError(ErrorCode.InvalidParams, `Invalid URL format: ${args.url}`);
    }

    // `??` rather than `||` so that an explicit waitTime of 0 is respected.
    const screenshot = await this.screenshotUrl(
      args.url,
      args.fullPage ?? false,
      args.waitForSelector,
      args.waitTime ?? 1000
    );

    return textResult(
      `Screenshot captured successfully from URL: ${args.url}\nPath: ${screenshot.path}\n\nYou can now use 'analyze_screen' to analyze this screenshot.`
    );
  }

  /** Handles `analyze_screen`: analyzes the latest screenshot and lists its elements. */
  private async handleAnalyzeScreen(): Promise<ToolTextResult> {
    const screenshot = await this.getTestScreenshot();
    const results = await this.analyzeWithGemini(screenshot.path);

    debugSession.elements = results.elements;
    debugSession.debugHistory.push(`Screen analyzed: found ${results.elements.length} elements`);

    const formattedElements = results.elements.map(formatElementLine).join('\n');
    const responseText = `${results.description}\n\nClickable Elements (with coordinates [x, y, width, height]):\n${formattedElements}`;

    const debugInfo = debugSession.currentUrl
      ? `\n\nCurrent debug URL: ${debugSession.currentUrl}\nDebug session has ${debugSession.debugHistory.length} steps`
      : '\n\nNo active debugging session - use screenshot_url to start one';

    return textResult(responseText + debugInfo);
  }

  /** Handles `read_file`: returns the requested lines as text. */
  private async handleReadFile(rawArgs: unknown): Promise<ToolTextResult> {
    const args = parseToolArguments(
      ReadFileRequestSchema,
      rawArgs,
      'Invalid file read parameters provided'
    );
    const result = await this.readFile(args.path, args.startLine, args.endLine);
    // MCP text content must be a string, so return the file text itself.
    return textResult(result.content);
  }

  /** Handles `modify_file`: replaces the requested lines and reports the outcome as JSON. */
  private async handleModifyFile(rawArgs: unknown): Promise<ToolTextResult> {
    const args = parseToolArguments(
      ModifyFileRequestSchema,
      rawArgs,
      'Invalid file modification parameters provided'
    );
    const result = await this.modifyFile(args.path, args.startLine, args.endLine, args.content);
    return textResult(JSON.stringify(result, null, 2));
  }

  /** Handles `generate_report`: writes the report and returns it as JSON. */
  private async handleGenerateReport(rawArgs: unknown): Promise<ToolTextResult> {
    const args = parseToolArguments(
      GenerateReportRequestSchema,
      rawArgs,
      'Missing required parameters: testUrl and observations'
    );
    if (!args.testUrl) {
      throw new McpError(
        ErrorCode.InvalidParams,
        'Missing required parameters: testUrl and observations'
      );
    }
    const result = await this.generateUIUXReport(
      args.appName ?? '',
      args.testUrl,
      args.date ?? '',
      args.observations
    );
    return textResult(JSON.stringify(result, null, 2));
  }

  /** Registers the tool listing and the tool-call dispatcher on the server. */
  private setupToolHandlers(): void {
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: [
        {
          name: 'analyze_screen',
          description: 'Analyze a test screenshot with AI vision',
          inputSchema: {
            type: 'object',
            properties: {
              random_string: {
                type: 'string',
                description: 'Dummy parameter for no-parameter tools'
              }
            },
            required: []
          }
        },
        {
          name: 'screenshot_url',
          description: 'Take a screenshot of a URL using a web browser',
          inputSchema: {
            type: 'object',
            properties: {
              url: {
                type: 'string',
                description: 'URL to capture a screenshot of (e.g., http://localhost:4999, https://google.com)'
              },
              fullPage: {
                type: 'boolean',
                description: 'Whether to capture full page or just viewport. Default: false'
              },
              waitForSelector: {
                type: 'string',
                description: 'Optional CSS selector to wait for before taking screenshot'
              },
              waitTime: {
                type: 'number',
                description: 'Time to wait in milliseconds before taking screenshot. Default: 1000'
              }
            },
            required: ['url']
          }
        },
        {
          name: 'read_file',
          description: 'Read content from a file between specified line numbers',
          inputSchema: {
            type: 'object',
            properties: {
              path: { type: 'string', description: 'Path to the file' },
              startLine: { type: 'number', description: 'Starting line number (1-indexed)' },
              endLine: { type: 'number', description: 'Ending line number (1-indexed)' }
            },
            required: ['path', 'startLine', 'endLine']
          }
        },
        {
          name: 'modify_file',
          description: 'Modify content in a file between specified line numbers',
          inputSchema: {
            type: 'object',
            properties: {
              path: { type: 'string', description: 'Path to the file' },
              startLine: { type: 'number', description: 'Starting line number (1-indexed)' },
              endLine: { type: 'number', description: 'Ending line number (1-indexed)' },
              content: { type: 'string', description: 'New content to replace the specified lines' }
            },
            required: ['path', 'startLine', 'endLine', 'content']
          }
        },
        {
          name: 'generate_report',
          description: 'Generate a comprehensive UI/UX analysis report',
          inputSchema: {
            type: 'object',
            properties: {
              appName: { type: 'string', description: 'Name of the application being analyzed' },
              testUrl: { type: 'string', description: 'URL of the application being tested' },
              date: { type: 'string', description: 'Date of the analysis (YYYY-MM-DD)' },
              observations: {
                type: 'object',
                description: 'Observations structured as components, data state, interactions, etc.'
              }
            },
            required: ['testUrl', 'observations']
          }
        }
      ]
    }));

    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      const { name, arguments: args } = request.params;
      switch (name) {
        case 'screenshot_url':
          return this.runTool('Failed to capture screenshot', () => this.handleScreenshotUrl(args));
        case 'analyze_screen':
          return this.runTool('Failed to analyze screen', () => this.handleAnalyzeScreen());
        case 'read_file':
          return this.runTool('Failed to read file', () => this.handleReadFile(args));
        case 'modify_file':
          return this.runTool('Failed to modify file', () => this.handleModifyFile(args));
        case 'generate_report':
          return this.runTool('Failed to generate report', () => this.handleGenerateReport(args));
        default:
          throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${name}`);
      }
    });
  }

  /**
   * Prepares the temp directory and connects the server to the stdio transport.
   * Exits the process with code 1 when startup fails.
   */
  async run(): Promise<void> {
    try {
      await this.ensureTempDir();

      const transport = new StdioServerTransport();
      await this.server.connect(transport);

      logToFile('MCP Server started');
    } catch (error: unknown) {
      logToFile(`Failed to start MCP Server: ${String(error)}`);
      process.exit(1);
    }
  }
}

// Start the server
const server = new AIVisionDebugServer();
server.run().catch((error: unknown) => {
  logToFile(`Failed to run server: ${String(error)}`);
  process.exit(1);
});