/**
* MCP Tool: detect_objects_in_image
* Detects objects in images using AI vision models with bounding box annotations.
*/
import path from 'path';
import type { AnalysisOptions, VisionProvider } from '../types/Providers.js';
import { FileService } from '../services/FileService.js';
import type { Config } from '../types/Config.js';
import { VisionError } from '../types/Errors.js';
import { FUNCTION_NAMES } from '../constants/FunctionNames.js';
import type {
ObjectDetectionArgs,
ObjectDetectionResponse,
DetectedObject,
DetectionWithFile,
DetectionWithTempFile,
DetectionOnly,
ObjectDetectionMetadata,
} from '../types/ObjectDetection.js';
import { ImageAnnotator } from '../utils/imageAnnotator.js';
import sharp from 'sharp';
// System instruction for object detection with web context awareness
const DETECTION_SYSTEM_INSTRUCTION = `
You are a visual detection assistant that names detected objects based on image context.
STEP 1 - DETECT CONTEXT:
Determine whether the image represents a webpage.
Consider it a webpage if you detect multiple web indicators such as:
- Browser UI (tabs, address bar, navigation buttons)
- Web-style layouts (menus, grids, form layouts)
- HTML controls (inputs, buttons, dropdowns)
- Web fonts or text rendering
- Visible URL or webpage content
STEP 2 - NAME ELEMENTS:
- If the image appears to be a webpage → use HTML element names
(e.g., button, input, a, nav, header, section, h1-h6, p, img, video)
- Otherwise → use general object names based on visual meaning.
STEP 3 - OUTPUT FORMAT:
Return a valid JSON array (no text outside the JSON) where each item has the form:
{
"object": "<name based on context>",
"label": "<short description>",
"normalized_box_2d": [ymin, xmin, ymax, xmax] // normalized (0-1000)
}
Bounding box rules:
- Tightly fit visible area (exclude shadows/whitespace)
- Avoid overlap when separable
- Maintain ymin < ymax and xmin < xmax
- Differentiate duplicates by traits (e.g., color, position)
`;
// Detection schema equivalent to the one in gemini_object_detection.js.
// Google GenAI and Vertex AI currently accept the same JSON schema shape,
// so a single definition serves all providers (the parameter is kept for
// API compatibility with the call site).
const createDetectionSchema = (_provider: string) => ({
  type: 'array',
  items: {
    type: 'object',
    properties: {
      object: {
        type: 'string',
        description: 'Generic category for the detected object or element.',
      },
      label: {
        type: 'string',
        description: 'Descriptive label or instance-specific detail.',
      },
      normalized_box_2d: {
        type: 'array',
        minItems: 4,
        maxItems: 4,
        items: {
          type: 'integer',
        },
        description:
          'Bounding box coordinates [ymin, xmin, ymax, xmax], normalized to 0-1000',
      },
    },
    required: ['object', 'label', 'normalized_box_2d'],
  },
});
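// A model response satisfying this schema looks like (illustrative values only):
// [
//   { "object": "button", "label": "Submit button", "normalized_box_2d": [120, 40, 180, 200] }
// ]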
export type { ObjectDetectionArgs } from '../types/ObjectDetection.js';
/**
* Generate CSS selector suggestions based on detected object type
*/
function suggestCSSSelectors(detection: DetectedObject): string[] {
  // Note: `:has-text()` is a Playwright pseudo-class, not standard CSS, so these
  // suggestions target browser-automation tooling rather than document.querySelector.
  const selectors: string[] = [];
  const { object, label } = detection;
// HTML element-specific selectors for web contexts
if (object.startsWith('input[type=')) {
const inputType = object.match(/type="([^"]+)"/)?.[1];
if (inputType) {
selectors.push(`input[type="${inputType}"]`);
// Add name-based selector if label suggests a name
const nameHint = label
.toLowerCase()
.replace(/\s+/g, '_')
.replace(/[^a-z0-9_]/g, '');
if (nameHint) {
selectors.push(`input[name="${nameHint}"]`);
}
}
} else if (object === 'button') {
selectors.push('button[type="submit"]');
if (label) {
selectors.push(`button:has-text("${label.replace(/\s+button$/i, '')}")`);
}
} else if (object === 'select') {
selectors.push('select');
const nameHint = label
.toLowerCase()
.replace(/\s+/g, '_')
.replace(/[^a-z0-9_]/g, '');
if (nameHint) {
selectors.push(`select[name="${nameHint}"]`);
}
} else if (object === 'a') {
selectors.push('a');
if (label) {
selectors.push(`a:has-text("${label}")`);
}
  } else if (/^h[1-6]$/.test(object)) {
selectors.push(object);
if (label) {
selectors.push(`${object}:has-text("${label}")`);
}
} else if (
['nav', 'header', 'footer', 'main', 'section', 'article'].includes(object)
) {
selectors.push(object);
} else {
// Generic fallback for non-HTML elements
selectors.push(object);
}
return selectors.slice(0, 2); // Return top 2 suggestions
}
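// Illustration (hypothetical input, for documentation only):
//   suggestCSSSelectors({ object: 'button', label: 'Submit button', normalized_box_2d: [0, 0, 10, 10] })
//   // → ['button[type="submit"]', 'button:has-text("Submit")']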
/**
* Generate hybrid summary with CSS selectors and minimal coordinates
*/
function generateDetectionSummary(
detections: DetectedObject[],
imageMetadata: {
width: number;
height: number;
size_bytes: number;
format: string;
},
model: string,
provider: string
): string {
const summary = [];
// Header with image context
summary.push(`🖼️ IMAGE ANALYSIS COMPLETE\n`);
summary.push(
`📏 Source Image: ${imageMetadata.width}×${imageMetadata.height} pixels (${imageMetadata.format.toUpperCase()}, ${(imageMetadata.size_bytes / 1024 / 1024).toFixed(1)}MB)`
);
summary.push(`🤖 Detection Model: ${model} (${provider})`);
  summary.push(`📊 Elements Found: ${detections.length}\n`);
// Context-aware guidance based on detected elements
const webElements = [
'button',
'input',
'select',
'textarea',
'nav',
'header',
'footer',
'main',
'section',
'article',
'a',
'form',
'label',
'fieldset',
];
const hasWebElements = detections.some(d =>
webElements.some(
webEl => d.object === webEl || d.object.startsWith(webEl + '[')
)
);
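  // Matches both bare tag names ('button') and attribute-qualified forms
  // such as 'input[type="text"]' via the startsWith check.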
if (hasWebElements) {
// Show web automation guidance for web interfaces
summary.push(`⚠️ FOR WEB AUTOMATION:`);
summary.push(
`- **RECOMMENDED**: Use CSS selectors for reliable automation (primary approach)`
);
summary.push(
`- **REFERENCE ONLY**: Percentage coordinates for spatial context (secondary reference)`
);
summary.push(
`- **AVOID**: Direct coordinate-based clicking for automation`
);
summary.push(
`- **Technical Note**: Raw coordinates use normalized_box_2d format [ymin, xmin, ymax, xmax] on 0-1000 scale\n`
);
} else {
// Show general object detection guidance for non-web content
summary.push(`⚠️ OBJECT DETECTION RESULTS:`);
summary.push(
`- **SPATIAL REFERENCE**: Coordinates show relative positioning within image`
);
summary.push(
`- **COORDINATE FORMAT**: normalized_box_2d format [ymin, xmin, ymax, xmax] on 0-1000 scale\n`
);
}
// Element details with hybrid format
summary.push(`## 🔍 DETECTED ELEMENTS:\n`);
detections.forEach((detection, index) => {
const [ymin, xmin, ymax, xmax] = detection.normalized_box_2d;
    // Convert normalized 0-1000 coordinates to percentages (divide by 10)
    const centerX = (xmin + xmax) / 2 / 10; // center X as % of image width
    const centerY = (ymin + ymax) / 2 / 10; // center Y as % of image height
    const widthPercent = (xmax - xmin) / 10; // box width as % of image width
    const heightPercent = (ymax - ymin) / 10; // box height as % of image height
// Calculate pixel coordinates
const pixelBox = {
x: Math.round((xmin / 1000) * imageMetadata.width),
y: Math.round((ymin / 1000) * imageMetadata.height),
width: Math.round(((xmax - xmin) / 1000) * imageMetadata.width),
height: Math.round(((ymax - ymin) / 1000) * imageMetadata.height),
centerX: Math.round(((xmin + xmax) / 2 / 1000) * imageMetadata.width),
centerY: Math.round(((ymin + ymax) / 2 / 1000) * imageMetadata.height),
};
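    // Worked example (hypothetical values): for a 1920×1080 image and
    // normalized_box_2d = [100, 200, 300, 400]:
    //   x = round(200/1000 × 1920) = 384, y = round(100/1000 × 1080) = 108,
    //   width = 384, height = 216, center = (576, 216)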
// Element header
summary.push(`### ${index + 1}. ${detection.object} - ${detection.label}`);
// Only show automation guidance for web elements
const isWebElement = webElements.some(
webEl =>
detection.object === webEl || detection.object.startsWith(webEl + '[')
);
if (isWebElement) {
// Generate CSS selector suggestions for web elements
const selectors = suggestCSSSelectors(detection);
summary.push(
`- **Automation**: ${selectors.map(s => `\`${s}\``).join(' or ')}`
);
}
// Always show position for spatial reference
summary.push(
`- **Position**: ${centerX.toFixed(1)}% across, ${centerY.toFixed(1)}% down (${widthPercent.toFixed(1)}% × ${heightPercent.toFixed(1)}% size)`
);
// Always show pixel information
summary.push(
`- **Pixels**: ${pixelBox.width}×${pixelBox.height} at (${pixelBox.x}, ${pixelBox.y}), center (${pixelBox.centerX}, ${pixelBox.centerY})\n`
);
});
return summary.join('\n');
}
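/**
 * Detects objects in an image, draws bounding-box annotations, and returns
 * the detections plus a human-readable summary.
 *
 * Hypothetical usage sketch (the config/provider/service wiring depends on the host app):
 * @example
 *   const result = await detect_objects_in_image(
 *     { imageSource: '/tmp/screenshot.png', prompt: 'Detect all interactive elements' },
 *     config,
 *     provider,
 *     fileService
 *   );
 *   console.log(result.summary);
 */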
export async function detect_objects_in_image(
args: ObjectDetectionArgs,
config: Config,
imageProvider: VisionProvider,
imageFileService: FileService
): Promise<ObjectDetectionResponse> {
try {
// Validate arguments
if (!args.imageSource) {
throw new VisionError('imageSource is required', 'MISSING_ARGUMENT');
}
if (!args.prompt) {
throw new VisionError('prompt is required', 'MISSING_ARGUMENT');
}
// Handle image source (URL vs local file vs base64)
const processedImageSource = await imageFileService.handleImageSource(
args.imageSource
);
console.log(
`[detect_objects_in_image] Processed image source: ${processedImageSource.substring(
0,
100
)}${processedImageSource.length > 100 ? '...' : ''}`
);
    // Get original image buffer for annotation (dimensions are read afterwards)
    let originalImageBuffer: Buffer;
    if (args.imageSource.startsWith('data:image/')) {
      // Base64 data URI, e.g. 'data:image/png;base64,iVBORw0...'
      const base64Data = args.imageSource.split(',')[1];
      if (!base64Data) {
        throw new VisionError(
          'Malformed data URI: missing base64 payload',
          'INVALID_IMAGE'
        );
      }
      originalImageBuffer = Buffer.from(base64Data, 'base64');
} else if (args.imageSource.startsWith('http')) {
// URL - fetch the image
const response = await fetch(args.imageSource);
if (!response.ok) {
throw new VisionError(
`Failed to fetch image from URL: ${response.statusText}`,
'FETCH_ERROR'
);
}
originalImageBuffer = Buffer.from(await response.arrayBuffer());
} else {
// Local file path
originalImageBuffer = await imageFileService.readFile(args.imageSource);
}
    // Get image dimensions using Sharp
    const sharpMetadata = await sharp(originalImageBuffer).metadata();
    const imageWidth = sharpMetadata.width || 0;
    const imageHeight = sharpMetadata.height || 0;
if (imageWidth === 0 || imageHeight === 0) {
throw new VisionError(
'Unable to determine image dimensions',
'INVALID_IMAGE'
);
}
console.log(
`[detect_objects_in_image] Image size: ${imageWidth}x${imageHeight}`
);
// Use the provided prompt as the detection query
const detectionPrompt = args.prompt;
    // Merge defaults with user options. Precedence (highest first):
    // args.options → *_FOR_DETECT_OBJECTS_IN_IMAGE → *_FOR_IMAGE → global config.
const options: AnalysisOptions = {
temperature:
config.TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE ??
config.TEMPERATURE_FOR_IMAGE ??
config.TEMPERATURE,
topP:
config.TOP_P_FOR_DETECT_OBJECTS_IN_IMAGE ??
config.TOP_P_FOR_IMAGE ??
config.TOP_P,
topK:
config.TOP_K_FOR_DETECT_OBJECTS_IN_IMAGE ??
config.TOP_K_FOR_IMAGE ??
config.TOP_K,
maxTokens:
config.MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE ??
config.MAX_TOKENS_FOR_IMAGE ??
config.MAX_TOKENS,
taskType: 'image',
functionName: FUNCTION_NAMES.DETECT_OBJECTS_IN_IMAGE,
// Add structured output configuration for object detection
responseSchema: createDetectionSchema(config.IMAGE_PROVIDER),
// Add system instruction to guide the model's behavior
systemInstruction: DETECTION_SYSTEM_INSTRUCTION,
...args.options, // User options override defaults
};
console.log(
'[detect_objects_in_image] Analyzing image for object detection...'
);
console.log(
`[detect_objects_in_image] Configuration: temperature=${options.temperature}, topP=${options.topP}, topK=${options.topK}, maxTokens=${options.maxTokens}`
);
// Analyze the image for object detection
const result = await imageProvider.analyzeImage(
processedImageSource,
detectionPrompt,
options
);
console.log(
`[detect_objects_in_image] Response length: ${result.text.length} characters`
);
console.log(
`[detect_objects_in_image] Response ends with: "${result.text.slice(-50)}"`
);
// Parse detection results
let detections: DetectedObject[];
try {
// Try to parse the result directly
detections = JSON.parse(result.text);
} catch (parseError) {
console.log(
`[detect_objects_in_image] Initial JSON parse failed, attempting cleanup...`
);
console.log(
`[detect_objects_in_image] Raw response (first 500 chars): ${result.text.substring(0, 500)}`
);
console.log(
`[detect_objects_in_image] Full response length: ${result.text.length} characters`
);
// Try to extract JSON from markdown code blocks if present
let cleanedText = result.text.trim();
// Remove markdown code fences if present
if (cleanedText.startsWith('```')) {
const lines = cleanedText.split('\n');
// Remove first line (```json or ```)
lines.shift();
        // Remove last line if it's the closing fence (guard against an empty array)
        if (lines.length > 0 && lines[lines.length - 1].trim() === '```') {
lines.pop();
}
cleanedText = lines.join('\n').trim();
}
// Try parsing the cleaned text
try {
detections = JSON.parse(cleanedText);
console.log(
`[detect_objects_in_image] Successfully parsed after cleanup`
);
} catch (secondError) {
console.error(
`[detect_objects_in_image] Failed to parse even after cleanup`
);
console.error(
`[detect_objects_in_image] Cleaned text (first 1000 chars): ${cleanedText.substring(0, 1000)}`
);
// Try to fix truncated JSON arrays
let fixedText = cleanedText;
// Check if the JSON looks like a truncated array
if (cleanedText.startsWith('[') && !cleanedText.endsWith(']')) {
console.log(
`[detect_objects_in_image] Attempting to fix truncated JSON array...`
);
// Find the last complete object by looking for the last complete "},"
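          // Illustration (hypothetical truncated response):
          //   '[{"object":"a",...},{"object":"b'  →  '[{"object":"a",...}\n]'
          // Any incomplete trailing object is discarded.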
const lastCompleteObjectIndex = cleanedText.lastIndexOf('},');
if (lastCompleteObjectIndex > 0) {
// Truncate at the last complete object and close the array
fixedText =
cleanedText.substring(0, lastCompleteObjectIndex + 1) + '\n]';
console.log(
`[detect_objects_in_image] Fixed text ends with: "${fixedText.slice(-100)}"`
);
try {
detections = JSON.parse(fixedText);
console.log(
`[detect_objects_in_image] Successfully parsed truncated JSON after fix. Objects found: ${detections.length}`
);
} catch (thirdError) {
console.error(
`[detect_objects_in_image] Failed to parse even after fixing truncated JSON`
);
throw new VisionError(
              `Failed to parse detection results as JSON (response appears truncated): ${parseError instanceof Error ? parseError.message : String(parseError)}. Raw response (first 500 chars): ${result.text.substring(0, 500)}. Consider increasing the maxTokens parameter.`,
'PARSE_ERROR',
config.IMAGE_PROVIDER,
parseError instanceof Error ? parseError : undefined
);
}
} else {
throw new VisionError(
            `Failed to parse detection results as JSON (response appears truncated): ${parseError instanceof Error ? parseError.message : String(parseError)}. Raw response (first 500 chars): ${result.text.substring(0, 500)}. Consider increasing the maxTokens parameter.`,
'PARSE_ERROR',
config.IMAGE_PROVIDER,
parseError instanceof Error ? parseError : undefined
);
}
} else {
throw new VisionError(
`Failed to parse detection results as JSON: ${parseError instanceof Error ? parseError.message : String(parseError)}. Raw response (first 500 chars): ${result.text.substring(0, 500)}`,
'PARSE_ERROR',
config.IMAGE_PROVIDER,
parseError instanceof Error ? parseError : undefined
);
}
}
}
    // Guard against non-array payloads (e.g., a single object or an error message)
    if (!Array.isArray(detections)) {
      throw new VisionError(
        `Detection results are not a JSON array. Raw response (first 500 chars): ${result.text.substring(0, 500)}`,
        'PARSE_ERROR',
        config.IMAGE_PROVIDER
      );
    }
    console.log(
      `[detect_objects_in_image] Detected ${detections.length} objects`
    );
// Validate and filter detections with valid normalized coordinates
const processedDetections = detections
.map((detection: any) => {
if (
!detection.normalized_box_2d ||
!Array.isArray(detection.normalized_box_2d) ||
detection.normalized_box_2d.length !== 4
) {
console.warn(
`[detect_objects_in_image] Skipping detection with invalid coordinates: ${JSON.stringify(detection)}`
);
return null;
}
const [normY1, normX1, normY2, normX2] = detection.normalized_box_2d;
// Validate coordinate ranges (should be 0-1000)
if (
normY1 < 0 ||
normX1 < 0 ||
normY2 > 1000 ||
normX2 > 1000 ||
normY1 >= normY2 ||
normX1 >= normX2
) {
console.warn(
`[detect_objects_in_image] Skipping detection with invalid coordinate ranges: ${detection.object} [${normY1}, ${normX1}, ${normY2}, ${normX2}]`
);
return null;
}
// Return simplified detection object
return {
object: detection.object,
label: detection.label,
normalized_box_2d: detection.normalized_box_2d,
};
})
.filter(Boolean) as DetectedObject[];
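    // Example of a rejected detection (illustrative): normalized_box_2d = [500, 300, 400, 600]
    // is dropped by the range check above because ymin (500) is not less than ymax (400).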
// Draw annotations on image
const annotator = new ImageAnnotator();
const annotatedImageBuffer = await annotator.drawAnnotations(
originalImageBuffer,
processedDetections,
imageWidth,
imageHeight
);
const annotatedImageSize = annotatedImageBuffer.length;
// Determine output format from original image
const outputFormat = sharpMetadata.format || 'png';
console.log(
`[detect_objects_in_image] Annotated image size: ${annotatedImageSize} bytes`
);
// Generate human-readable text summary with percentage coordinates
const imageMetadata = {
width: imageWidth,
height: imageHeight,
size_bytes: originalImageBuffer.length,
format: outputFormat,
};
    const summary = generateDetectionSummary(
      processedDetections,
      imageMetadata,
      result.metadata?.model || 'AI Vision Model', // fall back to a generic name when the provider omits the model
      config.IMAGE_PROVIDER
    );
console.log(
`[detect_objects_in_image] Generated text summary (${summary.length} characters)`
);
// Create enhanced metadata from result
const detectionMetadata: ObjectDetectionMetadata = {
model: result.metadata?.model || 'unknown',
provider: result.metadata?.provider || config.IMAGE_PROVIDER,
usage: result.metadata?.usage,
processingTime: result.metadata?.processingTime || 0,
fileType: 'image/' + outputFormat,
fileSize: originalImageBuffer.length,
modelVersion: result.metadata?.modelVersion,
responseId: result.metadata?.responseId,
fileSaveStatus: 'saved', // Default, will be overridden if file save fails
};
    // Output handling: save to the explicit path if given; otherwise fall back
    // to a temp file, and return detection data alone if that also fails.
    if (args.outputFilePath) {
      // Case 1: Explicit outputFilePath provided → save to that exact path
await annotator.saveToExplicitPath(
args.outputFilePath,
annotatedImageBuffer
);
console.log(
`[detect_objects_in_image] Annotated image saved to: ${args.outputFilePath}`
);
const response: DetectionWithFile = {
detections: processedDetections,
file: {
path: path.resolve(args.outputFilePath),
size_bytes: annotatedImageSize,
format: outputFormat,
},
image_metadata: {
width: imageWidth,
height: imageHeight,
original_size: originalImageBuffer.length,
},
summary: summary,
metadata: detectionMetadata,
};
return response;
} else {
      // Case 2: No explicit path → try a temp file; skip file output on permission error
const saveResult = await annotator.saveToTempFileOrSkip(
annotatedImageBuffer,
outputFormat
);
if (saveResult.method === 'temp_file') {
// Success: Return temp file response
        console.log(
          `[detect_objects_in_image] Image saved to temp: ${saveResult.path}`
        );
const response: DetectionWithTempFile = {
detections: processedDetections,
tempFile: {
path: saveResult.path,
size_bytes: annotatedImageSize,
format: outputFormat,
},
image_metadata: {
width: imageWidth,
height: imageHeight,
original_size: originalImageBuffer.length,
},
summary: summary,
metadata: detectionMetadata,
};
return response;
} else {
// Permission error: Return detection data only with updated metadata
        console.warn(
          `[detect_objects_in_image] Returning detection results without file output due to permission error.`
        );
const response: DetectionOnly = {
detections: processedDetections,
image_metadata: {
width: imageWidth,
height: imageHeight,
original_size: originalImageBuffer.length,
},
summary: summary,
metadata: {
...detectionMetadata,
fileSaveStatus: 'skipped_due_to_permissions',
},
};
return response;
}
}
} catch (error) {
console.error('Error in detect_objects_in_image tool:', error);
if (error instanceof VisionError) {
throw error;
}
throw new VisionError(
`Failed to detect objects in image: ${error instanceof Error ? error.message : String(error)}`,
'DETECTION_ERROR',
config.IMAGE_PROVIDER,
error instanceof Error ? error : undefined
);
}
}