detect-all-objects
Analyze images to detect, count, and locate all objects with detailed descriptions using the DINO-X Image Detection MCP Server.
Instructions
Analyze an image to detect all identifiable objects, returning the category, count, coordinate positions and detailed descriptions for each object.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| imageFileUri | Yes | URI of the input image. Preferred for remote or local files. Must start with 'https://' or 'file://'. | |
| includeDescription | Yes | Whether to return a description of the objects detected in the image, but will take longer to process. | |
Implementation Reference
/**
 * src/dino-x/client.ts:203-217 (handler)
 *
 * Core handler implementation in DinoXApiClient that performs the actual API
 * call to the DINO-X service for detecting all objects using a universal
 * prompt.
 *
 * @param imageFileUri URI of the image to analyze; forwarded unchanged to
 *   `performDetection`.
 * @param includeDescription Forwarded unchanged to `performDetection`.
 * @returns Whatever `performDetection` resolves to for this request.
 */
async detectAllObjects(
  imageFileUri: string,
  includeDescription: boolean
): Promise<DetectionResult> {
  // The request literal is kept inline so it is type-checked directly against
  // the parameter type of performDetection.
  const pendingDetection = this.performDetection(imageFileUri, includeDescription, {
    model: "DINO-X-1.0",
    // Universal prompt: no caller-supplied category list is sent.
    prompt: { type: "universal", universal: 1 },
    targets: ["bbox"],
    // Thresholds are fixed here and passed through to the API as-is.
    bbox_threshold: 0.25,
    iou_threshold: 0.8,
  });

  return pendingDetection;
}
/**
 * src/servers/stdio-server.ts:120-195 (registration)
 *
 * Registers the 'detect-all-objects' tool in the STDIO MCP server, including
 * the input schema (Zod) and the execution handler that calls the
 * DinoXApiClient.
 *
 * On success the handler answers with three text items: a per-category count
 * summary, the per-object details as pretty-printed JSON, and a note that
 * explains the bbox coordinate format.
 *
 * On failure (missing URI, or any error thrown while detecting) it answers
 * with a single text item and `isError: true`, so MCP clients can distinguish
 * a failed call from a genuine detection result.
 */
private registerDetectAllObjectsTool(): void {
  const { name, description } = ToolConfigs[Tool.DETECT_ALL_OBJECTS];

  // Single place that builds the tool-level error result.
  const failure = (text: string) => ({
    content: [{ type: 'text' as const, text }],
    isError: true,
  });

  this.server.tool(
    name,
    description,
    {
      imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://' or 'file://'."),
      includeDescription: z.boolean().describe("Whether to return a description of the objects detected in the image, but will take longer to process."),
    },
    async (args) => {
      try {
        const { imageFileUri, includeDescription } = args;

        // z.string() accepts the empty string, so reject it explicitly.
        if (!imageFileUri) {
          return failure('Image file URI is required');
        }

        const { objects } = await this.api.detectAllObjects(imageFileUri, includeDescription);

        // Count per category in a Map rather than a plain object: a category
        // whose name matches an Object.prototype member (e.g. "constructor")
        // would otherwise be treated as already present and break counting.
        const countsByCategory = new Map<string, number>();
        for (const { category } of objects) {
          countsByCategory.set(category, (countsByCategory.get(category) ?? 0) + 1);
        }
        const summary = Array.from(
          countsByCategory,
          ([category, count]) => `${category} (${count})`
        ).join(', ');

        // The caption is only exposed when the caller asked for descriptions.
        const objectsInfo = objects.map((obj) => ({
          name: obj.category,
          bbox: parseBbox(obj.bbox),
          ...(includeDescription ? { description: obj.caption } : {}),
        }));

        return {
          content: [
            {
              type: "text",
              text: `Objects detected in image: ${summary}.`,
            },
            {
              type: "text",
              text: `Detailed object detection results: ${JSON.stringify(objectsInfo, null, 2)}`,
            },
            {
              type: "text",
              text: `Note: The bbox coordinates are in [xmin, ymin, xmax, ymax] format, where the origin (0,0) is at the top-left corner of the image. These coordinates help determine the exact position and spatial relationships of objects in the image.`,
            },
          ],
        };
      } catch (error) {
        return failure(
          `Failed to detect objects from image: ${error instanceof Error ? error.message : String(error)}`
        );
      }
    }
  );
}
/**
 * src/servers/http-server.ts:153-227 (registration)
 *
 * Registers the 'detect-all-objects' tool in the HTTP MCP server, including
 * the input schema (Zod) and the execution handler that calls the
 * DinoXApiClient.
 *
 * On success the handler answers with three text items: a per-category count
 * summary, the per-object details as pretty-printed JSON, and a note that
 * explains the bbox coordinate format.
 *
 * On failure (missing URI, or any error thrown while detecting) it answers
 * with a single text item and `isError: true`, so MCP clients can distinguish
 * a failed call from a genuine detection result.
 */
private registerDetectAllObjectsTool(): void {
  const { name, description } = ToolConfigs[Tool.DETECT_ALL_OBJECTS];

  // Single place that builds the tool-level error result.
  const failure = (text: string) => ({
    content: [{ type: 'text' as const, text }],
    isError: true,
  });

  this.server.tool(
    name,
    description,
    {
      imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://'."),
      includeDescription: z.boolean().describe("Whether to return a description of the objects detected in the image, but will take longer to process."),
    },
    async (args) => {
      try {
        const { imageFileUri, includeDescription } = args;

        // z.string() accepts the empty string, so reject it explicitly.
        if (!imageFileUri) {
          return failure('Image file URI is required');
        }

        const { objects } = await this.api.detectAllObjects(imageFileUri, includeDescription);

        // Count per category in a Map rather than a plain object: a category
        // whose name matches an Object.prototype member (e.g. "constructor")
        // would otherwise be treated as already present and break counting.
        const countsByCategory = new Map<string, number>();
        for (const { category } of objects) {
          countsByCategory.set(category, (countsByCategory.get(category) ?? 0) + 1);
        }
        const summary = Array.from(
          countsByCategory,
          ([category, count]) => `${category} (${count})`
        ).join(', ');

        // The caption is only exposed when the caller asked for descriptions.
        const objectsInfo = objects.map((obj) => ({
          name: obj.category,
          bbox: parseBbox(obj.bbox),
          ...(includeDescription ? { description: obj.caption } : {}),
        }));

        return {
          content: [
            {
              type: "text",
              text: `Objects detected in image: ${summary}.`,
            },
            {
              type: "text",
              text: `Detailed object detection results: ${JSON.stringify(objectsInfo, null, 2)}`,
            },
            {
              type: "text",
              text: `Note: The bbox coordinates are in [xmin, ymin, xmax, ymax] format, where the origin (0,0) is at the top-left corner of the image. These coordinates help determine the exact position and spatial relationships of objects in the image.`,
            },
          ],
        };
      } catch (error) {
        return failure(
          `Failed to detect objects from image: ${error instanceof Error ? error.message : String(error)}`
        );
      }
    }
  );
}
- src/constants/tool.ts:16-19 (schema) Tool schema definition: name and description used across registrations.
  ```ts
  [Tool.DETECT_ALL_OBJECTS]: { name: Tool.DETECT_ALL_OBJECTS, description: "Analyze an image to detect all identifiable objects, returning the category, count, coordinate positions and detailed descriptions for each object.", },
  ```
/**
 * src/utils/index.ts:82-89 (helper)
 *
 * Helper utility to parse a bounding box array [xmin, ymin, xmax, ymax] into
 * a named object, used in tool response formatting. Each coordinate is
 * rounded to one decimal place.
 *
 * @param bbox Coordinates in [xmin, ymin, xmax, ymax] order. Only the first
 *   four entries are read; any extra entries are ignored.
 * @returns An object with `xmin`, `ymin`, `xmax` and `ymax`, each rounded to
 *   one decimal place.
 * @throws TypeError if fewer than four values are supplied.
 */
export const parseBbox = (bbox: number[]) => {
  // Validate up front so a malformed box fails with a message that names the
  // expected layout instead of an opaque "reading 'toFixed'" error.
  const valueCount = Array.isArray(bbox) ? bbox.length : 0;
  if (valueCount < 4) {
    throw new TypeError(
      `parseBbox expects [xmin, ymin, xmax, ymax] but received ${valueCount} value(s)`
    );
  }

  // toFixed(1) rounds to one decimal as a string; parseFloat turns it back
  // into a number and drops a trailing ".0".
  const [xmin, ymin, xmax, ymax] = bbox
    .slice(0, 4)
    .map((value) => parseFloat(value.toFixed(1)));

  return { xmin, ymin, xmax, ymax };
};