object-detection-by-text
Detect and count objects in an image using a text prompt, providing detailed descriptions and 2D coordinates for precise visual analysis.
Instructions
Analyze an image based on a text prompt to identify and count specific objects, and return detailed descriptions of the objects and their 2D coordinates.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| imageFileUri | Yes | URI of the input image. Preferred for remote or local files. Must start with "https://" or "file://". | |
| includeDescription | Yes | Whether to return a description of each object detected in the image. Enabling this makes processing take longer. | |
| textPrompt | Yes | Nouns of target objects (English only, avoid adjectives). Use periods to separate multiple categories (e.g., 'person.car.traffic light'). | |
Implementation Reference
- src/constants/tool.ts:20-22 (registration) Tool name definition and metadata (name and description) used for registration and schema: [Tool.DETECT_BY_TEXT]: { name: Tool.DETECT_BY_TEXT, description: "Analyze an image based on a text prompt to identify and count specific objects, and return detailed descriptions of the objects and their 2D coordinates.",
- src/dino-x/client.ts:186-201 (handler) Core handler function in DinoXApiClient that performs object detection by text prompt using the DINO-X API: async detectObjectsByText( imageFileUri: string, textPrompt: string, includeDescription: boolean ): Promise<DetectionResult> { return this.performDetection(imageFileUri, includeDescription, { model: "DINO-X-1.0", prompt: { type: "text", text: textPrompt }, targets: ["bbox"], bbox_threshold: 0.25, iou_threshold: 0.8 }); }
- src/servers/http-server.ts:75-151 (registration) MCP tool registration including schema (Zod), description, and wrapper handler for HTTP server transport: private registerDetectByTextTool(): void { const { name, description } = ToolConfigs[Tool.DETECT_BY_TEXT]; this.server.tool( name, description, { imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://.'"), textPrompt: z.string().describe("Nouns of target objects (English only, avoid adjectives). Use periods to separate multiple categories (e.g., 'person.car.traffic light')."), includeDescription: z.boolean().describe("Whether to return a description of the objects detected in the image, but will take longer to process."), }, async (args) => { try { const { imageFileUri, textPrompt, includeDescription } = args; if (!imageFileUri || !textPrompt) { return { content: [ { type: 'text', text: 'Image file URI and text prompt are required', }, ], } } const { objects } = await this.api.detectObjectsByText(imageFileUri, textPrompt, includeDescription); const categories: ResultCategory = {}; for (const object of objects) { if (!categories[object.category]) { categories[object.category] = []; } categories[object.category].push(object); } const objectsInfo = objects.map(obj => { const bbox = parseBbox(obj.bbox); return { name: obj.category, bbox, ...(includeDescription ? { description: obj.caption, } : {}), } }); return { content: [ { type: "text", text: `Objects detected in image: ${Object.keys(categories).map(cat => `${cat} (${categories[cat].length})` )?.join(', ')}.` }, { type: "text", text: `Detailed object detection results: ${JSON.stringify((objectsInfo), null, 2)}` }, { type: "text", text: `Note: The bbox coordinates are in {xmin, ymin, xmax, ymax} format, where the origin (0,0) is at the top-left corner of the image. 
These coordinates help determine the exact position and spatial relationships of objects in the image.` }, ] }; } catch (error) { return { content: [ { type: 'text', text: `Failed to detect objects from image: ${error instanceof Error ? error.message : String(error)}`, }, ], }; } } ) }
- src/servers/stdio-server.ts:42-117 (registration) MCP tool registration including schema (Zod), description, and wrapper handler for STDIO server transport: private registerDetectByTextTool(): void { const { name, description } = ToolConfigs[Tool.DETECT_BY_TEXT]; this.server.tool( name, description, { imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://' or 'file://'."), textPrompt: z.string().describe("Nouns of target objects (English only, avoid adjectives). Use periods to separate multiple categories (e.g., 'person.car.traffic light')."), includeDescription: z.boolean().describe("Whether to return a description of the objects detected in the image, but will take longer to process."), }, async (args) => { try { const { imageFileUri, textPrompt, includeDescription } = args; if (!imageFileUri || !textPrompt) { return { content: [ { type: 'text', text: 'Image file URI and text prompt are required', }, ], } } const { objects } = await this.api.detectObjectsByText(imageFileUri, textPrompt, includeDescription); const categories: ResultCategory = {}; for (const object of objects) { if (!categories[object.category]) { categories[object.category] = []; } categories[object.category].push(object); } const objectsInfo = objects.map(obj => { const bbox = parseBbox(obj.bbox); return { name: obj.category, bbox, ...(includeDescription ? { description: obj.caption, } : {}), } }); return { content: [ { type: "text", text: `Objects detected in image: ${Object.keys(categories).map(cat => `${cat} (${categories[cat].length})` )?.join(', ')}.` }, { type: "text", text: `Detailed object detection results: ${JSON.stringify((objectsInfo), null, 2)}` }, { type: "text", text: `Note: The bbox coordinates are in {xmin, ymin, xmax, ymax} format, where the origin (0,0) is at the top-left corner of the image. 
These coordinates help determine the exact position and spatial relationships of objects in the image.` }, ] }; } catch (error) { return { content: [ { type: 'text', text: `Failed to detect objects from image: ${error instanceof Error ? error.message : String(error)}`, }, ], }; } } )
- src/utils/index.ts:82-89 (helper) Utility function to parse and format bounding box coordinates used in tool response formatting: export const parseBbox = (bbox: number[]) => { return { xmin: parseFloat(bbox[0].toFixed(1)), ymin: parseFloat(bbox[1].toFixed(1)), xmax: parseFloat(bbox[2].toFixed(1)), ymax: parseFloat(bbox[3].toFixed(1)) }; };