DINO-X Image Detection MCP Server (IDEA-Research)

stdio-server.ts (12 kB)
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { DinoXApiClient } from "../dino-x/client.js";
import { BASE_URL, DEFAULT_TIMEOUT, Tool, ToolConfigs } from "../constants/tool.js";
import { z } from "zod";
import { getDefaultImageStorageDirectory, parseBbox, parsePoseKeypoints, visualizeDetections } from "../utils/index.js";
import { Config, ResultCategory } from "../types/index.js";

/**
 * MCP Server class for managing DINO-X MCP server configuration and functionality.
 * Specifically designed for STDIO transport mode.
 */
export class MCPStdioServer {
  private server!: McpServer;
  private config!: Config;
  private api!: DinoXApiClient;

  constructor(customConfig: Config) {
    this.config = customConfig;
    this.api = new DinoXApiClient({
      apiKey: this.config.apiKey!,
      baseUrl: BASE_URL,
      timeout: DEFAULT_TIMEOUT,
    });
    this.server = new McpServer({
      name: "dinox-mcp-server",
      version: "1.0.0",
    });
    this.registerTools();
  }

  private registerTools(): void {
    this.registerDetectByTextTool();
    this.registerDetectAllObjectsTool();
    this.registerDetectHumanPoseKeypointsTool();
    this.registerVisualizeDetectionResultTool();
  }

  /** Detects only the object categories named in a text prompt. */
  private registerDetectByTextTool(): void {
    const { name, description } = ToolConfigs[Tool.DETECT_BY_TEXT];
    this.server.tool(
      name,
      description,
      {
        imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://' or 'file://'."),
        textPrompt: z.string().describe("Nouns of target objects (English only, avoid adjectives). Use periods to separate multiple categories (e.g., 'person.car.traffic light')."),
        includeDescription: z.boolean().describe("Whether to return a description of the objects detected in the image, but will take longer to process."),
      },
      async (args) => {
        try {
          const { imageFileUri, textPrompt, includeDescription } = args;
          if (!imageFileUri || !textPrompt) {
            return {
              content: [
                {
                  type: "text",
                  text: "Image file URI and text prompt are required",
                },
              ],
            };
          }
          const { objects } = await this.api.detectObjectsByText(imageFileUri, textPrompt, includeDescription);
          // Group detections by category for the per-category counts in the summary.
          const categories: ResultCategory = {};
          for (const object of objects) {
            if (!categories[object.category]) {
              categories[object.category] = [];
            }
            categories[object.category].push(object);
          }
          const objectsInfo = objects.map((obj) => {
            const bbox = parseBbox(obj.bbox);
            return {
              name: obj.category,
              bbox,
              ...(includeDescription ? { description: obj.caption } : {}),
            };
          });
          return {
            content: [
              {
                type: "text",
                text: `Objects detected in image: ${Object.keys(categories).map((cat) => `${cat} (${categories[cat].length})`).join(", ")}.`,
              },
              {
                type: "text",
                text: `Detailed object detection results: ${JSON.stringify(objectsInfo, null, 2)}`,
              },
              {
                type: "text",
                text: `Note: The bbox coordinates are in {xmin, ymin, xmax, ymax} format, where the origin (0,0) is at the top-left corner of the image. These coordinates help determine the exact position and spatial relationships of objects in the image.`,
              },
            ],
          };
        } catch (error) {
          return {
            content: [
              {
                type: "text",
                text: `Failed to detect objects from image: ${error instanceof Error ? error.message : String(error)}`,
              },
            ],
          };
        }
      }
    );
  }

  /** Detects all recognizable objects in the image without a text prompt. */
  private registerDetectAllObjectsTool(): void {
    const { name, description } = ToolConfigs[Tool.DETECT_ALL_OBJECTS];
    this.server.tool(
      name,
      description,
      {
        imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://' or 'file://'."),
        includeDescription: z.boolean().describe("Whether to return a description of the objects detected in the image, but will take longer to process."),
      },
      async (args) => {
        try {
          const { imageFileUri, includeDescription } = args;
          if (!imageFileUri) {
            return {
              content: [
                {
                  type: "text",
                  text: "Image file URI is required",
                },
              ],
            };
          }
          const { objects } = await this.api.detectAllObjects(imageFileUri, includeDescription);
          // Group detections by category for the per-category counts in the summary.
          const categories: ResultCategory = {};
          for (const object of objects) {
            if (!categories[object.category]) {
              categories[object.category] = [];
            }
            categories[object.category].push(object);
          }
          const objectsInfo = objects.map((obj) => {
            const bbox = parseBbox(obj.bbox);
            return {
              name: obj.category,
              bbox,
              ...(includeDescription ? { description: obj.caption } : {}),
            };
          });
          return {
            content: [
              {
                type: "text",
                text: `Objects detected in image: ${Object.keys(categories).map((cat) => `${cat} (${categories[cat].length})`).join(", ")}.`,
              },
              {
                type: "text",
                text: `Detailed object detection results: ${JSON.stringify(objectsInfo, null, 2)}`,
              },
              {
                type: "text",
                text: `Note: The bbox coordinates are in {xmin, ymin, xmax, ymax} format, where the origin (0,0) is at the top-left corner of the image. These coordinates help determine the exact position and spatial relationships of objects in the image.`,
              },
            ],
          };
        } catch (error) {
          return {
            content: [
              {
                type: "text",
                text: `Failed to detect objects from image: ${error instanceof Error ? error.message : String(error)}`,
              },
            ],
          };
        }
      }
    );
  }

  /** Detects people in the image and returns per-person pose keypoints. */
  private registerDetectHumanPoseKeypointsTool(): void {
    const { name, description } = ToolConfigs[Tool.DETECT_HUMAN_POSE_KEYPOINTS];
    this.server.tool(
      name,
      description,
      {
        imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://' or 'file://'."),
        includeDescription: z.boolean().describe("Whether to return a description of the objects detected in the image, but will take longer to process."),
      },
      async (args) => {
        try {
          const { imageFileUri, includeDescription } = args;
          if (!imageFileUri) {
            return {
              content: [
                {
                  type: "text",
                  text: "Image file URI is required",
                },
              ],
            };
          }
          const { objects } = await this.api.detectHumanPoseKeypoints(imageFileUri, includeDescription);
          const objectsInfo = objects.map((obj) => {
            const bbox = parseBbox(obj.bbox);
            const pose = obj.pose ? parsePoseKeypoints(obj.pose) : undefined;
            return {
              name: obj.category,
              bbox,
              pose,
              ...(includeDescription ? { description: obj.caption } : {}),
            };
          });
          return {
            content: [
              {
                type: "text",
                text: `${objectsInfo.length} human(s) detected in image.`,
              },
              {
                type: "text",
                text: `Detailed human pose keypoints detection results: ${JSON.stringify(objectsInfo, null, 2)}`,
              },
              {
                type: "text",
                text: `Note: The bbox coordinates are in {xmin, ymin, xmax, ymax} format, where the origin (0,0) is at the top-left corner of the image. The pose keypoints follow the same coordinate system, with visibility states (not visible, visible).`,
              },
            ],
          };
        } catch (error) {
          return {
            content: [
              {
                type: "text",
                text: `Failed to detect human pose keypoints from image: ${error instanceof Error ? error.message : String(error)}`,
              },
            ],
          };
        }
      }
    );
  }

  /** Draws detection boxes (and optional labels) onto the image and saves the result. */
  private registerVisualizeDetectionResultTool(): void {
    const { name, description } = ToolConfigs[Tool.VISUALIZE_DETECTION_RESULT];
    this.server.tool(
      name,
      description,
      {
        imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://' or 'file://'."),
        detections: z.array(z.object({
          name: z.string().describe("Object category name"),
          bbox: z.object({
            xmin: z.number(),
            ymin: z.number(),
            xmax: z.number(),
            ymax: z.number(),
          }).describe("Bounding box coordinates"),
        })).describe("Array of detection results with name and bbox information."),
        fontSize: z.number().optional().describe("Font size for labels (default: 24)"),
        boxThickness: z.number().optional().describe("Thickness of bounding box lines (default: 4)"),
        showLabels: z.boolean().optional().describe("Whether to show category labels (default: true)"),
      },
      async (args) => {
        try {
          const { imageFileUri, detections, fontSize, boxThickness, showLabels } = args;
          if (!imageFileUri || !detections || !Array.isArray(detections)) {
            return {
              content: [
                {
                  type: "text",
                  text: "Image file URI and detections array are required",
                },
              ],
            };
          }
          const visualizedImagePath = await visualizeDetections(
            imageFileUri,
            detections,
            { fontSize, boxThickness, showLabels },
            this.config.imageStorageDirectory || getDefaultImageStorageDirectory()
          );
          return {
            content: [
              {
                type: "text",
                text: `Visualization saved to: ${visualizedImagePath}`,
              },
            ],
          };
        } catch (error) {
          return {
            content: [
              {
                type: "text",
                text: `Failed to visualize detections: ${error instanceof Error ? error.message : String(error)}`,
              },
            ],
          };
        }
      }
    );
  }

  /**
   * Start the server using stdio transport.
   */
  async start(): Promise<void> {
    console.error("Starting DINO-X MCP server on STDIO transport...");
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    console.error("DINO-X MCP server running on STDIO transport.");
  }
}
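For orientation, here is a minimal sketch of how this class could be bootstrapped from an entry point. The import path and the DINOX_API_KEY / IMAGE_STORAGE_DIRECTORY environment variable names are assumptions, not confirmed by this file; the Config fields (apiKey, optional imageStorageDirectory) are inferred from their usage in the class above.

// Hypothetical entry point (e.g. index.ts) — a sketch, not the repository's actual bootstrap.
import { MCPStdioServer } from "./server/stdio-server.js";

async function main(): Promise<void> {
  // DINOX_API_KEY is an assumed environment variable name.
  const apiKey = process.env.DINOX_API_KEY;
  if (!apiKey) {
    console.error("DINOX_API_KEY is not set.");
    process.exit(1);
  }

  const server = new MCPStdioServer({
    apiKey,
    // Optional; the visualize tool falls back to getDefaultImageStorageDirectory().
    imageStorageDirectory: process.env.IMAGE_STORAGE_DIRECTORY,
  });

  await server.start();
}

main().catch((error) => {
  console.error("Fatal error:", error);
  process.exit(1);
});

Note that the server logs with console.error rather than console.log: in stdio transport mode, stdout carries the JSON-RPC message stream between client and server, so diagnostic output must go to stderr.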

