detect-human-pose-keypoints
Detect 17 keypoints per person in images to analyze body posture and movement for applications like fitness tracking or motion analysis.
Instructions
Detects 17 keypoints for each person in an image, supporting body posture and movement analysis.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| imageFileUri | Yes | URI of the input image. Must start with 'https://' (remote file) or 'file://' (local file). | |
| includeDescription | Yes | Whether to return a description of each detected object. Enabling this increases processing time. | |
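
For orientation, here is a minimal invocation sketch using the MCP TypeScript SDK; the transport command, entry-point path, client name, and image URL are illustrative assumptions, not taken from this repository.

```typescript
// Sketch of calling this tool from an MCP client over stdio. The command,
// args, client name, and image URL below are placeholder assumptions.
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

const transport = new StdioClientTransport({
  command: "node",
  args: ["dist/index.js"], // hypothetical entry point for the stdio server
});
const client = new Client({ name: "example-client", version: "1.0.0" });
await client.connect(transport);

const result = await client.callTool({
  name: "detect-human-pose-keypoints",
  arguments: {
    imageFileUri: "https://example.com/photos/runner.jpg",
    includeDescription: false,
  },
});
console.log(result.content); // array of text blocks described below
```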
Implementation Reference
- src/servers/http-server.ts:230-306 (handler): MCP tool handler for 'detect-human-pose-keypoints' in the HTTP server. It registers the inline Zod input schema, calls the API client, parses bboxes and pose keypoints, groups results by category, and formats the text response.

```typescript
private registerDetectHumanPoseKeypointsTool(): void {
  const { name, description } = ToolConfigs[Tool.DETECT_HUMAN_POSE_KEYPOINTS];
  this.server.tool(
    name,
    description,
    {
      imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://'."),
      includeDescription: z.boolean().describe("Whether to return a description of the objects detected in the image, but will take longer to process."),
    },
    async (args) => {
      try {
        const { imageFileUri, includeDescription } = args;
        if (!imageFileUri) {
          return {
            content: [
              {
                type: 'text',
                text: 'Image file URI is required',
              },
            ],
          }
        }
        const { objects } = await this.api.detectHumanPoseKeypoints(imageFileUri, includeDescription);
        // Group detections by category (built here but not referenced again in this excerpt)
        const categories: ResultCategory = {};
        for (const object of objects) {
          if (!categories[object.category]) {
            categories[object.category] = [];
          }
          categories[object.category].push(object);
        }
        const objectsInfo = objects.map(obj => {
          const bbox = parseBbox(obj.bbox);
          const pose = obj.pose ? parsePoseKeypoints(obj.pose) : undefined;
          return {
            name: obj.category,
            bbox,
            pose,
            ...(includeDescription ? {
              description: obj.caption,
            } : {}),
          }
        });
        return {
          content: [
            { type: "text", text: `${objectsInfo.length} human(s) detected in image.` },
            { type: "text", text: `Detailed human pose keypoints detection results: ${JSON.stringify(objectsInfo, null, 2)}` },
            { type: "text", text: `Note: The bbox coordinates are in {xmin, ymin, xmax, ymax} format, where the origin (0,0) is at the top-left corner of the image. The pose keypoints follow the same coordinate system, with visibility states (not visible, visible).` },
          ]
        };
      } catch (error) {
        return {
          content: [
            {
              type: 'text',
              text: `Failed to detect human pose keypoints from image: ${error instanceof Error ? error.message : String(error)}`,
            },
          ],
        };
      }
    }
  )
}
```
- src/servers/stdio-server.ts:197-273 (handler): MCP tool handler for 'detect-human-pose-keypoints' in the STDIO server. The body is identical to the HTTP handler above, except that the imageFileUri schema description also permits local files:

```typescript
imageFileUri: z.string().describe("URI of the input image. Preferred for remote or local files. Must start with 'https://' or 'file://'."),
```
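
Both handlers emit the same response shape. The sketch below shows what a single objectsInfo entry might look like; every value is fabricated, and the bbox key names assume parseBbox returns the {xmin, ymin, xmax, ymax} object described in the handler's own note.

```typescript
// Illustrative objectsInfo entry; all numbers are fabricated, and the bbox
// key names assume parseBbox matches the format stated in the handler's note.
const exampleEntry = {
  name: "person",
  bbox: { xmin: 104.2, ymin: 56.8, xmax: 312.5, ymax: 498.1 },
  pose: {
    nose: { x: 208.4, y: 92.3, visible: "visible" },
    leftEye: { x: 215.1, y: 85.2, visible: "visible" },
    // ...the remaining 15 keypoints follow the same shape
    rightAnkle: { x: 250.7, y: 480.9, visible: "not visible" },
  },
  // present only when includeDescription is true
  description: "person running on a track",
};
```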
- src/dino-x/client.ts:219-233 (helper): Core DinoXApiClient method. It specializes generic detection to human pose keypoints by fixing the text prompt to "person" and requesting "pose_keypoints" alongside "bbox" in targets, then delegates to performDetection, which handles API task creation and polling.

```typescript
async detectHumanPoseKeypoints(
  imageFileUri: string,
  includeDescription: boolean
): Promise<DetectionResult> {
  return this.performDetection(imageFileUri, includeDescription, {
    model: "DINO-X-1.0",
    prompt: { type: "text", text: "person" },
    targets: ["bbox", "pose_keypoints"],
    bbox_threshold: 0.25,
    iou_threshold: 0.8
  });
}
```
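
The fixed "person" prompt restricts detection to humans; bbox_threshold and iou_threshold presumably filter low-confidence boxes and suppress overlapping detections. performDetection itself is not excerpted here; as a rough illustration of the create-then-poll pattern it is described as implementing, a generic helper might look like the sketch below, where the status strings, polling interval, and attempt limit are assumptions rather than the actual DINO-X API contract.

```typescript
// Generic create-then-poll helper; status strings, interval, and attempt
// limit are assumptions, not the actual DINO-X API contract.
async function pollTask<T>(
  createTask: () => Promise<{ taskId: string }>,
  getStatus: (taskId: string) => Promise<{ status: string; result?: T }>,
  intervalMs = 1000,
  maxAttempts = 60,
): Promise<T> {
  const { taskId } = await createTask();
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const { status, result } = await getStatus(taskId);
    if (status === "success" && result !== undefined) return result;
    if (status === "failed") throw new Error(`Task ${taskId} failed`);
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
  throw new Error(`Task ${taskId} did not finish within ${maxAttempts} polls`);
}
```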
- src/utils/index.ts:55-80 (helper): Utility that parses the raw pose keypoint array from the API into a structured object mapping the 17 standard keypoints to {x, y, visible}.

```typescript
export const parsePoseKeypoints = (pose: number[]) => {
  const keypointNames = [
    'nose', 'leftEye', 'rightEye', 'leftEar', 'rightEar',
    'leftShoulder', 'rightShoulder', 'leftElbow', 'rightElbow',
    'leftWrist', 'rightWrist', 'leftHip', 'rightHip',
    'leftKnee', 'rightKnee', 'leftAnkle', 'rightAnkle'
  ];
  const structuredPose: { [key: string]: { x: number; y: number; visible: string; } } = {};
  for (let i = 0; i < keypointNames.length; i++) {
    // Each keypoint occupies 4 consecutive slots: x, y, visibility code, and
    // a fourth value that this parser does not read.
    const baseIndex = i * 4;
    const visibilityMap = { 0: 'not visible', 2: 'visible' };
    structuredPose[keypointNames[i]] = {
      x: parseFloat(pose[baseIndex].toFixed(1)),
      y: parseFloat(pose[baseIndex + 1].toFixed(1)),
      visible: visibilityMap[pose[baseIndex + 2] as keyof typeof visibilityMap],
    };
  }
  return structuredPose;
};
```
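
A worked example of the 4-values-per-keypoint layout, using fabricated numbers. Note that the parser skips the fourth value in each group, and that only visibility codes 0 and 2 are mapped; any other code would yield undefined.

```typescript
// Fabricated flat pose array: 17 keypoints x 4 values (x, y, visibility
// code, plus a fourth value the parser skips).
const flatPose: number[] = [
  208.43, 92.31, 2, 0,          // nose: visibility code 2 -> 'visible'
  215.06, 85.17, 0, 0,          // leftEye: visibility code 0 -> 'not visible'
  ...new Array(15 * 4).fill(0), // remaining 15 keypoints, zero-filled
];

const structured = parsePoseKeypoints(flatPose);
console.log(structured.nose);    // { x: 208.4, y: 92.3, visible: 'visible' }
console.log(structured.leftEye); // { x: 215.1, y: 85.2, visible: 'not visible' }
```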
- src/constants/tool.ts:24-27 (schema): Tool configuration defining the name and description used for MCP registration.

```typescript
[Tool.DETECT_HUMAN_POSE_KEYPOINTS]: {
  name: Tool.DETECT_HUMAN_POSE_KEYPOINTS,
  description: "Detects 17 keypoints for each person in an image, supporting body posture and movement analysis.",
},
```