describe.py
"""
VLM scene description endpoint
"""
import logging
import os
from typing import List, Optional

import httpx
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from ..services.screenshot import ScreenshotService
from ..services.vision_engine import VisionEngine

logger = logging.getLogger(__name__)
router = APIRouter(tags=["describe"])


class DescribeRequest(BaseModel):
    """Describe request model"""
    region: Optional[List[int]] = None  # [x, y, width, height]
    task: Optional[str] = None          # Optional focus/task instruction
    mode: Optional[str] = None          # 'online' or 'privacy' (overrides the default)
    api_key: Optional[str] = None       # Google Vision API key (from database or OAuth)
    include_ocr: bool = True            # Include OCR text (accepted but not currently forwarded to the engine)
    store_to_memory: bool = True        # Store result to the user-memory service
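
# Illustrative request body (a sketch; the region, task, and mode values below
# are made-up examples, not defaults):
#
#   {
#     "region": [0, 0, 1280, 720],
#     "task": "Summarize the visible window",
#     "mode": "privacy",
#     "include_ocr": true,
#     "store_to_memory": false
#   }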


class DescribeResponse(BaseModel):
    """Describe response model"""
    version: str = "mcp.v1"
    status: str = "success"
    data: dict
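
# The success envelope mirrors the error envelope raised below; `data` carries
# the fields assembled in describe_screen(). A sketch (values illustrative):
#
#   {
#     "version": "mcp.v1",
#     "status": "success",
#     "data": {"width": 1280, "height": 720, "description": "...", "text": "...", ...}
#   }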


@router.post("/describe", response_model=DescribeResponse)  # MCP action: describe
async def describe_screen(request: DescribeRequest):
    """
    Describe screen content using the vision engine (online or privacy mode).

    Args:
        request: Description configuration with an optional mode override.

    Returns:
        Natural-language description plus text extraction.
    """
    try:
        logger.info(
            f"Describing screen (region={request.region}, mode={request.mode}, task={request.task})"
        )

        # Capture the screenshot (full screen unless a region is given)
        region = tuple(request.region) if request.region else None
        img = ScreenshotService.capture(region)

        # Process with the vision engine
        vision_engine = VisionEngine()

        # Build options, including the API key if one was provided
        process_options = {}
        if request.task:
            process_options['prompt'] = request.task
        if request.api_key:
            process_options['api_key'] = request.api_key

        vision_result = await vision_engine.process(
            img=img,
            mode=request.mode,
            task='describe',
            options=process_options
        )

        result = {
            "width": img.width,
            "height": img.height,
            "region": request.region,
            "text": vision_result.get('text', ''),
            "description": vision_result.get('description', ''),
            "labels": vision_result.get('labels', []),
            "objects": vision_result.get('objects', []),
            "mode": vision_result.get('mode'),
            "latency_ms": vision_result.get('latency_ms'),
            "cached": vision_result.get('cached', False)
        }

        # Store to the user-memory service if requested
        if request.store_to_memory and result.get("description"):
            try:
                await store_to_user_memory(result)
                result["stored_to_memory"] = True
            except Exception as e:
                # Memory storage is best-effort; report the error without failing the request
                logger.warning(f"Failed to store to memory: {e}")
                result["memory_storage_error"] = str(e)

        return DescribeResponse(
            version="mcp.v1",
            status="success",
            data=result
        )
    except Exception as e:
        logger.error(f"Describe failed: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={
                "version": "mcp.v1",
                "status": "error",
                "error": {
                    "code": "DESCRIBE_FAILED",
                    "message": str(e)
                }
            }
        )
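
# Example call (a sketch: assumes the router is mounted at the app root and the
# service listens on localhost:8000; both the mount point and the port are
# assumptions, so adjust to your deployment):
#
#   curl -X POST http://localhost:8000/describe \
#        -H "Content-Type: application/json" \
#        -d '{"task": "What app is in the foreground?", "mode": "online"}'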


async def store_to_user_memory(vision_data: dict):
    """
    Store a vision result in the user-memory service as an embedding.

    Args:
        vision_data: Vision processing result (the `result` dict built in
            describe_screen(), so extracted text lives under the 'text' key).
    """
    user_memory_url = os.getenv('USER_MEMORY_SERVICE_URL', 'http://localhost:3003')
    api_key = os.getenv('USER_MEMORY_API_KEY', '')

    # Build the content from the description plus any extracted OCR text
    content = vision_data.get('description', '')
    extracted_text = vision_data.get('text', '')
    if extracted_text:
        content += f"\n\nExtracted text: {extracted_text}"

    # Prepare the payload
    payload = {
        "content": content,
        "metadata": {
            "type": "screen_capture",
            "source": "vision-service",
            "width": vision_data.get('width'),
            "height": vision_data.get('height'),
            "region": vision_data.get('region'),
            "has_ocr": bool(extracted_text),
            "has_description": bool(vision_data.get('description'))
        }
    }

    # POST to the user-memory service
    async with httpx.AsyncClient(timeout=10.0) as client:
        headers = {}
        if api_key:
            headers['x-api-key'] = api_key

        response = await client.post(
            f"{user_memory_url}/memory/store",
            json=payload,
            headers=headers
        )
        response.raise_for_status()

        logger.info("Stored vision result to user-memory service")
        return response.json()
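
# Environment expected by store_to_user_memory() (names come from the
# os.getenv calls above; the values here are illustrative placeholders):
#
#   USER_MEMORY_SERVICE_URL=http://localhost:3003   # falls back to this when unset
#   USER_MEMORY_API_KEY=<secret>                    # optional; sent as the x-api-key header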