ThinkDrop Vision Service

describe.py•5.62 kB

"""
VLM scene description endpoint
"""

import os
import logging
import httpx
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import Optional, List

from ..services.screenshot import ScreenshotService
from ..services.ocr_engine import OCREngine
from ..services.vlm_engine import VLMEngine

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/vision", tags=["describe"])


class DescribeRequest(BaseModel):
    """Describe request model"""
    region: Optional[List[int]] = None  # [x, y, width, height]
    task: Optional[str] = None  # Optional focus/task instruction
    include_ocr: bool = True  # Include OCR text
    store_to_memory: bool = True  # Store result to user-memory service


class DescribeResponse(BaseModel):
    """Describe response model"""
    version: str = "mcp.v1"
    status: str = "success"
    data: dict


@router.post("/describe", response_model=DescribeResponse)
async def describe_screen(request: DescribeRequest):
    """
    Describe screen content using VLM

    Captures a screenshot (optionally limited to ``request.region``), runs
    OCR when requested, asks the VLM engine for a description, and optionally
    forwards the result to the user-memory service. OCR, VLM and memory
    storage failures are recorded in the result under ``ocr_error``,
    ``vlm_error`` and ``memory_storage_error`` instead of failing the request.

    Args:
        request: Description configuration

    Returns:
        Natural language description + optional OCR text

    Raises:
        HTTPException: 500 with code ``DESCRIBE_FAILED`` for any failure that
            is not handled by one of the per-stage fallbacks.
    """
    try:
        logger.info(f"Describing screen (region={request.region}, task={request.task})")

        # Capture
        region = tuple(request.region) if request.region else None
        img = ScreenshotService.capture(region)

        # Save temp file for processing
        temp_file = ScreenshotService.save_temp(img)

        try:
            result = {
                "width": img.width,
                "height": img.height,
                "region": request.region
            }

            # OCR
            if request.include_ocr:
                try:
                    ocr_engine = OCREngine.get_instance()
                    items = ocr_engine.extract_text(img)
                    ocr_text = " ".join(item["text"] for item in items)
                    result["ocr"] = {
                        "items": items,
                        "concat": ocr_text
                    }
                except Exception as e:
                    logger.warning(f"OCR failed: {e}")
                    result["ocr_error"] = str(e)

            # VLM description
            try:
                vlm_engine = VLMEngine.get_instance()
                if vlm_engine.is_enabled():
                    description = vlm_engine.describe(img, request.task)
                    result["description"] = description
                else:
                    # The key is still set (to None) so callers can tell
                    # "VLM disabled" apart from "VLM failed".
                    result["description"] = None
                    result["vlm_disabled"] = True
                    logger.info("VLM disabled, using OCR only")
            except Exception as e:
                logger.error(f"VLM failed: {e}")
                result["vlm_error"] = str(e)
                # Fallback to OCR-only description
                if "ocr" in result:
                    result["description"] = f"Screen content (OCR): {result['ocr']['concat'][:500]}"

            # Store to user-memory service
            if request.store_to_memory and "description" in result:
                try:
                    await store_to_user_memory(result)
                    result["stored_to_memory"] = True
                except Exception as e:
                    # Best-effort: a storage failure must not fail the request.
                    logger.warning(f"Failed to store to memory: {e}")
                    result["memory_storage_error"] = str(e)

            return DescribeResponse(
                version="mcp.v1",
                status="success",
                data=result
            )

        finally:
            # Cleanup temp file
            ScreenshotService.cleanup_temp(temp_file)

    except Exception as e:
        # logger.exception keeps the traceback; the message text is unchanged.
        logger.exception(f"Describe failed: {e}")
        raise HTTPException(
            status_code=500,
            detail={
                "version": "mcp.v1",
                "status": "error",
                "error": {
                    "code": "DESCRIBE_FAILED",
                    "message": str(e)
                }
            }
        ) from e


def _build_memory_payload(vision_data: dict) -> dict:
    """Build the JSON payload sent to the user-memory service."""
    # `description` may be present but None (VLM disabled); `.get(key, '')`
    # would return that None and break the string concatenation below.
    description = vision_data.get('description')
    content = description or ''

    ocr = vision_data.get('ocr')
    if ocr and ocr.get('concat'):
        content += f"\n\nExtracted text: {ocr['concat']}"

    return {
        "content": content,
        "metadata": {
            "type": "screen_capture",
            "source": "vision-service",
            "width": vision_data.get('width'),
            "height": vision_data.get('height'),
            "region": vision_data.get('region'),
            "has_ocr": 'ocr' in vision_data,
            # Report an actual description, not merely the key's presence.
            "has_description": bool(description)
        }
    }


async def store_to_user_memory(vision_data: dict):
    """
    Store vision result to user-memory service as embedding

    The service base URL is read from ``USER_MEMORY_SERVICE_URL`` (default
    ``http://localhost:3003``). When ``USER_MEMORY_API_KEY`` is set, it is
    sent in the ``x-api-key`` header.

    Args:
        vision_data: Vision processing result

    Returns:
        The decoded JSON body of the service response.

    Raises:
        httpx.HTTPStatusError: If the service answers with an error status.
    """
    user_memory_url = os.getenv('USER_MEMORY_SERVICE_URL', 'http://localhost:3003')
    api_key = os.getenv('USER_MEMORY_API_KEY', '')

    # Build content from description + OCR
    payload = _build_memory_payload(vision_data)

    headers = {}
    if api_key:
        headers['x-api-key'] = api_key

    # POST to user-memory service
    async with httpx.AsyncClient(timeout=10.0) as client:
        response = await client.post(
            f"{user_memory_url}/memory/store",
            json=payload,
            headers=headers
        )
        response.raise_for_status()

    logger.info("Stored vision result to user-memory service")
    return response.json()

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/lukaizhi5559/thinkdrop-vision-service'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.