#!/usr/bin/env python3
"""
Moondream Vision OpenAI Agents SDK Agent
========================================
An interactive Python client that connects to the Moondream MCP server using the OpenAI Agents SDK.
The agent provides a simple, intelligent interface for vision analysis tasks, including image
captioning, visual question answering, and object detection, all through natural language.
Prerequisites
-------------
* Python ≥ 3.10
* `pip install -r requirements.txt`
* OpenAI API key for the agent
* Moondream MCP server running
Quick Start
-----------
1. Set up environment variables:
```bash
export OPENAI_API_KEY="your-openai-key"
export MOONDREAM_DEVICE="auto" # Optional: auto, cpu, cuda, mps
```
2. Run the agent:
```bash
python agent.py
```
3. Interact with images:
```text
moondream> What's in this image? /path/to/image.jpg
moondream> Describe the scene in detail: https://example.com/image.png
moondream> Find all the cars in /path/to/street.jpg
moondream> Point to the dog in the image /path/to/pets.jpg
moondream> quit
```
Environment Variables
---------------------
Required:
* OPENAI_API_KEY: OpenAI API key for the agent
Optional:
* MOONDREAM_DEVICE: Device for inference (auto/cpu/cuda/mps)
* MOONDREAM_MODEL_NAME: Hugging Face model name
* MOONDREAM_MAX_IMAGE_SIZE: Maximum image dimension
* MOONDREAM_TIMEOUT_SECONDS: Processing timeout
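For example (illustrative values only; the variable names are those listed above):
```bash
export MOONDREAM_DEVICE="cuda"
export MOONDREAM_MAX_IMAGE_SIZE="2048"
export MOONDREAM_TIMEOUT_SECONDS="300"
```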
How It Works
------------
The agent uses OpenAI's Agents SDK to intelligently process natural language queries
about images and execute them against the Moondream MCP server. The agent automatically
determines the best vision analysis approach for each query.
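Under the hood, the flow reduces to a few SDK calls. A minimal sketch (model name and
prompt are illustrative; the full version lives in `run_interactive_session` below):
```python
import asyncio
from agents import Agent, Runner
from agents.mcp import MCPServerStdio

async def demo() -> None:
    # Launch the MCP server over stdio; its vision tools become agent tools
    async with MCPServerStdio(
        name="Moondream MCP Server",
        params={"command": "moondream-mcp", "args": []},
    ) as server:
        agent = Agent(
            name="Moondream Vision Assistant",
            model="gpt-4o-mini",
            instructions="Analyze images using the available vision tools.",
            mcp_servers=[server],
        )
        result = await Runner.run(agent, "Caption /path/to/image.jpg")
        print(result.final_output)

asyncio.run(demo())
```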
Transport Options
-----------------
* Stdio (default): Direct process communication with MCP server
* HTTP: Connect to running HTTP MCP server
Examples:
```bash
# Stdio transport (default)
python agent.py
# HTTP transport
python agent.py --transport http --host 127.0.0.1 --port 8000
```
"""
from __future__ import annotations
import asyncio
import json
import os
import subprocess
import time
import uuid
from pathlib import Path
import click
from rich import box
from rich.console import Console
from rich.prompt import Prompt
from rich.table import Table
from rich.panel import Panel
from rich.markdown import Markdown
# OpenAI Agents SDK imports
from agents import Agent, ModelSettings, Runner, trace
from agents.mcp import MCPServer, MCPServerStdio, MCPServerStreamableHttp
import dotenv
dotenv.load_dotenv()
console = Console()
def display_vision_result(content: str):
"""Display vision analysis results with proper formatting."""
try:
resp = json.loads(content)
except json.JSONDecodeError:
console.print(f"[yellow]› {content}[/yellow]")
return
if "error" in resp:
error = resp["error"]
console.print(f"[red]❌ Error: {error}[/red]")
return
# Handle different types of vision results
if "caption" in resp:
console.print(f"[green]📝 Caption:[/green] {resp['caption']}")
if "answer" in resp:
console.print(f"[blue]💬 Answer:[/blue] {resp['answer']}")
if "objects" in resp:
objects = resp["objects"]
if objects:
console.print(f"[cyan]🔍 Detected Objects:[/cyan]")
table = Table(show_header=True, header_style="bold cyan", box=box.SIMPLE)
table.add_column("Object", style="green")
table.add_column("Confidence", style="yellow")
table.add_column("Location", style="blue")
for obj in objects:
confidence = f"{obj.get('confidence', 0):.2f}" if 'confidence' in obj else "N/A"
location = f"({obj.get('x', 0)}, {obj.get('y', 0)})" if 'x' in obj and 'y' in obj else "N/A"
table.add_row(obj.get('name', 'Unknown'), confidence, location)
console.print(table)
else:
console.print("[yellow]› No objects detected.[/yellow]")
if "points" in resp:
points = resp["points"]
if points:
console.print(f"[magenta]📍 Pointing Results:[/magenta]")
table = Table(show_header=True, header_style="bold magenta", box=box.SIMPLE)
table.add_column("Description", style="green")
table.add_column("Coordinates", style="blue")
table.add_column("Confidence", style="yellow")
for point in points:
coords = f"({point.get('x', 0)}, {point.get('y', 0)})"
confidence = f"{point.get('confidence', 0):.2f}" if 'confidence' in point else "N/A"
table.add_row(point.get('description', 'Point'), coords, confidence)
console.print(table)
else:
console.print("[yellow]› No points found.[/yellow]")
if "analysis" in resp:
analysis = resp["analysis"]
console.print(f"[purple]🔬 Analysis:[/purple]")
console.print(Markdown(analysis))
if "results" in resp:
# Batch processing results
results = resp["results"]
console.print(f"[cyan]📊 Batch Results ({len(results)} images):[/cyan]")
for i, result in enumerate(results, 1):
console.print(f"[bold]Image {i}:[/bold]")
display_vision_result(json.dumps(result))
if i < len(results):
console.print()
# Display metadata if available
if "processing_time" in resp:
console.print(f"[dim]Processing time: {resp['processing_time']:.2f}s[/dim]")
if "image_size" in resp:
size = resp["image_size"]
console.print(f"[dim]Image size: {size.get('width', 0)}x{size.get('height', 0)}[/dim]")
async def create_mcp_server(transport: str, host: str, port: int, server_command: str, server_args: str, timeout: int) -> MCPServer:
"""Create and return the appropriate MCP server based on transport type."""
if transport == "http":
url = f"http://{host}:{port}/mcp"
console.print(f"[blue]Connecting to Moondream MCP server at {url}...[/blue]")
        return MCPServerStreamableHttp(
            name="Moondream MCP Server",
            params={
                "url": url,
                "timeout": float(timeout),  # HTTP request timeout
                "sse_read_timeout": float(timeout + 120),  # 2-minute buffer for long SSE reads
            },
            client_session_timeout_seconds=float(timeout),  # Controls the MCP tool-call timeout
        )
else:
console.print(f"[blue]Starting Moondream MCP server: {server_command} {server_args}[/blue]")
return MCPServerStdio(
name="Moondream MCP Server",
params={
"command": server_command,
"args": server_args.split() if server_args else [],
},
client_session_timeout_seconds=float(timeout) # This controls MCP tool call timeout
)
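# Typical usage (see async_main below): the returned object is entered as an
# async context manager, which starts/connects the server and lists its tools.
#
#   server = await create_mcp_server("stdio", "127.0.0.1", 8000, "moondream-mcp", "", 180)
#   async with server:
#       ...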
def validate_image_path(image_path: str) -> bool:
"""Validate if the image path exists and is a valid image file."""
if image_path.startswith(('http://', 'https://')):
return True # Assume URLs are valid, will be validated by the server
path = Path(image_path)
if not path.exists():
console.print(f"[red]❌ File not found: {image_path}[/red]")
return False
valid_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff'}
if path.suffix.lower() not in valid_extensions:
console.print(f"[red]❌ Invalid image format: {path.suffix}[/red]")
console.print(f"[yellow]Supported formats: {', '.join(valid_extensions)}[/yellow]")
return False
return True
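# Hypothetical pre-flight check before handing a path to the agent (this helper
# is not currently wired into the interactive loop):
#
#   if not validate_image_path(user_supplied_path):
#       return  # refuse early instead of letting a tool call fail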
async def run_interactive_session(mcp_server: MCPServer, timeout: int):
"""Run the interactive Moondream vision session using OpenAI Agents SDK."""
# Create the agent with instructions for vision analysis
agent = Agent(
name="Moondream Vision Assistant",
model="gpt-4o-mini",
instructions="""You are a Moondream vision analysis assistant. Your role is to help users analyze images using various computer vision tools through natural language interaction.
AVAILABLE VISION TOOLS:
1. **caption_image**: Generate descriptive captions for images
- Supports different caption lengths: short, normal, detailed
- Use for: "Describe this image", "What's in this picture?", "Caption this image"
2. **query_image**: Answer specific questions about images
- Use for: "What color is the car?", "How many people are in the image?", "Is there a dog?"
3. **detect_objects**: Find and locate specific objects in images
- Use for: "Find all cars", "Detect people", "What objects are in this image?"
4. **point_objects**: Point to specific objects or regions in images
- Use for: "Point to the dog", "Where is the red car?", "Show me the building"
5. **analyze_image**: Comprehensive analysis combining multiple operations
- Use for: "Analyze this image completely", "Give me a full breakdown"
6. **batch_analyze_images**: Process multiple images at once (up to 10)
- Use for: "Analyze these images", "Process this batch of photos"
INTERACTION PATTERNS:
- When user provides an image path/URL, determine the best tool based on their request
- For general questions like "What's in this image?", use caption_image with detailed length
- For specific questions, use query_image with the user's question
- For object detection requests, use detect_objects with the specified object type
- For pointing/location requests, use point_objects
- Always validate image paths before processing
INPUT HANDLING:
- Accept both local file paths and URLs
- Support common image formats: jpg, jpeg, png, gif, bmp, webp, tiff
- Handle batch processing for multiple images
- Provide helpful error messages for invalid inputs
RESPONSE FORMATTING:
- Present results in a clear, structured format
- Include relevant metadata (processing time, image dimensions)
- For object detection, show results in tables when appropriate
- For batch processing, organize results by image
EXAMPLE INTERACTIONS:
User: "What's in this image? /path/to/photo.jpg"
→ Use caption_image with detailed length
User: "How many cars are in the parking lot? https://example.com/parking.jpg"
→ Use query_image with the specific question
User: "Find all the people in /path/to/crowd.jpg"
→ Use detect_objects with object_type="person"
User: "Point to the red building in the cityscape.png"
→ Use point_objects with query="red building"
User: "Analyze this image completely: /path/to/scene.jpg"
→ Use analyze_image for comprehensive analysis
Available commands for users:
- Natural language requests with image paths/URLs
- "help" - show help information
- "examples" - show example commands
- "quit" or "exit" - exit the session
Always be helpful, provide clear responses, and explain what analysis you're performing.""",
mcp_servers=[mcp_server],
model_settings=ModelSettings(
tool_choice="auto"
),
)
console.print(f"[bold green]✅ Connected to Moondream MCP server![/bold green]")
# Show welcome message
console.print(Panel.fit(
f"[bold cyan]Moondream Vision OpenAI Agents SDK Agent[/bold cyan]\n"
f"AI Model: [yellow]GPT-4o-mini[/yellow]\n"
f"Vision Model: [yellow]Moondream2[/yellow]\n\n"
f"Type [bold]help[/bold] for commands, [bold]examples[/bold] for examples, or [bold]quit[/bold] to exit.\n"
f"Provide image paths or URLs with your vision requests!",
border_style="cyan"
))
# Interactive loop
conversation_id = uuid.uuid4().hex[:16]
input_items = []
while True:
try:
            user_input = Prompt.ask("[bold cyan]moondream>[/bold cyan]")
except (EOFError, KeyboardInterrupt):
console.print()
break
if not user_input.strip():
continue
# Handle basic commands
if user_input.strip().lower() in {"quit", "exit"}:
break
elif user_input.strip().lower() == "help":
console.print(Panel("""
[bold]Moondream Vision OpenAI Agents SDK Agent[/bold]
Ask questions about images in natural language - the AI agent will handle the analysis!
[bold]Vision Analysis Tools[/bold]
[green]Image Captioning[/green] - Describe what's in an image
[green]Visual Q&A[/green] - Answer specific questions about images
[green]Object Detection[/green] - Find and locate objects in images
[green]Visual Pointing[/green] - Point to specific objects or regions
[green]Comprehensive Analysis[/green] - Complete image analysis
[green]Batch Processing[/green] - Analyze multiple images at once
[bold]Supported Formats[/bold]
[cyan]Local files[/cyan]: /path/to/image.jpg, ./photo.png, ~/pictures/scene.gif
[cyan]URLs[/cyan]: https://example.com/image.jpg, http://site.com/photo.png
[cyan]Formats[/cyan]: JPG, PNG, GIF, BMP, WebP, TIFF
[bold]Commands[/bold]
[cyan]help[/cyan] – Show this help message
[cyan]examples[/cyan] – Show example commands
[cyan]quit / exit[/cyan] – Leave the agent
[bold]How it works[/bold]
The AI agent automatically determines the best vision analysis approach based on your
natural language request and the image(s) you provide.
""", title="Help", border_style="blue"))
continue
elif user_input.strip().lower() == "examples":
console.print(Panel("""
[bold]Example Commands[/bold]
[bold cyan]Image Captioning[/bold cyan]
[green]What's in this image? /path/to/photo.jpg[/green]
[green]Describe this scene in detail: https://example.com/landscape.png[/green]
[green]Caption this image: ./vacation_photo.jpg[/green]
[bold cyan]Visual Question Answering[/bold cyan]
[green]How many people are in /path/to/crowd.jpg?[/green]
[green]What color is the car in this image? ~/car.png[/green]
[green]Is there a dog in https://example.com/pets.jpg?[/green]
[bold cyan]Object Detection[/bold cyan]
[green]Find all the cars in /path/to/street.jpg[/green]
[green]Detect people in this image: ./group_photo.png[/green]
[green]What objects are in https://example.com/room.jpg?[/green]
[bold cyan]Visual Pointing[/bold cyan]
[green]Point to the red building in /path/to/city.jpg[/green]
[green]Where is the dog in this image? ./pets.png[/green]
[green]Show me the tallest tree: https://example.com/forest.jpg[/green]
[bold cyan]Comprehensive Analysis[/bold cyan]
[green]Analyze this image completely: /path/to/complex_scene.jpg[/green]
[green]Give me a full breakdown of ./artwork.png[/green]
[bold cyan]Batch Processing[/bold cyan]
[green]Analyze these images: /path/to/img1.jpg /path/to/img2.png[/green]
[green]Process this batch: ./photo1.jpg ./photo2.jpg ./photo3.png[/green]
""", title="Examples", border_style="green"))
continue
# Process the user input with the AI agent
try:
with trace("Vision Analysis", group_id=conversation_id):
input_items.append({"content": user_input, "role": "user"})
console.print("[green]🤖 Processing with AI vision agent...[/green]")
console.print("[dim]Note: First-time model loading may take 30-60 seconds[/dim]")
# Add timeout wrapper for the runner
try:
result = await asyncio.wait_for(
Runner.run(agent, input_items),
timeout=timeout # Use the provided timeout
)
except asyncio.TimeoutError:
console.print(f"[red]❌ Processing timed out after {timeout} seconds[/red]")
console.print("[yellow]This may happen on first run when loading the model.[/yellow]")
console.print("[yellow]Please try again - subsequent runs should be faster.[/yellow]")
console.print(f"[dim]You can increase the timeout with --timeout {timeout * 2}[/dim]")
continue
# Display the agent's response
if result.final_output:
# Check if the output looks like JSON (tool result)
if result.final_output.strip().startswith('{'):
display_vision_result(result.final_output)
else:
console.print(f"[blue]🤖 Agent:[/blue] {result.final_output}")
else:
console.print("[yellow]› No response from agent.[/yellow]")
# Update input items for next iteration
input_items = result.to_input_list()
except Exception as e:
error_msg = str(e)
if "timeout" in error_msg.lower() or "timed out" in error_msg.lower():
console.print(f"[red]❌ Timeout Error:[/red] {e}")
console.print("[yellow]Vision processing can take time, especially on first run.[/yellow]")
console.print("[yellow]Try again - the model should be loaded and faster now.[/yellow]")
else:
console.print(f"[red]❌ Error:[/red] {e}")
# Continue the conversation even if there's an error
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.option("--transport", default="stdio", type=click.Choice(["stdio", "http"]),
help="Transport type to use (stdio or http)")
@click.option("--host", default="127.0.0.1",
help="Host for HTTP transport")
@click.option("--port", default=8000, type=int,
help="Port for HTTP transport")
@click.option("--server-command", default="moondream-mcp",
help="Command to run the MCP server (for stdio transport)")
@click.option("--server-args", default="",
help="Arguments for the server command (for stdio transport)")
@click.option("--timeout", default=180, type=int,
help="Processing timeout in seconds (default: 180)")
def main(transport: str, host: str, port: int, server_command: str, server_args: str, timeout: int):
"""Start an interactive Moondream vision CLI powered by OpenAI Agents SDK."""
asyncio.run(async_main(transport, host, port, server_command, server_args, timeout))
async def async_main(transport: str, host: str, port: int, server_command: str, server_args: str, timeout: int):
"""Async main function."""
# Check for required environment variables
if not os.getenv("OPENAI_API_KEY"):
console.print("[red]❌ Error: OPENAI_API_KEY environment variable is required[/red]")
console.print("[yellow]Please set your OpenAI API key: export OPENAI_API_KEY='your-key'[/yellow]")
return
# Create MCP server
mcp_server = await create_mcp_server(transport, host, port, server_command, server_args, timeout)
# For HTTP transport, we might need to start the server process
process = None
if transport == "http":
        # Check whether a server is already listening (demo convenience); this
        # assumes the server exposes a /health endpoint
        try:
            import requests
            requests.get(f"http://{host}:{port}/health", timeout=2)
        except Exception:
            # Server not reachable, try to start it
            console.print(f"[yellow]Starting HTTP server at {host}:{port}...[/yellow]")
            try:
                process = subprocess.Popen([server_command] + server_args.split())
                time.sleep(3)  # Give the server a moment to start
except Exception as e:
console.print(f"[red]❌ Failed to start server: {e}[/red]")
return
try:
# Run the interactive session
async with mcp_server:
await run_interactive_session(mcp_server, timeout)
except Exception as e:
console.print(f"[red]❌ Connection error:[/red] {e}")
console.print("[yellow]Make sure the MCP server is running and environment variables are set.[/yellow]")
finally:
if process:
process.terminate()
console.print("[bold]Goodbye! 👋[/bold]")
if __name__ == "__main__":
main()