// Moondream MCP Server

// Apache 2.0
// Overview Inspect Schema Related Servers Reviews Score
#!/usr/bin/env node

/**
 * MCP server that provides image analysis capabilities using the Moondream model.
 * It implements tools for:
 * - Image captioning
 * - Object detection
 * - Visual question answering
 */

import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
  CallToolRequestSchema,
  ListToolsRequestSchema,
  Request,
} from "@modelcontextprotocol/sdk/types.js";
import { join } from "path";
import * as fs from "fs/promises";
import { PythonSetup } from "./utils/python-setup.js";
import { PuppeteerSetup, ViewportConfig } from "./utils/puppeteer-setup.js";

/**
 * Request shape used to annotate the handler registered for
 * `ListToolsRequestSchema`: a `Request` whose `method` is fixed to
 * `"tools/list"`. It declares no parameters of its own.
 */
interface ListToolsRequest extends Request {
  method: "tools/list";
}

/**
 * Request shape used to annotate the handler registered for
 * `CallToolRequestSchema`: a `Request` whose `method` is fixed to
 * `"tools/call"`.
 */
interface CallToolRequest extends Request {
  method: "tools/call";
  params: {
    /** Name of the tool to run; the handler dispatches on this value. */
    name: string;
    /** Tool-specific arguments keyed by name; may be omitted entirely. */
    arguments?: Record<string, unknown>;
  };
}

/** Base URL of the model server that the image tools send their requests to. */
const MODEL_SERVER_URL = "http://127.0.0.1:3475";

/** Page captured by `analyze_webpage` when the caller supplies no `url`. */
const DEFAULT_WEBPAGE_URL = "http://localhost:3000";

/** Delay after page load, in milliseconds, used when the caller supplies no `waitTime`. */
const DEFAULT_WAIT_TIME_MS = 15000;

/** How many times a screenshot capture is attempted before giving up. */
const SCREENSHOT_ATTEMPTS = 3;

/** Pause between failed screenshot attempts, in milliseconds. */
const SCREENSHOT_RETRY_DELAY_MS = 2000;

/** Prompt prefix that routes an `analyze_image` call to the detect endpoint. */
const DETECT_PREFIX = "detect:";

/** Model server endpoints used by the tools below. */
type ModelEndpoint = "query" | "caption" | "detect";

/**
 * Single source of truth for the tool schemas. The same definitions are
 * advertised in the server capabilities and returned from `tools/list`, so the
 * two can no longer drift apart (they previously disagreed on the `waitTime`
 * default). Key order is the order in which tools are listed.
 */
const TOOL_DEFINITIONS = {
  test: {
    description: "Test tool to verify server functionality",
    inputSchema: {
      type: "object" as const,
      properties: {
        message: {
          type: "string",
          description: "Test message to echo back",
        },
      },
      required: ["message"],
    },
  },
  analyze_image: {
    description: "Analyze an image using the Moondream model",
    inputSchema: {
      type: "object" as const,
      properties: {
        image_path: {
          type: "string",
          description: "Path to the image file to analyze",
        },
        prompt: {
          type: "string",
          description:
            "Command to analyze the image. Use 'generate caption' for image captioning, 'detect: [object]' for object detection, or any question for image querying.",
        },
      },
      required: ["image_path", "prompt"],
    },
  },
  analyze_webpage: {
    description: "Take a screenshot of a webpage and analyze it using Moondream",
    inputSchema: {
      type: "object" as const,
      properties: {
        url: {
          type: "string",
          description: `URL to screenshot (defaults to ${DEFAULT_WEBPAGE_URL})`,
        },
        query: {
          type: "string",
          description: "Question to ask about the webpage",
        },
        waitTime: {
          type: "number",
          description: `Time to wait after page load in milliseconds (default: ${DEFAULT_WAIT_TIME_MS})`,
        },
        viewport: {
          type: "object",
          description: "Optional viewport settings",
          properties: {
            width: {
              type: "number",
              description: "Viewport width (default: 1280, max: 2560)",
            },
            height: {
              type: "number",
              description: "Viewport height (default: 720, max: 1440)",
            },
          },
        },
      },
      required: ["query"],
    },
  },
};

/** Wraps a string in the single-text-item result shape every tool returns. */
function textResult(text: string) {
  return { content: [{ type: "text" as const, text }] };
}

/** Result type produced by {@link textResult}. */
type TextToolResult = ReturnType<typeof textResult>;

/**
 * Reads a required string argument.
 *
 * The previous `String(value)` coercion turned a missing argument into the
 * truthy string "undefined", so the "required" checks could never fire. Only
 * non-empty strings are accepted here.
 *
 * @throws Error with `errorMessage` when the argument is absent, empty, or not a string.
 */
function requireString(args: Record<string, unknown>, key: string, errorMessage: string): string {
  const value = args[key];
  if (typeof value !== "string" || value === "") {
    throw new Error(errorMessage);
  }
  return value;
}

/**
 * Resolves the optional `url` argument, falling back to the default page when
 * it is absent or empty.
 *
 * @throws Error when a non-string value is supplied.
 */
function resolveUrl(raw: unknown): string {
  if (typeof raw === "string") {
    return raw === "" ? DEFAULT_WEBPAGE_URL : raw;
  }
  if (raw === undefined || raw === null) {
    return DEFAULT_WEBPAGE_URL;
  }
  throw new Error("URL must be a string");
}

/**
 * Resolves the optional `waitTime` argument in milliseconds.
 *
 * An explicit `0` is honored (the earlier `||` default silently replaced it),
 * and values that do not convert to a finite, non-negative number are rejected
 * instead of being passed on as `NaN`.
 *
 * @throws Error when the value is not a finite, non-negative number.
 */
function resolveWaitTime(raw: unknown): number {
  if (raw === undefined || raw === null || raw === "") {
    return DEFAULT_WAIT_TIME_MS;
  }
  const milliseconds = Number(raw);
  if (!Number.isFinite(milliseconds) || milliseconds < 0) {
    throw new Error("waitTime must be a non-negative number of milliseconds");
  }
  return milliseconds;
}

/**
 * Extracts a textual field from a model server response so a tool never
 * returns a result whose `text` is `undefined`. Non-string values are
 * serialized as JSON.
 *
 * @throws Error when the field is missing or null.
 */
function modelText(result: Record<string, unknown>, field: string): string {
  const value = result[field];
  if (value === undefined || value === null) {
    throw new Error(`Model server response has no "${field}" field`);
  }
  return typeof value === "string" ? value : JSON.stringify(value);
}

/**
 * MCP server exposing three tools over stdio:
 * - `test`: echoes a message back.
 * - `analyze_image`: sends a local image to the model server for captioning,
 *   object detection, or question answering, depending on the prompt.
 * - `analyze_webpage`: screenshots a URL and asks the model server a question
 *   about the screenshot.
 */
class MoondreamServer {
  private readonly server: Server;
  private readonly pythonSetup: PythonSetup;
  private readonly puppeteerSetup: PuppeteerSetup;

  constructor() {
    this.pythonSetup = new PythonSetup();
    this.puppeteerSetup = new PuppeteerSetup();
    this.server = new Server(
      {
        name: "moondream-server",
        version: "0.1.0",
      },
      {
        capabilities: {
          tools: TOOL_DEFINITIONS,
        },
      }
    );

    this.setupToolHandlers();
    this.server.onerror = (error) => console.error("[MCP Error]", error);
  }

  /** Registers the `tools/list` and `tools/call` request handlers. */
  private setupToolHandlers(): void {
    this.server.setRequestHandler(ListToolsRequestSchema, async (_request: ListToolsRequest) => {
      return {
        tools: Object.entries(TOOL_DEFINITIONS).map(([name, definition]) => ({
          name,
          ...definition,
        })),
      };
    });

    this.server.setRequestHandler(CallToolRequestSchema, async (request: CallToolRequest) => {
      const args = request.params.arguments ?? {};
      switch (request.params.name) {
        case "test":
          return this.runTestTool(args);
        case "analyze_image":
          return this.analyzeImage(args);
        case "analyze_webpage":
          return this.analyzeWebpage(args);
        default:
          throw new Error("Unknown tool");
      }
    });
  }

  /**
   * Handles the `test` tool by echoing the `message` argument.
   *
   * @throws Error when `message` is missing, empty, or not a string.
   */
  private runTestTool(args: Record<string, unknown>): TextToolResult {
    const message = requireString(args, "message", "Message is required");
    return textResult(`Test successful! Received message: ${message}`);
  }

  /**
   * Handles the `analyze_image` tool: reads the file at `image_path`, picks the
   * model endpoint from `prompt`, and returns the model's reply as text.
   *
   * Prompt routing (case-insensitive):
   * - exactly "generate caption" -> caption endpoint
   * - starts with "detect:"      -> detect endpoint, with the rest of the prompt as the object
   * - anything else              -> query endpoint, with the prompt as the question
   *
   * @throws Error when an argument is invalid, or "Failed to analyze image: ..." when
   *   reading the file or calling the model server fails.
   */
  private async analyzeImage(args: Record<string, unknown>): Promise<TextToolResult> {
    const missingArgs = "Image path and prompt are required";
    const imagePath = requireString(args, "image_path", missingArgs);
    const prompt = requireString(args, "prompt", missingArgs);

    try {
      // Verify image exists
      await fs.access(imagePath);

      // Buffer#toString("base64") already emits '=' padding, so no manual
      // padding step is needed.
      const imageBuffer = await fs.readFile(imagePath);
      // NOTE(review): the data URL always declares image/jpeg, whatever the
      // file's real format — confirm the model server ignores the declared type.
      const imageUrl = `data:image/jpeg;base64,${imageBuffer.toString("base64")}`;

      // Determine which endpoint to use based on the prompt
      let endpoint: ModelEndpoint = "query";
      let body: Record<string, unknown> = { image_url: imageUrl, question: prompt };

      const normalizedPrompt = prompt.toLowerCase();
      if (normalizedPrompt === "generate caption") {
        endpoint = "caption";
        body = { image_url: imageUrl };
      } else if (normalizedPrompt.startsWith(DETECT_PREFIX)) {
        endpoint = "detect";
        body = {
          image_url: imageUrl,
          object: prompt.slice(DETECT_PREFIX.length).trim(),
        };
      }

      console.error(`[Debug] Sending request to ${endpoint} endpoint`);
      console.error(`[Debug] Request body keys:`, Object.keys(body));

      const result = await this.queryModel(endpoint, body);
      console.error(`[Debug] Response:`, result);

      if (endpoint === "caption") {
        return textResult(modelText(result, "caption"));
      }
      if (endpoint === "detect") {
        return textResult(`Detected objects: ${JSON.stringify(result.objects)}`);
      }
      return textResult(modelText(result, "answer"));
    } catch (error: unknown) {
      console.error("Error analyzing image:", error);
      const errorMessage = error instanceof Error ? error.message : "Unknown error";
      throw new Error(`Failed to analyze image: ${errorMessage}`);
    }
  }

  /**
   * Handles the `analyze_webpage` tool: captures a screenshot of `url` (with
   * retries), sends it to the model server's query endpoint together with
   * `query`, and returns the answer as text.
   *
   * @throws Error when an argument is invalid, or "Failed to analyze webpage: ..."
   *   when the capture or the model request fails.
   */
  private async analyzeWebpage(args: Record<string, unknown>): Promise<TextToolResult> {
    const query = requireString(args, "query", "Query is required");
    const url = resolveUrl(args.url);
    const waitTime = resolveWaitTime(args.waitTime);
    // The viewport is passed through unchecked; presumably PuppeteerSetup
    // applies the documented defaults and limits — TODO confirm.
    const viewport = args.viewport as ViewportConfig | undefined;

    try {
      console.error("[Debug] Analyzing webpage:", url);
      console.error("[Debug] Query:", query);

      const screenshotPath = await this.captureScreenshotWithRetry(url, waitTime, viewport);
      console.error("[Debug] Screenshot saved to:", screenshotPath);

      // Read screenshot and convert to base64
      const imageBuffer = await fs.readFile(screenshotPath);
      console.error("[Debug] Screenshot size:", imageBuffer.length, "bytes");
      const base64Image = imageBuffer.toString("base64");
      console.error("[Debug] Base64 length:", base64Image.length);

      const result = await this.queryModel("query", {
        image_url: `data:image/jpeg;base64,${base64Image}`,
        question: query,
      });
      console.error("[Debug] Model response:", result);

      return textResult(modelText(result, "answer"));
    } catch (error: unknown) {
      console.error("Error analyzing webpage:", error);
      const errorMessage = error instanceof Error ? error.message : "Unknown error";
      throw new Error(`Failed to analyze webpage: ${errorMessage}`);
    }
  }

  /**
   * Captures a screenshot, retrying after a fixed pause when an attempt fails.
   *
   * @returns The screenshot path reported by the Puppeteer helper.
   * @throws The error from the final attempt once all attempts have failed.
   */
  private async captureScreenshotWithRetry(
    url: string,
    waitTime: number,
    viewport: ViewportConfig | undefined
  ): Promise<string> {
    let lastError: unknown;

    for (let attempt = 1; attempt <= SCREENSHOT_ATTEMPTS; attempt++) {
      try {
        console.error(`[Debug] Attempt ${attempt}/${SCREENSHOT_ATTEMPTS}: Capturing screenshot`);
        return await this.puppeteerSetup.captureScreenshot(url, waitTime, viewport);
      } catch (error) {
        lastError = error;
        const retriesLeft = SCREENSHOT_ATTEMPTS - attempt;
        console.error(
          `[Debug] Screenshot attempt failed, ${retriesLeft} retries left:`,
          error instanceof Error ? error.message : String(error)
        );
        if (retriesLeft > 0) {
          await new Promise((resolve) => setTimeout(resolve, SCREENSHOT_RETRY_DELAY_MS));
        }
      }
    }

    throw lastError;
  }

  /**
   * POSTs a JSON body to a model server endpoint and returns the parsed reply.
   *
   * @throws Error "Model server error: ..." on a non-2xx status, or when the
   *   reply is not a JSON object.
   */
  private async queryModel(
    endpoint: ModelEndpoint,
    body: Record<string, unknown>
  ): Promise<Record<string, unknown>> {
    const response = await fetch(`${MODEL_SERVER_URL}/${endpoint}`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify(body),
    });

    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Model server error: ${response.statusText} - ${errorText}`);
    }

    const result: unknown = await response.json();
    if (typeof result !== "object" || result === null) {
      throw new Error("Model server returned a non-object response");
    }
    return result as Record<string, unknown>;
  }

  /**
   * Releases the Python and Puppeteer helpers. The Puppeteer cleanup runs even
   * if the Python cleanup throws; the original error still propagates.
   */
  async cleanup(): Promise<void> {
    try {
      this.pythonSetup.cleanup();
    } finally {
      await this.puppeteerSetup.cleanup();
    }
  }

  /**
   * Runs the Python setup, connects the server to stdio, and installs
   * SIGINT/SIGTERM handlers that clean up before exiting. Exits the process
   * with code 1 if startup fails.
   */
  async run(): Promise<void> {
    try {
      await this.pythonSetup.setup();
      const transport = new StdioServerTransport();
      await this.server.connect(transport);
      console.error("Moondream MCP server running on stdio");

      // One shared shutdown path for both signals. The flag stops a second
      // signal from starting a concurrent cleanup, and the try/catch makes
      // sure the process exits even when cleanup rejects.
      let shuttingDown = false;
      const shutdown = async (): Promise<void> => {
        if (shuttingDown) {
          return;
        }
        shuttingDown = true;

        let exitCode = 0;
        try {
          await this.cleanup();
          await this.server.close();
        } catch (error) {
          console.error("Error during shutdown:", error);
          exitCode = 1;
        }
        process.exit(exitCode);
      };

      // Handle cleanup on exit
      process.on("SIGINT", () => void shutdown());
      process.on("SIGTERM", () => void shutdown());
    } catch (error) {
      console.error("Failed to start server:", error);
      // Best-effort release of whatever setup managed to start before the
      // failure; a cleanup error must not mask the startup error.
      try {
        await this.cleanup();
      } catch (cleanupError) {
        console.error("Cleanup after failed start also failed:", cleanupError);
      }
      process.exit(1);
    }
  }
}

// Entry point: build the server, start it, and treat any rejection that
// escapes run() as fatal.
const server = new MoondreamServer();

void (async () => {
  try {
    await server.run();
  } catch (error) {
    console.error("Server error:", error);
    process.exit(1);
  }
})();