mcp-screenshot

#!/usr/bin/env node
/**
 * MCP stdio server exposing a single tool, `capture`.
 *
 * The tool captures the screen with the `screencapture` command, optionally
 * crops the image to its left or right half, saves it in a dated folder under
 * ~/Downloads, and returns OCR text for it. OCR is attempted against an HTTP
 * API first and falls back to Tesseract.js when that fails.
 */
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
  CallToolRequestSchema,
  ListToolsRequestSchema,
  ToolSchema,
} from "@modelcontextprotocol/sdk/types.js";
import { execFile } from "node:child_process";
import { promisify } from "node:util";
import { mkdir, writeFile } from "node:fs/promises";
import { basename, join } from "node:path";
import { homedir } from "node:os";
import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
import { createWorker } from "tesseract.js";
import sharp from "sharp";
import { createReadStream } from "node:fs";
import axios from "axios";
import FormData from "form-data";

const execFileAsync = promisify(execFile);

// Arguments accepted by the `capture` tool.
const ScreenshotArgsSchema = z.object({
  region: z.enum(["left", "right", "full"]).default("left"),
  format: z
    .enum(["json", "markdown", "vertical", "horizontal"])
    .default("markdown"),
});

const ToolInputSchema = ToolSchema.shape.inputSchema;
type ToolInput = z.infer<typeof ToolInputSchema>;

// OCR API endpoint configuration.
// OCR_API_URL is the BASE url only; OCR_API_PATH is appended when the request
// is built. (Previously the default already contained "/analyze", so the
// default request went to ".../analyze/analyze".)
// `||` is intentional: an empty OCR_API_URL env var also falls back.
const API_CONFIG = {
  OCR_API_URL: (process.env.OCR_API_URL || "http://localhost:8000").replace(
    /\/+$/,
    "",
  ),
  OCR_API_PATH: "/analyze",
} as const;

/**
 * Creates (if needed) and returns `~/Downloads/YYYYMMDD` for the current
 * local date.
 *
 * @returns Absolute path of the dated directory.
 */
async function ensureDateDirectory(): Promise<string> {
  const now = new Date();
  const year = now.getFullYear();
  const month = String(now.getMonth() + 1).padStart(2, "0");
  const day = String(now.getDate()).padStart(2, "0");

  const dateDir = join(homedir(), "Downloads", `${year}${month}${day}`);
  await mkdir(dateDir, { recursive: true });
  return dateDir;
}

// Subset of the `system_profiler SPDisplaysDataType -json` output read below.
type DisplayProfile = {
  SPDisplaysDataType?: Array<{
    spdisplays_ndrvs?: Array<{ _spdisplays_pixels?: string }>;
  }>;
};

/**
 * Reads the pixel dimensions of the first display reported by
 * `system_profiler SPDisplaysDataType -json`.
 *
 * @returns Width and height in pixels.
 * @throws Error if the command fails or the output lacks usable dimensions.
 */
async function getDisplayDimensions(): Promise<{
  width: number;
  height: number;
}> {
  try {
    const { stdout } = await execFileAsync("system_profiler", [
      "SPDisplaysDataType",
      "-json",
    ]);

    const data = JSON.parse(stdout) as DisplayProfile;
    // Optional chaining so a missing level yields a clear error below instead
    // of an opaque "cannot read properties of undefined".
    const pixels =
      data.SPDisplaysDataType?.[0]?.spdisplays_ndrvs?.[0]?._spdisplays_pixels;
    if (typeof pixels !== "string") {
      throw new Error("No display pixel information in system_profiler output");
    }

    // The value is formatted as "<width> x <height>".
    const [width, height] = pixels.split(" x ").map(Number);
    // `!n` also rejects NaN, 0 and undefined.
    if (!width || !height) {
      throw new Error(
        `Invalid display dimensions: width=${width}, height=${height}`,
      );
    }

    console.error(
      `Debug: Display dimensions - width: ${width}, height: ${height}`,
    );
    return { width, height };
  } catch (error) {
    throw new Error(`Failed to get display dimensions: ${error}`);
  }
}

/**
 * Captures the screen to a PNG in the dated Downloads directory and, unless
 * `region` is "full", crops the file in place to the requested half.
 *
 * @param region Which part of the capture to keep.
 * @returns Path of the saved PNG.
 * @throws Error if the capture or the crop fails.
 */
async function takeScreenshot(
  region: z.infer<typeof ScreenshotArgsSchema>["region"],
): Promise<string> {
  const dateDir = await ensureDateDirectory();
  const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
  const filepath = join(dateDir, `screenshot-${region}-${timestamp}.png`);

  try {
    // Display dimensions are only logged (by getDisplayDimensions itself);
    // cropping uses the captured image's own metadata. A failure here must
    // therefore not abort the capture.
    try {
      await getDisplayDimensions();
    } catch (dimensionError) {
      console.error(
        `Debug: ${dimensionError instanceof Error ? dimensionError.message : String(dimensionError)}`,
      );
    }

    // Always capture the full screen; cropping happens afterwards.
    await execFileAsync("screencapture", [filepath]);

    if (region !== "full") {
      const { width, height } = await sharp(filepath).metadata();
      if (!width || !height) {
        throw new Error("Failed to get image dimensions");
      }

      const halfWidth = Math.floor(width / 2);
      const crop =
        region === "left"
          ? { left: 0, top: 0, width: halfWidth, height }
          : // The right half takes the remainder so an odd width does not
            // lose its last pixel column.
            { left: halfWidth, top: 0, width: width - halfWidth, height };

      // Crop in memory and then overwrite the capture: sharp cannot write to
      // the file it is reading from, and this avoids a temp file that would
      // be left behind if the crop failed.
      const cropped = await sharp(filepath).extract(crop).png().toBuffer();
      await writeFile(filepath, cropped);
    }

    return filepath;
  } catch (error) {
    throw new Error(`Screenshot capture failed: ${error}`);
  }
}

/** Formats raw OCR text for the requested output format (unknown formats return the trimmed text). */
function formatOcrText(text: string, format: string): string {
  const trimmed = text.trim();
  switch (format) {
    case "json":
      return JSON.stringify({ content: trimmed });
    case "markdown":
      return `\`\`\`\n${trimmed}\n\`\`\``;
    case "vertical":
      return trimmed.split("\n").join("\n\n");
    case "horizontal":
      return trimmed.replace(/\n/g, " ");
    default:
      return trimmed;
  }
}

/** Runs Tesseract.js (Japanese + English) on the image and always releases the worker. */
async function recognizeWithTesseract(
  imagePath: string,
  format: string,
): Promise<string> {
  console.error("OCR: Creating worker for Japanese and English...");
  const worker = await createWorker("jpn+eng");
  try {
    console.error("OCR: Starting recognition...");
    const {
      data: { text },
    } = await worker.recognize(imagePath);
    console.error("OCR: Recognition completed");
    return formatOcrText(text, format);
  } finally {
    // Terminate even when recognition throws, so the worker is not leaked.
    await worker.terminate();
  }
}

/**
 * Extracts text from an image.
 *
 * Posts the image to the configured OCR API first; if that fails for any
 * reason, falls back to local recognition with Tesseract.js.
 *
 * @param imagePath Path of the image to read.
 * @param format Output format passed to the API / applied to Tesseract text.
 * @returns The recognized text in the requested format.
 * @throws Error if both the API and Tesseract.js fail.
 */
async function performOCR(
  imagePath: string,
  format = "markdown",
): Promise<string> {
  try {
    const formData = new FormData();
    formData.append("file", createReadStream(imagePath), {
      filename: basename(imagePath),
    });

    const response = await axios.post(
      `${API_CONFIG.OCR_API_URL}${API_CONFIG.OCR_API_PATH}?format=${encodeURIComponent(format)}`,
      formData,
      {
        headers: formData.getHeaders(),
      },
    );

    if (response.status !== 200) {
      throw new Error(`OCR API returned status ${response.status}`);
    }

    const content: unknown = response.data?.content;
    if (typeof content !== "string") {
      throw new Error("OCR API response did not contain text content");
    }

    // Remove <br> tags
    return content.replace(/<br\s*\/?>/g, "");
  } catch (error) {
    console.error("OCR API error, falling back to Tesseract.js:", error);

    try {
      return await recognizeWithTesseract(imagePath, format);
    } catch (tesseractError) {
      console.error("Tesseract.js error details:", tesseractError);
      throw new Error(
        `Both OCR API and Tesseract.js failed. API error: ${error instanceof Error ? error.message : String(error)}. Tesseract error: ${tesseractError instanceof Error ? tesseractError.message : String(tesseractError)}`,
      );
    }
  }
}

// Server setup
const server = new Server(
  {
    name: "mcp-screenshot",
    version: "1.0.0",
  },
  {
    capabilities: {
      tools: {},
    },
  },
);

// Tool handlers

// Advertises the single `capture` tool and its JSON-schema input.
server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: [
    {
      name: "capture",
      description:
        "Captures a screenshot of the specified region and performs OCR. " +
        "Options:\n" +
        "- region: 'left'/'right'/'full' (default: 'left')\n" +
        "- format: 'json'/'markdown'/'vertical'/'horizontal' (default: 'markdown')\n" +
        "The screenshot is saved to a dated directory in Downloads.",
      inputSchema: zodToJsonSchema(ScreenshotArgsSchema) as ToolInput,
    },
  ],
}));

// Executes `capture`: validate arguments, take the screenshot, run OCR.
// Failures are reported as an `isError` tool result rather than thrown.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  try {
    const { name, arguments: args } = request.params;

    if (name !== "capture") {
      throw new Error(`Unknown tool: ${name}`);
    }

    const parsed = ScreenshotArgsSchema.safeParse(args);
    if (!parsed.success) {
      throw new Error(`Invalid arguments: ${parsed.error}`);
    }

    console.error(
      `Debug: Starting screenshot capture for region: ${parsed.data.region}, format: ${parsed.data.format}`,
    );

    const imagePath = await takeScreenshot(parsed.data.region);
    console.error(`Debug: Screenshot saved to: ${imagePath}`);

    const ocrText = await performOCR(imagePath, parsed.data.format);
    console.error("Debug: OCR completed");

    return {
      content: [
        {
          type: "text",
          text: `Screenshot saved to: ${imagePath}\n\nOCR Results:\n${ocrText}`,
        },
      ],
    };
  } catch (error) {
    console.error("Error:", error);
    return {
      content: [
        {
          type: "text",
          text: `Error: ${error instanceof Error ? error.message : String(error)}`,
        },
      ],
      isError: true,
    };
  }
});

// Start server
/** Connects the server to a stdio transport. Logging goes to stderr via console.error. */
async function runServer() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error("Screenshot MCP server running on stdio");
}

runServer().catch((error) => {
  console.error("Fatal error running server:", error);
  process.exit(1);
});