Skip to main content
Glama

MCP Screenshot Server

by batteryshark
server.py — 44.8 kB
#!/usr/bin/env python3
"""
MCP Screenshot Server - Capture and analyze screen content
"""
import os
import platform
import subprocess
import base64
import io
from typing import Literal, Optional, List, Dict, Any
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import cv2
from PIL import Image, ImageEnhance, ImageFilter
import pyautogui
from fastmcp import FastMCP
from mcp.types import ImageContent, TextContent
from fastmcp.tools.tool import ToolResult

# Initialize FastMCP server
mcp = FastMCP(
    name="ScreenshotServer",
    instructions="""
    Captures screenshots intelligently with natural language understanding.

    ENHANCED TOOL: Use screenshot_smart_enhanced() for natural language queries like:
    - "what am I watching on YouTube"
    - "show me what I'm working on"
    - "what am I listening to"
    - "show me my conversation"

    This tool understands intent, finds the right window automatically,
    and can auto-zoom into interesting content regions if the initial capture is unclear.

    STANDARD TOOL: Use screenshot_smart() for simple keyword-based context hints.

    FALLBACK TOOLS: Use specific tools only when smart capture isn't sufficient:
    - screenshot_active_window: Current focused window
    - screenshot_window: Specific window by ID
    - screenshot_region: Specific screen coordinates
    - screenshot_full: Last resort for entire screen

    Always prefer intelligent captures that understand user intent over manual window selection.
    """
)


@dataclass
class WindowInfo:
    """Information about a window"""
    id: int     # enumeration index (macOS doesn't provide easy window IDs)
    title: str  # window title as reported by System Events
    app: str    # owning application process name
    bounds: tuple[int, int, int, int]  # x, y, width, height


class ScreenshotCapture:
    """Core screenshot capture functionality"""

    def __init__(self):
        self.system = platform.system()
        # Disable pyautogui failsafe for server use
        pyautogui.FAILSAFE = False
        # Priority apps for intelligent capture (using exact macOS app names)
        self.priority_apps = {
            'browsers': ['Google Chrome', 'Safari', 'Firefox', 'Microsoft Edge', 'Arc', 'Chrome'],
            'media': ['YouTube', 'Netflix', 'VLC', 'QuickTime Player', 'IINA'],
            'development': ['Visual Studio Code', 'Code', 'Cursor', 'Xcode', 'Terminal',
                            'iTerm2', 'PyCharm', 'iTerm 2'],
            'communication': ['Slack', 'Discord', 'Zoom', 'Microsoft Teams', 'Teams', 'Messages']
        }

    def find_priority_window(self, context_hint: Optional[str] = None) -> Optional[WindowInfo]:
        """Find the most relevant window based on intelligent context analysis"""
        # Get all available windows first
        windows = self.list_windows_macos()
        active = self.get_active_window_macos()

        if not context_hint:
            # No context - return best active or priority window
            if active and self._is_relevant_window(active):
                return active
            return self._find_best_priority_window(windows)

        # Parse user intent from context
        intent_category = self._parse_user_intent(context_hint)

        # Smart window matching based on intent
        if intent_category:
            target_window = self._find_window_by_intent(windows, intent_category, context_hint)
            if target_window:
                return target_window

        # Direct keyword matching (existing behavior)
        context_lower = context_hint.lower()

        # Check active window first if it matches
        if active and self._window_matches_context(active, context_lower):
            return active

        # Search all windows for matches
        for window in windows:
            if self._window_matches_context(window, context_lower):
                return window

        # Fallback to best available window
        if active and self._is_relevant_window(active):
            return active
        return self._find_best_priority_window(windows)
def _parse_user_intent(self, context: str) -> Optional[str]: """Parse user intent from natural language context""" context_lower = context.lower() # Intent patterns for semantic understanding intent_patterns = { 'media_consumption': [ 'watching', 'listening', 'playing', 'streaming', 'video', 'music', 'youtube', 'netflix', 'spotify', 'twitch', 'hulu', 'prime video', 'what am i watching', 'what am i listening', 'what\'s playing' ], 'development': [ 'coding', 'programming', 'debugging', 'terminal', 'editor', 'cursor', 'vscode', 'code', 'git', 'github', 'what am i coding', 'what am i working on', 'development', 'project' ], 'communication': [ 'chatting', 'messaging', 'meeting', 'call', 'slack', 'discord', 'teams', 'zoom', 'messages', 'who am i talking to', 'conversation' ], 'browsing': [ 'browsing', 'reading', 'website', 'web', 'chrome', 'safari', 'firefox', 'browser', 'what am i reading', 'what site', 'webpage' ] } # Find matching intent category for intent, keywords in intent_patterns.items(): if any(keyword in context_lower for keyword in keywords): return intent return None def _find_window_by_intent(self, windows: List[WindowInfo], intent: str, context: str) -> Optional[WindowInfo]: """Find window matching user intent category""" # App mappings for each intent intent_apps = { 'media_consumption': [ 'Google Chrome', 'Safari', 'Firefox', 'YouTube', 'Netflix', 'Spotify', 'VLC', 'QuickTime Player', 'IINA', 'Twitch' ], 'development': [ 'Cursor', 'Visual Studio Code', 'Code', 'Xcode', 'Terminal', 'iTerm2', 'PyCharm', 'GitHub Desktop', 'GitKraken' ], 'communication': [ 'Slack', 'Discord', 'Zoom', 'Microsoft Teams', 'Teams', 'Messages', 'WhatsApp', 'Telegram' ], 'browsing': [ 'Google Chrome', 'Safari', 'Firefox', 'Microsoft Edge', 'Arc' ] } target_apps = intent_apps.get(intent, []) # For media consumption, prioritize windows with video/music indicators if intent == 'media_consumption': media_indicators = ['youtube', 'netflix', 'spotify', 'video', 'music', 'playing', 
'stream'] # Look for windows with media indicators in title for window in windows: if any(indicator in window.title.lower() for indicator in media_indicators): if any(app.lower() in window.app.lower() for app in target_apps): return window # Find windows matching target apps for this intent for window in windows: if any(app.lower() in window.app.lower() for app in target_apps): if self._is_relevant_window(window): return window return None def _window_matches_context(self, window: WindowInfo, context_lower: str) -> bool: """Check if window matches the given context""" return (context_lower in window.title.lower() or context_lower in window.app.lower() or self._app_matches_context(window.app, context_lower)) def _app_matches_context(self, app_name: str, context_lower: str) -> bool: """Check if app matches context with smart mapping""" app_lower = app_name.lower() # Smart app context mapping context_mappings = { 'youtube': ['chrome', 'safari', 'firefox'], 'netflix': ['chrome', 'safari', 'firefox'], 'browser': ['chrome', 'safari', 'firefox', 'edge', 'arc'], 'code': ['cursor', 'visual studio code', 'xcode', 'pycharm'], 'terminal': ['terminal', 'iterm'], 'music': ['spotify', 'apple music', 'youtube'], 'video': ['vlc', 'quicktime', 'iina'] } for keyword, apps in context_mappings.items(): if keyword in context_lower: if any(app in app_lower for app in apps): return True return False def _find_best_priority_window(self, windows: List[WindowInfo]) -> Optional[WindowInfo]: """Find the best priority window from available windows""" if not windows: return None # Look for priority apps in order for category, apps in self.priority_apps.items(): for app in apps: for window in windows: if app.lower() in window.app.lower() and self._is_relevant_window(window): return window # Fallback to largest non-system window relevant_windows = [w for w in windows if self._is_relevant_window(w)] if relevant_windows: return max(relevant_windows, key=lambda w: w.bounds[2] * w.bounds[3]) return 
None def _is_relevant_window(self, window: WindowInfo) -> bool: """Check if window is worth capturing (not tiny, not system)""" x, y, w, h = window.bounds # Skip tiny windows if w < 200 or h < 100: return False # Skip system/background apps system_apps = ['Finder', 'System Preferences', 'Activity Monitor', 'Console', 'Keychain Access', 'System Information'] if any(sys_app in window.app for sys_app in system_apps): return False # Skip empty/generic titles if not window.title or window.title in ['', ' ', 'Window']: return False return True def capture_screen(self, region: Optional[tuple[int, int, int, int]] = None) -> Image.Image: """Capture screen or region using best available method""" # pyautogui works reliably for region capture, use it as primary method return self._capture_crossplatform(region) def _capture_macos(self, region: Optional[tuple[int, int, int, int]] = None) -> Image.Image: """Use macOS screencapture command for best quality""" try: import tempfile import time # Use a unique temporary file to avoid conflicts with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file: temp_path = temp_file.name cmd = ["screencapture", "-x", temp_path] if region: x, y, w, h = region # screencapture -R expects: -Rx,y,width,height (no space between -R and coordinates) cmd.append(f"-R{x},{y},{w},{h}") result = subprocess.run(cmd, check=True, capture_output=True, text=True) # Small delay to ensure file is written time.sleep(0.1) if not os.path.exists(temp_path): raise FileNotFoundError(f"Screenshot file not created: {temp_path}") image = Image.open(temp_path) os.unlink(temp_path) return image except (subprocess.CalledProcessError, FileNotFoundError) as e: print(f"screencapture failed: {e}") # Fallback to cross-platform method return self._capture_crossplatform(region) def _capture_crossplatform(self, region: Optional[tuple[int, int, int, int]] = None) -> Image.Image: """Cross-platform capture using pyautogui""" if region: x, y, w, h = region return 
pyautogui.screenshot(region=(x, y, w, h)) else: return pyautogui.screenshot() def get_active_window_macos(self) -> Optional[WindowInfo]: """Get active window info on macOS""" try: script = ''' tell application "System Events" set frontApp to first application process whose frontmost is true set appName to name of frontApp set frontWindow to first window of frontApp set windowTitle to name of frontWindow set {x, y} to position of frontWindow set {w, h} to size of frontWindow return appName & "|" & windowTitle & "|" & x & "|" & y & "|" & w & "|" & h end tell ''' result = subprocess.run( ["osascript", "-e", script], capture_output=True, text=True, check=True ) parts = result.stdout.strip().split("|") if len(parts) == 6: app, title, x, y, w, h = parts return WindowInfo( id=0, # macOS doesn't provide easy window IDs title=title, app=app, bounds=(int(x), int(y), int(w), int(h)) ) except (subprocess.CalledProcessError, FileNotFoundError): pass return None def list_windows_macos(self) -> List[WindowInfo]: """List all windows on macOS using the same reliable AppleScript approach as test_enum.py""" try: # Use the same clean AppleScript approach as your test_enum.py script = ''' set window_list to {} set output to "" try tell application "System Events" set all_processes to every application process where background only is false repeat with proc in all_processes set app_name to name of proc try repeat with w in every window of proc set window_title to name of w if window_title is not "" then set window_pos to position of w set window_size to size of w set window_info to app_name & "|" & window_title & "|" & (item 1 of window_pos) & "|" & (item 2 of window_pos) & "|" & (item 1 of window_size) & "|" & (item 2 of window_size) set output to output & window_info & linefeed end if end repeat on error -- Ignore processes that might not have window properties end try end repeat end tell on error errMsg return "Error: " & errMsg end try return output ''' result = subprocess.run( 
["osascript", "-e", script], capture_output=True, text=True, check=True ) windows = [] if result.stdout.strip(): lines = result.stdout.strip().split('\n') for i, line in enumerate(lines): if line.strip() and "|" in line: parts = line.strip().split("|") if len(parts) >= 6: try: app, title, x, y, w, h = parts[:6] windows.append(WindowInfo( id=i, title=title.strip(), app=app.strip(), bounds=(int(x), int(y), int(w), int(h)) )) except (ValueError, IndexError): continue return windows except subprocess.CalledProcessError as e: print(f"⚠️ Window listing failed. Error: {e}") return [] except Exception as e: print(f"⚠️ Window listing error: {e}") return [] class ImageProcessor: """Image enhancement and processing""" @staticmethod def detect_content_regions(image: Image.Image) -> List[tuple[int, int, int, int]]: """Detect interesting content regions in image for auto-zoom""" import cv2 import numpy as np # Convert PIL to OpenCV format img_array = np.array(image) if len(img_array.shape) == 3: img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY) else: img_gray = img_array # Detect edges to find content areas edges = cv2.Canny(img_gray, 50, 150) # Find contours contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) regions = [] min_area = (image.width * image.height) * 0.05 # At least 5% of image max_area = (image.width * image.height) * 0.8 # At most 80% of image for contour in contours: x, y, w, h = cv2.boundingRect(contour) area = w * h # Filter by size and aspect ratio if (min_area < area < max_area and w > 100 and h > 100 and # Minimum meaningful size 0.3 < w/h < 3.0): # Reasonable aspect ratio regions.append((x, y, w, h)) # Sort by area (largest first) and return top 3 regions.sort(key=lambda r: r[2] * r[3], reverse=True) return regions[:3] @staticmethod def is_image_clear(image: Image.Image, threshold: float = 100.0) -> bool: """Check if image has sufficient detail/clarity""" import cv2 import numpy as np # Convert to grayscale img_array = 
np.array(image.convert('L')) # Calculate Laplacian variance (measure of blur/clarity) laplacian = cv2.Laplacian(img_array, cv2.CV_64F) variance = laplacian.var() return variance > threshold @staticmethod def get_quality_scale(mode: Literal["overview", "readable", "detail"], image_width: int = 0) -> float: """Get scale factor for quality mode, adjusted for large images""" base_scales = { "overview": 0.4, "readable": 0.8, "detail": 1.0 } scale = base_scales[mode] # For very wide images (ultra-wide monitors), scale down more aggressively if image_width > 2000: # Ultra-wide territory scale *= 0.6 # Additional 40% reduction elif image_width > 1600: # Large screens scale *= 0.75 # Additional 25% reduction return min(scale, 1.0) # Cap at 1.0 @staticmethod def enhance_for_text(image: Image.Image) -> Image.Image: """Enhance image for better text readability""" # Convert to numpy array for OpenCV processing img_array = np.array(image) # Apply sharpening kernel kernel = np.array([[-1,-1,-1], [-1,9,-1], [-1,-1,-1]]) sharpened = cv2.filter2D(img_array, -1, kernel) # Convert back to PIL for further processing enhanced = Image.fromarray(sharpened) # Enhance contrast enhancer = ImageEnhance.Contrast(enhanced) enhanced = enhancer.enhance(1.2) # Slight sharpening with PIL enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3)) return enhanced @staticmethod def process_image( image: Image.Image, quality_mode: Literal["overview", "readable", "detail"], enhance_text: bool, format: Literal["png", "jpeg"] = "png" ) -> bytes: """Process image with quality and enhancement settings""" # Apply text enhancement if requested if enhance_text: image = ImageProcessor.enhance_for_text(image) # Scale image based on quality mode and image size scale = ImageProcessor.get_quality_scale(quality_mode, image.width) if scale < 1.0: new_size = (int(image.width * scale), int(image.height * scale)) # Use LANCZOS for high-quality downscaling image = image.resize(new_size, 
Image.Resampling.LANCZOS) # Convert to bytes buffer = io.BytesIO() if format == "jpeg": # Convert RGBA to RGB for JPEG if image.mode in ("RGBA", "LA", "P"): background = Image.new("RGB", image.size, (255, 255, 255)) if image.mode == "P": image = image.convert("RGBA") background.paste(image, mask=image.split()[-1] if image.mode == "RGBA" else None) image = background image.save(buffer, format="JPEG", quality=85, optimize=True) else: image.save(buffer, format="PNG", optimize=True) return buffer.getvalue() # Initialize capture and processor instances capture = ScreenshotCapture() processor = ImageProcessor() @mcp.tool def check_permissions() -> dict: """Check what macOS permissions are available and what functionality works.""" permissions = { "platform": platform.system(), "screen_recording": False, "accessibility": False, "working_features": [], "missing_features": [], "instructions": [] } if platform.system() != "Darwin": permissions["note"] = "Permission checks only apply to macOS" return permissions # Test screen recording (try a small capture) try: test_image = capture.capture_screen(region=(0, 0, 100, 100)) if test_image and test_image.width > 0: permissions["screen_recording"] = True permissions["working_features"].extend([ "screenshot_smart", "screenshot_active_window", "screenshot_full", "screenshot_region" ]) except Exception: permissions["missing_features"].extend([ "All screenshot functionality" ]) permissions["instructions"].append( "Enable Screen Recording: System Preferences > Security & Privacy > Privacy > Screen Recording" ) # Test accessibility (try to list windows) try: windows = capture.list_windows_macos() if windows: permissions["accessibility"] = True permissions["working_features"].extend([ "list_windows", "screenshot_window (with window ID)" ]) else: # Try to determine if it's a permission issue script = 'tell application "System Events" to get name of first application process' result = subprocess.run(["osascript", "-e", script], 
capture_output=True, text=True) if "assistive access" in result.stderr or "-25211" in result.stderr: permissions["missing_features"].extend([ "list_windows", "screenshot_window (selective)" ]) permissions["instructions"].append( "Enable Accessibility: System Preferences > Security & Privacy > Privacy > Accessibility" ) except Exception as e: if "assistive access" in str(e) or "-25211" in str(e): permissions["missing_features"].extend([ "list_windows", "screenshot_window (selective)" ]) permissions["instructions"].append( "Enable Accessibility: System Preferences > Security & Privacy > Privacy > Accessibility" ) # Add summary if permissions["screen_recording"] and not permissions["accessibility"]: permissions["summary"] = "Core screenshot functionality works. Window listing needs Accessibility permission." elif permissions["screen_recording"] and permissions["accessibility"]: permissions["summary"] = "All functionality available! 🎉" elif not permissions["screen_recording"]: permissions["summary"] = "Screen Recording permission required for basic functionality." else: permissions["summary"] = "Permission status unclear." return permissions @mcp.tool def screenshot_smart_enhanced( query: Optional[str] = None, auto_zoom: bool = True, quality_mode: Literal["overview", "readable", "detail"] = "readable", enhance_text: bool = True, format: Literal["png", "jpeg"] = "png" ) -> ToolResult: """Enhanced smart screenshot with natural language understanding and auto-zoom. 
Args: query: Natural language query like 'what am I watching on YouTube' or 'show me my code' auto_zoom: Automatically capture focused regions if initial screenshot is unclear quality_mode: Image quality mode enhance_text: Apply text enhancement format: Image format """ try: # Find the best window using enhanced logic target_window = capture.find_priority_window(query) if not target_window: return ToolResult( content=[TextContent(type="text", text="No suitable window found for your query")] ) # Activate the target window try: activate_script = f''' tell application "System Events" tell application process "{target_window.app}" set frontmost to true tell window "{target_window.title}" perform action "AXRaise" end tell end tell end tell ''' subprocess.run( ["osascript", "-e", activate_script], capture_output=True, text=True, check=True ) import time time.sleep(0.3) # Wait for window activation except Exception: pass # Continue anyway # Capture the window x, y, w, h = target_window.bounds image = capture.capture_screen(region=(x, y, w, h)) # Check if auto-zoom is needed and enabled if auto_zoom and not processor.is_image_clear(image): # Try to find interesting regions to zoom into regions = processor.detect_content_regions(image) if regions: # Use the largest interesting region region_x, region_y, region_w, region_h = regions[0] # Adjust coordinates to absolute screen coordinates abs_x = x + region_x abs_y = y + region_y # Capture the focused region zoom_image = capture.capture_screen(region=(abs_x, abs_y, region_w, region_h)) # Use zoomed image if it's clearer if processor.is_image_clear(zoom_image): image = zoom_image zoom_info = f" (auto-zoomed to {region_w}x{region_h} region)" else: zoom_info = "" else: zoom_info = "" else: zoom_info = "" # Process the final image image_bytes = processor.process_image(image, quality_mode, enhance_text, format) image_b64 = base64.b64encode(image_bytes).decode() # Determine response message based on query understanding if query: 
intent = capture._parse_user_intent(query) if intent == 'media_consumption': response_msg = f"Here's what you're watching/listening to: {target_window.app} - {target_window.title}" elif intent == 'development': response_msg = f"Here's what you're working on: {target_window.app} - {target_window.title}" elif intent == 'communication': response_msg = f"Here's your conversation: {target_window.app} - {target_window.title}" else: response_msg = f"Smart capture: {target_window.app} - {target_window.title}" else: response_msg = f"Smart capture: {target_window.app} - {target_window.title}" response_msg += f" ({image.width}x{image.height}, {quality_mode} quality{zoom_info})" mime_type = f"image/{format}" return ToolResult( content=[ ImageContent(type="image", data=image_b64, mimeType=mime_type), TextContent(type="text", text=response_msg) ] ) except Exception as e: return ToolResult( content=[TextContent(type="text", text=f"Enhanced smart capture failed: {str(e)}")] ) @mcp.tool def screenshot_smart( context: Optional[str] = None, quality_mode: Literal["overview", "readable", "detail"] = "readable", enhance_text: bool = True, format: Literal["png", "jpeg"] = "png" ) -> ToolResult: """Smart screenshot that finds the most relevant window automatically. 
Args: context: Optional hint about what to look for (e.g., 'youtube', 'browser', 'code') quality_mode: Image quality mode enhance_text: Apply text enhancement format: Image format """ try: # Find the best window to capture target_window = capture.find_priority_window(context) if target_window: # Activate the priority window first for clean capture try: activate_script = f''' tell application "System Events" tell application process "{target_window.app}" set frontmost to true tell window "{target_window.title}" perform action "AXRaise" end tell end tell end tell ''' subprocess.run( ["osascript", "-e", activate_script], capture_output=True, text=True, check=True ) # Small delay to let window come to front import time time.sleep(0.2) except Exception: # Continue anyway, might still work pass # Capture the priority window x, y, w, h = target_window.bounds image = capture.capture_screen(region=(x, y, w, h)) # Process image image_bytes = processor.process_image(image, quality_mode, enhance_text, format) image_b64 = base64.b64encode(image_bytes).decode() mime_type = f"image/{format}" return ToolResult( content=[ ImageContent(type="image", data=image_b64, mimeType=mime_type), TextContent( type="text", text=f"Smart capture (activated): {target_window.app} - {target_window.title} ({w}x{h}, {quality_mode} quality)" ) ] ) else: # Fallback to active window if capture.system == "Darwin": active_window = capture.get_active_window_macos() if active_window: x, y, w, h = active_window.bounds image = capture.capture_screen(region=(x, y, w, h)) image_bytes = processor.process_image(image, quality_mode, enhance_text, format) image_b64 = base64.b64encode(image_bytes).decode() mime_type = f"image/{format}" return ToolResult( content=[ ImageContent(type="image", data=image_b64, mimeType=mime_type), TextContent( type="text", text=f"Active window fallback: {active_window.app} - {active_window.title} ({w}x{h}, {quality_mode} quality)" ) ] ) # Last resort: small region of screen center 
screen_image = capture.capture_screen() center_x = screen_image.width // 2 center_y = screen_image.height // 2 region_size = min(1200, screen_image.width // 2, screen_image.height // 2) x = center_x - region_size // 2 y = center_y - region_size // 2 image = capture.capture_screen(region=(x, y, region_size, region_size)) image_bytes = processor.process_image(image, quality_mode, enhance_text, format) image_b64 = base64.b64encode(image_bytes).decode() mime_type = f"image/{format}" return ToolResult( content=[ ImageContent(type="image", data=image_b64, mimeType=mime_type), TextContent( type="text", text=f"Center region capture: {region_size}x{region_size} from screen center ({quality_mode} quality)" ) ] ) except Exception as e: return ToolResult( content=[TextContent(type="text", text=f"Smart capture failed: {str(e)}")] ) @mcp.tool def screenshot_full( quality_mode: Literal["overview", "readable", "detail"] = "overview", enhance_text: bool = True, format: Literal["png", "jpeg"] = "png" ) -> ToolResult: """Capture entire desktop/screen with quality and enhancement options.""" try: # Capture full screen image = capture.capture_screen() # Process image image_bytes = processor.process_image(image, quality_mode, enhance_text, format) # Encode as base64 image_b64 = base64.b64encode(image_bytes).decode() # Return as ImageContent mime_type = f"image/{format}" return ToolResult( content=[ ImageContent(type="image", data=image_b64, mimeType=mime_type), TextContent( type="text", text=f"Full screen captured ({image.width}x{image.height}, {quality_mode} quality)" ) ] ) except Exception as e: return ToolResult( content=[TextContent(type="text", text=f"Screenshot failed: {str(e)}")] ) @mcp.tool def screenshot_active_window( quality_mode: Literal["overview", "readable", "detail"] = "readable", enhance_text: bool = True, format: Literal["png", "jpeg"] = "png" ) -> ToolResult: """Capture currently focused/active window.""" try: if capture.system == "Darwin": window_info = 
capture.get_active_window_macos() if not window_info: return ToolResult( content=[TextContent(type="text", text="Could not detect active window")] ) # Capture window region x, y, w, h = window_info.bounds image = capture.capture_screen(region=(x, y, w, h)) # Process image image_bytes = processor.process_image(image, quality_mode, enhance_text, format) image_b64 = base64.b64encode(image_bytes).decode() mime_type = f"image/{format}" return ToolResult( content=[ ImageContent(type="image", data=image_b64, mimeType=mime_type), TextContent( type="text", text=f"Active window captured: {window_info.app} - {window_info.title} ({w}x{h}, {quality_mode} quality)" ) ] ) else: return ToolResult( content=[TextContent(type="text", text="Active window capture not yet implemented for this platform")] ) except Exception as e: return ToolResult( content=[TextContent(type="text", text=f"Active window capture failed: {str(e)}")] ) @mcp.tool def screenshot_region( x: int, y: int, width: int, height: int, quality_mode: Literal["overview", "readable", "detail"] = "detail", enhance_text: bool = True, format: Literal["png", "jpeg"] = "png" ) -> ToolResult: """Capture specific rectangular area of screen.""" try: # Validate coordinates if width <= 0 or height <= 0: return ToolResult( content=[TextContent(type="text", text="Width and height must be positive")] ) # Capture region image = capture.capture_screen(region=(x, y, width, height)) # Process image image_bytes = processor.process_image(image, quality_mode, enhance_text, format) image_b64 = base64.b64encode(image_bytes).decode() mime_type = f"image/{format}" return ToolResult( content=[ ImageContent(type="image", data=image_b64, mimeType=mime_type), TextContent( type="text", text=f"Region captured: ({x},{y}) {width}x{height} ({quality_mode} quality)" ) ] ) except Exception as e: return ToolResult( content=[TextContent(type="text", text=f"Region capture failed: {str(e)}")] ) @mcp.tool def list_windows() -> dict: """Get list of all open 
windows with IDs, titles, and bounds.""" try: if capture.system == "Darwin": windows = capture.list_windows_macos() return { "windows": [ { "id": w.id, "title": w.title, "app": w.app, "bounds": list(w.bounds) # [x, y, width, height] } for w in windows ], "count": len(windows), "platform": "macOS" } else: return { "windows": [], "count": 0, "platform": capture.system, "error": "Window listing not yet implemented for this platform" } except Exception as e: return { "windows": [], "count": 0, "error": f"Failed to list windows: {str(e)}" } @mcp.tool def activate_window(window_id: int) -> dict: """Activate/focus a specific window by bringing it to front.""" try: if capture.system == "Darwin": windows = capture.list_windows_macos() # Find window by ID target_window = None for window in windows: if window.id == window_id: target_window = window break if not target_window: return {"success": False, "error": f"Window with ID {window_id} not found"} # AppleScript to activate the window script = f''' tell application "System Events" tell application process "{target_window.app}" set frontmost to true tell window "{target_window.title}" perform action "AXRaise" end tell end tell end tell ''' subprocess.run( ["osascript", "-e", script], capture_output=True, text=True, check=True ) return { "success": True, "activated_window": { "app": target_window.app, "title": target_window.title, "id": window_id } } else: return {"success": False, "error": "Window activation not implemented for this platform"} except Exception as e: return {"success": False, "error": f"Failed to activate window: {str(e)}"} @mcp.tool def screenshot_window( window_id: int, quality_mode: Literal["overview", "readable", "detail"] = "readable", enhance_text: bool = True, format: Literal["png", "jpeg"] = "png" ) -> ToolResult: """Capture specific window by ID from list_windows. 
Automatically activates the window first to ensure clean capture.""" try: if capture.system == "Darwin": windows = capture.list_windows_macos() # Find window by ID target_window = None for window in windows: if window.id == window_id: target_window = window break if not target_window: return ToolResult( content=[TextContent(type="text", text=f"Window with ID {window_id} not found")] ) # Activate window first to ensure clean capture try: activate_script = f''' tell application "System Events" tell application process "{target_window.app}" set frontmost to true tell window "{target_window.title}" perform action "AXRaise" end tell end tell end tell ''' subprocess.run( ["osascript", "-e", activate_script], capture_output=True, text=True, check=True ) # Small delay to let window come to front import time time.sleep(0.2) except Exception as e: # Continue anyway, might still work pass # Capture window region x, y, w, h = target_window.bounds image = capture.capture_screen(region=(x, y, w, h)) # Process image image_bytes = processor.process_image(image, quality_mode, enhance_text, format) image_b64 = base64.b64encode(image_bytes).decode() mime_type = f"image/{format}" return ToolResult( content=[ ImageContent(type="image", data=image_b64, mimeType=mime_type), TextContent( type="text", text=f"Window captured (activated): {target_window.app} - {target_window.title} ({w}x{h}, {quality_mode} quality)" ) ] ) else: return ToolResult( content=[TextContent(type="text", text="Window capture not yet implemented for this platform")] ) except Exception as e: return ToolResult( content=[TextContent(type="text", text=f"Window capture failed: {str(e)}")] ) def main(): # Support both stdio and HTTP transport mcp_host = os.getenv("HOST", "127.0.0.1") mcp_port = os.getenv("PORT", None) if mcp_port: mcp.run(port=int(mcp_port), host=mcp_host, transport="streamable-http") else: mcp.run() if __name__ == "__main__": main()

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/batteryshark/mcp-screenshot'

If you have feedback or need assistance with the MCP directory API, please join our Discord server