# server.py — MCP Screenshot Server
#!/usr/bin/env python3
"""
MCP Screenshot Server - Capture and analyze screen content
"""
import os
import platform
import subprocess
import base64
import io
from typing import Literal, Optional, List, Dict, Any
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import cv2
from PIL import Image, ImageEnhance, ImageFilter
import pyautogui
from fastmcp import FastMCP
from mcp.types import ImageContent, TextContent
from fastmcp.tools.tool import ToolResult
# Initialize FastMCP server
mcp = FastMCP(
name="ScreenshotServer",
instructions="""
Captures screenshots intelligently with natural language understanding.
ENHANCED TOOL: Use screenshot_smart_enhanced() for natural language queries like:
- "what am I watching on YouTube"
- "show me what I'm working on"
- "what am I listening to"
- "show me my conversation"
This tool understands intent, finds the right window automatically, and can auto-zoom
into interesting content regions if the initial capture is unclear.
STANDARD TOOL: Use screenshot_smart() for simple keyword-based context hints.
FALLBACK TOOLS: Use specific tools only when smart capture isn't sufficient:
- screenshot_active_window: Current focused window
- screenshot_window: Specific window by ID
- screenshot_region: Specific screen coordinates
- screenshot_full: Last resort for entire screen
Always prefer intelligent captures that understand user intent over manual window selection.
"""
)
@dataclass
class WindowInfo:
    """Information about a window"""
    id: int  # enumeration index assigned by list_windows_macos (macOS exposes no stable native ID here)
    title: str  # window title as reported by System Events
    app: str  # owning application's process name
    bounds: tuple[int, int, int, int]  # x, y, width, height
class ScreenshotCapture:
    """Core screenshot capture functionality.

    Owns platform detection, intent-based window selection, and pixel
    capture (pyautogui everywhere, with a native macOS `screencapture`
    path available as an alternative).
    """

    def __init__(self):
        self.system = platform.system()
        # Disable pyautogui failsafe for server use (the mouse-in-corner
        # abort would kill captures triggered remotely).
        pyautogui.FAILSAFE = False
        # Priority apps for intelligent capture (using exact macOS app names)
        self.priority_apps = {
            'browsers': ['Google Chrome', 'Safari', 'Firefox', 'Microsoft Edge', 'Arc', 'Chrome'],
            'media': ['YouTube', 'Netflix', 'VLC', 'QuickTime Player', 'IINA'],
            'development': ['Visual Studio Code', 'Code', 'Cursor', 'Xcode', 'Terminal', 'iTerm2', 'PyCharm', 'iTerm 2'],
            'communication': ['Slack', 'Discord', 'Zoom', 'Microsoft Teams', 'Teams', 'Messages']
        }

    def find_priority_window(self, context_hint: Optional[str] = None) -> Optional[WindowInfo]:
        """Find the most relevant window based on intelligent context analysis.

        Resolution order: parsed-intent match -> direct keyword match ->
        relevant active window -> best priority-app window -> None.
        """
        # Get all available windows first
        windows = self.list_windows_macos()
        active = self.get_active_window_macos()
        if not context_hint:
            # No context - return best active or priority window
            if active and self._is_relevant_window(active):
                return active
            return self._find_best_priority_window(windows)
        # Parse user intent from context
        intent_category = self._parse_user_intent(context_hint)
        # Smart window matching based on intent
        if intent_category:
            target_window = self._find_window_by_intent(windows, intent_category, context_hint)
            if target_window:
                return target_window
        # Direct keyword matching (existing behavior)
        context_lower = context_hint.lower()
        # Check active window first if it matches
        if active and self._window_matches_context(active, context_lower):
            return active
        # Search all windows for matches
        for window in windows:
            if self._window_matches_context(window, context_lower):
                return window
        # Fallback to best available window
        if active and self._is_relevant_window(active):
            return active
        return self._find_best_priority_window(windows)

    def _parse_user_intent(self, context: str) -> Optional[str]:
        """Map a natural-language context string to an intent category.

        Returns 'media_consumption', 'development', 'communication',
        'browsing', or None. Categories are tested in declaration order,
        so media keywords win when a phrase matches several.
        """
        context_lower = context.lower()
        # Intent patterns for semantic understanding
        intent_patterns = {
            'media_consumption': [
                'watching', 'listening', 'playing', 'streaming', 'video', 'music',
                'youtube', 'netflix', 'spotify', 'twitch', 'hulu', 'prime video',
                'what am i watching', 'what am i listening', 'what\'s playing'
            ],
            'development': [
                'coding', 'programming', 'debugging', 'terminal', 'editor',
                'cursor', 'vscode', 'code', 'git', 'github', 'what am i coding',
                'what am i working on', 'development', 'project'
            ],
            'communication': [
                'chatting', 'messaging', 'meeting', 'call', 'slack', 'discord',
                'teams', 'zoom', 'messages', 'who am i talking to', 'conversation'
            ],
            'browsing': [
                'browsing', 'reading', 'website', 'web', 'chrome', 'safari',
                'firefox', 'browser', 'what am i reading', 'what site', 'webpage'
            ]
        }
        # Find matching intent category
        for intent, keywords in intent_patterns.items():
            if any(keyword in context_lower for keyword in keywords):
                return intent
        return None

    def _find_window_by_intent(self, windows: List[WindowInfo], intent: str, context: str) -> Optional[WindowInfo]:
        """Find a window matching the given intent category, or None."""
        # App mappings for each intent
        intent_apps = {
            'media_consumption': [
                'Google Chrome', 'Safari', 'Firefox', 'YouTube', 'Netflix',
                'Spotify', 'VLC', 'QuickTime Player', 'IINA', 'Twitch'
            ],
            'development': [
                'Cursor', 'Visual Studio Code', 'Code', 'Xcode', 'Terminal',
                'iTerm2', 'PyCharm', 'GitHub Desktop', 'GitKraken'
            ],
            'communication': [
                'Slack', 'Discord', 'Zoom', 'Microsoft Teams', 'Teams',
                'Messages', 'WhatsApp', 'Telegram'
            ],
            'browsing': [
                'Google Chrome', 'Safari', 'Firefox', 'Microsoft Edge', 'Arc'
            ]
        }
        target_apps = intent_apps.get(intent, [])
        # For media consumption, prioritize windows whose titles carry
        # explicit playback indicators (e.g. a YouTube tab title).
        if intent == 'media_consumption':
            media_indicators = ['youtube', 'netflix', 'spotify', 'video', 'music', 'playing', 'stream']
            for window in windows:
                if any(indicator in window.title.lower() for indicator in media_indicators):
                    if any(app.lower() in window.app.lower() for app in target_apps):
                        return window
        # Otherwise any relevant window of a matching app will do
        for window in windows:
            if any(app.lower() in window.app.lower() for app in target_apps):
                if self._is_relevant_window(window):
                    return window
        return None

    def _window_matches_context(self, window: WindowInfo, context_lower: str) -> bool:
        """Check if window title/app matches the given (lowercased) context."""
        return (context_lower in window.title.lower() or
                context_lower in window.app.lower() or
                self._app_matches_context(window.app, context_lower))

    def _app_matches_context(self, app_name: str, context_lower: str) -> bool:
        """Check if app matches context via keyword -> app-family mapping
        (e.g. 'youtube' maps to the browsers that would host it)."""
        app_lower = app_name.lower()
        # Smart app context mapping
        context_mappings = {
            'youtube': ['chrome', 'safari', 'firefox'],
            'netflix': ['chrome', 'safari', 'firefox'],
            'browser': ['chrome', 'safari', 'firefox', 'edge', 'arc'],
            'code': ['cursor', 'visual studio code', 'xcode', 'pycharm'],
            'terminal': ['terminal', 'iterm'],
            'music': ['spotify', 'apple music', 'youtube'],
            'video': ['vlc', 'quicktime', 'iina']
        }
        for keyword, apps in context_mappings.items():
            if keyword in context_lower:
                if any(app in app_lower for app in apps):
                    return True
        return False

    def _find_best_priority_window(self, windows: List[WindowInfo]) -> Optional[WindowInfo]:
        """Pick the best window: first priority-app hit, else largest relevant."""
        if not windows:
            return None
        # Look for priority apps in order
        for category, apps in self.priority_apps.items():
            for app in apps:
                for window in windows:
                    if app.lower() in window.app.lower() and self._is_relevant_window(window):
                        return window
        # Fallback to largest non-system window (by area)
        relevant_windows = [w for w in windows if self._is_relevant_window(w)]
        if relevant_windows:
            return max(relevant_windows, key=lambda w: w.bounds[2] * w.bounds[3])
        return None

    def _is_relevant_window(self, window: WindowInfo) -> bool:
        """Check if window is worth capturing (not tiny, not system)."""
        x, y, w, h = window.bounds
        # Skip tiny windows
        if w < 200 or h < 100:
            return False
        # Skip system/background apps
        system_apps = ['Finder', 'System Preferences', 'Activity Monitor',
                       'Console', 'Keychain Access', 'System Information']
        if any(sys_app in window.app for sys_app in system_apps):
            return False
        # Skip empty/generic titles
        if not window.title or window.title in ['', ' ', 'Window']:
            return False
        return True

    def capture_screen(self, region: Optional[tuple[int, int, int, int]] = None) -> Image.Image:
        """Capture screen or region using best available method."""
        # pyautogui works reliably for region capture, use it as primary method
        return self._capture_crossplatform(region)

    def _capture_macos(self, region: Optional[tuple[int, int, int, int]] = None) -> Image.Image:
        """Use macOS `screencapture` for best quality; fall back to pyautogui.

        Currently unused by capture_screen() but kept as the native path.
        """
        try:
            import tempfile
            import time
            # Use a unique temporary file to avoid conflicts
            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                temp_path = temp_file.name
            try:
                # Flags must precede the output path on the screencapture
                # command line; -R takes x,y,width,height with no space.
                cmd = ["screencapture", "-x"]
                if region:
                    x, y, w, h = region
                    cmd.append(f"-R{x},{y},{w},{h}")
                cmd.append(temp_path)
                subprocess.run(cmd, check=True, capture_output=True, text=True)
                # Small delay to ensure file is written
                time.sleep(0.1)
                if not os.path.exists(temp_path):
                    raise FileNotFoundError(f"Screenshot file not created: {temp_path}")
                image = Image.open(temp_path)
                # Force pixel data into memory before the backing file is removed
                image.load()
                return image
            finally:
                # Always clean up the temp file, even when capture failed
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            print(f"screencapture failed: {e}")
            # Fallback to cross-platform method
            return self._capture_crossplatform(region)

    def _capture_crossplatform(self, region: Optional[tuple[int, int, int, int]] = None) -> Image.Image:
        """Cross-platform capture using pyautogui."""
        if region:
            x, y, w, h = region
            return pyautogui.screenshot(region=(x, y, w, h))
        else:
            return pyautogui.screenshot()

    def get_active_window_macos(self) -> Optional[WindowInfo]:
        """Get active window info on macOS, or None on failure."""
        try:
            script = '''
            tell application "System Events"
                set frontApp to first application process whose frontmost is true
                set appName to name of frontApp
                set frontWindow to first window of frontApp
                set windowTitle to name of frontWindow
                set {x, y} to position of frontWindow
                set {w, h} to size of frontWindow
                return appName & "|" & windowTitle & "|" & x & "|" & y & "|" & w & "|" & h
            end tell
            '''
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                check=True
            )
            parts = result.stdout.strip().split("|")
            # Titles may themselves contain "|"; the last four fields are
            # always x, y, w, h, so rejoin everything in between as the title.
            if len(parts) >= 6:
                app = parts[0]
                title = "|".join(parts[1:-4])
                x, y, w, h = parts[-4:]
                return WindowInfo(
                    id=0,  # macOS doesn't provide easy window IDs
                    title=title,
                    app=app,
                    bounds=(int(x), int(y), int(w), int(h))
                )
        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
            pass
        return None

    def list_windows_macos(self) -> List[WindowInfo]:
        """List all windows on macOS via one System Events AppleScript pass.

        Returns [] on any failure (missing osascript, denied Accessibility
        permission, parse errors).
        """
        try:
            # Enumerate every on-screen window of every non-background process
            script = '''
            set window_list to {}
            set output to ""
            try
                tell application "System Events"
                    set all_processes to every application process where background only is false
                    repeat with proc in all_processes
                        set app_name to name of proc
                        try
                            repeat with w in every window of proc
                                set window_title to name of w
                                if window_title is not "" then
                                    set window_pos to position of w
                                    set window_size to size of w
                                    set window_info to app_name & "|" & window_title & "|" & (item 1 of window_pos) & "|" & (item 2 of window_pos) & "|" & (item 1 of window_size) & "|" & (item 2 of window_size)
                                    set output to output & window_info & linefeed
                                end if
                            end repeat
                        on error
                            -- Ignore processes that might not have window properties
                        end try
                    end repeat
                end tell
            on error errMsg
                return "Error: " & errMsg
            end try
            return output
            '''
            result = subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                check=True
            )
            windows = []
            if result.stdout.strip():
                lines = result.stdout.strip().split('\n')
                for i, line in enumerate(lines):
                    if line.strip() and "|" in line:
                        parts = line.strip().split("|")
                        if len(parts) >= 6:
                            try:
                                # Titles may contain "|": the last four fields
                                # are the geometry, the middle is the title.
                                app = parts[0]
                                title = "|".join(parts[1:-4])
                                x, y, w, h = parts[-4:]
                                windows.append(WindowInfo(
                                    id=i,
                                    title=title.strip(),
                                    app=app.strip(),
                                    bounds=(int(x), int(y), int(w), int(h))
                                ))
                            except (ValueError, IndexError):
                                continue
            return windows
        except subprocess.CalledProcessError as e:
            print(f"⚠️ Window listing failed. Error: {e}")
            return []
        except Exception as e:
            print(f"⚠️ Window listing error: {e}")
            return []
class ImageProcessor:
    """Image enhancement and processing (all methods are static)."""

    @staticmethod
    def detect_content_regions(image: Image.Image) -> List[tuple[int, int, int, int]]:
        """Detect interesting content regions in image for auto-zoom.

        Returns up to three (x, y, w, h) boxes, largest first, each sized
        between 5% and 80% of the image with a sane aspect ratio.
        """
        # Convert PIL to OpenCV format (cv2/np are module-level imports)
        img_array = np.array(image)
        if len(img_array.shape) == 3:
            img_gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            img_gray = img_array
        # Detect edges to find content areas
        edges = cv2.Canny(img_gray, 50, 150)
        # Find contours
        contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        regions = []
        min_area = (image.width * image.height) * 0.05  # At least 5% of image
        max_area = (image.width * image.height) * 0.8   # At most 80% of image
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            area = w * h
            # Filter by size and aspect ratio
            if (min_area < area < max_area and
                    w > 100 and h > 100 and      # Minimum meaningful size
                    0.3 < w / h < 3.0):          # Reasonable aspect ratio
                regions.append((x, y, w, h))
        # Sort by area (largest first) and return top 3
        regions.sort(key=lambda r: r[2] * r[3], reverse=True)
        return regions[:3]

    @staticmethod
    def is_image_clear(image: Image.Image, threshold: float = 100.0) -> bool:
        """Check if image has sufficient detail/clarity.

        Uses the variance of the Laplacian as a blur measure; higher
        variance means more edges/detail.
        """
        # Convert to grayscale
        img_array = np.array(image.convert('L'))
        # Calculate Laplacian variance (measure of blur/clarity)
        laplacian = cv2.Laplacian(img_array, cv2.CV_64F)
        variance = laplacian.var()
        return variance > threshold

    @staticmethod
    def get_quality_scale(mode: Literal["overview", "readable", "detail"], image_width: int = 0) -> float:
        """Get scale factor for quality mode, adjusted for large images."""
        base_scales = {
            "overview": 0.4,
            "readable": 0.8,
            "detail": 1.0
        }
        scale = base_scales[mode]
        # For very wide images (ultra-wide monitors), scale down more aggressively
        if image_width > 2000:    # Ultra-wide territory
            scale *= 0.6          # Additional 40% reduction
        elif image_width > 1600:  # Large screens
            scale *= 0.75         # Additional 25% reduction
        return min(scale, 1.0)    # Cap at 1.0

    @staticmethod
    def enhance_for_text(image: Image.Image) -> Image.Image:
        """Enhance image for better text readability (sharpen + contrast)."""
        # Convert to numpy array for OpenCV processing
        img_array = np.array(image)
        # Apply sharpening kernel
        kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
        sharpened = cv2.filter2D(img_array, -1, kernel)
        # Convert back to PIL for further processing
        enhanced = Image.fromarray(sharpened)
        # Enhance contrast
        enhancer = ImageEnhance.Contrast(enhanced)
        enhanced = enhancer.enhance(1.2)
        # Slight sharpening with PIL
        enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3))
        return enhanced

    @staticmethod
    def process_image(
        image: Image.Image,
        quality_mode: Literal["overview", "readable", "detail"],
        enhance_text: bool,
        format: Literal["png", "jpeg"] = "png"
    ) -> bytes:
        """Process image with quality and enhancement settings.

        Returns the encoded PNG/JPEG bytes of the (optionally enhanced,
        possibly downscaled) image.
        """
        # Apply text enhancement if requested
        if enhance_text:
            image = ImageProcessor.enhance_for_text(image)
        # Scale image based on quality mode and image size
        scale = ImageProcessor.get_quality_scale(quality_mode, image.width)
        if scale < 1.0:
            new_size = (int(image.width * scale), int(image.height * scale))
            # Use LANCZOS for high-quality downscaling
            image = image.resize(new_size, Image.Resampling.LANCZOS)
        # Convert to bytes
        buffer = io.BytesIO()
        if format == "jpeg":
            # JPEG has no alpha channel: flatten transparent modes onto white
            if image.mode in ("RGBA", "LA", "P"):
                background = Image.new("RGB", image.size, (255, 255, 255))
                if image.mode in ("P", "LA"):
                    # Normalize to RGBA so the alpha band can be used as mask
                    image = image.convert("RGBA")
                background.paste(image, mask=image.split()[-1] if image.mode == "RGBA" else None)
                image = background
            image.save(buffer, format="JPEG", quality=85, optimize=True)
        else:
            image.save(buffer, format="PNG", optimize=True)
        return buffer.getvalue()
# Module-level singletons shared by every tool handler below
capture = ScreenshotCapture()
processor = ImageProcessor()
@mcp.tool
def check_permissions() -> dict:
    """Check what macOS permissions are available and what functionality works."""
    report = {
        "platform": platform.system(),
        "screen_recording": False,
        "accessibility": False,
        "working_features": [],
        "missing_features": [],
        "instructions": []
    }
    # Permission probing only makes sense on macOS
    if platform.system() != "Darwin":
        report["note"] = "Permission checks only apply to macOS"
        return report
    # Probe Screen Recording by attempting a tiny capture
    try:
        probe = capture.capture_screen(region=(0, 0, 100, 100))
        if probe and probe.width > 0:
            report["screen_recording"] = True
            report["working_features"] += [
                "screenshot_smart", "screenshot_active_window",
                "screenshot_full", "screenshot_region"
            ]
    except Exception:
        report["missing_features"] += [
            "All screenshot functionality"
        ]
        report["instructions"].append(
            "Enable Screen Recording: System Preferences > Security & Privacy > Privacy > Screen Recording"
        )
    # Probe Accessibility by attempting a window enumeration
    try:
        window_list = capture.list_windows_macos()
        if window_list:
            report["accessibility"] = True
            report["working_features"] += [
                "list_windows", "screenshot_window (with window ID)"
            ]
        else:
            # Empty result: distinguish "no windows" from a denied permission
            probe_script = 'tell application "System Events" to get name of first application process'
            osa = subprocess.run(["osascript", "-e", probe_script], capture_output=True, text=True)
            if "assistive access" in osa.stderr or "-25211" in osa.stderr:
                report["missing_features"] += [
                    "list_windows", "screenshot_window (selective)"
                ]
                report["instructions"].append(
                    "Enable Accessibility: System Preferences > Security & Privacy > Privacy > Accessibility"
                )
    except Exception as exc:
        if "assistive access" in str(exc) or "-25211" in str(exc):
            report["missing_features"] += [
                "list_windows", "screenshot_window (selective)"
            ]
            report["instructions"].append(
                "Enable Accessibility: System Preferences > Security & Privacy > Privacy > Accessibility"
            )
    # Summarize the probe results
    sr, acc = report["screen_recording"], report["accessibility"]
    if sr and not acc:
        report["summary"] = "Core screenshot functionality works. Window listing needs Accessibility permission."
    elif sr and acc:
        report["summary"] = "All functionality available! 🎉"
    elif not sr:
        report["summary"] = "Screen Recording permission required for basic functionality."
    else:
        report["summary"] = "Permission status unclear."
    return report
@mcp.tool
def screenshot_smart_enhanced(
    query: Optional[str] = None,
    auto_zoom: bool = True,
    quality_mode: Literal["overview", "readable", "detail"] = "readable",
    enhance_text: bool = True,
    format: Literal["png", "jpeg"] = "png"
) -> ToolResult:
    """Enhanced smart screenshot with natural language understanding and auto-zoom.
    Args:
        query: Natural language query like 'what am I watching on YouTube' or 'show me my code'
        auto_zoom: Automatically capture focused regions if initial screenshot is unclear
        quality_mode: Image quality mode
        enhance_text: Apply text enhancement
        format: Image format
    """
    try:
        # Find the best window using enhanced logic
        target_window = capture.find_priority_window(query)
        if not target_window:
            return ToolResult(
                content=[TextContent(type="text", text="No suitable window found for your query")]
            )
        # Bring the target window to the front so the region capture is clean
        try:
            def esc(s: str) -> str:
                # Escape backslashes/quotes so app names and titles cannot
                # break (or inject into) the AppleScript string literals.
                return s.replace("\\", "\\\\").replace('"', '\\"')
            activate_script = f'''
            tell application "System Events"
                tell application process "{esc(target_window.app)}"
                    set frontmost to true
                    tell window "{esc(target_window.title)}"
                        perform action "AXRaise"
                    end tell
                end tell
            end tell
            '''
            subprocess.run(
                ["osascript", "-e", activate_script],
                capture_output=True,
                text=True,
                check=True
            )
            import time
            time.sleep(0.3)  # Wait for window activation
        except Exception:
            pass  # Continue anyway
        # Capture the window
        x, y, w, h = target_window.bounds
        image = capture.capture_screen(region=(x, y, w, h))
        # When enabled and the full-window capture looks blurry, try zooming
        # into the most promising detected content region.
        zoom_info = ""
        if auto_zoom and not processor.is_image_clear(image):
            regions = processor.detect_content_regions(image)
            if regions:
                # Use the largest interesting region
                region_x, region_y, region_w, region_h = regions[0]
                # Region coords are window-relative; convert to screen coords
                abs_x = x + region_x
                abs_y = y + region_y
                zoom_image = capture.capture_screen(region=(abs_x, abs_y, region_w, region_h))
                # Only keep the zoomed capture if it is actually clearer
                if processor.is_image_clear(zoom_image):
                    image = zoom_image
                    zoom_info = f" (auto-zoomed to {region_w}x{region_h} region)"
        # Process the final image
        image_bytes = processor.process_image(image, quality_mode, enhance_text, format)
        image_b64 = base64.b64encode(image_bytes).decode()
        # Tailor the response message to the detected intent
        intent = capture._parse_user_intent(query) if query else None
        if intent == 'media_consumption':
            response_msg = f"Here's what you're watching/listening to: {target_window.app} - {target_window.title}"
        elif intent == 'development':
            response_msg = f"Here's what you're working on: {target_window.app} - {target_window.title}"
        elif intent == 'communication':
            response_msg = f"Here's your conversation: {target_window.app} - {target_window.title}"
        else:
            response_msg = f"Smart capture: {target_window.app} - {target_window.title}"
        response_msg += f" ({image.width}x{image.height}, {quality_mode} quality{zoom_info})"
        mime_type = f"image/{format}"
        return ToolResult(
            content=[
                ImageContent(type="image", data=image_b64, mimeType=mime_type),
                TextContent(type="text", text=response_msg)
            ]
        )
    except Exception as e:
        return ToolResult(
            content=[TextContent(type="text", text=f"Enhanced smart capture failed: {str(e)}")]
        )
@mcp.tool
def screenshot_smart(
    context: Optional[str] = None,
    quality_mode: Literal["overview", "readable", "detail"] = "readable",
    enhance_text: bool = True,
    format: Literal["png", "jpeg"] = "png"
) -> ToolResult:
    """Smart screenshot that finds the most relevant window automatically.
    Args:
        context: Optional hint about what to look for (e.g., 'youtube', 'browser', 'code')
        quality_mode: Image quality mode
        enhance_text: Apply text enhancement
        format: Image format
    """
    try:
        # Find the best window to capture
        target_window = capture.find_priority_window(context)
        if target_window:
            # Activate the priority window first for clean capture
            try:
                def esc(s: str) -> str:
                    # Escape backslashes/quotes so app names and titles cannot
                    # break (or inject into) the AppleScript string literals.
                    return s.replace("\\", "\\\\").replace('"', '\\"')
                activate_script = f'''
                tell application "System Events"
                    tell application process "{esc(target_window.app)}"
                        set frontmost to true
                        tell window "{esc(target_window.title)}"
                            perform action "AXRaise"
                        end tell
                    end tell
                end tell
                '''
                subprocess.run(
                    ["osascript", "-e", activate_script],
                    capture_output=True,
                    text=True,
                    check=True
                )
                # Small delay to let window come to front
                import time
                time.sleep(0.2)
            except Exception:
                # Continue anyway, might still work
                pass
            # Capture the priority window
            x, y, w, h = target_window.bounds
            image = capture.capture_screen(region=(x, y, w, h))
            image_bytes = processor.process_image(image, quality_mode, enhance_text, format)
            image_b64 = base64.b64encode(image_bytes).decode()
            mime_type = f"image/{format}"
            return ToolResult(
                content=[
                    ImageContent(type="image", data=image_b64, mimeType=mime_type),
                    TextContent(
                        type="text",
                        text=f"Smart capture (activated): {target_window.app} - {target_window.title} ({w}x{h}, {quality_mode} quality)"
                    )
                ]
            )
        # Fallback 1: whatever window currently has focus (macOS only)
        if capture.system == "Darwin":
            active_window = capture.get_active_window_macos()
            if active_window:
                x, y, w, h = active_window.bounds
                image = capture.capture_screen(region=(x, y, w, h))
                image_bytes = processor.process_image(image, quality_mode, enhance_text, format)
                image_b64 = base64.b64encode(image_bytes).decode()
                mime_type = f"image/{format}"
                return ToolResult(
                    content=[
                        ImageContent(type="image", data=image_b64, mimeType=mime_type),
                        TextContent(
                            type="text",
                            text=f"Active window fallback: {active_window.app} - {active_window.title} ({w}x{h}, {quality_mode} quality)"
                        )
                    ]
                )
        # Fallback 2: capture a square region around the screen center
        screen_image = capture.capture_screen()
        center_x = screen_image.width // 2
        center_y = screen_image.height // 2
        region_size = min(1200, screen_image.width // 2, screen_image.height // 2)
        x = center_x - region_size // 2
        y = center_y - region_size // 2
        image = capture.capture_screen(region=(x, y, region_size, region_size))
        image_bytes = processor.process_image(image, quality_mode, enhance_text, format)
        image_b64 = base64.b64encode(image_bytes).decode()
        mime_type = f"image/{format}"
        return ToolResult(
            content=[
                ImageContent(type="image", data=image_b64, mimeType=mime_type),
                TextContent(
                    type="text",
                    text=f"Center region capture: {region_size}x{region_size} from screen center ({quality_mode} quality)"
                )
            ]
        )
    except Exception as e:
        return ToolResult(
            content=[TextContent(type="text", text=f"Smart capture failed: {str(e)}")]
        )
@mcp.tool
def screenshot_full(
    quality_mode: Literal["overview", "readable", "detail"] = "overview",
    enhance_text: bool = True,
    format: Literal["png", "jpeg"] = "png"
) -> ToolResult:
    """Capture entire desktop/screen with quality and enhancement options."""
    try:
        # Grab the whole desktop, then post-process and base64-encode it
        shot = capture.capture_screen()
        payload = base64.b64encode(
            processor.process_image(shot, quality_mode, enhance_text, format)
        ).decode()
        caption = f"Full screen captured ({shot.width}x{shot.height}, {quality_mode} quality)"
        return ToolResult(
            content=[
                ImageContent(type="image", data=payload, mimeType=f"image/{format}"),
                TextContent(type="text", text=caption)
            ]
        )
    except Exception as e:
        return ToolResult(
            content=[TextContent(type="text", text=f"Screenshot failed: {str(e)}")]
        )
@mcp.tool
def screenshot_active_window(
    quality_mode: Literal["overview", "readable", "detail"] = "readable",
    enhance_text: bool = True,
    format: Literal["png", "jpeg"] = "png"
) -> ToolResult:
    """Capture currently focused/active window."""
    try:
        # Only the macOS path is implemented so far
        if capture.system != "Darwin":
            return ToolResult(
                content=[TextContent(type="text", text="Active window capture not yet implemented for this platform")]
            )
        info = capture.get_active_window_macos()
        if info is None:
            return ToolResult(
                content=[TextContent(type="text", text="Could not detect active window")]
            )
        # Capture just the window's bounding rectangle, then encode it
        x, y, w, h = info.bounds
        shot = capture.capture_screen(region=(x, y, w, h))
        payload = base64.b64encode(
            processor.process_image(shot, quality_mode, enhance_text, format)
        ).decode()
        return ToolResult(
            content=[
                ImageContent(type="image", data=payload, mimeType=f"image/{format}"),
                TextContent(
                    type="text",
                    text=f"Active window captured: {info.app} - {info.title} ({w}x{h}, {quality_mode} quality)"
                )
            ]
        )
    except Exception as e:
        return ToolResult(
            content=[TextContent(type="text", text=f"Active window capture failed: {str(e)}")]
        )
@mcp.tool
def screenshot_region(
    x: int,
    y: int,
    width: int,
    height: int,
    quality_mode: Literal["overview", "readable", "detail"] = "detail",
    enhance_text: bool = True,
    format: Literal["png", "jpeg"] = "png"
) -> ToolResult:
    """Capture specific rectangular area of screen."""
    try:
        # Reject degenerate rectangles up front
        if min(width, height) <= 0:
            return ToolResult(
                content=[TextContent(type="text", text="Width and height must be positive")]
            )
        # Capture the requested rectangle and encode it
        shot = capture.capture_screen(region=(x, y, width, height))
        payload = base64.b64encode(
            processor.process_image(shot, quality_mode, enhance_text, format)
        ).decode()
        return ToolResult(
            content=[
                ImageContent(type="image", data=payload, mimeType=f"image/{format}"),
                TextContent(
                    type="text",
                    text=f"Region captured: ({x},{y}) {width}x{height} ({quality_mode} quality)"
                )
            ]
        )
    except Exception as e:
        return ToolResult(
            content=[TextContent(type="text", text=f"Region capture failed: {str(e)}")]
        )
@mcp.tool
def list_windows() -> dict:
    """Get list of all open windows with IDs, titles, and bounds."""
    try:
        # Only the macOS enumeration path exists today
        if capture.system != "Darwin":
            return {
                "windows": [],
                "count": 0,
                "platform": capture.system,
                "error": "Window listing not yet implemented for this platform"
            }
        found = capture.list_windows_macos()
        # Serialize WindowInfo records into plain JSON-friendly dicts;
        # bounds becomes [x, y, width, height]
        entries = [
            {"id": w.id, "title": w.title, "app": w.app, "bounds": list(w.bounds)}
            for w in found
        ]
        return {"windows": entries, "count": len(entries), "platform": "macOS"}
    except Exception as e:
        return {
            "windows": [],
            "count": 0,
            "error": f"Failed to list windows: {str(e)}"
        }
@mcp.tool
def activate_window(window_id: int) -> dict:
    """Activate/focus a specific window by bringing it to front."""
    try:
        if capture.system == "Darwin":
            windows = capture.list_windows_macos()
            # Find window by ID (IDs are the enumeration order from list_windows)
            target_window = next((w for w in windows if w.id == window_id), None)
            if not target_window:
                return {"success": False, "error": f"Window with ID {window_id} not found"}

            def esc(s: str) -> str:
                # Escape backslashes/quotes so app names and titles cannot
                # break (or inject into) the AppleScript string literals.
                return s.replace("\\", "\\\\").replace('"', '\\"')
            # AppleScript to raise the window and front its process
            script = f'''
            tell application "System Events"
                tell application process "{esc(target_window.app)}"
                    set frontmost to true
                    tell window "{esc(target_window.title)}"
                        perform action "AXRaise"
                    end tell
                end tell
            end tell
            '''
            subprocess.run(
                ["osascript", "-e", script],
                capture_output=True,
                text=True,
                check=True
            )
            return {
                "success": True,
                "activated_window": {
                    "app": target_window.app,
                    "title": target_window.title,
                    "id": window_id
                }
            }
        else:
            return {"success": False, "error": "Window activation not implemented for this platform"}
    except Exception as e:
        return {"success": False, "error": f"Failed to activate window: {str(e)}"}
@mcp.tool
def screenshot_window(
    window_id: int,
    quality_mode: Literal["overview", "readable", "detail"] = "readable",
    enhance_text: bool = True,
    format: Literal["png", "jpeg"] = "png"
) -> ToolResult:
    """Capture specific window by ID from list_windows. Automatically activates the window first to ensure clean capture."""
    try:
        if capture.system != "Darwin":
            return ToolResult(
                content=[TextContent(type="text", text="Window capture not yet implemented for this platform")]
            )
        windows = capture.list_windows_macos()
        # Find window by ID (IDs are the enumeration order from list_windows)
        target_window = next((w for w in windows if w.id == window_id), None)
        if not target_window:
            return ToolResult(
                content=[TextContent(type="text", text=f"Window with ID {window_id} not found")]
            )
        # Activate window first to ensure clean capture
        try:
            def esc(s: str) -> str:
                # Escape backslashes/quotes so app names and titles cannot
                # break (or inject into) the AppleScript string literals.
                return s.replace("\\", "\\\\").replace('"', '\\"')
            activate_script = f'''
            tell application "System Events"
                tell application process "{esc(target_window.app)}"
                    set frontmost to true
                    tell window "{esc(target_window.title)}"
                        perform action "AXRaise"
                    end tell
                end tell
            end tell
            '''
            subprocess.run(
                ["osascript", "-e", activate_script],
                capture_output=True,
                text=True,
                check=True
            )
            # Small delay to let window come to front
            import time
            time.sleep(0.2)
        except Exception:
            # Continue anyway, might still work
            pass
        # Capture window region
        x, y, w, h = target_window.bounds
        image = capture.capture_screen(region=(x, y, w, h))
        # Process image
        image_bytes = processor.process_image(image, quality_mode, enhance_text, format)
        image_b64 = base64.b64encode(image_bytes).decode()
        mime_type = f"image/{format}"
        return ToolResult(
            content=[
                ImageContent(type="image", data=image_b64, mimeType=mime_type),
                TextContent(
                    type="text",
                    text=f"Window captured (activated): {target_window.app} - {target_window.title} ({w}x{h}, {quality_mode} quality)"
                )
            ]
        )
    except Exception as e:
        return ToolResult(
            content=[TextContent(type="text", text=f"Window capture failed: {str(e)}")]
        )
def main():
    """Run the MCP server: streamable HTTP when PORT is set, stdio otherwise."""
    bind_host = os.getenv("HOST", "127.0.0.1")
    bind_port = os.getenv("PORT")
    if bind_port:
        mcp.run(port=int(bind_port), host=bind_host, transport="streamable-http")
    else:
        mcp.run()


if __name__ == "__main__":
    main()