Auto-Snap MCP

MIT License

Overview InspectNew Endpoints Schema Related Servers Reviews Score

auto-snap-mcp

capture.py•72.6 kB

""" Screenshot capture functionality for Linux systems and Windows (via WSL2). Uses pyscreenshot for cross-platform screenshot capability. """ import os import subprocess import time import platform from typing import List, Dict, Optional, Tuple from pathlib import Path import pyscreenshot as ImageGrab from PIL import Image import logging from config import get_config, get_output_directory, generate_page_filename logger = logging.getLogger(__name__) def detect_environment() -> str: """ Detect the current runtime environment. Returns: 'wsl' if running in WSL (can access Windows) 'linux' if running in native Linux 'windows' if running in native Windows """ try: # Check if we're in WSL if os.path.exists('/proc/version'): with open('/proc/version', 'r') as f: version_info = f.read().lower() if 'microsoft' in version_info or 'wsl' in version_info: return 'wsl' # Check if we're on Windows if platform.system() == 'Windows': return 'windows' # Default to Linux return 'linux' except Exception as e: logger.warning(f"Could not detect environment: {e}") return 'linux' # Safe default class WindowsWindowManager: """Handles Windows application enumeration and screenshot capture from WSL2.""" def __init__(self): self.powershell_available = self._check_powershell() def _check_powershell(self) -> bool: """Check if PowerShell is available from WSL2.""" try: logger.info("Checking PowerShell availability...") result = subprocess.run( ['powershell.exe', '-Command', 'Write-Host "test"'], capture_output=True, text=True, check=True, timeout=10 # 10 second timeout ) logger.info("PowerShell is available") return True except subprocess.TimeoutExpired: logger.error("PowerShell check timed out after 10 seconds") return False except (subprocess.CalledProcessError, FileNotFoundError) as e: logger.error(f"PowerShell not available from WSL2: {e}") return False def list_windows(self) -> List[Dict[str, str]]: """ List all Windows applications with visible windows using PowerShell. Returns list of window info dictionaries. """ if not self.powershell_available: logger.error("PowerShell not available - cannot list Windows applications") return [] try: # Enhanced PowerShell script to get comprehensive window information ps_script = ''' Add-Type -TypeDefinition @" using System; using System.Runtime.InteropServices; using System.Text; public class Win32 { [DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool IsIconic(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool IsZoomed(IntPtr hWnd); [DllImport("user32.dll")] public static extern int GetWindowText(IntPtr hWnd, StringBuilder lpString, int nMaxCount); [DllImport("user32.dll")] public static extern int GetWindowTextLength(IntPtr hWnd); [DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint lpdwProcessId); } "@ $windows = @() # Get processes with capturable windows only Get-Process | Where-Object { $_.MainWindowHandle -ne 0 -and $_.ProcessName -notmatch "^(dwm|csrss|winlogon|wininit)$" } | ForEach-Object { $handle = [IntPtr]$_.MainWindowHandle $isVisible = [Win32]::IsWindowVisible($handle) $isMinimized = [Win32]::IsIconic($handle) $isMaximized = [Win32]::IsZoomed($handle) # Only include windows that are in capturable states # Skip hidden windows that can't be meaningfully captured if (-not ($isVisible -or $isMinimized)) { return # Skip this window } # Get window title using Windows API (more reliable than MainWindowTitle) $titleLength = [Win32]::GetWindowTextLength($handle) if ($titleLength -gt 0) { $title = New-Object System.Text.StringBuilder($titleLength + 1) [Win32]::GetWindowText($handle, $title, $title.Capacity) | Out-Null $windowTitle = $title.ToString() } else { $windowTitle = $_.MainWindowTitle } # Include window even if title is empty, but provide useful info if ([string]::IsNullOrEmpty($windowTitle)) { $windowTitle = "[$($_.ProcessName) - $($_.Id)]" } # Determine window state - only capturable states $windowState = if ($isMinimized) { "minimized" } elseif ($isMaximized) { "maximized" } else { "normal" } $windows += @{ id = $_.MainWindowHandle.ToString() title = $windowTitle process_name = $_.ProcessName process_id = $_.Id.ToString() window_handle = $_.MainWindowHandle.ToString() is_visible = $isVisible is_minimized = $isMinimized is_maximized = $isMaximized window_state = $windowState type = "windows" } } # Convert to JSON $windows | ConvertTo-Json -Compress ''' result = subprocess.run( ['powershell.exe', '-Command', ps_script], capture_output=True, text=True, check=True, timeout=30, # 30 second timeout for window enumeration encoding='utf-8', errors='ignore' # Ignore encoding errors to handle special characters ) import json if result.stdout.strip(): try: # Handle both single object and array responses data = json.loads(result.stdout.strip()) if isinstance(data, dict): data = [data] # Convert single result to list windows = [] for window_info in data: # Only include windows with valid window handles (> 0) window_handle = str(window_info.get('window_handle', '0')) if window_handle == '0': continue # Skip processes without actual windows # Use the enhanced window information from the new PowerShell script windows.append({ 'id': str(window_info.get('id', '')), 'title': window_info.get('title', ''), 'process_name': window_info.get('process_name', ''), 'process_id': str(window_info.get('process_id', '')), 'window_handle': window_handle, 'is_visible': window_info.get('is_visible', False), 'is_minimized': window_info.get('is_minimized', False), 'is_maximized': window_info.get('is_maximized', False), 'window_state': window_info.get('window_state', 'normal'), # Default to normal instead of unknown 'type': 'windows' }) logger.info(f"Found {len(windows)} capturable windows using enhanced detection") if windows: # Log stats about capturable window states for debugging normal_count = sum(1 for w in windows if w['window_state'] == 'normal') minimized_count = sum(1 for w in windows if w['window_state'] == 'minimized') maximized_count = sum(1 for w in windows if w['window_state'] == 'maximized') logger.info(f"Window states: {normal_count} normal, {minimized_count} minimized, {maximized_count} maximized") return windows except json.JSONDecodeError as e: logger.error(f"Failed to parse PowerShell JSON output: {e}") logger.error(f"PowerShell output was: {result.stdout[:500]}...") # Log first 500 chars for debugging return [] else: logger.warning("PowerShell returned empty output - no windows detected") return [] except subprocess.TimeoutExpired: logger.error("PowerShell window enumeration timed out after 30 seconds") return [] except subprocess.CalledProcessError as e: logger.error(f"Failed to list Windows applications: {e}") return [] def capture_window(self, window_id: str, output_path: Optional[str] = None) -> str: """ Capture screenshot of a Windows application window. """ if not self.powershell_available: raise Exception("PowerShell not available - cannot capture Windows applications") try: # Enhanced PowerShell script with PrintWindow support for minimized windows ps_script = f''' Add-Type -AssemblyName System.Windows.Forms Add-Type -AssemblyName System.Drawing $windowHandle = [IntPtr]{window_id} if ($windowHandle -eq 0) {{ Write-Error "Invalid window handle" exit 1 }} # Define comprehensive Windows API functions Add-Type @" using System; using System.Runtime.InteropServices; public class Win32 {{ [StructLayout(LayoutKind.Sequential)] public struct RECT {{ public int Left, Top, Right, Bottom; }} // Window management APIs [DllImport("user32.dll")] public static extern bool GetWindowRect(IntPtr hWnd, out RECT lpRect); [DllImport("user32.dll")] public static extern bool IsIconic(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow); [DllImport("user32.dll")] public static extern bool PrintWindow(IntPtr hWnd, IntPtr hdcBlt, uint nFlags); [DllImport("user32.dll")] public static extern bool SetLayeredWindowAttributes(IntPtr hWnd, uint crKey, byte bAlpha, uint dwFlags); [DllImport("user32.dll")] public static extern int GetWindowLong(IntPtr hWnd, int nIndex); [DllImport("user32.dll")] public static extern int SetWindowLong(IntPtr hWnd, int nIndex, int dwNewLong); [DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr hWnd); [DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow(); [DllImport("user32.dll")] public static extern bool EnumChildWindows(IntPtr hWndParent, EnumChildProc lpEnumFunc, IntPtr lParam); [DllImport("user32.dll")] public static extern int GetClassName(IntPtr hWnd, System.Text.StringBuilder lpClassName, int nMaxCount); [DllImport("user32.dll")] public static extern IntPtr FindWindowEx(IntPtr hWndParent, IntPtr hWndChildAfter, string lpszClass, string lpszWindow); [DllImport("user32.dll")] public static extern bool PostMessage(IntPtr hWnd, uint Msg, IntPtr wParam, IntPtr lParam); [DllImport("user32.dll")] public static extern IntPtr SendMessage(IntPtr hWnd, uint Msg, IntPtr wParam, IntPtr lParam); [DllImport("user32.dll")] public static extern bool BringWindowToTop(IntPtr hWnd); public delegate bool EnumChildProc(IntPtr hWnd, IntPtr lParam); // Constants public const int SW_HIDE = 0; public const int SW_RESTORE = 9; public const int SW_MINIMIZE = 6; public const int SW_SHOW = 5; public const int GWL_EXSTYLE = -20; public const int WS_EX_LAYERED = 0x80000; public const int LWA_ALPHA = 0x2; public const uint PW_CLIENTONLY = 0x1; public const uint PW_RENDERFULLCONTENT = 0x2; // Windows Messages public const uint WM_KEYDOWN = 0x0100; public const uint WM_KEYUP = 0x0101; public const uint WM_CHAR = 0x0102; public const uint WM_COMMAND = 0x0111; public const uint WM_VSCROLL = 0x0115; // Virtual Key Codes public const int VK_SPACE = 0x20; public const int VK_PRIOR = 0x21; // Page Up public const int VK_NEXT = 0x22; // Page Down public const int VK_DOWN = 0x28; // Down Arrow public const int VK_UP = 0x26; // Up Arrow public const int VK_LEFT = 0x25; // Left Arrow public const int VK_RIGHT = 0x27; // Right Arrow }} "@ # Check if window is minimized and implement smart capture logic $isMinimized = [Win32]::IsIconic($windowHandle) $wasMinimized = $false $originalExStyle = 0 try {{ # Get window rectangle $rect = New-Object Win32+RECT $success = [Win32]::GetWindowRect($windowHandle, [ref]$rect) if (-not $success) {{ Write-Error "Could not get window rectangle" exit 1 }} $width = $rect.Right - $rect.Left $height = $rect.Bottom - $rect.Top if ($width -le 0 -or $height -le 0) {{ Write-Error "Invalid window dimensions: $($width)x$($height)" exit 1 }} # Handle minimized windows with stealth restoration if ($isMinimized) {{ Write-Verbose "Window is minimized, using stealth restoration technique" $wasMinimized = $true # Make window transparent for stealth operation $originalExStyle = [Win32]::GetWindowLong($windowHandle, [Win32]::GWL_EXSTYLE) $newExStyle = $originalExStyle -bor [Win32]::WS_EX_LAYERED [Win32]::SetWindowLong($windowHandle, [Win32]::GWL_EXSTYLE, $newExStyle) | Out-Null [Win32]::SetLayeredWindowAttributes($windowHandle, 0, 1, [Win32]::LWA_ALPHA) | Out-Null # Restore window temporarily [Win32]::ShowWindow($windowHandle, [Win32]::SW_RESTORE) | Out-Null Start-Sleep -Milliseconds 100 # Brief pause for window to render # Update rectangle after restoration $success = [Win32]::GetWindowRect($windowHandle, [ref]$rect) if ($success) {{ $width = $rect.Right - $rect.Left $height = $rect.Bottom - $rect.Top }} }} # Create bitmap for capture $bitmap = New-Object System.Drawing.Bitmap($width, $height) $graphics = [System.Drawing.Graphics]::FromImage($bitmap) # Try PrintWindow first (better for minimized/hidden windows) $hdcBitmap = $graphics.GetHdc() $printSuccess = [Win32]::PrintWindow($windowHandle, $hdcBitmap, [Win32]::PW_RENDERFULLCONTENT) $graphics.ReleaseHdc($hdcBitmap) # Fallback to CopyFromScreen if PrintWindow fails if (-not $printSuccess) {{ Write-Verbose "PrintWindow failed, falling back to CopyFromScreen" if (-not $isMinimized) {{ $graphics.CopyFromScreen($rect.Left, $rect.Top, 0, 0, [System.Drawing.Size]::new($width, $height)) }} else {{ throw "Cannot capture minimized window with CopyFromScreen" }} }} $graphics.Dispose() # Save to temporary file $tempPath = [System.IO.Path]::GetTempFileName() + ".png" $bitmap.Save($tempPath, [System.Drawing.Imaging.ImageFormat]::Png) $bitmap.Dispose() Write-Output $tempPath }} finally {{ # Restore original window state if it was modified if ($wasMinimized) {{ # Restore original transparency [Win32]::SetWindowLong($windowHandle, [Win32]::GWL_EXSTYLE, $originalExStyle) | Out-Null # Re-minimize the window [Win32]::ShowWindow($windowHandle, [Win32]::SW_MINIMIZE) | Out-Null }} }} ''' result = subprocess.run( ['powershell.exe', '-Command', ps_script], capture_output=True, text=True, check=True, timeout=60, # 60 second timeout for screenshot capture encoding='utf-8', errors='ignore' # Ignore encoding errors to handle special characters ) temp_windows_path = result.stdout.strip() if not temp_windows_path: raise Exception("PowerShell did not return temp file path") # Convert Windows path to WSL path wsl_temp_path = self._windows_path_to_wsl(temp_windows_path) if output_path is None: timestamp = int(time.time()) output_path = f"windows_capture_{window_id}_{timestamp}.png" # Copy from Windows temp to desired location subprocess.run(['cp', wsl_temp_path, output_path], check=True) # Clean up Windows temp file subprocess.run( ['powershell.exe', '-Command', f'Remove-Item "{temp_windows_path}" -Force'], check=False # Don't fail if cleanup fails ) logger.info(f"Windows application captured: {output_path}") return output_path except subprocess.TimeoutExpired: logger.error(f"PowerShell window capture timed out after 60 seconds") raise Exception(f"Capture timeout for window {window_id}") except subprocess.CalledProcessError as e: logger.error(f"Failed to capture Windows application {window_id}: {e}") raise def _windows_path_to_wsl(self, windows_path: str) -> str: """Convert Windows path to WSL path.""" try: result = subprocess.run( ['wslpath', windows_path], capture_output=True, text=True, check=True, timeout=5 # 5 second timeout for path conversion ) return result.stdout.strip() except (subprocess.CalledProcessError, subprocess.TimeoutExpired): # Fallback: manual conversion for basic cases if windows_path.startswith('C:'): return windows_path.replace('C:', '/mnt/c').replace('\\', '/') return windows_path def capture_full_screen(self, output_path: Optional[str] = None) -> str: """ Capture full screen screenshot using PowerShell from WSL2. """ if not self.powershell_available: raise Exception("PowerShell not available - cannot capture full screen") logger.info("Capturing full screen using PowerShell...") try: # PowerShell script to capture full screen ps_script = ''' Add-Type -AssemblyName System.Windows.Forms Add-Type -AssemblyName System.Drawing # Get primary screen dimensions $primaryScreen = [System.Windows.Forms.Screen]::PrimaryScreen $bounds = $primaryScreen.Bounds $width = $bounds.Width $height = $bounds.Height Write-Verbose "Screen dimensions: ${width}x${height}" # Capture full screen $bitmap = New-Object System.Drawing.Bitmap($width, $height) $graphics = [System.Drawing.Graphics]::FromImage($bitmap) $graphics.CopyFromScreen(0, 0, 0, 0, [System.Drawing.Size]::new($width, $height)) $graphics.Dispose() # Save to temporary file in Windows temp directory $tempPath = [System.IO.Path]::GetTempFileName() + ".png" $bitmap.Save($tempPath, [System.Drawing.Imaging.ImageFormat]::Png) $bitmap.Dispose() Write-Output $tempPath ''' result = subprocess.run( ['powershell.exe', '-Command', ps_script], capture_output=True, text=True, check=True, timeout=30 # 30 second timeout for full screen capture ) temp_windows_path = result.stdout.strip() if not temp_windows_path: raise Exception("PowerShell did not return temp file path") # Convert Windows path to WSL path wsl_temp_path = self._windows_path_to_wsl(temp_windows_path) if output_path is None: timestamp = int(time.time()) output_path = f"fullscreen_{timestamp}.png" # Copy from Windows temp to desired location subprocess.run(['cp', wsl_temp_path, output_path], check=True) # Clean up Windows temp file subprocess.run( ['powershell.exe', '-Command', f'Remove-Item "{temp_windows_path}" -Force'], check=False # Don't fail if cleanup fails ) logger.info(f"Full screen captured: {output_path}") return output_path except subprocess.TimeoutExpired: logger.error("PowerShell full screen capture timed out after 30 seconds") raise Exception("Full screen capture timeout") except subprocess.CalledProcessError as e: logger.error(f"Failed to capture full screen: {e}") raise except Exception as e: logger.error(f"Full screen capture failed: {e}") raise def debug_window_detection(self) -> Dict[str, any]: """ Comprehensive debugging information for window detection. Returns detailed diagnostics about the PowerShell environment and window detection. """ debug_info = { 'powershell_available': self.powershell_available, 'detection_methods': [], 'raw_powershell_output': '', 'parsing_errors': [], 'process_info': {} } if not self.powershell_available: debug_info['error'] = 'PowerShell not available' return debug_info try: # Test basic PowerShell functionality basic_test = subprocess.run( ['powershell.exe', '-Command', 'Get-Process | Select-Object -First 5 Name, Id | ConvertTo-Json'], capture_output=True, text=True, check=True, timeout=10 ) debug_info['basic_powershell_test'] = 'SUCCESS' debug_info['sample_processes'] = basic_test.stdout.strip()[:200] + '...' except Exception as e: debug_info['basic_powershell_test'] = f'FAILED: {str(e)}' return debug_info # Test our enhanced window detection script with verbose output try: debug_script = ''' $VerbosePreference = "Continue" Write-Verbose "Starting window detection debug..." Add-Type -TypeDefinition @" using System; using System.Runtime.InteropServices; using System.Text; public class Win32 { [DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool IsIconic(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool IsZoomed(IntPtr hWnd); [DllImport("user32.dll")] public static extern int GetWindowText(IntPtr hWnd, StringBuilder lpString, int nMaxCount); [DllImport("user32.dll")] public static extern int GetWindowTextLength(IntPtr hWnd); } "@ $allProcesses = Get-Process | Measure-Object | Select-Object -ExpandProperty Count Write-Verbose "Total processes found: $allProcesses" $processesWithWindows = Get-Process | Where-Object { $_.MainWindowHandle -ne 0 } | Measure-Object | Select-Object -ExpandProperty Count Write-Verbose "Processes with windows: $processesWithWindows" $windows = @() Get-Process | Where-Object { $_.MainWindowHandle -ne 0 -and $_.ProcessName -notmatch "^(dwm|csrss|winlogon|wininit)$" } | ForEach-Object { Write-Verbose "Processing: $($_.ProcessName) (ID: $($_.Id))" $handle = [IntPtr]$_.MainWindowHandle $isVisible = [Win32]::IsWindowVisible($handle) $isMinimized = [Win32]::IsIconic($handle) $isMaximized = [Win32]::IsZoomed($handle) # Only include capturable windows (same logic as main script) if (-not ($isVisible -or $isMinimized)) { Write-Verbose "Skipping hidden window: $($_.ProcessName)" return } $windows += @{ process_name = $_.ProcessName process_id = $_.Id window_handle = $_.MainWindowHandle.ToString() main_window_title = $_.MainWindowTitle is_visible = $isVisible is_minimized = $isMinimized is_maximized = $isMaximized window_state = if ($isMinimized) { "minimized" } elseif ($isMaximized) { "maximized" } else { "normal" } } } Write-Host "PROCESS_COUNT:$allProcesses" Write-Host "WINDOWS_COUNT:$processesWithWindows" Write-Host "FILTERED_COUNT:$($windows.Count)" $windows | ConvertTo-Json -Depth 2 ''' result = subprocess.run( ['powershell.exe', '-Command', debug_script], capture_output=True, text=True, timeout=30 ) debug_info['enhanced_script_exit_code'] = result.returncode debug_info['raw_powershell_output'] = result.stdout debug_info['powershell_stderr'] = result.stderr # Parse the debug output lines = result.stdout.split('\n') for line in lines: if line.startswith('PROCESS_COUNT:'): debug_info['total_processes'] = int(line.split(':')[1]) elif line.startswith('WINDOWS_COUNT:'): debug_info['processes_with_windows'] = int(line.split(':')[1]) elif line.startswith('FILTERED_COUNT:'): debug_info['filtered_windows'] = int(line.split(':')[1]) except Exception as e: debug_info['enhanced_script_error'] = str(e) return debug_info def _find_pdf_viewer_window(self, parent_window_id: str) -> str: """ Find the child window that handles PDF viewer functionality within Adobe Reader. Args: parent_window_id: Parent window handle ID (Adobe Reader main window) Returns: Child window handle ID that should receive navigation keys, or parent if not found """ if not self.powershell_available: logger.error("PowerShell not available - cannot enumerate child windows") return parent_window_id try: # PowerShell script to find PDF viewer child window ps_script = f''' $parentHandle = [IntPtr]{parent_window_id} if ($parentHandle -eq 0) {{ Write-Output "{parent_window_id}" exit 0 }} # Define Windows API functions for child window enumeration Add-Type @" using System; using System.Runtime.InteropServices; using System.Text; public class Win32 {{ [DllImport("user32.dll")] public static extern bool EnumChildWindows(IntPtr hWndParent, EnumChildProc lpEnumFunc, IntPtr lParam); [DllImport("user32.dll")] public static extern int GetClassName(IntPtr hWnd, StringBuilder lpClassName, int nMaxCount); [DllImport("user32.dll")] public static extern IntPtr FindWindowEx(IntPtr hWndParent, IntPtr hWndChildAfter, string lpszClass, string lpszWindow); [DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr hWnd); public delegate bool EnumChildProc(IntPtr hWnd, IntPtr lParam); }} "@ $foundWindows = @() # Strategy 1: Look for common Adobe Reader PDF viewer window classes $knownPDFClasses = @( "AVPageView", # Adobe Reader page view "AVScrolledPageView", # Adobe Reader scrolled view "AcroRd32Class", # Adobe Reader document "AcrobatClass", # Adobe Acrobat document "AVL_AVView", # Adobe Viewer "AVPageViewWnd32", # Adobe page view window "AVThumbnailView" # Adobe thumbnail view ) foreach ($className in $knownPDFClasses) {{ $childHandle = [Win32]::FindWindowEx($parentHandle, [IntPtr]::Zero, $className, $null) if ($childHandle -ne [IntPtr]::Zero -and [Win32]::IsWindowVisible($childHandle)) {{ Write-Verbose "Found PDF viewer window with class: $className" $foundWindows += $childHandle.ToString() }} }} # Strategy 2: Enumerate all child windows and look for likely candidates $allChildWindows = @() # Define callback function for EnumChildWindows $callback = {{ param($hWnd, $lParam) $className = New-Object System.Text.StringBuilder(256) $result = [Win32]::GetClassName($hWnd, $className, $className.Capacity) if ($result -gt 0) {{ $class = $className.ToString() $isVisible = [Win32]::IsWindowVisible($hWnd) # Look for classes that might contain PDF content if ($isVisible -and ($class -match "(View|Page|Document|PDF|Acro|AVL)" -or $class.Length -gt 10)) {{ $allChildWindows += @{{ Handle = $hWnd.ToString() ClassName = $class IsVisible = $isVisible }} }} }} return $true # Continue enumeration }} # This won't work directly in PowerShell due to delegate limitations # But we'll try the FindWindowEx approach which is more reliable # Strategy 3: Try to find the most likely candidate window # Look for windows with specific patterns in class names $candidateHandle = [IntPtr]::Zero $childAfter = [IntPtr]::Zero do {{ $childAfter = [Win32]::FindWindowEx($parentHandle, $childAfter, $null, $null) if ($childAfter -ne [IntPtr]::Zero) {{ $className = New-Object System.Text.StringBuilder(256) $result = [Win32]::GetClassName($childAfter, $className, $className.Capacity) if ($result -gt 0) {{ $class = $className.ToString() $isVisible = [Win32]::IsWindowVisible($childAfter) # Prioritize windows with PDF-related class names if ($isVisible -and ($class -match "(AVPageView|AVScrolled|AcroRd|Acrobat|AVL_AVView)" -or ($class.Length -gt 8 -and $class -notmatch "(Button|Static|Edit|ComboBox|ListBox)"))) {{ $foundWindows += $childAfter.ToString() Write-Verbose "Found candidate window: $class ($($childAfter.ToString()))" }} }} }} }} while ($childAfter -ne [IntPtr]::Zero) # Return the best candidate or the parent if none found if ($foundWindows.Count -gt 0) {{ # Return the first likely PDF viewer window Write-Output $foundWindows[0] }} else {{ # Fallback to parent window Write-Output "{parent_window_id}" }} ''' result = subprocess.run( ['powershell.exe', '-Command', ps_script], capture_output=True, text=True, check=True, timeout=15, # 15 second timeout for child window enumeration encoding='utf-8', errors='ignore' ) child_window_id = result.stdout.strip() if child_window_id and child_window_id != parent_window_id: logger.info(f"Found PDF viewer child window: {child_window_id} (parent: {parent_window_id})") return child_window_id else: logger.info(f"No suitable PDF viewer child window found, using parent: {parent_window_id}") return parent_window_id except subprocess.TimeoutExpired: logger.error("PowerShell child window enumeration timed out after 15 seconds") return parent_window_id except subprocess.CalledProcessError as e: logger.error(f"Failed to enumerate child windows: {e}") return parent_window_id except Exception as e: logger.error(f"Unexpected error finding PDF viewer window: {e}") return parent_window_id def _send_key_to_window(self, window_id: str, key: str) -> bool: """ Send a key press to a specific window using PostMessage for direct window messaging. First tries to find PDF viewer child window, then sends key directly via Windows API. Args: window_id: Window handle ID key: Key to send (e.g., "{DOWN}", "{PGDN}", "{RIGHT}") Returns: True if key was sent successfully, False otherwise """ if not self.powershell_available: logger.error("PowerShell not available - cannot send keys") return False try: # First, find the PDF viewer child window that should receive the keys target_window_id = self._find_pdf_viewer_window(window_id) logger.debug(f"Target window for key sending: {target_window_id} (original: {window_id})") # PowerShell script using PostMessage for direct key sending ps_script = f''' $targetHandle = [IntPtr]{target_window_id} if ($targetHandle -eq 0) {{ Write-Error "Invalid target window handle" exit 1 }} # Define comprehensive Windows API functions Add-Type @" using System; using System.Runtime.InteropServices; public class Win32 {{ [DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool BringWindowToTop(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow); [DllImport("user32.dll")] public static extern bool PostMessage(IntPtr hWnd, uint Msg, IntPtr wParam, IntPtr lParam); [DllImport("user32.dll")] public static extern IntPtr SendMessage(IntPtr hWnd, uint Msg, IntPtr wParam, IntPtr lParam); [DllImport("user32.dll")] public static extern bool IsWindow(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr hWnd); // Windows Messages public const uint WM_KEYDOWN = 0x0100; public const uint WM_KEYUP = 0x0101; public const uint WM_CHAR = 0x0102; // Virtual Key Codes public const int VK_SPACE = 0x20; public const int VK_PRIOR = 0x21; // Page Up public const int VK_NEXT = 0x22; // Page Down public const int VK_DOWN = 0x28; // Down Arrow public const int VK_UP = 0x26; // Up Arrow public const int VK_LEFT = 0x25; // Left Arrow public const int VK_RIGHT = 0x27; // Right Arrow public const int SW_RESTORE = 9; public const int SW_SHOW = 5; }} "@ # Validate target window if (-not [Win32]::IsWindow($targetHandle)) {{ Write-Error "Target window handle is not valid" exit 1 }} # Map SendKeys format to virtual key codes $virtualKey = 0 switch ("{key}") {{ "{{DOWN}}" {{ $virtualKey = [Win32]::VK_DOWN }} "{{PGDN}}" {{ $virtualKey = [Win32]::VK_NEXT }} "{{RIGHT}}" {{ $virtualKey = [Win32]::VK_RIGHT }} "{{UP}}" {{ $virtualKey = [Win32]::VK_UP }} "{{LEFT}}" {{ $virtualKey = [Win32]::VK_LEFT }} " " {{ $virtualKey = [Win32]::VK_SPACE }} default {{ # Try to handle other keys - fallback to space $virtualKey = [Win32]::VK_NEXT # Default to Page Down for PDF navigation }} }} if ($virtualKey -eq 0) {{ Write-Error "Unknown key mapping for: {key}" exit 1 }} try {{ # Send key messages directly to target window (no window state changes needed) $keyDownResult = [Win32]::PostMessage($targetHandle, [Win32]::WM_KEYDOWN, [IntPtr]$virtualKey, [IntPtr]0) Start-Sleep -Milliseconds 50 $keyUpResult = [Win32]::PostMessage($targetHandle, [Win32]::WM_KEYUP, [IntPtr]$virtualKey, [IntPtr]0) if ($keyDownResult -and $keyUpResult) {{ Write-Output "SUCCESS: PostMessage sent VK=$virtualKey to window $($targetHandle.ToString())" }} else {{ Write-Warning "PostMessage may have failed: KeyDown=$keyDownResult KeyUp=$keyUpResult" # Fallback: try SendMessage instead of PostMessage [Win32]::SendMessage($targetHandle, [Win32]::WM_KEYDOWN, [IntPtr]$virtualKey, [IntPtr]0) | Out-Null Start-Sleep -Milliseconds 50 [Win32]::SendMessage($targetHandle, [Win32]::WM_KEYUP, [IntPtr]$virtualKey, [IntPtr]0) | Out-Null Write-Output "FALLBACK: SendMessage used as fallback" }} }} catch {{ Write-Error "Failed to send key message: $($_.Exception.Message)" exit 1 }} ''' result = subprocess.run( ['powershell.exe', '-Command', ps_script], capture_output=True, text=True, check=True, timeout=15, # 15 second timeout for enhanced key sending encoding='utf-8', errors='ignore' ) if "SUCCESS" in result.stdout or "FALLBACK" in result.stdout: logger.debug(f"Successfully sent key '{key}' to target window {target_window_id}") return True else: logger.error(f"Failed to send key '{key}' to target window {target_window_id}") logger.error(f"PowerShell output: {result.stdout}") return False except subprocess.TimeoutExpired: logger.error(f"PowerShell enhanced key sending timed out after 15 seconds") return False except subprocess.CalledProcessError as e: logger.error(f"Failed to send key '{key}' using PostMessage: {e}") return False except Exception as e: logger.error(f"Unexpected error in enhanced key sending: {e}") return False def capture_multiple_pages(self, window_id: str, page_count: int, output_dir: Optional[str] = None, navigation_key: str = "{DOWN}", delay_seconds: float = 1.5) -> List[str]: """ Capture multiple pages from a document window with automatic navigation. Args: window_id: Window ID containing the document page_count: Number of pages to capture output_dir: Directory to save captured pages navigation_key: Key to send for navigation (e.g., "{DOWN}", "{PGDN}", "{RIGHT}") delay_seconds: Delay between navigation and capture Returns: List of captured file paths """ try: # Get configured output directory (with backward compatibility) config = get_config() if output_dir is None: if config.should_use_legacy_mode(): output_dir = "captures" # Legacy default else: actual_output_dir = get_output_directory() else: actual_output_dir = get_output_directory(output_dir) # Create output directory actual_output_dir.mkdir(parents=True, exist_ok=True) output_dir_str = str(actual_output_dir) # Convert navigation key names to SendKeys format key_mapping = { "Down": "{DOWN}", "Page_Down": "{PGDN}", "Right": "{RIGHT}", "Up": "{UP}", "Left": "{LEFT}", "space": " ", "Enter": "{ENTER}", "Tab": "{TAB}" } # Use mapping if available, otherwise use key as-is (for direct SendKeys format) sendkeys_format = key_mapping.get(navigation_key, navigation_key) # Detect and preserve original window state logger.info(f"Starting multi-page capture: {page_count} pages from window {window_id}") logger.info(f"Navigation key: {navigation_key} -> {sendkeys_format}, Delay: {delay_seconds}s") original_window_state = self._detect_and_prepare_window_state(window_id) logger.info(f"Original window state: {original_window_state}") captured_files = [] for page_num in range(1, page_count + 1): logger.info(f"Capturing page {page_num}/{page_count}") # Capture current page filename = generate_page_filename(page_num) output_path = actual_output_dir / filename try: captured_path = self.capture_window(window_id, str(output_path)) captured_files.append(captured_path) logger.info(f"Captured page {page_num}: {captured_path}") except Exception as e: logger.error(f"Failed to capture page {page_num}: {e}") # Continue with next page even if one fails continue # Navigate to next page (except for last page) if page_num < page_count: logger.debug(f"Navigating to next page using key: {sendkeys_format}") key_sent = self._send_key_to_window(window_id, sendkeys_format) if not key_sent: logger.warning(f"Failed to send navigation key for page {page_num}") # Continue anyway - user might manually navigate # Wait for page to load/render time.sleep(delay_seconds) logger.info(f"Multi-page capture completed: {len(captured_files)} pages captured to {actual_output_dir}") # Restore original window state if original_window_state: self._restore_window_state(window_id, original_window_state) logger.info(f"Restored window to original state: {original_window_state}") return captured_files except Exception as e: # Restore original window state even on error if 'original_window_state' in locals() and original_window_state: try: self._restore_window_state(window_id, original_window_state) logger.info(f"Restored window state after error: {original_window_state}") except Exception as restore_error: logger.warning(f"Failed to restore window state after error: {restore_error}") logger.error(f"Failed to capture multiple pages: {e}") raise def _detect_and_prepare_window_state(self, window_id: str) -> dict: """ Detect current window state and prepare for capture session. Only modifies window state if necessary for capture operations. Args: window_id: Window handle ID Returns: Dictionary with original window state information """ if not self.powershell_available: logger.warning("PowerShell not available - cannot detect window state") return {} try: ps_script = f''' $windowHandle = [IntPtr]{window_id} if ($windowHandle -eq 0) {{ Write-Output "invalid_handle" exit 0 }} # Define Windows API functions for state detection Add-Type @" using System; using System.Runtime.InteropServices; public class Win32 {{ [DllImport("user32.dll")] public static extern bool IsWindow(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool IsIconic(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool IsZoomed(IntPtr hWnd); [DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow(); [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow); [DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr hWnd); [DllImport("user32.dll")] public static extern bool BringWindowToTop(IntPtr hWnd); public const int SW_RESTORE = 9; public const int SW_SHOW = 5; public const int SW_MINIMIZE = 6; public const int SW_MAXIMIZE = 3; }} "@ if (-not [Win32]::IsWindow($windowHandle)) {{ Write-Output "invalid_window" exit 0 }} # Detect current state $isVisible = [Win32]::IsWindowVisible($windowHandle) $isMinimized = [Win32]::IsIconic($windowHandle) $isMaximized = [Win32]::IsZoomed($windowHandle) $isForeground = ([Win32]::GetForegroundWindow() -eq $windowHandle) # Determine state name $stateName = if ($isMinimized) {{ "minimized" }} elseif ($isMaximized) {{ "maximized" }} elseif ($isVisible) {{ "normal" }} else {{ "hidden" }} # Determine if we need to prepare the window for capture $needsPreparation = $false if ($isMinimized -or -not $isVisible) {{ Write-Verbose "Window needs preparation: restoring from minimized/hidden state" $needsPreparation = $true # Only restore if minimized or hidden [Win32]::ShowWindow($windowHandle, [Win32]::SW_RESTORE) | Out-Null Start-Sleep -Milliseconds 200 }} if (-not $isForeground) {{ Write-Verbose "Window needs focus preparation" $needsPreparation = $true # Only set focus if not already foreground [Win32]::BringWindowToTop($windowHandle) | Out-Null [Win32]::SetForegroundWindow($windowHandle) | Out-Null Start-Sleep -Milliseconds 100 }} # Output state information Write-Output "$stateName|$isVisible|$isMinimized|$isMaximized|$isForeground|$needsPreparation" ''' result = subprocess.run( ['powershell.exe', '-Command', ps_script], capture_output=True, text=True, check=True, timeout=10, encoding='utf-8', errors='ignore' ) output = result.stdout.strip() if output in ["invalid_handle", "invalid_window"]: logger.warning(f"Invalid window for state detection: {output}") return {} # Parse state information parts = output.split('|') if len(parts) >= 6: state_info = { 'state_name': parts[0], 'was_visible': parts[1] == 'True', 'was_minimized': parts[2] == 'True', 'was_maximized': parts[3] == 'True', 'was_foreground': parts[4] == 'True', 'was_prepared': parts[5] == 'True' } logger.debug(f"Detected window state: {state_info}") return state_info else: logger.warning(f"Could not parse window state: {output}") return {} except subprocess.TimeoutExpired: logger.error("Window state detection timed out") return {} except subprocess.CalledProcessError as e: logger.error(f"Failed to detect window state: {e}") return {} except Exception as e: logger.error(f"Unexpected error in window state detection: {e}") return {} def _restore_window_state(self, window_id: str, original_state: dict) -> bool: """ Restore window to its original state after capture session. Args: window_id: Window handle ID original_state: State information from _detect_and_prepare_window_state Returns: True if restoration was successful, False otherwise """ if not self.powershell_available or not original_state: return False try: # Only restore if we made changes during preparation if not original_state.get('was_prepared', False): logger.debug("No window state changes were made, skipping restoration") return True ps_script = f''' $windowHandle = [IntPtr]{window_id} if ($windowHandle -eq 0) {{ exit 1 }} # Define Windows API functions Add-Type @" using System; using System.Runtime.InteropServices; public class Win32 {{ [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow); public const int SW_RESTORE = 9; public const int SW_MINIMIZE = 6; public const int SW_MAXIMIZE = 3; public const int SW_HIDE = 0; }} "@ $wasMinimized = [bool]::Parse("{str(original_state.get('was_minimized', False)).lower()}") $wasMaximized = [bool]::Parse("{str(original_state.get('was_maximized', False)).lower()}") $wasVisible = [bool]::Parse("{str(original_state.get('was_visible', True)).lower()}") # Restore original window state if ($wasMinimized) {{ Write-Verbose "Restoring to minimized state" [Win32]::ShowWindow($windowHandle, [Win32]::SW_MINIMIZE) | Out-Null }} elseif ($wasMaximized) {{ Write-Verbose "Restoring to maximized state" [Win32]::ShowWindow($windowHandle, [Win32]::SW_MAXIMIZE) | Out-Null }} elseif (-not $wasVisible) {{ Write-Verbose "Restoring to hidden state" [Win32]::ShowWindow($windowHandle, [Win32]::SW_HIDE) | Out-Null }} else {{ Write-Verbose "Restoring to normal state" [Win32]::ShowWindow($windowHandle, [Win32]::SW_RESTORE) | Out-Null }} Write-Output "SUCCESS" ''' result = subprocess.run( ['powershell.exe', '-Command', ps_script], capture_output=True, text=True, check=True, timeout=5, encoding='utf-8', errors='ignore' ) if "SUCCESS" in result.stdout: logger.debug("Successfully restored window state") return True else: logger.warning("Failed to restore window state") return False except subprocess.TimeoutExpired: logger.error("Window state restoration timed out") return False except subprocess.CalledProcessError as e: logger.error(f"Failed to restore window state: {e}") return False except Exception as e: logger.error(f"Unexpected error in window state restoration: {e}") return False class CrossPlatformWindowManager: """ Unified interface that automatically selects the appropriate window manager based on the runtime environment. """ def __init__(self): self.environment = detect_environment() self.manager = self._create_manager() def _create_manager(self): """Create appropriate window manager based on environment.""" if self.environment == 'wsl': # In WSL, try Windows manager first, fallback to Linux windows_manager = WindowsWindowManager() if windows_manager.powershell_available: logger.info("Using Windows application manager (PowerShell from WSL2)") return windows_manager else: logger.info("PowerShell unavailable, falling back to Linux X11 manager") return WindowCapture() elif self.environment == 'windows': logger.info("Using Windows application manager") return WindowsWindowManager() else: # linux logger.info("Using Linux X11 window manager") return WindowCapture() def list_windows(self) -> List[Dict[str, str]]: """List all available windows using the appropriate manager.""" try: windows = self.manager.list_windows() # Add environment info to each window for window in windows: window['environment'] = self.environment if 'type' not in window: window['type'] = 'x11' if self.environment == 'linux' else self.environment return windows except Exception as e: logger.error(f"Failed to list windows: {e}") return [] def capture_window(self, window_id: str, output_path: Optional[str] = None) -> str: """Capture screenshot of a specific window.""" return self.manager.capture_window(window_id, output_path) def capture_full_screen(self, output_path: Optional[str] = None) -> str: """Capture full screen screenshot.""" if hasattr(self.manager, 'capture_full_screen'): return self.manager.capture_full_screen(output_path) else: # Fallback using pyscreenshot try: screenshot = ImageGrab.grab() if output_path is None: timestamp = int(time.time()) output_path = f"screenshot_{timestamp}.png" screenshot.save(output_path) logger.info(f"Full screen captured: {output_path}") return output_path except Exception as e: logger.error(f"Failed to capture full screen: {e}") raise def get_environment_info(self) -> Dict[str, str]: """Get information about the current environment and capabilities.""" info = { 'environment': self.environment, 'manager_type': type(self.manager).__name__ } if isinstance(self.manager, WindowsWindowManager): info['powershell_available'] = self.manager.powershell_available elif isinstance(self.manager, WindowCapture): missing_deps = check_dependencies() info['missing_dependencies'] = missing_deps info['x11_available'] = len(missing_deps) == 0 return info class WindowCapture: """Handles window enumeration and screenshot capture on Linux.""" def __init__(self): self.display = os.environ.get('DISPLAY', ':0') def list_windows(self) -> List[Dict[str, str]]: """ List all available windows using wmctrl command. Returns list of window info dictionaries. """ try: # Use wmctrl to list windows result = subprocess.run( ['wmctrl', '-l'], capture_output=True, text=True, check=True, timeout=10 # 10 second timeout for window listing ) windows = [] for line in result.stdout.strip().split('\n'): if line.strip(): parts = line.split(None, 3) if len(parts) >= 4: window_id = parts[0] desktop = parts[1] machine = parts[2] title = parts[3] windows.append({ 'id': window_id, 'title': title, 'desktop': desktop, 'machine': machine }) return windows except subprocess.CalledProcessError as e: logger.error(f"Failed to list windows with wmctrl: {e}") return [] except FileNotFoundError: logger.error("wmctrl not found. Please install: sudo apt-get install wmctrl") return [] def get_window_geometry(self, window_id: str) -> Optional[Tuple[int, int, int, int]]: """ Get window geometry (x, y, width, height) for a window ID. """ try: result = subprocess.run( ['wmctrl', '-G'], capture_output=True, text=True, check=True, timeout=10 # 10 second timeout for window geometry ) for line in result.stdout.strip().split('\n'): if line.startswith(window_id): parts = line.split() if len(parts) >= 7: x = int(parts[2]) y = int(parts[3]) width = int(parts[4]) height = int(parts[5]) return (x, y, width, height) return None except (subprocess.CalledProcessError, ValueError) as e: logger.error(f"Failed to get window geometry: {e}") return None def focus_window(self, window_id: str) -> bool: """ Focus a window by its ID. """ try: subprocess.run( ['wmctrl', '-i', '-a', window_id], check=True, capture_output=True, timeout=5 # 5 second timeout for window focus ) time.sleep(0.5) # Give window time to focus return True except subprocess.CalledProcessError as e: logger.error(f"Failed to focus window {window_id}: {e}") return False def capture_full_screen(self, output_path: Optional[str] = None) -> str: """ Capture full screen screenshot. """ try: screenshot = ImageGrab.grab() if output_path is None: timestamp = int(time.time()) output_path = f"screenshot_{timestamp}.png" screenshot.save(output_path) logger.info(f"Full screen captured: {output_path}") return output_path except Exception as e: logger.error(f"Failed to capture full screen: {e}") raise def capture_window(self, window_id: str, output_path: Optional[str] = None) -> str: """ Capture screenshot of a specific window. """ try: # First focus the window if not self.focus_window(window_id): logger.warning(f"Could not focus window {window_id}, trying capture anyway") # Get window geometry geometry = self.get_window_geometry(window_id) if geometry: x, y, width, height = geometry # Capture the specific region screenshot = ImageGrab.grab(bbox=(x, y, x + width, y + height)) else: logger.warning(f"Could not get geometry for window {window_id}, capturing full screen") screenshot = ImageGrab.grab() if output_path is None: timestamp = int(time.time()) output_path = f"window_{window_id}_{timestamp}.png" screenshot.save(output_path) logger.info(f"Window captured: {output_path}") return output_path except Exception as e: logger.error(f"Failed to capture window {window_id}: {e}") raise def capture_multiple_pages(self, window_id: str, page_count: int, output_dir: Optional[str] = None, navigation_key: str = "Page_Down", delay_seconds: float = 1.0) -> List[str]: """ Capture multiple pages from a document window. Args: window_id: Window to capture page_count: Number of pages to capture output_dir: Directory to save captures navigation_key: Key to press for page navigation (Page_Down, Right, space) delay_seconds: Delay between page navigation and capture Returns: List of captured file paths """ try: # Get configured output directory (with backward compatibility) config = get_config() if output_dir is None: if config.should_use_legacy_mode(): output_dir = "captures" # Legacy default else: actual_output_dir = get_output_directory() else: actual_output_dir = get_output_directory(output_dir) # Create output directory actual_output_dir.mkdir(parents=True, exist_ok=True) # Focus the window if not self.focus_window(window_id): raise Exception(f"Could not focus window {window_id}") captured_files = [] for page_num in range(1, page_count + 1): # Capture current page filename = generate_page_filename(page_num) output_path = actual_output_dir / filename self.capture_window(window_id, str(output_path)) captured_files.append(str(output_path)) # Navigate to next page (except for last page) if page_num < page_count: self._send_key_to_window(window_id, navigation_key) time.sleep(delay_seconds) logger.info(f"Captured {len(captured_files)} pages to {actual_output_dir}") return captured_files except Exception as e: logger.error(f"Failed to capture multiple pages: {e}") raise def _send_key_to_window(self, window_id: str, key: str) -> bool: """ Send a key press to a specific window using xdotool. """ try: subprocess.run( ['xdotool', 'key', '--window', window_id, key], check=True, capture_output=True, timeout=5 # 5 second timeout for key send ) return True except subprocess.CalledProcessError as e: logger.error(f"Failed to send key {key} to window {window_id}: {e}") return False except FileNotFoundError: logger.error("xdotool not found. Please install: sudo apt-get install xdotool") return False def check_dependencies() -> List[str]: """ Check if required system dependencies are installed. Returns list of missing dependencies. """ missing = [] # Check for wmctrl try: subprocess.run(['wmctrl', '--version'], capture_output=True, check=True, timeout=5) except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired): missing.append('wmctrl') # Check for xdotool try: subprocess.run(['xdotool', '--version'], capture_output=True, check=True, timeout=5) except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired): missing.append('xdotool') return missing

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/PovedaAqui/auto-snap-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server