import Algorithms
import AppKit
import AXorcist
import Commander
import CoreGraphics
import Foundation
import PeekabooCore
import PeekabooFoundation
import ScreenCaptureKit
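/// Bridges capture calls onto the main actor from nonisolated contexts. Each helper
/// wraps the service call in an unstructured `Task { @MainActor in ... }` and awaits
/// `.value`, which hops isolation without requiring the caller itself to be
/// `@MainActor`. The general shape of the pattern, as an illustrative sketch
/// (names are placeholders):
///
///     func onMainActor<T: Sendable>(
///         _ body: @escaping @MainActor () async throws -> T
///     ) async throws -> T {
///         try await Task { @MainActor in try await body() }.value
///     }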
private enum ScreenCaptureBridge {
static func captureFrontmost(services: any PeekabooServiceProviding) async throws -> CaptureResult {
try await Task { @MainActor in
try await services.screenCapture.captureFrontmost()
}.value
}
static func captureWindow(
services: any PeekabooServiceProviding,
appIdentifier: String,
windowIndex: Int?
) async throws -> CaptureResult {
try await Task { @MainActor in
try await services.screenCapture.captureWindow(appIdentifier: appIdentifier, windowIndex: windowIndex)
}.value
}
static func captureWindowById(
services: any PeekabooServiceProviding,
windowId: Int
) async throws -> CaptureResult {
try await Task { @MainActor in
try await services.screenCapture.captureWindow(windowID: CGWindowID(windowId))
}.value
}
static func captureArea(services: any PeekabooServiceProviding, rect: CGRect) async throws -> CaptureResult {
try await Task { @MainActor in
try await services.screenCapture.captureArea(rect)
}.value
}
static func captureScreen(
services: any PeekabooServiceProviding,
displayIndex: Int?
) async throws -> CaptureResult {
try await Task { @MainActor in
try await services.screenCapture.captureScreen(displayIndex: displayIndex)
}.value
}
}
/// Capture a screenshot and build an interactive UI map
@available(macOS 14.0, *)
struct SeeCommand: ApplicationResolvable, ErrorHandlingCommand, RuntimeOptionsConfigurable {
@Option(help: "Application name to capture, or special values: 'menubar', 'frontmost'")
var app: String?
@Option(name: .long, help: "Target application by process ID")
var pid: Int32?
@Option(help: "Specific window title to capture")
var windowTitle: String?
@Option(
name: .long,
help: "Target window by CoreGraphics window id (window_id from `peekaboo window list --json`)"
)
var windowId: Int?
@Option(help: "Capture mode (screen, window, frontmost)")
var mode: PeekabooCore.CaptureMode?
@Option(
names: [.automatic, .customLong("save"), .customLong("output"), .customShort("o", allowingJoined: false)],
help: "Output path for screenshot (aliases: --save, --output, -o)"
)
var path: String?
@Option(
name: .long,
help: "Specific screen index to capture (0-based). If not specified, captures all screens when in screen mode"
)
var screenIndex: Int?
@Flag(help: "Generate annotated screenshot with interaction markers")
var annotate = false
@Option(help: "Analyze captured content with AI")
var analyze: String?
@Option(
name: .long,
help: """
Overall timeout in seconds (default: 20, or 60 when --analyze is set).
Increase this if element detection regularly times out for large/complex windows.
"""
)
var timeoutSeconds: Int?
@Option(
name: .long,
help: """
Capture engine: auto|modern|sckit|classic|cg (default: auto).
modern/sckit force ScreenCaptureKit; classic/cg force CGWindowList;
auto tries SC then falls back when allowed.
"""
)
var captureEngine: String?
@Flag(help: "Skip web-content focus fallback when no text fields are detected")
var noWebFocus = false
@RuntimeStorage private var runtime: CommandRuntime?
var runtimeOptions = CommandRuntimeOptions()
private var resolvedRuntime: CommandRuntime {
guard let runtime else {
preconditionFailure("CommandRuntime must be configured before accessing runtime resources")
}
return runtime
}
var jsonOutput: Bool { self.runtime?.configuration.jsonOutput ?? self.runtimeOptions.jsonOutput }
var verbose: Bool { self.runtime?.configuration.verbose ?? self.runtimeOptions.verbose }
private var logger: Logger { self.resolvedRuntime.logger }
private var services: any PeekabooServiceProviding { self.resolvedRuntime.services }
var outputLogger: Logger { self.logger }
@MainActor
mutating func run(using runtime: CommandRuntime) async throws {
self.runtime = runtime
let startTime = Date()
let logger = self.logger
let overallTimeout = TimeInterval(self.timeoutSeconds ?? ((self.analyze == nil) ? 20 : 60))
logger.operationStart("see_command", metadata: [
"app": self.app ?? "none",
"mode": self.mode?.rawValue ?? "auto",
"annotate": self.annotate,
"hasAnalyzePrompt": self.analyze != nil,
])
let commandCopy = self
do {
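            // Race the real work against a watchdog task: whichever finishes first
            // satisfies `group.next()`, and the loser is then cancelled. If the
            // sleeper wins, it throws `detectionTimedOut`, which propagates out.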
try await withThrowingTaskGroup(of: Void.self) { group in
group.addTask {
try await commandCopy.runImpl(startTime: startTime, logger: logger)
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(overallTimeout * 1_000_000_000))
throw CaptureError.detectionTimedOut(overallTimeout)
}
do {
_ = try await group.next()
group.cancelAll()
} catch {
group.cancelAll()
throw error
}
}
} catch {
logger.operationComplete(
"see_command",
success: false,
metadata: [
"error": error.localizedDescription,
]
)
throw error
}
}
private func runImpl(startTime: Date, logger: Logger) async throws {
do {
// Check permissions
logger.verbose("Checking screen recording permissions", category: "Permissions")
try await requireScreenRecordingPermission(services: self.services)
logger.verbose("Screen recording permission granted", category: "Permissions")
// Perform capture and element detection
logger.verbose("Starting capture and detection phase", category: "Capture")
let captureResult = try await performCaptureWithDetection()
logger.verbose("Capture completed successfully", category: "Capture", metadata: [
"snapshotId": captureResult.snapshotId,
"elementCount": captureResult.elements.all.count,
"screenshotSize": self.getFileSize(captureResult.screenshotPath) ?? 0,
])
// Generate annotated screenshot if requested
var annotatedPath: String?
if self.annotate {
logger.operationStart("generate_annotations")
annotatedPath = try await self.generateAnnotatedScreenshot(
snapshotId: captureResult.snapshotId,
originalPath: captureResult.screenshotPath
)
if let annotatedPath,
annotatedPath != captureResult.screenshotPath {
try await self.services.snapshots.storeAnnotatedScreenshot(
snapshotId: captureResult.snapshotId,
annotatedScreenshotPath: annotatedPath
)
}
logger.operationComplete("generate_annotations", metadata: [
"annotatedPath": annotatedPath ?? "none",
])
}
// Perform AI analysis if requested
var analysisResult: SeeAnalysisData?
if let prompt = analyze {
// Pre-analysis diagnostics
            let fileSize = self.getFileSize(captureResult.screenshotPath) ?? 0
logger.verbose(
"Starting AI analysis",
category: "AI",
metadata: [
"imagePath": captureResult.screenshotPath,
"imageSizeBytes": fileSize,
"promptLength": prompt.count
]
)
logger.operationStart("ai_analysis", metadata: ["promptPreview": String(prompt.prefix(80))])
logger.startTimer("ai_generate")
analysisResult = try await self.performAnalysisDetailed(
imagePath: captureResult.screenshotPath,
prompt: prompt
)
logger.stopTimer("ai_generate")
logger.operationComplete(
"ai_analysis",
success: analysisResult != nil,
metadata: [
"provider": analysisResult?.provider ?? "unknown",
"model": analysisResult?.model ?? "unknown"
]
)
}
// Output results
let executionTime = Date().timeIntervalSince(startTime)
logger.operationComplete("see_command", metadata: [
"executionTimeMs": Int(executionTime * 1000),
"success": true,
])
let context = SeeCommandRenderContext(
snapshotId: captureResult.snapshotId,
screenshotPath: captureResult.screenshotPath,
annotatedPath: annotatedPath,
metadata: captureResult.metadata,
elements: captureResult.elements,
analysis: analysisResult,
executionTime: executionTime
)
await self.renderResults(context: context)
} catch {
logger.operationComplete("see_command", success: false, metadata: [
"error": error.localizedDescription,
])
self.handleError(error) // Use protocol's error handling
throw ExitCode.failure
}
}
private func getFileSize(_ path: String) -> Int? {
try? FileManager.default.attributesOfItem(atPath: path)[.size] as? Int
}
private func renderResults(context: SeeCommandRenderContext) async {
if self.jsonOutput {
await self.outputJSONResults(context: context)
} else {
await self.outputTextResults(context: context)
}
}
private func performCaptureWithDetection() async throws -> CaptureAndDetectionResult {
// Handle special app cases
let captureResult: CaptureResult
if let appName = self.app?.lowercased() {
switch appName {
case "menubar":
self.logger.verbose("Capturing menu bar area", category: "Capture")
captureResult = try await self.captureMenuBar()
case "frontmost":
self.logger.verbose("Capturing frontmost window (via --app frontmost)", category: "Capture")
captureResult = try await ScreenCaptureBridge.captureFrontmost(services: self.services)
default:
// Use normal capture logic
captureResult = try await self.performStandardCapture()
}
} else {
// Use normal capture logic
captureResult = try await self.performStandardCapture()
}
// Save screenshot
self.logger.startTimer("file_write")
let outputPath = try saveScreenshot(captureResult.imageData)
self.logger.stopTimer("file_write")
// Create window context from capture metadata
let windowContext = WindowContext(
applicationName: captureResult.metadata.applicationInfo?.name,
applicationBundleId: captureResult.metadata.applicationInfo?.bundleIdentifier,
applicationProcessId: captureResult.metadata.applicationInfo?.processIdentifier,
windowTitle: captureResult.metadata.windowInfo?.title,
windowID: captureResult.metadata.windowInfo?.windowID,
windowBounds: captureResult.metadata.windowInfo?.bounds,
            shouldFocusWebContent: !self.noWebFocus
)
// Detect UI elements with window context
self.logger.operationStart("element_detection")
        let detectionTimeout: TimeInterval = 20.0
        let detectionResult: ElementDetectionResult
        do {
            detectionResult = try await Self.withWallClockTimeout(seconds: detectionTimeout) {
try await AutomationServiceBridge.detectElements(
automation: self.services.automation,
imageData: captureResult.imageData,
snapshotId: nil,
windowContext: windowContext
)
}
        } catch is TimeoutError {
            throw CaptureError.detectionTimedOut(detectionTimeout)
}
self.logger.operationComplete("element_detection")
// Update the result with the correct screenshot path
let resultWithPath = ElementDetectionResult(
snapshotId: detectionResult.snapshotId,
screenshotPath: outputPath,
elements: detectionResult.elements,
metadata: detectionResult.metadata
)
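        // Persist the screenshot alongside the snapshot so follow-up commands can
        // resolve the stable element IDs collected here against the same capture.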
try await self.services.snapshots.storeScreenshot(
snapshotId: detectionResult.snapshotId,
screenshotPath: outputPath,
applicationBundleId: captureResult.metadata.applicationInfo?.bundleIdentifier,
applicationProcessId: captureResult.metadata.applicationInfo.map { Int32($0.processIdentifier) },
applicationName: windowContext.applicationName,
windowTitle: windowContext.windowTitle,
windowBounds: windowContext.windowBounds
)
// Store the result in snapshot
try await self.services.snapshots.storeDetectionResult(
snapshotId: detectionResult.snapshotId,
result: resultWithPath
)
return CaptureAndDetectionResult(
snapshotId: detectionResult.snapshotId,
screenshotPath: outputPath,
elements: detectionResult.elements,
metadata: detectionResult.metadata
)
}
private func performStandardCapture() async throws -> CaptureResult {
let effectiveMode = self.determineMode()
self.logger.verbose(
"Determined capture mode",
category: "Capture",
metadata: ["mode": effectiveMode.rawValue]
)
self.logger.operationStart("capture_phase", metadata: ["mode": effectiveMode.rawValue])
switch effectiveMode {
        case .screen, .multi:
            // Screen capture with multi-display support. Commander currently treats
            // multi captures as multi-display screen grabs, so both modes share a path.
            let result = try await self.performScreenCapture()
            self.logger.operationComplete("capture_phase", metadata: ["mode": effectiveMode.rawValue])
            return result
case .window:
if let windowId = self.windowId {
self.logger.verbose("Initiating window capture (by id)", category: "Capture", metadata: [
"windowId": windowId,
])
self.logger.startTimer("window_capture")
let result = try await ScreenCaptureBridge.captureWindowById(
services: self.services,
windowId: windowId
)
self.logger.stopTimer("window_capture")
self.logger.operationComplete("capture_phase", metadata: ["mode": effectiveMode.rawValue])
return result
} else if self.app != nil || self.pid != nil {
let appIdentifier = try self.resolveApplicationIdentifier()
self.logger.verbose("Initiating window capture", category: "Capture", metadata: [
"app": appIdentifier,
"windowTitle": self.windowTitle ?? "any",
])
let windowIndex = try await self.resolveSeeWindowIndex(
appIdentifier: appIdentifier,
titleFragment: self.windowTitle
)
self.logger.startTimer("window_capture")
let result = try await ScreenCaptureBridge.captureWindow(
services: self.services,
appIdentifier: appIdentifier,
windowIndex: windowIndex
)
self.logger.stopTimer("window_capture")
self.logger.operationComplete("capture_phase", metadata: ["mode": effectiveMode.rawValue])
return result
} else {
throw ValidationError("Provide --window-id, or --app/--pid for window mode")
}
case .frontmost:
self.logger.verbose("Capturing frontmost window")
let result = try await ScreenCaptureBridge.captureFrontmost(services: self.services)
self.logger.operationComplete("capture_phase", metadata: ["mode": effectiveMode.rawValue])
return result
case .area:
throw ValidationError("Area capture mode is not supported for 'see' yet. Use --mode screen or window")
}
}
private func captureMenuBar() async throws -> CaptureResult {
// Get the main screen bounds
guard let mainScreen = NSScreen.main else {
throw PeekabooError.captureFailed("No main screen found")
}
// Menu bar is at the top of the screen
let menuBarHeight: CGFloat = 24.0 // Standard macOS menu bar height
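        // Note: 24pt matches the traditional menu bar height; laptops with a camera
        // notch use a taller bar (roughly 37pt), so this capture may clip there.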
let menuBarRect = CGRect(
x: mainScreen.frame.origin.x,
y: mainScreen.frame.origin.y + mainScreen.frame.height - menuBarHeight,
width: mainScreen.frame.width,
height: menuBarHeight
)
// Capture the menu bar area
return try await ScreenCaptureBridge.captureArea(services: self.services, rect: menuBarRect)
}
private func saveScreenshot(_ imageData: Data) throws -> String {
let outputPath: String
if let providedPath = path {
outputPath = NSString(string: providedPath).expandingTildeInPath
} else {
let timestamp = Date().timeIntervalSince1970
let filename = "peekaboo_see_\(Int(timestamp)).png"
let defaultPath = ConfigurationManager.shared.getDefaultSavePath(cliValue: nil)
outputPath = (defaultPath as NSString).appendingPathComponent(filename)
}
// Create directory if needed
let directory = (outputPath as NSString).deletingLastPathComponent
try FileManager.default.createDirectory(
atPath: directory,
withIntermediateDirectories: true
)
// Save the image
try imageData.write(to: URL(fileURLWithPath: outputPath))
self.logger.verbose("Saved screenshot to: \(outputPath)")
return outputPath
}
private func resolveSeeWindowIndex(appIdentifier: String, titleFragment: String?) async throws -> Int? {
// IMPORTANT: ScreenCaptureService's modern path interprets `windowIndex` as an index into the
// ScreenCaptureKit window list (SCShareableContent.windows filtered by PID), not the
// Accessibility/WindowManagementService ordering. Resolve indices against SC first to avoid
// capturing the wrong window when apps have hidden/auxiliary windows (e.g. Playground).
//
// When no title is provided, prefer `nil` so the capture service can auto-pick a renderable window.
guard let fragment = titleFragment, !fragment.isEmpty else {
return nil
}
let appInfo = try await self.services.applications.findApplication(identifier: appIdentifier)
let content = try await AXTimeoutHelper.withTimeout(seconds: 5.0) {
try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: false)
}
let appWindows = content.windows.filter { window in
window.owningApplication?.processID == appInfo.processIdentifier
}
guard !appWindows.isEmpty else {
throw CaptureError.windowNotFound
}
// Prefer matching via CGWindowList title -> windowID, then map to SCWindow.windowID.
if let targetWindowID = self.resolveCGWindowID(
forPID: appInfo.processIdentifier,
titleFragment: fragment
) {
if let index = appWindows.firstIndex(where: { Int($0.windowID) == Int(targetWindowID) }) {
return index
}
}
// Fallback: some windows may not expose a CG title; try SCWindow.title directly.
if let index = appWindows.firstIndex(where: { window in
(window.title ?? "").localizedCaseInsensitiveContains(fragment)
}) {
return index
}
throw CaptureError.windowNotFound
}
private func resolveCGWindowID(forPID pid: Int32, titleFragment: String) -> CGWindowID? {
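        // kCGWindowName is generally only populated for callers holding the Screen
        // Recording permission (checked earlier in this command), which is what makes
        // title matching against CGWindowList reliable here.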
let windowList = CGWindowListCopyWindowInfo(
[.optionAll, .excludeDesktopElements],
kCGNullWindowID
) as? [[String: Any]] ?? []
for info in windowList {
guard let ownerPID = info[kCGWindowOwnerPID as String] as? Int32, ownerPID == pid else { continue }
let title = info[kCGWindowName as String] as? String ?? ""
guard title.localizedCaseInsensitiveContains(titleFragment) else { continue }
if let windowID = info[kCGWindowNumber as String] as? CGWindowID {
return windowID
}
}
return nil
}
// swiftlint:disable function_body_length
private func generateAnnotatedScreenshot(
snapshotId: String,
originalPath: String
) async throws -> String {
// Get detection result from snapshot
guard let detectionResult = try await self.services.snapshots.getDetectionResult(snapshotId: snapshotId)
else {
self.logger.info("No detection result found for snapshot")
return originalPath
}
// Create annotated image
let annotatedPath = (originalPath as NSString).deletingPathExtension + "_annotated.png"
// Load original image
guard let nsImage = NSImage(contentsOfFile: originalPath) else {
throw CaptureError.fileIOError("Failed to load image from \(originalPath)")
}
// Get image size
let imageSize = nsImage.size
// Create bitmap context
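        // Note: NSImage.size is measured in points, so this bitmap is created at 1x;
        // a Retina (2x) capture is drawn downscaled. Annotation geometry still lines
        // up because element bounds and the image size share the same point space.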
guard let bitmapRep = NSBitmapImageRep(
bitmapDataPlanes: nil,
pixelsWide: Int(imageSize.width),
pixelsHigh: Int(imageSize.height),
bitsPerSample: 8,
samplesPerPixel: 4,
hasAlpha: true,
isPlanar: false,
colorSpaceName: .calibratedRGB,
bytesPerRow: 0,
bitsPerPixel: 0
)
else {
throw CaptureError.captureFailure("Failed to create bitmap representation")
}
        // Draw into context. Create the context before saving graphics state so the
        // failure path doesn't leave an unbalanced saveGraphicsState() behind.
        guard let context = NSGraphicsContext(bitmapImageRep: bitmapRep) else {
            self.logger.error("Failed to create graphics context")
            throw CaptureError.captureFailure("Failed to create graphics context")
        }
        NSGraphicsContext.saveGraphicsState()
        NSGraphicsContext.current = context
self.logger.verbose("Graphics context created successfully")
// Draw original image
nsImage.draw(in: NSRect(origin: .zero, size: imageSize))
self.logger.verbose("Original image drawn")
// Configure text attributes - smaller font for less occlusion
let fontSize: CGFloat = 8
let textAttributes: [NSAttributedString.Key: Any] = [
.font: NSFont.systemFont(ofSize: fontSize, weight: .semibold),
.foregroundColor: NSColor.white,
]
// Role-based colors from spec
let roleColors: [ElementType: NSColor] = [
.button: NSColor(red: 0, green: 0.48, blue: 1.0, alpha: 1.0), // #007AFF
.textField: NSColor(red: 0.204, green: 0.78, blue: 0.349, alpha: 1.0), // #34C759
.link: NSColor(red: 0, green: 0.48, blue: 1.0, alpha: 1.0), // #007AFF
.checkbox: NSColor(red: 0.557, green: 0.557, blue: 0.576, alpha: 1.0), // #8E8E93
.slider: NSColor(red: 0.557, green: 0.557, blue: 0.576, alpha: 1.0), // #8E8E93
.menu: NSColor(red: 0, green: 0.48, blue: 1.0, alpha: 1.0), // #007AFF
]
// Draw UI elements
let enabledElements = detectionResult.elements.all.filter(\.isEnabled)
        if enabledElements.isEmpty {
            self.logger.info("No enabled elements to annotate. Total elements: \(detectionResult.elements.all.count)")
            print("\(AgentDisplayTokens.Status.warning) No interactive UI elements found to annotate")
            // Balance saveGraphicsState() above; returning without this would leak
            // the current graphics context.
            NSGraphicsContext.restoreGraphicsState()
            return originalPath // Return the original image when there is nothing to annotate
        }
self.logger.info(
"Annotating \(enabledElements.count) enabled elements out of \(detectionResult.elements.all.count) total"
)
self.logger.verbose("Image size: \(imageSize)")
// Calculate window origin from element bounds if we have elements
var windowOrigin = CGPoint.zero
if !detectionResult.elements.all.isEmpty {
// Find the leftmost and topmost element to estimate window origin
let minX = detectionResult.elements.all.map(\.bounds.minX).min() ?? 0
let minY = detectionResult.elements.all.map(\.bounds.minY).min() ?? 0
windowOrigin = CGPoint(x: minX, y: minY)
self.logger.verbose("Estimated window origin from elements: \(windowOrigin)")
}
// Convert all element bounds to window-relative coordinates and flip Y
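        // Worked example: in an 800pt-tall image, an element whose window-relative
        // top-left is (40, 100) with height 30 lands at AppKit y = 800 - 100 - 30 = 670,
        // because AppKit's origin is bottom-left while element bounds use top-left.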
var elementRects: [(element: DetectedElement, rect: NSRect)] = []
for element in enabledElements {
let elementFrame = CGRect(
x: element.bounds.origin.x - windowOrigin.x,
y: element.bounds.origin.y - windowOrigin.y,
width: element.bounds.width,
height: element.bounds.height
)
let rect = NSRect(
x: elementFrame.origin.x,
y: imageSize.height - elementFrame.origin.y - elementFrame.height, // Flip Y coordinate
width: elementFrame.width,
height: elementFrame.height
)
elementRects.append((element: element, rect: rect))
}
// Create smart label placer for intelligent label positioning
let labelPlacer = SmartLabelPlacer(
image: nsImage,
fontSize: fontSize,
debugMode: self.verbose,
logger: self.logger
)
// Draw elements and calculate label positions
var labelPositions: [(rect: NSRect, connection: NSPoint?, element: DetectedElement)] = []
for (element, rect) in elementRects {
let drawingDetails = [
"Drawing element: \(element.id)",
"type: \(element.type)",
"original bounds: \(element.bounds)",
"window rect: \(rect)"
].joined(separator: ", ")
self.logger.verbose(drawingDetails)
// Get color for element type
let color = roleColors[element.type] ?? NSColor(red: 0.557, green: 0.557, blue: 0.576, alpha: 1.0)
// Draw bounding box
color.withAlphaComponent(0.5).setFill()
rect.fill()
color.setStroke()
let path = NSBezierPath(rect: rect)
path.lineWidth = 2
path.stroke()
// Calculate label size
let idString = NSAttributedString(string: element.id, attributes: textAttributes)
let textSize = idString.size()
let labelPadding: CGFloat = 4
let labelSize = NSSize(width: textSize.width + labelPadding * 2, height: textSize.height + labelPadding)
// Use smart label placer to find best position
if let placement = labelPlacer.findBestLabelPosition(
for: element,
elementRect: rect,
labelSize: labelSize,
existingLabels: labelPositions.map { ($0.rect, $0.element) },
allElements: elementRects
) {
labelPositions.append((
rect: placement.labelRect,
connection: placement.connectionPoint,
element: element
))
}
}
// Draw all labels and connection lines
for (labelRect, connectionPoint, element) in labelPositions {
// Draw connection line if label is outside - make it more subtle
if let connection = connectionPoint {
NSColor.black.withAlphaComponent(0.3).setStroke()
let linePath = NSBezierPath()
linePath.lineWidth = 0.5
// Draw line from connection point to nearest edge of label
linePath.move(to: connection)
// Find the closest point on label rectangle to the connection point
let closestX = max(labelRect.minX, min(connection.x, labelRect.maxX))
let closestY = max(labelRect.minY, min(connection.y, labelRect.maxY))
linePath.line(to: NSPoint(x: closestX, y: closestY))
linePath.stroke()
}
// Draw label background - more transparent to show content beneath
NSColor.black.withAlphaComponent(0.7).setFill()
NSBezierPath(roundedRect: labelRect, xRadius: 1, yRadius: 1).fill()
// Draw label border (same color as element) - thinner for less occlusion
let color = roleColors[element.type] ?? NSColor(red: 0.557, green: 0.557, blue: 0.576, alpha: 1.0)
color.withAlphaComponent(0.8).setStroke()
let borderPath = NSBezierPath(roundedRect: labelRect, xRadius: 1, yRadius: 1)
borderPath.lineWidth = 0.5
borderPath.stroke()
// Draw label text
let idString = NSAttributedString(string: element.id, attributes: textAttributes)
idString.draw(at: NSPoint(x: labelRect.origin.x + 4, y: labelRect.origin.y + 2))
}
NSGraphicsContext.restoreGraphicsState()
// Save annotated image
guard let pngData = bitmapRep.representation(using: .png, properties: [:]) else {
throw CaptureError.captureFailure("Failed to create PNG data")
}
try pngData.write(to: URL(fileURLWithPath: annotatedPath))
self.logger.verbose("Created annotated screenshot: \(annotatedPath)")
// Log annotation info only in non-JSON mode
if !self.jsonOutput {
let interactableElements = detectionResult.elements.all.filter(\.isEnabled)
print("📝 Created annotated screenshot with \(interactableElements.count) interactive elements")
}
return annotatedPath
}
// swiftlint:enable function_body_length
}
// MARK: - Supporting Types
private struct CaptureAndDetectionResult {
let snapshotId: String
let screenshotPath: String
let elements: DetectedElements
let metadata: DetectionMetadata
}
private struct SnapshotPaths {
let raw: String
let annotated: String
let map: String
}
private struct SeeCommandRenderContext {
let snapshotId: String
let screenshotPath: String
let annotatedPath: String?
let metadata: DetectionMetadata
let elements: DetectedElements
let analysis: SeeAnalysisData?
let executionTime: TimeInterval
}
// MARK: - JSON Output Structure (matching original)
struct UIElementSummary: Codable {
let id: String
let role: String
let title: String?
let label: String?
let description: String?
let role_description: String?
let help: String?
let identifier: String?
let is_actionable: Bool
let keyboard_shortcut: String?
}
struct SeeAnalysisData: Codable {
let provider: String
let model: String
let text: String
}
struct SeeResult: Codable {
let snapshot_id: String
let screenshot_raw: String
let screenshot_annotated: String
let ui_map: String
let application_name: String?
let window_title: String?
let is_dialog: Bool
let element_count: Int
let interactable_count: Int
let capture_mode: String
let analysis: SeeAnalysisData?
let execution_time: TimeInterval
let ui_elements: [UIElementSummary]
let menu_bar: MenuBarSummary?
var success: Bool = true
}
struct MenuBarSummary: Codable {
let menus: [MenuSummary]
struct MenuSummary: Codable {
let title: String
let item_count: Int
let enabled: Bool
let items: [MenuItemSummary]
}
struct MenuItemSummary: Codable {
let title: String
let enabled: Bool
let keyboard_shortcut: String?
}
}
// MARK: - Format Helpers Extension
extension SeeCommand {
/// Fetches the menu bar summary only when verbose output is requested, with a short timeout.
private func fetchMenuBarSummaryIfEnabled() async -> MenuBarSummary? {
guard self.verbose else { return nil }
do {
return try await Self.withWallClockTimeout(seconds: 2.5) {
try Task.checkCancellation()
return await self.getMenuBarItemsSummary()
}
} catch {
self.logger.debug(
"Skipping menu bar summary",
category: "Menu",
metadata: ["reason": error.localizedDescription]
)
return nil
}
}
/// Timeout helper that is not MainActor-bound, so it can still fire if the main actor is blocked.
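    ///
    /// Illustrative call site (the closure body is a placeholder):
    ///
    ///     let summary = try await Self.withWallClockTimeout(seconds: 2.5) {
    ///         try await someSlowLookup()
    ///     }
    ///
    /// Throws `CaptureError.detectionTimedOut(seconds)` when the deadline fires first.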
static func withWallClockTimeout<T: Sendable>(
seconds: TimeInterval,
operation: @escaping @Sendable () async throws -> T
) async throws -> T {
try await withThrowingTaskGroup(of: T.self) { group in
group.addTask {
try await operation()
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(seconds * 1_000_000_000))
throw CaptureError.detectionTimedOut(seconds)
}
guard let result = try await group.next() else {
throw CaptureError.detectionTimedOut(seconds)
}
group.cancelAll()
return result
}
}
private func performAnalysisDetailed(imagePath: String, prompt: String) async throws -> SeeAnalysisData {
// Use PeekabooCore AI service which is configured via ConfigurationManager/Tachikoma
let ai = PeekabooAIService()
let res = try await ai.analyzeImageFileDetailed(at: imagePath, question: prompt, model: nil)
return SeeAnalysisData(provider: res.provider, model: res.model, text: res.text)
}
    private func buildMenuSummaryIfNeeded() async -> MenuBarSummary? {
        // Placeholder: always returns nil for now, so the menu bar section in
        // outputTextResults is skipped until summary generation is implemented.
        nil
    }
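    /// Mode resolution: an explicit --mode wins; any window-targeting option
    /// (--app, --pid, --window-title, --window-id) implies window mode; otherwise
    /// the frontmost window is captured. For example, `peekaboo see --app Safari`
    /// resolves to `.window` even without `--mode window`.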
private func determineMode() -> PeekabooCore.CaptureMode {
if let mode = self.mode {
mode
} else if self.app != nil || self.pid != nil || self.windowTitle != nil || self.windowId != nil {
// If app or window title is specified, default to window mode
.window
} else {
// Otherwise default to frontmost
.frontmost
}
}
// MARK: - Output Methods
private func outputJSONResults(context: SeeCommandRenderContext) async {
let uiElements: [UIElementSummary] = context.elements.all.map { element in
UIElementSummary(
id: element.id,
role: element.type.rawValue,
title: element.attributes["title"],
label: element.label,
description: element.attributes["description"],
role_description: element.attributes["roleDescription"],
help: element.attributes["help"],
identifier: element.attributes["identifier"],
is_actionable: element.isEnabled,
keyboard_shortcut: element.attributes["keyboardShortcut"]
)
}
let snapshotPaths = self.snapshotPaths(for: context)
// Menu bar enumeration can be slow or hang on some setups. Only attempt it in verbose
// mode and bound it with a short timeout so JSON output is responsive by default.
let menuSummary = await self.fetchMenuBarSummaryIfEnabled()
let output = SeeResult(
snapshot_id: context.snapshotId,
screenshot_raw: snapshotPaths.raw,
screenshot_annotated: snapshotPaths.annotated,
ui_map: snapshotPaths.map,
application_name: context.metadata.windowContext?.applicationName,
window_title: context.metadata.windowContext?.windowTitle,
is_dialog: context.metadata.isDialog,
element_count: context.metadata.elementCount,
interactable_count: context.elements.all.count { $0.isEnabled },
capture_mode: self.determineMode().rawValue,
analysis: context.analysis,
execution_time: context.executionTime,
ui_elements: uiElements,
menu_bar: menuSummary
)
outputSuccessCodable(data: output, logger: self.outputLogger)
}
private func getMenuBarItemsSummary() async -> MenuBarSummary {
// Get menu bar items from service
var menuExtras: [MenuExtraInfo] = []
do {
menuExtras = try await self.services.menu.listMenuExtras()
} catch {
            // On failure, fall back to an empty list rather than surfacing the error.
menuExtras = []
}
// Group items into menu categories
// For now, we'll create a simplified view showing each menu bar item as a "menu"
let menus = menuExtras.map { extra in
MenuBarSummary.MenuSummary(
title: extra.title,
item_count: 1, // Each menu bar item is treated as a single menu
enabled: true,
items: [
MenuBarSummary.MenuItemSummary(
title: extra.title,
enabled: true,
keyboard_shortcut: nil
)
]
)
}
return MenuBarSummary(menus: menus)
}
private func outputTextResults(context: SeeCommandRenderContext) async {
print("🖼️ Screenshot saved to: \(context.screenshotPath)")
if let annotatedPath = context.annotatedPath {
print("📝 Annotated screenshot: \(annotatedPath)")
}
if let appName = context.metadata.windowContext?.applicationName {
print("📱 Application: \(appName)")
}
if let windowTitle = context.metadata.windowContext?.windowTitle {
let windowType = context.metadata.isDialog ? "Dialog" : "Window"
let icon = context.metadata.isDialog ? "🗨️" : "[win]"
print("\(icon) \(windowType): \(windowTitle)")
}
print("🧊 Detection method: \(context.metadata.method)")
print("📊 UI elements detected: \(context.metadata.elementCount)")
print("⚙️ Interactable elements: \(context.elements.all.count { $0.isEnabled })")
let formattedDuration = String(format: "%.2f", context.executionTime)
print("⏱️ Execution time: \(formattedDuration)s")
if let analysis = context.analysis {
print("\n🤖 AI Analysis\n\(analysis.text)")
}
if context.metadata.elementCount > 0 {
print("\n🔍 Element Summary")
for element in context.elements.all.prefix(10) {
let summaryLabel = element.label ?? element.attributes["title"] ?? element.value ?? "Untitled"
print("• \(element.id) (\(element.type.rawValue)) - \(summaryLabel)")
}
if context.metadata.elementCount > 10 {
print(" ...and \(context.metadata.elementCount - 10) more elements")
}
}
if self.annotate {
print("\n📝 Annotated screenshot created")
}
if let menuSummary = await self.buildMenuSummaryIfNeeded() {
print("\n🧭 Menu Bar Summary")
for menu in menuSummary.menus {
print("- \(menu.title) (\(menu.enabled ? "Enabled" : "Disabled"))")
for item in menu.items.prefix(5) {
let shortcut = item.keyboard_shortcut.map { " [\($0)]" } ?? ""
print(" • \(item.title)\(shortcut)")
}
}
}
print("\nSnapshot ID: \(context.snapshotId)")
let terminalCapabilities = TerminalDetector.detectCapabilities()
if terminalCapabilities.recommendedOutputMode == .minimal {
print("Agent: Use a tool like view_image to inspect it.")
}
}
private func snapshotPaths(for context: SeeCommandRenderContext) -> SnapshotPaths {
SnapshotPaths(
raw: context.screenshotPath,
annotated: context.annotatedPath ?? context.screenshotPath,
map: self.services.snapshots.getSnapshotStoragePath() + "/\(context.snapshotId)/snapshot.json"
)
}
}
// MARK: - Multi-Screen Support
extension SeeCommand {
private func performScreenCapture() async throws -> CaptureResult {
// Log warning if annotation was requested for full screen captures
if self.annotate {
self.logger.info("Annotation is disabled for full screen captures due to performance constraints")
}
self.logger.verbose("Initiating screen capture", category: "Capture")
self.logger.startTimer("screen_capture")
defer {
self.logger.stopTimer("screen_capture")
}
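        // When --analyze is set without --screen-index, default to display 0 so the
        // AI provider receives a single image rather than a multi-screen batch.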
if let index = self.screenIndex ?? (self.analyze != nil ? 0 : nil) {
// Capture specific screen
self.logger.verbose("Capturing specific screen", category: "Capture", metadata: ["screenIndex": index])
let result = try await ScreenCaptureBridge.captureScreen(services: self.services, displayIndex: index)
// Add display info to output
if let displayInfo = result.metadata.displayInfo {
self.printScreenDisplayInfo(
index: index,
displayInfo: displayInfo,
indent: "",
suffix: nil
)
}
self.logger.verbose("Screen capture completed", category: "Capture", metadata: [
"mode": "screen-index",
"screenIndex": index,
"imageBytes": result.imageData.count
])
return result
} else {
// Capture all screens
self.logger.verbose("Capturing all screens", category: "Capture")
let results = try await self.captureAllScreens()
if results.isEmpty {
throw CaptureError.captureFailure("Failed to capture any screens")
}
// Save all screenshots except the first (which will be saved by the normal flow)
print("📸 Captured \(results.count) screen(s):")
for (index, result) in results.indexed() {
if index > 0 {
// Save additional screenshots
let screenPath: String
if let basePath = self.path {
// User specified a path - add screen index to filename
let directory = (basePath as NSString).deletingLastPathComponent
let filename = (basePath as NSString).lastPathComponent
let nameWithoutExt = (filename as NSString).deletingPathExtension
let ext = (filename as NSString).pathExtension
screenPath = (directory as NSString)
.appendingPathComponent("\(nameWithoutExt)_screen\(index).\(ext)")
} else {
// Default path with screen index
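                        // ISO8601 timestamps contain ':' characters; macOS allows them
                        // in POSIX paths (Finder renders them as '/'), but they can be
                        // awkward if the file is copied to other filesystems.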
let timestamp = ISO8601DateFormatter().string(from: Date())
screenPath = "screenshot_\(timestamp)_screen\(index).png"
}
// Save the screenshot
try result.imageData.write(to: URL(fileURLWithPath: screenPath))
// Display info about this screen
if let displayInfo = result.metadata.displayInfo {
let fileSize = self.getFileSize(screenPath) ?? 0
let suffix = "\(screenPath) (\(self.formatFileSize(Int64(fileSize))))"
self.printScreenDisplayInfo(
index: index,
displayInfo: displayInfo,
indent: " ",
suffix: suffix
)
}
} else {
// First screen will be saved by the normal flow, just show info
if let displayInfo = result.metadata.displayInfo {
self.printScreenDisplayInfo(
index: index,
displayInfo: displayInfo,
indent: " ",
suffix: "(primary)"
)
}
}
}
// Return the primary screen result (first one)
self.logger.verbose("Multi-screen capture completed", category: "Capture", metadata: [
"count": results.count,
"primaryBytes": results.first?.imageData.count ?? 0
])
return results[0]
}
}
}
// MARK: - Multi-Screen Capture Helpers
extension SeeCommand {
private func captureAllScreens() async throws -> [CaptureResult] {
var results: [CaptureResult] = []
// Get available displays from the screen capture service
let content = try await SCShareableContent.current
let displays = content.displays
self.logger.info("Found \(displays.count) display(s) to capture")
for (index, display) in displays.indexed() {
self.logger.verbose("Capturing display \(index)", category: "MultiScreen", metadata: [
"displayID": display.displayID,
"width": display.width,
"height": display.height
])
do {
let result = try await ScreenCaptureBridge.captureScreen(services: self.services, displayIndex: index)
                // CaptureResult is immutable and carries no output path to rewrite;
                // per-screen filenames are assigned later when the images are saved.
                results.append(result)
} catch {
self.logger.error("Failed to capture display \(index): \(error)")
// Continue capturing other screens even if one fails
}
}
if results.isEmpty {
throw CaptureError.captureFailure("Failed to capture any screens")
}
return results
}
private func formatFileSize(_ bytes: Int64) -> String {
let formatter = ByteCountFormatter()
formatter.countStyle = .file
return formatter.string(fromByteCount: bytes)
}
}
@MainActor
extension SeeCommand: ParsableCommand {
nonisolated(unsafe) static var commandDescription: CommandDescription {
MainActorCommandDescription.describe {
let definition = VisionToolDefinitions.see.commandConfiguration
return CommandDescription(
commandName: definition.commandName,
abstract: definition.abstract,
discussion: definition.discussion,
usageExamples: [
CommandUsageExample(
command: "peekaboo see --json-output --annotate --path /tmp/see.png",
description: "Capture the frontmost window, print structured output, and save annotations."
),
CommandUsageExample(
command: "peekaboo see --app Safari --window-title \"Login\" --json-output",
description: "Target a specific Safari window to collect stable element IDs."
),
CommandUsageExample(
command: "peekaboo see --mode screen --screen-index 0 --analyze 'Summarize the dashboard'",
description: "Capture a display and immediately send it to the configured AI provider."
)
],
showHelpOnEmptyInvocation: true
)
}
}
}
extension SeeCommand: AsyncRuntimeCommand {}
@MainActor
extension SeeCommand: CommanderBindableCommand {
mutating func applyCommanderValues(_ values: CommanderBindableValues) throws {
self.app = values.singleOption("app")
self.pid = try values.decodeOption("pid", as: Int32.self)
self.windowTitle = values.singleOption("windowTitle")
self.windowId = try values.decodeOption("windowId", as: Int.self)
if let parsedMode: PeekabooCore.CaptureMode = try values.decodeOptionEnum("mode", caseInsensitive: false) {
self.mode = parsedMode
}
self.path = values.singleOption("path")
self.screenIndex = try values.decodeOption("screenIndex", as: Int.self)
self.annotate = values.flag("annotate")
        self.analyze = values.singleOption("analyze")
        self.timeoutSeconds = try values.decodeOption("timeoutSeconds", as: Int.self)
        self.captureEngine = values.singleOption("captureEngine")
        self.noWebFocus = values.flag("noWebFocus")
}
}
extension SeeCommand {
private func screenDisplayBaseText(index: Int, displayInfo: DisplayInfo) -> String {
let displayName = displayInfo.name ?? "Display \(index)"
let bounds = displayInfo.bounds
let resolution = "(\(Int(bounds.width))×\(Int(bounds.height)))"
return "[scrn]️ Display \(index): \(displayName) \(resolution)"
}
private func printScreenDisplayInfo(
index: Int,
displayInfo: DisplayInfo,
indent: String = "",
suffix: String? = nil
) {
var line = self.screenDisplayBaseText(index: index, displayInfo: displayInfo)
if let suffix {
line += " → \(suffix)"
}
print("\(indent)\(line)")
}
}