import Algorithms
import AppKit
import AXorcist
import Commander
import CoreGraphics
import Foundation
import PeekabooCore
import PeekabooFoundation
import ScreenCaptureKit
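/// Funnels screen-capture calls through the main actor. The capture service is
/// main-actor-isolated while `SeeCommand` helpers may run elsewhere, so each bridge
/// method hops actors via `Task { @MainActor in ... }.value`.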
private enum ScreenCaptureBridge {
static func captureFrontmost(services: any PeekabooServiceProviding) async throws -> CaptureResult {
try await Task { @MainActor in
try await services.screenCapture.captureFrontmost()
}.value
}
static func captureWindow(
services: any PeekabooServiceProviding,
appIdentifier: String,
windowIndex: Int?
) async throws -> CaptureResult {
try await Task { @MainActor in
try await services.screenCapture.captureWindow(appIdentifier: appIdentifier, windowIndex: windowIndex)
}.value
}
static func captureArea(services: any PeekabooServiceProviding, rect: CGRect) async throws -> CaptureResult {
try await Task { @MainActor in
try await services.screenCapture.captureArea(rect)
}.value
}
static func captureScreen(
services: any PeekabooServiceProviding,
displayIndex: Int?
) async throws -> CaptureResult {
try await Task { @MainActor in
try await services.screenCapture.captureScreen(displayIndex: displayIndex)
}.value
}
}
/// Capture a screenshot and build an interactive UI map
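///
/// Typical invocations (mirroring the usage examples registered in
/// `commandDescription` at the bottom of this file):
///
/// ```
/// peekaboo see --json-output --annotate --path /tmp/see.png
/// peekaboo see --app Safari --window-title "Login" --json-output
/// peekaboo see --mode screen --screen-index 0 --analyze 'Summarize the dashboard'
/// ```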
@available(macOS 14.0, *)
struct SeeCommand: ApplicationResolvable, ErrorHandlingCommand, RuntimeOptionsConfigurable {
@Option(help: "Application name to capture, or special values: 'menubar', 'frontmost'")
var app: String?
@Option(name: .long, help: "Target application by process ID")
var pid: Int32?
@Option(help: "Specific window title to capture")
var windowTitle: String?
@Option(help: "Capture mode (screen, window, frontmost)")
var mode: PeekabooCore.CaptureMode?
@Option(
names: [.automatic, .customLong("save"), .customLong("output"), .customShort("o", allowingJoined: false)],
help: "Output path for screenshot (aliases: --save, --output, -o)"
)
var path: String?
@Option(
name: .long,
help: "Specific screen index to capture (0-based). If not specified, captures all screens when in screen mode"
)
var screenIndex: Int?
@Flag(help: "Generate annotated screenshot with interaction markers")
var annotate = false
@Option(help: "Analyze captured content with AI")
var analyze: String?
@Option(
name: .long,
help: """
Capture engine: auto|modern|sckit|classic|cg (default: auto).
modern/sckit force ScreenCaptureKit; classic/cg force CGWindowList;
auto tries SC then falls back when allowed.
"""
)
var captureEngine: String?
@Flag(name: .customLong("no-web-focus"), help: "Skip web-content focus fallback when no text fields are detected")
var noWebFocus = false
@RuntimeStorage private var runtime: CommandRuntime?
var runtimeOptions = CommandRuntimeOptions()
private var resolvedRuntime: CommandRuntime {
guard let runtime else {
preconditionFailure("CommandRuntime must be configured before accessing runtime resources")
}
return runtime
}
var jsonOutput: Bool { self.runtime?.configuration.jsonOutput ?? self.runtimeOptions.jsonOutput }
var verbose: Bool { self.runtime?.configuration.verbose ?? self.runtimeOptions.verbose }
private var logger: Logger { self.resolvedRuntime.logger }
private var services: any PeekabooServiceProviding { self.resolvedRuntime.services }
var outputLogger: Logger { self.logger }
@MainActor
mutating func run(using runtime: CommandRuntime) async throws {
self.runtime = runtime
let startTime = Date()
let logger = self.logger
let overallTimeout: TimeInterval = 30.0
logger.operationStart("see_command", metadata: [
"app": self.app ?? "none",
"mode": self.mode?.rawValue ?? "auto",
"annotate": self.annotate,
"hasAnalyzePrompt": self.analyze != nil,
])
let commandCopy = self
let operationTask = Task {
try await commandCopy.runImpl(startTime: startTime, logger: logger)
}
let timeoutTask = Task {
try await Task.sleep(nanoseconds: UInt64(overallTimeout * 1_000_000_000))
operationTask.cancel()
throw CaptureError.detectionTimedOut(overallTimeout)
}
do {
_ = try await withThrowingTaskGroup(of: Void.self) { group in
// Cancel both outer tasks on exit: awaiting `.value` does not propagate
// cancellation to an unstructured Task, so without this the group would keep
// waiting on the 30s sleeper even after the operation finishes.
defer {
operationTask.cancel()
timeoutTask.cancel()
}
group.addTask { try await operationTask.value }
group.addTask { try await timeoutTask.value }
guard let value = try await group.next() else { return }
group.cancelAll()
return value
}
} catch {
logger.operationComplete(
"see_command",
success: false,
metadata: [
"error": error.localizedDescription,
]
)
throw error
}
}
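/// Runs the full pipeline: permission check, capture plus element detection,
/// optional annotation, optional AI analysis, then result rendering.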
private func runImpl(startTime: Date, logger: Logger) async throws {
do {
// Check permissions
logger.verbose("Checking screen recording permissions", category: "Permissions")
try await requireScreenRecordingPermission(services: self.services)
logger.verbose("Screen recording permission granted", category: "Permissions")
// Perform capture and element detection
logger.verbose("Starting capture and detection phase", category: "Capture")
let captureResult = try await performCaptureWithDetection()
logger.verbose("Capture completed successfully", category: "Capture", metadata: [
"sessionId": captureResult.sessionId,
"elementCount": captureResult.elements.all.count,
"screenshotSize": self.getFileSize(captureResult.screenshotPath) ?? 0,
])
// Generate annotated screenshot if requested
var annotatedPath: String?
if self.annotate {
logger.operationStart("generate_annotations")
annotatedPath = try await self.generateAnnotatedScreenshot(
sessionId: captureResult.sessionId,
originalPath: captureResult.screenshotPath
)
logger.operationComplete("generate_annotations", metadata: [
"annotatedPath": annotatedPath ?? "none",
])
}
// Perform AI analysis if requested
var analysisResult: SeeAnalysisData?
if let prompt = analyze {
// Pre-analysis diagnostics
let fileSize = (try? FileManager.default
.attributesOfItem(atPath: captureResult.screenshotPath)[.size] as? Int) ?? 0
logger.verbose(
"Starting AI analysis",
category: "AI",
metadata: [
"imagePath": captureResult.screenshotPath,
"imageSizeBytes": fileSize,
"promptLength": prompt.count
]
)
logger.operationStart("ai_analysis", metadata: ["promptPreview": String(prompt.prefix(80))])
logger.startTimer("ai_generate")
analysisResult = try await self.performAnalysisDetailed(
imagePath: captureResult.screenshotPath,
prompt: prompt
)
logger.stopTimer("ai_generate")
logger.operationComplete(
"ai_analysis",
success: analysisResult != nil,
metadata: [
"provider": analysisResult?.provider ?? "unknown",
"model": analysisResult?.model ?? "unknown"
]
)
}
// Output results
let executionTime = Date().timeIntervalSince(startTime)
logger.operationComplete("see_command", metadata: [
"executionTimeMs": Int(executionTime * 1000),
"success": true,
])
let context = SeeCommandRenderContext(
sessionId: captureResult.sessionId,
screenshotPath: captureResult.screenshotPath,
annotatedPath: annotatedPath,
metadata: captureResult.metadata,
elements: captureResult.elements,
analysis: analysisResult,
executionTime: executionTime
)
await self.renderResults(context: context)
} catch {
logger.operationComplete("see_command", success: false, metadata: [
"error": error.localizedDescription,
])
self.handleError(error) // Use protocol's error handling
throw ExitCode.failure
}
}
private func getFileSize(_ path: String) -> Int? {
try? FileManager.default.attributesOfItem(atPath: path)[.size] as? Int
}
private func renderResults(context: SeeCommandRenderContext) async {
if self.jsonOutput {
await self.outputJSONResults(context: context)
} else {
await self.outputTextResults(context: context)
}
}
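/// Captures the requested target (handling the special `menubar`/`frontmost` app values),
/// writes the screenshot to disk, runs element detection under a wall-clock bound, and
/// persists both the screenshot and the detection result into the session store.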
private func performCaptureWithDetection() async throws -> CaptureAndDetectionResult {
// Handle special app cases
let captureResult: CaptureResult
if let appName = self.app?.lowercased() {
switch appName {
case "menubar":
self.logger.verbose("Capturing menu bar area", category: "Capture")
captureResult = try await self.captureMenuBar()
case "frontmost":
self.logger.verbose("Capturing frontmost window (via --app frontmost)", category: "Capture")
captureResult = try await ScreenCaptureBridge.captureFrontmost(services: self.services)
default:
// Use normal capture logic
captureResult = try await self.performStandardCapture()
}
} else {
// Use normal capture logic
captureResult = try await self.performStandardCapture()
}
// Save screenshot
self.logger.startTimer("file_write")
let outputPath = try saveScreenshot(captureResult.imageData)
self.logger.stopTimer("file_write")
// Create window context from capture metadata
let windowContext = WindowContext(
applicationName: captureResult.metadata.applicationInfo?.name,
windowTitle: captureResult.metadata.windowInfo?.title,
windowBounds: captureResult.metadata.windowInfo?.bounds,
shouldFocusWebContent: !self.noWebFocus
)
// Detect UI elements with window context
self.logger.operationStart("element_detection")
let detectionResult: ElementDetectionResult
do {
detectionResult = try await Self.withWallClockTimeout(seconds: 20.0) {
try await AutomationServiceBridge.detectElements(
automation: self.services.automation,
imageData: captureResult.imageData,
sessionId: nil,
windowContext: windowContext
)
}
} catch is TimeoutError {
throw CaptureError.detectionTimedOut(20.0)
}
self.logger.operationComplete("element_detection")
// Update the result with the correct screenshot path
let resultWithPath = ElementDetectionResult(
sessionId: detectionResult.sessionId,
screenshotPath: outputPath,
elements: detectionResult.elements,
metadata: detectionResult.metadata
)
try await self.services.sessions.storeScreenshot(
sessionId: detectionResult.sessionId,
screenshotPath: outputPath,
applicationName: windowContext.applicationName,
windowTitle: windowContext.windowTitle,
windowBounds: windowContext.windowBounds
)
// Store the result in session
try await self.services.sessions.storeDetectionResult(
sessionId: detectionResult.sessionId,
result: resultWithPath
)
return CaptureAndDetectionResult(
sessionId: detectionResult.sessionId,
screenshotPath: outputPath,
elements: detectionResult.elements,
metadata: detectionResult.metadata
)
}
private func performStandardCapture() async throws -> CaptureResult {
let effectiveMode = self.determineMode()
self.logger.verbose(
"Determined capture mode",
category: "Capture",
metadata: ["mode": effectiveMode.rawValue]
)
self.logger.operationStart("capture_phase", metadata: ["mode": effectiveMode.rawValue])
switch effectiveMode {
case .screen:
// Handle screen capture with multi-screen support
let result = try await self.performScreenCapture()
self.logger.operationComplete("capture_phase", metadata: ["mode": effectiveMode.rawValue])
return result
case .multi:
// Commander currently treats multi captures as multi-display screen grabs
let result = try await self.performScreenCapture()
self.logger.operationComplete("capture_phase", metadata: ["mode": effectiveMode.rawValue])
return result
case .window:
if self.app != nil || self.pid != nil {
let appIdentifier = try self.resolveApplicationIdentifier()
self.logger.verbose("Initiating window capture", category: "Capture", metadata: [
"app": appIdentifier,
"windowTitle": self.windowTitle ?? "any",
])
let windowIndex = try await self.resolveSeeWindowIndex(
appIdentifier: appIdentifier,
titleFragment: self.windowTitle
)
self.logger.startTimer("window_capture")
let result = try await ScreenCaptureBridge.captureWindow(
services: self.services,
appIdentifier: appIdentifier,
windowIndex: windowIndex
)
self.logger.stopTimer("window_capture")
self.logger.operationComplete("capture_phase", metadata: ["mode": effectiveMode.rawValue])
return result
} else {
throw ValidationError("--app or --pid is required for window mode")
}
case .frontmost:
self.logger.verbose("Capturing frontmost window")
let result = try await ScreenCaptureBridge.captureFrontmost(services: self.services)
self.logger.operationComplete("capture_phase", metadata: ["mode": effectiveMode.rawValue])
return result
case .area:
throw ValidationError("Area capture mode is not supported for 'see' yet. Use --mode screen or window")
}
}
private func captureMenuBar() async throws -> CaptureResult {
// Get the main screen bounds
guard let mainScreen = NSScreen.main else {
throw PeekabooError.captureFailed("No main screen found")
}
// Menu bar spans the top edge of the screen; NSScreen frames use a bottom-left
// origin, so the top edge sits at origin.y + height - menuBarHeight.
let menuBarHeight: CGFloat = 24.0 // Standard height; notched displays use a taller menu bar
let menuBarRect = CGRect(
x: mainScreen.frame.origin.x,
y: mainScreen.frame.origin.y + mainScreen.frame.height - menuBarHeight,
width: mainScreen.frame.width,
height: menuBarHeight
)
// Capture the menu bar area
return try await ScreenCaptureBridge.captureArea(services: self.services, rect: menuBarRect)
}
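/// Resolves the output path (an explicit --path, tilde-expanded, or a timestamped file in
/// the configured default save directory), creates intermediate directories, and writes the PNG.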
private func saveScreenshot(_ imageData: Data) throws -> String {
let outputPath: String
if let providedPath = path {
outputPath = NSString(string: providedPath).expandingTildeInPath
} else {
let timestamp = Date().timeIntervalSince1970
let filename = "peekaboo_see_\(Int(timestamp)).png"
let defaultPath = ConfigurationManager.shared.getDefaultSavePath(cliValue: nil)
outputPath = (defaultPath as NSString).appendingPathComponent(filename)
}
// Create directory if needed
let directory = (outputPath as NSString).deletingLastPathComponent
try FileManager.default.createDirectory(
atPath: directory,
withIntermediateDirectories: true
)
// Save the image
try imageData.write(to: URL(fileURLWithPath: outputPath))
self.logger.verbose("Saved screenshot to: \(outputPath)")
return outputPath
}
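/// Resolves which window index to capture for an app, optionally matching a title fragment.
/// Returns nil (letting the capture service pick) when window enumeration is unavailable,
/// e.g. because accessibility permission is missing.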
private func resolveSeeWindowIndex(appIdentifier: String, titleFragment: String?) async throws -> Int? {
do {
let windows = try await WindowServiceBridge.listWindows(
windows: self.services.windows,
target: .application(appIdentifier)
)
let filtered = WindowFilterHelper.filter(
windows: windows,
appIdentifier: appIdentifier,
mode: .capture,
logger: self.logger
)
guard !filtered.isEmpty else {
throw CaptureError.windowNotFound
}
if let fragment = titleFragment {
guard let match = filtered.first(where: { window in
window.title.localizedCaseInsensitiveContains(fragment)
}) else {
throw CaptureError.windowNotFound
}
return match.index
}
return filtered.first?.index
} catch let error as PeekabooError {
switch error {
case .permissionDeniedAccessibility, .windowNotFound:
self.logger.debug(
"Window enumeration unavailable; falling back",
metadata: ["app": appIdentifier, "reason": error.localizedDescription]
)
return nil
default:
throw error
}
} catch {
self.logger.debug(
"Window enumeration failed; falling back",
metadata: ["app": appIdentifier, "reason": error.localizedDescription]
)
return nil
}
}
// swiftlint:disable function_body_length
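/// Draws role-colored bounding boxes and ID labels (placed via SmartLabelPlacer) over the
/// captured screenshot and writes the result alongside the original as `*_annotated.png`.
/// Returns the original path when the session has no detection result or no enabled elements.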
private func generateAnnotatedScreenshot(
sessionId: String,
originalPath: String
) async throws -> String {
// Get detection result from session
guard let detectionResult = try await self.services.sessions.getDetectionResult(sessionId: sessionId)
else {
self.logger.info("No detection result found for session")
return originalPath
}
// Create annotated image
let annotatedPath = (originalPath as NSString).deletingPathExtension + "_annotated.png"
// Load original image
guard let nsImage = NSImage(contentsOfFile: originalPath) else {
throw CaptureError.fileIOError("Failed to load image from \(originalPath)")
}
// Get image size
let imageSize = nsImage.size
// Create bitmap context
guard let bitmapRep = NSBitmapImageRep(
bitmapDataPlanes: nil,
pixelsWide: Int(imageSize.width),
pixelsHigh: Int(imageSize.height),
bitsPerSample: 8,
samplesPerPixel: 4,
hasAlpha: true,
isPlanar: false,
colorSpaceName: .calibratedRGB,
bytesPerRow: 0,
bitsPerPixel: 0
)
else {
throw CaptureError.captureFailure("Failed to create bitmap representation")
}
// Draw into context
NSGraphicsContext.saveGraphicsState()
guard let context = NSGraphicsContext(bitmapImageRep: bitmapRep) else {
self.logger.error("Failed to create graphics context")
throw CaptureError.captureFailure("Failed to create graphics context")
}
NSGraphicsContext.current = context
self.logger.verbose("Graphics context created successfully")
// Draw original image
nsImage.draw(in: NSRect(origin: .zero, size: imageSize))
self.logger.verbose("Original image drawn")
// Configure text attributes - smaller font for less occlusion
let fontSize: CGFloat = 8
let textAttributes: [NSAttributedString.Key: Any] = [
.font: NSFont.systemFont(ofSize: fontSize, weight: .semibold),
.foregroundColor: NSColor.white,
]
// Role-based colors from spec
let roleColors: [ElementType: NSColor] = [
.button: NSColor(red: 0, green: 0.48, blue: 1.0, alpha: 1.0), // #007AFF
.textField: NSColor(red: 0.204, green: 0.78, blue: 0.349, alpha: 1.0), // #34C759
.link: NSColor(red: 0, green: 0.48, blue: 1.0, alpha: 1.0), // #007AFF
.checkbox: NSColor(red: 0.557, green: 0.557, blue: 0.576, alpha: 1.0), // #8E8E93
.slider: NSColor(red: 0.557, green: 0.557, blue: 0.576, alpha: 1.0), // #8E8E93
.menu: NSColor(red: 0, green: 0.48, blue: 1.0, alpha: 1.0), // #007AFF
]
// Draw UI elements
let enabledElements = detectionResult.elements.all.filter(\.isEnabled)
if enabledElements.isEmpty {
self.logger.info("No enabled elements to annotate. Total elements: \(detectionResult.elements.all.count)")
print("\(AgentDisplayTokens.Status.warning) No interactive UI elements found to annotate")
return originalPath // Return original image if no elements to annotate
}
self.logger.info(
"Annotating \(enabledElements.count) enabled elements out of \(detectionResult.elements.all.count) total"
)
self.logger.verbose("Image size: \(imageSize)")
// Calculate window origin from element bounds if we have elements
var windowOrigin = CGPoint.zero
if !detectionResult.elements.all.isEmpty {
// Find the leftmost and topmost element to estimate window origin
let minX = detectionResult.elements.all.map(\.bounds.minX).min() ?? 0
let minY = detectionResult.elements.all.map(\.bounds.minY).min() ?? 0
windowOrigin = CGPoint(x: minX, y: minY)
self.logger.verbose("Estimated window origin from elements: \(windowOrigin)")
}
// Convert all element bounds to window-relative coordinates and flip Y
var elementRects: [(element: DetectedElement, rect: NSRect)] = []
for element in enabledElements {
let elementFrame = CGRect(
x: element.bounds.origin.x - windowOrigin.x,
y: element.bounds.origin.y - windowOrigin.y,
width: element.bounds.width,
height: element.bounds.height
)
let rect = NSRect(
x: elementFrame.origin.x,
y: imageSize.height - elementFrame.origin.y - elementFrame.height, // Flip Y coordinate
width: elementFrame.width,
height: elementFrame.height
)
elementRects.append((element: element, rect: rect))
}
// Create smart label placer for intelligent label positioning
let labelPlacer = SmartLabelPlacer(
image: nsImage,
fontSize: fontSize,
debugMode: self.verbose,
logger: self.logger
)
// Draw elements and calculate label positions
var labelPositions: [(rect: NSRect, connection: NSPoint?, element: DetectedElement)] = []
for (element, rect) in elementRects {
let drawingDetails = [
"Drawing element: \(element.id)",
"type: \(element.type)",
"original bounds: \(element.bounds)",
"window rect: \(rect)"
].joined(separator: ", ")
self.logger.verbose(drawingDetails)
// Get color for element type
let color = roleColors[element.type] ?? NSColor(red: 0.557, green: 0.557, blue: 0.576, alpha: 1.0)
// Draw bounding box
color.withAlphaComponent(0.5).setFill()
rect.fill()
color.setStroke()
let path = NSBezierPath(rect: rect)
path.lineWidth = 2
path.stroke()
// Calculate label size
let idString = NSAttributedString(string: element.id, attributes: textAttributes)
let textSize = idString.size()
let labelPadding: CGFloat = 4
let labelSize = NSSize(width: textSize.width + labelPadding * 2, height: textSize.height + labelPadding)
// Use smart label placer to find best position
if let placement = labelPlacer.findBestLabelPosition(
for: element,
elementRect: rect,
labelSize: labelSize,
existingLabels: labelPositions.map { ($0.rect, $0.element) },
allElements: elementRects
) {
labelPositions.append((
rect: placement.labelRect,
connection: placement.connectionPoint,
element: element
))
}
}
// Draw all labels and connection lines
for (labelRect, connectionPoint, element) in labelPositions {
// Draw connection line if label is outside - make it more subtle
if let connection = connectionPoint {
NSColor.black.withAlphaComponent(0.3).setStroke()
let linePath = NSBezierPath()
linePath.lineWidth = 0.5
// Draw line from connection point to nearest edge of label
linePath.move(to: connection)
// Find the closest point on label rectangle to the connection point
let closestX = max(labelRect.minX, min(connection.x, labelRect.maxX))
let closestY = max(labelRect.minY, min(connection.y, labelRect.maxY))
linePath.line(to: NSPoint(x: closestX, y: closestY))
linePath.stroke()
}
// Draw label background - more transparent to show content beneath
NSColor.black.withAlphaComponent(0.7).setFill()
NSBezierPath(roundedRect: labelRect, xRadius: 1, yRadius: 1).fill()
// Draw label border (same color as element) - thinner for less occlusion
let color = roleColors[element.type] ?? NSColor(red: 0.557, green: 0.557, blue: 0.576, alpha: 1.0)
color.withAlphaComponent(0.8).setStroke()
let borderPath = NSBezierPath(roundedRect: labelRect, xRadius: 1, yRadius: 1)
borderPath.lineWidth = 0.5
borderPath.stroke()
// Draw label text
let idString = NSAttributedString(string: element.id, attributes: textAttributes)
idString.draw(at: NSPoint(x: labelRect.origin.x + 4, y: labelRect.origin.y + 2))
}
NSGraphicsContext.restoreGraphicsState()
// Save annotated image
guard let pngData = bitmapRep.representation(using: .png, properties: [:]) else {
throw CaptureError.captureFailure("Failed to create PNG data")
}
try pngData.write(to: URL(fileURLWithPath: annotatedPath))
self.logger.verbose("Created annotated screenshot: \(annotatedPath)")
// Log annotation info only in non-JSON mode
if !self.jsonOutput {
let interactableElements = detectionResult.elements.all.filter(\.isEnabled)
print("š Created annotated screenshot with \(interactableElements.count) interactive elements")
}
return annotatedPath
}
// swiftlint:enable function_body_length
}
// MARK: - Supporting Types
private struct CaptureAndDetectionResult {
let sessionId: String
let screenshotPath: String
let elements: DetectedElements
let metadata: DetectionMetadata
}
private struct SessionPaths {
let raw: String
let annotated: String
let map: String
}
private struct SeeCommandRenderContext {
let sessionId: String
let screenshotPath: String
let annotatedPath: String?
let metadata: DetectionMetadata
let elements: DetectedElements
let analysis: SeeAnalysisData?
let executionTime: TimeInterval
}
// MARK: - JSON Output Structure (matching original)
struct UIElementSummary: Codable {
let id: String
let role: String
let title: String?
let label: String?
let description: String?
let role_description: String?
let help: String?
let identifier: String?
let is_actionable: Bool
let keyboard_shortcut: String?
}
struct SeeAnalysisData: Codable {
let provider: String
let model: String
let text: String
}
struct SeeResult: Codable {
let session_id: String
let screenshot_raw: String
let screenshot_annotated: String
let ui_map: String
let application_name: String?
let window_title: String?
let is_dialog: Bool
let element_count: Int
let interactable_count: Int
let capture_mode: String
let analysis: SeeAnalysisData?
let execution_time: TimeInterval
let ui_elements: [UIElementSummary]
let menu_bar: MenuBarSummary?
var success: Bool = true
}
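// Abbreviated example of the JSON payload this produces (values are illustrative,
// and some fields are omitted for brevity):
// {
//   "session_id": "...",
//   "screenshot_raw": "/tmp/see.png",
//   "screenshot_annotated": "/tmp/see_annotated.png",
//   "ui_map": "<session storage>/<session_id>/map.json",
//   "capture_mode": "window",
//   "element_count": 12,
//   "interactable_count": 8,
//   "success": true
// }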
struct MenuBarSummary: Codable {
let menus: [MenuSummary]
struct MenuSummary: Codable {
let title: String
let item_count: Int
let enabled: Bool
let items: [MenuItemSummary]
}
struct MenuItemSummary: Codable {
let title: String
let enabled: Bool
let keyboard_shortcut: String?
}
}
// MARK: - Format Helpers Extension
extension SeeCommand {
/// Fetches the menu bar summary only when verbose output is requested, with a short timeout.
private func fetchMenuBarSummaryIfEnabled() async -> MenuBarSummary? {
guard self.verbose else { return nil }
do {
return try await Self.withWallClockTimeout(seconds: 2.5) {
try Task.checkCancellation()
return await self.getMenuBarItemsSummary()
}
} catch {
self.logger.debug(
"Skipping menu bar summary",
category: "Menu",
metadata: ["reason": error.localizedDescription]
)
return nil
}
}
/// Timeout helper that is not MainActor-bound, so it can still fire if the main actor is blocked.
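/// Races the operation against a sleeper child; the first to finish wins and the rest are
/// cancelled. A minimal usage sketch (`loadSlowly` is a hypothetical stand-in):
///
/// ```swift
/// let value = try await Self.withWallClockTimeout(seconds: 2.5) {
///     try await loadSlowly()
/// }
/// ```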
static func withWallClockTimeout<T: Sendable>(
seconds: TimeInterval,
operation: @escaping @Sendable () async throws -> T
) async throws -> T {
try await withThrowingTaskGroup(of: T.self) { group in
group.addTask {
try await operation()
}
group.addTask {
try await Task.sleep(nanoseconds: UInt64(seconds * 1_000_000_000))
throw CaptureError.detectionTimedOut(seconds)
}
guard let result = try await group.next() else {
throw CaptureError.detectionTimedOut(seconds)
}
group.cancelAll()
return result
}
}
private func performAnalysisDetailed(imagePath: String, prompt: String) async throws -> SeeAnalysisData {
// Use PeekabooCore AI service which is configured via ConfigurationManager/Tachikoma
let ai = PeekabooAIService()
let res = try await ai.analyzeImageFileDetailed(at: imagePath, question: prompt, model: nil)
return SeeAnalysisData(provider: res.provider, model: res.model, text: res.text)
}
private func buildMenuSummaryIfNeeded() async -> MenuBarSummary? {
// Placeholder for future menu bar summary generation; always returns nil for now,
// so the menu section in outputTextResults is skipped.
nil
}
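/// Resolves the effective capture mode: an explicit --mode wins; --app or --window-title
/// imply window mode; otherwise frontmost.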
private func determineMode() -> PeekabooCore.CaptureMode {
if let mode = self.mode {
mode
} else if self.app != nil || self.windowTitle != nil {
// If app or window title is specified, default to window mode
.window
} else {
// Otherwise default to frontmost
.frontmost
}
}
// MARK: - Output Methods
private func outputJSONResults(context: SeeCommandRenderContext) async {
let uiElements: [UIElementSummary] = context.elements.all.map { element in
UIElementSummary(
id: element.id,
role: element.type.rawValue,
title: element.attributes["title"],
label: element.label,
description: element.attributes["description"],
role_description: element.attributes["roleDescription"],
help: element.attributes["help"],
identifier: element.attributes["identifier"],
is_actionable: element.isEnabled,
keyboard_shortcut: element.attributes["keyboardShortcut"]
)
}
let sessionPaths = self.sessionPaths(for: context)
// Menu bar enumeration can be slow or hang on some setups. Only attempt it in verbose
// mode and bound it with a short timeout so JSON output is responsive by default.
let menuSummary = await self.fetchMenuBarSummaryIfEnabled()
let output = SeeResult(
session_id: context.sessionId,
screenshot_raw: sessionPaths.raw,
screenshot_annotated: sessionPaths.annotated,
ui_map: sessionPaths.map,
application_name: context.metadata.windowContext?.applicationName,
window_title: context.metadata.windowContext?.windowTitle,
is_dialog: context.metadata.isDialog,
element_count: context.metadata.elementCount,
interactable_count: context.elements.all.count { $0.isEnabled },
capture_mode: self.determineMode().rawValue,
analysis: context.analysis,
execution_time: context.executionTime,
ui_elements: uiElements,
menu_bar: menuSummary
)
outputSuccessCodable(data: output, logger: self.outputLogger)
}
private func getMenuBarItemsSummary() async -> MenuBarSummary {
// Get menu bar items from service
var menuExtras: [MenuExtraInfo] = []
do {
menuExtras = try await self.services.menu.listMenuExtras()
} catch {
// If there's an error, just return empty array
menuExtras = []
}
// Group items into menu categories
// For now, we'll create a simplified view showing each menu bar item as a "menu"
let menus = menuExtras.map { extra in
MenuBarSummary.MenuSummary(
title: extra.title,
item_count: 1, // Each menu bar item is treated as a single menu
enabled: true,
items: [
MenuBarSummary.MenuItemSummary(
title: extra.title,
enabled: true,
keyboard_shortcut: nil
)
]
)
}
return MenuBarSummary(menus: menus)
}
private func outputTextResults(context: SeeCommandRenderContext) async {
print("š¼ļø Screenshot saved to: \(context.screenshotPath)")
if let annotatedPath = context.annotatedPath {
print("š Annotated screenshot: \(annotatedPath)")
}
if let appName = context.metadata.windowContext?.applicationName {
print("š± Application: \(appName)")
}
if let windowTitle = context.metadata.windowContext?.windowTitle {
let windowType = context.metadata.isDialog ? "Dialog" : "Window"
let icon = context.metadata.isDialog ? "[dlg]" : "[win]"
print("\(icon) \(windowType): \(windowTitle)")
}
print("š§ Detection method: \(context.metadata.method)")
print("š UI elements detected: \(context.metadata.elementCount)")
print("āļø Interactable elements: \(context.elements.all.count { $0.isEnabled })")
let formattedDuration = String(format: "%.2f", context.executionTime)
print("ā±ļø Execution time: \(formattedDuration)s")
if let analysis = context.analysis {
print("\nš¤ AI Analysis\n\(analysis.text)")
}
if context.metadata.elementCount > 0 {
print("\nš Element Summary")
for element in context.elements.all.prefix(10) {
let summaryLabel = element.label ?? element.attributes["title"] ?? element.value ?? "Untitled"
print("⢠\(element.id) (\(element.type.rawValue)) - \(summaryLabel)")
}
if context.metadata.elementCount > 10 {
print(" ...and \(context.metadata.elementCount - 10) more elements")
}
}
if self.annotate {
print("\nš Annotated screenshot created")
}
if let menuSummary = await self.buildMenuSummaryIfNeeded() {
print("\nš§ Menu Bar Summary")
for menu in menuSummary.menus {
print("- \(menu.title) (\(menu.enabled ? "Enabled" : "Disabled"))")
for item in menu.items.prefix(5) {
let shortcut = item.keyboard_shortcut.map { " [\($0)]" } ?? ""
print(" ⢠\(item.title)\(shortcut)")
}
}
}
print("\nSession ID: \(context.sessionId)")
let terminalCapabilities = TerminalDetector.detectCapabilities()
if terminalCapabilities.recommendedOutputMode == .minimal {
print("Agent: Use a tool like view_image to inspect it.")
}
}
private func sessionPaths(for context: SeeCommandRenderContext) -> SessionPaths {
SessionPaths(
raw: context.screenshotPath,
annotated: context.annotatedPath ?? context.screenshotPath,
map: self.services.sessions.getSessionStoragePath() + "/\(context.sessionId)/map.json"
)
}
}
// MARK: - Multi-Screen Support
extension SeeCommand {
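/// Captures either one display (an explicit --screen-index, or display 0 when --analyze
/// needs a single image) or every display, saving the extra screenshots itself and
/// leaving the primary result to the normal save flow.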
private func performScreenCapture() async throws -> CaptureResult {
// Log warning if annotation was requested for full screen captures
if self.annotate {
self.logger.info("Annotation is disabled for full screen captures due to performance constraints")
}
self.logger.verbose("Initiating screen capture", category: "Capture")
self.logger.startTimer("screen_capture")
defer {
self.logger.stopTimer("screen_capture")
}
// Capture a specific screen. When --analyze is set but no index was given, default
// to display 0 so the AI provider receives a single image.
if let index = self.screenIndex ?? (self.analyze != nil ? 0 : nil) {
self.logger.verbose("Capturing specific screen", category: "Capture", metadata: ["screenIndex": index])
let result = try await ScreenCaptureBridge.captureScreen(services: self.services, displayIndex: index)
// Add display info to output
if let displayInfo = result.metadata.displayInfo {
self.printScreenDisplayInfo(
index: index,
displayInfo: displayInfo,
indent: "",
suffix: nil
)
}
self.logger.verbose("Screen capture completed", category: "Capture", metadata: [
"mode": "screen-index",
"screenIndex": index,
"imageBytes": result.imageData.count
])
return result
} else {
// Capture all screens
self.logger.verbose("Capturing all screens", category: "Capture")
let results = try await self.captureAllScreens()
if results.isEmpty {
throw CaptureError.captureFailure("Failed to capture any screens")
}
// Save all screenshots except the first (which will be saved by the normal flow)
print("šø Captured \(results.count) screen(s):")
for (index, result) in results.indexed() {
if index > 0 {
// Save additional screenshots
let screenPath: String
if let basePath = self.path {
// User specified a path - add screen index to filename
let directory = (basePath as NSString).deletingLastPathComponent
let filename = (basePath as NSString).lastPathComponent
let nameWithoutExt = (filename as NSString).deletingPathExtension
let ext = (filename as NSString).pathExtension
screenPath = (directory as NSString)
.appendingPathComponent("\(nameWithoutExt)_screen\(index).\(ext)")
} else {
// Default path with screen index. Note: unlike saveScreenshot, this writes
// relative to the current working directory.
let timestamp = ISO8601DateFormatter().string(from: Date())
screenPath = "screenshot_\(timestamp)_screen\(index).png"
}
// Save the screenshot
try result.imageData.write(to: URL(fileURLWithPath: screenPath))
// Display info about this screen
if let displayInfo = result.metadata.displayInfo {
let fileSize = self.getFileSize(screenPath) ?? 0
let suffix = "\(screenPath) (\(self.formatFileSize(Int64(fileSize))))"
self.printScreenDisplayInfo(
index: index,
displayInfo: displayInfo,
indent: " ",
suffix: suffix
)
}
} else {
// First screen will be saved by the normal flow, just show info
if let displayInfo = result.metadata.displayInfo {
self.printScreenDisplayInfo(
index: index,
displayInfo: displayInfo,
indent: " ",
suffix: "(primary)"
)
}
}
}
// Return the primary screen result (first one)
self.logger.verbose("Multi-screen capture completed", category: "Capture", metadata: [
"count": results.count,
"primaryBytes": results.first?.imageData.count ?? 0
])
return results[0]
}
}
}
// MARK: - Multi-Screen Capture Helpers
extension SeeCommand {
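/// Best-effort capture of every display reported by SCShareableContent: failures on
/// individual displays are logged and skipped, and an error is thrown only when no
/// display could be captured at all.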
private func captureAllScreens() async throws -> [CaptureResult] {
var results: [CaptureResult] = []
// Get available displays from the screen capture service
let content = try await SCShareableContent.current
let displays = content.displays
self.logger.info("Found \(displays.count) display(s) to capture")
for (index, display) in displays.indexed() {
self.logger.verbose("Capturing display \(index)", category: "MultiScreen", metadata: [
"displayID": display.displayID,
"width": display.width,
"height": display.height
])
do {
let result = try await ScreenCaptureBridge.captureScreen(services: self.services, displayIndex: index)
// Update path to include screen index if capturing multiple screens
if displays.count > 1 {
let updatedResult = self.updateCaptureResultPath(result, screenIndex: index, displayInfo: display)
results.append(updatedResult)
} else {
results.append(result)
}
} catch {
self.logger.error("Failed to capture display \(index): \(error)")
// Continue capturing other screens even if one fails
}
}
if results.isEmpty {
throw CaptureError.captureFailure("Failed to capture any screens")
}
return results
}
private func updateCaptureResultPath(
_ result: CaptureResult,
screenIndex: Int,
displayInfo: SCDisplay
) -> CaptureResult {
// Since CaptureResult is immutable and doesn't have a path property,
// we can't update the path. Just return the original result.
// The saved path is already included in result.savedPath if it was saved.
result
}
private func formatFileSize(_ bytes: Int64) -> String {
let formatter = ByteCountFormatter()
formatter.countStyle = .file
return formatter.string(fromByteCount: bytes)
}
}
@MainActor
extension SeeCommand: ParsableCommand {
nonisolated(unsafe) static var commandDescription: CommandDescription {
MainActorCommandDescription.describe {
let definition = VisionToolDefinitions.see.commandConfiguration
return CommandDescription(
commandName: definition.commandName,
abstract: definition.abstract,
discussion: definition.discussion,
usageExamples: [
CommandUsageExample(
command: "peekaboo see --json-output --annotate --path /tmp/see.png",
description: "Capture the frontmost window, print structured output, and save annotations."
),
CommandUsageExample(
command: "peekaboo see --app Safari --window-title \"Login\" --json-output",
description: "Target a specific Safari window to collect stable element IDs."
),
CommandUsageExample(
command: "peekaboo see --mode screen --screen-index 0 --analyze 'Summarize the dashboard'",
description: "Capture a display and immediately send it to the configured AI provider."
)
],
showHelpOnEmptyInvocation: true
)
}
}
}
extension SeeCommand: AsyncRuntimeCommand {}
@MainActor
extension SeeCommand: CommanderBindableCommand {
mutating func applyCommanderValues(_ values: CommanderBindableValues) throws {
self.app = values.singleOption("app")
self.pid = try values.decodeOption("pid", as: Int32.self)
self.windowTitle = values.singleOption("windowTitle")
if let parsedMode: PeekabooCore.CaptureMode = try values.decodeOptionEnum("mode", caseInsensitive: false) {
self.mode = parsedMode
}
self.path = values.singleOption("path")
self.screenIndex = try values.decodeOption("screenIndex", as: Int.self)
self.annotate = values.flag("annotate")
self.analyze = values.singleOption("analyze")
}
}
extension SeeCommand {
private func screenDisplayBaseText(index: Int, displayInfo: DisplayInfo) -> String {
let displayName = displayInfo.name ?? "Display \(index)"
let bounds = displayInfo.bounds
let resolution = "(\(Int(bounds.width))Ć\(Int(bounds.height)))"
return "[scrn]ļø Display \(index): \(displayName) \(resolution)"
}
private func printScreenDisplayInfo(
index: Int,
displayInfo: DisplayInfo,
indent: String = "",
suffix: String? = nil
) {
var line = self.screenDisplayBaseText(index: index, displayInfo: displayInfo)
if let suffix {
line += " ā \(suffix)"
}
print("\(indent)\(line)")
}
}