import { Image, Region } from '@nut-tree-fork/nut-js';
import { Canvas, createCanvas, loadImage, CanvasRenderingContext2D, ImageData } from 'canvas';
import { imageToBase64, base64ToBuffer } from './image-utils.js';
import { getTextLocations, type TextLocation } from './ocr-utils.js';
import { logger } from './logger.js';
import { UIDetectionError } from './errors.js';
/**
 * A single UI element found in a screenshot.
 *
 * Instances are produced by `UIElementDetector` either from OCR text
 * (`classifyTextElement`) or from pixel analysis (`analyzeVisualRegion`) and
 * are later enriched by the context, validation and hierarchy passes.
 */
export interface UIElement {
/** Identifier built by `generateElementId` (prefix, position, timestamp, random suffix). */
id: string;
/** Coarse element category. */
type: UIElementType;
/** Optional finer-grained category; no detector in this file sets it to a concrete value. */
subtype?: string;
/** Recognised text; the empty string for elements found by pixel analysis only. */
text: string;
/** Left edge of the bounding box, as reported by the detector that produced the element. */
x: number;
/** Top edge of the bounding box, as reported by the detector that produced the element. */
y: number;
/** Bounding-box width. */
width: number;
/** Bounding-box height. */
height: number;
/** Detection confidence; the detectors in this file assign values on a 0-1 scale. */
confidence: number;
/** Whether the detector considers the element a click target. */
clickable: boolean;
/** Whether the element is considered interactive; overwritten by `validateDetectedElements`. */
interactive: boolean;
/** Human-readable summary such as `Button with text "OK"`. */
description: string;
/** Accessibility-style role string (e.g. 'button', 'textbox', 'menuitem'). */
role?: string;
/** Element state; the detectors in this file only ever assign 'enabled'. */
state?: UIElementState;
/**
 * Free-form metadata. Keys written in this file: `label`, `isDialogButton`,
 * `style`, `isMenuGroup`, `verificationScore`, `verificationReasons`, `isVerified`.
 */
attributes: Record<string, any>;
/** Contained elements, populated by `createElementHierarchy`. */
children?: UIElement[];
parent?: string; // parent element ID, set on children by `createElementHierarchy`
/** Detection strategies that contributed to this element. */
detectionMethod: DetectionMethod[];
/** Pixel-derived features; present only on elements that went through visual analysis. */
visualFeatures?: VisualFeatures;
}
/**
 * Closed set of element categories.
 *
 * The detectors in this file currently emit only 'button', 'text_field',
 * 'link', 'menu', 'dialog', 'toolbar', 'scrollbar' and 'other'; the remaining
 * members are declared but never produced here.
 */
export type UIElementType =
| 'button'
| 'text_field'
| 'link'
| 'image'
| 'icon'
| 'dialog'
| 'menu'
| 'window'
| 'checkbox'
| 'radio_button'
| 'dropdown'
| 'slider'
| 'tab'
| 'toolbar'
| 'list'
| 'table'
| 'cell'
| 'scrollbar'
| 'tooltip'
| 'alert'
| 'progress_bar'
| 'other';
/**
 * Possible element states.
 *
 * Every element created in this file is given the state 'enabled'; the other
 * members are available to consumers but are not assigned by the detector.
 */
export type UIElementState =
| 'enabled'
| 'disabled'
| 'selected'
| 'unselected'
| 'focused'
| 'unfocused'
| 'expanded'
| 'collapsed'
| 'loading'
| 'error'
| 'warning'
| 'success';
/**
 * Names of the strategies that can contribute to an element.
 *
 * Only 'visual_analysis', 'text_pattern' and 'ocr_text' are recorded by the
 * detectors in this file; the rest are declared for strategies that are
 * stubbed or not implemented here.
 */
export type DetectionMethod =
| 'visual_analysis'
| 'text_pattern'
| 'color_analysis'
| 'shape_analysis'
| 'accessibility_api'
| 'ocr_text'
| 'context_analysis';
/**
 * Pixel-derived appearance of an element.
 *
 * `extractVisualFeatures` fills `backgroundColor`, `borderRadius`,
 * `isRounded`, `shadow` and `gradient`; the other fields are never populated
 * in this file.
 */
export interface VisualFeatures {
/** Colour string; `extractVisualFeatures` writes it as `rgb(r,g,b)` (or '#FFFFFF' when no pixel data is loaded). */
backgroundColor: string;
borderColor?: string;
textColor?: string;
borderWidth?: number;
/** Estimated corner radius; derived from the region's size, not measured from pixels. */
borderRadius?: number;
/** Always `false` when produced by `extractVisualFeatures`. */
shadow?: boolean;
/** Always `false` when produced by `extractVisualFeatures`. */
gradient?: boolean;
opacity?: number;
/** Set when the region is exactly square and smaller than 100 on each side. */
isRounded?: boolean;
hasIcon?: boolean;
iconPosition?: 'left' | 'right' | 'top' | 'bottom';
}
/**
 * Tunable options for `UIElementDetector`.
 */
export interface DetectionConfig {
/** Run the pixel-level rectangular-region scan. */
enableVisualAnalysis: boolean;
/** Run OCR and classify the recognised text. */
enableTextAnalysis: boolean;
/** Run colour-based detection (currently a stub that returns no elements). */
enableColorAnalysis: boolean;
/** Run shape-based detection (currently a stub that returns no elements). */
enableShapeAnalysis: boolean;
/** Annotate elements using their spatial neighbours (forms, dialogs, menus). */
enableContextAnalysis: boolean;
/** Elements whose confidence is below this value are discarded. */
minConfidence: number;
/** Maximum number of elements kept after sorting by confidence. */
maxElements: number;
/** Collapse elements of the same type that fall into the same 10-unit position bucket. */
skipDuplicates: boolean;
/** Not read anywhere in this file. */
expandSearchRegion: boolean;
/** Not read anywhere in this file. */
debugMode: boolean;
}
/**
 * Baseline configuration; the `UIElementDetector` constructor spreads any
 * caller-supplied overrides on top of these values.
 */
const DEFAULT_DETECTION_CONFIG: DetectionConfig = {
enableVisualAnalysis: true,
enableTextAnalysis: true,
enableColorAnalysis: true,
enableShapeAnalysis: true,
enableContextAnalysis: true,
minConfidence: 0.5,
maxElements: 100,
skipDuplicates: true,
expandSearchRegion: false,
debugMode: false
};
/**
 * Colour palettes and text patterns used as heuristics for macOS-style UI.
 *
 * Entries read by `UIElementDetector` in this file: `buttons.colors.system`,
 * `buttons.colors.default`, `buttons.colors.accent`, every `textPatterns`
 * list, and `textFields.colors.background`. The remaining entries
 * (`destructive`, `minWidth`, `minHeight`, `borderRadius`, `border`,
 * `separator`, `overlay`, menu/dialog backgrounds) are not referenced
 * elsewhere in this file.
 */
const MACOS_UI_PATTERNS = {
buttons: {
colors: {
system: ['#007AFF', '#34C759', '#FF3B30', '#FF9500', '#AF52DE', '#00C7BE', '#FF2D92'],
default: ['#FFFFFF', '#F2F2F7', '#E5E5EA'],
accent: ['#007AFF'],
destructive: ['#FF3B30']
},
// Text is treated as a button label when ANY pattern matches: a known
// action word, a single capitalised word, or two capitalised words.
// NOTE(review): the last two patterns accept any capitalised word(s), so
// most short title-cased OCR text is classified as a button — confirm intended.
textPatterns: [
/^(OK|Cancel|Apply|Save|Delete|Close|Open|Submit|Send|Add|Remove|Edit|Copy|Paste|Cut|Undo|Redo|Refresh|Reload|Login|Logout|Sign In|Sign Up|Register|Continue|Next|Previous|Back|Forward|Play|Pause|Stop|Start|Finish|Done|Create|New|Browse|Search|Find|Help|About|Settings|Preferences|Options)$/i,
/^[A-Z][a-z]+$/,
/^[A-Z][a-z]+\s[A-Z][a-z]+$/
],
minWidth: 44, // Apple HIG minimum touch target
minHeight: 44,
borderRadius: [4, 6, 8, 12, 20] // Common macOS border radius values
},
textFields: {
colors: {
background: ['#FFFFFF', '#F2F2F7'],
border: ['#C7C7CC', '#D1D1D6']
},
minHeight: 28,
borderRadius: [4, 6],
// Unanchored, case-insensitive: any text containing one of these words matches.
textPatterns: [
/placeholder/i,
/enter|type|search|email|password|username|name|address|phone|number|message|comment|description/i
]
},
menus: {
colors: {
background: ['#FFFFFF', '#F2F2F7'],
separator: ['#C6C6C8', '#E5E5EA']
},
// Unanchored, case-insensitive substring match (e.g. "Profile" contains "file").
textPatterns: [
/File|Edit|View|Tools|Help|Window|Format|Insert/i
]
},
dialogs: {
colors: {
background: ['#FFFFFF', '#F2F2F7'],
overlay: ['rgba(0,0,0,0.4)', 'rgba(0,0,0,0.6)']
},
// Unanchored, case-insensitive substring match on typical dialog wording.
textPatterns: [
/alert|warning|error|confirm|are you sure|do you want to|failed|success|completed/i
]
}
};
/** Axis-aligned rectangle in image pixel coordinates. */
type PixelRect = { x: number; y: number; width: number; height: number };

/** 8-bit RGB triple. */
type RgbColor = { r: number; g: number; b: number };

/**
 * Heuristic UI element detector for screenshots.
 *
 * A detection run decodes the screenshot into a canvas, then combines OCR text
 * classification, a pixel-level region scan and spatial-context rules, and
 * finally de-duplicates, validates and nests the resulting elements.
 *
 * NOTE(review): the decoded canvas and pixel buffer are stored on the
 * instance, so overlapping `detectUIElements` calls on the same instance would
 * share that state — confirm callers do not run detections concurrently.
 */
export class UIElementDetector {
  /** Maximum number of regions returned by the rectangular-region scan. */
  private static readonly MAX_REGIONS = 50;

  /** Pixel budget of a single flood fill in `growRegion`. */
  private static readonly MAX_REGION_PIXELS = 1000;

  /** 4-connected neighbourhood used by the flood fill (left, right, up, down). */
  private static readonly NEIGHBOR_OFFSETS: ReadonlyArray<readonly [number, number]> = [
    [-1, 0],
    [1, 0],
    [0, -1],
    [0, 1]
  ];

  /** `rgb(r, g, b)` / `rgba(r, g, b, a)`; the alpha component is matched but ignored. */
  private static readonly RGB_COLOR_PATTERN =
    /rgba?\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*(?:,\s*[\d.]+%?\s*)?\)/i;

  /** `#rgb` or `#rrggbb`. */
  private static readonly HEX_COLOR_PATTERN = /^#([0-9a-f]{3}|[0-9a-f]{6})$/i;

  /** Whole-word match for labels that indicate a destructive/negative action. */
  private static readonly DESTRUCTIVE_TEXT_PATTERN =
    /\b(?:delete|remove|destroy|cancel|no|discard)\b/i;

  private config: DetectionConfig;
  private canvas: Canvas | null = null;
  private ctx: CanvasRenderingContext2D | null = null;
  private imageData: ImageData | null = null;
  // Cleared at the start of every run; nothing in this class adds entries to it.
  private detectedElements: Map<string, UIElement> = new Map();

  /**
   * @param config Overrides merged on top of `DEFAULT_DETECTION_CONFIG`.
   */
  constructor(config: Partial<DetectionConfig> = {}) {
    this.config = { ...DEFAULT_DETECTION_CONFIG, ...config };
  }

  /**
   * Runs every enabled detection strategy on `image` and returns the
   * post-processed, validated element forest.
   *
   * @param image Screenshot to analyse.
   * @param region Optional region forwarded to OCR only; the pixel-level scan
   *   always covers the whole image.
   * @returns Root elements; contained elements are attached as `children`.
   * @throws UIDetectionError when any stage fails (the original message is embedded).
   */
  async detectUIElements(image: Image, region?: Region): Promise<UIElement[]> {
    const timerName = 'ui_element_detection';
    let timerRunning = false;

    try {
      logger.startTimer(timerName);
      timerRunning = true;

      // Initialize canvas for visual analysis
      await this.initializeCanvas(image);
      this.detectedElements.clear();

      const allElements: UIElement[] = [];

      // Strategy 1: OCR-based text element detection
      if (this.config.enableTextAnalysis) {
        const textElements = await this.detectTextBasedElements(image, region);
        allElements.push(...textElements);
      }

      // Strategy 2: Visual analysis for UI components
      if (this.config.enableVisualAnalysis) {
        const visualElements = await this.detectVisualElements();
        allElements.push(...visualElements);
      }

      // Strategy 3: Color-based detection
      if (this.config.enableColorAnalysis) {
        const colorElements = await this.detectColorBasedElements();
        allElements.push(...colorElements);
      }

      // Strategy 4: Shape-based detection
      if (this.config.enableShapeAnalysis) {
        const shapeElements = await this.detectShapeBasedElements();
        allElements.push(...shapeElements);
      }

      // Strategy 5: Context-based element classification (mutates elements in place)
      if (this.config.enableContextAnalysis) {
        await this.enhanceWithContextAnalysis(allElements);
      }

      const processedElements = this.postProcessElements(allElements);
      const validatedElements = await this.validateDetectedElements(processedElements);
      const finalElements = this.createElementHierarchy(validatedElements);

      timerRunning = false;
      logger.endTimer(timerName);
      logger.info('UI element detection completed', {
        totalDetected: allElements.length,
        afterProcessing: processedElements.length,
        afterValidation: validatedElements.length,
        final: finalElements.length,
        methods: Object.keys(this.config).filter(k => k.startsWith('enable') && this.config[k as keyof DetectionConfig])
      });

      return finalElements;
    } catch (error) {
      // Stop the timer on the failure path too, so a failed run does not leave it open.
      if (timerRunning) {
        logger.endTimer(timerName);
      }
      logger.error('UI element detection failed', error as Error);
      throw new UIDetectionError(`UI element detection failed: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  /**
   * Decodes `image` into a canvas and caches its pixel data on the instance.
   *
   * The image is decoded with node-canvas' documented `loadImage` helper.
   * Instance state is only replaced once decoding and drawing have succeeded.
   *
   * @throws UIDetectionError when encoding, decoding or drawing fails.
   */
  private async initializeCanvas(image: Image): Promise<void> {
    try {
      const base64 = await imageToBase64(image);
      // assumes base64ToBuffer yields a Buffer of encoded image bytes — TODO confirm
      const buffer = base64ToBuffer(base64);

      const canvas = createCanvas(image.width, image.height);
      const ctx = canvas.getContext('2d');

      const canvasImage = await loadImage(buffer);
      ctx.drawImage(canvasImage, 0, 0);

      // Get image data for pixel analysis
      const imageData = ctx.getImageData(0, 0, image.width, image.height);

      this.canvas = canvas;
      this.ctx = ctx;
      this.imageData = imageData;
    } catch (error) {
      throw new UIDetectionError(`Failed to initialize canvas: ${error}`);
    }
  }

  /**
   * Runs OCR and classifies each recognised text fragment.
   *
   * Best-effort: an OCR failure is logged as a warning and yields whatever
   * elements were collected so far instead of aborting the whole detection.
   */
  private async detectTextBasedElements(image: Image, region?: Region): Promise<UIElement[]> {
    const elements: UIElement[] = [];

    try {
      const textLocations = await getTextLocations(image, undefined, region);

      for (const textLoc of textLocations) {
        const element = this.classifyTextElement(textLoc);
        if (element && element.confidence >= this.config.minConfidence) {
          elements.push(element);
        }
      }
    } catch (error) {
      logger.warn('Text-based detection failed', error as Error);
    }

    return elements;
  }

  /**
   * Maps one OCR result to an element by testing its text against the pattern
   * tables in a fixed order: button, link, text field, menu, dialog, then plain
   * text ('other').
   *
   * @returns `null` when the trimmed text is empty.
   */
  private classifyTextElement(textLoc: TextLocation): UIElement | null {
    const text = textLoc.text.trim();
    if (!text) {
      return null;
    }

    const detectionMethod: DetectionMethod[] = ['text_pattern', 'ocr_text'];
    const shared = {
      id: this.generateElementId('text', textLoc.x, textLoc.y),
      text,
      x: textLoc.x,
      y: textLoc.y,
      width: textLoc.width,
      height: textLoc.height,
      confidence: textLoc.confidence / 100, // Convert to 0-1 scale
      detectionMethod,
      attributes: {}
    };

    // Every category sets `clickable` and `interactive` to the same value.
    const build = (type: UIElementType, role: string, description: string, actionable: boolean): UIElement => ({
      ...shared,
      type,
      clickable: actionable,
      interactive: actionable,
      description,
      role,
      state: 'enabled'
    });

    if (this.isButtonText(text)) {
      return build('button', 'button', `Button with text "${text}"`, true);
    }
    if (this.isLinkText(text)) {
      return build('link', 'link', `Link with text "${text}"`, true);
    }
    if (this.isTextFieldIndicator(text)) {
      return build('text_field', 'textbox', `Text field labeled "${text}"`, true);
    }
    if (this.isMenuText(text)) {
      return build('menu', 'menuitem', `Menu item "${text}"`, true);
    }
    if (this.isDialogText(text)) {
      return build('dialog', 'dialog', `Dialog content "${text}"`, false);
    }
    return build('other', 'text', `Text element "${text}"`, false);
  }

  /**
   * Scans the cached pixel data for candidate regions and classifies each one.
   *
   * @returns An empty list when no pixel data has been loaded.
   */
  private async detectVisualElements(): Promise<UIElement[]> {
    if (!this.imageData || !this.canvas) {
      return [];
    }

    const { data, width, height } = this.imageData;
    const elements: UIElement[] = [];

    for (const region of this.detectRectangularRegions(data, width, height)) {
      const element = await this.analyzeVisualRegion(region);
      if (element && element.confidence >= this.config.minConfidence) {
        elements.push(element);
      }
    }

    return elements;
  }

  /**
   * Finds up to `MAX_REGIONS` candidate regions by flood-filling outwards from
   * edge pixels, in row-major scan order.
   *
   * A flat `Uint8Array` mask tracks visited pixels and the scan stops as soon
   * as the region cap is reached; the regions returned are the same ones a
   * full scan followed by truncation would give.
   *
   * @param data RGBA pixel bytes, 4 per pixel, row-major.
   * @param width Image width in pixels.
   * @param height Image height in pixels.
   */
  private detectRectangularRegions(data: Uint8ClampedArray, width: number, height: number): Array<{x: number, y: number, width: number, height: number}> {
    const regions: PixelRect[] = [];

    // The one-pixel border is never used as a seed, so tiny images yield nothing.
    if (!(width >= 3) || !(height >= 3)) {
      return regions;
    }

    const visited = new Uint8Array(width * height);

    for (let y = 1; y < height - 1; y++) {
      for (let x = 1; x < width - 1; x++) {
        if (visited[y * width + x] === 1) {
          continue;
        }
        if (!this.isEdgePixel(data, x, y, width)) {
          continue;
        }

        const region = this.growRegion(data, x, y, width, height, visited);
        if (this.isValidUIRegion(region)) {
          regions.push(region);
          if (regions.length >= UIElementDetector.MAX_REGIONS) {
            return regions; // Limit regions to prevent overprocessing
          }
        }
      }
    }

    return regions;
  }

  /**
   * Sum of absolute R, G and B differences between the pixels starting at byte
   * offsets `i` and `j`. Yields `NaN` when either offset lies outside `data`,
   * which makes every threshold comparison on the result false.
   */
  private rgbDistance(data: Uint8ClampedArray, i: number, j: number): number {
    return Math.abs(data[i] - data[j]) +
      Math.abs(data[i + 1] - data[j + 1]) +
      Math.abs(data[i + 2] - data[j + 2]);
  }

  /**
   * A pixel is an edge when any of its four direct neighbours differs from it
   * by more than 30 in summed RGB distance.
   */
  private isEdgePixel(data: Uint8ClampedArray, x: number, y: number, width: number): boolean {
    const threshold = 30; // Threshold for edge detection
    const rowStride = width * 4;
    const center = (y * width + x) * 4;

    return this.rgbDistance(data, center, center - 4) > threshold ||
      this.rgbDistance(data, center, center + 4) > threshold ||
      this.rgbDistance(data, center, center - rowStride) > threshold ||
      this.rgbDistance(data, center, center + rowStride) > threshold;
  }

  /**
   * Breadth-first flood fill over similar-coloured pixels starting at the seed.
   *
   * Stops after claiming `MAX_REGION_PIXELS` pixels. Every claimed pixel is
   * marked in `visited` so later seeds skip it.
   *
   * @param visited Row-major mask (`width * height` entries); 1 = already claimed.
   * @returns Bounding box of the claimed pixels.
   */
  private growRegion(data: Uint8ClampedArray, startX: number, startY: number, width: number, height: number, visited: Uint8Array): {x: number, y: number, width: number, height: number} {
    let minX = startX;
    let maxX = startX;
    let minY = startY;
    let maxY = startY;

    const queue: Array<[number, number]> = [[startX, startY]];
    let head = 0; // index-based dequeue avoids O(n) Array.prototype.shift
    let claimed = 0;

    while (head < queue.length && claimed < UIElementDetector.MAX_REGION_PIXELS) {
      const [x, y] = queue[head++];

      if (x < 0 || x >= width || y < 0 || y >= height) {
        continue;
      }
      const flat = y * width + x;
      if (visited[flat] === 1) {
        continue;
      }

      visited[flat] = 1;
      claimed++;

      minX = Math.min(minX, x);
      maxX = Math.max(maxX, x);
      minY = Math.min(minY, y);
      maxY = Math.max(maxY, y);

      // Add neighbors if they're similar
      for (const [dx, dy] of UIElementDetector.NEIGHBOR_OFFSETS) {
        const nx = x + dx;
        const ny = y + dy;
        if (nx < 0 || nx >= width || ny < 0 || ny >= height) {
          continue; // out-of-image neighbours can never be claimed
        }
        if (this.areSimilarPixels(data, x, y, nx, ny, width)) {
          queue.push([nx, ny]);
        }
      }
    }

    return {
      x: minX,
      y: minY,
      width: maxX - minX + 1,
      height: maxY - minY + 1
    };
  }

  /**
   * Two pixels are similar when their summed RGB distance is below 20.
   */
  private areSimilarPixels(data: Uint8ClampedArray, x1: number, y1: number, x2: number, y2: number, width: number): boolean {
    const first = (y1 * width + x1) * 4;
    const second = (y2 * width + x2) * 4;
    return this.rgbDistance(data, first, second) < 20; // Similarity threshold
  }

  /**
   * Accepts regions between 10 and 1000 pixels on each side with an area of at
   * least 100 pixels.
   */
  private isValidUIRegion(region: {x: number, y: number, width: number, height: number}): boolean {
    const { width, height } = region;
    if (width < 10 || height < 10) {
      return false;
    }
    if (width > 1000 || height > 1000) {
      return false;
    }
    return width * height >= 100;
  }

  /**
   * Builds an element for `region` when its visual features match a known
   * element shape.
   *
   * @returns `null` when the region matches none of the visual rules.
   */
  private async analyzeVisualRegion(region: {x: number, y: number, width: number, height: number}): Promise<UIElement | null> {
    const visualFeatures = this.extractVisualFeatures(region);
    const classification = this.classifyByVisualFeatures(visualFeatures, region);
    if (!classification) {
      return null;
    }

    return {
      id: this.generateElementId('visual', region.x, region.y),
      type: classification.type,
      subtype: classification.subtype,
      text: '', // Will be filled by OCR if available
      x: region.x,
      y: region.y,
      width: region.width,
      height: region.height,
      confidence: classification.confidence,
      clickable: classification.clickable,
      interactive: classification.interactive,
      description: classification.description,
      role: classification.role,
      state: 'enabled',
      detectionMethod: ['visual_analysis'],
      visualFeatures,
      attributes: {}
    };
  }

  /**
   * Samples every 5th pixel of `region` and derives simple appearance features.
   *
   * The "dominant" colour is the arithmetic mean of the samples. Shadow and
   * gradient are always reported as `false`.
   */
  private extractVisualFeatures(region: {x: number, y: number, width: number, height: number}): VisualFeatures {
    if (!this.imageData) {
      return { backgroundColor: '#FFFFFF' };
    }

    const { data, width: imageWidth, height: imageHeight } = this.imageData;
    const sampleRate = 5; // Sample every 5th pixel
    const xEnd = region.x + region.width;
    const yEnd = region.y + region.height;
    const samples: RgbColor[] = [];

    for (let y = region.y; y < yEnd; y += sampleRate) {
      if (y < 0 || y >= imageHeight) {
        continue;
      }
      for (let x = region.x; x < xEnd; x += sampleRate) {
        if (x < 0 || x >= imageWidth) {
          continue;
        }
        const idx = (y * imageWidth + x) * 4;
        samples.push({ r: data[idx], g: data[idx + 1], b: data[idx + 2] });
      }
    }

    const dominant = this.calculateDominantColor(samples);

    return {
      backgroundColor: `rgb(${dominant.r},${dominant.g},${dominant.b})`,
      borderRadius: this.estimateBorderRadius(region),
      isRounded: region.width === region.height && Math.min(region.width, region.height) < 100,
      shadow: false, // Complex to detect from static image
      gradient: false // Complex to detect from static image
    };
  }

  /**
   * Returns the rounded per-channel mean of `colors`, or white when the list is empty.
   */
  private calculateDominantColor(colors: Array<{r: number, g: number, b: number}>): {r: number, g: number, b: number} {
    if (colors.length === 0) {
      return { r: 255, g: 255, b: 255 };
    }

    let sumR = 0;
    let sumG = 0;
    let sumB = 0;
    for (const color of colors) {
      sumR += color.r;
      sumG += color.g;
      sumB += color.b;
    }

    return {
      r: Math.round(sumR / colors.length),
      g: Math.round(sumG / colors.length),
      b: Math.round(sumB / colors.length)
    };
  }

  /**
   * Guesses a corner radius from the region's proportions only (no pixels are
   * inspected): 10% of the shorter side for near-square regions, otherwise 4.
   */
  private estimateBorderRadius(region: {x: number, y: number, width: number, height: number}): number {
    const aspectRatio = region.width / region.height;
    const isSquareish = Math.abs(aspectRatio - 1) < 0.1;
    return isSquareish ? Math.min(region.width, region.height) * 0.1 : 4;
  }

  /**
   * Picks an element category for a region, testing button, text field,
   * toolbar and scrollbar rules in that order.
   *
   * @returns `null` when no rule matches.
   */
  private classifyByVisualFeatures(features: VisualFeatures, region: {width: number, height: number}): {
    type: UIElementType;
    subtype?: string;
    confidence: number;
    clickable: boolean;
    interactive: boolean;
    description: string;
    role: string;
  } | null {
    if (this.looksLikeButton(features, region)) {
      return {
        type: 'button',
        confidence: 0.7,
        clickable: true,
        interactive: true,
        description: 'Visually detected button',
        role: 'button'
      };
    }

    if (this.looksLikeTextField(features, region)) {
      return {
        type: 'text_field',
        confidence: 0.6,
        clickable: true,
        interactive: true,
        description: 'Visually detected text field',
        role: 'textbox'
      };
    }

    const aspectRatio = region.width / region.height;

    // Very rectangular and wide - might be a toolbar or menu
    if (aspectRatio > 5 && region.height < 50) {
      return {
        type: 'toolbar',
        confidence: 0.5,
        clickable: false,
        interactive: true,
        description: 'Visually detected toolbar',
        role: 'toolbar'
      };
    }

    // Very tall and narrow - might be a scrollbar
    if (aspectRatio < 0.2 && region.width < 30) {
      return {
        type: 'scrollbar',
        confidence: 0.6,
        clickable: true,
        interactive: true,
        description: 'Visually detected scrollbar',
        role: 'scrollbar'
      };
    }

    return null;
  }

  /**
   * Button rule: between 44x30 and 300x100, aspect ratio 1-8, and either a
   * button-like background colour or rounded corners.
   *
   * NOTE(review): for features produced by `extractVisualFeatures`, the
   * estimated `borderRadius` appears to exceed 3 for every region that passes
   * the size checks, so the rounded-corner branch always succeeds and the
   * colour test never decides the outcome — confirm this is intended.
   */
  private looksLikeButton(features: VisualFeatures, region: {width: number, height: number}): boolean {
    const { width, height } = region;

    // Button size constraints (Apple HIG)
    if (width < 44 || height < 30 || width > 300 || height > 100) {
      return false;
    }

    // Button aspect ratio (not too wide or tall)
    const aspectRatio = width / height;
    if (aspectRatio < 1 || aspectRatio > 8) {
      return false;
    }

    if (this.isButtonColor(features.backgroundColor)) {
      return true;
    }

    // Rounded corners suggest button
    if (features.isRounded || (features.borderRadius && features.borderRadius > 3)) {
      return true;
    }

    return false;
  }

  /**
   * Text-field rule: 20-60 high, at least 50 wide, at least twice as wide as
   * tall, with a text-field background colour.
   */
  private looksLikeTextField(features: VisualFeatures, region: {width: number, height: number}): boolean {
    const { width, height } = region;

    if (height < 20 || height > 60 || width < 50) {
      return false;
    }
    if (width / height < 2) {
      return false; // Text fields are usually wider than tall
    }

    // Check for text field colors (usually white or light gray)
    return this.isTextFieldColor(features.backgroundColor);
  }

  /**
   * True when `color` is more than 80% similar to any system, default or
   * accent button colour.
   */
  private isButtonColor(color: string): boolean {
    const palette = MACOS_UI_PATTERNS.buttons.colors;
    const candidates = [...palette.system, ...palette.default, ...palette.accent];
    return candidates.some(candidate => this.colorSimilarity(color, candidate) > 0.8);
  }

  /**
   * True when `color` is more than 80% similar to any text-field background colour.
   */
  private isTextFieldColor(color: string): boolean {
    return MACOS_UI_PATTERNS.textFields.colors.background
      .some(candidate => this.colorSimilarity(color, candidate) > 0.8);
  }

  /**
   * Similarity of two colour strings in the range 0-1, based on the summed
   * per-channel RGB difference. Returns 0 when either string cannot be parsed.
   */
  private colorSimilarity(color1: string, color2: string): number {
    const first = this.parseColor(color1);
    const second = this.parseColor(color2);
    if (!first || !second) {
      return 0;
    }

    const totalDiff = Math.abs(first.r - second.r) +
      Math.abs(first.g - second.g) +
      Math.abs(first.b - second.b);
    const maxDiff = 255 * 3;

    return 1 - (totalDiff / maxDiff);
  }

  /**
   * Parses a colour string into 8-bit RGB.
   *
   * Accepts `rgb(r, g, b)`, `rgba(r, g, b, a)` (alpha ignored), `#rrggbb` and
   * `#rgb`. Channel values above 255 are clamped.
   *
   * @returns `null` for any other input.
   */
  private parseColor(color: string): {r: number, g: number, b: number} | null {
    const value = color.trim();

    const rgbMatch = UIElementDetector.RGB_COLOR_PATTERN.exec(value);
    if (rgbMatch) {
      const channel = (digits: string): number => Math.min(255, parseInt(digits, 10));
      return {
        r: channel(rgbMatch[1]),
        g: channel(rgbMatch[2]),
        b: channel(rgbMatch[3])
      };
    }

    const hexMatch = UIElementDetector.HEX_COLOR_PATTERN.exec(value);
    if (hexMatch) {
      let hex = hexMatch[1];
      if (hex.length === 3) {
        // Expand shorthand: "abc" -> "aabbcc"
        hex = hex[0] + hex[0] + hex[1] + hex[1] + hex[2] + hex[2];
      }
      return {
        r: parseInt(hex.slice(0, 2), 16),
        g: parseInt(hex.slice(2, 4), 16),
        b: parseInt(hex.slice(4, 6), 16)
      };
    }

    return null;
  }

  /**
   * Placeholder for colour-specific detection; currently returns no elements.
   */
  private async detectColorBasedElements(): Promise<UIElement[]> {
    return [];
  }

  /**
   * Placeholder for shape-based detection; currently returns no elements.
   */
  private async detectShapeBasedElements(): Promise<UIElement[]> {
    return [];
  }

  /**
   * Groups spatially close elements and annotates them in place with form,
   * dialog and menu context.
   */
  private async enhanceWithContextAnalysis(elements: UIElement[]): Promise<void> {
    for (const group of this.groupNearbyElements(elements)) {
      this.analyzeElementGroup(group);
    }
  }

  /**
   * Greedy grouping: each not-yet-grouped element seeds a group and absorbs
   * every remaining element that is near the seed (not near other members).
   */
  private groupNearbyElements(elements: UIElement[]): UIElement[][] {
    const groups: UIElement[][] = [];
    const grouped = new Set<string>();

    for (const seed of elements) {
      if (grouped.has(seed.id)) {
        continue;
      }

      const group = [seed];
      grouped.add(seed.id);

      for (const candidate of elements) {
        if (grouped.has(candidate.id)) {
          continue;
        }
        if (this.areElementsNearby(seed, candidate)) {
          group.push(candidate);
          grouped.add(candidate.id);
        }
      }

      groups.push(group);
    }

    return groups;
  }

  /**
   * True when the top-left corners of `a` and `b` are less than 100 apart.
   */
  private areElementsNearby(a: UIElement, b: UIElement): boolean {
    const distance = Math.sqrt(
      Math.pow(a.x - b.x, 2) + Math.pow(a.y - b.y, 2)
    );
    return distance < 100; // Within 100 pixels
  }

  /**
   * Applies the form, dialog and menu pattern checks to a group of two or more elements.
   */
  private analyzeElementGroup(group: UIElement[]): void {
    if (group.length < 2) {
      return;
    }

    this.detectFormPatterns(group);
    this.detectDialogPatterns(group);
    this.detectMenuPatterns(group);
  }

  /**
   * For each text field, uses the first plain-text element to its left whose
   * `y` is within 30 of the field's as the field's label.
   */
  private detectFormPatterns(group: UIElement[]): void {
    for (const field of group) {
      if (field.type !== 'text_field') {
        continue;
      }

      const label = group.find(other =>
        other.type === 'other' &&
        Math.abs(other.y - field.y) < 30 &&
        other.x < field.x
      );

      if (label) {
        field.attributes.label = label.text;
        field.description = `Text field for "${label.text}"`;
      }
    }
  }

  /**
   * When a group contains dialog text and at least two buttons, flags the
   * buttons as dialog buttons and marks destructive ones.
   */
  private detectDialogPatterns(group: UIElement[]): void {
    const buttons = group.filter(e => e.type === 'button');
    const hasDialog = group.some(e => e.type === 'dialog');

    if (buttons.length < 2 || !hasDialog) {
      return;
    }

    for (const button of buttons) {
      button.attributes.isDialogButton = true;
      if (this.isDestructiveButtonText(button.text)) {
        button.attributes.style = 'destructive';
      }
    }
  }

  /**
   * Flags menu elements as a menu group when more than three appear together.
   */
  private detectMenuPatterns(group: UIElement[]): void {
    const menus = group.filter(e => e.type === 'menu');
    if (menus.length <= 3) {
      return;
    }
    for (const menu of menus) {
      menu.attributes.isMenuGroup = true;
    }
  }

  /**
   * De-duplicates (if enabled), filters by minimum confidence, keeps the
   * `maxElements` most confident elements and merges co-located text/visual
   * detections.
   */
  private postProcessElements(elements: UIElement[]): UIElement[] {
    const deduplicated = this.config.skipDuplicates
      ? this.removeDuplicateElements(elements)
      : elements;

    // `filter` returns a new array, so the in-place sort never touches the input.
    let kept = deduplicated.filter(e => e.confidence >= this.config.minConfidence);
    kept.sort((a, b) => b.confidence - a.confidence);

    if (kept.length > this.config.maxElements) {
      kept = kept.slice(0, this.config.maxElements);
    }

    return this.mergeTextInformation(kept);
  }

  /**
   * Keeps, per type and 10-unit position bucket, the element with the highest
   * confidence (the earlier element wins ties).
   */
  private removeDuplicateElements(elements: UIElement[]): UIElement[] {
    const bestByBucket = new Map<string, UIElement>();

    for (const element of elements) {
      const bucket = `${element.type}_${Math.round(element.x / 10)}_${Math.round(element.y / 10)}`;
      const current = bestByBucket.get(bucket);
      if (!current || current.confidence < element.confidence) {
        bestByBucket.set(bucket, element);
      }
    }

    return Array.from(bestByBucket.values());
  }

  /**
   * Returns the element with the highest confidence; on ties the later element wins.
   */
  private pickMostConfident(candidates: UIElement[]): UIElement {
    return candidates.reduce((prev, curr) =>
      prev.confidence > curr.confidence ? prev : curr
    );
  }

  /**
   * Collapses elements that share a 20-unit position bucket into one.
   *
   * If the bucket holds both a text-bearing and a visually detected element,
   * the best of each are merged (visual geometry, OCR text, averaged
   * confidence). Otherwise only the most confident element is kept.
   */
  private mergeTextInformation(elements: UIElement[]): UIElement[] {
    const buckets = new Map<string, UIElement[]>();
    for (const element of elements) {
      const key = `${Math.round(element.x / 20)}_${Math.round(element.y / 20)}`;
      const bucket = buckets.get(key);
      if (bucket) {
        bucket.push(element);
      } else {
        buckets.set(key, [element]);
      }
    }

    const merged: UIElement[] = [];
    for (const bucket of buckets.values()) {
      if (bucket.length === 1) {
        merged.push(bucket[0]);
        continue;
      }

      const withText = bucket.filter(e => e.text && e.text.trim());
      const visual = bucket.filter(e => e.detectionMethod.includes('visual_analysis'));

      if (withText.length === 0 || visual.length === 0) {
        merged.push(this.pickMostConfident(bucket));
        continue;
      }

      const bestText = this.pickMostConfident(withText);
      const bestVisual = this.pickMostConfident(visual);

      merged.push({
        ...bestVisual,
        text: bestText.text,
        confidence: (bestText.confidence + bestVisual.confidence) / 2,
        detectionMethod: [...new Set([...bestText.detectionMethod, ...bestVisual.detectionMethod])],
        description: `${bestVisual.description} with text "${bestText.text}"`
      });
    }

    return merged;
  }

  /**
   * Scores how likely `element` is to be interactive.
   *
   * The score adds fixed weights for visual traits, text wording and detection
   * method, subtracts penalties for extreme sizes, and is clamped to 0-1. The
   * element counts as interactive when the score is above 0.5.
   *
   * @returns The verdict, the clamped score and the reasons that contributed.
   */
  async verifyElementInteractivity(element: UIElement): Promise<{
    isInteractive: boolean;
    confidence: number;
    reasons: string[];
  }> {
    const reasons: string[] = [];
    let interactivityScore = 0;

    const features = element.visualFeatures;
    if (features) {
      // Buttons usually have distinct visual features
      if (element.type === 'button') {
        if (features.borderRadius && features.borderRadius > 0) {
          interactivityScore += 0.3;
          reasons.push('Has rounded corners typical of buttons');
        }
        if (this.isButtonColor(features.backgroundColor)) {
          interactivityScore += 0.4;
          reasons.push('Has button-like background color');
        }
        if (element.width >= 44 && element.height >= 30) {
          interactivityScore += 0.2;
          reasons.push('Meets minimum touch target size guidelines');
        }
      }

      // Text fields usually have specific characteristics
      if (element.type === 'text_field') {
        if (this.isTextFieldColor(features.backgroundColor)) {
          interactivityScore += 0.4;
          reasons.push('Has text field background color');
        }
        if (element.width > element.height * 2) {
          interactivityScore += 0.3;
          reasons.push('Has text field aspect ratio');
        }
      }
    }

    // Check text content for interactive indicators
    if (element.text) {
      if (this.isButtonText(element.text)) {
        interactivityScore += 0.3;
        reasons.push('Text suggests button functionality');
      }
      if (this.isLinkText(element.text)) {
        interactivityScore += 0.4;
        reasons.push('Text suggests link functionality');
      }
    }

    // Check position and size constraints
    if (element.width < 10 || element.height < 10) {
      interactivityScore -= 0.3;
      reasons.push('Too small to be interactive');
    }
    if (element.width > 1000 || element.height > 500) {
      interactivityScore -= 0.2;
      reasons.push('Unusually large for interactive element');
    }

    // Detection method reliability
    if (element.detectionMethod.includes('visual_analysis')) {
      interactivityScore += 0.2;
      reasons.push('Detected through visual analysis');
    }
    if (element.detectionMethod.includes('text_pattern')) {
      interactivityScore += 0.1;
      reasons.push('Detected through text pattern matching');
    }

    // Clamp score between 0 and 1
    const confidence = Math.max(0, Math.min(1, interactivityScore));

    return {
      isInteractive: confidence > 0.5,
      confidence,
      reasons
    };
  }

  /**
   * Re-scores every element with `verifyElementInteractivity`, records the
   * result in `attributes`, and drops elements whose score is 0.3 or lower.
   *
   * Input elements are not mutated; updated copies are returned.
   *
   * NOTE(review): OCR-only elements that match neither button nor link wording
   * score 0.1 and are therefore removed here, including 'dialog' and 'other'
   * text — confirm that filtering out non-interactive elements is intended.
   */
  async validateDetectedElements(elements: UIElement[]): Promise<UIElement[]> {
    const validated: UIElement[] = [];

    for (const element of elements) {
      const verification = await this.verifyElementInteractivity(element);

      // Only include elements that pass minimum verification threshold
      if (!(verification.confidence > 0.3)) {
        continue;
      }

      validated.push({
        ...element,
        interactive: verification.isInteractive,
        confidence: (element.confidence + verification.confidence) / 2,
        attributes: {
          ...element.attributes,
          verificationScore: verification.confidence,
          verificationReasons: verification.reasons,
          isVerified: true
        }
      });
    }

    return validated;
  }

  /**
   * Arranges elements into a one-level forest based on bounding-box containment.
   *
   * Elements are visited from largest to smallest area. An element fully
   * contained in an already-placed root becomes a child of the first such
   * root; otherwise it becomes a root itself. Children are attached to roots
   * only, never to other children.
   *
   * The input elements are not modified: roots and children are shallow
   * copies, so calling this repeatedly on the same input gives the same result.
   *
   * @returns The root elements, each with its `children` (if any).
   */
  createElementHierarchy(elements: UIElement[]): UIElement[] {
    // Sort by area (largest first) to establish parent-child relationships
    const byAreaDescending = [...elements].sort((a, b) =>
      (b.width * b.height) - (a.width * a.height)
    );

    const roots: UIElement[] = [];

    for (const element of byAreaDescending) {
      const container = roots.find(root => this.isElementContainedIn(element, root));

      if (container) {
        if (!container.children) {
          container.children = [];
        }
        container.children.push({
          ...element,
          parent: container.id
        });
        continue;
      }

      // Copy the root (and any children array it already carries) so that
      // attaching children never writes into caller-owned objects.
      const root: UIElement = { ...element };
      if (element.children) {
        root.children = [...element.children];
      }
      roots.push(root);
    }

    return roots;
  }

  /**
   * True when `child`'s bounding box lies entirely inside `parent`'s and the
   * two are different elements.
   */
  private isElementContainedIn(child: UIElement, parent: UIElement): boolean {
    if (child.id === parent.id) {
      return false;
    }
    const startsInside = child.x >= parent.x && child.y >= parent.y;
    const endsInside =
      child.x + child.width <= parent.x + parent.width &&
      child.y + child.height <= parent.y + parent.height;
    return startsInside && endsInside;
  }

  /**
   * Builds an ID from the prefix, position, current timestamp and a random
   * base-36 suffix of up to nine characters.
   */
  private generateElementId(prefix: string, x: number, y: number): string {
    const randomSuffix = Math.random().toString(36).slice(2, 11);
    return `${prefix}_${x}_${y}_${Date.now()}_${randomSuffix}`;
  }

  // Text classification helper methods

  /** True when `text` matches any button text pattern. */
  private isButtonText(text: string): boolean {
    return MACOS_UI_PATTERNS.buttons.textPatterns.some(pattern => pattern.test(text));
  }

  /** True when `text` looks like a URL or common link wording. */
  private isLinkText(text: string): boolean {
    return /^https?:\/\/|www\.|\.com|\.org|\.edu|click here|learn more|read more/i.test(text);
  }

  /** True when `text` matches any text-field indicator pattern. */
  private isTextFieldIndicator(text: string): boolean {
    return MACOS_UI_PATTERNS.textFields.textPatterns.some(pattern => pattern.test(text));
  }

  /** True when `text` matches any menu text pattern. */
  private isMenuText(text: string): boolean {
    return MACOS_UI_PATTERNS.menus.textPatterns.some(pattern => pattern.test(text));
  }

  /** True when `text` matches any dialog text pattern. */
  private isDialogText(text: string): boolean {
    return MACOS_UI_PATTERNS.dialogs.textPatterns.some(pattern => pattern.test(text));
  }

  /**
   * True when `text` contains a destructive/negative action word as a whole
   * word, so labels such as "Know" or "Snooze" do not match on "no".
   */
  private isDestructiveButtonText(text: string): boolean {
    return UIElementDetector.DESTRUCTIVE_TEXT_PATTERN.test(text);
  }
}
// Shared module-level detector instance, constructed with no overrides so it
// uses the default detection configuration.
// NOTE(review): the instance caches the decoded canvas/pixel data between
// stages, so concurrent detectUIElements calls on this shared object would
// interleave on that state — confirm callers await one run before starting another.
export const uiElementDetector = new UIElementDetector();