Skip to main content
Glama
service.py (21.8 kB)
import logging
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep
from typing import TYPE_CHECKING

from PIL import Image, ImageDraw, ImageFont
from uiautomation import (
    Control,
    GetRootControl,
    ImageControl,
    PatternId,
    Rect,
    ScrollPattern,
    WindowControl,
)

from src.tree.config import (
    DEFAULT_ACTIONS,
    DOCUMENT_CONTROL_TYPE_NAMES,
    INFORMATIVE_CONTROL_TYPE_NAMES,
    INTERACTIVE_CONTROL_TYPE_NAMES,
    THREAD_MAX_RETRIES,
)
from src.tree.views import TreeElementNode, ScrollElementNode, Center, BoundingBox, TreeState
from src.tree.utils import random_point_within_bounding_box
from src.desktop.views import App

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('[%(levelname)s] %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

if TYPE_CHECKING:
    from src.desktop.service import Desktop

# Control types that are reported as keyboard focusable without asking the element.
_ALWAYS_FOCUSABLE_TYPES = frozenset({
    'EditControl', 'ButtonControl', 'CheckBoxControl', 'RadioButtonControl', 'TabItemControl',
})
# Item control types that, inside a browser, only count when they can take keyboard focus.
_BROWSER_ITEM_TYPES = frozenset({'DataItemControl', 'ListItemControl'})
# Off-screen nodes of these control types / window classes are still traversed.
_OFFSCREEN_EXEMPT_TYPES = frozenset({"GroupControl", "EditControl", "TitleBarControl"})
_OFFSCREEN_EXEMPT_CLASSES = frozenset({"Popup", "Windows.UI.Core.CoreComponentInputSource"})
# Friendly application names for top-level windows identified by their class name.
_APP_NAME_BY_CLASS = {
    "Progman": "Desktop",
    'Shell_TrayWnd': "Taskbar",
    'Shell_SecondaryTrayWnd': "Taskbar",
    'Microsoft.UI.Content.PopupWindowSiteBridge': "Context Menu",
}


def _rect_to_bounding_box(rect: Rect) -> BoundingBox:
    """Copy a uiautomation ``Rect`` into a ``BoundingBox`` without clipping."""
    return BoundingBox(
        left=rect.left,
        top=rect.top,
        right=rect.right,
        bottom=rect.bottom,
        width=rect.width(),
        height=rect.height(),
    )


class Tree:
    """Collects interactive and scrollable UI Automation elements of open windows."""

    def __init__(self, desktop: 'Desktop'):
        """Store the desktop service and build a bounding box covering the screen.

        Args:
            desktop: Service used for the screen size, screenshots and browser detection.
        """
        self.desktop = desktop
        screen_size = self.desktop.get_screen_size()
        # Last browser DOM rectangle seen by get_nodes. Kept for backward
        # compatibility only; get_nodes clips against its own per-call copy.
        self.dom_bounding_box: BoundingBox | None = None
        self.screen_box = BoundingBox(
            top=0,
            left=0,
            bottom=screen_size.height,
            right=screen_size.width,
            width=screen_size.width,
            height=screen_size.height,
        )

    def get_state(self, active_app: App, other_apps: list[App]) -> TreeState:
        """Build the tree state for every top-level window not listed in ``other_apps``.

        Args:
            active_app: The foreground app; when truthy, the ``Progman`` window is skipped.
            other_apps: Apps whose window handles are excluded from the scan.

        Returns:
            A ``TreeState`` holding the collected interactive and scrollable nodes.
        """
        root = GetRootControl()
        excluded_handles = {other_app.handle for other_app in other_apps}
        apps = [
            window for window in root.GetChildren()
            if window.NativeWindowHandle not in excluded_handles
        ]
        if active_app:
            apps = [window for window in apps if window.ClassName != 'Progman']
        interactive_nodes, scrollable_nodes = self.get_appwise_nodes(apps=apps)
        return TreeState(interactive_nodes=interactive_nodes, scrollable_nodes=scrollable_nodes)

    def get_appwise_nodes(self, apps: list[Control]) -> tuple[list[TreeElementNode], list[ScrollElementNode]]:
        """Scan every window in a thread pool, retrying windows whose scan raised.

        A window is re-submitted until it has failed ``THREAD_MAX_RETRIES`` times;
        after that an error is logged and the window contributes no nodes.

        Args:
            apps: Top-level window controls to scan.

        Returns:
            ``(interactive_nodes, scrollable_nodes)`` merged over all windows, in
            completion order.
        """
        interactive_nodes, scrollable_nodes = [], []
        with ThreadPoolExecutor() as executor:

            def submit(app: Control):
                return executor.submit(self.get_nodes, app, self.desktop.is_app_browser(app))

            retry_counts = {app: 0 for app in apps}
            future_to_app = {submit(app): app for app in apps}
            while future_to_app:  # keep running until no pending futures
                # Snapshot the keys: retries add new futures while we iterate.
                for future in as_completed(list(future_to_app)):
                    app = future_to_app.pop(future)  # remove completed future
                    try:
                        result = future.result()
                        if result:
                            element_nodes, scroll_nodes = result
                            interactive_nodes.extend(element_nodes)
                            scrollable_nodes.extend(scroll_nodes)
                    except Exception as e:
                        retry_counts[app] += 1
                        logger.debug(f"Error in processing node {app.Name}, retry attempt {retry_counts[app]}\nError: {e}")
                        if retry_counts[app] < THREAD_MAX_RETRIES:
                            future_to_app[submit(app)] = app
                        else:
                            logger.error(f"Task failed completely for {app.Name} after {THREAD_MAX_RETRIES} retries")
        return interactive_nodes, scrollable_nodes

    def iou_bounding_box(self, window_box: Rect, element_box: Rect,) -> BoundingBox:
        """Return the part of ``element_box`` visible inside ``window_box`` and the screen.

        Despite the name this is a plain intersection, not an intersection-over-union.

        Args:
            window_box: Rectangle of the containing window (or DOM area); only its
                ``left``/``top``/``right``/``bottom`` attributes are read.
            element_box: Rectangle of the element; same attributes are read.

        Returns:
            The clipped ``BoundingBox``, or an all-zero box when nothing is visible.
        """
        left = max(window_box.left, element_box.left, self.screen_box.left)
        top = max(window_box.top, element_box.top, self.screen_box.top)
        right = min(window_box.right, element_box.right, self.screen_box.right)
        bottom = min(window_box.bottom, element_box.bottom, self.screen_box.bottom)

        if right > left and bottom > top:
            return BoundingBox(
                left=left,
                top=top,
                right=right,
                bottom=bottom,
                width=right - left,
                height=bottom - top,
            )
        # No valid visible intersection (either outside window or screen)
        return BoundingBox(left=0, top=0, right=0, bottom=0, width=0, height=0)

    def get_nodes(self, node: Control, is_browser: bool = False) -> tuple[list[TreeElementNode], list[ScrollElementNode]]:
        """Walk one window's control tree and collect its interactive and scrollable nodes.

        This method is executed concurrently for several windows, so all traversal
        state (node lists and the DOM rectangle) lives in locals of this call.

        Args:
            node: Top-level window control to traverse.
            is_browser: Whether the window belongs to a browser; enables the DOM
                specific handling below.

        Returns:
            ``(interactive_nodes, scrollable_nodes)`` with DOM interactive nodes
            appended after the regular interactive nodes.
        """
        window_bounding_box = node.BoundingRectangle
        # Rectangle of the browser DOM area for THIS traversal. Using the shared
        # self.dom_bounding_box here would let parallel browser scans clip
        # against each other's rectangle.
        dom_bounding_box: BoundingBox | None = None
        interactive_nodes, dom_interactive_nodes, scrollable_nodes = [], [], []
        app_name = _APP_NAME_BY_CLASS.get(node.ClassName, node.Name.strip())

        # Loop-invariant unions, computed once instead of once per visited node.
        non_scroll_types = INTERACTIVE_CONTROL_TYPE_NAMES | INFORMATIVE_CONTROL_TYPE_NAMES
        actionable_types = INTERACTIVE_CONTROL_TYPE_NAMES | DOCUMENT_CONTROL_TYPE_NAMES

        def is_element_visible(node: Control, threshold: int = 0):
            """Truthy when the element has area above ``threshold`` and is on screen."""
            is_control = node.IsControlElement
            box = node.BoundingRectangle
            if box.isempty():
                return False
            area = box.width() * box.height()
            # Edit controls are accepted even when reported off-screen.
            is_onscreen = (not node.IsOffscreen) or node.ControlTypeName in ['EditControl']
            return area > threshold and is_onscreen and is_control

        def is_element_enabled(node: Control):
            """Return ``node.IsEnabled``, or False when the property cannot be read."""
            try:
                return node.IsEnabled
            except Exception:
                return False

        def is_default_action(node: Control):
            """True when the legacy default action (title-cased) is in DEFAULT_ACTIONS."""
            legacy_pattern = node.GetLegacyIAccessiblePattern()
            return legacy_pattern.DefaultAction.title() in DEFAULT_ACTIONS

        def is_element_image(node: Control):
            """True for image controls that are graphics or cannot take keyboard focus."""
            if not isinstance(node, ImageControl):
                return False
            return node.LocalizedControlType == 'graphic' or not node.IsKeyboardFocusable

        def is_element_text(node: Control):
            """True for visible, enabled, non-image informative controls."""
            try:
                if node.ControlTypeName in INFORMATIVE_CONTROL_TYPE_NAMES:
                    if is_element_visible(node) and is_element_enabled(node) and not is_element_image(node):
                        return True
            except Exception:
                return False
            return False

        def is_window_modal(node: WindowControl):
            """Return the window pattern's ``IsModal``, or False on any error."""
            try:
                return node.GetWindowPattern().IsModal
            except Exception:
                return False

        def is_keyboard_focusable(node: Control):
            """True for always-focusable control types, else ``node.IsKeyboardFocusable``."""
            try:
                if node.ControlTypeName in _ALWAYS_FOCUSABLE_TYPES:
                    return True
                return node.IsKeyboardFocusable
            except Exception:
                return False

        def element_has_child_element(node: Control, control_type: str, child_control_type: str):
            """True when ``node`` is ``control_type`` and its first child is ``child_control_type``."""
            if node.LocalizedControlType != control_type:
                return False
            first_child = node.GetFirstChildControl()
            if first_child is None:
                return False
            return first_child.LocalizedControlType == child_control_type

        def group_has_no_name(node: Control):
            """True for a GroupControl whose name is empty or whitespace."""
            try:
                return node.ControlTypeName == 'GroupControl' and not node.Name.strip()
            except Exception:
                return False

        def is_element_scrollable(node: Control):
            """True when the node exposes a vertically scrollable ScrollPattern."""
            try:
                if (node.ControlTypeName in non_scroll_types) or node.IsOffscreen:
                    return False
                scroll_pattern: ScrollPattern = node.GetPattern(PatternId.ScrollPattern)
                if scroll_pattern is None:
                    return False
                return scroll_pattern.VerticallyScrollable
            except Exception:
                return False

        def is_element_interactive(node: Control):
            """Decide whether the node should be reported as an interactive element."""
            try:
                control_type_name = node.ControlTypeName
                if is_browser and control_type_name in _BROWSER_ITEM_TYPES and not is_keyboard_focusable(node):
                    return False
                elif not is_browser and control_type_name == "ImageControl" and is_keyboard_focusable(node):
                    return True
                elif control_type_name in actionable_types:
                    return is_element_visible(node) and is_element_enabled(node) and (not is_element_image(node) or is_keyboard_focusable(node))
                elif control_type_name == 'GroupControl':
                    # Groups are only considered inside browsers.
                    if is_browser:
                        return is_element_visible(node) and is_element_enabled(node) and (is_default_action(node) or is_keyboard_focusable(node))
            except Exception:
                return False
            return False

        def dom_correction(node: Control):
            """Fix up the DOM node that was just appended for ``node``.

            Depending on the element shape the last entry of
            ``dom_interactive_nodes`` is dropped, or replaced by a node built
            from a descendant.
            """
            if element_has_child_element(node, 'list item', 'link') or element_has_child_element(node, 'item', 'link'):
                # The wrapped link is collected on its own when it is visited.
                dom_interactive_nodes.pop()
                return None
            elif node.ControlTypeName == 'GroupControl':
                dom_interactive_nodes.pop()
                if not is_keyboard_focusable(node):
                    return None
                # Follow the first-child chain down to the leaf; give up when an
                # interactive control is met on the way.
                child = node
                try:
                    while (first_child := child.GetFirstChildControl()) is not None:
                        if child.ControlTypeName in INTERACTIVE_CONTROL_TYPE_NAMES:
                            return None
                        child = first_child
                except Exception:
                    return None
                if child.ControlTypeName != 'TextControl':
                    return None
                # Normalize the value exactly like tree_traversal does, so a
                # missing value becomes "" instead of None.
                raw_value = node.GetLegacyIAccessiblePattern().Value
                value = raw_value.strip() if raw_value is not None else ""
                bounding_box = self.iou_bounding_box(dom_bounding_box, node.BoundingRectangle)
                dom_interactive_nodes.append(TreeElementNode(
                    name=child.Name.strip(),
                    control_type=node.LocalizedControlType,
                    value=value,
                    shortcut=node.AcceleratorKey,
                    bounding_box=bounding_box,
                    xpath='',
                    center=bounding_box.get_center(),
                    app_name=app_name,
                    is_focused=node.HasKeyboardFocus,
                ))
            elif element_has_child_element(node, 'link', 'heading'):
                # Report the heading child in place of the link, labelled as a link.
                dom_interactive_nodes.pop()
                heading = node.GetFirstChildControl()
                heading_name = heading.Name.strip()
                bounding_box = self.iou_bounding_box(dom_bounding_box, heading.BoundingRectangle)
                dom_interactive_nodes.append(TreeElementNode(
                    name=heading_name,
                    control_type='link',
                    value=heading_name,
                    shortcut=heading.AcceleratorKey,
                    bounding_box=bounding_box,
                    xpath='',
                    center=bounding_box.get_center(),
                    app_name=app_name,
                    is_focused=heading.HasKeyboardFocus,
                ))

        def tree_traversal(node: Control, is_dom: bool = False, is_dialog: bool = False):
            """Recursively collect nodes below ``node``.

            ``is_dom`` marks that we are inside the browser DOM subtree;
            ``is_dialog`` is propagated to children once a window child is entered.
            """
            nonlocal dom_bounding_box

            # Checks to skip the nodes that are not interactive
            if (
                node.IsOffscreen
                and node.ControlTypeName not in _OFFSCREEN_EXEMPT_TYPES
                and node.ClassName not in _OFFSCREEN_EXEMPT_CLASSES
            ):
                return None

            if is_element_scrollable(node):
                scroll_pattern: ScrollPattern = node.GetPattern(PatternId.ScrollPattern)
                box = node.BoundingRectangle
                # The "center" of a scroll area is a point picked by the helper.
                x, y = random_point_within_bounding_box(node=node, scale_factor=0.8)
                horizontally_scrollable = scroll_pattern.HorizontallyScrollable
                vertically_scrollable = scroll_pattern.VerticallyScrollable
                scrollable_nodes.append(ScrollElementNode(
                    name=node.Name.strip() or node.AutomationId or node.LocalizedControlType.capitalize() or "''",
                    app_name=app_name,
                    control_type=node.LocalizedControlType.title(),
                    bounding_box=_rect_to_bounding_box(box),
                    center=Center(x=x, y=y),
                    xpath='',
                    horizontal_scrollable=horizontally_scrollable,
                    horizontal_scroll_percent=scroll_pattern.HorizontalScrollPercent if horizontally_scrollable else 0,
                    vertical_scrollable=vertically_scrollable,
                    vertical_scroll_percent=scroll_pattern.VerticalScrollPercent if vertically_scrollable else 0,
                    is_focused=node.HasKeyboardFocus,
                ))

            if is_element_interactive(node):
                raw_value = node.GetLegacyIAccessiblePattern().Value
                value = raw_value.strip() if raw_value is not None else ""
                in_dom = is_browser and is_dom
                # DOM elements are clipped to the DOM area, others to the window.
                clip_box = dom_bounding_box if in_dom else window_bounding_box
                bounding_box = self.iou_bounding_box(clip_box, node.BoundingRectangle)
                tree_node = TreeElementNode(
                    name=node.Name.strip(),
                    control_type=node.LocalizedControlType.title(),
                    value=value,
                    shortcut=node.AcceleratorKey,
                    bounding_box=bounding_box,
                    center=bounding_box.get_center(),
                    xpath='',
                    app_name=app_name,
                    is_focused=node.HasKeyboardFocus,
                )
                if in_dom:
                    dom_interactive_nodes.append(tree_node)
                    dom_correction(node=node)
                else:
                    interactive_nodes.append(tree_node)

            children = node.GetChildren()
            # Traverse right to left for normal apps and left to right inside the DOM.
            for child in (children if is_dom else children[::-1]):
                if is_browser and child.ClassName == "Chrome_RenderWidgetHostHWND":
                    # This child is treated as the root of the browser DOM subtree.
                    dom_bounding_box = _rect_to_bounding_box(child.BoundingRectangle)
                    self.dom_bounding_box = dom_bounding_box  # backward compatibility
                    tree_traversal(child, is_dom=True, is_dialog=is_dialog)
                elif isinstance(child, WindowControl):
                    # A nested window is handled as a dialog.
                    if not child.IsOffscreen:
                        if is_dom:
                            if child.BoundingRectangle.width() > 0.8 * dom_bounding_box.width:
                                # Because this window element covers the majority of the screen
                                dom_interactive_nodes.clear()
                        elif is_window_modal(child):
                            # Because this window element is modal
                            interactive_nodes.clear()
                    # enter dialog subtree
                    tree_traversal(child, is_dom=is_dom, is_dialog=True)
                else:
                    # normal non-dialog children
                    tree_traversal(child, is_dom=is_dom, is_dialog=is_dialog)

        tree_traversal(node, is_dom=False, is_dialog=False)

        logger.debug(f'Interactive nodes:{len(interactive_nodes)}')
        logger.debug(f'DOM interactive nodes:{len(dom_interactive_nodes)}')
        logger.debug(f'Scrollable nodes:{len(scrollable_nodes)}')

        interactive_nodes.extend(dom_interactive_nodes)
        return (interactive_nodes, scrollable_nodes)

    def get_annotated_screenshot(self, nodes: list[TreeElementNode], scale: float = 1.0) -> Image.Image:
        """Take a screenshot and draw a numbered, colored box around every node.

        Args:
            nodes: Elements to annotate; each is labelled with its index in the list.
            scale: Factor applied to the screenshot size and to the box coordinates.

        Returns:
            The scaled screenshot pasted on a white canvas, with the annotations drawn.
        """
        screenshot = self.desktop.get_screenshot()
        sleep(0.10)
        scaled_size = (int(screenshot.width * scale), int(screenshot.height * scale))
        screenshot = screenshot.resize(scaled_size, Image.Resampling.LANCZOS)

        # Add padding
        padding = 5
        width = int(screenshot.width + (1.5 * padding))
        height = int(screenshot.height + (1.5 * padding))
        padded_screenshot = Image.new("RGB", (width, height), color=(255, 255, 255))
        padded_screenshot.paste(screenshot, (padding, padding))

        draw = ImageDraw.Draw(padded_screenshot)
        font_size = 12
        try:
            font = ImageFont.truetype('arial.ttf', font_size)
        except IOError:
            font = ImageFont.load_default()

        def get_random_color():
            return "#{:06x}".format(random.randint(0, 0xFFFFFF))

        def draw_annotation(label, node: TreeElementNode):
            box = node.bounding_box
            color = get_random_color()

            # Scale and pad the bounding box coordinates
            adjusted_box = (
                int(box.left * scale) + padding,
                int(box.top * scale) + padding,
                int(box.right * scale) + padding,
                int(box.bottom * scale) + padding,
            )
            # Draw bounding box
            draw.rectangle(adjusted_box, outline=color, width=2)

            # Label dimensions
            label_text = str(label)
            label_width = draw.textlength(label_text, font=font)
            label_height = font_size
            _, top, right, _ = adjusted_box

            # Label sits above the box, right-aligned with it
            label_x1 = right - label_width
            label_y1 = top - label_height - 4
            label_x2 = label_x1 + label_width
            label_y2 = label_y1 + label_height + 4

            # Draw label background and text
            draw.rectangle([(label_x1, label_y1), (label_x2, label_y2)], fill=color)
            draw.text((label_x1 + 2, label_y1 + 2), label_text, fill=(255, 255, 255), font=font)

        # Draw sequentially: every annotation mutates the same ImageDraw, and an
        # unconsumed executor.map() would hide any drawing error. A failing
        # node is logged and skipped so the remaining nodes are still drawn.
        for label, node in enumerate(nodes):
            try:
                draw_annotation(label, node)
            except Exception as exc:
                logger.warning("Could not annotate node %s: %s", label, exc)
        return padded_screenshot

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/CursorTouch/windows-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.