from windows_mcp.tree.config import INTERACTIVE_CONTROL_TYPE_NAMES,DOCUMENT_CONTROL_TYPE_NAMES,INFORMATIVE_CONTROL_TYPE_NAMES, DEFAULT_ACTIONS, THREAD_MAX_RETRIES
from windows_mcp.tree.views import TreeElementNode, ScrollElementNode, TextElementNode, Center, BoundingBox, TreeState, DOMInfo
from windows_mcp.uia import Control,ImageControl,ScrollPattern,WindowControl,Rect,GetRootControl,PatternId
from concurrent.futures import ThreadPoolExecutor, as_completed
from windows_mcp.tree.utils import random_point_within_bounding_box
from PIL import Image, ImageFont, ImageDraw
from typing import TYPE_CHECKING,Optional
from windows_mcp.desktop.views import App
from time import sleep,time
import logging
import random
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('[%(levelname)s] %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
if TYPE_CHECKING:
from windows_mcp.desktop.service import Desktop
class Tree:
def __init__(self,desktop:'Desktop'):
self.desktop=desktop
self.screen_size=self.desktop.get_screen_size()
self.dom_info:Optional[DOMInfo]=None
self.dom_bounding_box:BoundingBox=None
self.screen_box=BoundingBox(
top=0, left=0, bottom=self.screen_size.height, right=self.screen_size.width,
width=self.screen_size.width, height=self.screen_size.height
)
def get_state(self,active_app:App,other_apps:list[App],use_dom:bool=False)->TreeState:
root=GetRootControl()
other_apps_handle=set(map(lambda other_app: other_app.handle,other_apps))
apps=list(filter(lambda app:app.NativeWindowHandle not in other_apps_handle,root.GetChildren()))
del other_apps_handle
if active_app:
apps=list(filter(lambda app:app.ClassName!='Progman',apps))
interactive_nodes,scrollable_nodes,dom_informative_nodes=self.get_appwise_nodes(apps=apps,use_dom=use_dom)
return TreeState(dom_info=self.dom_info,interactive_nodes=interactive_nodes,scrollable_nodes=scrollable_nodes,dom_informative_nodes=dom_informative_nodes)
def get_appwise_nodes(self,apps:list[Control],use_dom:bool=False)-> tuple[list[TreeElementNode],list[ScrollElementNode],list[TextElementNode]]:
interactive_nodes, scrollable_nodes,dom_informative_nodes = [], [], []
with ThreadPoolExecutor() as executor:
retry_counts = {app: 0 for app in apps}
future_to_app = {
executor.submit(
self.get_nodes, app,
self.desktop.is_app_browser(app),
use_dom
): app
for app in apps
}
while future_to_app: # keep running until no pending futures
for future in as_completed(list(future_to_app)):
app = future_to_app.pop(future) # remove completed future
try:
result = future.result()
if result:
element_nodes, scroll_nodes,informative_nodes = result
interactive_nodes.extend(element_nodes)
scrollable_nodes.extend(scroll_nodes)
dom_informative_nodes.extend(informative_nodes)
except Exception as e:
retry_counts[app] += 1
logger.debug(f"Error in processing node {app.Name}, retry attempt {retry_counts[app]}\nError: {e}")
if retry_counts[app] < THREAD_MAX_RETRIES:
logger.debug(f"Retrying {app.Name} for the {retry_counts[app]}th time")
new_future = executor.submit(self.get_nodes, app, self.desktop.is_app_browser(app),use_dom)
future_to_app[new_future] = app
else:
logger.error(f"Task failed completely for {app.Name} after {THREAD_MAX_RETRIES} retries")
return interactive_nodes,scrollable_nodes,dom_informative_nodes
def iou_bounding_box(self,window_box: Rect,element_box: Rect,) -> BoundingBox:
# Step 1: Intersection of element and window (existing logic)
intersection_left = max(window_box.left, element_box.left)
intersection_top = max(window_box.top, element_box.top)
intersection_right = min(window_box.right, element_box.right)
intersection_bottom = min(window_box.bottom, element_box.bottom)
# Step 2: Clamp to screen boundaries (new addition)
intersection_left = max(self.screen_box.left, intersection_left)
intersection_top = max(self.screen_box.top, intersection_top)
intersection_right = min(self.screen_box.right, intersection_right)
intersection_bottom = min(self.screen_box.bottom, intersection_bottom)
# Step 3: Validate intersection
if (intersection_right > intersection_left and intersection_bottom > intersection_top):
bounding_box = BoundingBox(
left=intersection_left,
top=intersection_top,
right=intersection_right,
bottom=intersection_bottom,
width=intersection_right - intersection_left,
height=intersection_bottom - intersection_top
)
else:
# No valid visible intersection (either outside window or screen)
bounding_box = BoundingBox(
left=0,
top=0,
right=0,
bottom=0,
width=0,
height=0
)
return bounding_box
def get_nodes(self, node: Control, is_browser:bool=False,use_dom:bool=False) -> tuple[list[TreeElementNode],list[ScrollElementNode]]:
window_bounding_box=node.BoundingRectangle
def is_element_visible(node:Control,threshold:int=0):
is_control=node.IsControlElement
box=node.BoundingRectangle
if box.isempty():
return False
width=box.width()
height=box.height()
area=width*height
is_offscreen=(not node.IsOffscreen) or node.ControlTypeName in ['EditControl']
return area > threshold and is_offscreen and is_control
def is_element_enabled(node:Control):
try:
return node.IsEnabled
except Exception:
return False
def is_default_action(node:Control):
legacy_pattern=node.GetLegacyIAccessiblePattern()
default_action=legacy_pattern.DefaultAction.title()
if default_action in DEFAULT_ACTIONS:
return True
return False
def is_element_image(node:Control):
if isinstance(node,ImageControl):
if node.LocalizedControlType=='graphic' or not node.IsKeyboardFocusable:
return True
return False
def is_element_text(node:Control):
try:
if node.ControlTypeName in INFORMATIVE_CONTROL_TYPE_NAMES:
if is_element_visible(node) and is_element_enabled(node) and not is_element_image(node):
return True
except Exception:
return False
return False
def is_window_modal(node:WindowControl):
try:
window_pattern=node.GetWindowPattern()
return window_pattern.IsModal
except Exception:
return False
def is_keyboard_focusable(node:Control):
try:
if node.ControlTypeName in set(['EditControl','ButtonControl','CheckBoxControl','RadioButtonControl','TabItemControl']):
return True
return node.IsKeyboardFocusable
except Exception:
return False
def element_has_child_element(node:Control,control_type:str,child_control_type:str):
if node.LocalizedControlType==control_type:
first_child=node.GetFirstChildControl()
if first_child is None:
return False
return first_child.LocalizedControlType==child_control_type
def group_has_no_name(node:Control):
try:
if node.ControlTypeName=='GroupControl':
if not node.Name.strip():
return True
return False
except Exception:
return False
def is_element_scrollable(node:Control):
try:
if (node.ControlTypeName in INTERACTIVE_CONTROL_TYPE_NAMES|INFORMATIVE_CONTROL_TYPE_NAMES) or node.IsOffscreen:
return False
scroll_pattern:ScrollPattern=node.GetPattern(PatternId.ScrollPattern)
if scroll_pattern is None:
return False
return scroll_pattern.VerticallyScrollable
except Exception:
return False
def is_element_interactive(node:Control):
try:
if is_browser and node.ControlTypeName in set(['DataItemControl','ListItemControl']) and not is_keyboard_focusable(node):
return False
elif not is_browser and node.ControlTypeName=="ImageControl" and is_keyboard_focusable(node):
return True
elif node.ControlTypeName in INTERACTIVE_CONTROL_TYPE_NAMES|DOCUMENT_CONTROL_TYPE_NAMES:
return is_element_visible(node) and is_element_enabled(node) and (not is_element_image(node) or is_keyboard_focusable(node))
elif node.ControlTypeName=='GroupControl':
if is_browser:
return is_element_visible(node) and is_element_enabled(node) and (is_default_action(node) or is_keyboard_focusable(node))
# else:
# return is_element_visible and is_element_enabled(node) and is_default_action(node)
except Exception:
return False
return False
def dom_correction(node:Control):
if element_has_child_element(node,'list item','link') or element_has_child_element(node,'item','link'):
dom_interactive_nodes.pop()
return None
elif node.ControlTypeName=='GroupControl':
dom_interactive_nodes.pop()
if is_keyboard_focusable(node):
child=node
try:
while child.GetFirstChildControl() is not None:
if child.ControlTypeName in INTERACTIVE_CONTROL_TYPE_NAMES:
return None
child=child.GetFirstChildControl()
except Exception:
return None
if child.ControlTypeName!='TextControl':
return None
legacy_pattern=node.GetLegacyIAccessiblePattern()
value=legacy_pattern.Value
element_bounding_box = node.BoundingRectangle
bounding_box=self.iou_bounding_box(self.dom_bounding_box,element_bounding_box)
center = bounding_box.get_center()
is_focused=node.HasKeyboardFocus
dom_interactive_nodes.append(TreeElementNode(**{
'name':child.Name.strip(),
'control_type':node.LocalizedControlType,
'value':value,
'shortcut':node.AcceleratorKey,
'bounding_box':bounding_box,
'xpath':'',
'center':center,
'app_name':app_name,
'is_focused':is_focused
}))
elif element_has_child_element(node,'link','heading'):
dom_interactive_nodes.pop()
node=node.GetFirstChildControl()
control_type='link'
legacy_pattern=node.GetLegacyIAccessiblePattern()
value=legacy_pattern.Value
element_bounding_box = node.BoundingRectangle
bounding_box=self.iou_bounding_box(self.dom_bounding_box,element_bounding_box)
center = bounding_box.get_center()
is_focused=node.HasKeyboardFocus
dom_interactive_nodes.append(TreeElementNode(**{
'name':node.Name.strip(),
'control_type':control_type,
'value':node.Name.strip(),
'shortcut':node.AcceleratorKey,
'bounding_box':bounding_box,
'xpath':'',
'center':center,
'app_name':app_name,
'is_focused':is_focused
}))
def tree_traversal(node: Control,is_dom:bool=False,is_dialog:bool=False):
# Checks to skip the nodes that are not interactive
if node.IsOffscreen and (node.ControlTypeName not in set(["GroupControl","EditControl","TitleBarControl"])) and node.ClassName not in set(["Popup","Windows.UI.Core.CoreComponentInputSource"]):
return None
if is_element_scrollable(node):
scroll_pattern:ScrollPattern=node.GetPattern(PatternId.ScrollPattern)
box = node.BoundingRectangle
# Get the center
x,y=random_point_within_bounding_box(node=node,scale_factor=0.8)
center = Center(x=x,y=y)
scrollable_nodes.append(ScrollElementNode(**{
'name':node.Name.strip() or node.AutomationId or node.LocalizedControlType.capitalize() or "''",
'app_name':app_name,
'control_type':node.LocalizedControlType.title(),
'bounding_box':BoundingBox(**{
'left':box.left,
'top':box.top,
'right':box.right,
'bottom':box.bottom,
'width':box.width(),
'height':box.height()
}),
'center':center,
'xpath':'',
'horizontal_scrollable':scroll_pattern.HorizontallyScrollable,
'horizontal_scroll_percent':scroll_pattern.HorizontalScrollPercent if scroll_pattern.HorizontallyScrollable else 0,
'vertical_scrollable':scroll_pattern.VerticallyScrollable,
'vertical_scroll_percent':scroll_pattern.VerticalScrollPercent if scroll_pattern.VerticallyScrollable else 0,
'is_focused':node.HasKeyboardFocus
}))
if is_element_interactive(node):
legacy_pattern=node.GetLegacyIAccessiblePattern()
value=legacy_pattern.Value.strip() if legacy_pattern.Value is not None else ""
is_focused=node.HasKeyboardFocus
name=node.Name.strip()
element_bounding_box = node.BoundingRectangle
if is_browser and is_dom:
bounding_box=self.iou_bounding_box(self.dom_bounding_box,element_bounding_box)
center = bounding_box.get_center()
tree_node=TreeElementNode(**{
'name':name,
'control_type':node.LocalizedControlType.title(),
'value':value,
'shortcut':node.AcceleratorKey,
'bounding_box':bounding_box,
'center':center,
'xpath':'',
'app_name':app_name,
'is_focused':is_focused
})
dom_interactive_nodes.append(tree_node)
dom_correction(node=node)
else:
bounding_box=self.iou_bounding_box(window_bounding_box,element_bounding_box)
center = bounding_box.get_center()
tree_node=TreeElementNode(**{
'name':name,
'control_type':node.LocalizedControlType.title(),
'value':value,
'shortcut':node.AcceleratorKey,
'bounding_box':bounding_box,
'center':center,
'xpath':'',
'app_name':app_name,
'is_focused':is_focused
})
interactive_nodes.append(tree_node)
elif is_element_text(node):
dom_informative_nodes.append(TextElementNode(
text=node.Name.strip(),
))
children=node.GetChildren()
# Recursively traverse the tree the right to left for normal apps and for DOM traverse from left to right
for child in (children if is_dom else children[::-1]):
# Incrementally building the xpath
# Check if the child is a DOM element
if is_browser and child.AutomationId == "RootWebArea":
bounding_box=child.BoundingRectangle
self.dom_bounding_box=BoundingBox(left=bounding_box.left,top=bounding_box.top,
right=bounding_box.right,bottom=bounding_box.bottom,width=bounding_box.width(),
height=bounding_box.height())
scroll_pattern=child.GetPattern(PatternId.ScrollPattern)
self.dom_info=DOMInfo(
horizontal_scrollable=scroll_pattern.HorizontallyScrollable,
horizontal_scroll_percent=scroll_pattern.HorizontalScrollPercent if scroll_pattern.HorizontallyScrollable else 0,
vertical_scrollable=scroll_pattern.VerticallyScrollable,
vertical_scroll_percent=scroll_pattern.VerticalScrollPercent if scroll_pattern.VerticallyScrollable else 0
)
# enter DOM subtree
tree_traversal(child, is_dom=True, is_dialog=is_dialog)
# Check if the child is a dialog
elif isinstance(child,WindowControl):
if not child.IsOffscreen:
if is_dom:
bounding_box=child.BoundingRectangle
if bounding_box.width() > 0.8*self.dom_bounding_box.width:
# Because this window element covers the majority of the screen
dom_interactive_nodes.clear()
else:
if is_window_modal(child):
# Because this window element is modal
interactive_nodes.clear()
# enter dialog subtree
tree_traversal(child, is_dom=is_dom, is_dialog=True)
else:
# normal non-dialog children
tree_traversal(child, is_dom=is_dom, is_dialog=is_dialog)
interactive_nodes, dom_interactive_nodes, scrollable_nodes, dom_informative_nodes = [], [], [], []
app_name=node.Name.strip()
match node.ClassName:
case "Progman":
app_name="Desktop"
case 'Shell_TrayWnd'|'Shell_SecondaryTrayWnd':
app_name="Taskbar"
case 'Microsoft.UI.Content.PopupWindowSiteBridge':
app_name="Context Menu"
case _:
pass
tree_traversal(node,is_dom=False,is_dialog=False)
logger.debug(f'Interactive nodes:{len(interactive_nodes)}')
logger.debug(f'DOM interactive nodes:{len(dom_interactive_nodes)}')
logger.debug(f'Scrollable nodes:{len(scrollable_nodes)}')
if use_dom:
if is_browser:
return (dom_interactive_nodes,scrollable_nodes,dom_informative_nodes)
else:
return ([],[],[])
else:
return (interactive_nodes+dom_interactive_nodes,scrollable_nodes,dom_informative_nodes)
def _on_focus_change(self, sender:'ctypes.POINTER(IUIAutomationElement)'):
"""Handle focus change events."""
# Debounce duplicate events
current_time = time()
element = Control.CreateControlFromElement(sender)
runtime_id=element.GetRuntimeId()
event_key = tuple(runtime_id)
if hasattr(self, '_last_focus_event') and self._last_focus_event:
last_key, last_time = self._last_focus_event
if last_key == event_key and (current_time - last_time) < 1.0:
return None
self._last_focus_event = (event_key, current_time)
try:
logger.debug(f"[WatchDog] Focus changed to: '{element.Name}' ({element.ControlTypeName})")
except Exception:
pass
def _on_structure_change(self, sender:'ctypes.POINTER(IUIAutomationElement)', changeType:int, runtime_id:list[int]):
"""Handle structure change events."""
try:
# Debounce duplicate events
current_time = time()
event_key = (changeType, tuple(runtime_id))
if hasattr(self, '_last_structure_event') and self._last_structure_event:
last_key, last_time = self._last_structure_event
if last_key == event_key and (current_time - last_time) < 5.0:
return None
self._last_structure_event = (event_key, current_time)
node = Control.CreateControlFromElement(sender)
match StructureChangeType(changeType):
case StructureChangeType.StructureChangeType_ChildAdded|StructureChangeType.StructureChangeType_ChildrenBulkAdded:
interactive_nodes=[]
app=self.desktop.get_app_from_element(node)
app_name=self.app_name_correction(app.name if app else node.Name.strip())
is_browser=app.is_browser if app else False
if isinstance(node,WindowControl|PaneControl):
#Subtree traversal
window_bounding_box=app.bounding_box if app else node.BoundingRectangle
self.tree_traversal(node,window_bounding_box,app_name,is_browser,interactive_nodes=interactive_nodes)
else:
#If element is interactive take it else skip it
if not self.is_element_interactive(node=node,is_browser=is_browser):
return None
legacy_pattern=node.GetLegacyIAccessiblePattern()
value=legacy_pattern.Value.strip() if legacy_pattern.Value is not None else ""
cursor_type=AccessibleRoleNames.get(legacy_pattern.Role, "Default")
runtime_id=node.GetRuntimeId()
is_focused=node.HasKeyboardFocus
name=node.Name.strip()
element_bounding_box = node.BoundingRectangle
bounding_box=self.iou_bounding_box(window_bounding_box,element_bounding_box)
center = bounding_box.get_center()
interactive_nodes.append(TreeElementNode(
name=name,
control_type=cursor_type,
bounding_box=bounding_box,
center=center,
runtime_id=runtime_id,
app_name=app_name,
value=value,
shortcut="",
xpath="",
is_focused=is_focused
))
if self.tree_state:
existing_ids={n.runtime_id for n in self.tree_state.interactive_nodes}
interactive_nodes=[n for n in interactive_nodes if n.runtime_id not in existing_ids]
self.tree_state.interactive_nodes.extend(interactive_nodes)
case StructureChangeType.StructureChangeType_ChildrenBulkRemoved | StructureChangeType.StructureChangeType_ChildRemoved:
if changeType == StructureChangeType.StructureChangeType_ChildRemoved and self.tree_state:
if isinstance(node,WindowControl|PaneControl):
parent_bounding_box=BoundingBox.from_bounding_rectangle(node.BoundingRectangle)
# Remove nodes spatially contained in the parent (heuristic for "is descendant")
def is_contained(n:'TreeElementNode'):
cx, cy = n.center.x, n.center.y
return (parent_bounding_box.left <= cx <= parent_bounding_box.right and
parent_bounding_box.top <= cy <= parent_bounding_box.bottom)
self.tree_state.interactive_nodes = list(filter(lambda n:not is_contained(n),self.tree_state.interactive_nodes))
else:
target_runtime_id = tuple(runtime_id)
self.tree_state.interactive_nodes = list(filter(lambda n:n.runtime_id != target_runtime_id,self.tree_state.interactive_nodes))
case StructureChangeType.StructureChangeType_ChildrenInvalidated:
#Rebuild subtree
parent_bounding_box=BoundingBox.from_bounding_rectangle(node.BoundingRectangle)
app=self.desktop.get_app_from_element(node)
app_name=self.app_name_correction(app.name if app else node.Name.strip())
is_browser=app.is_browser if app else False
window_bounding_box=app.bounding_box if app else parent_bounding_box
interactive_nodes=[]
self.tree_traversal(node,window_bounding_box,app_name,is_browser,interactive_nodes=interactive_nodes)
# Remove nodes spatially contained in the parent (heuristic for "is descendant")
def is_contained(n:'TreeElementNode'):
cx, cy = n.center.x, n.center.y
return (parent_bounding_box.left <= cx <= parent_bounding_box.right and
parent_bounding_box.top <= cy <= parent_bounding_box.bottom)
if self.tree_state:
self.tree_state.interactive_nodes = list(filter(lambda n:not is_contained(n),self.tree_state.interactive_nodes))
self.tree_state.interactive_nodes.extend(interactive_nodes)
case StructureChangeType.StructureChangeType_ChildrenReordered:
app=self.desktop.get_app_from_element(node)
app_name=self.app_name_correction(app.name if app else node.Name.strip())
is_browser=app.is_browser if app else False
window_bounding_box=app.bounding_box if app else node.BoundingRectangle
interactive_nodes=[]
self.tree_traversal(node,window_bounding_box,app_name,is_browser,interactive_nodes=interactive_nodes)
# Update existing nodes
fresh_nodes_map = {n.runtime_id: n for n in interactive_nodes}
def update_node(existing_node:'TreeElementNode'):
if new_node:=fresh_nodes_map.get(existing_node.runtime_id):
existing_node.update_from_node(new_node)
list(map(update_node,self.tree_state.interactive_nodes))
except Exception as e:
logger.debug(f"[WatchDog] Structure changed with error: {e}, StructureChangeType={StructureChangeType(changeType).name}")
try:
logger.debug(f"[WatchDog] Structure changed: Type={StructureChangeType(changeType).name} RuntimeID={tuple(runtime_id)} Sender: '{node.Name}' ({node.ControlTypeName})")
except Exception:
pass
def _on_property_change(self, sender:'ctypes.POINTER(IUIAutomationElement)', propertyId:int, newValue):
"""Handle property change events."""
try:
element = Control.CreateControlFromElement(sender)
logger.debug(f"[WatchDog] Property changed: ID={propertyId} Value={newValue} Element: '{element.Name}' ({element.ControlTypeName})")
except Exception:
pass
def get_annotated_screenshot(self, nodes: list[TreeElementNode],scale:float=1.0) -> Image.Image:
screenshot = self.desktop.get_screenshot()
sleep(0.10)
original_width = screenshot.width
original_height = screenshot.height
scaled_width = int(original_width * scale)
scaled_height = int(original_height * scale)
screenshot = screenshot.resize((scaled_width, scaled_height), Image.Resampling.LANCZOS)
# Add padding
padding = 5
width = int(screenshot.width + (1.5 * padding))
height = int(screenshot.height + (1.5 * padding))
padded_screenshot = Image.new("RGB", (width, height), color=(255, 255, 255))
padded_screenshot.paste(screenshot, (padding, padding))
draw = ImageDraw.Draw(padded_screenshot)
font_size = 12
try:
font = ImageFont.truetype('arial.ttf', font_size)
except IOError:
font = ImageFont.load_default()
def get_random_color():
return "#{:06x}".format(random.randint(0, 0xFFFFFF))
def draw_annotation(label, node: TreeElementNode):
box = node.bounding_box
color = get_random_color()
# Scale and pad the bounding box coordinates
adjusted_box = (
int(box.left * scale) + padding,
int(box.top * scale) + padding,
int(box.right * scale) + padding,
int(box.bottom * scale) + padding
)
# Draw bounding box
draw.rectangle(adjusted_box, outline=color, width=2)
# Label dimensions
label_width = draw.textlength(str(label), font=font)
label_height = font_size
left, top, right, bottom = adjusted_box
# Label position above bounding box
label_x1 = right - label_width
label_y1 = top - label_height - 4
label_x2 = label_x1 + label_width
label_y2 = label_y1 + label_height + 4
# Draw label background and text
draw.rectangle([(label_x1, label_y1), (label_x2, label_y2)], fill=color)
draw.text((label_x1 + 2, label_y1 + 2), str(label), fill=(255, 255, 255), font=font)
# Draw annotations in parallel
with ThreadPoolExecutor() as executor:
executor.map(draw_annotation, range(len(nodes)), nodes)
return padded_screenshot