Skip to content

Commit 62a29ee

Browse files
Hybrid of a11y tree & DOM for input to observe (#459)
* include backendDOMNodeId
* skip ax nodeId if negative
* replace role with dom tag name if none or generic
* add xpath to AXNode type
* revert unnecessary changed lines
* revert more unnecessary changed lines
* changeset
* speedup
* prettier
* prune before updating roles
* take xpath out of AXnode type
* rm commented code

---------

Co-authored-by: Miguel <[email protected]>
1 parent 00da6dd commit 62a29ee

File tree

5 files changed

+152
-42
lines changed

5 files changed

+152
-42
lines changed

Diff for: .changeset/chilled-apes-sneeze.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
create a11y + dom hybrid input for observe

Diff for: lib/a11y/utils.ts

+143-39
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ export function formatSimplifiedTree(
1313
level = 0,
1414
): string {
1515
const indent = " ".repeat(level);
16-
let result = `${indent}[${node.nodeId}] ${node.role}${node.name ? `: ${node.name}` : ""}\n`;
16+
let result = `${indent}[${node.nodeId}] ${node.role}${
17+
node.name ? `: ${node.name}` : ""
18+
}\n`;
1719

1820
if (node.children?.length) {
1921
result += node.children
@@ -29,39 +31,113 @@ export function formatSimplifiedTree(
2931
* 1. Removes generic/none nodes with no children
3032
* 2. Collapses generic/none nodes with single child
3133
* 3. Keeps generic/none nodes with multiple children but cleans their subtrees
34+
* and attempts to resolve their role to a DOM tag name
3235
*/
33-
function cleanStructuralNodes(
36+
async function cleanStructuralNodes(
3437
node: AccessibilityNode,
35-
): AccessibilityNode | null {
36-
// Filter out nodes with negative IDs
38+
page?: StagehandPage,
39+
logger?: (logLine: LogLine) => void,
40+
): Promise<AccessibilityNode | null> {
41+
// 1) Filter out nodes with negative IDs
3742
if (node.nodeId && parseInt(node.nodeId) < 0) {
3843
return null;
3944
}
4045

41-
// Base case: leaf node
42-
if (!node.children) {
46+
// 2) Base case: if no children exist, this is effectively a leaf.
47+
// If it's "generic" or "none", we remove it; otherwise, keep it.
48+
if (!node.children || node.children.length === 0) {
4349
return node.role === "generic" || node.role === "none" ? null : node;
4450
}
4551

46-
// Recursively clean children
47-
const cleanedChildren = node.children
48-
.map((child) => cleanStructuralNodes(child))
49-
.filter(Boolean) as AccessibilityNode[];
50-
51-
// Handle generic/none nodes specially
52+
// 3) Recursively clean children
53+
const cleanedChildrenPromises = node.children.map((child) =>
54+
cleanStructuralNodes(child, page, logger),
55+
);
56+
const resolvedChildren = await Promise.all(cleanedChildrenPromises);
57+
const cleanedChildren = resolvedChildren.filter(
58+
(child): child is AccessibilityNode => child !== null,
59+
);
60+
61+
// 4) **Prune** "generic" or "none" nodes first,
62+
// before resolving them to their tag names.
5263
if (node.role === "generic" || node.role === "none") {
5364
if (cleanedChildren.length === 1) {
54-
// Collapse single-child generic nodes
65+
// Collapse single-child structural node
5566
return cleanedChildren[0];
56-
} else if (cleanedChildren.length > 1) {
57-
// Keep generic nodes with multiple children
58-
return { ...node, children: cleanedChildren };
67+
} else if (cleanedChildren.length === 0) {
68+
// Remove empty structural node
69+
return null;
70+
}
71+
// If we have multiple children, we keep this node as a container.
72+
// We'll update role below if needed.
73+
}
74+
75+
// 5) If we still have a "generic"/"none" node after pruning
76+
// (i.e., because it had multiple children), now we try
77+
// to resolve and replace its role with the DOM tag name.
78+
if (
79+
page &&
80+
logger &&
81+
node.backendDOMNodeId !== undefined &&
82+
(node.role === "generic" || node.role === "none")
83+
) {
84+
try {
85+
const { object } = await page.sendCDP<{
86+
object: { objectId?: string };
87+
}>("DOM.resolveNode", {
88+
backendNodeId: node.backendDOMNodeId,
89+
});
90+
91+
if (object && object.objectId) {
92+
try {
93+
// Get the tagName for the node
94+
const { result } = await page.sendCDP<{
95+
result: { type: string; value?: string };
96+
}>("Runtime.callFunctionOn", {
97+
objectId: object.objectId,
98+
functionDeclaration: `
99+
function() {
100+
return this.tagName ? this.tagName.toLowerCase() : "";
101+
}
102+
`,
103+
returnByValue: true,
104+
});
105+
106+
// If we got a tagName, update the node's role
107+
if (result?.value) {
108+
node.role = result.value;
109+
}
110+
} catch (tagNameError) {
111+
logger({
112+
category: "observation",
113+
message: `Could not fetch tagName for node ${node.backendDOMNodeId}`,
114+
level: 2,
115+
auxiliary: {
116+
error: {
117+
value: tagNameError.message,
118+
type: "string",
119+
},
120+
},
121+
});
122+
}
123+
}
124+
} catch (resolveError) {
125+
logger({
126+
category: "observation",
127+
message: `Could not resolve DOM node ID ${node.backendDOMNodeId}`,
128+
level: 2,
129+
auxiliary: {
130+
error: {
131+
value: resolveError.message,
132+
type: "string",
133+
},
134+
},
135+
});
59136
}
60-
// Remove generic nodes with no children
61-
return null;
62137
}
63138

64-
// For non-generic nodes, keep them if they have children after cleaning
139+
// 6) Return the updated node.
140+
// If it has children, update them; otherwise keep it as-is.
65141
return cleanedChildren.length > 0
66142
? { ...node, children: cleanedChildren }
67143
: node;
@@ -73,13 +149,23 @@ function cleanStructuralNodes(
73149
* @param nodes - Flat array of accessibility nodes from the CDP
74150
* @returns Object containing both the tree structure and a simplified string representation
75151
*/
76-
export function buildHierarchicalTree(nodes: AccessibilityNode[]): TreeResult {
152+
export async function buildHierarchicalTree(
153+
nodes: AccessibilityNode[],
154+
page?: StagehandPage,
155+
logger?: (logLine: LogLine) => void,
156+
): Promise<TreeResult> {
77157
// Map to store processed nodes for quick lookup
78158
const nodeMap = new Map<string, AccessibilityNode>();
79159

80160
// First pass: Create nodes that are meaningful
81161
// We only keep nodes that either have a name or children to avoid cluttering the tree
82162
nodes.forEach((node) => {
163+
// Skip node if its ID is negative (e.g., "-1000002014")
164+
const nodeIdValue = parseInt(node.nodeId, 10);
165+
if (nodeIdValue < 0) {
166+
return;
167+
}
168+
83169
const hasChildren = node.childIds && node.childIds.length > 0;
84170
const hasValidName = node.name && node.name.trim() !== "";
85171
const isInteractive =
@@ -99,6 +185,9 @@ export function buildHierarchicalTree(nodes: AccessibilityNode[]): TreeResult {
99185
...(hasValidName && { name: node.name }), // Only include name if it exists and isn't empty
100186
...(node.description && { description: node.description }),
101187
...(node.value && { value: node.value }),
188+
...(node.backendDOMNodeId !== undefined && {
189+
backendDOMNodeId: node.backendDOMNodeId,
190+
}),
102191
});
103192
});
104193

@@ -119,13 +208,18 @@ export function buildHierarchicalTree(nodes: AccessibilityNode[]): TreeResult {
119208
});
120209

121210
// Final pass: Build the root-level tree and clean up structural nodes
122-
const finalTree = nodes
211+
const rootNodes = nodes
123212
.filter((node) => !node.parentId && nodeMap.has(node.nodeId)) // Get root nodes
124213
.map((node) => nodeMap.get(node.nodeId))
125-
.filter(Boolean)
126-
.map((node) => cleanStructuralNodes(node))
127214
.filter(Boolean) as AccessibilityNode[];
128215

216+
const cleanedTreePromises = rootNodes.map((node) =>
217+
cleanStructuralNodes(node, page, logger),
218+
);
219+
const finalTree = (await Promise.all(cleanedTreePromises)).filter(
220+
Boolean,
221+
) as AccessibilityNode[];
222+
129223
// Generate a simplified string representation of the tree
130224
const simplifiedFormat = finalTree
131225
.map((node) => formatSimplifiedTree(node))
@@ -137,29 +231,43 @@ export function buildHierarchicalTree(nodes: AccessibilityNode[]): TreeResult {
137231
};
138232
}
139233

234+
/**
235+
* Retrieves the full accessibility tree via CDP and transforms it into a hierarchical structure.
236+
*/
140237
export async function getAccessibilityTree(
141238
page: StagehandPage,
142239
logger: (logLine: LogLine) => void,
143-
) {
240+
): Promise<TreeResult> {
144241
await page.enableCDP("Accessibility");
145242

146243
try {
244+
// Fetch the full accessibility tree from Chrome DevTools Protocol
147245
const { nodes } = await page.sendCDP<{ nodes: AXNode[] }>(
148246
"Accessibility.getFullAXTree",
149247
);
248+
const startTime = Date.now();
150249

151-
// Extract specific sources
152-
const sources = nodes.map((node) => ({
153-
role: node.role?.value,
154-
name: node.name?.value,
155-
description: node.description?.value,
156-
value: node.value?.value,
157-
nodeId: node.nodeId,
158-
parentId: node.parentId,
159-
childIds: node.childIds,
160-
}));
161250
// Transform into hierarchical structure
162-
const hierarchicalTree = buildHierarchicalTree(sources);
251+
const hierarchicalTree = await buildHierarchicalTree(
252+
nodes.map((node) => ({
253+
role: node.role?.value,
254+
name: node.name?.value,
255+
description: node.description?.value,
256+
value: node.value?.value,
257+
nodeId: node.nodeId,
258+
backendDOMNodeId: node.backendDOMNodeId,
259+
parentId: node.parentId,
260+
childIds: node.childIds,
261+
})),
262+
page,
263+
logger,
264+
);
265+
266+
logger({
267+
category: "observation",
268+
message: `got accessibility tree in ${Date.now() - startTime}ms`,
269+
level: 1,
270+
});
163271

164272
return hierarchicalTree;
165273
} catch (error) {
@@ -258,7 +366,6 @@ export async function performPlaywrightMethod(
258366
method: string,
259367
args: unknown[],
260368
xpath: string,
261-
// domSettleTimeoutMs?: number,
262369
) {
263370
const locator = stagehandPage.locator(`xpath=${xpath}`).first();
264371
const initialUrl = stagehandPage.url();
@@ -503,7 +610,6 @@ export async function performPlaywrightMethod(
503610
await newOpenedTab.close();
504611
await stagehandPage.goto(newOpenedTab.url());
505612
await stagehandPage.waitForLoadState("domcontentloaded");
506-
// await stagehandPage._waitForSettledDom(domSettleTimeoutMs);
507613
}
508614

509615
await Promise.race([
@@ -564,6 +670,4 @@ export async function performPlaywrightMethod(
564670
`Method ${method} not supported`,
565671
);
566672
}
567-
568-
// await stagehandPage._waitForSettledDom(domSettleTimeoutMs);
569673
}

Diff for: lib/handlers/observeHandler.ts

+1-2
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ export class StagehandObserveHandler {
108108
isUsingAccessibilityTree: useAccessibilityTree,
109109
returnAction,
110110
});
111+
111112
const elementsWithSelectors = await Promise.all(
112113
observationResponse.elements.map(async (element) => {
113114
const { elementId, ...rest } = element;
@@ -137,7 +138,6 @@ export class StagehandObserveHandler {
137138
message: `Invalid object ID returned for element: ${elementId}`,
138139
level: 1,
139140
});
140-
return null;
141141
}
142142

143143
const xpath = await getXPathByResolvedObjectId(
@@ -151,7 +151,6 @@ export class StagehandObserveHandler {
151151
message: `Empty xpath returned for element: ${elementId}`,
152152
level: 1,
153153
});
154-
return null;
155154
}
156155

157156
return {

Diff for: lib/prompt.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ You will be given:
361361
1. a instruction of elements to observe
362362
2. ${
363363
isUsingAccessibilityTree
364-
? "a hierarchical accessibility tree showing the semantic structure of the page"
364+
? "a hierarchical accessibility tree showing the semantic structure of the page. The tree is a hybrid of the DOM and the accessibility tree."
365365
: "a numbered list of possible elements"
366366
}
367367

Diff for: types/context.ts

+2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ export interface AXNode {
44
description?: { value: string };
55
value?: { value: string };
66
nodeId: string;
7+
backendDOMNodeId?: number;
78
parentId?: string;
89
childIds?: string[];
910
}
@@ -17,6 +18,7 @@ export type AccessibilityNode = {
1718
childIds?: string[];
1819
parentId?: string;
1920
nodeId?: string;
21+
backendDOMNodeId?: number;
2022
};
2123

2224
export interface TreeResult {

0 commit comments

Comments
 (0)