Skip to content

Commit e40bf6f

Browse files
Annotations for Scrollable elements in a11y-dom hybrid (#463)
* include backendDOMNodeId
* skip ax nodeId if negative
* replace role with dom tag name if none or generic
* add xpath to AXNode type
* revert unnecessary changed lines
* revert more unnecessary changed lines
* changeset
* add getScrollableElementXpaths & expose it on the window
* call browser-side scrollable elems fn, inject into observe output
* changeset
* speedup
* prettier
* prune before updating roles
* take xpath out of AXnode type
* find scrollable elems

---------

Co-authored-by: Miguel <[email protected]>
1 parent f72123d commit e40bf6f

File tree

4 files changed

+144
-34
lines changed

4 files changed

+144
-34
lines changed

Diff for: .changeset/early-tables-type.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
include 'Scrollable' annotations in a11y-dom hybrid

Diff for: lib/a11y/utils.ts

+88-10
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,9 @@ export async function getAccessibilityTree(
241241
await page.enableCDP("Accessibility");
242242

243243
try {
244+
// Identify which elements are scrollable and get their backendNodeIds
245+
const scrollableBackendIds = await findScrollableElementIds(page);
246+
244247
// Fetch the full accessibility tree from Chrome DevTools Protocol
245248
const { nodes } = await page.sendCDP<{ nodes: AXNode[] }>(
246249
"Accessibility.getFullAXTree",
@@ -249,16 +252,28 @@ export async function getAccessibilityTree(
249252

250253
// Transform into hierarchical structure
251254
const hierarchicalTree = await buildHierarchicalTree(
252-
nodes.map((node) => ({
253-
role: node.role?.value,
254-
name: node.name?.value,
255-
description: node.description?.value,
256-
value: node.value?.value,
257-
nodeId: node.nodeId,
258-
backendDOMNodeId: node.backendDOMNodeId,
259-
parentId: node.parentId,
260-
childIds: node.childIds,
261-
})),
255+
nodes.map((node) => {
256+
let roleValue = node.role?.value || "";
257+
258+
if (scrollableBackendIds.has(node.backendDOMNodeId)) {
259+
if (roleValue === "generic" || roleValue === "none") {
260+
roleValue = "scrollable";
261+
} else {
262+
roleValue = roleValue ? `scrollable, ${roleValue}` : "scrollable";
263+
}
264+
}
265+
266+
return {
267+
role: roleValue,
268+
name: node.name?.value,
269+
description: node.description?.value,
270+
value: node.value?.value,
271+
nodeId: node.nodeId,
272+
backendDOMNodeId: node.backendDOMNodeId,
273+
parentId: node.parentId,
274+
childIds: node.childIds,
275+
};
276+
}),
262277
page,
263278
logger,
264279
);
@@ -360,6 +375,69 @@ export async function getXPathByResolvedObjectId(
360375
return result.value || "";
361376
}
362377

378+
/**
 * `findScrollableElementIds` identifies elements in the browser that are
 * deemed "scrollable" and returns their CDP `backendNodeId`s.
 *
 * At a high level, it does the following:
 * - Calls the browser-side `window.getScrollableElementXpaths()` function,
 *   which returns a list of XPaths for scrollable containers.
 * - Iterates over the returned XPaths (skipping empty ones) and, for each,
 *   calls `Runtime.evaluate` to run `document.evaluate(...)`, obtaining a
 *   remote object reference if the element exists.
 * - For each valid object reference, calls `DOM.describeNode` to retrieve
 *   the element's `backendNodeId`.
 * - Collects all resulting `backendNodeId`s in a Set.
 * - Releases every remote object reference created along the way, so that
 *   repeated calls do not accumulate object handles in the page.
 *
 * @param stagehandPage - A StagehandPage instance with built-in CDP helpers.
 * @returns A Promise that resolves to a Set of unique `backendNodeId`s corresponding
 * to scrollable elements in the DOM.
 */
export async function findScrollableElementIds(
  stagehandPage: StagehandPage,
): Promise<Set<number>> {
  // get the xpaths of the scrollable elements
  const xpaths = await stagehandPage.page.evaluate(() => {
    return window.getScrollableElementXpaths();
  });

  const scrollableBackendIds = new Set<number>();

  // All remote objects created below are tagged with this group so they can
  // be released together once the backendNodeIds have been read.
  const objectGroup = "stagehand-scrollable-elements";

  try {
    for (const xpath of xpaths) {
      if (!xpath) continue;

      // evaluate the XPath in the page; JSON.stringify safely quotes the
      // XPath so it can be embedded in the expression source
      const { result } = await stagehandPage.sendCDP<{
        result?: { objectId?: string };
      }>("Runtime.evaluate", {
        expression: `
        (function() {
          const res = document.evaluate(${JSON.stringify(
            xpath,
          )}, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
          return res.singleNodeValue;
        })();
      `,
        returnByValue: false,
        objectGroup,
      });

      // no objectId means the XPath did not resolve to a node
      if (!result?.objectId) continue;

      // call DOM.describeNode to get the backendNodeId of the resolved node
      const { node } = await stagehandPage.sendCDP<{
        node?: { backendNodeId?: number };
      }>("DOM.describeNode", {
        objectId: result.objectId,
      });

      if (node?.backendNodeId) {
        scrollableBackendIds.add(node.backendNodeId);
      }
    }
  } finally {
    // Best-effort cleanup: a failure to release handles must not discard the
    // ids already collected nor mask an error thrown inside the loop.
    try {
      await stagehandPage.sendCDP("Runtime.releaseObjectGroup", {
        objectGroup,
      });
    } catch {
      // ignored on purpose: the handles are dropped on navigation anyway
    }
  }

  return scrollableBackendIds;
}
440+
363441
export async function performPlaywrightMethod(
364442
stagehandPage: Page,
365443
logger: (logLine: LogLine) => void,

Diff for: lib/dom/global.d.ts

+1
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,6 @@ declare global {
3636
width: number;
3737
height: number;
3838
}>;
39+
getScrollableElementXpaths: (topN?: number) => Promise<string[]>;
3940
}
4041
}

Diff for: lib/dom/process.ts

+50-24
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,24 @@ export function isTextNode(node: Node): node is Text {
1111
return node.nodeType === Node.TEXT_NODE && Boolean(node.textContent?.trim());
1212
}
1313

14-
function getMainScrollableElement(): HTMLElement {
14+
/**
15+
* Finds and returns a list of scrollable elements on the page,
16+
* ordered from the element with the largest scrollHeight to the smallest.
17+
*
18+
* @param topN Optional maximum number of scrollable elements to return.
19+
* If not provided, all found scrollable elements are returned.
20+
* @returns An array of HTMLElements sorted by descending scrollHeight.
21+
*/
22+
export function getScrollableElements(topN?: number): HTMLElement[] {
23+
// Get the root <html> element
1524
const docEl = document.documentElement;
16-
let mainScrollable: HTMLElement = docEl;
17-
18-
// 1) Compute how “scrollable” the root <html> is
19-
// i.e. total scrollHeight - visible clientHeight
20-
const rootScrollDiff = docEl.scrollHeight - docEl.clientHeight;
2125

22-
// Keep track of the “largest” scroll diff found so far.
23-
let maxScrollDiff = rootScrollDiff;
26+
// 1) Initialize an array to hold all scrollable elements.
27+
// Always include the root <html> element as a fallback.
28+
const scrollableElements: HTMLElement[] = [docEl];
2429

25-
// 2) Scan all elements to find if any <div> has a larger scrollable diff
30+
// 2) Scan all elements to find potential scrollable containers.
31+
// A candidate must have a scrollable overflow style and extra scrollable content.
2632
const allElements = document.querySelectorAll<HTMLElement>("*");
2733
for (const elem of allElements) {
2834
const style = window.getComputedStyle(elem);
@@ -33,25 +39,44 @@ function getMainScrollableElement(): HTMLElement {
3339

3440
if (isPotentiallyScrollable) {
3541
const candidateScrollDiff = elem.scrollHeight - elem.clientHeight;
36-
// Only pick this <div> if it has strictly more vertical “scrollable distance” than our current best
37-
if (candidateScrollDiff > maxScrollDiff) {
38-
maxScrollDiff = candidateScrollDiff;
39-
mainScrollable = elem;
42+
// Only consider this element if it actually has extra scrollable content
43+
// and it can truly scroll.
44+
if (candidateScrollDiff > 0 && canElementScroll(elem)) {
45+
scrollableElements.push(elem);
4046
}
4147
}
4248
}
4349

44-
// 3) Verify the chosen element truly scrolls
45-
if (mainScrollable !== docEl) {
46-
if (!canElementScroll(mainScrollable)) {
47-
console.log(
48-
"Stagehand (Browser Process): Unable to scroll candidate. Fallback to <html>.",
49-
);
50-
mainScrollable = docEl;
51-
}
50+
// 3) Sort the scrollable elements from largest scrollHeight to smallest.
51+
scrollableElements.sort((a, b) => b.scrollHeight - a.scrollHeight);
52+
53+
// 4) If a topN limit is specified, return only the first topN elements.
54+
if (topN !== undefined) {
55+
return scrollableElements.slice(0, topN);
5256
}
5357

54-
return mainScrollable;
58+
// Return all found scrollable elements if no limit is provided.
59+
return scrollableElements;
60+
}
61+
62+
/**
63+
* Calls getScrollableElements, then for each element calls generateXPaths,
64+
* and returns the first XPath for each.
65+
*
66+
* @param topN (optional) integer limit on how many scrollable elements to process
67+
* @returns string[] list of XPaths (1 for each scrollable element)
68+
*/
69+
export async function getScrollableElementXpaths(
70+
topN?: number,
71+
): Promise<string[]> {
72+
const scrollableElems = getScrollableElements(topN);
73+
const xpaths = [];
74+
for (const elem of scrollableElems) {
75+
const allXPaths = await generateXPaths(elem);
76+
const firstXPath = allXPaths?.[0] || "";
77+
xpaths.push(firstXPath);
78+
}
79+
return xpaths;
5580
}
5681

5782
export async function processDom(chunksSeen: Array<number>) {
@@ -80,7 +105,8 @@ export async function processDom(chunksSeen: Array<number>) {
80105
export async function processAllOfDom() {
81106
console.log("Stagehand (Browser Process): Processing all of DOM");
82107

83-
const mainScrollable = getMainScrollableElement();
108+
const mainScrollableElements = getScrollableElements(1);
109+
const mainScrollable = mainScrollableElements[0];
84110

85111
const container =
86112
mainScrollable === document.documentElement
@@ -481,7 +507,7 @@ window.restoreDOM = restoreDOM;
481507
window.createTextBoundingBoxes = createTextBoundingBoxes;
482508
window.getElementBoundingBoxes = getElementBoundingBoxes;
483509
window.createStagehandContainer = createStagehandContainer;
484-
510+
window.getScrollableElementXpaths = getScrollableElementXpaths;
485511
const leafElementDenyList = ["SVG", "IFRAME", "SCRIPT", "STYLE", "LINK"];
486512

487513
const interactiveElementTypes = [

0 commit comments

Comments
 (0)