Skip to content

Commit affa564

Browse files
a11y extract (#617)
* a11y extract * changeset
1 parent cd36068 commit affa564

File tree

2 files changed

+19
-90
lines changed

2 files changed

+19
-90
lines changed

Diff for: .changeset/forty-symbols-crash.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
use a11y tree for default extract

Diff for: lib/handlers/extractHandler.ts

+14-90
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { formatText } from "../utils";
77
import { StagehandPage } from "../StagehandPage";
88
import { Stagehand, StagehandFunctionName } from "../index";
99
import { pageTextSchema } from "../../types/page";
10+
import { getAccessibilityTree } from "@/lib/a11y/utils";
1011

1112
const PROXIMITY_THRESHOLD = 15;
1213

@@ -51,31 +52,6 @@ const PROXIMITY_THRESHOLD = 15;
5152
* **10. Handle the extraction response and log the results.**
5253
* - Processes the output from the LLM and logs relevant information.
5354
*
54-
*
55-
* Here is what `domExtract` does at a high level:
56-
*
57-
* **1. Wait for the DOM to settle and start DOM debugging.**
58-
* - Ensures the page is fully loaded and stable before extraction.
59-
*
60-
* **2. Process the DOM in chunks.**
61-
* - The `processDom` function:
62-
* - Divides the page into vertical "chunks" based on viewport height.
63-
* - Picks the next chunk that hasn't been processed yet.
64-
* - Scrolls to that chunk and extracts candidate elements.
65-
* - Returns `outputString` (HTML snippets of candidate elements),
66-
* `selectorMap` (the XPaths of the candidate elements),
67-
* `chunk` (the current chunk index), and `chunks` (the array of all chunk indices).
68-
* - This chunk-based approach ensures that large or lengthy pages can be processed in smaller, manageable sections.
69-
*
70-
* **3. Pass the extracted DOM elements (in `outputString`) to the LLM for structured data extraction.**
71-
* - Uses the instructions, schema, and previously extracted content as context to
72-
* guide the LLM in extracting the structured data.
73-
*
74-
* **4. Check if extraction is complete.**
75-
* - If the extraction is complete (all chunks have been processed or the LLM determines
76-
* that we do not need to continue), return the final result.
77-
* - If not, repeat steps 1-4 with the next chunk until extraction is complete or no more chunks remain.
78-
*
7955
* @remarks
8056
* Each step corresponds to specific code segments, as noted in the comments throughout the code.
8157
*/
@@ -112,7 +88,6 @@ export class StagehandExtractHandler {
11288
instruction,
11389
schema,
11490
content = {},
115-
chunksSeen = [],
11691
llmClient,
11792
requestId,
11893
domSettleTimeoutMs,
@@ -154,7 +129,6 @@ export class StagehandExtractHandler {
154129
instruction,
155130
schema,
156131
content,
157-
chunksSeen,
158132
llmClient,
159133
requestId,
160134
domSettleTimeoutMs,
@@ -415,22 +389,20 @@ export class StagehandExtractHandler {
415389
instruction,
416390
schema,
417391
content = {},
418-
chunksSeen = [],
419392
llmClient,
420393
requestId,
421394
domSettleTimeoutMs,
422395
}: {
423396
instruction: string;
424397
schema: T;
425398
content?: z.infer<T>;
426-
chunksSeen?: Array<number>;
427399
llmClient: LLMClient;
428400
requestId?: string;
429401
domSettleTimeoutMs?: number;
430402
}): Promise<z.infer<T>> {
431403
this.logger({
432404
category: "extraction",
433-
message: "starting extraction using old approach",
405+
message: "starting extraction using a11y tree",
434406
level: 1,
435407
auxiliary: {
436408
instruction: {
@@ -440,56 +412,24 @@ export class StagehandExtractHandler {
440412
},
441413
});
442414

443-
// **1:** Wait for the DOM to settle and start DOM debugging
444-
// This ensures the page is stable before extracting any data.
445415
await this.stagehandPage._waitForSettledDom(domSettleTimeoutMs);
446-
447-
// **2:** Call processDom() to handle chunk-based extraction
448-
// processDom determines which chunk of the page to process next.
449-
// It will:
450-
// - Identify all chunks (vertical segments of the page),
451-
// - Pick the next unprocessed chunk,
452-
// - Scroll to that chunk's region,
453-
// - Extract candidate elements and their text,
454-
// - Return the extracted text (outputString), a selectorMap (for referencing elements),
455-
// the current chunk index, and the full list of chunks.
456-
const { outputString, chunk, chunks } = await this.stagehand.page.evaluate(
457-
(chunksSeen?: number[]) => window.processDom(chunksSeen ?? []),
458-
chunksSeen,
459-
);
460-
416+
const tree = await getAccessibilityTree(this.stagehandPage, this.logger);
461417
this.logger({
462418
category: "extraction",
463-
message: "received output from processDom.",
464-
auxiliary: {
465-
chunk: {
466-
value: chunk.toString(),
467-
type: "integer",
468-
},
469-
chunks_left: {
470-
value: (chunks.length - chunksSeen.length).toString(),
471-
type: "integer",
472-
},
473-
chunks_total: {
474-
value: chunks.length.toString(),
475-
type: "integer",
476-
},
477-
},
419+
message: "Getting accessibility tree data",
420+
level: 1,
478421
});
422+
const outputString = tree.simplified;
479423

480-
// **3:** Pass the list of candidate HTML snippets to the LLM
481-
// The LLM uses the provided instruction and schema to parse and extract
482-
// structured data.
483424
const extractionResponse = await extract({
484425
instruction,
485426
previouslyExtractedContent: content,
486427
domElements: outputString,
487428
schema,
429+
chunksSeen: 1,
430+
chunksTotal: 1,
488431
llmClient,
489-
chunksSeen: chunksSeen.length,
490-
chunksTotal: chunks.length,
491432
requestId,
492-
isUsingTextExtract: false,
493433
userProvidedInstructions: this.userProvidedInstructions,
494434
logger: this.logger,
495435
logInferenceToFile: this.stagehand.logInferenceToFile,
@@ -521,48 +461,32 @@ export class StagehandExtractHandler {
521461
},
522462
});
523463

524-
// Mark the current chunk as processed by adding it to chunksSeen
525-
chunksSeen.push(chunk);
526-
527-
// **4:** Check if extraction is complete
528-
// If the LLM deems the extraction complete or we've processed all chunks, return the final result.
529-
// Otherwise, call domExtract again for the next chunk.
530-
if (completed || chunksSeen.length === chunks.length) {
464+
if (completed) {
531465
this.logger({
532466
category: "extraction",
533-
message: "got response",
467+
message: "extraction completed successfully",
468+
level: 1,
534469
auxiliary: {
535470
extraction_response: {
536471
value: JSON.stringify(extractionResponse),
537472
type: "object",
538473
},
539474
},
540475
});
541-
542-
return output;
543476
} else {
544477
this.logger({
545478
category: "extraction",
546-
message: "continuing extraction",
479+
message: "extraction incomplete after processing all data",
480+
level: 1,
547481
auxiliary: {
548482
extraction_response: {
549483
value: JSON.stringify(extractionResponse),
550484
type: "object",
551485
},
552486
},
553487
});
554-
await this.stagehandPage._waitForSettledDom(domSettleTimeoutMs);
555-
556-
// Recursively continue with the next chunk
557-
return this.domExtract({
558-
instruction,
559-
schema,
560-
content: output,
561-
chunksSeen,
562-
llmClient,
563-
domSettleTimeoutMs,
564-
});
565488
}
489+
return output;
566490
}
567491

568492
/**

0 commit comments

Comments
 (0)