Skip to content

Commit 6351121

Browse files
committed
Merge branch 'main' into miguel/stg-318-stagehand-evaluator
2 parents b7caabf + 8f0f97b commit 6351121

22 files changed

+704
-119
lines changed

.changeset/fifty-cats-sell.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": minor
3+
---
4+
5+
extract links

.changeset/green-signs-live.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
use JavaScript click instead of Playwright

.changeset/short-banks-sit.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
Fixed the schema input for Gemini's response model

.changeset/solid-rice-admire.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": minor
3+
---
4+
5+
Added Gemini 2.5 Flash to Google supported models

.changeset/vast-vans-crash.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
Fixes a redundant, unnecessary log

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ For more information, please see our [Contributing Guide](https://docs.stagehand
128128

129129
## Acknowledgements
130130

131-
This project heavily relies on [Playwright](https://playwright.dev/) as a resilient backbone to automate the web. It also would not be possible without the awesome techniques and discoveries made by [tarsier](https://github.com/reworkd/tarsier), and [fuji-web](https://github.com/normal-computing/fuji-web).
131+
This project heavily relies on [Playwright](https://playwright.dev/) as a resilient backbone to automate the web. It also would not be possible without the awesome techniques and discoveries made by [tarsier](https://github.com/reworkd/tarsier), [gemini-zod](https://github.com/jbeoris/gemini-zod), and [fuji-web](https://github.com/normal-computing/fuji-web).
132132

133133
We'd like to thank the following people for their major contributions to Stagehand:
134134
- [Paul Klein](https://github.com/pkiv)

evals/evals.config.json

+16
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,22 @@
310310
"name": "google_flights",
311311
"categories": ["act"]
312312
},
313+
{
314+
"name": "extract_jfk_links",
315+
"categories": ["extract"]
316+
},
317+
{
318+
"name": "extract_single_link",
319+
"categories": ["extract"]
320+
},
321+
{
322+
"name": "radio_btn",
323+
"categories": ["act"]
324+
},
325+
{
326+
"name": "checkboxes",
327+
"categories": ["act"]
328+
},
313329
{
314330
"name": "agent/iframe_form",
315331
"categories": ["agent"]

evals/tasks/checkboxes.ts

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Eval task: checks that `act` can tick individual checkboxes.
 *
 * Opens the checkboxes eval site, asks Stagehand to click the 'baseball' and
 * 'netball' options, then reads the checked state of both inputs.
 *
 * @returns An object whose `_success` is true only when both checkboxes are
 * checked, together with `debugUrl`, `sessionUrl` and the logger's logs.
 * @throws Propagates any error raised by navigation, `act` or the locator
 * checks; the Stagehand session is closed before the error leaves this
 * function.
 */
export const checkboxes: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  let baseballChecked = false;
  let netballChecked = false;

  try {
    await stagehand.page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/checkboxes/",
    );

    await stagehand.page.act({
      action: "click the 'baseball' option",
    });

    await stagehand.page.act({
      action: "click the 'netball' option",
    });

    baseballChecked = await stagehand.page
      .locator('input[type="checkbox"][name="sports"][value="baseball"]')
      .isChecked();

    netballChecked = await stagehand.page
      .locator('input[type="checkbox"][name="sports"][value="netball"]')
      .isChecked();
  } finally {
    // Close on every path: without this, a failing step above would skip the
    // close call and leave the browser session open.
    await stagehand.close();
  }

  return {
    _success: baseballChecked && netballChecked,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

evals/tasks/extract_jfk_links.ts

+125
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import { EvalFunction } from "@/types/evals";
2+
import { z } from "zod";
3+
4+
/**
 * Eval task: checks that `extract` returns record file names together with
 * their links from the JFK eval site.
 *
 * The extraction passes only when it contains exactly as many records as the
 * expected list and every expected `{ file_name, link }` pair is present.
 *
 * @returns An object with `_success`, `debugUrl`, `sessionUrl` and `logs`.
 * On a count mismatch it adds `reason`; on missing records it adds `reason`,
 * `missingRecords` and `extractedRecords`; when something throws it adds a
 * JSON-safe `error`.
 */
export const extract_jfk_links: EvalFunction = async ({
  logger,
  debugUrl,
  sessionUrl,
  stagehand,
}) => {
  try {
    await stagehand.page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/jfk/",
    );

    const extraction = await stagehand.page.extract({
      instruction:
        "extract all the record file name and their corresponding links",
      schema: z.object({
        records: z.array(
          z.object({
            file_name: z.string().describe("the file name of the record"),
            link: z.string().url(),
          }),
        ),
      }),
    });

    // The list of records we expect to see
    const expectedRecords = [
      {
        file_name: "104-10003-10041.pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10003-10041.pdf",
      },
      {
        file_name: "104-10004-10143 (C06932208).pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10004-10143%20(C06932208).pdf",
      },
      {
        file_name: "104-10004-10143.pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10004-10143.pdf",
      },
      {
        file_name: "104-10004-10156.pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10004-10156.pdf",
      },
      {
        file_name: "104-10004-10213.pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10004-10213.pdf",
      },
      {
        file_name: "104-10005-10321.pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10005-10321.pdf",
      },
      {
        file_name: "104-10006-10247.pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10006-10247.pdf",
      },
      {
        file_name: "104-10007-10345.pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10007-10345.pdf",
      },
      {
        file_name: "104-10009-10021.pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10009-10021.pdf",
      },
      {
        file_name: "104-10009-10222.pdf",
        link: "https://www.archives.gov/files/research/jfk/releases/2025/0318/104-10009-10222.pdf",
      },
    ];

    const extractedRecords = extraction.records;

    // Expected records that have no exact (file_name, link) match in the
    // extraction.
    const missingRecords = expectedRecords.filter(
      (expected) =>
        !extractedRecords.some(
          (r) => r.file_name === expected.file_name && r.link === expected.link,
        ),
    );

    // Everything needed from the page has been read, so close once here
    // rather than repeating the call in each result branch below.
    await stagehand.close();

    // The count is derived from the expected list so the check cannot drift
    // out of sync with it.
    if (extractedRecords.length !== expectedRecords.length) {
      return {
        _success: false,
        reason: `Extraction has ${extractedRecords.length} records (expected ${expectedRecords.length}).`,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    if (missingRecords.length > 0) {
      return {
        _success: false,
        reason: "Missing one or more expected records.",
        missingRecords,
        extractedRecords,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    // If we reach here, the number of records is correct, and all are present
    return {
      _success: true,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    await stagehand.close();

    // JSON.stringify drops an Error's name/message/stack because they are not
    // enumerable, so a plain round-trip reports `{}`. Copy them explicitly.
    const enumerableFields = JSON.parse(JSON.stringify(error) ?? "null");
    const serializedError =
      error instanceof Error
        ? {
            ...enumerableFields,
            name: error.name,
            message: error.message,
            stack: error.stack,
          }
        : enumerableFields;

    return {
      _success: false,
      error: serializedError,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }
};

evals/tasks/extract_single_link.ts

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import { EvalFunction } from "@/types/evals";
2+
import { z } from "zod";
3+
4+
/**
 * Eval task: checks that `extract` returns the link to the 'contact us' page
 * of the geniusee eval site.
 *
 * @returns An object with `_success`, `debugUrl`, `sessionUrl` and `logs`.
 * When the extracted link differs from the expected one it adds `reason`;
 * when something throws it adds a JSON-safe `error`.
 */
export const extract_single_link: EvalFunction = async ({
  logger,
  debugUrl,
  sessionUrl,
  stagehand,
}) => {
  try {
    await stagehand.page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/geniusee/",
    );

    const extraction = await stagehand.page.extract({
      instruction: "extract the link to the 'contact us' page",
      schema: z.object({
        link: z.string().url(),
      }),
    });

    // The comparison below is plain string work, so the session can be closed
    // as soon as the extraction has returned.
    await stagehand.close();

    const extractedLink = extraction.link;
    const expectedLink =
      "https://browserbase.github.io/stagehand-eval-sites/sites/geniusee/#contact";

    if (extractedLink === expectedLink) {
      return {
        _success: true,
        debugUrl,
        sessionUrl,
        logs: logger.getLogs(),
      };
    }

    return {
      _success: false,
      reason: `Extracted link: ${extractedLink} does not match expected link: ${expectedLink}`,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    await stagehand.close();

    // JSON.stringify drops an Error's name/message/stack because they are not
    // enumerable, so a plain round-trip reports `{}`. Copy them explicitly.
    const enumerableFields = JSON.parse(JSON.stringify(error) ?? "null");
    const serializedError =
      error instanceof Error
        ? {
            ...enumerableFields,
            name: error.name,
            message: error.message,
            stack: error.stack,
          }
        : enumerableFields;

    return {
      _success: false,
      error: serializedError,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }
};

evals/tasks/radio_btn.ts

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import { EvalFunction } from "@/types/evals";
2+
3+
/**
 * Eval task: checks that `act` can select a radio button.
 *
 * Opens the paneer-pizza eval site, asks Stagehand to click the 'medium'
 * option, then reads the checked state of the Medium radio input.
 *
 * @returns An object whose `_success` reflects whether the Medium radio is
 * checked, together with `debugUrl`, `sessionUrl` and the logger's logs.
 * @throws Propagates any error raised by navigation, `act` or the locator
 * check; the Stagehand session is closed before the error leaves this
 * function.
 */
export const radio_btn: EvalFunction = async ({
  debugUrl,
  sessionUrl,
  stagehand,
  logger,
}) => {
  let radioBtnClicked = false;

  try {
    await stagehand.page.goto(
      "https://browserbase.github.io/stagehand-eval-sites/sites/paneer-pizza/",
    );

    await stagehand.page.act({
      action: "click the 'medium' option",
    });

    // confirm that the Medium radio is now checked
    radioBtnClicked = await stagehand.page
      .locator('input[type="radio"][name="Pizza"][value="Medium"]')
      .isChecked();
  } finally {
    // Close on every path: without this, a failing step above would skip the
    // close call and leave the browser session open.
    await stagehand.close();
  }

  return {
    _success: radioBtnClicked,
    debugUrl,
    sessionUrl,
    logs: logger.getLogs(),
  };
};

0 commit comments

Comments
 (0)