Skip to content

Commit 944bbbf

Browse files
wrap braintrust to get llm usage data (#637)
* temp
* temp
* custom openai
* unit and evals
* changeset
* fix evals config
* fix evals config
* Update evals/taskConfig.ts
  Co-authored-by: Sean McGuire <[email protected]>
* rebase
* temp
* all eval tasks
* address comments and remove hn from ci
* press enter
* dont use braintrust ai proxy
* fix amazon eval
* remove WRITE_FILE check
* revert amazon to act category
* unify regression evals
* update CI
* fix job naming
* wrap in try catch
* fix yml
* update other amazon eval
* vanta_h experimental
* add text_extract eval category to CI
---------
Co-authored-by: Sean McGuire <[email protected]>
Co-authored-by: Sean McGuire <[email protected]>
1 parent 9774e56 commit 944bbbf

File tree

91 files changed

+1110
-1239
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

91 files changed

+1110
-1239
lines changed

Diff for: .changeset/stupid-ghosts-smash.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
Fix: forward along the stack trace in StagehandDefaultError

Diff for: .github/workflows/ci.yml

+18-141
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ on:
1010

1111
env:
1212
EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
13-
EVAL_CATEGORIES: "observe,act,combination,extract,text_extract"
13+
EVAL_CATEGORIES: "observe,act,combination,extract,text_extract,targeted_extract"
1414

1515
concurrency:
1616
group: ${{ github.ref }}
@@ -186,64 +186,13 @@ jobs:
186186
- name: Run E2E Tests (browserbase)
187187
run: npm run e2e:bb
188188

189-
run-regression-evals-llm-providers:
189+
run-regression-evals:
190190
needs:
191191
[run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
192192
runs-on: ubuntu-latest
193193
timeout-minutes: 9
194194
outputs:
195-
regression_llm_providers_score: ${{ steps.set-llm-providers-score.outputs.regression_llm_providers_score }}
196-
env:
197-
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
198-
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
199-
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
200-
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
201-
HEADLESS: true
202-
EVAL_ENV: browserbase
203-
EVAL_MODELS: "gpt-4o-mini"
204-
steps:
205-
- name: Check out repository code
206-
uses: actions/checkout@v4
207-
208-
- name: Set up Node.js
209-
uses: actions/setup-node@v4
210-
with:
211-
node-version: "20"
212-
213-
- name: Install dependencies
214-
run: |
215-
rm -rf node_modules
216-
rm -f package-lock.json
217-
npm install
218-
219-
- name: Build Stagehand
220-
run: npm run build
221-
222-
- name: Install Playwright browsers
223-
run: npm exec playwright install --with-deps
224-
225-
- name: Run Regression Evals (llmProviders)
226-
run: npm run evals category regression_llm_providers trials=2 concurrency=8 env=BROWSERBASE
227-
228-
- name: Save Regression llmProviders Results
229-
run: mv eval-summary.json eval-summary-regression-llm-providers.json
230-
231-
- name: Log and Regression (llmProviders) Evals Performance
232-
id: set-llm-providers-score
233-
run: |
234-
experimentNameRegressionLlmProviders=$(jq -r '.experimentName' eval-summary-regression-llm-providers.json)
235-
regression_llm_providers_score=$(jq '.categories.regression_llm_providers' eval-summary-regression-llm-providers.json)
236-
echo "regression_llm_providers category score: ${regression_llm_providers_score}%"
237-
echo "View regression_llm_providers results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionLlmProviders}"
238-
echo "regression_llm_providers_score=$regression_llm_providers_score" >> "$GITHUB_OUTPUT"
239-
240-
run-regression-evals-dom-extract:
241-
needs:
242-
[run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
243-
runs-on: ubuntu-latest
244-
timeout-minutes: 9
245-
outputs:
246-
regression_dom_score: ${{ steps.set-dom-score.outputs.regression_dom_score }}
195+
regression_score: ${{ steps.set-regression-score.outputs.regression_score }}
247196
env:
248197
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
249198
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -273,99 +222,27 @@ jobs:
273222
- name: Install Playwright browsers
274223
run: npm exec playwright install --with-deps
275224

276-
- name: Run Regression Evals (domExtract)
277-
run: npm run evals category regression_dom_extract trials=2 concurrency=8 env=BROWSERBASE -- --extract-method=domExtract
225+
- name: Run Regression Evals
226+
run: npm run evals category regression trials=2 concurrency=20 env=BROWSERBASE
278227

279-
- name: Save Regression domExtract Results
280-
run: mv eval-summary.json eval-summary-regression-dom.json
281-
282-
- name: Log and Regression (domExtract) Evals Performance
283-
id: set-dom-score
228+
- name: Log Regression Evals Performance
284229
run: |
285-
experimentNameRegressionDom=$(jq -r '.experimentName' eval-summary-regression-dom.json)
286-
regression_dom_score=$(jq '.categories.regression_dom_extract' eval-summary-regression-dom.json)
287-
echo "regression_dom_extract category score: ${regression_dom_score}%"
288-
echo "View regression_dom_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionDom}"
289-
echo "regression_dom_score=$regression_dom_score" >> "$GITHUB_OUTPUT"
290-
291-
run-regression-evals-text-extract:
292-
needs:
293-
[run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
294-
runs-on: ubuntu-latest
295-
timeout-minutes: 9
296-
outputs:
297-
regression_text_score: ${{ steps.set-text-score.outputs.regression_text_score }}
298-
env:
299-
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
300-
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
301-
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
302-
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
303-
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
304-
HEADLESS: true
305-
EVAL_ENV: browserbase
306-
steps:
307-
- name: Check out repository code
308-
uses: actions/checkout@v4
309-
310-
- name: Set up Node.js
311-
uses: actions/setup-node@v4
312-
with:
313-
node-version: "20"
314-
315-
- name: Install dependencies
316-
run: |
317-
rm -rf node_modules
318-
rm -f package-lock.json
319-
npm install
320-
321-
- name: Build Stagehand
322-
run: npm run build
323-
324-
- name: Install Playwright browsers
325-
run: npm exec playwright install --with-deps
326-
327-
- name: Run Regression Evals (textExtract)
328-
run: npm run evals category regression_text_extract trials=2 concurrency=8 env=BROWSERBASE -- --extract-method=textExtract
329-
330-
- name: Save Regression textExtract Results
331-
run: mv eval-summary.json eval-summary-regression-text.json
332-
333-
- name: Log Regression (textExtract) Evals Performance
334-
id: set-text-score
335-
run: |
336-
experimentNameRegressionText=$(jq -r '.experimentName' eval-summary-regression-text.json)
337-
regression_text_score=$(jq '.categories.regression_text_extract' eval-summary-regression-text.json)
338-
echo "regression_text_extract category score: ${regression_text_score}%"
339-
echo "View regression_text_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionText}"
340-
echo "regression_text_score=$regression_text_score" >> "$GITHUB_OUTPUT"
341-
342-
check-regression-evals-score:
343-
needs:
344-
[
345-
run-regression-evals-text-extract,
346-
run-regression-evals-dom-extract,
347-
run-regression-evals-llm-providers,
348-
]
349-
runs-on: ubuntu-latest
350-
timeout-minutes: 5
351-
steps:
352-
- name: Compare Overall Regression Evals Score
353-
run: |
354-
regression_dom_score="${{ needs.run-regression-evals-dom-extract.outputs.regression_dom_score }}"
355-
regression_text_score="${{ needs.run-regression-evals-text-extract.outputs.regression_text_score }}"
356-
regression_llm_providers_score="${{ needs.run-regression-evals-llm-providers.outputs.regression_llm_providers_score }}"
357-
358-
overall_score=$(echo "(${regression_dom_score} + ${regression_text_score} + ${regression_llm_providers_score}) / 3" | bc -l)
359-
echo "Overall regression score: ${overall_score}%"
360-
361-
# Fail if overall score is below 90%
362-
if (( $(echo "${overall_score} < 90" | bc -l) )); then
363-
echo "Overall regression score is below 90%. Failing CI."
230+
experimentName=$(jq -r '.experimentName' eval-summary.json)
231+
echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"
232+
if [ -f eval-summary.json ]; then
233+
regression_score=$(jq '.categories.regression' eval-summary.json)
234+
echo "Regression category score: $regression_score%"
235+
if (( $(echo "$regression_score < 90" | bc -l) )); then
236+
echo "Regression category score is below 90%. Failing CI."
237+
exit 1
238+
fi
239+
else
240+
echo "Eval summary not found for regression category. Failing CI."
364241
exit 1
365242
fi
366243
367244
run-combination-evals:
368-
needs: [check-regression-evals-score, determine-evals]
245+
needs: [run-regression-evals, determine-evals]
369246
runs-on: ubuntu-latest
370247
timeout-minutes: 40
371248
env:

Diff for: evals/args.ts

+10-2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ const parsedArgs: {
88
trials?: number;
99
concurrency?: number;
1010
extractMethod?: string;
11+
provider?: string;
1112
leftover: string[];
1213
} = {
1314
leftover: [],
@@ -28,6 +29,8 @@ for (const arg of rawArgs) {
2829
}
2930
} else if (arg.startsWith("--extract-method=")) {
3031
parsedArgs.extractMethod = arg.split("=")[1];
32+
} else if (arg.startsWith("provider=")) {
33+
parsedArgs.provider = arg.split("=")[1]?.toLowerCase();
3134
} else {
3235
parsedArgs.leftover.push(arg);
3336
}
@@ -64,8 +67,8 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
6467
"text_extract",
6568
"targeted_extract",
6669
"regression_llm_providers",
67-
"regression_text_extract",
68-
"regression_dom_extract",
70+
"regression",
71+
"llm_clients",
6972
];
7073

7174
// Finally, interpret leftover arguments to see if user typed "category X" or a single eval name
@@ -93,10 +96,15 @@ if (parsedArgs.leftover.length > 0) {
9396
}
9497
}
9598

99+
if (parsedArgs.provider !== undefined) {
100+
process.env.EVAL_PROVIDER = parsedArgs.provider;
101+
}
102+
96103
export {
97104
filterByCategory,
98105
filterByEvalName,
99106
useTextExtract,
100107
useAccessibilityTree,
101108
DEFAULT_EVAL_CATEGORIES,
109+
parsedArgs,
102110
};

Diff for: evals/evals.config.json

+26-31
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,7 @@
66
},
77
{
88
"name": "expect_act_timeout",
9-
"categories": ["act"]
10-
},
11-
{
12-
"name": "expect_act_timeout_global",
13-
"categories": ["act"]
9+
"categories": ["regression"]
1410
},
1511
{
1612
"name": "extract_repo_name",
@@ -22,15 +18,16 @@
2218
},
2319
{
2420
"name": "instructions",
25-
"categories": ["combination"]
21+
"categories": ["regression", "combination"]
2622
},
2723
{
2824
"name": "bidnet",
2925
"categories": ["act"]
3026
},
3127
{
3228
"name": "ionwave",
33-
"categories": ["act", "regression_dom_extract"]
29+
"categories": ["act", "regression"],
30+
"extract_method": "domExtract"
3431
},
3532
{
3633
"name": "nonsense_action",
@@ -83,19 +80,20 @@
8380
},
8481
{
8582
"name": "wichita",
86-
"categories": ["combination", "regression_dom_extract"]
83+
"categories": ["combination", "regression"],
84+
"extract_method": "domExtract"
8785
},
8886
{
8987
"name": "hn_aisdk",
90-
"categories": ["regression_llm_providers"]
88+
"categories": ["llm_clients"]
9189
},
9290
{
9391
"name": "hn_langchain",
94-
"categories": ["regression_llm_providers"]
92+
"categories": ["llm_clients"]
9593
},
9694
{
9795
"name": "hn_customOpenAI",
98-
"categories": ["regression_llm_providers"]
96+
"categories": ["llm_clients"]
9997
},
10098
{
10199
"name": "apple",
@@ -119,7 +117,8 @@
119117
},
120118
{
121119
"name": "extract_aigrant_companies",
122-
"categories": ["experimental", "text_extract", "regression_text_extract"]
120+
"categories": ["text_extract", "regression"],
121+
"extract_method": "textExtract"
123122
},
124123
{
125124
"name": "extract_capacitor_info",
@@ -168,7 +167,8 @@
168167
},
169168
{
170169
"name": "extract_memorial_healthcare",
171-
"categories": ["extract", "regression_dom_extract"]
170+
"categories": ["extract", "regression"],
171+
"extract_method": "domExtract"
172172
},
173173
{
174174
"name": "extract_nhl_stats",
@@ -203,17 +203,9 @@
203203
"name": "panamcs",
204204
"categories": ["observe"]
205205
},
206-
{
207-
"name": "shopify_homepage",
208-
"categories": ["observe"]
209-
},
210-
{
211-
"name": "vanta",
212-
"categories": ["observe"]
213-
},
214206
{
215207
"name": "vanta_h",
216-
"categories": ["observe"]
208+
"categories": ["experimental"]
217209
},
218210
{
219211
"name": "extract_area_codes",
@@ -237,11 +229,13 @@
237229
},
238230
{
239231
"name": "observe_github",
240-
"categories": ["observe", "regression_text_extract"]
232+
"categories": ["observe", "regression"],
233+
"extract_method": "textExtract"
241234
},
242235
{
243236
"name": "observe_vantechjournal",
244-
"categories": ["observe", "regression_text_extract"]
237+
"categories": ["observe", "regression"],
238+
"extract_method": "textExtract"
245239
},
246240
{
247241
"name": "observe_amazon_add_to_cart",
@@ -261,15 +255,16 @@
261255
},
262256
{
263257
"name": "observe_iframes1",
264-
"categories": ["observe"]
258+
"categories": ["regression", "observe"]
265259
},
266260
{
267261
"name": "observe_iframes2",
268-
"categories": ["observe"]
262+
"categories": ["regression", "observe"]
269263
},
270264
{
271265
"name": "extract_hamilton_weather",
272-
"categories": ["targeted_extract", "regression_text_extract"]
266+
"categories": ["targeted_extract", "regression"],
267+
"extract_method": "textExtract"
273268
},
274269
{
275270
"name": "extract_regulations_table",
@@ -297,19 +292,19 @@
297292
},
298293
{
299294
"name": "scroll_50",
300-
"categories": ["act"]
295+
"categories": ["regression", "act"]
301296
},
302297
{
303298
"name": "scroll_75",
304-
"categories": ["act", "regression_dom_extract"]
299+
"categories": ["regression", "act"]
305300
},
306301
{
307302
"name": "nextChunk",
308-
"categories": ["act"]
303+
"categories": ["regression", "act"]
309304
},
310305
{
311306
"name": "prevChunk",
312-
"categories": ["act"]
307+
"categories": ["regression", "act"]
313308
}
314309
]
315310
}

0 commit comments

Comments
 (0)