|
10 | 10 |
|
11 | 11 | env:
|
12 | 12 | EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
|
13 |
| - EVAL_CATEGORIES: "observe,act,combination,extract,text_extract" |
| 13 | + EVAL_CATEGORIES: "observe,act,combination,extract,text_extract,targeted_extract" |
14 | 14 |
|
15 | 15 | concurrency:
|
16 | 16 | group: ${{ github.ref }}
|
@@ -186,64 +186,13 @@ jobs:
|
186 | 186 | - name: Run E2E Tests (browserbase)
|
187 | 187 | run: npm run e2e:bb
|
188 | 188 |
|
189 |
| - run-regression-evals-llm-providers: |
| 189 | + run-regression-evals: |
190 | 190 | needs:
|
191 | 191 | [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
|
192 | 192 | runs-on: ubuntu-latest
|
193 | 193 | timeout-minutes: 9
|
194 | 194 | outputs:
|
195 |
| - regression_llm_providers_score: ${{ steps.set-llm-providers-score.outputs.regression_llm_providers_score }} |
196 |
| - env: |
197 |
| - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} |
198 |
| - BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} |
199 |
| - BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} |
200 |
| - BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} |
201 |
| - HEADLESS: true |
202 |
| - EVAL_ENV: browserbase |
203 |
| - EVAL_MODELS: "gpt-4o-mini" |
204 |
| - steps: |
205 |
| - - name: Check out repository code |
206 |
| - uses: actions/checkout@v4 |
207 |
| - |
208 |
| - - name: Set up Node.js |
209 |
| - uses: actions/setup-node@v4 |
210 |
| - with: |
211 |
| - node-version: "20" |
212 |
| - |
213 |
| - - name: Install dependencies |
214 |
| - run: | |
215 |
| - rm -rf node_modules |
216 |
| - rm -f package-lock.json |
217 |
| - npm install |
218 |
| -
|
219 |
| - - name: Build Stagehand |
220 |
| - run: npm run build |
221 |
| - |
222 |
| - - name: Install Playwright browsers |
223 |
| - run: npm exec playwright install --with-deps |
224 |
| - |
225 |
| - - name: Run Regression Evals (llmProviders) |
226 |
| - run: npm run evals category regression_llm_providers trials=2 concurrency=8 env=BROWSERBASE |
227 |
| - |
228 |
| - - name: Save Regression llmProviders Results |
229 |
| - run: mv eval-summary.json eval-summary-regression-llm-providers.json |
230 |
| - |
231 |
| - - name: Log and Regression (llmProviders) Evals Performance |
232 |
| - id: set-llm-providers-score |
233 |
| - run: | |
234 |
| - experimentNameRegressionLlmProviders=$(jq -r '.experimentName' eval-summary-regression-llm-providers.json) |
235 |
| - regression_llm_providers_score=$(jq '.categories.regression_llm_providers' eval-summary-regression-llm-providers.json) |
236 |
| - echo "regression_llm_providers category score: ${regression_llm_providers_score}%" |
237 |
| - echo "View regression_llm_providers results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionLlmProviders}" |
238 |
| - echo "regression_llm_providers_score=$regression_llm_providers_score" >> "$GITHUB_OUTPUT" |
239 |
| -
|
240 |
| - run-regression-evals-dom-extract: |
241 |
| - needs: |
242 |
| - [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals] |
243 |
| - runs-on: ubuntu-latest |
244 |
| - timeout-minutes: 9 |
245 |
| - outputs: |
246 |
| - regression_dom_score: ${{ steps.set-dom-score.outputs.regression_dom_score }} |
| 195 | + regression_score: ${{ steps.set-regression-score.outputs.regression_score }} |
247 | 196 | env:
|
248 | 197 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
249 | 198 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
@@ -273,99 +222,27 @@ jobs:
|
273 | 222 | - name: Install Playwright browsers
|
274 | 223 | run: npm exec playwright install --with-deps
|
275 | 224 |
|
276 |
| - - name: Run Regression Evals (domExtract) |
277 |
| - run: npm run evals category regression_dom_extract trials=2 concurrency=8 env=BROWSERBASE -- --extract-method=domExtract |
| 225 | + - name: Run Regression Evals |
| 226 | + run: npm run evals category regression trials=2 concurrency=20 env=BROWSERBASE |
278 | 227 |
|
279 |
| - - name: Save Regression domExtract Results |
280 |
| - run: mv eval-summary.json eval-summary-regression-dom.json |
281 |
| - |
282 |
| - - name: Log and Regression (domExtract) Evals Performance |
283 |
| - id: set-dom-score |
| 228 | + - name: Log Regression Evals Performance |
284 | 229 | run: |
|
285 |
| - experimentNameRegressionDom=$(jq -r '.experimentName' eval-summary-regression-dom.json) |
286 |
| - regression_dom_score=$(jq '.categories.regression_dom_extract' eval-summary-regression-dom.json) |
287 |
| - echo "regression_dom_extract category score: ${regression_dom_score}%" |
288 |
| - echo "View regression_dom_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionDom}" |
289 |
| - echo "regression_dom_score=$regression_dom_score" >> "$GITHUB_OUTPUT" |
290 |
| -
|
291 |
| - run-regression-evals-text-extract: |
292 |
| - needs: |
293 |
| - [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals] |
294 |
| - runs-on: ubuntu-latest |
295 |
| - timeout-minutes: 9 |
296 |
| - outputs: |
297 |
| - regression_text_score: ${{ steps.set-text-score.outputs.regression_text_score }} |
298 |
| - env: |
299 |
| - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} |
300 |
| - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} |
301 |
| - BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} |
302 |
| - BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} |
303 |
| - BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} |
304 |
| - HEADLESS: true |
305 |
| - EVAL_ENV: browserbase |
306 |
| - steps: |
307 |
| - - name: Check out repository code |
308 |
| - uses: actions/checkout@v4 |
309 |
| - |
310 |
| - - name: Set up Node.js |
311 |
| - uses: actions/setup-node@v4 |
312 |
| - with: |
313 |
| - node-version: "20" |
314 |
| - |
315 |
| - - name: Install dependencies |
316 |
| - run: | |
317 |
| - rm -rf node_modules |
318 |
| - rm -f package-lock.json |
319 |
| - npm install |
320 |
| -
|
321 |
| - - name: Build Stagehand |
322 |
| - run: npm run build |
323 |
| - |
324 |
| - - name: Install Playwright browsers |
325 |
| - run: npm exec playwright install --with-deps |
326 |
| - |
327 |
| - - name: Run Regression Evals (textExtract) |
328 |
| - run: npm run evals category regression_text_extract trials=2 concurrency=8 env=BROWSERBASE -- --extract-method=textExtract |
329 |
| - |
330 |
| - - name: Save Regression textExtract Results |
331 |
| - run: mv eval-summary.json eval-summary-regression-text.json |
332 |
| - |
333 |
| - - name: Log Regression (textExtract) Evals Performance |
334 |
| - id: set-text-score |
335 |
| - run: | |
336 |
| - experimentNameRegressionText=$(jq -r '.experimentName' eval-summary-regression-text.json) |
337 |
| - regression_text_score=$(jq '.categories.regression_text_extract' eval-summary-regression-text.json) |
338 |
| - echo "regression_text_extract category score: ${regression_text_score}%" |
339 |
| - echo "View regression_text_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionText}" |
340 |
| - echo "regression_text_score=$regression_text_score" >> "$GITHUB_OUTPUT" |
341 |
| -
|
342 |
| - check-regression-evals-score: |
343 |
| - needs: |
344 |
| - [ |
345 |
| - run-regression-evals-text-extract, |
346 |
| - run-regression-evals-dom-extract, |
347 |
| - run-regression-evals-llm-providers, |
348 |
| - ] |
349 |
| - runs-on: ubuntu-latest |
350 |
| - timeout-minutes: 5 |
351 |
| - steps: |
352 |
| - - name: Compare Overall Regression Evals Score |
353 |
| - run: | |
354 |
| - regression_dom_score="${{ needs.run-regression-evals-dom-extract.outputs.regression_dom_score }}" |
355 |
| - regression_text_score="${{ needs.run-regression-evals-text-extract.outputs.regression_text_score }}" |
356 |
| - regression_llm_providers_score="${{ needs.run-regression-evals-llm-providers.outputs.regression_llm_providers_score }}" |
357 |
| -
|
358 |
| - overall_score=$(echo "(${regression_dom_score} + ${regression_text_score} + ${regression_llm_providers_score}) / 3" | bc -l) |
359 |
| - echo "Overall regression score: ${overall_score}%" |
360 |
| -
|
361 |
| - # Fail if overall score is below 90% |
362 |
| - if (( $(echo "${overall_score} < 90" | bc -l) )); then |
363 |
| - echo "Overall regression score is below 90%. Failing CI." |
| 230 | + experimentName=$(jq -r '.experimentName' eval-summary.json) |
| 231 | + echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" |
| 232 | + if [ -f eval-summary.json ]; then |
| 233 | + regression_score=$(jq '.categories.regression' eval-summary.json) |
| 234 | + echo "Regression category score: $regression_score%" |
| 235 | + if (( $(echo "$regression_score < 90" | bc -l) )); then |
| 236 | + echo "Regression category score is below 90%. Failing CI." |
| 237 | + exit 1 |
| 238 | + fi |
| 239 | + else |
| 240 | + echo "Eval summary not found for regression category. Failing CI." |
364 | 241 | exit 1
|
365 | 242 | fi
|
366 | 243 |
|
367 | 244 | run-combination-evals:
|
368 |
| - needs: [check-regression-evals-score, determine-evals] |
| 245 | + needs: [run-regression-evals, determine-evals] |
369 | 246 | runs-on: ubuntu-latest
|
370 | 247 | timeout-minutes: 40
|
371 | 248 | env:
|
|
0 commit comments