
Commit 566e587

Add support for custom OpenAI provider (#620)
* Add support for custom OpenAI provider
* changeset
* param cleanup
* evals config
* regression llm providers
* llm providers
* llm providers
* pls
* timeout
* pls sean
1 parent 1e49dee commit 566e587

File tree

9 files changed: +238 -60 lines


Diff for: .changeset/wide-oranges-yawn.md (+5)

@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": minor
+---
+
+You can now pass in an OpenAI instance as an `llmClient` to the Stagehand constructor! This allows you to use Stagehand with any OpenAI-compatible model, like Ollama, Gemini, etc., as well as OpenAI wrappers like Braintrust.
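
A minimal usage sketch of the new option, mirroring the examples/external_client.ts change further down in this commit; the gpt-4o-mini model name and the OPENAI_API_KEY environment variable are illustrative choices, not requirements:

import { Stagehand } from "@/dist";
import StagehandConfig from "@/stagehand.config";
import OpenAI from "openai";
import { CustomOpenAIClient } from "./external_clients/customOpenAI";

// Any OpenAI-compatible client works here; gpt-4o-mini against the default
// OpenAI endpoint is just an example -- point the OpenAI client at another
// baseURL (e.g. a local Ollama server) to use a different provider.
const stagehand = new Stagehand({
  ...StagehandConfig,
  llmClient: new CustomOpenAIClient({
    modelName: "gpt-4o-mini",
    client: new OpenAI({ apiKey: process.env.OPENAI_API_KEY }),
  }),
});

await stagehand.init();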

Diff for: .github/workflows/ci.yml (+57)

@@ -186,6 +186,63 @@ jobs:
       - name: Run E2E Tests (browserbase)
         run: npm run e2e:bb
 
+  run-regression-evals-llm-providers:
+    needs:
+      [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    outputs:
+      regression_llm_providers_score: ${{ steps.set-llm-providers-score.outputs.regression_llm_providers_score }}
+    env:
+      EVAL_MODELS: "gpt-4o-mini"
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+      HEADLESS: true
+      EVAL_ENV: browserbase
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install dependencies
+        run: |
+          rm -rf node_modules
+          rm -f package-lock.json
+          npm install
+
+      - name: Build Stagehand
+        run: npm run build
+
+      - name: Install Playwright browsers
+        run: npm exec playwright install --with-deps
+
+      - name: Run Regression Evals (llmProviders)
+        run: npm run evals category regression_llm_providers trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=llmProviders
+
+      - name: Save Regression llmProviders Results
+        run: mv eval-summary.json eval-summary-regression-llm-providers.json
+
+      - name: Log and Regression (llmProviders) Evals Performance
+        id: set-llm-providers-score
+        run: |
+          experimentNameRegressionLlmProviders=$(jq -r '.experimentName' eval-summary-regression-llm-providers.json)
+          regression_llm_providers_score=$(jq '.categories.regression_llm_providers' eval-summary-regression-llm-providers.json)
+          echo "regression_llm_providers category score: ${regression_llm_providers_score}%"
+          echo "View regression_llm_providers results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionLlmProviders}"
+          echo "regression_llm_providers_score=$regression_llm_providers_score" >> "$GITHUB_OUTPUT"
+
+          # Fail if regression_llm_providers_score is below 83%
+          if (( $(echo "${regression_llm_providers_score} < 83" | bc -l) )); then
+            echo "regression_llm_providers score is below 83%. Failing CI."
+            exit 1
+          fi
+
   run-regression-evals-dom-extract:
     needs:
       [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]

Diff for: evals/args.ts (+3)

@@ -63,6 +63,9 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
   "experimental",
   "text_extract",
   "targeted_extract",
+  "regression_llm_providers",
+  "regression_text_extract",
+  "regression_dom_extract",
 ];
 
 // Finally, interpret leftover arguments to see if user typed "category X" or a single eval name

Diff for: evals/evals.config.json (+6 -2)

@@ -87,11 +87,15 @@
   },
   {
     "name": "hn_aisdk",
-    "categories": ["combination", "regression_dom_extract"]
+    "categories": ["regression_llm_providers"]
   },
   {
     "name": "hn_langchain",
-    "categories": ["combination", "regression_dom_extract"]
+    "categories": ["regression_llm_providers"]
+  },
+  {
+    "name": "hn_customOpenAI",
+    "categories": ["regression_llm_providers"]
   },
   {
     "name": "apple",

Diff for: evals/tasks/hn_customOpenAI.ts (+116)

@@ -0,0 +1,116 @@
+import { EvalFunction } from "@/types/evals";
+import { initStagehand } from "@/evals/initStagehand";
+import { z } from "zod";
+import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
+import OpenAI from "openai";
+
+export const hn_customOpenAI: EvalFunction = async ({ logger }) => {
+  const { stagehand, initResponse } = await initStagehand({
+    logger,
+    llmClient: new CustomOpenAIClient({
+      modelName: "gpt-4o-mini",
+      client: new OpenAI({
+        apiKey: process.env.OPENAI_API_KEY,
+      }),
+    }),
+  });
+
+  const { debugUrl, sessionUrl } = initResponse;
+
+  await stagehand.page.goto("https://news.ycombinator.com");
+
+  let { story } = await stagehand.page.extract({
+    schema: z.object({
+      story: z.string().describe("the title of the top story on the page"),
+    }),
+  });
+  // remove the (url) part of the story title
+  story = story.split(" (")[0];
+
+  const expectedStoryElement = await stagehand.page.$(
+    "xpath=/html/body/center/table/tbody/tr[3]/td/table/tbody/tr[1]/td[3]/span/a",
+  );
+  // remove the (url) part of the story title
+  const expectedStory = (await expectedStoryElement?.textContent())?.split(
+    " (",
+  )?.[0];
+
+  if (!expectedStory) {
+    logger.error({
+      message: "Could not find expected story element",
+      level: 0,
+    });
+    return {
+      _success: false,
+      error: "Could not find expected story element",
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  }
+
+  if (story !== expectedStory) {
+    logger.error({
+      message: "Extracted story does not match expected story",
+      level: 0,
+      auxiliary: {
+        expected: {
+          value: expectedStory,
+          type: "string",
+        },
+        actual: {
+          value: story,
+          type: "string",
+        },
+      },
+    });
+    return {
+      _success: false,
+      error: "Extracted story does not match expected story",
+      expectedStory,
+      actualStory: story,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  }
+
+  await stagehand.page.act("Click on the 'new' tab");
+
+  if (stagehand.page.url() !== "https://news.ycombinator.com/newest") {
+    logger.error({
+      message: "Page did not navigate to the 'new' tab",
+      level: 0,
+      auxiliary: {
+        expected: {
+          value: "https://news.ycombinator.com/newest",
+          type: "string",
+        },
+        actual: {
+          value: stagehand.page.url(),
+          type: "string",
+        },
+      },
+    });
+    return {
+      _success: false,
+      error: "Page did not navigate to the 'new' tab",
+      expectedUrl: "https://news.ycombinator.com/newest",
+      actualUrl: stagehand.page.url(),
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  }
+
+  await stagehand.close();
+
+  return {
+    _success: true,
+    expectedStory,
+    actualStory: story,
+    debugUrl,
+    sessionUrl,
+    logs: logger.getLogs(),
+  };
+};

Diff for: examples/actionable_observe_example.ts (+1 -1)

@@ -59,7 +59,7 @@ async function example() {
     console.log("✅ Success! we made it to the correct page");
   } else {
     console.log(
-      "❌ Whoops, looks like we didnt make it to the correct page. " +
+      "❌ Whoops, looks like we didn't make it to the correct page. " +
         "\nThanks for testing out this new Stagehand feature!" +
         "\nReach us on Slack if you have any feedback/questions/suggestions!",
     );

Diff for: examples/external_client.ts (+16 -13)

@@ -1,31 +1,34 @@
 import { Stagehand } from "@/dist";
 import { z } from "zod";
-import { OllamaClient } from "./external_clients/ollama";
+import { CustomOpenAIClient } from "./external_clients/customOpenAI";
 import StagehandConfig from "@/stagehand.config";
+import OpenAI from "openai";
 
 async function example() {
   const stagehand = new Stagehand({
     ...StagehandConfig,
-    llmClient: new OllamaClient({
-      modelName: "llama3.2",
+    llmClient: new CustomOpenAIClient({
+      modelName: "gpt-4o-mini",
+      client: new OpenAI({
+        apiKey: process.env.OPENAI_API_KEY,
+      }),
     }),
   });
 
   await stagehand.init();
   await stagehand.page.goto("https://news.ycombinator.com");
+  await stagehand.page.act("click on the 'new' link");
 
   const headlines = await stagehand.page.extract({
-    instruction: "Extract only 3 stories from the Hacker News homepage.",
+    instruction: "Extract the top 3 stories from the Hacker News homepage.",
     schema: z.object({
-      stories: z
-        .array(
-          z.object({
-            title: z.string(),
-            url: z.string(),
-            points: z.number(),
-          }),
-        )
-        .length(3),
+      stories: z.array(
+        z.object({
+          title: z.string(),
+          url: z.string(),
+          points: z.number(),
+        }),
+      ),
     }),
   });
