
Commit 566e587

Add support for custom OpenAI provider (#620)
* Add support for custom OpenAI provider
* changeset
* param cleanup
* evals config
* regression llm providers
* llm providers
* llm providers
* pls
* timeout
* pls sean
1 parent 1e49dee commit 566e587

File tree

9 files changed: +238 -60 lines


Diff for: .changeset/wide-oranges-yawn.md (+5)

@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": minor
+---
+
+You can now pass in an OpenAI instance as an `llmClient` to the Stagehand constructor! This allows you to use Stagehand with any OpenAI-compatible model, like Ollama, Gemini, etc., as well as OpenAI wrappers like Braintrust.
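
A minimal usage sketch of the new option, mirroring the examples/external_client.ts change further down in this commit; the gpt-4o-mini model name and the OPENAI_API_KEY environment variable are illustrative choices, not requirements:

import { Stagehand } from "@/dist";
import StagehandConfig from "@/stagehand.config";
import OpenAI from "openai";
import { CustomOpenAIClient } from "./external_clients/customOpenAI";

// Any OpenAI-compatible client works here; gpt-4o-mini against the default
// OpenAI endpoint is just an example -- point the OpenAI client at another
// baseURL (e.g. a local Ollama server) to use a different provider.
const stagehand = new Stagehand({
  ...StagehandConfig,
  llmClient: new CustomOpenAIClient({
    modelName: "gpt-4o-mini",
    client: new OpenAI({ apiKey: process.env.OPENAI_API_KEY }),
  }),
});

await stagehand.init();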

Diff for: .github/workflows/ci.yml (+57)

@@ -186,6 +186,63 @@ jobs:
       - name: Run E2E Tests (browserbase)
         run: npm run e2e:bb
 
+  run-regression-evals-llm-providers:
+    needs:
+      [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    outputs:
+      regression_llm_providers_score: ${{ steps.set-llm-providers-score.outputs.regression_llm_providers_score }}
+    env:
+      EVAL_MODELS: "gpt-4o-mini"
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+      HEADLESS: true
+      EVAL_ENV: browserbase
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install dependencies
+        run: |
+          rm -rf node_modules
+          rm -f package-lock.json
+          npm install
+
+      - name: Build Stagehand
+        run: npm run build
+
+      - name: Install Playwright browsers
+        run: npm exec playwright install --with-deps
+
+      - name: Run Regression Evals (llmProviders)
+        run: npm run evals category regression_llm_providers trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=llmProviders
+
+      - name: Save Regression llmProviders Results
+        run: mv eval-summary.json eval-summary-regression-llm-providers.json
+
+      - name: Log and Regression (llmProviders) Evals Performance
+        id: set-llm-providers-score
+        run: |
+          experimentNameRegressionLlmProviders=$(jq -r '.experimentName' eval-summary-regression-llm-providers.json)
+          regression_llm_providers_score=$(jq '.categories.regression_llm_providers' eval-summary-regression-llm-providers.json)
+          echo "regression_llm_providers category score: ${regression_llm_providers_score}%"
+          echo "View regression_llm_providers results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionLlmProviders}"
+          echo "regression_llm_providers_score=$regression_llm_providers_score" >> "$GITHUB_OUTPUT"
+
+          # Fail if regression_llm_providers_score is below 83%
+          if (( $(echo "${regression_llm_providers_score} < 83" | bc -l) )); then
+            echo "regression_llm_providers score is below 83%. Failing CI."
+            exit 1
+          fi
+
   run-regression-evals-dom-extract:
     needs:
       [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]

Diff for: evals/args.ts (+3)

@@ -63,6 +63,9 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
   "experimental",
   "text_extract",
   "targeted_extract",
+  "regression_llm_providers",
+  "regression_text_extract",
+  "regression_dom_extract",
 ];
 
 // Finally, interpret leftover arguments to see if user typed "category X" or a single eval name

Diff for: evals/evals.config.json (+6 -2)

@@ -87,11 +87,15 @@
   },
   {
     "name": "hn_aisdk",
-    "categories": ["combination", "regression_dom_extract"]
+    "categories": ["regression_llm_providers"]
   },
   {
     "name": "hn_langchain",
-    "categories": ["combination", "regression_dom_extract"]
+    "categories": ["regression_llm_providers"]
+  },
+  {
+    "name": "hn_customOpenAI",
+    "categories": ["regression_llm_providers"]
   },
   {
     "name": "apple",

Diff for: evals/tasks/hn_customOpenAI.ts (+116)

@@ -0,0 +1,116 @@
+import { EvalFunction } from "@/types/evals";
+import { initStagehand } from "@/evals/initStagehand";
+import { z } from "zod";
+import { CustomOpenAIClient } from "@/examples/external_clients/customOpenAI";
+import OpenAI from "openai";
+
+export const hn_customOpenAI: EvalFunction = async ({ logger }) => {
+  const { stagehand, initResponse } = await initStagehand({
+    logger,
+    llmClient: new CustomOpenAIClient({
+      modelName: "gpt-4o-mini",
+      client: new OpenAI({
+        apiKey: process.env.OPENAI_API_KEY,
+      }),
+    }),
+  });
+
+  const { debugUrl, sessionUrl } = initResponse;
+
+  await stagehand.page.goto("https://news.ycombinator.com");
+
+  let { story } = await stagehand.page.extract({
+    schema: z.object({
+      story: z.string().describe("the title of the top story on the page"),
+    }),
+  });
+  // remove the (url) part of the story title
+  story = story.split(" (")[0];
+
+  const expectedStoryElement = await stagehand.page.$(
+    "xpath=/html/body/center/table/tbody/tr[3]/td/table/tbody/tr[1]/td[3]/span/a",
+  );
+  // remove the (url) part of the story title
+  const expectedStory = (await expectedStoryElement?.textContent())?.split(
+    " (",
+  )?.[0];
+
+  if (!expectedStory) {
+    logger.error({
+      message: "Could not find expected story element",
+      level: 0,
+    });
+    return {
+      _success: false,
+      error: "Could not find expected story element",
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  }
+
+  if (story !== expectedStory) {
+    logger.error({
+      message: "Extracted story does not match expected story",
+      level: 0,
+      auxiliary: {
+        expected: {
+          value: expectedStory,
+          type: "string",
+        },
+        actual: {
+          value: story,
+          type: "string",
+        },
+      },
+    });
+    return {
+      _success: false,
+      error: "Extracted story does not match expected story",
+      expectedStory,
+      actualStory: story,
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  }
+
+  await stagehand.page.act("Click on the 'new' tab");
+
+  if (stagehand.page.url() !== "https://news.ycombinator.com/newest") {
+    logger.error({
+      message: "Page did not navigate to the 'new' tab",
+      level: 0,
+      auxiliary: {
+        expected: {
+          value: "https://news.ycombinator.com/newest",
+          type: "string",
+        },
+        actual: {
+          value: stagehand.page.url(),
+          type: "string",
+        },
+      },
+    });
+    return {
+      _success: false,
+      error: "Page did not navigate to the 'new' tab",
+      expectedUrl: "https://news.ycombinator.com/newest",
+      actualUrl: stagehand.page.url(),
+      debugUrl,
+      sessionUrl,
+      logs: logger.getLogs(),
+    };
+  }
+
+  await stagehand.close();
+
+  return {
+    _success: true,
+    expectedStory,
+    actualStory: story,
+    debugUrl,
+    sessionUrl,
+    logs: logger.getLogs(),
+  };
+};

Diff for: examples/actionable_observe_example.ts (+1 -1)

@@ -59,7 +59,7 @@ async function example() {
     console.log("✅ Success! we made it to the correct page");
   } else {
     console.log(
-      "❌ Whoops, looks like we didnt make it to the correct page. " +
+      "❌ Whoops, looks like we didn't make it to the correct page. " +
         "\nThanks for testing out this new Stagehand feature!" +
         "\nReach us on Slack if you have any feedback/questions/suggestions!",
     );

Diff for: examples/external_client.ts (+16 -13)

@@ -1,31 +1,34 @@
 import { Stagehand } from "@/dist";
 import { z } from "zod";
-import { OllamaClient } from "./external_clients/ollama";
+import { CustomOpenAIClient } from "./external_clients/customOpenAI";
 import StagehandConfig from "@/stagehand.config";
+import OpenAI from "openai";
 
 async function example() {
   const stagehand = new Stagehand({
     ...StagehandConfig,
-    llmClient: new OllamaClient({
-      modelName: "llama3.2",
+    llmClient: new CustomOpenAIClient({
+      modelName: "gpt-4o-mini",
+      client: new OpenAI({
+        apiKey: process.env.OPENAI_API_KEY,
+      }),
     }),
   });
 
   await stagehand.init();
   await stagehand.page.goto("https://news.ycombinator.com");
+  await stagehand.page.act("click on the 'new' link");
 
   const headlines = await stagehand.page.extract({
-    instruction: "Extract only 3 stories from the Hacker News homepage.",
+    instruction: "Extract the top 3 stories from the Hacker News homepage.",
     schema: z.object({
-      stories: z
-        .array(
-          z.object({
-            title: z.string(),
-            url: z.string(),
-            points: z.number(),
-          }),
-        )
-        .length(3),
+      stories: z.array(
+        z.object({
+          title: z.string(),
+          url: z.string(),
+          points: z.number(),
+        }),
+      ),
     }),
   });
