operator handler (#586)

sameelarif · kamath · web-flow · commit c57dc19c448b · 2025-03-18T00:31:55.000-07:00
* operator handler * changeset * Update young-dots-fry.md * better task memory & cleaner code * provide extraction result in reasoning * remove action log * make agent config optional * increase max steps * update close logic * add operator example * made handler messages private * update operator (#596) --------- Co-authored-by: Anirudh Kamath <github@kamath.io>
diff --git a/.changeset/young-dots-fry.md b/.changeset/young-dots-fry.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": minor
+---
+
+Added native Stagehand agentic loop functionality. This allows you to build agentic workflows with a single prompt without using a computer-use model. To try it out, create a `stagehand.agent` without passing in a provider.
diff --git a/examples/operator-example.ts b/examples/operator-example.ts
@@ -0,0 +1,47 @@
+import { LogLine, Stagehand } from "@/dist";
+import dotenv from "dotenv";
+import StagehandConfig from "@/stagehand.config";
+import chalk from "chalk";
+
+// Load environment variables
+dotenv.config();
+
+const INSTRUCTION =
+  "Go to Google Japan and interact with it in Japanese. Tell me (in English) an authentic recipe that I can make with ingredients found in American grocery stores.";
+
+/**
+ * Runs the operator example end to end: boots Stagehand with a console
+ * logger, asks the agent to carry out INSTRUCTION, prints the outcome, and
+ * always closes Stagehand afterwards.
+ */
+async function main() {
+  console.log(`\n${chalk.bold("Stagehand 🤘 Operator Example")}\n`);
+
+  // Print each log line as a compact { level, message, timestamp } object.
+  const logToConsole = ({ level, message, timestamp }: LogLine) => {
+    console.log({ level, message, timestamp });
+  };
+
+  // Initialize Stagehand
+  const stagehand = new Stagehand({
+    ...StagehandConfig,
+    logger: logToConsole,
+  });
+
+  await stagehand.init();
+
+  try {
+    // agent() is called without any config here.
+    const operatorAgent = stagehand.agent();
+
+    // Execute the agent
+    console.log(`${chalk.cyan("↳")} Instruction: ${INSTRUCTION}`);
+
+    const outcome = await operatorAgent.execute({
+      instruction: INSTRUCTION,
+      maxSteps: 20,
+    });
+
+    console.log(`${chalk.green("✓")} Execution complete`);
+    console.log(`${chalk.yellow("⤷")} Result:`);
+    // Full structured result first, then just the human-readable message.
+    console.log(JSON.stringify(outcome, null, 2));
+    console.log(chalk.white(outcome.message));
+  } catch (error) {
+    console.log(`${chalk.red("✗")} Error: ${error}`);
+  } finally {
+    // Runs on both success and failure of the agent run.
+    await stagehand.close();
+  }
+}
+
+main();
diff --git a/lib/handlers/operatorHandler.ts b/lib/handlers/operatorHandler.ts
@@ -0,0 +1,221 @@
+import { AgentAction, AgentExecuteOptions, AgentResult } from "@/types/agent";
+import { LogLine } from "@/types/log";
+import {
+  OperatorResponse,
+  operatorResponseSchema,
+  OperatorSummary,
+  operatorSummarySchema,
+} from "@/types/operator";
+import { LLMParsedResponse } from "../inference";
+import { ChatMessage, LLMClient } from "../llm/LLMClient";
+import { buildOperatorSystemPrompt } from "../prompt";
+import { StagehandPage } from "../StagehandPage";
+import { ObserveResult } from "@/types/stagehand";
+
+/**
+ * Drives a step-by-step agent loop on a Stagehand page.
+ *
+ * On every step the handler sends the LLM the conversation so far plus either
+ * a screenshot of the current page or a hint that no page is loaded, asks for
+ * the next action, performs it, and records it. The loop ends when the model
+ * returns a "close" action or the step budget is exhausted; a final LLM call
+ * then summarizes the run into the result message.
+ */
+export class StagehandOperatorHandler {
+  private stagehandPage: StagehandPage;
+  private logger: (message: LogLine) => void;
+  private llmClient: LLMClient;
+  // Reset at the start of every execute() call; initialized here so the field
+  // is never undefined before the first run.
+  private messages: ChatMessage[] = [];
+
+  /**
+   * @param stagehandPage Page wrapper whose `page` is used to navigate, observe, act and extract.
+   * @param logger Log sink forwarded to every LLM call.
+   * @param llmClient Client used for both next-step and summary completions.
+   */
+  constructor(
+    stagehandPage: StagehandPage,
+    logger: (message: LogLine) => void,
+    llmClient: LLMClient,
+  ) {
+    this.stagehandPage = stagehandPage;
+    this.logger = logger;
+    this.llmClient = llmClient;
+  }
+
+  /**
+   * Runs the agent loop for a single instruction.
+   *
+   * @param instructionOrOptions Either the instruction text or an options
+   *   object carrying `instruction` and an optional `maxSteps` (defaults to 10).
+   * @returns The recorded actions, an LLM-written summary as `message`, and
+   *   `completed` taken from the last action's `taskCompleted` flag
+   *   (`false` when no action was recorded).
+   * @throws If an "act" step yields no observe result, an "extract" step
+   *   yields no data, or the model returns an unknown method.
+   */
+  public async execute(
+    instructionOrOptions: string | AgentExecuteOptions,
+  ): Promise<AgentResult> {
+    const options =
+      typeof instructionOrOptions === "string"
+        ? { instruction: instructionOrOptions }
+        : instructionOrOptions;
+
+    this.messages = [buildOperatorSystemPrompt(options.instruction)];
+    let completed = false;
+    let currentStep = 0;
+    const maxSteps = options.maxSteps || 10;
+    const actions: AgentAction[] = [];
+
+    while (!completed && currentStep < maxSteps) {
+      const url = this.stagehandPage.page.url();
+
+      if (!url || url === "about:blank") {
+        this.messages.push({
+          role: "user",
+          content: [
+            {
+              type: "text",
+              text: "No page is currently loaded. The first step should be a 'goto' action to navigate to a URL.",
+            },
+          ],
+        });
+      } else {
+        const screenshot = await this.stagehandPage.page.screenshot({
+          type: "png",
+          fullPage: false,
+        });
+
+        const base64Image = screenshot.toString("base64");
+
+        const screenshotText = `Here is a screenshot of the current page (URL: ${url}):`;
+
+        // Only mention previous actions when there are some; an empty
+        // "Previous actions were:" header gives the model nothing to use.
+        const messageText =
+          actions.length > 0
+            ? `Previous actions were: ${actions
+                .map((action) => this.describeAction(action))
+                .join("\n")}\n\n${screenshotText}`
+            : screenshotText;
+
+        this.messages.push({
+          role: "user",
+          content: [
+            {
+              type: "text",
+              text: messageText,
+            },
+            {
+              type: "image_url",
+              image_url: { url: `data:image/png;base64,${base64Image}` },
+            },
+          ],
+        });
+      }
+
+      const result = await this.getNextStep(currentStep);
+
+      if (result.method === "close") {
+        completed = true;
+      }
+
+      // For "act", resolve the instruction to a concrete observe result first;
+      // only the first candidate is used.
+      let playwrightArguments: ObserveResult | undefined;
+      if (result.method === "act") {
+        [playwrightArguments] = await this.stagehandPage.page.observe(
+          result.parameters,
+        );
+      }
+      let extractionResult: unknown | undefined;
+      if (result.method === "extract") {
+        extractionResult = await this.stagehandPage.page.extract(
+          result.parameters,
+        );
+      }
+
+      await this.executeAction(result, playwrightArguments, extractionResult);
+
+      actions.push({
+        type: result.method,
+        reasoning: result.reasoning,
+        taskCompleted: result.taskComplete,
+        parameters: result.parameters,
+        playwrightArguments,
+        extractionResult,
+      });
+
+      currentStep++;
+    }
+
+    // The loop may record nothing (e.g. a negative maxSteps), so guard the
+    // lookup instead of dereferencing a missing last element.
+    const lastAction =
+      actions.length > 0 ? actions[actions.length - 1] : undefined;
+
+    return {
+      success: true,
+      message: await this.getSummary(options.instruction),
+      actions,
+      completed: Boolean(lastAction?.taskCompleted),
+    };
+  }
+
+  /**
+   * Renders one recorded action as a single history line for the next prompt.
+   * Extraction results are serialized as JSON so that object results are not
+   * flattened to "[object Object]".
+   */
+  private describeAction(action: AgentAction): string {
+    let result = "";
+    if (action.type === "act") {
+      const args = action.playwrightArguments as ObserveResult | undefined;
+      // Tolerate an observe result that carries no argument list.
+      const actArguments = args?.arguments ?? [];
+      const argumentText =
+        actArguments.length > 0
+          ? `with arguments: ${actArguments.map((arg) => `"${arg}"`).join(", ")}`
+          : "";
+      result = `Performed a "${args?.method}" action ${argumentText} on "${args?.description}"`;
+    } else if (action.type === "extract") {
+      result = `Extracted data: ${JSON.stringify(action.extractionResult)}`;
+    }
+    return `[${action.type}] ${action.reasoning}. Result: ${result}`;
+  }
+
+  /**
+   * Asks the LLM for the next action, given the accumulated messages.
+   *
+   * @param currentStep Zero-based step index, used only to build the request id.
+   */
+  private async getNextStep(currentStep: number): Promise<OperatorResponse> {
+    const { data: response } =
+      (await this.llmClient.createChatCompletion<OperatorResponse>({
+        options: {
+          messages: this.messages,
+          response_model: {
+            name: "operatorResponseSchema",
+            schema: operatorResponseSchema,
+          },
+          requestId: `operator-step-${currentStep}`,
+        },
+        logger: this.logger,
+      })) as LLMParsedResponse<OperatorResponse>;
+
+    return response;
+  }
+
+  /**
+   * Asks the LLM to answer the original instruction from the conversation so
+   * far. The extra prompt is sent with this request only and is not appended
+   * to the stored messages.
+   *
+   * @param goal The original instruction text.
+   * @returns The `answer` field of the model's summary response.
+   */
+  private async getSummary(goal: string): Promise<string> {
+    const { data: response } =
+      (await this.llmClient.createChatCompletion<OperatorSummary>({
+        options: {
+          messages: [
+            ...this.messages,
+            {
+              role: "user",
+              content: [
+                {
+                  type: "text",
+                  text: `Now use the steps taken to answer the original instruction of ${goal}.`,
+                },
+              ],
+            },
+          ],
+          response_model: {
+            name: "operatorSummarySchema",
+            schema: operatorSummarySchema,
+          },
+          requestId: "operator-summary",
+        },
+        logger: this.logger,
+      })) as LLMParsedResponse<OperatorSummary>;
+
+    return response.answer;
+  }
+
+  /**
+   * Performs a single action chosen by the model.
+   *
+   * @param action The model's response; `method` selects the operation and
+   *   `parameters` carries its argument (URL for "goto", duration for "wait").
+   * @param playwrightArguments Observe result to replay; required for "act".
+   * @param extractionResult Data already extracted; required for "extract".
+   * @returns The extraction result for "extract", otherwise `undefined`.
+   * @throws If a required argument is missing or the method is unknown.
+   */
+  private async executeAction(
+    action: OperatorResponse,
+    playwrightArguments?: ObserveResult,
+    extractionResult?: unknown,
+  ): Promise<unknown> {
+    const { method, parameters } = action;
+    const page = this.stagehandPage.page;
+
+    // "close" only ends the loop in execute(); there is nothing to run.
+    if (method === "close") {
+      return;
+    }
+
+    switch (method) {
+      case "act":
+        if (!playwrightArguments) {
+          throw new Error("No playwright arguments provided");
+        }
+        await page.act(playwrightArguments);
+        break;
+      case "extract":
+        if (!extractionResult) {
+          throw new Error("No extraction result provided");
+        }
+        return extractionResult;
+      case "goto":
+        await page.goto(parameters, { waitUntil: "load" });
+        break;
+      case "wait": {
+        // Parse as base 10 and fall back to no wait when the model supplies a
+        // non-numeric duration, rather than passing NaN along.
+        const waitMs = Number.parseInt(parameters, 10);
+        await page.waitForTimeout(Number.isNaN(waitMs) ? 0 : waitMs);
+        break;
+      }
+      case "navback":
+        await page.goBack();
+        break;
+      case "refresh":
+        await page.reload();
+        break;
+      default:
+        throw new Error(`Unknown action: ${method}`);
+    }
+  }
+}
diff --git a/lib/index.ts b/lib/index.ts
@@ -39,6 +39,7 @@ import { logLineToString, isRunningInBun } from "./utils";
 import { ApiResponse, ErrorResponse } from "@/types/api";
 import { AgentExecuteOptions, AgentResult } from "../types/agent";
 import { StagehandAgentHandler } from "./handlers/agentHandler";
+import { StagehandOperatorHandler } from "./handlers/operatorHandler";
 
 dotenv.config({ path: ".env" });
 
@@ -818,18 +819,35 @@ export class Stagehand {
    * Create an agent instance that can be executed with different instructions
    * @returns An agent instance with execute() method
    */
-  agent(options: AgentConfig): {
+  agent(options?: AgentConfig): {
     execute: (
       instructionOrOptions: string | AgentExecuteOptions,
     ) => Promise<AgentResult>;
   } {
+    if (!options || !options.provider) {
+      // use open operator agent
+      return {
+        execute: async (instructionOrOptions: string | AgentExecuteOptions) => {
+          return new StagehandOperatorHandler(
+            this.stagehandPage,
+            this.logger,
+            this.llmClient,
+          ).execute(instructionOrOptions);
+        },
+      };
+    }
+
     const agentHandler = new StagehandAgentHandler(
       this.stagehandPage,
       this.logger,
       {
         modelName: options.model,
         clientOptions: options.options,
-        userProvidedInstructions: options.instructions,
+        userProvidedInstructions:
+          options.instructions ??
+          `You are a helpful assistant that can use a web browser.
+      You are currently on the following page: ${this.stagehandPage.page.url()}.
+      Do not ask follow up questions, the user will trust your judgement.`,
         agentType: options.provider,
       },
     );
@@ -889,5 +907,6 @@ export * from "../types/model";
 export * from "../types/page";
 export * from "../types/playwright";
 export * from "../types/stagehand";
+export * from "../types/operator";
 export * from "../types/agent";
 export * from "./llm/LLMClient";
diff --git a/lib/inference.ts b/lib/inference.ts
@@ -48,7 +48,7 @@ interface LLMUsage {
 /**
  * For calls that use a schema: the LLMClient may return { data: T; usage?: LLMUsage }
  */
-interface LLMParsedResponse<T> {
+export interface LLMParsedResponse<T> {
   data: T;
   usage?: LLMUsage;
 }
diff --git a/lib/prompt.ts b/lib/prompt.ts
@@ -411,3 +411,24 @@ export function buildActObservePrompt(
 
   return instruction;
 }
+
+/**
+ * Builds the system message for the operator agent loop.
+ *
+ * @param goal The user's instruction, embedded verbatim under "Your current goal".
+ * @returns A `system`-role chat message holding the agent guidelines.
+ */
+export function buildOperatorSystemPrompt(goal: string): ChatMessage {
+  const content = `You are a general-purpose agent whose job is to accomplish the user's goal across multiple model calls by running actions on the page.
+
+You will be given a goal and a list of steps that have been taken so far. Your job is to determine if either the user's goal has been completed or if there are still steps that need to be taken.
+
+# Your current goal
+${goal}
+
+# Important guidelines
+1. Break down complex actions into individual atomic steps
+2. For \`act\` commands, use only one action at a time, such as:
+   - Single click on a specific element
+   - Type into a single input field
+   - Select a single option
+3. Avoid combining multiple actions in one instruction
+4. If multiple actions are needed, they should be separate steps`;
+
+  return { role: "system", content };
+}
diff --git a/package.json b/package.json
@@ -9,6 +9,7 @@
     "2048": "npm run build && tsx examples/2048.ts",
     "popup": "npm run build && tsx examples/popup.ts",
     "cua": "npm run build && tsx examples/cua-example.ts",
+    "operator": "npm run build && tsx examples/operator-example.ts",
     "example": "npm run build && tsx examples/example.ts",
     "langchain": "npm run build && tsx examples/langchain.ts",
     "debug-url": "npm run build && tsx examples/debugUrl.ts",
diff --git a/types/operator.ts b/types/operator.ts

Original file line number	Diff line number	Diff line change
`@@ -48,7 +48,7 @@ interface LLMUsage {`
`48`	`48`	`/**`
`49`	`49`	`* For calls that use a schema: the LLMClient may return { data: T; usage?: LLMUsage }`
`50`	`50`	`*/`
`51`		`-interface LLMParsedResponse<T> {`
	`51`	`+export interface LLMParsedResponse<T> {`
`52`	`52`	`data: T;`
`53`	`53`	`usage?: LLMUsage;`
`54`	`54`	`}`