Skip to content

Commit c57dc19

Browse files
sameelarif and kamath authored
operator handler (#586)
* operator handler
* changeset
* Update young-dots-fry.md
* better task memory & cleaner code
* provide extraction result in reasoning
* remove action log
* make agent config optional
* increase max steps
* update close logic
* add operator example
* made handler messages private
* update operator (#596)

---------

Co-authored-by: Anirudh Kamath <[email protected]>
1 parent ba9efc5 commit c57dc19

File tree

8 files changed

+367
-3
lines changed

8 files changed

+367
-3
lines changed

Diff for: .changeset/young-dots-fry.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": minor
3+
---
4+
5+
Added native Stagehand agentic loop functionality. This allows you to build agentic workflows with a single prompt without using a computer-use model. To try it out, create a `stagehand.agent` without passing in a provider.

Diff for: examples/operator-example.ts

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { LogLine, Stagehand } from "@/dist";
2+
import dotenv from "dotenv";
3+
import StagehandConfig from "@/stagehand.config";
4+
import chalk from "chalk";
5+
6+
// Load environment variables
7+
dotenv.config();
8+
9+
const INSTRUCTION =
10+
"Go to Google Japan and interact with it in Japanese. Tell me (in English) an authentic recipe that I can make with ingredients found in American grocery stores.";
11+
12+
/**
 * Runs the operator example end to end.
 *
 * Creates a Stagehand instance from the shared config (with a logger that
 * prints level, message and timestamp), creates an agent without a provider,
 * executes INSTRUCTION with a 20-step budget, and prints the result.
 * Errors raised while the agent runs are caught and printed; the Stagehand
 * instance is closed in `finally` whether or not the run succeeded.
 */
async function main(): Promise<void> {
  console.log(`\n${chalk.bold("Stagehand 🤘 Operator Example")}\n`);

  // Initialize Stagehand
  const stagehand = new Stagehand({
    ...StagehandConfig,
    logger: ({ level, message, timestamp }: LogLine) => {
      console.log({ level, message, timestamp });
    },
  });

  // Deliberately outside the try block: if init fails there is nothing to
  // close, and the rejection is reported by the handler attached to main().
  await stagehand.init();

  try {
    const agent = stagehand.agent();

    // Execute the agent
    console.log(`${chalk.cyan("↳")} Instruction: ${INSTRUCTION}`);

    const result = await agent.execute({
      instruction: INSTRUCTION,
      maxSteps: 20,
    });

    console.log(`${chalk.green("✓")} Execution complete`);
    console.log(`${chalk.yellow("⤷")} Result:`);
    console.log(JSON.stringify(result, null, 2));
    console.log(chalk.white(result.message));
  } catch (error) {
    console.log(`${chalk.red("✗")} Error: ${error}`);
  } finally {
    await stagehand.close();
  }
}

// Failures outside the try block above (init, or close inside `finally`)
// reject main(); handle them explicitly instead of leaving a floating promise.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});

Diff for: lib/handlers/operatorHandler.ts

+221
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
import { AgentAction, AgentExecuteOptions, AgentResult } from "@/types/agent";
2+
import { LogLine } from "@/types/log";
3+
import {
4+
OperatorResponse,
5+
operatorResponseSchema,
6+
OperatorSummary,
7+
operatorSummarySchema,
8+
} from "@/types/operator";
9+
import { LLMParsedResponse } from "../inference";
10+
import { ChatMessage, LLMClient } from "../llm/LLMClient";
11+
import { buildOperatorSystemPrompt } from "../prompt";
12+
import { StagehandPage } from "../StagehandPage";
13+
import { ObserveResult } from "@/types/stagehand";
14+
15+
/**
 * Drives an agent loop using the configured LLM client.
 *
 * On every step the handler sends the model a summary of the actions taken so
 * far plus a screenshot of the current page, asks it for the next step, and
 * executes that step through the Stagehand page. The loop ends when the model
 * answers with a "close" step or the step budget is exhausted, after which the
 * model is asked to summarise the run as an answer to the instruction.
 */
export class StagehandOperatorHandler {
  private stagehandPage: StagehandPage;
  private logger: (message: LogLine) => void;
  private llmClient: LLMClient;
  // Conversation sent to the model; reset at the start of every execute() call.
  private messages: ChatMessage[] = [];

  /**
   * @param stagehandPage - Page wrapper used for screenshots, navigation and act/observe/extract calls.
   * @param logger - Logger forwarded to every LLM request.
   * @param llmClient - Client used to request the next step and the final summary.
   */
  constructor(
    stagehandPage: StagehandPage,
    logger: (message: LogLine) => void,
    llmClient: LLMClient,
  ) {
    this.stagehandPage = stagehandPage;
    this.logger = logger;
    this.llmClient = llmClient;
  }

  /**
   * Runs the agent loop for one instruction.
   *
   * @param instructionOrOptions - Either the instruction itself or an options
   *   object carrying `instruction` and an optional `maxSteps`.
   * @returns The list of actions taken, the model's summary as `message`, and
   *   `completed` reflecting the `taskComplete` flag of the last step
   *   (`false` when no step was taken).
   * @throws Propagates errors from the LLM client, the page, and executeAction.
   */
  public async execute(
    instructionOrOptions: string | AgentExecuteOptions,
  ): Promise<AgentResult> {
    const options =
      typeof instructionOrOptions === "string"
        ? { instruction: instructionOrOptions }
        : instructionOrOptions;

    this.messages = [buildOperatorSystemPrompt(options.instruction)];
    let completed = false;
    let currentStep = 0;
    // `||` is kept on purpose: a maxSteps of 0 falls back to the default of 10.
    const maxSteps = options.maxSteps || 10;
    const actions: AgentAction[] = [];

    while (!completed && currentStep < maxSteps) {
      this.messages.push(await this.buildStepMessage(actions));

      const result = await this.getNextStep(currentStep);

      if (result.method === "close") {
        completed = true;
      }

      // Resolve the concrete element/action before executing, so that it can
      // also be recorded in the action history.
      let playwrightArguments: ObserveResult | undefined;
      if (result.method === "act") {
        [playwrightArguments] = await this.stagehandPage.page.observe(
          result.parameters,
        );
      }
      let extractionResult: unknown | undefined;
      if (result.method === "extract") {
        extractionResult = await this.stagehandPage.page.extract(
          result.parameters,
        );
      }

      await this.executeAction(result, playwrightArguments, extractionResult);

      actions.push({
        type: result.method,
        reasoning: result.reasoning,
        taskCompleted: result.taskComplete,
        parameters: result.parameters,
        playwrightArguments,
        extractionResult,
      });

      currentStep++;
    }

    // The loop may not have run at all (e.g. a negative maxSteps), in which
    // case there is no last action to read the completion flag from.
    const lastAction = actions[actions.length - 1];

    return {
      success: true,
      message: await this.getSummary(options.instruction),
      actions,
      completed: lastAction?.taskCompleted === true,
    };
  }

  /**
   * Builds the user message for the next step.
   *
   * When no page is loaded (empty URL or "about:blank") the message only tells
   * the model to start with a 'goto'. Otherwise it contains the history of
   * previous actions followed by a viewport screenshot of the current page.
   */
  private async buildStepMessage(actions: AgentAction[]): Promise<ChatMessage> {
    const url = this.stagehandPage.page.url();

    if (!url || url === "about:blank") {
      return {
        role: "user",
        content: [
          {
            type: "text",
            text: "No page is currently loaded. The first step should be a 'goto' action to navigate to a URL.",
          },
        ],
      };
    }

    const screenshot = await this.stagehandPage.page.screenshot({
      type: "png",
      fullPage: false,
    });
    const base64Image = screenshot.toString("base64");

    const history = actions
      .map((action) => this.describeAction(action))
      .join("\n");
    const messageText = `Previous actions were: ${history}\n\nHere is a screenshot of the current page (URL: ${url}):`;

    return {
      role: "user",
      content: [
        {
          type: "text",
          text: messageText,
        },
        {
          type: "image_url",
          image_url: { url: `data:image/png;base64,${base64Image}` },
        },
      ],
    };
  }

  /** Renders one previously executed action as a single history line for the model. */
  private describeAction(action: AgentAction): string {
    let result = "";
    if (action.type === "act") {
      const args = action.playwrightArguments as ObserveResult;
      // NOTE(review): `arguments` is presumably optional on ObserveResult —
      // treat a missing list as "no arguments" instead of throwing.
      const actArguments = args.arguments ?? [];
      result = `Performed a "${args.method}" action ${actArguments.length > 0 ? `with arguments: ${actArguments.map((arg) => `"${arg}"`).join(", ")}` : ""} on "${args.description}"`;
    } else if (action.type === "extract") {
      // Serialise explicitly: interpolating an object directly would render
      // as "[object Object]" and hide the extracted data from the model.
      result = `Extracted data: ${JSON.stringify(action.extractionResult)}`;
    }
    return `[${action.type}] ${action.reasoning}. Result: ${result}`;
  }

  /**
   * Asks the model for the next step given the conversation so far.
   *
   * @param currentStep - Zero-based step index, used only to build the request id.
   */
  private async getNextStep(currentStep: number): Promise<OperatorResponse> {
    const { data: response } =
      (await this.llmClient.createChatCompletion<OperatorResponse>({
        options: {
          messages: this.messages,
          response_model: {
            name: "operatorResponseSchema",
            schema: operatorResponseSchema,
          },
          requestId: `operator-step-${currentStep}`,
        },
        logger: this.logger,
      })) as LLMParsedResponse<OperatorResponse>;

    return response;
  }

  /**
   * Asks the model to answer the original instruction from the steps taken.
   * The extra prompt is sent with the request only; it is not appended to
   * `this.messages`.
   *
   * @param goal - The original instruction.
   * @returns The `answer` field of the model's summary response.
   */
  private async getSummary(goal: string): Promise<string> {
    const { data: response } =
      (await this.llmClient.createChatCompletion<OperatorSummary>({
        options: {
          messages: [
            ...this.messages,
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: `Now use the steps taken to answer the original instruction of ${goal}.`,
                },
              ],
            },
          ],
          response_model: {
            name: "operatorSummarySchema",
            schema: operatorSummarySchema,
          },
          requestId: "operator-summary",
        },
        logger: this.logger,
      })) as LLMParsedResponse<OperatorSummary>;

    return response.answer;
  }

  /**
   * Executes one step chosen by the model.
   *
   * @param action - The step returned by the model.
   * @param playwrightArguments - Observe result to act on; required for "act".
   * @param extractionResult - Data already extracted by the caller; required for "extract".
   * @returns The extraction result for "extract" steps, otherwise nothing.
   * @throws Error when a required argument is missing or the method is unknown.
   */
  private async executeAction(
    action: OperatorResponse,
    playwrightArguments?: ObserveResult,
    extractionResult?: unknown,
  ): Promise<unknown> {
    const { method, parameters } = action;
    const page = this.stagehandPage.page;

    // "close" only ends the loop in execute(); there is nothing to run.
    if (method === "close") {
      return;
    }

    switch (method) {
      case "act":
        if (!playwrightArguments) {
          throw new Error("No playwright arguments provided");
        }
        await page.act(playwrightArguments);
        break;
      case "extract":
        // Only a missing result is an error; falsy values such as 0 or ""
        // are legitimate extraction results.
        if (extractionResult == null) {
          throw new Error("No extraction result provided");
        }
        return extractionResult;
      case "goto":
        await page.goto(parameters, { waitUntil: "load" });
        break;
      case "wait":
        // `parameters` is passed to waitForTimeout as a base-10 integer.
        await page.waitForTimeout(Number.parseInt(parameters, 10));
        break;
      case "navback":
        await page.goBack();
        break;
      case "refresh":
        await page.reload();
        break;
      default:
        throw new Error(`Unknown action: ${method}`);
    }
  }
}

Diff for: lib/index.ts

+21-2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import { logLineToString, isRunningInBun } from "./utils";
3939
import { ApiResponse, ErrorResponse } from "@/types/api";
4040
import { AgentExecuteOptions, AgentResult } from "../types/agent";
4141
import { StagehandAgentHandler } from "./handlers/agentHandler";
42+
import { StagehandOperatorHandler } from "./handlers/operatorHandler";
4243

4344
dotenv.config({ path: ".env" });
4445

@@ -818,18 +819,35 @@ export class Stagehand {
818819
* Create an agent instance that can be executed with different instructions
819820
* @returns An agent instance with execute() method
820821
*/
821-
agent(options: AgentConfig): {
822+
agent(options?: AgentConfig): {
822823
execute: (
823824
instructionOrOptions: string | AgentExecuteOptions,
824825
) => Promise<AgentResult>;
825826
} {
827+
if (!options || !options.provider) {
828+
// use open operator agent
829+
return {
830+
execute: async (instructionOrOptions: string | AgentExecuteOptions) => {
831+
return new StagehandOperatorHandler(
832+
this.stagehandPage,
833+
this.logger,
834+
this.llmClient,
835+
).execute(instructionOrOptions);
836+
},
837+
};
838+
}
839+
826840
const agentHandler = new StagehandAgentHandler(
827841
this.stagehandPage,
828842
this.logger,
829843
{
830844
modelName: options.model,
831845
clientOptions: options.options,
832-
userProvidedInstructions: options.instructions,
846+
userProvidedInstructions:
847+
options.instructions ??
848+
`You are a helpful assistant that can use a web browser.
849+
You are currently on the following page: ${this.stagehandPage.page.url()}.
850+
Do not ask follow up questions, the user will trust your judgement.`,
833851
agentType: options.provider,
834852
},
835853
);
@@ -889,5 +907,6 @@ export * from "../types/model";
889907
export * from "../types/page";
890908
export * from "../types/playwright";
891909
export * from "../types/stagehand";
910+
export * from "../types/operator";
892911
export * from "../types/agent";
893912
export * from "./llm/LLMClient";

Diff for: lib/inference.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ interface LLMUsage {
4848
/**
4949
* For calls that use a schema: the LLMClient may return { data: T; usage?: LLMUsage }
5050
*/
51-
interface LLMParsedResponse<T> {
51+
export interface LLMParsedResponse<T> {
5252
data: T;
5353
usage?: LLMUsage;
5454
}

Diff for: lib/prompt.ts

+21
Original file line numberDiff line numberDiff line change
@@ -411,3 +411,24 @@ export function buildActObservePrompt(
411411

412412
return instruction;
413413
}
414+
415+
/**
 * Builds the system message for the operator agent loop.
 *
 * @param goal - The user's instruction, embedded under "# Your current goal".
 * @returns A chat message with role "system" whose content describes the
 *   agent's job, the current goal, and the one-action-per-step guidelines.
 */
export function buildOperatorSystemPrompt(goal: string): ChatMessage {
  // One entry per prompt line; empty strings are the blank separator lines.
  const promptLines = [
    "You are a general-purpose agent whose job is to accomplish the user's goal across multiple model calls by running actions on the page.",
    "",
    "You will be given a goal and a list of steps that have been taken so far. Your job is to determine if either the user's goal has been completed or if there are still steps that need to be taken.",
    "",
    "# Your current goal",
    goal,
    "",
    "# Important guidelines",
    "1. Break down complex actions into individual atomic steps",
    "2. For `act` commands, use only one action at a time, such as:",
    "- Single click on a specific element",
    "- Type into a single input field",
    "- Select a single option",
    "3. Avoid combining multiple actions in one instruction",
    "4. If multiple actions are needed, they should be separate steps",
  ];

  return {
    role: "system",
    content: promptLines.join("\n"),
  };
}

Diff for: package.json

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
"2048": "npm run build && tsx examples/2048.ts",
1010
"popup": "npm run build && tsx examples/popup.ts",
1111
"cua": "npm run build && tsx examples/cua-example.ts",
12+
"operator": "npm run build && tsx examples/operator-example.ts",
1213
"example": "npm run build && tsx examples/example.ts",
1314
"langchain": "npm run build && tsx examples/langchain.ts",
1415
"debug-url": "npm run build && tsx examples/debugUrl.ts",

0 commit comments

Comments
 (0)