
Commit 5c6d2cf

Stagehand Evaluator & first agent evals (#668)
* Stagehand evaluator and first agent evals
* viewport size for bb sessions
* prettier
* refactor
* cleanup
* changeset
* remove log
* addressing comments
1 parent 08a6e92 commit 5c6d2cf

17 files changed (+892, -60 lines)

.changeset/whole-yaks-cheat.md (+5)

@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": minor
---

Added a new class - Stagehand Evaluator - that wraps around a Stagehand object to determine whether a task is successful or not. Currently used for agent evals
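For orientation, here is a minimal sketch of how an agent eval might use the new Evaluator to judge task success. The import path, the Stagehand init options, and the question text are assumptions for illustration; the constructor and evaluate() signature come from evals/evaluator.ts below.

import { Stagehand } from "@/dist";
import { Evaluator } from "./evaluator"; // assumed relative path within evals/

async function runAgentEval(): Promise<boolean> {
  // Assumed init options; any supported Stagehand environment works.
  const stagehand = new Stagehand({ env: "LOCAL" });
  await stagehand.init();

  // Defaults to gemini-2.0-flash, reading GOOGLE_API_KEY from the environment.
  const evaluator = new Evaluator(stagehand);

  // ... drive the agent through its task here ...

  // Ask a single YES/NO question about the final page state.
  const result = await evaluator.evaluate({
    question: "Was the form submitted successfully?", // hypothetical question
    strictResponse: false, // return "INVALID" instead of throwing on unclear answers
  });

  await stagehand.close();
  return result.evaluation === "YES";
}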

evals/args.ts (+1)

@@ -69,6 +69,7 @@ const DEFAULT_EVAL_CATEGORIES = process.env.EVAL_CATEGORIES
       "regression_llm_providers",
       "regression",
       "llm_clients",
+      "agent",
     ];

 // Finally, interpret leftover arguments to see if user typed "category X" or a single eval name

evals/evals.config.json (+20)

@@ -329,6 +329,26 @@
     {
       "name": "checkboxes",
       "categories": ["act"]
+    },
+    {
+      "name": "agent/iframe_form",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/iframe_form_multiple",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/google_flights",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/sf_library_card",
+      "categories": ["agent"]
+    },
+    {
+      "name": "agent/sf_library_card_multiple",
+      "categories": ["agent"]
     }
   ]
 }

evals/evaluator.ts (+313)

@@ -0,0 +1,313 @@
/**
 * This class is responsible for evaluating the result of an agentic task.
 * The first version includes a VLM evaluator specifically prompted to evaluate the state of a task
 * usually represented as a screenshot.
 * The evaluator will reply with YES or NO given the state of the provided task.
 */

import { AvailableModel, ClientOptions, Stagehand } from "@/dist";
import { LLMResponseError } from "@/types/stagehandErrors";
import dotenv from "dotenv";
import {
  EvaluateOptions,
  EvaluationResult,
  BatchEvaluateOptions,
} from "@/types/evaluator";

dotenv.config();

export class Evaluator {
  private stagehand: Stagehand;
  private modelName: AvailableModel;
  private modelClientOptions: ClientOptions | { apiKey: string };
  // Define regex patterns directly in the class or as constants if preferred elsewhere
  private yesPattern = /^(YES|Y|TRUE|CORRECT|AFFIRMATIVE)/i;
  private noPattern = /^(NO|N|FALSE|INCORRECT|NEGATIVE)/i;

  constructor(
    stagehand: Stagehand,
    modelName?: AvailableModel,
    modelClientOptions?: ClientOptions,
  ) {
    this.stagehand = stagehand;
    this.modelName = modelName || "gemini-2.0-flash";
    this.modelClientOptions = modelClientOptions || {
      apiKey: process.env.GOOGLE_API_KEY || "",
    };
  }

  /**
   * Evaluates the current state of the page against a specific question.
   * Expects a JSON object response: { "evaluation": "YES" | "NO", "reasoning": "..." }
   * Returns the evaluation result with normalized response and success status.
   *
   * @param options - The options for evaluation
   * @returns A promise that resolves to an EvaluationResult
   * @throws Error if strictResponse is true and response is not clearly YES or NO, or if JSON parsing/validation fails.
   */
  async evaluate(options: EvaluateOptions): Promise<EvaluationResult> {
    const {
      question,
      systemPrompt = `You are an expert evaluator that confidently returns YES or NO given the state of a task (most times in the form of a screenshot) and a question. Provide a detailed reasoning for your answer.
Return your response as a JSON object with the following format:
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }`,
      screenshotDelayMs = 1000,
      strictResponse = false,
    } = options;

    await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));
    const imageBuffer = await this.stagehand.page.screenshot();
    const llmClient = this.stagehand.llmProvider.getClient(
      this.modelName,
      this.modelClientOptions,
    );

    const response = await llmClient.createChatCompletion({
      logger: this.stagehand.logger,
      options: {
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: question },
        ],
        image: { buffer: imageBuffer },
      },
    });

    const rawResponse = response.choices[0].message.content;
    let evaluationResult: "YES" | "NO" | "INVALID" = "INVALID";
    let reasoning = `Failed to process response. Raw response: ${rawResponse}`;

    try {
      // Clean potential markdown fences
      const cleanedResponse = rawResponse
        .replace(/^```json\s*/, "")
        .replace(/\s*```$/, "")
        .trim();

      // Attempt to parse the JSON object
      const parsedResult: { evaluation: unknown; reasoning: unknown } =
        JSON.parse(cleanedResponse);

      // Validate structure
      if (
        typeof parsedResult !== "object" ||
        parsedResult === null ||
        typeof parsedResult.evaluation !== "string" ||
        typeof parsedResult.reasoning !== "string"
      ) {
        throw new LLMResponseError(
          "Evaluator",
          `Invalid JSON structure received: ${JSON.stringify(parsedResult)}`,
        );
      }

      const evaluationString = parsedResult.evaluation.trim().toUpperCase();
      reasoning = parsedResult.reasoning.trim(); // Update reasoning from parsed object

      // Use regex patterns to validate the evaluation string
      const isYes = this.yesPattern.test(evaluationString);
      const isNo = this.noPattern.test(evaluationString);

      if (isYes) {
        evaluationResult = "YES";
      } else if (isNo) {
        evaluationResult = "NO";
      } else {
        // Parsed JSON but evaluation value wasn't YES/NO variant
        if (strictResponse) {
          throw new LLMResponseError(
            "Evaluator",
            `Invalid evaluation value in JSON: ${parsedResult.evaluation}`,
          );
        }
        // Keep INVALID, reasoning already updated
        reasoning = `Invalid evaluation value: ${parsedResult.evaluation}. Reasoning: ${reasoning}`;
      }
    } catch (error) {
      const errorMessage =
        error instanceof Error ? error.message : String(error);
      // Update reasoning with error details
      reasoning = `Processing error: ${errorMessage}. Raw response: ${rawResponse}`;
      if (strictResponse) {
        // Re-throw error if in strict mode
        throw new LLMResponseError("Evaluator", reasoning);
      }
      // Keep evaluationResult as "INVALID"
    }

    return {
      evaluation: evaluationResult,
      reasoning: reasoning,
    };
  }

  /**
   * Evaluates the current state of the page against multiple questions in a single screenshot.
   * Returns an array of evaluation results.
   *
   * @param options - The options for batch evaluation
   * @returns A promise that resolves to an array of EvaluationResults
   * @throws Error if strictResponse is true and any response is not clearly YES or NO
   */
  async batchEvaluate(
    options: BatchEvaluateOptions,
  ): Promise<EvaluationResult[]> {
    const {
      questions,
      systemPrompt = `You are an expert evaluator that confidently returns YES or NO for each question given the state of a task in the screenshot. Provide a detailed reasoning for your answer.
Return your response as a JSON array, where each object corresponds to a question and has the following format:
{ "evaluation": "YES" | "NO", "reasoning": "detailed reasoning for your answer" }`,
      screenshotDelayMs = 1000,
      strictResponse = false,
    } = options;

    // Wait for the specified delay before taking screenshot
    await new Promise((resolve) => setTimeout(resolve, screenshotDelayMs));

    // Take a screenshot of the current page state
    const imageBuffer = await this.stagehand.page.screenshot();

    // Create a numbered list of questions for the VLM
    const formattedQuestions = questions
      .map((q, i) => `${i + 1}. ${q}`)
      .join("\n");

    // Get the LLM client with our preferred model
    const llmClient = this.stagehand.llmProvider.getClient(
      this.modelName,
      this.modelClientOptions,
    );

    // Use the model-specific LLM client to evaluate the screenshot with all questions
    const response = await llmClient.createChatCompletion({
      logger: this.stagehand.logger,
      options: {
        messages: [
          {
            role: "system",
            content: `${systemPrompt}\n\nYou will be given multiple questions. Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
          },
          {
            role: "user",
            content: formattedQuestions,
          },
        ],
        image: {
          buffer: imageBuffer,
        },
      },
    });

    const rawResponse = response.choices[0].message.content;
    let finalResults: EvaluationResult[] = [];

    try {
      // Clean potential markdown fences
      const cleanedResponse = rawResponse
        .replace(/^```json\s*/, "")
        .replace(/\s*```$/, "")
        .trim();

      // Attempt to parse the JSON array
      const parsedResults: { evaluation: unknown; reasoning: unknown }[] =
        JSON.parse(cleanedResponse);

      if (!Array.isArray(parsedResults)) {
        throw new LLMResponseError(
          "Evaluator",
          "Response is not a JSON array.",
        );
      }

      if (parsedResults.length !== questions.length && strictResponse) {
        throw new LLMResponseError(
          "Evaluator",
          `Expected ${questions.length} results, but got ${parsedResults.length}`,
        );
      }

      for (let i = 0; i < questions.length; i++) {
        if (i < parsedResults.length) {
          const item = parsedResults[i];
          // Ensure item is an object and has the required properties
          if (
            typeof item !== "object" ||
            item === null ||
            typeof item.evaluation !== "string" ||
            typeof item.reasoning !== "string"
          ) {
            if (strictResponse) {
              throw new LLMResponseError(
                "Evaluator",
                `Invalid object structure for question ${i + 1}: ${JSON.stringify(item)}`,
              );
            }
            finalResults.push({
              evaluation: "INVALID",
              reasoning: `Invalid object structure received: ${JSON.stringify(
                item,
              )}`,
            });
            continue; // Move to the next question
          }

          // Use regex patterns for validation
          const evaluationString = item.evaluation.trim().toUpperCase();
          const reasoning = item.reasoning.trim();
          const isYes = this.yesPattern.test(evaluationString);
          const isNo = this.noPattern.test(evaluationString);

          if (isYes) {
            finalResults.push({ evaluation: "YES", reasoning: reasoning });
          } else if (isNo) {
            finalResults.push({ evaluation: "NO", reasoning: reasoning });
          } else {
            // Invalid evaluation value
            if (strictResponse) {
              throw new LLMResponseError(
                "Evaluator",
                `Invalid evaluation value for question ${i + 1}: ${item.evaluation}`,
              );
            }
            finalResults.push({
              evaluation: "INVALID",
              reasoning: `Invalid evaluation value: ${item.evaluation}. Reasoning: ${reasoning}`,
            });
          }
        } else {
          // Missing result for this question
          if (strictResponse) {
            throw new LLMResponseError(
              "Evaluator",
              `No response found for question ${i + 1}`,
            );
          }
          finalResults.push({
            evaluation: "INVALID",
            reasoning: "No response found for this question.",
          });
        }
      }
    } catch (error) {
      const errorMessage =
        error instanceof Error ? error.message : String(error);
      // If JSON parsing fails or structure is wrong, handle based on strictResponse
      if (strictResponse) {
        throw new LLMResponseError(
          "Evaluator",
          `Failed to parse LLM response or invalid format: ${rawResponse}. Error: ${errorMessage}`,
        );
      }
      // Fallback: return INVALID for all questions
      finalResults = []; // Clear any potentially partially filled results
      for (let i = 0; i < questions.length; i++) {
        finalResults.push({
          evaluation: "INVALID",
          reasoning: `Failed to parse response. Raw response: ${rawResponse}. Error: ${errorMessage}`,
        });
      }
    }

    return finalResults;
  }
}
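And a short sketch of the batch path, which takes one screenshot and answers several questions in order. The surrounding function, the questions, and the import path are illustrative assumptions; batchEvaluate() and the result shape are taken from the class above.

import { Stagehand } from "@/dist";
import { Evaluator } from "./evaluator"; // assumed relative path within evals/

// Checks the final page state against several YES/NO questions at once.
async function checkFinalState(stagehand: Stagehand): Promise<boolean> {
  const evaluator = new Evaluator(stagehand); // default model, API key from env

  const results = await evaluator.batchEvaluate({
    questions: [
      "Is the library card application form filled out?", // hypothetical questions
      "Does the page show a submission confirmation?",
    ],
    screenshotDelayMs: 2000, // give the page extra time to settle
    strictResponse: true, // throw if any answer is not clearly YES or NO
  });

  // One result per question, in the order they were asked.
  return results.every((r) => r.evaluation === "YES");
}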

0 commit comments
