Skip to content

Commit f53fed6

Browse files
committed
addressing comments
1 parent 6351121 commit f53fed6

File tree

3 files changed

+96
-77
lines changed

3 files changed

+96
-77
lines changed

evals/evaluator.ts

+31-77
Original file line numberDiff line numberDiff line change
@@ -6,70 +6,16 @@
66
*/
77

88
import { AvailableModel, ClientOptions, Stagehand } from "@/dist";
9+
import { LLMResponseError } from "@/types/stagehandErrors";
910
import dotenv from "dotenv";
11+
import {
12+
EvaluateOptions,
13+
EvaluationResult,
14+
BatchEvaluateOptions,
15+
} from "@/types/evaluator";
1016

1117
dotenv.config();
1218

13-
export interface EvaluateOptions {
14-
/**
15-
* The question to ask about the task state
16-
*/
17-
question: string;
18-
/**
19-
* Custom system prompt for the evaluator
20-
*/
21-
systemPrompt?: string;
22-
/**
23-
* Delay in milliseconds before taking the screenshot
24-
* @default 1000
25-
*/
26-
screenshotDelayMs?: number;
27-
/**
28-
* Whether to throw an error if the response is not a clear YES or NO
29-
* @default false
30-
*/
31-
strictResponse?: boolean;
32-
}
33-
34-
export interface BatchEvaluateOptions {
35-
/**
36-
* Array of questions to evaluate
37-
*/
38-
questions: string[];
39-
/**
40-
* Custom system prompt for the evaluator
41-
*/
42-
systemPrompt?: string;
43-
/**
44-
* Delay in milliseconds before taking the screenshot
45-
* @default 1000
46-
*/
47-
screenshotDelayMs?: number;
48-
/**
49-
* Whether to throw an error if any response is not a clear YES or NO
50-
* @default false
51-
*/
52-
strictResponse?: boolean;
53-
/**
54-
* The reasoning behind the evaluation
55-
*/
56-
reasoning?: string;
57-
}
58-
59-
/**
60-
* Result of an evaluation
61-
*/
62-
export interface EvaluationResult {
63-
/**
64-
* The evaluation result ('YES', 'NO', or 'INVALID' if parsing failed or value was unexpected)
65-
*/
66-
evaluation: "YES" | "NO" | "INVALID";
67-
/**
68-
* The reasoning behind the evaluation
69-
*/
70-
reasoning: string;
71-
}
72-
7319
export class Evaluator {
7420
private stagehand: Stagehand;
7521
private modelName: AvailableModel;
@@ -117,7 +63,7 @@ export class Evaluator {
11763
);
11864

11965
const response = await llmClient.createChatCompletion({
120-
logger: () => {},
66+
logger: this.stagehand.logger,
12167
options: {
12268
messages: [
12369
{ role: "system", content: systemPrompt },
@@ -149,7 +95,8 @@ export class Evaluator {
14995
typeof parsedResult.evaluation !== "string" ||
15096
typeof parsedResult.reasoning !== "string"
15197
) {
152-
throw new Error(
98+
throw new LLMResponseError(
99+
"Evaluator",
153100
`Invalid JSON structure received: ${JSON.stringify(parsedResult)}`,
154101
);
155102
}
@@ -168,7 +115,8 @@ export class Evaluator {
168115
} else {
169116
// Parsed JSON but evaluation value wasn't YES/NO variant
170117
if (strictResponse) {
171-
throw new Error(
118+
throw new LLMResponseError(
119+
"Evaluator",
172120
`Invalid evaluation value in JSON: ${parsedResult.evaluation}`,
173121
);
174122
}
@@ -178,12 +126,11 @@ export class Evaluator {
178126
} catch (error) {
179127
const errorMessage =
180128
error instanceof Error ? error.message : String(error);
181-
console.error("Failed during evaluation processing:", errorMessage);
182129
// Update reasoning with error details
183130
reasoning = `Processing error: ${errorMessage}. Raw response: ${rawResponse}`;
184131
if (strictResponse) {
185132
// Re-throw error if in strict mode
186-
throw new Error(reasoning);
133+
throw new LLMResponseError("Evaluator", reasoning);
187134
}
188135
// Keep evaluationResult as "INVALID"
189136
}
@@ -233,7 +180,7 @@ export class Evaluator {
233180

234181
// Use the model-specific LLM client to evaluate the screenshot with all questions
235182
const response = await llmClient.createChatCompletion({
236-
logger: () => {},
183+
logger: this.stagehand.logger,
237184
options: {
238185
messages: [
239186
{
@@ -252,7 +199,7 @@ export class Evaluator {
252199
});
253200

254201
const rawResponse = response.choices[0].message.content;
255-
const finalResults: EvaluationResult[] = [];
202+
let finalResults: EvaluationResult[] = [];
256203

257204
try {
258205
// Clean potential markdown fences
@@ -266,13 +213,15 @@ export class Evaluator {
266213
JSON.parse(cleanedResponse);
267214

268215
if (!Array.isArray(parsedResults)) {
269-
throw new Error("Response is not a JSON array.");
216+
throw new LLMResponseError(
217+
"Evaluator",
218+
"Response is not a JSON array.",
219+
);
270220
}
271221

272222
if (parsedResults.length !== questions.length && strictResponse) {
273-
// Optional: Log a warning even if not strict?
274-
// console.warn(`LLM returned ${parsedResults.length} results, but ${questions.length} questions were asked.`);
275-
throw new Error(
223+
throw new LLMResponseError(
224+
"Evaluator",
276225
`Expected ${questions.length} results, but got ${parsedResults.length}`,
277226
);
278227
}
@@ -288,7 +237,8 @@ export class Evaluator {
288237
typeof item.reasoning !== "string"
289238
) {
290239
if (strictResponse) {
291-
throw new Error(
240+
throw new LLMResponseError(
241+
"Evaluator",
292242
`Invalid object structure for question ${i + 1}: ${JSON.stringify(item)}`,
293243
);
294244
}
@@ -314,7 +264,8 @@ export class Evaluator {
314264
} else {
315265
// Invalid evaluation value
316266
if (strictResponse) {
317-
throw new Error(
267+
throw new LLMResponseError(
268+
"Evaluator",
318269
`Invalid evaluation value for question ${i + 1}: ${item.evaluation}`,
319270
);
320271
}
@@ -326,7 +277,10 @@ export class Evaluator {
326277
} else {
327278
// Missing result for this question
328279
if (strictResponse) {
329-
throw new Error(`No response found for question ${i + 1}`);
280+
throw new LLMResponseError(
281+
"Evaluator",
282+
`No response found for question ${i + 1}`,
283+
);
330284
}
331285
finalResults.push({
332286
evaluation: "INVALID",
@@ -337,15 +291,15 @@ export class Evaluator {
337291
} catch (error) {
338292
const errorMessage =
339293
error instanceof Error ? error.message : String(error);
340-
console.error("Failed to parse LLM response as JSON:", errorMessage);
341294
// If JSON parsing fails or structure is wrong, handle based on strictResponse
342295
if (strictResponse) {
343-
throw new Error(
296+
throw new LLMResponseError(
297+
"Evaluator",
344298
`Failed to parse LLM response or invalid format: ${rawResponse}. Error: ${errorMessage}`,
345299
);
346300
}
347301
// Fallback: return INVALID for all questions
348-
finalResults.length = 0; // Clear any potentially partially filled results
302+
finalResults = []; // Clear any potentially partially filled results
349303
for (let i = 0; i < questions.length; i++) {
350304
finalResults.push({
351305
evaluation: "INVALID",

types/evaluator.ts

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/**
 * Options for evaluating a single question.
 */
export interface EvaluateOptions {
  /**
   * The question to ask about the task state.
   */
  question: string;
  /**
   * Custom system prompt for the evaluator.
   */
  systemPrompt?: string;
  /**
   * Delay in milliseconds before taking the screenshot.
   * @default 1000
   */
  screenshotDelayMs?: number;
  /**
   * Whether to throw an error if the response is not a clear YES or NO.
   * @default false
   */
  strictResponse?: boolean;
}
21+
22+
/**
 * Options for evaluating several questions in one request.
 */
export interface BatchEvaluateOptions {
  /**
   * Array of questions to evaluate.
   */
  questions: string[];
  /**
   * Custom system prompt for the evaluator.
   */
  systemPrompt?: string;
  /**
   * Delay in milliseconds before taking the screenshot.
   * @default 1000
   */
  screenshotDelayMs?: number;
  /**
   * Whether to throw an error if any response is not a clear YES or NO.
   * @default false
   */
  strictResponse?: boolean;
  /**
   * The reasoning behind the evaluation.
   *
   * NOTE(review): this reads like a description of an evaluation result
   * rather than an input option; confirm whether any caller sets or reads
   * this member before relying on it. It is kept so existing callers that
   * pass it continue to type-check.
   */
  reasoning?: string;
}
46+
47+
/**
 * Result of an evaluation.
 */
export interface EvaluationResult {
  /**
   * The evaluation result:
   * - `"YES"` or `"NO"` for a clear answer;
   * - `"INVALID"` if parsing failed or the value was unexpected.
   */
  evaluation: "YES" | "NO" | "INVALID";
  /**
   * The reasoning behind the evaluation.
   */
  reasoning: string;
}

types/stagehandErrors.ts

+6
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,9 @@ export class StagehandClickError extends StagehandError {
155155
);
156156
}
157157
}
158+
159+
/**
 * Error thrown when a response from an LLM cannot be used, for example
 * because it is not in the expected format.
 *
 * The resulting message has the form
 * `"<primitive> LLM response error: <message>"`.
 */
export class LLMResponseError extends StagehandError {
  /**
   * @param primitive - Label for the component reporting the problem; it is
   *   placed at the start of the error message.
   * @param message - Description of what was wrong with the response.
   */
  constructor(primitive: string, message: string) {
    super(`${primitive} LLM response error: ${message}`);
  }
}

0 commit comments

Comments
 (0)