6
6
*/
7
7
8
8
import { AvailableModel , ClientOptions , Stagehand } from "@/dist" ;
9
+ import { LLMResponseError } from "@/types/stagehandErrors" ;
9
10
import dotenv from "dotenv" ;
11
+ import {
12
+ EvaluateOptions ,
13
+ EvaluationResult ,
14
+ BatchEvaluateOptions ,
15
+ } from "@/types/evaluator" ;
10
16
11
17
dotenv . config ( ) ;
12
18
13
- export interface EvaluateOptions {
14
- /**
15
- * The question to ask about the task state
16
- */
17
- question : string ;
18
- /**
19
- * Custom system prompt for the evaluator
20
- */
21
- systemPrompt ?: string ;
22
- /**
23
- * Delay in milliseconds before taking the screenshot
24
- * @default 1000
25
- */
26
- screenshotDelayMs ?: number ;
27
- /**
28
- * Whether to throw an error if the response is not a clear YES or NO
29
- * @default false
30
- */
31
- strictResponse ?: boolean ;
32
- }
33
-
34
- export interface BatchEvaluateOptions {
35
- /**
36
- * Array of questions to evaluate
37
- */
38
- questions : string [ ] ;
39
- /**
40
- * Custom system prompt for the evaluator
41
- */
42
- systemPrompt ?: string ;
43
- /**
44
- * Delay in milliseconds before taking the screenshot
45
- * @default 1000
46
- */
47
- screenshotDelayMs ?: number ;
48
- /**
49
- * Whether to throw an error if any response is not a clear YES or NO
50
- * @default false
51
- */
52
- strictResponse ?: boolean ;
53
- /**
54
- * The reasoning behind the evaluation
55
- */
56
- reasoning ?: string ;
57
- }
58
-
59
- /**
60
- * Result of an evaluation
61
- */
62
- export interface EvaluationResult {
63
- /**
64
- * The evaluation result ('YES', 'NO', or 'INVALID' if parsing failed or value was unexpected)
65
- */
66
- evaluation : "YES" | "NO" | "INVALID" ;
67
- /**
68
- * The reasoning behind the evaluation
69
- */
70
- reasoning : string ;
71
- }
72
-
73
19
export class Evaluator {
74
20
private stagehand : Stagehand ;
75
21
private modelName : AvailableModel ;
@@ -117,7 +63,7 @@ export class Evaluator {
117
63
) ;
118
64
119
65
const response = await llmClient . createChatCompletion ( {
120
- logger : ( ) => { } ,
66
+ logger : this . stagehand . logger ,
121
67
options : {
122
68
messages : [
123
69
{ role : "system" , content : systemPrompt } ,
@@ -149,7 +95,8 @@ export class Evaluator {
149
95
typeof parsedResult . evaluation !== "string" ||
150
96
typeof parsedResult . reasoning !== "string"
151
97
) {
152
- throw new Error (
98
+ throw new LLMResponseError (
99
+ "Evaluator" ,
153
100
`Invalid JSON structure received: ${ JSON . stringify ( parsedResult ) } ` ,
154
101
) ;
155
102
}
@@ -168,7 +115,8 @@ export class Evaluator {
168
115
} else {
169
116
// Parsed JSON but evaluation value wasn't YES/NO variant
170
117
if ( strictResponse ) {
171
- throw new Error (
118
+ throw new LLMResponseError (
119
+ "Evaluator" ,
172
120
`Invalid evaluation value in JSON: ${ parsedResult . evaluation } ` ,
173
121
) ;
174
122
}
@@ -178,12 +126,11 @@ export class Evaluator {
178
126
} catch ( error ) {
179
127
const errorMessage =
180
128
error instanceof Error ? error . message : String ( error ) ;
181
- console . error ( "Failed during evaluation processing:" , errorMessage ) ;
182
129
// Update reasoning with error details
183
130
reasoning = `Processing error: ${ errorMessage } . Raw response: ${ rawResponse } ` ;
184
131
if ( strictResponse ) {
185
132
// Re-throw error if in strict mode
186
- throw new Error ( reasoning ) ;
133
+ throw new LLMResponseError ( "Evaluator" , reasoning ) ;
187
134
}
188
135
// Keep evaluationResult as "INVALID"
189
136
}
@@ -233,7 +180,7 @@ export class Evaluator {
233
180
234
181
// Use the model-specific LLM client to evaluate the screenshot with all questions
235
182
const response = await llmClient . createChatCompletion ( {
236
- logger : ( ) => { } ,
183
+ logger : this . stagehand . logger ,
237
184
options : {
238
185
messages : [
239
186
{
@@ -252,7 +199,7 @@ export class Evaluator {
252
199
} ) ;
253
200
254
201
const rawResponse = response . choices [ 0 ] . message . content ;
255
- const finalResults : EvaluationResult [ ] = [ ] ;
202
+ let finalResults : EvaluationResult [ ] = [ ] ;
256
203
257
204
try {
258
205
// Clean potential markdown fences
@@ -266,13 +213,15 @@ export class Evaluator {
266
213
JSON . parse ( cleanedResponse ) ;
267
214
268
215
if ( ! Array . isArray ( parsedResults ) ) {
269
- throw new Error ( "Response is not a JSON array." ) ;
216
+ throw new LLMResponseError (
217
+ "Evaluator" ,
218
+ "Response is not a JSON array." ,
219
+ ) ;
270
220
}
271
221
272
222
if ( parsedResults . length !== questions . length && strictResponse ) {
273
- // Optional: Log a warning even if not strict?
274
- // console.warn(`LLM returned ${parsedResults.length} results, but ${questions.length} questions were asked.`);
275
- throw new Error (
223
+ throw new LLMResponseError (
224
+ "Evaluator" ,
276
225
`Expected ${ questions . length } results, but got ${ parsedResults . length } ` ,
277
226
) ;
278
227
}
@@ -288,7 +237,8 @@ export class Evaluator {
288
237
typeof item . reasoning !== "string"
289
238
) {
290
239
if ( strictResponse ) {
291
- throw new Error (
240
+ throw new LLMResponseError (
241
+ "Evaluator" ,
292
242
`Invalid object structure for question ${ i + 1 } : ${ JSON . stringify ( item ) } ` ,
293
243
) ;
294
244
}
@@ -314,7 +264,8 @@ export class Evaluator {
314
264
} else {
315
265
// Invalid evaluation value
316
266
if ( strictResponse ) {
317
- throw new Error (
267
+ throw new LLMResponseError (
268
+ "Evaluator" ,
318
269
`Invalid evaluation value for question ${ i + 1 } : ${ item . evaluation } ` ,
319
270
) ;
320
271
}
@@ -326,7 +277,10 @@ export class Evaluator {
326
277
} else {
327
278
// Missing result for this question
328
279
if ( strictResponse ) {
329
- throw new Error ( `No response found for question ${ i + 1 } ` ) ;
280
+ throw new LLMResponseError (
281
+ "Evaluator" ,
282
+ `No response found for question ${ i + 1 } ` ,
283
+ ) ;
330
284
}
331
285
finalResults . push ( {
332
286
evaluation : "INVALID" ,
@@ -337,15 +291,15 @@ export class Evaluator {
337
291
} catch ( error ) {
338
292
const errorMessage =
339
293
error instanceof Error ? error . message : String ( error ) ;
340
- console . error ( "Failed to parse LLM response as JSON:" , errorMessage ) ;
341
294
// If JSON parsing fails or structure is wrong, handle based on strictResponse
342
295
if ( strictResponse ) {
343
- throw new Error (
296
+ throw new LLMResponseError (
297
+ "Evaluator" ,
344
298
`Failed to parse LLM response or invalid format: ${ rawResponse } . Error: ${ errorMessage } ` ,
345
299
) ;
346
300
}
347
301
// Fallback: return INVALID for all questions
348
- finalResults . length = 0 ; // Clear any potentially partially filled results
302
+ finalResults = [ ] ; // Clear any potentially partially filled results
349
303
for ( let i = 0 ; i < questions . length ; i ++ ) {
350
304
finalResults . push ( {
351
305
evaluation : "INVALID" ,
0 commit comments