Skip to content

Commit d2e98a7

Browse files
clefourrier, sadra-barikbin, NathanHB
authored
Pr sadra (#393)
--------- Co-authored-by: Sadra Barikbin <[email protected]> Co-authored-by: Nathan Habib <[email protected]>
1 parent 372a6fc commit d2e98a7

File tree

10 files changed

+55
-73
lines changed

10 files changed

+55
-73
lines changed

community_tasks/arabic_evals.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ def mmlu_arabic(line, task_name: str = None):
7272
choices=LETTER_INDICES_AR[:4],
7373
gold_index=gold_ix,
7474
instruction=instruction,
75-
target_for_fewshot_sorting=LETTER_INDICES_AR[gold_ix],
7675
)
7776

7877

@@ -181,7 +180,6 @@ def arabic_exams(line, task_name: str = None):
181180
choices=LETTER_INDICES_AR[:4],
182181
gold_index=answer_index,
183182
instruction=instruction,
184-
target_for_fewshot_sorting=choices[answer_index],
185183
)
186184

187185

@@ -231,7 +229,6 @@ def alghafa_prompt(line, task_name: str = None):
231229
choices=choices,
232230
gold_index=answer_index,
233231
instruction=instruction,
234-
target_for_fewshot_sorting=choices[answer_index],
235232
)
236233

237234

@@ -371,7 +368,6 @@ def __init__(
371368
def boolq_prompt_arabic(line, task_name: str = None):
372369
question = line["question"]
373370
passage = line["passage"]
374-
answer = "نعم" if line["answer"] else "لا"
375371
instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا"
376372
query = f"""{instruction}
377373
المقطع :
@@ -387,7 +383,6 @@ def boolq_prompt_arabic(line, task_name: str = None):
387383
choices=["نعم", "لا"],
388384
gold_index=0 if line["answer"] else 1,
389385
instruction=instruction,
390-
target_for_fewshot_sorting=answer,
391386
)
392387

393388

@@ -423,7 +418,6 @@ def copa_prompt_arabic(line, task_name: str = None):
423418
choices=choices,
424419
gold_index=answer,
425420
instruction="",
426-
target_for_fewshot_sorting=choices[answer],
427421
)
428422

429423

@@ -468,7 +462,6 @@ def hellaswag_prompt_arabic(line, task_name: str = None):
468462
choices=endings,
469463
gold_index=answer_index,
470464
instruction=instruction,
471-
target_for_fewshot_sorting=endings[answer_index],
472465
)
473466

474467

@@ -506,7 +499,6 @@ def toxigen_prompt_arabic(line, task_name: str = None):
506499
choices=["لا", "نعم"],
507500
gold_index=label,
508501
instruction=instruction,
509-
target_for_fewshot_sorting="نعم" if label == 1 else "لا",
510502
)
511503

512504

@@ -558,7 +550,6 @@ def sciq_prompt_arabic(line, task_name: str = None):
558550
choices=choices,
559551
gold_index=answer_index,
560552
instruction=instruction,
561-
target_for_fewshot_sorting=choices[answer_index],
562553
)
563554

564555

community_tasks/serbian_eval.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -200,8 +200,6 @@ def serbian_eval_prompt(line: dict, task_name: Optional[str] = None) -> Doc:
200200
- choices (list of str): The list of available answer choices.
201201
- gold_index (int): The index of the correct answer.
202202
- instruction (str): The instruction shown to the user in Serbian.
203-
- target_for_fewshot_sorting (Union[str, list of str]): The correct answer, either as a
204-
string (for regular tasks) or a list of strings (for MMLU tasks).
205203
"""
206204

207205
question = line["query"]
@@ -226,16 +224,12 @@ def serbian_eval_prompt(line: dict, task_name: Optional[str] = None) -> Doc:
226224

227225
query += "\n\nKrajnji odgovor:"
228226

229-
# Finalize target_for_fewshot_sorting as we handle mmlu task group as string
230-
target_for_fewshot_sorting = [choices[gold_index]] if task_name and "mmlu" in task_name else choices[gold_index]
231-
232227
return Doc(
233228
task_name=task_name,
234229
query=query,
235230
choices=choices,
236231
gold_index=gold_index,
237232
instruction=instruction,
238-
target_for_fewshot_sorting=target_for_fewshot_sorting,
239233
)
240234

241235

examples/model_configs/test.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
model:
2+
type: "endpoint"
3+
base_params:
4+
endpoint_name: "smollm-360m-instruct-v0-2-q8-lvy" # needs to be lower case without special characters
5+
model: HuggingFaceTB/SmolLM-360M-Instruct
6+
revision: "main"
7+
dtype: "default" # can be any of "awq", "eetq", "gptq", "4bit" or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
8+
reuse_existing: true # if true, ignore all params in instance, and don't delete the endpoint after evaluation
9+
instance:
10+
accelerator: "gpu"
11+
region: "eu-west-1"
12+
vendor: "aws"
13+
instance_size: "medium"
14+
instance_type: "g5.2xlarge"
15+
framework: "pytorch"
16+
endpoint_type: "protected"
17+
namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
18+
image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
19+
env_vars:
20+
null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
21+
generation:
22+
add_special_tokens: true

examples/nanotron/custom_evaluation_tasks.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,6 @@ def mmlu_harness(line, task_name: str = None):
333333
task_name=task_name,
334334
query=prompt,
335335
choices=[" A", " B", " C", " D"],
336-
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
337336
gold_index=gold_ix,
338337
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
339338
)

examples/nanotron/custom_task.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def mmlu_signs(line, topic):
3636
return {
3737
"query": prompt,
3838
"choices": [" +", " *", " =", " #"] if is_few_shots else ["+", "*", "=", "#"],
39-
"target_for_fewshot_sorting": [" +", " *", " =", " #"][gold_ix],
39+
"fewshot_sorting_class": [" +", " *", " =", " #"][gold_ix],
4040
"gold_index": gold_ix,
4141
"instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
4242
}
@@ -58,7 +58,7 @@ def mmlu_numbers(line, topic):
5858
return {
5959
"query": prompt,
6060
"choices": [" 1", " 2", " 3", " 4"] if is_few_shots else ["1", "2", "3", "4"],
61-
"target_for_fewshot_sorting": [" 1", " 2", " 3", " 4"][gold_ix],
61+
"fewshot_sorting_class": [" 1", " 2", " 3", " 4"][gold_ix],
6262
"gold_index": gold_ix,
6363
"instruction": f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
6464
}

src/lighteval/tasks/default_prompts.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,6 @@ def bbh_harness(line, task_name: str = None):
176176
query=query,
177177
choices=choices,
178178
gold_index=correct_index,
179-
target_for_fewshot_sorting=choices,
180179
instruction=line.get("task_prefix", None),
181180
)
182181

@@ -196,7 +195,6 @@ def bbh_lighteval(line, task_name: str = None):
196195
query=query,
197196
choices=LETTER_INDICES[: len(line["choices"])],
198197
gold_index=line["target_idx"],
199-
target_for_fewshot_sorting=LETTER_INDICES[: len(line["choices"])],
200198
instruction=line.get("task_prefix", None),
201199
)
202200

@@ -207,7 +205,6 @@ def bbh(line, instruction, choices, task_name: str = None):
207205
query=f"{instruction}Q: {line['input']}\nA:",
208206
choices=choices,
209207
gold_index=choices.index(line["target"]),
210-
target_for_fewshot_sorting=[f" {c}" for c in choices],
211208
instruction=instruction,
212209
)
213210

@@ -799,7 +796,6 @@ def hellaswag_generative(line, task_name: str = None):
799796
choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]],
800797
gold_index=gold_ix, # -1 for test,
801798
instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
802-
target_for_fewshot_sorting=line["endings"][gold_ix] if gold_ix > -1 else "",
803799
)
804800

805801

@@ -1352,7 +1348,6 @@ def mmlu(line, topic, task_name: str = None):
13521348
choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
13531349
gold_index=gold_ix,
13541350
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
1355-
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
13561351
)
13571352

13581353

@@ -1373,7 +1368,6 @@ def custom_mmlu_thom(line, task_name: str = None):
13731368
choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
13741369
gold_index=gold_ix,
13751370
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
1376-
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
13771371
)
13781372

13791373

@@ -1613,15 +1607,13 @@ def mmlu_harness(line, task_name: str = None):
16131607
query += "Answer:"
16141608

16151609
gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
1616-
"__few_shots" in line and line["__few_shots"] is True # We are adding few shots
16171610

16181611
return Doc(
16191612
task_name=task_name,
16201613
query=query,
16211614
choices=[" A", " B", " C", " D"],
16221615
gold_index=gold_ix,
16231616
instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
1624-
target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
16251617
)
16261618

16271619

@@ -1638,8 +1630,8 @@ def mmlu_helm(line, task_name: str = None):
16381630
query=query,
16391631
choices=[" A", " B", " C", " D"],
16401632
gold_index=gold_ix,
1633+
fewshot_sorting_class=line["choices"][gold_ix],
16411634
instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
1642-
target_for_fewshot_sorting=line["choices"][gold_ix], # specific to HELM evals
16431635
)
16441636

16451637

@@ -1816,7 +1808,6 @@ def openbookqa_helm(line, task_name: str = None):
18161808
choices=["A", "B", "C", "D", "E"],
18171809
gold_index=gold_ix,
18181810
instruction="The following are multiple choice questions (with answers) about common sense.\n",
1819-
target_for_fewshot_sorting=line["choices"]["text"][gold_ix], # specific to HELM evals
18201811
)
18211812

18221813

@@ -1837,14 +1828,13 @@ def piqa_helm(line, task_name: str = None):
18371828
query += "Answer: "
18381829

18391830
gold_ix = int(line["label"])
1840-
1831+
is_few_shots = line.get("__few_shots", False)
18411832
return Doc(
18421833
task_name=task_name,
18431834
query=query,
1844-
choices=["A", "B"],
1835+
choices=["A", "B"] if not is_few_shots else [line["sol1"], line["sol2"]],
18451836
gold_index=gold_ix,
18461837
instruction="The following are multiple choice questions (with answers) about common sense.\n",
1847-
target_for_fewshot_sorting=[line["sol1"], line["sol2"]][gold_ix],
18481838
)
18491839

18501840

@@ -1877,13 +1867,11 @@ def pubmed_qa_helm(line, task_name: str = None):
18771867
)
18781868
query += f"\n\nQuestion: {line['question']}\nAnswer: "
18791869
gold_ix = ["yes", "no", "maybe"].index(line["final_decision"])
1880-
18811870
return Doc(
18821871
task_name=task_name,
18831872
query=query,
18841873
choices=["A", "B", "C"],
18851874
gold_index=gold_ix,
1886-
target_for_fewshot_sorting=["yes", "no", "maybe"][gold_ix],
18871875
)
18881876

18891877

@@ -2263,13 +2251,11 @@ def truthful_qa_helm(line, task_name: str = None):
22632251
query = f"Question: {line['question']}\n"
22642252
query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
22652253
query += "Answer:"
2266-
22672254
return Doc(
22682255
task_name=task_name,
22692256
query=query,
22702257
choices=LETTER_INDICES[: len(line["choices"])],
22712258
gold_index=line["gold_index"],
2272-
target_for_fewshot_sorting=line["choices"][line["gold_index"]],
22732259
)
22742260

22752261

src/lighteval/tasks/lighteval_task.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -340,21 +340,6 @@ def eval_docs(self) -> list[Doc]:
340340
self._docs = self.remove_duplicate_docs(self._docs)
341341
return self._docs
342342

343-
def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
344-
"""
345-
Returns the target of the given document.
346-
347-
Args:
348-
formatted_doc (Doc): Formatted document.
349-
few_shot (bool, optional): Whether the document is used for few
350-
shot examples. Defaults to False.
351-
352-
Returns:
353-
str: Target of the document, which is the correct answer for a document.
354-
"""
355-
# likely we mostly need one example not all
356-
return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
357-
358343
def construct_requests(
359344
self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str
360345
) -> Dict[RequestType, List[Request]]:

src/lighteval/tasks/prompt_manager.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,20 +65,33 @@ def doc_to_text(doc: Doc, return_instructions: bool = False) -> Union[str, Tuple
6565
)
6666

6767
@staticmethod
68-
def doc_to_target(formatted_doc: Doc, few_shot: bool = False) -> str:
68+
def doc_to_target(formatted_doc: Doc) -> str:
6969
"""
7070
Returns the target of the given document.
7171
7272
Args:
7373
formatted_doc (Doc): Formatted document.
74-
few_shot (bool, optional): Whether the document is used for few
75-
shot examples. Defaults to False.
7674
7775
Returns:
7876
str: Target of the document, which is the correct answer for a document.
7977
"""
80-
# likely we mostly need one example not all
81-
return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
78+
return as_list(formatted_doc.get_golds())[0]
79+
80+
@staticmethod
81+
def doc_to_fewshot_sorting_class(formatted_doc: Doc) -> str:
82+
"""
83+
In some cases, when selecting few-shot samples, we want to use specific document classes
84+
which need to be specified separately from the target.
85+
For example, a document where the gold is a json might want to use only one of the keys of
86+
the json to define sorting classes in few shot samples. Else we take the gold.
87+
88+
Args:
89+
formatted_doc (Doc): Formatted document.
90+
91+
Returns:
92+
str: Sorting class of the document, used to select balanced few-shot samples.
93+
"""
94+
return formatted_doc.fewshot_sorting_class or PromptManager.doc_to_target(formatted_doc)
8295

8396
def add_context_to_doc(
8497
self,
@@ -255,9 +268,7 @@ def get_examples(
255268
class FewShotSelectionMethod:
256269
sorting: str # sorting method for the overall few shot pool (balanced, random, sequential)
257270
with_sampling: bool # samples item randomly from the few shot pool
258-
fewshotpool_unique: (
259-
bool
260-
) # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set
271+
fewshotpool_unique: bool # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set
261272

262273

263274
class FewShotSelection(Enum):
@@ -356,16 +367,16 @@ def _init_fewshot_sampling_balanced(
356367
):
357368
fewshotpool = self.task.fewshot_docs()
358369

359-
# rnd = random.Random(variance_seed)
360370
random.seed(variance_seed)
361371

362-
# Build up balanced selection based on labels
363-
# Sort by counts of labels
372+
# Build up balanced selection based on fewshot_sorting_class
373+
# (or the gold target, if the class is undefined)
364374
label_to_instances = defaultdict(list)
365375
for instance in fewshotpool:
366-
target = PromptManager.doc_to_target(instance, few_shot=True)
376+
target = PromptManager.doc_to_fewshot_sorting_class(instance)
367377
label_to_instances[target].append(instance)
368378

379+
# Sort by counts of class labels
369380
counts_to_labels = defaultdict(list)
370381
for label, instances in sorted(label_to_instances.items()):
371382
counts_to_labels[len(instances)].append(label)

src/lighteval/tasks/requests.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ class Doc:
178178

179179
# For few-shot
180180
instruction: Optional[str] = ""
181-
target_for_fewshot_sorting: Optional[str] = None # will probably have to be removed in the future
181+
fewshot_sorting_class: Optional[str] = None # class to use to select balanced few-shot samples
182182

183183
# Filled when parsing and adding the few-shot context
184184
ctx: Optional[str] = ""
@@ -194,18 +194,12 @@ def __post_init__(self):
194194
if self.instruction is None:
195195
self.instruction = ""
196196

197-
def get_golds(self, few_shot: bool = False):
197+
def get_golds(self):
198198
"""Return gold targets extracted from the target dict"""
199199
gold_indices = as_list(self.gold_index)
200-
if few_shot and self.target_for_fewshot_sorting is not None:
201-
choices = self.target_for_fewshot_sorting
202-
if isinstance(choices, str): # correct choice is already selected
203-
return choices
204-
else:
205-
choices = self.choices
206200
golds = []
207201
for gold_ix in gold_indices:
208-
golds.extend(as_list(choices[gold_ix]))
202+
golds.extend(as_list(self.choices[gold_ix]))
209203
return golds
210204

211205
def __repr__(self):

0 commit comments

Comments
 (0)