
Commit df4d57b

Merge branch 'main' into deciding-target-for-fewshot-sorting
2 parents: 8d0307e + 194e742

File tree

4 files changed: +40 / -21 lines


MANIFEST.in

Lines changed: 1 addition & 0 deletions

```diff
@@ -1 +1,2 @@
 include src/lighteval/tasks/tasks_table.jsonl
+include src/lighteval/metrics/*.jsonl
```

README.md

Lines changed: 3 additions & 2 deletions

````diff
@@ -101,7 +101,8 @@ accelerate launch --multi_gpu --num_processes=<num_gpus> -m \
     --output_dir output_dir
 ```

-Here, `--tasks` refers to either a _comma-separated_ list of supported tasks from the [metadata table](src/lighteval/tasks/tasks_table.jsonl) in the format:
+Here, `--tasks` refers to either a _comma-separated_ list of supported tasks from the [tasks_list](examples/tasks/all_tasks.txt) in the format:
+Task details can also be found in the [file implementing them](src/lighteval/tasks/default_tasks.py).

 ```
 suite|task|num_few_shot|{0 or 1 to automatically reduce `num_few_shot` if prompt is too long}
@@ -113,7 +114,7 @@ or a file path like [`examples/tasks/recommended_set.txt`](./examples/tasks/reco
 accelerate launch --multi_gpu --num_processes=8 -m \
     lighteval accelerate \
     --model_args "pretrained=gpt2" \
-    --tasks "lighteval|truthfulqa:mc|0|0" \
+    --tasks "leaderboard|truthfulqa:mc|0|0" \
     --override_batch_size 1 \
     --output_dir="./evals/"
````

src/lighteval/metrics/metrics_sample.py

Lines changed: 20 additions & 7 deletions

```diff
@@ -324,6 +324,7 @@ def __init__(
         normalize_gold: callable = None,
         normalize_pred: callable = None,
         aggregation_function: callable = None,
+        tokenizer: object = None,
     ):
         """A ROUGE wrapper method. Relies on `rouge_scorer`.

@@ -338,6 +339,8 @@ def __init__(
                 Defaults to None if no normalization is applied.
             normalize_pred (callable, optional): Function to use to normalize the predicted strings.
                 Defaults to None if no normalization is applied.
+            tokenizer (object, optional): An object with a `tokenize` method to be used by the rouge scorer. If None,
+                rouge-scorer's default tokenizer will be used.
         """
         if aggregation_function and bootstrap:
             hlog_warn("Can't use both bootstrapping and an aggregation function in Rouge. Keeping bootstrap.")
@@ -350,7 +353,7 @@ def __init__(
             raise ValueError(
                 f"Rouge was initialised with method {methods}, which is not in {','.join(self.ALLOWED_ROUGE_METHODS)}"
             )
-        self.scorer = rouge_scorer.RougeScorer([methods])
+        self.scorer = rouge_scorer.RougeScorer([methods], tokenizer=tokenizer)
         self.multiple_golds = multiple_golds
         self.bootstrap = bootstrap
         self.normalize_gold = normalize_gold
```
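The new `tokenizer` argument is forwarded straight to `rouge_scorer.RougeScorer`, which accepts any object exposing a `tokenize` method. A minimal sketch of how a caller might exercise the hook; `WhitespaceTokenizer` is a hypothetical stand-in, not part of lighteval or rouge_score:

```python
from rouge_score import rouge_scorer


class WhitespaceTokenizer:
    """Toy tokenizer: rouge_score only requires a `tokenize` method."""

    def tokenize(self, text: str) -> list[str]:
        return text.lower().split()


# Passing tokenizer=None (the default) keeps rouge-scorer's built-in tokenizer.
scorer = rouge_scorer.RougeScorer(["rouge1"], tokenizer=WhitespaceTokenizer())
score = scorer.score("the cat sat on the mat", "a cat sat on a mat")
print(score["rouge1"].fmeasure)
```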
```diff
@@ -416,8 +419,18 @@ def __init__(
         normalize_gold: callable = None,
         normalize_pred: callable = None,
     ):
-        """A BERT scorer class. Relies on some called extracted from `bert-score`. By default, will use the
-        `microsoft/deberta-large-mnli` as scorer
+        r"""A BERT scorer class. Relies on some code extracted from `bert-score`. By default, will use
+        `microsoft/deberta-large-mnli` as the scorer. For each tokenized (pred, target) pair, it computes Precision,
+        Recall and F1 as follows:
+
+            Precision = \sum_{t=1}^{len(pred)} \frac{\max(Cos.Sim.(pred_t, target))}{IDF(pred_t)}
+
+            Recall = \sum_{t=1}^{len(target)} \frac{\max(Cos.Sim.(target_t, pred))}{IDF(target_t)}
+
+            F1 = \frac{2 * Precision * Recall}{Precision + Recall}
+
+        in which `Cos.Sim.` is the Cosine Similarity metric and `IDF(.)` represents the Inverse Document
+        Frequency of its input token. It defaults to 1 for all tokens and 0 for EOS and SEP tokens.

         Args:
             normalize_gold (callable, optional): Function to use to normalize the reference strings.
```
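For readers who want the formulas in runnable form, here is a minimal sketch (not `bert-score`'s actual implementation) of the greedy matching described in the docstring, assuming L2-normalized token embeddings and uniform IDF weights; with uniform weights, each sum normalized by token count is simply the mean of best-match similarities:

```python
import torch


def bert_prf(pred_emb: torch.Tensor, target_emb: torch.Tensor) -> tuple[float, float, float]:
    """pred_emb: (len_pred, dim); target_emb: (len_target, dim); rows unit-norm."""
    sim = pred_emb @ target_emb.T  # pairwise cosine similarities
    precision = sim.max(dim=1).values.mean().item()  # best match per pred token
    recall = sim.max(dim=0).values.mean().item()  # best match per target token
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1
```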
```diff
@@ -563,19 +576,19 @@ def __init__(
         self.strip_prediction = strip_prediction
         self.sample_aggregations = {"longest_common_prefix_length": max, "edit_distance": min, "edit_similarity": max}

-    def compute(self, gold: list[str], predictions: list[str], **kwargs) -> dict:
+    def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict:
         """Computes all the requested metrics on the golds and prediction.

         Args:
-            gold (list[str]): A list of possible golds. If it contains more than one item, only the first one is kept.
+            golds (list[str]): A list of possible golds. If it contains more than one item, only the first one is kept.
             predictions (list[str]): Predicted strings.

         Returns:
             dict: The different scores computed
         """
-        if len(gold) > 0:
+        if len(golds) > 1:
             hlog_warn("Provided more than one gold to compute a string distance metric. Just using the first one.")
-        reference = gold[0]
+        reference = golds[0]

         result = {m: [] for m in self.metric_types}
         for sequence in predictions:
```
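Note that the guard change is a real bugfix, not just a rename: `len(gold) > 0` fired the warning on every call, including the normal single-gold case. A toy illustration of the corrected logic (standalone, not lighteval's class):

```python
import warnings


def first_gold(golds: list[str]) -> str:
    # The old check was `len(golds) > 0`, which warned even for a single gold.
    if len(golds) > 1:
        warnings.warn("Provided more than one gold; just using the first one.")
    return golds[0]


assert first_gold(["ref"]) == "ref"  # no warning
assert first_gold(["ref a", "ref b"]) == "ref a"  # warns, keeps the first
```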

src/lighteval/models/utils.py

Lines changed: 16 additions & 12 deletions

```diff
@@ -29,7 +29,7 @@
 from transformers import AutoConfig


-def _get_dtype(dtype: Union[str, torch.dtype], config: Optional[AutoConfig] = None) -> torch.dtype:
+def _get_dtype(dtype: Union[str, torch.dtype, None], config: Optional[AutoConfig] = None) -> Optional[torch.dtype]:
     """
     Get the torch dtype based on the input arguments.

@@ -41,17 +41,21 @@ def _get_dtype(dtype: Union[str, torch.dtype], config: Optional[AutoConfig] = No
         torch.dtype: The torch dtype based on the input arguments.
     """

-    if config is not None:  # For quantized models
-        if hasattr(config, "quantization_config"):
-            _torch_dtype = None  # must be inferred
-        else:
-            _torch_dtype = config.torch_dtype
-    elif isinstance(dtype, str) and dtype not in ["auto", "4bit", "8bit"]:
-        # Convert `str` args torch dtype: `float16` -> `torch.float16`
-        _torch_dtype = getattr(torch, dtype)
-    else:
-        _torch_dtype = dtype
-    return _torch_dtype
+    if config is not None and hasattr(config, "quantization_config"):
+        # must be inferred
+        return None
+
+    if dtype is not None:
+        if isinstance(dtype, str) and dtype not in ["auto", "4bit", "8bit"]:
+            # Convert `str` args torch dtype: `float16` -> `torch.float16`
+            return getattr(torch, dtype)
+        elif isinstance(dtype, torch.dtype):
+            return dtype
+
+    if config is not None:
+        return config.torch_dtype
+
+    return None


 def _simplify_name(name_or_path: str) -> str:
```
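Beyond the tidier control flow, the rewrite changes behavior for `"auto"`/`"4bit"`/`"8bit"` strings when no config is given: the old code returned the string itself, the new code returns None. A few illustrative calls, assuming `_get_dtype` is importable from `lighteval.models.utils` as the diff suggests:

```python
import torch

from lighteval.models.utils import _get_dtype

assert _get_dtype("float16") is torch.float16  # str converted via getattr(torch, ...)
assert _get_dtype(torch.bfloat16) is torch.bfloat16  # torch.dtype passed through
assert _get_dtype("auto") is None  # old code returned the string "auto" here
assert _get_dtype(None) is None  # no dtype, no config -> nothing to infer
```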
