
Commit 876f721

Supports extended tasks (#101)
* init - now gives the path with an arg, maybe will remove
* allows several custom task modules to be loaded
* fix quality

---------

Co-authored-by: Nathan Habib <[email protected]>
Co-authored-by: Nathan Habib <[email protected]>
1 parent 165ebc9 commit 876f721

File tree

12 files changed: +88 -30 lines


README.md

Lines changed: 10 additions & 3 deletions

@@ -210,9 +210,13 @@ However, we are very grateful to the Harness and HELM teams for their continued
 If your new task or metric has requirements, add a specific `requirements.txt` file with your evaluation.
 
 ### Adding a new task
-To add a new task, first either open an issue, to determine whether it will be integrated in the core evaluations of lighteval, or in the community tasks, and **add its dataset** on the hub.
-Note: Core evaluations are evals we will add to our test suite to ensure non regression through time, and which already see a high usage in the community.
-A popular community evaluation can move to become a core evaluation through time.
+To add a new task, first open an issue to determine whether it should be integrated into the core evaluations of lighteval, the extended tasks, or the community tasks, and **add its dataset** on the hub.
+
+- Core evaluations are evaluations which only require standard logic in their metrics and processing, and which we will add to our test suite to ensure non-regression over time. They already see high usage in the community.
+- Extended evaluations are evaluations which require custom logic in their metrics (complex normalisation, an LLM as a judge, ...), and which we added to make users' lives easier. They already see high usage in the community.
+- Community evaluations are new tasks submitted by the community.
+
+A popular community evaluation can become an extended or core evaluation over time.
 
 #### Core evaluations
 Prompt function: **find a suitable prompt function** in `src.lighteval.tasks.task_prompt_formatting.py`, or code your own. This function must output a `Doc` object, which should contain `query`, your prompt, and either `gold`, the gold output, or `choices` and `gold_index`, the list of choices and index or indices of correct answers. If your query contains an instruction which should not be repeated in a few-shot setup, add it to an `instruction` field.
@@ -241,6 +245,9 @@ Summary: create a **line summary** of your evaluation, in `src/lighteval/tasks/t
 
 Make sure you can launch your model with your new task using `--tasks lighteval|yournewtask|2|0`.
 
+#### Extended evaluations
+Proceed as for community evaluations, but in the `extended_tasks` folder.
+
 #### Community evaluations
 Copy the `community_tasks/_template.yml` to `community_tasks/yourevalname.py` and edit it to add your custom tasks (the parameters you can use are explained above). It contains an interesting mechanism if the dataset you are adding contains a lot of subsets.
tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions.py renamed to extended_tasks/ifeval/instructions.py

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@
 
 import langdetect
 
-import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions_utils as instructions_util
+import extended_tasks.ifeval.instructions_utils as instructions_util
 
 
 logger = logging.getLogger(__name__)
tasks_examples/custom_tasks_with_custom_metrics/ifeval/instructions_registry.py renamed to extended_tasks/ifeval/instructions_registry.py

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@
 # limitations under the License.
 
 """Registry of all instructions."""
-import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions as instructions
+import extended_tasks.ifeval.instructions as instructions
 
 
 _KEYWORD = "keywords:"

tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py renamed to extended_tasks/ifeval/main.py

Lines changed: 2 additions & 2 deletions

@@ -23,7 +23,7 @@
 import numpy as np
 from aenum import extend_enum
 
-import tasks_examples.custom_tasks_with_custom_metrics.ifeval.instructions_registry as instructions_registry
+import extended_tasks.ifeval.instructions_registry as instructions_registry
 from lighteval.metrics import Metrics
 from lighteval.metrics.utils import (
     MetricCategory,
@@ -38,7 +38,7 @@
 ifeval = LightevalTaskConfig(
     name="ifeval",
     prompt_function="ifeval_prompt",
-    suite=["custom"],
+    suite=["extended"],
     hf_repo="wis-k/instruction-following-eval",
     hf_subset="default",
     metric=["ifeval_metric"],

pyproject.toml

Lines changed: 3 additions & 2 deletions

@@ -78,7 +78,6 @@ dependencies = [
 accelerate = ["accelerate"]
 tgi = ["text-generation==0.6.0"]
 optimum = ["optimum==1.12.0"]
-# Quantization and adapter weights
 quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
 adapters = ["peft==0.3.0"]
 nanotron = [
@@ -88,7 +87,9 @@ nanotron = [
 quality = ["ruff==v0.2.2","pre-commit"]
 tests = ["pytest==7.4.0"]
 dev = ["lighteval[accelerate,quality,tests]"]
-
+extended_tasks = [
+    "langdetect", # ifeval
+]
 
 [project.urls]
 Homepage = "https://github.com/huggingface/lighteval"
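
With this extra declared, the dependencies that extended tasks need (here `langdetect`, used by ifeval) can be installed via `pip install -e .[extended_tasks]`, which is the exact command the warning added to `src/lighteval/tasks/registry.py` below points users to.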

run_evals_accelerate.py

Lines changed: 6 additions & 0 deletions

@@ -103,6 +103,12 @@ def get_parser():
         default=None,
         help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formatting functions)",
     )
+    parser.add_argument(
+        "--extended_tasks",
+        type=str,
+        default=None,
+        help="Path to the folder which contains all extended tasks",
+    )
     group.add_argument(
         "--tasks",
         type=str,
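
With the new flag wired through to the registry (see `src/lighteval/main_accelerate.py` below), an extended task can presumably be launched with something like `python run_evals_accelerate.py --tasks "extended|ifeval|0|0" --extended_tasks extended_tasks --model_args "pretrained=<model>" --output_dir <output_dir>`. The `extended|task|fewshot|truncate` spec follows the README's `--tasks lighteval|yournewtask|2|0` example; `--model_args` and `--output_dir` are assumed from the existing parser and are not shown in this diff.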

src/lighteval/main_accelerate.py

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ def main(args):
     with accelerator.main_process_first() if accelerator is not None else nullcontext():
         task_names_list, few_shots_dict = taskinfo_selector(args.tasks)
         task_dict = Registry(cache_dir=env_config.cache_dir).get_task_dict(
-            task_names_list, custom_tasks=args.custom_tasks
+            task_names_list, custom_tasks=args.custom_tasks, extended_tasks=args.extended_tasks
         )
         LightevalTask.load_datasets(task_dict.values(), args.dataset_loading_processes)
 

src/lighteval/main_nanotron.py

Lines changed: 2 additions & 1 deletion

@@ -135,7 +135,8 @@ def main(
 
     task_names_list, few_shots_dict = taskinfo_selector(tasks_selection)
     task_dict = Registry(cache_dir=cache_dir).get_task_dict(
-        task_names_list, custom_tasks=lighteval_config.tasks.custom_tasks
+        task_names_list,
+        custom_tasks=lighteval_config.tasks.custom_tasks,
     )
     # Loading all the dataset in a distributed manner
     LightevalTask.load_datasets(task_dict.values(), lighteval_config.tasks.dataset_loading_processes)

src/lighteval/tasks/lighteval_task.py

Lines changed: 22 additions & 10 deletions

@@ -145,7 +145,9 @@ def __post_init__(self):
 
 
 class LightevalTask:
-    def __init__(self, name: str, cfg: LightevalTaskConfig, cache_dir: Optional[str] = None, custom_tasks_module=None):
+    def __init__(  # noqa: C901
+        self, name: str, cfg: LightevalTaskConfig, cache_dir: Optional[str] = None, custom_tasks_module: Optional[list] = None
+    ):
         """
         Initialize a LightEval task.
 
@@ -202,16 +204,26 @@ def __init__(self, name: str, cfg: LightevalTaskConfig, cache_dir: Optional[str]
         # to use once prompt formatting is managed as a module
         if custom_tasks_module is None:
             self.formatter = getattr(tasks_prompt_formatting, cfg.prompt_function)
-        elif hasattr(custom_tasks_module, cfg.prompt_function):
-            # If we have a prompt in both the custom_tasks_module and our tasks_prompt_formatting
-            # We take the prompt from the custom_tasks_module
-            if hasattr(tasks_prompt_formatting, cfg.prompt_function):
-                hlog_warn(
-                    f"Be careful you are using custom prompt function {cfg.prompt_function} and not the default one."
-                )
-            self.formatter = getattr(custom_tasks_module, cfg.prompt_function)
         else:
-            self.formatter = getattr(tasks_prompt_formatting, cfg.prompt_function)
+            formatter = []
+            for module in custom_tasks_module:
+                if hasattr(module, cfg.prompt_function):
+                    formatter.append(getattr(module, cfg.prompt_function))
+
+            if len(formatter) == 0:  # default version
+                self.formatter = getattr(tasks_prompt_formatting, cfg.prompt_function)
+            elif len(formatter) == 1:
+                # If the prompt is in both a custom module and tasks_prompt_formatting,
+                # we take the prompt from the custom module
+                if hasattr(tasks_prompt_formatting, cfg.prompt_function):
+                    hlog_warn(
+                        f"Be careful, you are using the custom prompt function {cfg.prompt_function} and not the default one."
+                    )
+                self.formatter = formatter[0]  # the single matching definition
+            else:
+                raise Exception(
+                    f"You defined the prompt function {cfg.prompt_function} several times in the different custom modules you are loading."
+                )
         self.generation_size = cfg.generation_size
         self.stop_sequence = cfg.stop_sequence
         self.output_regex = cfg.output_regex
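
The resolution order implemented above: if no custom module defines `cfg.prompt_function`, the default in `tasks_prompt_formatting` is used; if exactly one module defines it, that definition wins (with a warning when it shadows a default of the same name); if several modules define it, an exception is raised, since silently picking one of the duplicate definitions would be ambiguous.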

src/lighteval/tasks/registry.py

Lines changed: 40 additions & 8 deletions

@@ -39,8 +39,19 @@
 # Original follows the original implementation as closely as possible
 # Leaderboard are the evaluations we fixed on the open llm leaderboard - you should get similar results
 # Community are for community added evaluations
+# Extended are for evaluations with custom logic
 # Custom is for all the experiments you might want to do!
-DEFAULT_SUITES = ["helm", "bigbench", "harness", "leaderboard", "lighteval", "original", "custom", "community"]
+DEFAULT_SUITES = [
+    "helm",
+    "bigbench",
+    "harness",
+    "leaderboard",
+    "lighteval",
+    "original",
+    "extended",
+    "custom",
+    "community",
+]
 
 TRUNCATE_FEW_SHOTS_DEFAULTS = True
 
@@ -97,14 +108,18 @@ def get_task_class(
         )
 
     def get_task_dict(
-        self, task_name_list: List[str], custom_tasks: Optional[Union[str, ModuleType]] = None
+        self,
+        task_name_list: List[str],
+        custom_tasks: Optional[Union[str, ModuleType]] = None,
+        extended_tasks: Optional[str] = None,
     ) -> Dict[str, LightevalTask]:
         """
         Get a dictionary of tasks based on the task name list.
 
         Args:
             task_name_list (List[str]): A list of task names.
             custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file, or name of a module to import containing custom tasks, or the module itself
+            extended_tasks (Optional[str]): Path to the folder grouping the extended task submodules
 
         Returns:
             Dict[str, LightevalTask]: A dictionary containing the tasks.
@@ -115,13 +130,20 @@ def get_task_dict(
         """
         # Import custom tasks provided by the user
         custom_tasks_registry = None
-        custom_tasks_module = None
+        custom_tasks_module = []
+        TASKS_TABLE = []
         if custom_tasks is not None:
-            custom_tasks_module = create_custom_tasks_module(custom_tasks=custom_tasks)
-        if custom_tasks_module is not None:
-            custom_tasks_registry = create_config_tasks(
-                meta_table=custom_tasks_module.TASKS_TABLE, cache_dir=self.cache_dir
-            )
+            custom_tasks_module.append(create_custom_tasks_module(custom_tasks=custom_tasks))
+        if extended_tasks is not None:
+            hlog_warn(
+                "You are using extended_tasks. Make sure you installed their dependencies using `pip install -e .[extended_tasks]`."
+            )
+            custom_tasks_module.extend(load_extended_tasks_modules(extended_tasks_path=extended_tasks))
+        for module in custom_tasks_module:
+            TASKS_TABLE.extend(module.TASKS_TABLE)
+
+        if len(TASKS_TABLE) > 0:
+            custom_tasks_registry = create_config_tasks(meta_table=TASKS_TABLE, cache_dir=self.cache_dir)
             hlog(custom_tasks_registry)
 
         # Select relevant tasks given the subset asked for by the user
@@ -133,6 +155,16 @@ def get_task_dict(
         return tasks_dict
 
 
+def load_extended_tasks_modules(extended_tasks_path: str):
+    all_modules = []
+    for folder in os.listdir(extended_tasks_path):
+        cur_module = create_custom_tasks_module(os.path.join(extended_tasks_path, folder, "main.py"))
+        hlog(f"Successfully loaded extended task: {folder}.")
+        all_modules.append(cur_module)
+
+    return all_modules
+
+
 def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleType:
     """Creates a custom task module to load tasks defined by the user in their own file.
 
@@ -153,7 +185,7 @@ def create_custom_tasks_module(custom_tasks: Union[str, ModuleType]) -> ModuleType:
 
 
 def get_custom_tasks(custom_tasks: Union[str, ModuleType]) -> Tuple[ModuleType, str]:
-    """Get custom tasks from the given custom tasks file or module.
+    """Get all the custom tasks available from the given custom tasks file or module.
 
     Args:
         custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file, or name of a module to import containing custom tasks, or the module itself
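
Putting `load_extended_tasks_modules` together with the ifeval rename above, each extended task appears to live in its own sub-folder and must expose a `main.py` with a module-level `TASKS_TABLE`. A minimal sketch, assuming placeholder names for the task, dataset, and metric (only the `main.py`/`TASKS_TABLE` convention and `suite=["extended"]` come from this commit):

```python
# Hypothetical extended_tasks/yourtask/main.py; load_extended_tasks_modules
# above imports this file and get_task_dict reads its TASKS_TABLE.
from lighteval.tasks.lighteval_task import LightevalTaskConfig

yourtask = LightevalTaskConfig(
    name="yourtask",                    # placeholder task name
    prompt_function="yourtask_prompt",  # resolved against this module, then the defaults
    suite=["extended"],                 # suite registered in DEFAULT_SUITES above
    hf_repo="your-org/your-dataset",    # placeholder dataset on the hub
    hf_subset="default",
    metric=["yourtask_metric"],         # placeholder custom metric
)

TASKS_TABLE = [yourtask]
```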

tasks_examples/custom_tasks_with_custom_metrics/ifeval/requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.
