Add config files for models #131

Merged: 17 commits, Apr 3, 2024

10 changes: 0 additions & 10 deletions .gitignore
@@ -164,29 +164,19 @@ tests/.data
tests/data

# outputs folder
examples/*/outputs
examples/*/NeMo_experiments
examples/*/nemo_experiments
examples/*/.hydra
examples/*/wandb
examples/*/data
wandb
dump.py

docs/sources/source/test_build/

# Checkpoints, config files and temporary files created in tutorials.
examples/neural_graphs/*.chkpt
examples/neural_graphs/*.yml

.hydra/
nemo_experiments/

.ruff_cache

tmp.py

examples
benchmark_output
prod_env

34 changes: 21 additions & 13 deletions README.md
@@ -10,9 +10,6 @@ We're releasing it with the community in the spirit of building in the open.
Note that it is still very much early so don't expect 100% stability ^^'
In case of problems or questions, feel free to open an issue!

## News
- **Feb 08, 2024**: Release of `lighteval`

## Installation

Clone the repo:
@@ -98,7 +95,7 @@ Here, `--tasks` refers to either a _comma-separated_ list of supported tasks from
suite|task|num_few_shot|{0 or 1 to automatically reduce `num_few_shot` if prompt is too long}
```

or a file path like [`tasks_examples/recommended_set.txt`](./tasks_examples/recommended_set.txt) which specifies multiple task configurations. For example, to evaluate GPT-2 on the Truthful QA benchmark run:
or a file path like [`examples/tasks/recommended_set.txt`](./examples/tasks/recommended_set.txt) which specifies multiple task configurations. For example, to evaluate GPT-2 on the Truthful QA benchmark run:

```shell
accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
@@ -118,7 +115,7 @@ accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
--output_dir="./evals/"
```

See the [`tasks_examples/recommended_set.txt`](./tasks_examples/recommended_set.txt) file for a list of recommended task configurations.
See the [`examples/tasks/recommended_set.txt`](./examples/tasks/recommended_set.txt) file for a list of recommended task configurations.

### Evaluating a model with a complex configuration

If you want to evaluate a model by spinning up inference endpoints, use adapter or delta weights, or pass more complex configuration options, you can load models using a configuration file. This is done as follows:

```shell
accelerate launch --multi_gpu --num_processes=<num_gpus> run_evals_accelerate.py \
--model_config_path="<path to your model configuration>" \
--tasks <task parameters> \
--output_dir output_dir
```

Examples of possible configuration files are provided in `examples/model_configs`.
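
For illustration, a concrete invocation using the base-model config added in this PR might look like the sketch below; the task file, process count, and output directory are placeholders to adapt to your setup:

```shell
# Illustrative sketch: evaluate the example base-model config on the
# Open LLM Leaderboard task list with a single GPU process.
accelerate launch --num_processes=1 run_evals_accelerate.py \
    --model_config_path="examples/model_configs/base_model.yaml" \
    --tasks examples/tasks/open_llm_leaderboard_tasks.txt \
    --override_batch_size 1 \
    --output_dir="./evals/"
```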

### Evaluating a large model with pipeline parallelism

@@ -127,15 +137,13 @@ To evaluate models larger that ~40B parameters in 16-bit precision, you will need
```shell
# PP=2, DP=4 - good for models < 70B params
accelerate launch --multi_gpu --num_processes=4 run_evals_accelerate.py \
--model_args="pretrained=<path to model on the hub>" \
--model_parallel \
--model_args="pretrained=<path to model on the hub>,model_parallel=True" \
--tasks <task parameters> \
--output_dir output_dir

# PP=4, DP=2 - good for huge models >= 70B params
accelerate launch --multi_gpu --num_processes=2 run_evals_accelerate.py \
--model_args="pretrained=<path to model on the hub>" \
--model_parallel \
--model_args="pretrained=<path to model on the hub>,model_parallel=True" \
--tasks <task parameters> \
--output_dir output_dir
```
@@ -147,7 +155,7 @@ To evaluate a model on all the benchmarks of the [Open LLM Leaderboard](https://
```shell
accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
--model_args "pretrained=<model name>" \
--tasks tasks_examples/open_llm_leaderboard_tasks.txt \
--tasks examples/tasks/open_llm_leaderboard_tasks.txt \
--override_batch_size 1 \
--output_dir="./evals/"
```
@@ -220,7 +228,7 @@ However, we are very grateful to the Harness and HELM teams for their continued
- [metrics](https://github.com/huggingface/lighteval/tree/main/src/lighteval/metrics): All the available metrics you can use. They are described in metrics, and divided between sample metrics (applied at the sample level, such as a prediction accuracy) and corpus metrics (applied over the whole corpus). You'll also find available normalisation functions.
- [models](https://github.com/huggingface/lighteval/tree/main/src/lighteval/models): Possible models to use. We cover transformers (base_model), with adapter or delta weights, as well as TGI models locally deployed (it's likely the code here is out of date though), and brrr/nanotron models.
- [tasks](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks): Available tasks. The complete list is in `tasks_table.jsonl`, and you'll find all the prompts in `tasks_prompt_formatting.py`. Popular tasks requiring custom logic are exceptionally added in the [extended tasks](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/extended).
- [tasks_examples](https://github.com/huggingface/lighteval/tree/main/tasks_examples) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking.
- [examples/tasks](https://github.com/huggingface/lighteval/tree/main/examples/tasks) contains a list of available tasks you can launch. We advise using tasks in the `recommended_set`, as it's possible that some of the other tasks need double checking.
- [tests](https://github.com/huggingface/lighteval/tree/main/tests) contains our test suite, that we run at each PR to prevent regressions in metrics/prompts/tasks, for a subset of important tasks.

## Customisation
@@ -291,7 +299,7 @@ if __name__ == "__main__":

You can then give your custom metric to lighteval by using `--custom-tasks path_to_your_file` when launching it.

To see an example of a custom metric added along with a custom task, look at `tasks_examples/custom_tasks_with_custom_metrics/ifeval/ifeval.py`.
To see an example of a custom metric added along with a custom task, look at `examples/tasks/custom_tasks_with_custom_metrics/ifeval/ifeval.py`.
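
As a hedged illustration of wiring the two flags together (the `extended|ifeval|0|0` task string is an assumption about how that task registers itself, and the parser in this PR exposes the flag as `--custom_tasks`):

```shell
# Illustrative sketch: run the bundled ifeval custom task against a hub model.
accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py \
    --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \
    --custom_tasks examples/tasks/custom_tasks_with_custom_metrics/ifeval/ifeval.py \
    --tasks "extended|ifeval|0|0" \
    --override_batch_size 1 \
    --output_dir="./evals/"
```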

## Available metrics
### Metrics for multiple choice tasks
@@ -414,7 +422,7 @@ source <path_to_your_venv>/activate #or conda activate yourenv
cd <path_to_your_lighteval>/lighteval

export CUDA_LAUNCH_BLOCKING=1
srun accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py --model_args "pretrained=your model name" --tasks tasks_examples/open_llm_leaderboard_tasks.txt --override_batch_size 1 --save_details --output_dir=your output dir
srun accelerate launch --multi_gpu --num_processes=8 run_evals_accelerate.py --model_args "pretrained=your model name" --tasks examples/tasks/open_llm_leaderboard_tasks.txt --override_batch_size 1 --save_details --output_dir=your output dir
```

## Releases
12 changes: 12 additions & 0 deletions examples/model_configs/base_model.yaml
@@ -0,0 +1,12 @@
model:
  type: "base" # can be base, tgi, or endpoint
  base_params:
    model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
    dtype: "bfloat16"
  merged_weights: # Ignore this section if you are not using PEFT models
    delta_weights: false # set to True if your model should be merged with a base model, also need to provide the base model name
    adapter_weights: false # set to True if your model has been trained with peft, also need to provide the base model name
    base_model: null # path to the base_model
  generation:
    multichoice_continuations_start_space: false # Whether to force multiple choice continuations to start with a space
    no_multichoice_continuations_start_space: false # Whether to force multiple choice continuations to not start with a space
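
A quick, optional sanity check before launching a run: the sketch below assumes PyYAML is available in your environment and simply confirms the file parses into the nested structure shown above.

```shell
# Hypothetical check: print the parsed config to verify the YAML structure.
python -c "import yaml, pprint; pprint.pprint(yaml.safe_load(open('examples/model_configs/base_model.yaml')))"
```
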
18 changes: 18 additions & 0 deletions examples/model_configs/endpoint_model.yaml
@@ -0,0 +1,18 @@
model:
  type: "endpoint" # can be base, tgi, or endpoint
  base_params:
    endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
    model: "meta-llama/Llama-2-7b-hf"
    revision: "main"
    dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit" or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
    reuse_existing: false # if true, ignore all params in instance
  instance:
    accelerator: "gpu"
    region: "eu-west-1"
    vendor: "aws"
    instance_size: "medium"
    instance_type: "g5.2xlarge"
    framework: "pytorch"
    endpoint_type: "protected"
  generation:
    add_special_tokens: true
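
A hedged side note: with `reuse_existing: true` the runner looks up an existing endpoint by name instead of creating one, so you may want to confirm the endpoint exists and is running first. The sketch below uses `huggingface_hub.get_inference_endpoint` (already used by `endpoint_model.py` in this PR) and assumes you are logged in with a token that can see the endpoint.

```shell
# Hypothetical check: query the status of the named Inference Endpoint.
python -c "from huggingface_hub import get_inference_endpoint; print(get_inference_endpoint('llama-2-7B-lighteval').status)"
```
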
5 changes: 5 additions & 0 deletions examples/model_configs/tgi_model.yaml
@@ -0,0 +1,5 @@
model:
  type: "tgi" # can be base, tgi, or endpoint
  instance:
    inference_server_address: ""
    inference_server_auth: null
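
The `inference_server_address` is left empty on purpose. As a hedged sketch, one way to fill it is to serve a model locally with the official Text Generation Inference Docker image and point the config at it (the model id, port, and volume path below are placeholders):

```shell
# Illustrative sketch: start a local TGI server, then set
# inference_server_address: "http://localhost:8080" in tgi_model.yaml.
docker run --gpus all --shm-size 1g -p 8080:80 \
    -v $PWD/tgi-data:/data \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id HuggingFaceH4/zephyr-7b-beta
```
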
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
52 changes: 7 additions & 45 deletions run_evals_accelerate.py
@@ -34,54 +34,16 @@ def get_parser():
group = parser.add_mutually_exclusive_group(required=True)
task_type_group = parser.add_mutually_exclusive_group(required=True)

# Model type 1) Base model
weight_type_group = parser.add_mutually_exclusive_group()
weight_type_group.add_argument(
"--delta_weights",
action="store_true",
default=False,
help="set to True of your model should be merged with a base model, also need to provide the base model name",
)
weight_type_group.add_argument(
"--adapter_weights",
action="store_true",
default=False,
help="set to True of your model has been trained with peft, also need to provide the base model name",
)
parser.add_argument(
"--base_model", type=str, default=None, help="name of the base model to be used for delta or adapter weights"
)

# Model type: either use a config file or simply the model name
task_type_group.add_argument("--model_config_path")
task_type_group.add_argument("--model_args")
parser.add_argument("--model_dtype", type=str, default=None)
parser.add_argument(
"--multichoice_continuations_start_space",
action="store_true",
help="Whether to force multiple choice continuations to start with a space",
)
parser.add_argument(
"--no_multichoice_continuations_start_space",
action="store_true",
help="Whether to force multiple choice continuations to not start with a space",
)
parser.add_argument("--use_chat_template", default=False, action="store_true")
parser.add_argument("--system_prompt", type=str, default=None)
# Model type 2) TGI
task_type_group.add_argument("--inference_server_address", type=str)
parser.add_argument("--inference_server_auth", type=str, default=None)
# Model type 3) Inference endpoints
task_type_group.add_argument("--endpoint_model_name", type=str)
parser.add_argument("--revision", type=str)
parser.add_argument("--accelerator", type=str, default=None)
parser.add_argument("--vendor", type=str, default=None)
parser.add_argument("--region", type=str, default=None)
parser.add_argument("--instance_size", type=str, default=None)
parser.add_argument("--instance_type", type=str, default=None)
parser.add_argument("--reuse_existing", default=False, action="store_true")

# Debug
parser.add_argument("--max_samples", type=int, default=None)
parser.add_argument("--override_batch_size", type=int, default=-1)
parser.add_argument("--job_id", type=str, help="Optional Job ID for future reference", default="")
# Saving
parser.add_argument("--output_dir", required=True)
parser.add_argument("--push_results_to_hub", default=False, action="store_true")
parser.add_argument("--save_details", action="store_true")
parser.add_argument("--push_details_to_hub", default=False, action="store_true")
@@ -95,8 +57,8 @@ def get_parser():
help="Hub organisation where you want to store the results. Your current token must have write access to it",
)
# Common parameters
parser.add_argument("--output_dir", required=True)
parser.add_argument("--override_batch_size", type=int, default=-1)
parser.add_argument("--use_chat_template", default=False, action="store_true")
parser.add_argument("--system_prompt", type=str, default=None)
parser.add_argument("--dataset_loading_processes", type=int, default=1)
parser.add_argument(
"--custom_tasks",
19 changes: 8 additions & 11 deletions src/lighteval/main_accelerate.py
@@ -131,18 +131,15 @@ def main(args):
final_dict = evaluation_tracker.generate_final_dict()

with htrack_block("Cleaninp up"):
if args.delta_weights:
tmp_weights_dir = f"{evaluation_tracker.general_config_logger.model_name}-delta-applied"
hlog(f"Removing {tmp_weights_dir}")
shutil.rmtree(tmp_weights_dir)
if args.adapter_weights:
tmp_weights_dir = f"{evaluation_tracker.general_config_logger.model_name}-adapter-applied"
hlog(f"Removing {tmp_weights_dir}")
shutil.rmtree(tmp_weights_dir)
for weights in ["delta", "adapter"]:
try:
tmp_weights_dir = f"{evaluation_tracker.general_config_logger.model_name}-{weights}-applied"
hlog(f"Removing {tmp_weights_dir}")
shutil.rmtree(tmp_weights_dir)
except OSError:
pass

print(make_results_table(final_dict))

if not args.reuse_existing:
model.cleanup()

model.cleanup()
return final_dict
6 changes: 3 additions & 3 deletions src/lighteval/models/base_model.py
@@ -42,7 +42,7 @@
LoglikelihoodReturn,
LoglikelihoodSingleTokenReturn,
)
from lighteval.models.utils import _get_dtype, _get_precision, _simplify_name, batched
from lighteval.models.utils import _get_dtype, _simplify_name, batched
from lighteval.tasks.requests import (
GreedyUntilMultiTurnRequest,
GreedyUntilRequest,
@@ -88,7 +88,7 @@ def __init__(
self.multichoice_continuations_start_space = config.multichoice_continuations_start_space

# We are in DP (and launch the script with `accelerate launch`)
if not config.model_parallel and not config.load_in_4bit and not config.load_in_8bit:
if not config.model_parallel and config.quantization_config is None:
# might need to use accelerate instead
# self.model = config.accelerator.prepare(self.model)
hlog(f"Using Data Parallelism, putting model on device {self._device}")
@@ -97,7 +97,7 @@ def __init__(
self.model_name = _simplify_name(config.pretrained)
self.model_sha = config.get_model_sha()

self.precision = _get_precision(config, model_auto_config=self._config)
self.precision = _get_dtype(config.dtype, config=self._config)

@property
def tokenizer(self):
3 changes: 2 additions & 1 deletion src/lighteval/models/endpoint_model.py
@@ -63,6 +63,7 @@ class InferenceEndpointModel(LightevalModel):
def __init__(
self, config: Union[InferenceEndpointModelConfig, InferenceModelConfig], env_config: EnvConfig
) -> None:
self.reuse_existing = getattr(config, "should_reuse_existing", True)
if isinstance(config, InferenceEndpointModelConfig):
if config.should_reuse_existing:
self.endpoint = get_inference_endpoint(name=config.name, token=env_config.token)
@@ -130,7 +131,7 @@ def disable_tqdm(self) -> bool:
False # no accelerator = this is the main process

def cleanup(self):
if self.endpoint is not None:
if self.endpoint is not None and not self.reuse_existing:
self.endpoint.delete()
hlog_warn(
"You deleted your endpoint after using it. You'll need to create it again if you need to reuse it."