diff --git a/README.md b/README.md
index 44af6a8a5..47f4c07fd 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,7 @@ Here’s a quick command to evaluate using the Accelerate backend:
 ```shell
 lighteval accelerate \
     "model_name=gpt2" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 ## 🙏 Acknowledgements
diff --git a/community_tasks/custom_task_classification_grammar_task.py b/community_tasks/custom_task_classification_grammar_task.py
index f513cf0bf..5b248093b 100644
--- a/community_tasks/custom_task_classification_grammar_task.py
+++ b/community_tasks/custom_task_classification_grammar_task.py
@@ -32,7 +32,7 @@ Example usage:
 
     TGI endpoint evaluation:
     ```bash
-    uv run --active --extra litellm --extra tgi lighteval endpoint tgi examples/model_configs/tgi_model.yaml "custom|emotion_classification|0|0"
+    uv run --active --extra litellm --extra tgi lighteval endpoint tgi examples/model_configs/tgi_model.yaml "custom|emotion_classification|0"
     --custom-tasks examples/custom_tasks_templates/custom_task_classification_grammar_task.py
     --output-dir results
     --save-details
@@ -449,8 +449,8 @@ def get_emotion_classification_grammar() -> TextGenerationInputGrammarType:
 
     print("\nUsage Examples:")
     print(
-        f"  TGI: uv run lighteval endpoint tgi config/tgi/tgi.yaml 'custom|{task.name}|0|0' --custom-tasks {__file__} --output-dir results --override-batch-size 1 --use-chat-template --save-details --no-public-run --max-samples 10"
+        f"  TGI: uv run lighteval endpoint tgi config/tgi/tgi.yaml 'custom|{task.name}|0' --custom-tasks {__file__} --output-dir results --override-batch-size 1 --use-chat-template --save-details --no-public-run --max-samples 10"
     )
     print(
-        f"  Full: uv run lighteval endpoint tgi config/tgi/tgi.yaml 'custom|{task.name}|5|1' --custom-tasks {__file__} --output-dir results --override-batch-size 1 --use-chat-template --save-details --no-public-run"
+        f"  Full: uv run lighteval endpoint tgi config/tgi/tgi.yaml 'custom|{task.name}|5' --custom-tasks {__file__} --output-dir results --override-batch-size 1 --use-chat-template --save-details --no-public-run"
     )
diff --git a/community_tasks/filipino_evals.py b/community_tasks/filipino_evals.py
index 1ce362c1d..45011535e 100644
--- a/community_tasks/filipino_evals.py
+++ b/community_tasks/filipino_evals.py
@@ -42,7 +42,7 @@
 from langcodes import Language as LangCodeLanguage
 from langcodes import standardize_tag
 
-from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
 from lighteval.metrics.metrics import Metrics
 from lighteval.metrics.normalizations import (
     LogProbCharNorm,
@@ -87,8 +87,8 @@
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
             ],
         ),
     )
@@ -117,8 +117,8 @@
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
             ],
         ),
     )
@@ -154,9 +154,9 @@
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         version=0,
@@ -191,9 +191,9 @@
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test"],
@@ -275,9 +275,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["train", "test"],
@@ -327,9 +327,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
     )
@@ -360,9 +360,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test"],
@@ -396,9 +396,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         hf_subset="default",
         evaluation_splits=["tl"],
         metrics=[
-            loglikelihood_acc_metric(normalization=None),
-            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-            loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+            LogLikelihoodAccMetric(normalization=None),
+            LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+            LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
         ],
     )
     for formulation in [HybridFormulation(), MCFFormulation()]
@@ -427,9 +427,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=None),
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=None),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
             ],
         ),
         trust_dataset=True,
@@ -509,9 +509,9 @@ def create_sib200_task(language: Language, formulation):
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test", "validation"],
@@ -565,9 +565,9 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test"],
@@ -595,9 +595,9 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test"],
@@ -718,9 +718,9 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
            ],
         ),
         version=0,
@@ -762,9 +762,9 @@ def create_universalner_task(language: Language, formulation):
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         version=0,
diff --git a/community_tasks/serbian_eval.py b/community_tasks/serbian_eval.py
index 38e8b257e..d6d86ab00 100644
--- a/community_tasks/serbian_eval.py
+++ b/community_tasks/serbian_eval.py
@@ -298,7 +298,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.ARC_EASY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 arc_challenge = create_task_config(
@@ -306,7 +306,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.ARC_CHALLENGE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -318,14 +318,14 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.HELLASWAG.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 piqa = create_task_config(
     task_name="serbian_evals:piqa",
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.PIQA.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 winogrande = create_task_config(
@@ -333,7 +333,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.WINOGRANDE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -357,7 +357,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ANATOMY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_astronomy = create_task_config(
@@ -365,7 +365,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ASTRONOMY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_business_ethics = create_task_config(
@@ -373,7 +373,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_BUSINESS_ETHICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_clinical_knowledge = create_task_config(
@@ -381,7 +381,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_CLINICAL_KNOWLEDGE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_miscellaneous = create_task_config(
@@ -389,7 +389,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MISCELLANEOUS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_electrical_engineering = create_task_config(
@@ -397,7 +397,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ELECTRONIC_ENGINEERING.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -409,7 +409,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_SERBIAN_ALL.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -421,7 +421,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MARKETING.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_management = create_task_config(
@@ -429,7 +429,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MANAGEMENT.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -441,7 +441,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_BIOLOGY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_chemistry = create_task_config(
@@ -449,7 +449,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_CHEMISTRY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_computer_science = create_task_config(
@@ -457,7 +457,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_COMPUTER_SCIENCE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_mathematics = create_task_config(
@@ -465,7 +465,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_MATHEMATICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_medicine = create_task_config(
@@ -473,7 +473,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_MEDICINE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_physics = create_task_config(
@@ -481,7 +481,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_PHYSICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_computer_security = create_task_config(
@@ -489,7 +489,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_COMPUTER_SECURITY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -501,7 +501,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MORAL_DISPUTES.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_moral_scenarios = create_task_config(
@@ -509,7 +509,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MORAL_SCENARIOS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_philosophy = create_task_config(
@@ -517,7 +517,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_PHILOSOPHY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_world_religions = create_task_config(
@@ -525,7 +525,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_WORLD_RELIGIONS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -537,7 +537,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_BIOLOGY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_chemistry = create_task_config(
@@ -545,7 +545,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_CHEMISTRY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_computer_science = create_task_config(
@@ -553,7 +553,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_COMPUTER_SCIENCE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_european_history = create_task_config(
@@ -561,7 +561,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_EURO_HISTORY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_geography = create_task_config(
@@ -569,7 +569,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_GEOGRAPHY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_mathematics = create_task_config(
@@ -577,7 +577,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_MATHEMATICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_microeconomics = create_task_config(
@@ -585,7 +585,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_MICROECONOMICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_physics = create_task_config(
@@ -593,7 +593,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_PHYSICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_psychology = create_task_config(
@@ -601,7 +601,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_PSYCHOLOGY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_statistics = create_task_config(
@@ -609,7 +609,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_STATISTICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_world_history = create_task_config(
@@ -617,7 +617,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_WORLD_HISTORY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -629,7 +629,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ABSTRACT_ALGEBRA.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_elementary_mathematics = create_task_config(
@@ -637,7 +637,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ELEMENTARY_MATHEMATICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_formal_logic = create_task_config(
@@ -645,7 +645,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_FORMAL_LOGIC.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_conceptual_physics = create_task_config(
@@ -653,7 +653,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_CONCEPTUAL_PHYSICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_econometrics = create_task_config(
@@ -661,7 +661,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ECONOMETRICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_machine_learning = create_task_config(
@@ -669,7 +669,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MACHINE_LEARNING.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -681,7 +681,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_GLOBAL_FACT.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_logical_fallacies = create_task_config(
@@ -689,7 +689,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_LOGICAL_FALLACIES.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_sociology = create_task_config(
@@ -697,7 +697,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_SOCIOLOGY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_human_aging = create_task_config(
@@ -705,7 +705,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HUMAN_AGING.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -717,7 +717,7 @@ def create_task_config(
     prompt_function=boolq_serbian,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.BOOLQ.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 openbook_qa = create_task_config(
@@ -725,7 +725,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.OPENBOOK.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
diff --git a/community_tasks/turkic_evals.py b/community_tasks/turkic_evals.py
index 9eae65d5b..242b25f81 100644
--- a/community_tasks/turkic_evals.py
+++ b/community_tasks/turkic_evals.py
@@ -123,7 +123,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=partial(tumlu_pfn, language=hf_subset),
             hf_repo="jafarisbarov/TUMLU-mini",
-            metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+            metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
             hf_avail_splits=["test", "dev"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx
index ae4076203..448941aa3 100644
--- a/docs/source/adding-a-custom-task.mdx
+++ b/docs/source/adding-a-custom-task.mdx
@@ -126,6 +126,6 @@ Once your file is created you can then run the evaluation with the following com
 ```bash
 lighteval accelerate \
     "model_name=HuggingFaceH4/zephyr-7b-beta" \
-    "community|{custom_task}|{fewshots}|{truncate_few_shot}" \
+    "community|{custom_task}|{fewshots}" \
     --custom-tasks {path_to_your_custom_task_file}
 ```
diff --git a/docs/source/evaluating-a-custom-model.mdx b/docs/source/evaluating-a-custom-model.mdx
index 1b055dedd..97a30aa53 100644
--- a/docs/source/evaluating-a-custom-model.mdx
+++ b/docs/source/evaluating-a-custom-model.mdx
@@ -56,7 +56,7 @@ You can evaluate your custom model using either the command line interface or th
 lighteval custom \
     "google-translate" \
     "examples/custom_models/google_translate_model.py" \
-    "lighteval|wmt20:fr-de|0|0" \
+    "lighteval|wmt20:fr-de|0" \
     --max-samples 10
 ```
 
@@ -91,7 +91,7 @@ model_config = CustomModelConfig(
 
 # Create and run the pipeline
 pipeline = Pipeline(
-    tasks="leaderboard|truthfulqa:mc|0|0",
+    tasks="leaderboard|truthfulqa:mc|0",
     pipeline_parameters=pipeline_params,
     evaluation_tracker=evaluation_tracker,
     model_config=model_config
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index de2059f49..682a08cda 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -27,7 +27,7 @@ To evaluate `GPT-2` on the Truthful QA benchmark with [🤗
 ```bash
 lighteval accelerate \
     "model_name=openai-community/gpt2" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 Here, we first choose a backend (either `accelerate`, `nanotron`, `endpoint`, or `vllm`), and then specify the model and task(s) to
 run.
@@ -38,12 +38,9 @@ Valid key-value pairs correspond with the backend configuration, and are detaile
 The syntax for the task specification might be a bit hard to grasp at first. The format is as follows:
 
 ```txt
-{suite}|{task}|{num_few_shot}|{0 for strict `num_few_shots`, or 1 to allow a truncation if context size is too small}
+{suite}|{task}|{num_few_shot}
 ```
 
-If the fourth value is set to 1, lighteval will check if the prompt (including the few-shot examples) is too long for the context size of the task or the model.
-If so, the number of few shot examples is automatically reduced.
-
 Tasks have a function applied at the sample level and one at the corpus level. For example,
 - an exact match can be applied per sample, then averaged over the corpus to give the final score
 - samples can be left untouched before applying Corpus BLEU at the corpus level
 etc.
 
 If the task you are looking at has a sample level function (`sample_level_fn`) which can be parametrized, you can pass parameters in the CLI. For example
 
 ```txt
-{suite}|{task}@{parameter_name1}={value1},{parameter_name2}={value2},...|0|0
+{suite}|{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
 ```
 
 All officially supported tasks can be found at the [tasks_list](available-tasks) and in the
@@ -71,7 +68,7 @@ When specifying a path to file, it should start with `./`.
 lighteval accelerate \
     "model_name=openai-community/gpt2" \
     ./path/to/lighteval/examples/tasks/recommended_set.txt
-# or, e.g., "leaderboard|truthfulqa:mc|0|0|,leaderboard|gsm8k|3|1"
+# or, e.g., "leaderboard|truthfulqa:mc|0,leaderboard|gsm8k|3"
 ```
 
 ## Evaluate a model on one or more GPUs
@@ -90,7 +87,7 @@ You can then evaluate a model using data parallelism on 8 GPUs like follows:
 ```bash
 accelerate launch --multi_gpu --num_processes=8 -m \
     lighteval accelerate \
     "model_name=openai-community/gpt2" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 Here, `--override_batch_size` defines the batch size per device, so the effective
@@ -103,7 +100,7 @@ To evaluate a model using pipeline parallelism on 2 or more GPUs, run:
 ```bash
 lighteval accelerate \
     "model_name=openai-community/gpt2,model_parallel=True" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 This will automatically use accelerate to distribute the model across the GPUs.
@@ -134,7 +131,7 @@ think tokens.
 ```bash
 lighteval vllm \
     "model_name=mistralai/Magistral-Small-2507,dtype=float16,data_parallel_size=4" \
-    "lighteval|aime24|0|0" \
+    "lighteval|aime24|0" \
     --remove-reasoning-tags \
     --reasoning-tags="[('[THINK]','[/THINK]')]"
 ```
diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx
index 4a6f379a1..a8a3fb2bc 100644
--- a/docs/source/saving-and-reading-results.mdx
+++ b/docs/source/saving-and-reading-results.mdx
@@ -203,12 +203,9 @@ The detail file contains the following columns:
         "hash_input_tokens": "29916e7afe5cb51d",
         "hash_cont_tokens": "37f91ce23ef6d435"
       },
-      "truncated": 2,
-      "non_truncated": 0,
       "padded": 0,
       "non_padded": 2,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
+      "effective_few_shots": 0.0
     }
   },
   "summary_general": {
@@ -218,11 +215,8 @@ The detail file contains the following columns:
     "hash_input_tokens": "ac933feb14f96d7b",
     "hash_cont_tokens": "9d03fb26f8da7277"
   },
-  "truncated": 2,
-  "non_truncated": 0,
   "padded": 0,
-  "non_padded": 2,
-  "num_truncated_few_shots": 0
+  "non_padded": 2
   }
 }
 ```
diff --git a/docs/source/use-inference-providers-as-backend.mdx b/docs/source/use-inference-providers-as-backend.mdx
index 1e49e4931..70b436a8a 100644
--- a/docs/source/use-inference-providers-as-backend.mdx
+++ b/docs/source/use-inference-providers-as-backend.mdx
@@ -12,7 +12,7 @@ Lighteval allows to use Hugging Face's Inference Providers to evaluate llms on s
 ```bash
 lighteval endpoint inference-providers \
     "model_name=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
-    "lighteval|gsm8k|0|0"
+    "lighteval|gsm8k|0"
 ```
 
 ## Using a config file
@@ -22,7 +22,7 @@ You can use config files to define the model and the provider to use.
 ```bash
 lighteval endpoint inference-providers \
     examples/model_configs/inference_providers.yaml \
-    "lighteval|gsm8k|0|0"
+    "lighteval|gsm8k|0"
 ```
 
 with the following config file:
diff --git a/docs/source/use-litellm-as-backend.mdx b/docs/source/use-litellm-as-backend.mdx
index 1bcbae6bf..36ecf841d 100644
--- a/docs/source/use-litellm-as-backend.mdx
+++ b/docs/source/use-litellm-as-backend.mdx
@@ -11,7 +11,7 @@ Documentation for available APIs and compatible endpoints can be found [here](ht
 ```bash
 lighteval endpoint litellm \
     "provider=openai,model_name=gpt-3.5-turbo" \
-    "lighteval|gsm8k|0|0" \
+    "lighteval|gsm8k|0"
 ```
 
 ## Using a config file
diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/use-sglang-as-backend.mdx
index cfa4352eb..a13c1b82a 100644
--- a/docs/source/use-sglang-as-backend.mdx
+++ b/docs/source/use-sglang-as-backend.mdx
@@ -6,7 +6,7 @@ To use, simply change the `model_args` to reflect the arguments you want to pass
 ```bash
 lighteval sglang \
     "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 `sglang` is able to distribute the model across multiple GPUs using data
@@ -18,7 +18,7 @@ For example if you have 4 GPUs you can split it across using `tp_size`:
 ```bash
 lighteval sglang \
     "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tp_size=4" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 Or, if your model fits on a single GPU, you can use `dp_size` to speed up the evaluation:
@@ -26,7 +26,7 @@ Or, if your model fits on a single GPU, you can use `dp_size` to speed up the ev
 ```bash
 lighteval sglang \
     "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,dp_size=4" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 ## Use a config file
@@ -37,7 +37,7 @@ An example of a config file is shown below and can be found at `examples/model_c
 ```bash
 lighteval sglang \
     "examples/model_configs/sglang_model_config.yaml" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 > [!TIP]
diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx
index 3a1dbfded..05ec1edde 100644
--- a/docs/source/use-vllm-as-backend.mdx
+++ b/docs/source/use-vllm-as-backend.mdx
@@ -10,7 +10,7 @@ To use, simply change the `model_args` to reflect the arguments you want to pass
 ```bash
 lighteval vllm \
     "model_name=HuggingFaceH4/zephyr-7b-beta" \
-    "extended|ifeval|0|0"
+    "extended|ifeval|0"
 ```
 
 `vllm` is able to distribute the model across multiple GPUs using data
@@ -22,7 +22,7 @@ For example if you have 4 GPUs you can split it across using `tensor_parallelism
 ```bash
 export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \
     "model_name=HuggingFaceH4/zephyr-7b-beta,tensor_parallel_size=4" \
-    "extended|ifeval|0|0"
+    "extended|ifeval|0"
 ```
 
 Or, if your model fits on a single GPU, you can use `data_parallelism` to speed up the evaluation:
@@ -30,7 +30,7 @@ Or, if your model fits on a single GPU, you can use `data_parallelism` to speed
 ```bash
 export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \
     "model_name=HuggingFaceH4/zephyr-7b-beta,data_parallel_size=4" \
-    "extended|ifeval|0|0"
+    "extended|ifeval|0"
 ```
 
 ## Use a config file
@@ -41,7 +41,7 @@ An example of a config file is shown below and can be found at `examples/model_c
 ```bash
 lighteval vllm \
     "examples/model_configs/vllm_model_config.yaml" \
-    "extended|ifeval|0|0"
+    "extended|ifeval|0"
 ```
 
 ```yaml
diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx
index 81fb3e4d9..28927d8de 100644
--- a/docs/source/using-the-python-api.mdx
+++ b/docs/source/using-the-python-api.mdx
@@ -41,7 +41,7 @@ def main():
         dtype="float16",
     )
 
-    task = "helm|mmlu|5|1"
+    task = "helm|mmlu|5"
 
     pipeline = Pipeline(
         tasks=task,
diff --git a/examples/custom_models/local_mt_model.py b/examples/custom_models/local_mt_model.py
index 18b604a5e..5d74aa78c 100644
--- a/examples/custom_models/local_mt_model.py
+++ b/examples/custom_models/local_mt_model.py
@@ -69,7 +69,7 @@ class LocalMTClient(LightevalModel):
     where src and tgt are ISO language codes (2 or 3 letter codes supported).
 
     Example:
-    ```lighteval custom facebook/seamless-m4t-v2-large examples/custom_models/local_mt_model.py "lighteval|wmt20:fr-de|0|0" --max-samples 10 --save-details
+    ```lighteval custom facebook/seamless-m4t-v2-large examples/custom_models/local_mt_model.py "lighteval|wmt20:fr-de|0" --max-samples 10 --save-details
     ```
 
     Note:
diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py
index 8430bee08..c0e166116 100644
--- a/examples/nanotron/custom_evaluation_tasks.py
+++ b/examples/nanotron/custom_evaluation_tasks.py
@@ -197,7 +197,7 @@ def preprocess(text):
 
 
 # 0 short for common sense
-COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS]
+COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0") for t in COMMON_SENSE_REASONING_TASKS]
 _TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING)
 _TASKS += COMMON_SENSE_REASONING_TASKS
 
@@ -239,8 +239,8 @@ def natural_questions_prompt(line, task_name: str = None):
 ]
 
 
-WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5|1") for t in WORLD_KNOWLEDGE_TASKS]
-# WORLD_KNOWLEDGE_STRING = {t: f'custom|{t.name}|0|1' for t in WORLD_KNOWLEDGE_TASKS}
+WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5") for t in WORLD_KNOWLEDGE_TASKS]
+# WORLD_KNOWLEDGE_STRING = {t: f'custom|{t.name}|0' for t in WORLD_KNOWLEDGE_TASKS}
 _TASKS_STRINGS.extend(WORLD_KNOWLEDGE_STRING)
 _TASKS += WORLD_KNOWLEDGE_TASKS
 
@@ -278,7 +278,7 @@ def boolq_prompt(line, task_name: str = None):
 ]
 
 
-READING_COMP_STRING = [(t, f"custom|{t.name}|0|1") for t in READING_COMP_TASKS]
+READING_COMP_STRING = [(t, f"custom|{t.name}|0") for t in READING_COMP_TASKS]
 _TASKS_STRINGS.extend(READING_COMP_STRING)
 _TASKS += READING_COMP_TASKS
 
@@ -342,8 +342,8 @@ def __init__(
     )
 
 
-MATH_STRING = [(t, f"custom|{t.name}|4|1") for t in MATH_TASKS]
-GSM8K_STRING = [(GSM8K, f"custom|{GSM8K.name}|8|1")]
+MATH_STRING = [(t, f"custom|{t.name}|4") for t in MATH_TASKS]
+GSM8K_STRING = [(GSM8K, f"custom|{GSM8K.name}|8")]
 _TASKS_STRINGS.extend(MATH_STRING)
 _TASKS_STRINGS.extend(GSM8K_STRING)
 _TASKS += MATH_TASKS + [GSM8K]
@@ -484,8 +484,8 @@ def __init__(
 ]
 
 
-# MMLU_STRING = {t: f'custom|{t.name}|5|1' for t in MMLU_TASKS}
-MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS]
+# MMLU_STRING = {t: f'custom|{t.name}|5' for t in MMLU_TASKS}
+MMLU_STRING = [(t, f"custom|{t.name}|0") for t in MMLU_TASKS]
 _TASKS_STRINGS.extend(MMLU_STRING)
 _TASKS += MMLU_TASKS
 
@@ -571,8 +571,8 @@ def __init__(
 ]
 
 
-# BBH_STRING = {t: f'custom|{t.name}|3|1' for t in BBH_TASKS}
-BBH_STRING = [(t, f"custom|{t.name}|0|1") for t in BBH_TASKS]
+# BBH_STRING = {t: f'custom|{t.name}|3' for t in BBH_TASKS}
+BBH_STRING = [(t, f"custom|{t.name}|0") for t in BBH_TASKS]
 _TASKS_STRINGS.extend(BBH_STRING)
 _TASKS += BBH_TASKS
 
@@ -687,8 +687,8 @@ def __init__(
 ]
 
 
-# AGIEVAL_STRING = {t: f'custom|{t.name}|5|1' for t in AGIEVAL_TASKS}
-AGIEVAL_STRING = [(t, f"custom|{t.name}|0|1") for t in AGIEVAL_TASKS]
+# AGIEVAL_STRING = {t: f'custom|{t.name}|5' for t in AGIEVAL_TASKS}
+AGIEVAL_STRING = [(t, f"custom|{t.name}|0") for t in AGIEVAL_TASKS]
 _TASKS_STRINGS.extend(AGIEVAL_STRING)
 _TASKS += AGIEVAL_TASKS
 
diff --git a/examples/nanotron/lighteval_config_override_template.yaml b/examples/nanotron/lighteval_config_override_template.yaml
index 50886ced0..433498c34 100644
--- a/examples/nanotron/lighteval_config_override_template.yaml
+++ b/examples/nanotron/lighteval_config_override_template.yaml
@@ -20,4 +20,4 @@ tasks:
   max_samples: 10
   multichoice_continuations_start_space: null
   num_fewshot_seeds: null
-  tasks: lighteval|gsm8k|5|1
+  tasks: lighteval|gsm8k|5
diff --git a/examples/tasks/OALL_v1_tasks.txt b/examples/tasks/OALL_v1_tasks.txt
index 08e9a51cd..daecb62a7 100644
--- a/examples/tasks/OALL_v1_tasks.txt
+++ b/examples/tasks/OALL_v1_tasks.txt
@@ -1,136 +1,136 @@
-lighteval|xstory_cloze:ar|0|0
-community|arabic_mmlu_mt:abstract_algebra|0|0
-community|arabic_mmlu_mt:anatomy|0|0
-community|arabic_mmlu_mt:astronomy|0|0
-community|arabic_mmlu_mt:business_ethics|0|0
-community|arabic_mmlu_mt:clinical_knowledge|0|0
-community|arabic_mmlu_mt:college_biology|0|0
-community|arabic_mmlu_mt:college_chemistry|0|0
-community|arabic_mmlu_mt:college_computer_science|0|0
-community|arabic_mmlu_mt:college_mathematics|0|0
-community|arabic_mmlu_mt:college_medicine|0|0
-community|arabic_mmlu_mt:college_physics|0|0
-community|arabic_mmlu_mt:computer_security|0|0
-community|arabic_mmlu_mt:conceptual_physics|0|0
-community|arabic_mmlu_mt:econometrics|0|0
-community|arabic_mmlu_mt:electrical_engineering|0|0
-community|arabic_mmlu_mt:elementary_mathematics|0|0
-community|arabic_mmlu_mt:formal_logic|0|0
-community|arabic_mmlu_mt:global_facts|0|0
-community|arabic_mmlu_mt:high_school_biology|0|0
-community|arabic_mmlu_mt:high_school_chemistry|0|0
-community|arabic_mmlu_mt:high_school_computer_science|0|0
-community|arabic_mmlu_mt:high_school_european_history|0|0
-community|arabic_mmlu_mt:high_school_geography|0|0
-community|arabic_mmlu_mt:high_school_government_and_politics|0|0
-community|arabic_mmlu_mt:high_school_macroeconomics|0|0
-community|arabic_mmlu_mt:high_school_mathematics|0|0
-community|arabic_mmlu_mt:high_school_microeconomics|0|0
-community|arabic_mmlu_mt:high_school_physics|0|0
-community|arabic_mmlu_mt:high_school_psychology|0|0
-community|arabic_mmlu_mt:high_school_statistics|0|0
-community|arabic_mmlu_mt:high_school_us_history|0|0
-community|arabic_mmlu_mt:high_school_world_history|0|0
-community|arabic_mmlu_mt:human_aging|0|0
-community|arabic_mmlu_mt:human_sexuality|0|0
-community|arabic_mmlu_mt:international_law|0|0
-community|arabic_mmlu_mt:jurisprudence|0|0
-community|arabic_mmlu_mt:logical_fallacies|0|0
-community|arabic_mmlu_mt:machine_learning|0|0
-community|arabic_mmlu_mt:management|0|0
-community|arabic_mmlu_mt:marketing|0|0
-community|arabic_mmlu_mt:medical_genetics|0|0
-community|arabic_mmlu_mt:miscellaneous|0|0
-community|arabic_mmlu_mt:moral_disputes|0|0
-community|arabic_mmlu_mt:moral_scenarios|0|0
-community|arabic_mmlu_mt:nutrition|0|0
-community|arabic_mmlu_mt:philosophy|0|0
-community|arabic_mmlu_mt:prehistory|0|0
-community|arabic_mmlu_mt:professional_accounting|0|0
-community|arabic_mmlu_mt:professional_law|0|0
-community|arabic_mmlu_mt:professional_medicine|0|0
-community|arabic_mmlu_mt:professional_psychology|0|0
-community|arabic_mmlu_mt:public_relations|0|0
-community|arabic_mmlu_mt:security_studies|0|0
-community|arabic_mmlu_mt:sociology|0|0
-community|arabic_mmlu_mt:us_foreign_policy|0|0
-community|arabic_mmlu_mt:virology|0|0
-community|arabic_mmlu_mt:world_religions|0|0
-community|arabic_exams|0|0
-community|acva:Algeria|0|0
-community|acva:Ancient_Egypt|0|0
-community|acva:Arab_Empire|0|0
-community|acva:Arabic_Architecture|0|0
-community|acva:Arabic_Art|0|0
-community|acva:Arabic_Astronomy|0|0
-community|acva:Arabic_Calligraphy|0|0
-community|acva:Arabic_Ceremony|0|0
-community|acva:Arabic_Clothing|0|0
-community|acva:Arabic_Culture|0|0
-community|acva:Arabic_Food|0|0
-community|acva:Arabic_Funeral|0|0
-community|acva:Arabic_Geography|0|0
-community|acva:Arabic_History|0|0
-community|acva:Arabic_Language_Origin|0|0
-community|acva:Arabic_Literature|0|0
-community|acva:Arabic_Math|0|0
-community|acva:Arabic_Medicine|0|0
-community|acva:Arabic_Music|0|0
-community|acva:Arabic_Ornament|0|0
-community|acva:Arabic_Philosophy|0|0
-community|acva:Arabic_Physics_and_Chemistry|0|0
-community|acva:Arabic_Wedding|0|0
-community|acva:Bahrain|0|0
-community|acva:Comoros|0|0
-community|acva:Egypt_modern|0|0
-community|acva:InfluenceFromAncientEgypt|0|0
-community|acva:InfluenceFromByzantium|0|0
-community|acva:InfluenceFromChina|0|0
-community|acva:InfluenceFromGreece|0|0
-community|acva:InfluenceFromIslam|0|0
-community|acva:InfluenceFromPersia|0|0
-community|acva:InfluenceFromRome|0|0
-community|acva:Iraq|0|0
-community|acva:Islam_Education|0|0
-community|acva:Islam_branches_and_schools|0|0
-community|acva:Islamic_law_system|0|0
-community|acva:Jordan|0|0
-community|acva:Kuwait|0|0
-community|acva:Lebanon|0|0
-community|acva:Libya|0|0
-community|acva:Mauritania|0|0
-community|acva:Mesopotamia_civilization|0|0
-community|acva:Morocco|0|0
-community|acva:Oman|0|0
-community|acva:Palestine|0|0
-community|acva:Qatar|0|0
-community|acva:Saudi_Arabia|0|0
-community|acva:Somalia|0|0
-community|acva:Sudan|0|0
-community|acva:Syria|0|0
-community|acva:Tunisia|0|0
-community|acva:United_Arab_Emirates|0|0
-community|acva:Yemen|0|0
-community|acva:communication|0|0
-community|acva:computer_and_phone|0|0
-community|acva:daily_life|0|0
-community|acva:entertainment|0|0
-community|alghafa:mcq_exams_test_ar|0|0
-community|alghafa:meta_ar_dialects|0|0
-community|alghafa:meta_ar_msa|0|0
-community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0
-community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0
-community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0
-community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0
-community|alghafa:multiple_choice_rating_sentiment_task|0|0
-community|alghafa:multiple_choice_sentiment_task|0|0
-community|race_ar|0|0
-community|piqa_ar|0|0
-community|arc_easy_ar|0|0
-community|arc_challenge_okapi_ar|0|0
-community|openbook_qa_ext_ar|0|0
-community|boolq_ar|0|0
-community|copa_ext_ar|0|0
-community|hellaswag_okapi_ar|0|0
-community|toxigen_ar|0|0
-community|sciq_ar|0|0
+lighteval|xstory_cloze:ar|0
+community|arabic_mmlu_mt:abstract_algebra|0
+community|arabic_mmlu_mt:anatomy|0
+community|arabic_mmlu_mt:astronomy|0
+community|arabic_mmlu_mt:business_ethics|0
+community|arabic_mmlu_mt:clinical_knowledge|0
+community|arabic_mmlu_mt:college_biology|0
+community|arabic_mmlu_mt:college_chemistry|0
+community|arabic_mmlu_mt:college_computer_science|0
+community|arabic_mmlu_mt:college_mathematics|0
+community|arabic_mmlu_mt:college_medicine|0
+community|arabic_mmlu_mt:college_physics|0
+community|arabic_mmlu_mt:computer_security|0
+community|arabic_mmlu_mt:conceptual_physics|0
+community|arabic_mmlu_mt:econometrics|0
+community|arabic_mmlu_mt:electrical_engineering|0
+community|arabic_mmlu_mt:elementary_mathematics|0
+community|arabic_mmlu_mt:formal_logic|0
+community|arabic_mmlu_mt:global_facts|0
+community|arabic_mmlu_mt:high_school_biology|0
+community|arabic_mmlu_mt:high_school_chemistry|0
+community|arabic_mmlu_mt:high_school_computer_science|0
+community|arabic_mmlu_mt:high_school_european_history|0
+community|arabic_mmlu_mt:high_school_geography|0
+community|arabic_mmlu_mt:high_school_government_and_politics|0
+community|arabic_mmlu_mt:high_school_macroeconomics|0
+community|arabic_mmlu_mt:high_school_mathematics|0
+community|arabic_mmlu_mt:high_school_microeconomics|0
+community|arabic_mmlu_mt:high_school_physics|0
+community|arabic_mmlu_mt:high_school_psychology|0
+community|arabic_mmlu_mt:high_school_statistics|0
+community|arabic_mmlu_mt:high_school_us_history|0
+community|arabic_mmlu_mt:high_school_world_history|0
+community|arabic_mmlu_mt:human_aging|0
+community|arabic_mmlu_mt:human_sexuality|0
+community|arabic_mmlu_mt:international_law|0
+community|arabic_mmlu_mt:jurisprudence|0
+community|arabic_mmlu_mt:logical_fallacies|0
+community|arabic_mmlu_mt:machine_learning|0
+community|arabic_mmlu_mt:management|0
+community|arabic_mmlu_mt:marketing|0
+community|arabic_mmlu_mt:medical_genetics|0
+community|arabic_mmlu_mt:miscellaneous|0
+community|arabic_mmlu_mt:moral_disputes|0
+community|arabic_mmlu_mt:moral_scenarios|0
+community|arabic_mmlu_mt:nutrition|0
+community|arabic_mmlu_mt:philosophy|0
+community|arabic_mmlu_mt:prehistory|0
+community|arabic_mmlu_mt:professional_accounting|0
+community|arabic_mmlu_mt:professional_law|0
+community|arabic_mmlu_mt:professional_medicine|0
+community|arabic_mmlu_mt:professional_psychology|0
+community|arabic_mmlu_mt:public_relations|0
+community|arabic_mmlu_mt:security_studies|0
+community|arabic_mmlu_mt:sociology|0
+community|arabic_mmlu_mt:us_foreign_policy|0
+community|arabic_mmlu_mt:virology|0
+community|arabic_mmlu_mt:world_religions|0
+community|arabic_exams|0
+community|acva:Algeria|0
+community|acva:Ancient_Egypt|0
+community|acva:Arab_Empire|0
+community|acva:Arabic_Architecture|0
+community|acva:Arabic_Art|0
+community|acva:Arabic_Astronomy|0
+community|acva:Arabic_Calligraphy|0
+community|acva:Arabic_Ceremony|0
+community|acva:Arabic_Clothing|0
+community|acva:Arabic_Culture|0
+community|acva:Arabic_Food|0
+community|acva:Arabic_Funeral|0
+community|acva:Arabic_Geography|0
+community|acva:Arabic_History|0
+community|acva:Arabic_Language_Origin|0
+community|acva:Arabic_Literature|0
+community|acva:Arabic_Math|0
+community|acva:Arabic_Medicine|0
+community|acva:Arabic_Music|0
+community|acva:Arabic_Ornament|0
+community|acva:Arabic_Philosophy|0
+community|acva:Arabic_Physics_and_Chemistry|0
+community|acva:Arabic_Wedding|0
+community|acva:Bahrain|0
+community|acva:Comoros|0
+community|acva:Egypt_modern|0
+community|acva:InfluenceFromAncientEgypt|0
+community|acva:InfluenceFromByzantium|0
+community|acva:InfluenceFromChina|0
+community|acva:InfluenceFromGreece|0
+community|acva:InfluenceFromIslam|0
+community|acva:InfluenceFromPersia|0
+community|acva:InfluenceFromRome|0
+community|acva:Iraq|0
+community|acva:Islam_Education|0
+community|acva:Islam_branches_and_schools|0
+community|acva:Islamic_law_system|0
+community|acva:Jordan|0
+community|acva:Kuwait|0
+community|acva:Lebanon|0
+community|acva:Libya|0
+community|acva:Mauritania|0
+community|acva:Mesopotamia_civilization|0
+community|acva:Morocco|0
+community|acva:Oman|0
+community|acva:Palestine|0
+community|acva:Qatar|0
+community|acva:Saudi_Arabia|0
+community|acva:Somalia|0
+community|acva:Sudan|0
+community|acva:Syria|0
+community|acva:Tunisia|0
+community|acva:United_Arab_Emirates|0
+community|acva:Yemen|0
+community|acva:communication|0
+community|acva:computer_and_phone|0
+community|acva:daily_life|0
+community|acva:entertainment|0
+community|alghafa:mcq_exams_test_ar|0
+community|alghafa:meta_ar_dialects|0
+community|alghafa:meta_ar_msa|0
+community|alghafa:multiple_choice_facts_truefalse_balanced_task|0
+community|alghafa:multiple_choice_grounded_statement_soqal_task|0
+community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0
+community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0
+community|alghafa:multiple_choice_rating_sentiment_task|0
+community|alghafa:multiple_choice_sentiment_task|0
+community|race_ar|0
+community|piqa_ar|0
+community|arc_easy_ar|0
+community|arc_challenge_okapi_ar|0
+community|openbook_qa_ext_ar|0
+community|boolq_ar|0
+community|copa_ext_ar|0
+community|hellaswag_okapi_ar|0
+community|toxigen_ar|0
+community|sciq_ar|0
diff --git a/examples/tasks/OALL_v2_tasks.txt b/examples/tasks/OALL_v2_tasks.txt
index 26dc78646..890c551c4 100644
--- a/examples/tasks/OALL_v2_tasks.txt
+++ b/examples/tasks/OALL_v2_tasks.txt
@@ -1,117 +1,117 @@
-community|alghafa:meta_ar_dialects|0|0
-community|alghafa:meta_ar_msa|0|0
-community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0
-community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0
-community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0
-community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0
-community|alghafa:multiple_choice_rating_sentiment_task|0|0
-community|alghafa:multiple_choice_sentiment_task|0|0
-community|arabic_exams|0|0
-community|arabic_mmlu:Islamic Studies|0|0
-community|arabic_mmlu:Islamic Studies (Middle School)|0|0
-community|arabic_mmlu:Islamic Studies (Primary School)|0|0
-community|arabic_mmlu:Islamic Studies (High School)|0|0
-community|arabic_mmlu:Driving Test|0|0
-community|arabic_mmlu:Natural Science (Middle School)|0|0
-community|arabic_mmlu:Natural Science (Primary School)|0|0
-community|arabic_mmlu:History (Middle School)|0|0
-community|arabic_mmlu:History (Primary School)|0|0
-community|arabic_mmlu:History (High School)|0|0
-community|arabic_mmlu:General Knowledge|0|0
-community|arabic_mmlu:General Knowledge (Middle School)|0|0
-community|arabic_mmlu:General Knowledge (Primary School)|0|0
-community|arabic_mmlu:Law (Professional)|0|0
-community|arabic_mmlu:Physics (High School)|0|0
-community|arabic_mmlu:Social Science (Middle School)|0|0
-community|arabic_mmlu:Social Science (Primary School)|0|0
-community|arabic_mmlu:Management (University)|0|0
-community|arabic_mmlu:Arabic Language (Middle School)|0|0
-community|arabic_mmlu:Arabic Language (Primary School)|0|0
-community|arabic_mmlu:Arabic Language (High School)|0|0
-community|arabic_mmlu:Political Science (University)|0|0
-community|arabic_mmlu:Philosophy (High School)|0|0
-community|arabic_mmlu:Accounting (University)|0|0
-community|arabic_mmlu:Computer Science (Middle School)|0|0
-community|arabic_mmlu:Computer Science (Primary School)|0|0
-community|arabic_mmlu:Computer Science (High School)|0|0
-community|arabic_mmlu:Computer Science (University)|0|0
-community|arabic_mmlu:Geography (Middle School)|0|0
-community|arabic_mmlu:Geography (Primary School)|0|0
-community|arabic_mmlu:Geography (High School)|0|0
-community|arabic_mmlu:Math (Primary School)|0|0
-community|arabic_mmlu:Biology (High School)|0|0
-community|arabic_mmlu:Economics (Middle School)|0|0
-community|arabic_mmlu:Economics (High School)|0|0
-community|arabic_mmlu:Economics (University)|0|0
-community|arabic_mmlu:Arabic Language (General)|0|0
-community|arabic_mmlu:Arabic Language (Grammar)|0|0
-community|arabic_mmlu:Civics (Middle School)|0|0
-community|arabic_mmlu:Civics (High School)|0|0
-community|madinah_qa:Arabic Language (General)|0|0
-community|madinah_qa:Arabic Language (Grammar)|0|0
-community|aratrust:Trustfulness|0|0 -community|aratrust:MentalHealth|0|0 -community|aratrust:PhysicalHealth|0|0 -community|aratrust:Offensive|0|0 -community|aratrust:Ethics|0|0 -community|aratrust:Privacy|0|0 -community|aratrust:Unfairness|0|0 -community|aratrust:Illegal|0|0 -community|arabic_mmlu_ht:abstract_algebra|0|0 -community|arabic_mmlu_ht:anatomy|0|0 -community|arabic_mmlu_ht:astronomy|0|0 -community|arabic_mmlu_ht:business_ethics|0|0 -community|arabic_mmlu_ht:clinical_knowledge|0|0 -community|arabic_mmlu_ht:college_biology|0|0 -community|arabic_mmlu_ht:college_chemistry|0|0 -community|arabic_mmlu_ht:college_computer_science|0|0 -community|arabic_mmlu_ht:college_mathematics|0|0 -community|arabic_mmlu_ht:college_medicine|0|0 -community|arabic_mmlu_ht:college_physics|0|0 -community|arabic_mmlu_ht:computer_security|0|0 -community|arabic_mmlu_ht:conceptual_physics|0|0 -community|arabic_mmlu_ht:econometrics|0|0 -community|arabic_mmlu_ht:electrical_engineering|0|0 -community|arabic_mmlu_ht:elementary_mathematics|0|0 -community|arabic_mmlu_ht:formal_logic|0|0 -community|arabic_mmlu_ht:global_facts|0|0 -community|arabic_mmlu_ht:high_school_biology|0|0 -community|arabic_mmlu_ht:high_school_chemistry|0|0 -community|arabic_mmlu_ht:high_school_computer_science|0|0 -community|arabic_mmlu_ht:high_school_european_history|0|0 -community|arabic_mmlu_ht:high_school_geography|0|0 -community|arabic_mmlu_ht:high_school_government_and_politics|0|0 -community|arabic_mmlu_ht:high_school_macroeconomics|0|0 -community|arabic_mmlu_ht:high_school_mathematics|0|0 -community|arabic_mmlu_ht:high_school_microeconomics|0|0 -community|arabic_mmlu_ht:high_school_physics|0|0 -community|arabic_mmlu_ht:high_school_psychology|0|0 -community|arabic_mmlu_ht:high_school_statistics|0|0 -community|arabic_mmlu_ht:high_school_us_history|0|0 -community|arabic_mmlu_ht:high_school_world_history|0|0 -community|arabic_mmlu_ht:human_aging|0|0 -community|arabic_mmlu_ht:human_sexuality|0|0 -community|arabic_mmlu_ht:international_law|0|0 -community|arabic_mmlu_ht:jurisprudence|0|0 -community|arabic_mmlu_ht:logical_fallacies|0|0 -community|arabic_mmlu_ht:machine_learning|0|0 -community|arabic_mmlu_ht:management|0|0 -community|arabic_mmlu_ht:marketing|0|0 -community|arabic_mmlu_ht:medical_genetics|0|0 -community|arabic_mmlu_ht:miscellaneous|0|0 -community|arabic_mmlu_ht:moral_disputes|0|0 -community|arabic_mmlu_ht:moral_scenarios|0|0 -community|arabic_mmlu_ht:nutrition|0|0 -community|arabic_mmlu_ht:philosophy|0|0 -community|arabic_mmlu_ht:prehistory|0|0 -community|arabic_mmlu_ht:professional_accounting|0|0 -community|arabic_mmlu_ht:professional_law|0|0 -community|arabic_mmlu_ht:professional_medicine|0|0 -community|arabic_mmlu_ht:professional_psychology|0|0 -community|arabic_mmlu_ht:public_relations|0|0 -community|arabic_mmlu_ht:security_studies|0|0 -community|arabic_mmlu_ht:sociology|0|0 -community|arabic_mmlu_ht:us_foreign_policy|0|0 -community|arabic_mmlu_ht:virology|0|0 -community|arabic_mmlu_ht:world_religions|0|0 -community|alrage_qa|0|0 +community|alghafa:meta_ar_dialects|0 +community|alghafa:meta_ar_msa|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0 +community|alghafa:multiple_choice_rating_sentiment_task|0 +community|alghafa:multiple_choice_sentiment_task|0 +community|arabic_exams|0 
+community|arabic_mmlu:Islamic Studies|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0 +community|arabic_mmlu:Islamic Studies (High School)|0 +community|arabic_mmlu:Driving Test|0 +community|arabic_mmlu:Natural Science (Middle School)|0 +community|arabic_mmlu:Natural Science (Primary School)|0 +community|arabic_mmlu:History (Middle School)|0 +community|arabic_mmlu:History (Primary School)|0 +community|arabic_mmlu:History (High School)|0 +community|arabic_mmlu:General Knowledge|0 +community|arabic_mmlu:General Knowledge (Middle School)|0 +community|arabic_mmlu:General Knowledge (Primary School)|0 +community|arabic_mmlu:Law (Professional)|0 +community|arabic_mmlu:Physics (High School)|0 +community|arabic_mmlu:Social Science (Middle School)|0 +community|arabic_mmlu:Social Science (Primary School)|0 +community|arabic_mmlu:Management (University)|0 +community|arabic_mmlu:Arabic Language (Middle School)|0 +community|arabic_mmlu:Arabic Language (Primary School)|0 +community|arabic_mmlu:Arabic Language (High School)|0 +community|arabic_mmlu:Political Science (University)|0 +community|arabic_mmlu:Philosophy (High School)|0 +community|arabic_mmlu:Accounting (University)|0 +community|arabic_mmlu:Computer Science (Middle School)|0 +community|arabic_mmlu:Computer Science (Primary School)|0 +community|arabic_mmlu:Computer Science (High School)|0 +community|arabic_mmlu:Computer Science (University)|0 +community|arabic_mmlu:Geography (Middle School)|0 +community|arabic_mmlu:Geography (Primary School)|0 +community|arabic_mmlu:Geography (High School)|0 +community|arabic_mmlu:Math (Primary School)|0 +community|arabic_mmlu:Biology (High School)|0 +community|arabic_mmlu:Economics (Middle School)|0 +community|arabic_mmlu:Economics (High School)|0 +community|arabic_mmlu:Economics (University)|0 +community|arabic_mmlu:Arabic Language (General)|0 +community|arabic_mmlu:Arabic Language (Grammar)|0 +community|arabic_mmlu:Civics (Middle School)|0 +community|arabic_mmlu:Civics (High School)|0 +community|madinah_qa:Arabic Language (General)|0 +community|madinah_qa:Arabic Language (Grammar)|0 +community|aratrust:Trustfulness|0 +community|aratrust:MentalHealth|0 +community|aratrust:PhysicalHealth|0 +community|aratrust:Offensive|0 +community|aratrust:Ethics|0 +community|aratrust:Privacy|0 +community|aratrust:Unfairness|0 +community|aratrust:Illegal|0 +community|arabic_mmlu_ht:abstract_algebra|0 +community|arabic_mmlu_ht:anatomy|0 +community|arabic_mmlu_ht:astronomy|0 +community|arabic_mmlu_ht:business_ethics|0 +community|arabic_mmlu_ht:clinical_knowledge|0 +community|arabic_mmlu_ht:college_biology|0 +community|arabic_mmlu_ht:college_chemistry|0 +community|arabic_mmlu_ht:college_computer_science|0 +community|arabic_mmlu_ht:college_mathematics|0 +community|arabic_mmlu_ht:college_medicine|0 +community|arabic_mmlu_ht:college_physics|0 +community|arabic_mmlu_ht:computer_security|0 +community|arabic_mmlu_ht:conceptual_physics|0 +community|arabic_mmlu_ht:econometrics|0 +community|arabic_mmlu_ht:electrical_engineering|0 +community|arabic_mmlu_ht:elementary_mathematics|0 +community|arabic_mmlu_ht:formal_logic|0 +community|arabic_mmlu_ht:global_facts|0 +community|arabic_mmlu_ht:high_school_biology|0 +community|arabic_mmlu_ht:high_school_chemistry|0 +community|arabic_mmlu_ht:high_school_computer_science|0 +community|arabic_mmlu_ht:high_school_european_history|0 +community|arabic_mmlu_ht:high_school_geography|0 
+community|arabic_mmlu_ht:high_school_government_and_politics|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0 +community|arabic_mmlu_ht:high_school_mathematics|0 +community|arabic_mmlu_ht:high_school_microeconomics|0 +community|arabic_mmlu_ht:high_school_physics|0 +community|arabic_mmlu_ht:high_school_psychology|0 +community|arabic_mmlu_ht:high_school_statistics|0 +community|arabic_mmlu_ht:high_school_us_history|0 +community|arabic_mmlu_ht:high_school_world_history|0 +community|arabic_mmlu_ht:human_aging|0 +community|arabic_mmlu_ht:human_sexuality|0 +community|arabic_mmlu_ht:international_law|0 +community|arabic_mmlu_ht:jurisprudence|0 +community|arabic_mmlu_ht:logical_fallacies|0 +community|arabic_mmlu_ht:machine_learning|0 +community|arabic_mmlu_ht:management|0 +community|arabic_mmlu_ht:marketing|0 +community|arabic_mmlu_ht:medical_genetics|0 +community|arabic_mmlu_ht:miscellaneous|0 +community|arabic_mmlu_ht:moral_disputes|0 +community|arabic_mmlu_ht:moral_scenarios|0 +community|arabic_mmlu_ht:nutrition|0 +community|arabic_mmlu_ht:philosophy|0 +community|arabic_mmlu_ht:prehistory|0 +community|arabic_mmlu_ht:professional_accounting|0 +community|arabic_mmlu_ht:professional_law|0 +community|arabic_mmlu_ht:professional_medicine|0 +community|arabic_mmlu_ht:professional_psychology|0 +community|arabic_mmlu_ht:public_relations|0 +community|arabic_mmlu_ht:security_studies|0 +community|arabic_mmlu_ht:sociology|0 +community|arabic_mmlu_ht:us_foreign_policy|0 +community|arabic_mmlu_ht:virology|0 +community|arabic_mmlu_ht:world_religions|0 +community|alrage_qa|0 diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index 8593fa2f8..f88738993 100644 --- a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -1,244 +1,244 @@ -lighteval|xstory_cloze:ar|0|0 -community|arabic_exams|0|0 -community|arabic_mmlu_mt:abstract_algebra|0|0 -community|arabic_mmlu_mt:anatomy|0|0 -community|arabic_mmlu_mt:astronomy|0|0 -community|arabic_mmlu_mt:business_ethics|0|0 -community|arabic_mmlu_mt:clinical_knowledge|0|0 -community|arabic_mmlu_mt:college_biology|0|0 -community|arabic_mmlu_mt:college_chemistry|0|0 -community|arabic_mmlu_mt:college_computer_science|0|0 -community|arabic_mmlu_mt:college_mathematics|0|0 -community|arabic_mmlu_mt:college_medicine|0|0 -community|arabic_mmlu_mt:college_physics|0|0 -community|arabic_mmlu_mt:computer_security|0|0 -community|arabic_mmlu_mt:conceptual_physics|0|0 -community|arabic_mmlu_mt:econometrics|0|0 -community|arabic_mmlu_mt:electrical_engineering|0|0 -community|arabic_mmlu_mt:elementary_mathematics|0|0 -community|arabic_mmlu_mt:formal_logic|0|0 -community|arabic_mmlu_mt:global_facts|0|0 -community|arabic_mmlu_mt:high_school_biology|0|0 -community|arabic_mmlu_mt:high_school_chemistry|0|0 -community|arabic_mmlu_mt:high_school_computer_science|0|0 -community|arabic_mmlu_mt:high_school_european_history|0|0 -community|arabic_mmlu_mt:high_school_geography|0|0 -community|arabic_mmlu_mt:high_school_government_and_politics|0|0 -community|arabic_mmlu_mt:high_school_macroeconomics|0|0 -community|arabic_mmlu_mt:high_school_mathematics|0|0 -community|arabic_mmlu_mt:high_school_microeconomics|0|0 -community|arabic_mmlu_mt:high_school_physics|0|0 -community|arabic_mmlu_mt:high_school_psychology|0|0 -community|arabic_mmlu_mt:high_school_statistics|0|0 -community|arabic_mmlu_mt:high_school_us_history|0|0 -community|arabic_mmlu_mt:high_school_world_history|0|0 -community|arabic_mmlu_mt:human_aging|0|0 
-community|arabic_mmlu_mt:human_sexuality|0|0 -community|arabic_mmlu_mt:international_law|0|0 -community|arabic_mmlu_mt:jurisprudence|0|0 -community|arabic_mmlu_mt:logical_fallacies|0|0 -community|arabic_mmlu_mt:machine_learning|0|0 -community|arabic_mmlu_mt:management|0|0 -community|arabic_mmlu_mt:marketing|0|0 -community|arabic_mmlu_mt:medical_genetics|0|0 -community|arabic_mmlu_mt:miscellaneous|0|0 -community|arabic_mmlu_mt:moral_disputes|0|0 -community|arabic_mmlu_mt:moral_scenarios|0|0 -community|arabic_mmlu_mt:nutrition|0|0 -community|arabic_mmlu_mt:philosophy|0|0 -community|arabic_mmlu_mt:prehistory|0|0 -community|arabic_mmlu_mt:professional_accounting|0|0 -community|arabic_mmlu_mt:professional_law|0|0 -community|arabic_mmlu_mt:professional_medicine|0|0 -community|arabic_mmlu_mt:professional_psychology|0|0 -community|arabic_mmlu_mt:public_relations|0|0 -community|arabic_mmlu_mt:security_studies|0|0 -community|arabic_mmlu_mt:sociology|0|0 -community|arabic_mmlu_mt:us_foreign_policy|0|0 -community|arabic_mmlu_mt:virology|0|0 -community|arabic_mmlu_mt:world_religions|0|0 -community|acva:Algeria|0|0 -community|acva:Ancient_Egypt|0|0 -community|acva:Arab_Empire|0|0 -community|acva:Arabic_Architecture|0|0 -community|acva:Arabic_Art|0|0 -community|acva:Arabic_Astronomy|0|0 -community|acva:Arabic_Calligraphy|0|0 -community|acva:Arabic_Ceremony|0|0 -community|acva:Arabic_Clothing|0|0 -community|acva:Arabic_Culture|0|0 -community|acva:Arabic_Food|0|0 -community|acva:Arabic_Funeral|0|0 -community|acva:Arabic_Geography|0|0 -community|acva:Arabic_History|0|0 -community|acva:Arabic_Language_Origin|0|0 -community|acva:Arabic_Literature|0|0 -community|acva:Arabic_Math|0|0 -community|acva:Arabic_Medicine|0|0 -community|acva:Arabic_Music|0|0 -community|acva:Arabic_Ornament|0|0 -community|acva:Arabic_Philosophy|0|0 -community|acva:Arabic_Physics_and_Chemistry|0|0 -community|acva:Arabic_Wedding|0|0 -community|acva:Bahrain|0|0 -community|acva:Comoros|0|0 -community|acva:Egypt_modern|0|0 -community|acva:InfluenceFromAncientEgypt|0|0 -community|acva:InfluenceFromByzantium|0|0 -community|acva:InfluenceFromChina|0|0 -community|acva:InfluenceFromGreece|0|0 -community|acva:InfluenceFromIslam|0|0 -community|acva:InfluenceFromPersia|0|0 -community|acva:InfluenceFromRome|0|0 -community|acva:Iraq|0|0 -community|acva:Islam_Education|0|0 -community|acva:Islam_branches_and_schools|0|0 -community|acva:Islamic_law_system|0|0 -community|acva:Jordan|0|0 -community|acva:Kuwait|0|0 -community|acva:Lebanon|0|0 -community|acva:Libya|0|0 -community|acva:Mauritania|0|0 -community|acva:Mesopotamia_civilization|0|0 -community|acva:Morocco|0|0 -community|acva:Oman|0|0 -community|acva:Palestine|0|0 -community|acva:Qatar|0|0 -community|acva:Saudi_Arabia|0|0 -community|acva:Somalia|0|0 -community|acva:Sudan|0|0 -community|acva:Syria|0|0 -community|acva:Tunisia|0|0 -community|acva:United_Arab_Emirates|0|0 -community|acva:Yemen|0|0 -community|acva:communication|0|0 -community|acva:computer_and_phone|0|0 -community|acva:daily_life|0|0 -community|acva:entertainment|0|0 -community|alghafa:mcq_exams_test_ar|0|0 -community|alghafa:meta_ar_dialects|0|0 -community|alghafa:meta_ar_msa|0|0 -community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 -community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 -community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 -community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 -community|alghafa:multiple_choice_rating_sentiment_task|0|0 
-community|alghafa:multiple_choice_sentiment_task|0|0 -community|race_ar|0|0 -community|piqa_ar|0|0 -community|arc_easy_ar|0|0 -community|arc_challenge_okapi_ar|0|0 -community|mmlu_okapi_ar|0|0 -community|openbook_qa_ext_ar|0|0 -community|boolq_ar|0|0 -community|copa_ext_ar|0|0 -community|hellaswag_okapi_ar|0|0 -community|toxigen_ar|0|0 -community|sciq_ar|0|0 -community|arabic_mmlu_ht:abstract_algebra|0|0 -community|arabic_mmlu_ht:anatomy|0|0 -community|arabic_mmlu_ht:astronomy|0|0 -community|arabic_mmlu_ht:business_ethics|0|0 -community|arabic_mmlu_ht:clinical_knowledge|0|0 -community|arabic_mmlu_ht:college_biology|0|0 -community|arabic_mmlu_ht:college_chemistry|0|0 -community|arabic_mmlu_ht:college_computer_science|0|0 -community|arabic_mmlu_ht:college_mathematics|0|0 -community|arabic_mmlu_ht:college_medicine|0|0 -community|arabic_mmlu_ht:college_physics|0|0 -community|arabic_mmlu_ht:computer_security|0|0 -community|arabic_mmlu_ht:conceptual_physics|0|0 -community|arabic_mmlu_ht:econometrics|0|0 -community|arabic_mmlu_ht:electrical_engineering|0|0 -community|arabic_mmlu_ht:elementary_mathematics|0|0 -community|arabic_mmlu_ht:formal_logic|0|0 -community|arabic_mmlu_ht:global_facts|0|0 -community|arabic_mmlu_ht:high_school_biology|0|0 -community|arabic_mmlu_ht:high_school_chemistry|0|0 -community|arabic_mmlu_ht:high_school_computer_science|0|0 -community|arabic_mmlu_ht:high_school_european_history|0|0 -community|arabic_mmlu_ht:high_school_geography|0|0 -community|arabic_mmlu_ht:high_school_government_and_politics|0|0 -community|arabic_mmlu_ht:high_school_macroeconomics|0|0 -community|arabic_mmlu_ht:high_school_mathematics|0|0 -community|arabic_mmlu_ht:high_school_microeconomics|0|0 -community|arabic_mmlu_ht:high_school_physics|0|0 -community|arabic_mmlu_ht:high_school_psychology|0|0 -community|arabic_mmlu_ht:high_school_statistics|0|0 -community|arabic_mmlu_ht:high_school_us_history|0|0 -community|arabic_mmlu_ht:high_school_world_history|0|0 -community|arabic_mmlu_ht:human_aging|0|0 -community|arabic_mmlu_ht:human_sexuality|0|0 -community|arabic_mmlu_ht:international_law|0|0 -community|arabic_mmlu_ht:jurisprudence|0|0 -community|arabic_mmlu_ht:logical_fallacies|0|0 -community|arabic_mmlu_ht:machine_learning|0|0 -community|arabic_mmlu_ht:management|0|0 -community|arabic_mmlu_ht:marketing|0|0 -community|arabic_mmlu_ht:medical_genetics|0|0 -community|arabic_mmlu_ht:miscellaneous|0|0 -community|arabic_mmlu_ht:moral_disputes|0|0 -community|arabic_mmlu_ht:moral_scenarios|0|0 -community|arabic_mmlu_ht:nutrition|0|0 -community|arabic_mmlu_ht:philosophy|0|0 -community|arabic_mmlu_ht:prehistory|0|0 -community|arabic_mmlu_ht:professional_accounting|0|0 -community|arabic_mmlu_ht:professional_law|0|0 -community|arabic_mmlu_ht:professional_medicine|0|0 -community|arabic_mmlu_ht:professional_psychology|0|0 -community|arabic_mmlu_ht:public_relations|0|0 -community|arabic_mmlu_ht:security_studies|0|0 -community|arabic_mmlu_ht:sociology|0|0 -community|arabic_mmlu_ht:us_foreign_policy|0|0 -community|arabic_mmlu_ht:virology|0|0 -community|arabic_mmlu_ht:world_religions|0|0 -community|arabic_mmlu:Islamic Studies|0|0 -community|arabic_mmlu:Islamic Studies (Middle School)|0|0 -community|arabic_mmlu:Islamic Studies (Primary School)|0|0 -community|arabic_mmlu:Islamic Studies (High School)|0|0 -community|arabic_mmlu:Driving Test|0|0 -community|arabic_mmlu:Natural Science (Middle School)|0|0 -community|arabic_mmlu:Natural Science (Primary School)|0|0 -community|arabic_mmlu:History (Middle School)|0|0 
-community|arabic_mmlu:History (Primary School)|0|0 -community|arabic_mmlu:History (High School)|0|0 -community|arabic_mmlu:General Knowledge|0|0 -community|arabic_mmlu:General Knowledge (Middle School)|0|0 -community|arabic_mmlu:General Knowledge (Primary School)|0|0 -community|arabic_mmlu:Law (Professional)|0|0 -community|arabic_mmlu:Physics (High School)|0|0 -community|arabic_mmlu:Social Science (Middle School)|0|0 -community|arabic_mmlu:Social Science (Primary School)|0|0 -community|arabic_mmlu:Management (University)|0|0 -community|arabic_mmlu:Arabic Language (Middle School)|0|0 -community|arabic_mmlu:Arabic Language (Primary School)|0|0 -community|arabic_mmlu:Arabic Language (High School)|0|0 -community|arabic_mmlu:Political Science (University)|0|0 -community|arabic_mmlu:Philosophy (High School)|0|0 -community|arabic_mmlu:Accounting (University)|0|0 -community|arabic_mmlu:Computer Science (Middle School)|0|0 -community|arabic_mmlu:Computer Science (Primary School)|0|0 -community|arabic_mmlu:Computer Science (High School)|0|0 -community|arabic_mmlu:Computer Science (University)|0|0 -community|arabic_mmlu:Geography (Middle School)|0|0 -community|arabic_mmlu:Geography (Primary School)|0|0 -community|arabic_mmlu:Geography (High School)|0|0 -community|arabic_mmlu:Math (Primary School)|0|0 -community|arabic_mmlu:Biology (High School)|0|0 -community|arabic_mmlu:Economics (Middle School)|0|0 -community|arabic_mmlu:Economics (High School)|0|0 -community|arabic_mmlu:Economics (University)|0|0 -community|arabic_mmlu:Arabic Language (General)|0|0 -community|arabic_mmlu:Arabic Language (Grammar)|0|0 -community|arabic_mmlu:Civics (Middle School)|0|0 -community|arabic_mmlu:Civics (High School)|0|0 -community|madinah_qa:Arabic Language (General)|0|0 -community|madinah_qa:Arabic Language (Grammar)|0|0 -community|aratrust:Trustfulness|0|0 -community|aratrust:MentalHealth|0|0 -community|aratrust:PhysicalHealth|0|0 -community|aratrust:Offensive|0|0 -community|aratrust:Ethics|0|0 -community|aratrust:Privacy|0|0 -community|aratrust:Unfairness|0|0 -community|aratrust:Illegal|0|0 +lighteval|xstory_cloze:ar|0 +community|arabic_exams|0 +community|arabic_mmlu_mt:abstract_algebra|0 +community|arabic_mmlu_mt:anatomy|0 +community|arabic_mmlu_mt:astronomy|0 +community|arabic_mmlu_mt:business_ethics|0 +community|arabic_mmlu_mt:clinical_knowledge|0 +community|arabic_mmlu_mt:college_biology|0 +community|arabic_mmlu_mt:college_chemistry|0 +community|arabic_mmlu_mt:college_computer_science|0 +community|arabic_mmlu_mt:college_mathematics|0 +community|arabic_mmlu_mt:college_medicine|0 +community|arabic_mmlu_mt:college_physics|0 +community|arabic_mmlu_mt:computer_security|0 +community|arabic_mmlu_mt:conceptual_physics|0 +community|arabic_mmlu_mt:econometrics|0 +community|arabic_mmlu_mt:electrical_engineering|0 +community|arabic_mmlu_mt:elementary_mathematics|0 +community|arabic_mmlu_mt:formal_logic|0 +community|arabic_mmlu_mt:global_facts|0 +community|arabic_mmlu_mt:high_school_biology|0 +community|arabic_mmlu_mt:high_school_chemistry|0 +community|arabic_mmlu_mt:high_school_computer_science|0 +community|arabic_mmlu_mt:high_school_european_history|0 +community|arabic_mmlu_mt:high_school_geography|0 +community|arabic_mmlu_mt:high_school_government_and_politics|0 +community|arabic_mmlu_mt:high_school_macroeconomics|0 +community|arabic_mmlu_mt:high_school_mathematics|0 +community|arabic_mmlu_mt:high_school_microeconomics|0 +community|arabic_mmlu_mt:high_school_physics|0 +community|arabic_mmlu_mt:high_school_psychology|0 
+community|arabic_mmlu_mt:high_school_statistics|0 +community|arabic_mmlu_mt:high_school_us_history|0 +community|arabic_mmlu_mt:high_school_world_history|0 +community|arabic_mmlu_mt:human_aging|0 +community|arabic_mmlu_mt:human_sexuality|0 +community|arabic_mmlu_mt:international_law|0 +community|arabic_mmlu_mt:jurisprudence|0 +community|arabic_mmlu_mt:logical_fallacies|0 +community|arabic_mmlu_mt:machine_learning|0 +community|arabic_mmlu_mt:management|0 +community|arabic_mmlu_mt:marketing|0 +community|arabic_mmlu_mt:medical_genetics|0 +community|arabic_mmlu_mt:miscellaneous|0 +community|arabic_mmlu_mt:moral_disputes|0 +community|arabic_mmlu_mt:moral_scenarios|0 +community|arabic_mmlu_mt:nutrition|0 +community|arabic_mmlu_mt:philosophy|0 +community|arabic_mmlu_mt:prehistory|0 +community|arabic_mmlu_mt:professional_accounting|0 +community|arabic_mmlu_mt:professional_law|0 +community|arabic_mmlu_mt:professional_medicine|0 +community|arabic_mmlu_mt:professional_psychology|0 +community|arabic_mmlu_mt:public_relations|0 +community|arabic_mmlu_mt:security_studies|0 +community|arabic_mmlu_mt:sociology|0 +community|arabic_mmlu_mt:us_foreign_policy|0 +community|arabic_mmlu_mt:virology|0 +community|arabic_mmlu_mt:world_religions|0 +community|acva:Algeria|0 +community|acva:Ancient_Egypt|0 +community|acva:Arab_Empire|0 +community|acva:Arabic_Architecture|0 +community|acva:Arabic_Art|0 +community|acva:Arabic_Astronomy|0 +community|acva:Arabic_Calligraphy|0 +community|acva:Arabic_Ceremony|0 +community|acva:Arabic_Clothing|0 +community|acva:Arabic_Culture|0 +community|acva:Arabic_Food|0 +community|acva:Arabic_Funeral|0 +community|acva:Arabic_Geography|0 +community|acva:Arabic_History|0 +community|acva:Arabic_Language_Origin|0 +community|acva:Arabic_Literature|0 +community|acva:Arabic_Math|0 +community|acva:Arabic_Medicine|0 +community|acva:Arabic_Music|0 +community|acva:Arabic_Ornament|0 +community|acva:Arabic_Philosophy|0 +community|acva:Arabic_Physics_and_Chemistry|0 +community|acva:Arabic_Wedding|0 +community|acva:Bahrain|0 +community|acva:Comoros|0 +community|acva:Egypt_modern|0 +community|acva:InfluenceFromAncientEgypt|0 +community|acva:InfluenceFromByzantium|0 +community|acva:InfluenceFromChina|0 +community|acva:InfluenceFromGreece|0 +community|acva:InfluenceFromIslam|0 +community|acva:InfluenceFromPersia|0 +community|acva:InfluenceFromRome|0 +community|acva:Iraq|0 +community|acva:Islam_Education|0 +community|acva:Islam_branches_and_schools|0 +community|acva:Islamic_law_system|0 +community|acva:Jordan|0 +community|acva:Kuwait|0 +community|acva:Lebanon|0 +community|acva:Libya|0 +community|acva:Mauritania|0 +community|acva:Mesopotamia_civilization|0 +community|acva:Morocco|0 +community|acva:Oman|0 +community|acva:Palestine|0 +community|acva:Qatar|0 +community|acva:Saudi_Arabia|0 +community|acva:Somalia|0 +community|acva:Sudan|0 +community|acva:Syria|0 +community|acva:Tunisia|0 +community|acva:United_Arab_Emirates|0 +community|acva:Yemen|0 +community|acva:communication|0 +community|acva:computer_and_phone|0 +community|acva:daily_life|0 +community|acva:entertainment|0 +community|alghafa:mcq_exams_test_ar|0 +community|alghafa:meta_ar_dialects|0 +community|alghafa:meta_ar_msa|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0 
+community|alghafa:multiple_choice_rating_sentiment_task|0 +community|alghafa:multiple_choice_sentiment_task|0 +community|race_ar|0 +community|piqa_ar|0 +community|arc_easy_ar|0 +community|arc_challenge_okapi_ar|0 +community|mmlu_okapi_ar|0 +community|openbook_qa_ext_ar|0 +community|boolq_ar|0 +community|copa_ext_ar|0 +community|hellaswag_okapi_ar|0 +community|toxigen_ar|0 +community|sciq_ar|0 +community|arabic_mmlu_ht:abstract_algebra|0 +community|arabic_mmlu_ht:anatomy|0 +community|arabic_mmlu_ht:astronomy|0 +community|arabic_mmlu_ht:business_ethics|0 +community|arabic_mmlu_ht:clinical_knowledge|0 +community|arabic_mmlu_ht:college_biology|0 +community|arabic_mmlu_ht:college_chemistry|0 +community|arabic_mmlu_ht:college_computer_science|0 +community|arabic_mmlu_ht:college_mathematics|0 +community|arabic_mmlu_ht:college_medicine|0 +community|arabic_mmlu_ht:college_physics|0 +community|arabic_mmlu_ht:computer_security|0 +community|arabic_mmlu_ht:conceptual_physics|0 +community|arabic_mmlu_ht:econometrics|0 +community|arabic_mmlu_ht:electrical_engineering|0 +community|arabic_mmlu_ht:elementary_mathematics|0 +community|arabic_mmlu_ht:formal_logic|0 +community|arabic_mmlu_ht:global_facts|0 +community|arabic_mmlu_ht:high_school_biology|0 +community|arabic_mmlu_ht:high_school_chemistry|0 +community|arabic_mmlu_ht:high_school_computer_science|0 +community|arabic_mmlu_ht:high_school_european_history|0 +community|arabic_mmlu_ht:high_school_geography|0 +community|arabic_mmlu_ht:high_school_government_and_politics|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0 +community|arabic_mmlu_ht:high_school_mathematics|0 +community|arabic_mmlu_ht:high_school_microeconomics|0 +community|arabic_mmlu_ht:high_school_physics|0 +community|arabic_mmlu_ht:high_school_psychology|0 +community|arabic_mmlu_ht:high_school_statistics|0 +community|arabic_mmlu_ht:high_school_us_history|0 +community|arabic_mmlu_ht:high_school_world_history|0 +community|arabic_mmlu_ht:human_aging|0 +community|arabic_mmlu_ht:human_sexuality|0 +community|arabic_mmlu_ht:international_law|0 +community|arabic_mmlu_ht:jurisprudence|0 +community|arabic_mmlu_ht:logical_fallacies|0 +community|arabic_mmlu_ht:machine_learning|0 +community|arabic_mmlu_ht:management|0 +community|arabic_mmlu_ht:marketing|0 +community|arabic_mmlu_ht:medical_genetics|0 +community|arabic_mmlu_ht:miscellaneous|0 +community|arabic_mmlu_ht:moral_disputes|0 +community|arabic_mmlu_ht:moral_scenarios|0 +community|arabic_mmlu_ht:nutrition|0 +community|arabic_mmlu_ht:philosophy|0 +community|arabic_mmlu_ht:prehistory|0 +community|arabic_mmlu_ht:professional_accounting|0 +community|arabic_mmlu_ht:professional_law|0 +community|arabic_mmlu_ht:professional_medicine|0 +community|arabic_mmlu_ht:professional_psychology|0 +community|arabic_mmlu_ht:public_relations|0 +community|arabic_mmlu_ht:security_studies|0 +community|arabic_mmlu_ht:sociology|0 +community|arabic_mmlu_ht:us_foreign_policy|0 +community|arabic_mmlu_ht:virology|0 +community|arabic_mmlu_ht:world_religions|0 +community|arabic_mmlu:Islamic Studies|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0 +community|arabic_mmlu:Islamic Studies (High School)|0 +community|arabic_mmlu:Driving Test|0 +community|arabic_mmlu:Natural Science (Middle School)|0 +community|arabic_mmlu:Natural Science (Primary School)|0 +community|arabic_mmlu:History (Middle School)|0 +community|arabic_mmlu:History (Primary School)|0 +community|arabic_mmlu:History (High School)|0 
+community|arabic_mmlu:General Knowledge|0 +community|arabic_mmlu:General Knowledge (Middle School)|0 +community|arabic_mmlu:General Knowledge (Primary School)|0 +community|arabic_mmlu:Law (Professional)|0 +community|arabic_mmlu:Physics (High School)|0 +community|arabic_mmlu:Social Science (Middle School)|0 +community|arabic_mmlu:Social Science (Primary School)|0 +community|arabic_mmlu:Management (University)|0 +community|arabic_mmlu:Arabic Language (Middle School)|0 +community|arabic_mmlu:Arabic Language (Primary School)|0 +community|arabic_mmlu:Arabic Language (High School)|0 +community|arabic_mmlu:Political Science (University)|0 +community|arabic_mmlu:Philosophy (High School)|0 +community|arabic_mmlu:Accounting (University)|0 +community|arabic_mmlu:Computer Science (Middle School)|0 +community|arabic_mmlu:Computer Science (Primary School)|0 +community|arabic_mmlu:Computer Science (High School)|0 +community|arabic_mmlu:Computer Science (University)|0 +community|arabic_mmlu:Geography (Middle School)|0 +community|arabic_mmlu:Geography (Primary School)|0 +community|arabic_mmlu:Geography (High School)|0 +community|arabic_mmlu:Math (Primary School)|0 +community|arabic_mmlu:Biology (High School)|0 +community|arabic_mmlu:Economics (Middle School)|0 +community|arabic_mmlu:Economics (High School)|0 +community|arabic_mmlu:Economics (University)|0 +community|arabic_mmlu:Arabic Language (General)|0 +community|arabic_mmlu:Arabic Language (Grammar)|0 +community|arabic_mmlu:Civics (Middle School)|0 +community|arabic_mmlu:Civics (High School)|0 +community|madinah_qa:Arabic Language (General)|0 +community|madinah_qa:Arabic Language (Grammar)|0 +community|aratrust:Trustfulness|0 +community|aratrust:MentalHealth|0 +community|aratrust:PhysicalHealth|0 +community|aratrust:Offensive|0 +community|aratrust:Ethics|0 +community|aratrust:Privacy|0 +community|aratrust:Unfairness|0 +community|aratrust:Illegal|0 diff --git a/examples/tasks/all_filipino_tasks.txt b/examples/tasks/all_filipino_tasks.txt index 19f33917d..72bb13567 100644 --- a/examples/tasks/all_filipino_tasks.txt +++ b/examples/tasks/all_filipino_tasks.txt @@ -1,23 +1,23 @@ -community|readability_ceb_mcf|0|0 -community|kalahi_tgl_mcf|0|0 -community|kalahi_tgl_hybrid|0|0 -community|cebuaner_ceb_mcf|0|0 -community|universalner_tgl_mcf|0|0 -community|universalner_ceb_mcf|0|0 -community|tlunifiedner_tgl_mcf|0|0 -community|stingraybench_correctness_tgl_mcf|0|0 -community|stingraybench_semantic_appropriateness_tgl_mcf|0|0 -community|tatoeba_ceb|0|0 -community|tatoeba_tgl|0|0 -community|ntrex128_fil|0|0 -community|tico19_tgl|0|0 -community|dengue_filipino_fil|0|0 -community|include_tgl_mcf|0|0 -community|newsphnli_fil_mcf|0|0 -community|belebele_ceb_mcf|0|0 -community|belebele_fil_mcf|0|0 -community|sib200_ceb_mcf|0|0 -community|sib200_tgl_mcf|0|0 -community|firecs_fil_mcf|0|0 -community|global_mmlu_all_tgl_mcf|0|0 -community|balita_tgl_mcf|0|0 +community|readability_ceb_mcf|0 +community|kalahi_tgl_mcf|0 +community|kalahi_tgl_hybrid|0 +community|cebuaner_ceb_mcf|0 +community|universalner_tgl_mcf|0 +community|universalner_ceb_mcf|0 +community|tlunifiedner_tgl_mcf|0 +community|stingraybench_correctness_tgl_mcf|0 +community|stingraybench_semantic_appropriateness_tgl_mcf|0 +community|tatoeba_ceb|0 +community|tatoeba_tgl|0 +community|ntrex128_fil|0 +community|tico19_tgl|0 +community|dengue_filipino_fil|0 +community|include_tgl_mcf|0 +community|newsphnli_fil_mcf|0 +community|belebele_ceb_mcf|0 +community|belebele_fil_mcf|0 +community|sib200_ceb_mcf|0 
+community|sib200_tgl_mcf|0 +community|firecs_fil_mcf|0 +community|global_mmlu_all_tgl_mcf|0 +community|balita_tgl_mcf|0 diff --git a/examples/tasks/all_german_rag_evals.txt b/examples/tasks/all_german_rag_evals.txt index 29fec52bd..96ca24102 100644 --- a/examples/tasks/all_german_rag_evals.txt +++ b/examples/tasks/all_german_rag_evals.txt @@ -1,4 +1,4 @@ -community|german_rag_eval:choose_question_by_context|0|0 -community|german_rag_eval:choose_context_by_question|0|0 -community|german_rag_eval:question_answer_match|0|0 -community|german_rag_eval:context_question_match|0|0 +community|german_rag_eval:choose_question_by_context|0 +community|german_rag_eval:choose_context_by_question|0 +community|german_rag_eval:question_answer_match|0 +community|german_rag_eval:context_question_match|0 diff --git a/examples/tasks/all_tasks.txt b/examples/tasks/all_tasks.txt index 894ffc6f9..56a026126 100644 --- a/examples/tasks/all_tasks.txt +++ b/examples/tasks/all_tasks.txt @@ -1,1146 +1,1146 @@ -bigbench|abstract_narrative_understanding|0|0 -bigbench|anachronisms|0|0 -bigbench|analogical_similarity|0|0 -bigbench|analytic_entailment|0|0 -bigbench|arithmetic_bb|0|0 -bigbench|ascii_word_recognition|0|0 -bigbench|authorship_verification|0|0 -bigbench|auto_categorization|0|0 -bigbench|auto_debugging|0|0 -bigbench|bbq_lite_json|0|0 -bigbench|bridging_anaphora_resolution_barqa|0|0 -bigbench|causal_judgment|0|0 -bigbench|cause_and_effect|0|0 -bigbench|checkmate_in_one|0|0 -bigbench|chess_state_tracking|0|0 -bigbench|chinese_remainder_theorem|0|0 -bigbench|cifar10_classification|0|0 -bigbench|code_line_description|0|0 -bigbench|codenames|0|0 -bigbench|color|0|0 -bigbench|common_morpheme|0|0 -bigbench|conceptual_combinations|0|0 -bigbench|conlang_translation|0|0 -bigbench|contextual_parametric_knowledge_conflicts|0|0 -bigbench|crash_blossom|0|0 -bigbench|crass_ai|0|0 -bigbench|cryobiology_spanish|0|0 -bigbench|cryptonite|0|0 -bigbench|cs_algorithms|0|0 -bigbench|dark_humor_detection|0|0 -bigbench|date_understanding|0|0 -bigbench|disambiguation_qa|0|0 -bigbench|discourse_marker_prediction|0|0 -bigbench|disfl_qa|0|0 -bigbench|dyck_languages|0|0 -bigbench|elementary_math_qa|0|0 -bigbench|emoji_movie|0|0 -bigbench|emojis_emotion_prediction|0|0 -bigbench|empirical_judgments|0|0 -bigbench|english_proverbs|0|0 -bigbench|english_russian_proverbs|0|0 -bigbench|entailed_polarity_hindi|0|0 -bigbench|entailed_polarity|0|0 -bigbench|epistemic_reasoning|0|0 -bigbench|evaluating_information_essentiality|0|0 -bigbench|fact_checker|0|0 -bigbench|fantasy_reasoning|0|0 -bigbench|few_shot_nlg|0|0 -bigbench|figure_of_speech_detection|0|0 -bigbench|formal_fallacies_syllogisms_negation|0|0 -bigbench|gem|0|0 -bigbench|gender_inclusive_sentences_german|0|0 -bigbench|general_knowledge|0|0 -bigbench|geometric_shapes|0|0 -bigbench|goal_step_wikihow|0|0 -bigbench|gre_reading_comprehension|0|0 -bigbench|hhh_alignment|0|0 -bigbench|hindi_question_answering|0|0 -bigbench|hindu_knowledge|0|0 -bigbench|hinglish_toxicity|0|0 -bigbench|human_organs_senses|0|0 -bigbench|hyperbaton|0|0 -bigbench|identify_math_theorems|0|0 -bigbench|identify_odd_metaphor|0|0 -bigbench|implicatures|0|0 -bigbench|implicit_relations|0|0 -bigbench|intent_recognition|0|0 -bigbench|international_phonetic_alphabet_nli|0|0 -bigbench|international_phonetic_alphabet_transliterate|0|0 -bigbench|intersect_geometry|0|0 -bigbench|irony_identification|0|0 -bigbench|kanji_ascii|0|0 -bigbench|kannada|0|0 -bigbench|key_value_maps|0|0 -bigbench|known_unknowns|0|0 
-bigbench|language_games|0|0 -bigbench|language_identification|0|0 -bigbench|linguistic_mappings|0|0 -bigbench|linguistics_puzzles|0|0 -bigbench|logic_grid_puzzle|0|0 -bigbench|logical_args|0|0 -bigbench|logical_deduction|0|0 -bigbench|logical_fallacy_detection|0|0 -bigbench|logical_sequence|0|0 -bigbench|mathematical_induction|0|0 -bigbench|matrixshapes|0|0 -bigbench|metaphor_boolean|0|0 -bigbench|metaphor_understanding|0|0 -bigbench|minute_mysteries_qa|0|0 -bigbench|misconceptions_russian|0|0 -bigbench|misconceptions|0|0 -bigbench|mnist_ascii|0|0 -bigbench|modified_arithmetic|0|0 -bigbench|moral_permissibility|0|0 -bigbench|movie_dialog_same_or_different|0|0 -bigbench|movie_recommendation|0|0 -bigbench|mult_data_wrangling|0|0 -bigbench|multiemo|0|0 -bigbench|natural_instructions|0|0 -bigbench|navigate|0|0 -bigbench|nonsense_words_grammar|0|0 -bigbench|novel_concepts|0|0 -bigbench|object_counting|0|0 -bigbench|odd_one_out|0|0 -bigbench|operators|0|0 -bigbench|paragraph_segmentation|0|0 -bigbench|parsinlu_qa|0|0 -bigbench|parsinlu_reading_comprehension|0|0 -bigbench|penguins_in_a_table|0|0 -bigbench|periodic_elements|0|0 -bigbench|persian_idioms|0|0 -bigbench|phrase_relatedness|0|0 -bigbench|physical_intuition|0|0 -bigbench|physics_questions|0|0 -bigbench|physics|0|0 -bigbench|play_dialog_same_or_different|0|0 -bigbench|polish_sequence_labeling|0|0 -bigbench|presuppositions_as_nli|0|0 -bigbench|qa_wikidata|0|0 -bigbench|question_selection|0|0 -bigbench|real_or_fake_text|0|0 -bigbench|reasoning_about_colored_objects|0|0 -bigbench|repeat_copy_logic|0|0 -bigbench|rephrase|0|0 -bigbench|rhyming|0|0 -bigbench|riddle_sense|0|0 -bigbench|ruin_names|0|0 -bigbench|salient_translation_error_detection|0|0 -bigbench|scientific_press_release|0|0 -bigbench|semantic_parsing_in_context_sparc|0|0 -bigbench|semantic_parsing_spider|0|0 -bigbench|sentence_ambiguity|0|0 -bigbench|similarities_abstraction|0|0 -bigbench|simp_turing_concept|0|0 -bigbench|simple_arithmetic_json_multiple_choice|0|0 -bigbench|simple_arithmetic_json_subtasks|0|0 -bigbench|simple_arithmetic_json|0|0 -bigbench|simple_arithmetic_multiple_targets_json|0|0 -bigbench|simple_ethical_questions|0|0 -bigbench|simple_text_editing|0|0 -bigbench|snarks|0|0 -bigbench|social_iqa|0|0 -bigbench|social_support|0|0 -bigbench|sports_understanding|0|0 -bigbench|strange_stories|0|0 -bigbench|strategyqa|0|0 -bigbench|sufficient_information|0|0 -bigbench|suicide_risk|0|0 -bigbench|swahili_english_proverbs|0|0 -bigbench|swedish_to_german_proverbs|0|0 -bigbench|symbol_interpretation|0|0 -bigbench|tellmewhy|0|0 -bigbench|temporal_sequences|0|0 -bigbench|tense|0|0 -bigbench|timedial|0|0 -bigbench|topical_chat|0|0 -bigbench|tracking_shuffled_objects|0|0 -bigbench|understanding_fables|0|0 -bigbench|undo_permutation|0|0 -bigbench|unit_conversion|0|0 -bigbench|unit_interpretation|0|0 -bigbench|unnatural_in_context_learning|0|0 -bigbench|vitaminc_fact_verification|0|0 -bigbench|what_is_the_tao|0|0 -bigbench|which_wiki_edit|0|0 -bigbench|wino_x_german|0|0 -bigbench|winowhy|0|0 -bigbench|word_sorting|0|0 -bigbench|word_unscrambling|0|0 -helm|babi_qa|0|0 -helm|bbq:Age|0|0 -helm|bbq:Disability_status|0|0 -helm|bbq:Gender_identity|0|0 -helm|bbq:Nationality|0|0 -helm|bbq:Physical_appearance|0|0 -helm|bbq:Race_ethnicity|0|0 -helm|bbq:Race_x_SES|0|0 -helm|bbq:Race_x_gender|0|0 -helm|bbq:Religion|0|0 -helm|bbq:SES|0|0 -helm|bbq:Sexual_orientation|0|0 -helm|bbq|0|0 -helm|bigbench:auto_debugging|0|0 -helm|bigbench:bbq_lite_json:age_ambig|0|0 
-helm|bigbench:bbq_lite_json:age_disambig|0|0 -helm|bigbench:bbq_lite_json:disability_status_ambig|0|0 -helm|bigbench:bbq_lite_json:disability_status_disambig|0|0 -helm|bigbench:bbq_lite_json:gender_identity_ambig|0|0 -helm|bigbench:bbq_lite_json:gender_identity_disambig|0|0 -helm|bigbench:bbq_lite_json:nationality_ambig|0|0 -helm|bigbench:bbq_lite_json:nationality_disambig|0|0 -helm|bigbench:bbq_lite_json:physical_appearance_ambig|0|0 -helm|bigbench:bbq_lite_json:physical_appearance_disambig|0|0 -helm|bigbench:bbq_lite_json:race_ethnicity_ambig|0|0 -helm|bigbench:bbq_lite_json:race_ethnicity_disambig|0|0 -helm|bigbench:bbq_lite_json:religion_ambig|0|0 -helm|bigbench:bbq_lite_json:religion_disambig|0|0 -helm|bigbench:bbq_lite_json:ses_ambig|0|0 -helm|bigbench:bbq_lite_json:ses_disambig|0|0 -helm|bigbench:bbq_lite_json:sexual_orientation_ambig|0|0 -helm|bigbench:bbq_lite_json:sexual_orientation_disambig|0|0 -helm|bigbench:code_line_description|0|0 -helm|bigbench:conceptual_combinations:contradictions|0|0 -helm|bigbench:conceptual_combinations:emergent_properties|0|0 -helm|bigbench:conceptual_combinations:fanciful_fictional_combinations|0|0 -helm|bigbench:conceptual_combinations:homonyms|0|0 -helm|bigbench:conceptual_combinations:invented_words|0|0 -helm|bigbench:conlang_translation:adna_from|0|0 -helm|bigbench:conlang_translation:adna_to|0|0 -helm|bigbench:conlang_translation:atikampe_from|0|0 -helm|bigbench:conlang_translation:atikampe_to|0|0 -helm|bigbench:conlang_translation:gornam_from|0|0 -helm|bigbench:conlang_translation:gornam_to|0|0 -helm|bigbench:conlang_translation:holuan_from|0|0 -helm|bigbench:conlang_translation:holuan_to|0|0 -helm|bigbench:conlang_translation:mkafala_from|0|0 -helm|bigbench:conlang_translation:mkafala_to|0|0 -helm|bigbench:conlang_translation:postpositive_english_from|0|0 -helm|bigbench:conlang_translation:postpositive_english_to|0|0 -helm|bigbench:conlang_translation:unapuri_from|0|0 -helm|bigbench:conlang_translation:unapuri_to|0|0 -helm|bigbench:conlang_translation:vaomi_from|0|0 -helm|bigbench:conlang_translation:vaomi_to|0|0 -helm|bigbench:emoji_movie|0|0 -helm|bigbench:formal_fallacies_syllogisms_negation|0|0 -helm|bigbench:hindu_knowledge|0|0 -helm|bigbench:known_unknowns|0|0 -helm|bigbench:language_identification|0|0 -helm|bigbench:linguistics_puzzles|0|0 -helm|bigbench:logic_grid_puzzle|0|0 -helm|bigbench:logical_deduction-five_objects|0|0 -helm|bigbench:logical_deduction-seven_objects|0|0 -helm|bigbench:logical_deduction-three_objects|0|0 -helm|bigbench:misconceptions_russian|0|0 -helm|bigbench:novel_concepts|0|0 -helm|bigbench:operators|0|0 -helm|bigbench:parsinlu_reading_comprehension|0|0 -helm|bigbench:play_dialog_same_or_different|0|0 -helm|bigbench:repeat_copy_logic|0|0 -helm|bigbench:strange_stories-boolean|0|0 -helm|bigbench:strange_stories-multiple_choice|0|0 -helm|bigbench:strategyqa|0|0 -helm|bigbench:symbol_interpretation-adversarial|0|0 -helm|bigbench:symbol_interpretation-emoji_agnostic|0|0 -helm|bigbench:symbol_interpretation-name_agnostic|0|0 -helm|bigbench:symbol_interpretation-plain|0|0 -helm|bigbench:symbol_interpretation-tricky|0|0 -helm|bigbench:vitaminc_fact_verification|0|0 -helm|bigbench:winowhy|0|0 -helm|blimp:adjunct_island|0|0 -helm|blimp:anaphor_gender_agreement|0|0 -helm|blimp:anaphor_number_agreement|0|0 -helm|blimp:animate_subject_passive|0|0 -helm|blimp:animate_subject_trans|0|0 -helm|blimp:causative|0|0 -helm|blimp:complex_NP_island|0|0 -helm|blimp:coordinate_structure_constraint_complex_left_branch|0|0 
-helm|blimp:coordinate_structure_constraint_object_extraction|0|0 -helm|blimp:determiner_noun_agreement_1|0|0 -helm|blimp:determiner_noun_agreement_2|0|0 -helm|blimp:determiner_noun_agreement_irregular_1|0|0 -helm|blimp:determiner_noun_agreement_irregular_2|0|0 -helm|blimp:determiner_noun_agreement_with_adj_2|0|0 -helm|blimp:determiner_noun_agreement_with_adj_irregular_1|0|0 -helm|blimp:determiner_noun_agreement_with_adj_irregular_2|0|0 -helm|blimp:determiner_noun_agreement_with_adjective_1|0|0 -helm|blimp:distractor_agreement_relational_noun|0|0 -helm|blimp:distractor_agreement_relative_clause|0|0 -helm|blimp:drop_argument|0|0 -helm|blimp:ellipsis_n_bar_1|0|0 -helm|blimp:ellipsis_n_bar_2|0|0 -helm|blimp:existential_there_object_raising|0|0 -helm|blimp:existential_there_quantifiers_1|0|0 -helm|blimp:existential_there_quantifiers_2|0|0 -helm|blimp:existential_there_subject_raising|0|0 -helm|blimp:expletive_it_object_raising|0|0 -helm|blimp:inchoative|0|0 -helm|blimp:intransitive|0|0 -helm|blimp:irregular_past_participle_adjectives|0|0 -helm|blimp:irregular_past_participle_verbs|0|0 -helm|blimp:irregular_plural_subject_verb_agreement_1|0|0 -helm|blimp:irregular_plural_subject_verb_agreement_2|0|0 -helm|blimp:left_branch_island_echo_question|0|0 -helm|blimp:left_branch_island_simple_question|0|0 -helm|blimp:matrix_question_npi_licensor_present|0|0 -helm|blimp:npi_present_1|0|0 -helm|blimp:npi_present_2|0|0 -helm|blimp:only_npi_licensor_present|0|0 -helm|blimp:only_npi_scope|0|0 -helm|blimp:passive_1|0|0 -helm|blimp:passive_2|0|0 -helm|blimp:principle_A_c_command|0|0 -helm|blimp:principle_A_case_1|0|0 -helm|blimp:principle_A_case_2|0|0 -helm|blimp:principle_A_domain_1|0|0 -helm|blimp:principle_A_domain_2|0|0 -helm|blimp:principle_A_domain_3|0|0 -helm|blimp:principle_A_reconstruction|0|0 -helm|blimp:regular_plural_subject_verb_agreement_1|0|0 -helm|blimp:regular_plural_subject_verb_agreement_2|0|0 -helm|blimp:sentential_negation_npi_licensor_present|0|0 -helm|blimp:sentential_negation_npi_scope|0|0 -helm|blimp:sentential_subject_island|0|0 -helm|blimp:superlative_quantifiers_1|0|0 -helm|blimp:superlative_quantifiers_2|0|0 -helm|blimp:tough_vs_raising_1|0|0 -helm|blimp:tough_vs_raising_2|0|0 -helm|blimp:transitive|0|0 -helm|blimp:wh_island|0|0 -helm|blimp:wh_questions_object_gap|0|0 -helm|blimp:wh_questions_subject_gap_long_distance|0|0 -helm|blimp:wh_questions_subject_gap|0|0 -helm|blimp:wh_vs_that_no_gap_long_distance|0|0 -helm|blimp:wh_vs_that_no_gap|0|0 -helm|blimp:wh_vs_that_with_gap_long_distance|0|0 -helm|blimp:wh_vs_that_with_gap|0|0 -helm|bold:gender|0|0 -helm|bold:political_ideology|0|0 -helm|bold:profession|0|0 -helm|bold:race|0|0 -helm|bold:religious_ideology|0|0 -helm|bold|0|0 -helm|boolq:contrastset|0|0 -helm|boolq|0|0 -helm|civil_comments:LGBTQ|0|0 -helm|civil_comments:black|0|0 -helm|civil_comments:christian|0|0 -helm|civil_comments:female|0|0 -helm|civil_comments:male|0|0 -helm|civil_comments:muslim|0|0 -helm|civil_comments:other_religions|0|0 -helm|civil_comments:white|0|0 -helm|civil_comments|0|0 -helm|commonsenseqa|0|0 -helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_125|0|0 -helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_25|0|0 -helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_5|0|0 -helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_125|0|0 -helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_25|0|0 -helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_5|0|0 
-helm|copyright:oh_the_places|0|0 -helm|copyright:pilot|0|0 -helm|copyright:popular_books-prefix_length_10|0|0 -helm|copyright:popular_books-prefix_length_125|0|0 -helm|copyright:popular_books-prefix_length_250|0|0 -helm|copyright:popular_books-prefix_length_25|0|0 -helm|copyright:popular_books-prefix_length_50|0|0 -helm|copyright:popular_books-prefix_length_5|0|0 -helm|copyright:prompt_num_line_1-min_lines_20|0|0 -helm|copyright:prompt_num_line_10-min_lines_20|0|0 -helm|copyright:prompt_num_line_5-min_lines_20|0|0 -helm|covid_dialogue|0|0 -helm|dyck_language:2|0|0 -helm|dyck_language:3|0|0 -helm|dyck_language:4|0|0 -helm|entity_data_imputation:Buy|0|0 -helm|entity_data_imputation:Restaurant|0|0 -helm|entity_matching:Abt_Buy|0|0 -helm|entity_matching:Amazon_Google|0|0 -helm|entity_matching:Beer|0|0 -helm|entity_matching:Company|0|0 -helm|entity_matching:DBLP_ACM|0|0 -helm|entity_matching:DBLP_GoogleScholar|0|0 -helm|entity_matching:Dirty_DBLP_ACM|0|0 -helm|entity_matching:Dirty_DBLP_GoogleScholar|0|0 -helm|entity_matching:Dirty_Walmart_Amazon|0|0 -helm|entity_matching:Dirty_iTunes_Amazon|0|0 -helm|entity_matching:Fodors_Zagats|0|0 -helm|entity_matching:Walmart_Amazon|0|0 -helm|entity_matching:iTunes_Amazon|0|0 -helm|hellaswag|0|0 -helm|humaneval|0|0 -helm|imdb:contrastset|0|0 -helm|imdb|0|0 -helm|interactive_qa_mmlu:abstract_algebra|0|0 -helm|interactive_qa_mmlu:college_chemistry|0|0 -helm|interactive_qa_mmlu:global_facts|0|0 -helm|interactive_qa_mmlu:miscellaneous|0|0 -helm|interactive_qa_mmlu:nutrition|0|0 -helm|interactive_qa_mmlu:us_foreign_policy|0|0 -helm|legal_summarization:billsum|0|0 -helm|legal_summarization:eurlexsum|0|0 -helm|legal_summarization:multilexsum|0|0 -helm|legalsupport|0|0 -helm|lexglue:case_hold|0|0 -helm|lexglue:ecthr_a|0|0 -helm|lexglue:ecthr_b|0|0 -helm|lexglue:eurlex|0|0 -helm|lexglue:ledgar|0|0 -helm|lexglue:scotus|0|0 -helm|lexglue:unfair_tos|0|0 -helm|lextreme:brazilian_court_decisions_judgment|0|0 -helm|lextreme:brazilian_court_decisions_unanimity|0|0 -helm|lextreme:covid19_emergency_event|0|0 -helm|lextreme:german_argument_mining|0|0 -helm|lextreme:greek_legal_code_chapter|0|0 -helm|lextreme:greek_legal_code_subject|0|0 -helm|lextreme:greek_legal_code_volume|0|0 -helm|lextreme:greek_legal_ner|0|0 -helm|lextreme:legalnero|0|0 -helm|lextreme:lener_br|0|0 -helm|lextreme:mapa_coarse|0|0 -helm|lextreme:mapa_fine|0|0 -helm|lextreme:multi_eurlex_level_1|0|0 -helm|lextreme:multi_eurlex_level_2|0|0 -helm|lextreme:multi_eurlex_level_3|0|0 -helm|lextreme:online_terms_of_service_clause_topics|0|0 -helm|lextreme:online_terms_of_service_unfairness_levels|0|0 -helm|lextreme:swiss_judgment_prediction|0|0 -helm|lsat_qa:assignment|0|0 -helm|lsat_qa:grouping|0|0 -helm|lsat_qa:miscellaneous|0|0 -helm|lsat_qa:ordering|0|0 -helm|lsat_qa|0|0 -helm|me_q_sum|0|0 -helm|med_dialog:healthcaremagic|0|0 -helm|med_dialog:icliniq|0|0 -helm|med_mcqa|0|0 -helm|med_paragraph_simplification|0|0 -helm|med_qa|0|0 -helm|mmlu:abstract_algebra|0|0 -helm|mmlu:anatomy|0|0 -helm|mmlu:astronomy|0|0 -helm|mmlu:business_ethics|0|0 -helm|mmlu:clinical_knowledge|0|0 -helm|mmlu:college_biology|0|0 -helm|mmlu:college_chemistry|0|0 -helm|mmlu:college_computer_science|0|0 -helm|mmlu:college_mathematics|0|0 -helm|mmlu:college_medicine|0|0 -helm|mmlu:college_physics|0|0 -helm|mmlu:computer_security|0|0 -helm|mmlu:conceptual_physics|0|0 -helm|mmlu:econometrics|0|0 -helm|mmlu:electrical_engineering|0|0 -helm|mmlu:elementary_mathematics|0|0 -helm|mmlu:formal_logic|0|0 -helm|mmlu:global_facts|0|0 
-helm|mmlu:high_school_biology|0|0 -helm|mmlu:high_school_chemistry|0|0 -helm|mmlu:high_school_computer_science|0|0 -helm|mmlu:high_school_european_history|0|0 -helm|mmlu:high_school_geography|0|0 -helm|mmlu:high_school_government_and_politics|0|0 -helm|mmlu:high_school_macroeconomics|0|0 -helm|mmlu:high_school_mathematics|0|0 -helm|mmlu:high_school_microeconomics|0|0 -helm|mmlu:high_school_physics|0|0 -helm|mmlu:high_school_psychology|0|0 -helm|mmlu:high_school_statistics|0|0 -helm|mmlu:high_school_us_history|0|0 -helm|mmlu:high_school_world_history|0|0 -helm|mmlu:human_aging|0|0 -helm|mmlu:human_sexuality|0|0 -helm|mmlu:international_law|0|0 -helm|mmlu:jurisprudence|0|0 -helm|mmlu:logical_fallacies|0|0 -helm|mmlu:machine_learning|0|0 -helm|mmlu:management|0|0 -helm|mmlu:marketing|0|0 -helm|mmlu:medical_genetics|0|0 -helm|mmlu:miscellaneous|0|0 -helm|mmlu:moral_disputes|0|0 -helm|mmlu:moral_scenarios|0|0 -helm|mmlu:nutrition|0|0 -helm|mmlu:philosophy|0|0 -helm|mmlu:prehistory|0|0 -helm|mmlu:professional_accounting|0|0 -helm|mmlu:professional_law|0|0 -helm|mmlu:professional_medicine|0|0 -helm|mmlu:professional_psychology|0|0 -helm|mmlu:public_relations|0|0 -helm|mmlu:security_studies|0|0 -helm|mmlu:sociology|0|0 -helm|mmlu:us_foreign_policy|0|0 -helm|mmlu:virology|0|0 -helm|mmlu:world_religions|0|0 -helm|mmlu|0|0 -helm|narrativeqa|0|0 -helm|numeracy:linear_example|0|0 -helm|numeracy:linear_standard|0|0 -helm|numeracy:parabola_example|0|0 -helm|numeracy:parabola_standard|0|0 -helm|numeracy:paraboloid_example|0|0 -helm|numeracy:paraboloid_standard|0|0 -helm|numeracy:plane_example|0|0 -helm|numeracy:plane_standard|0|0 -helm|openbookqa|0|0 -helm|piqa|0|0 -helm|pubmedqa|0|0 -helm|quac|0|0 -helm|raft:ade_corpus_v2|0|0 -helm|raft:banking_77|0|0 -helm|raft:neurips_impact_statement_risks|0|0 -helm|raft:one_stop_english|0|0 -helm|raft:overruling|0|0 -helm|raft:semiconductor_org_types|0|0 -helm|raft:systematic_review_inclusion|0|0 -helm|raft:tai_safety_research|0|0 -helm|raft:terms_of_service|0|0 -helm|raft:tweet_eval_hate|0|0 -helm|raft:twitter_complaints|0|0 -helm|real_toxicity_prompts|0|0 -helm|siqa|0|0 -helm|summarization:cnn-dm|0|0 -helm|summarization:xsum-sampled|0|0 -helm|summarization:xsum|0|0 -helm|synthetic_reasoning:induction|0|0 -helm|synthetic_reasoning:natural_easy|0|0 -helm|synthetic_reasoning:natural_hard|0|0 -helm|synthetic_reasoning:pattern_match|0|0 -helm|synthetic_reasoning:variable_substitution|0|0 -helm|the_pile:arxiv|0|0 -helm|the_pile:bibliotik|0|0 -helm|the_pile:commoncrawl|0|0 -helm|the_pile:dm-mathematics|0|0 -helm|the_pile:enron|0|0 -helm|the_pile:europarl|0|0 -helm|the_pile:freelaw|0|0 -helm|the_pile:github|0|0 -helm|the_pile:gutenberg|0|0 -helm|the_pile:hackernews|0|0 -helm|the_pile:nih-exporter|0|0 -helm|the_pile:opensubtitles|0|0 -helm|the_pile:openwebtext2|0|0 -helm|the_pile:pubmed-abstracts|0|0 -helm|the_pile:pubmed-central|0|0 -helm|the_pile:stackexchange|0|0 -helm|the_pile:upsto|0|0 -helm|the_pile:wikipedia|0|0 -helm|the_pile:youtubesubtitles|0|0 -helm|truthfulqa|0|0 -helm|twitterAAE:aa|0|0 -helm|twitterAAE:white|0|0 -helm|wikifact:applies_to_jurisdiction|0|0 -helm|wikifact:atomic_number|0|0 -helm|wikifact:author|0|0 -helm|wikifact:award_received|0|0 -helm|wikifact:basic_form_of_government|0|0 -helm|wikifact:capital_of|0|0 -helm|wikifact:capital|0|0 -helm|wikifact:central_bank|0|0 -helm|wikifact:composer|0|0 -helm|wikifact:continent|0|0 -helm|wikifact:country_of_citizenship|0|0 -helm|wikifact:country_of_origin|0|0 -helm|wikifact:country|0|0 
-helm|wikifact:creator|0|0 -helm|wikifact:currency|0|0 -helm|wikifact:defendant|0|0 -helm|wikifact:developer|0|0 -helm|wikifact:diplomatic_relation|0|0 -helm|wikifact:director|0|0 -helm|wikifact:discoverer_or_inventor|0|0 -helm|wikifact:drug_or_therapy_used_for_treatment|0|0 -helm|wikifact:educated_at|0|0 -helm|wikifact:electron_configuration|0|0 -helm|wikifact:employer|0|0 -helm|wikifact:field_of_work|0|0 -helm|wikifact:file_extension|0|0 -helm|wikifact:genetic_association|0|0 -helm|wikifact:genre|0|0 -helm|wikifact:has_part|0|0 -helm|wikifact:head_of_government|0|0 -helm|wikifact:head_of_state|0|0 -helm|wikifact:headquarters_location|0|0 -helm|wikifact:industry|0|0 -helm|wikifact:influenced_by|0|0 -helm|wikifact:instance_of|0|0 -helm|wikifact:instrument|0|0 -helm|wikifact:language_of_work_or_name|0|0 -helm|wikifact:languages_spoken_written_or_signed|0|0 -helm|wikifact:laws_applied|0|0 -helm|wikifact:located_in_the_administrative_territorial_entity|0|0 -helm|wikifact:location_of_discovery|0|0 -helm|wikifact:location_of_formation|0|0 -helm|wikifact:location|0|0 -helm|wikifact:majority_opinion_by|0|0 -helm|wikifact:manufacturer|0|0 -helm|wikifact:measured_physical_quantity|0|0 -helm|wikifact:medical_condition_treated|0|0 -helm|wikifact:member_of_political_party|0|0 -helm|wikifact:member_of_sports_team|0|0 -helm|wikifact:member_of|0|0 -helm|wikifact:movement|0|0 -helm|wikifact:named_after|0|0 -helm|wikifact:native_language|0|0 -helm|wikifact:number_of_processor_cores|0|0 -helm|wikifact:occupation|0|0 -helm|wikifact:office_held_by_head_of_government|0|0 -helm|wikifact:office_held_by_head_of_state|0|0 -helm|wikifact:official_language|0|0 -helm|wikifact:operating_system|0|0 -helm|wikifact:original_language_of_film_or_TV_show|0|0 -helm|wikifact:original_network|0|0 -helm|wikifact:overrules|0|0 -helm|wikifact:owned_by|0|0 -helm|wikifact:part_of|0|0 -helm|wikifact:participating_team|0|0 -helm|wikifact:place_of_birth|0|0 -helm|wikifact:place_of_death|0|0 -helm|wikifact:plaintiff|0|0 -helm|wikifact:position_held|0|0 -helm|wikifact:position_played_on_team|0|0 -helm|wikifact:programming_language|0|0 -helm|wikifact:recommended_unit_of_measurement|0|0 -helm|wikifact:record_label|0|0 -helm|wikifact:religion|0|0 -helm|wikifact:repealed_by|0|0 -helm|wikifact:shares_border_with|0|0 -helm|wikifact:solved_by|0|0 -helm|wikifact:statement_describes|0|0 -helm|wikifact:stock_exchange|0|0 -helm|wikifact:subclass_of|0|0 -helm|wikifact:subsidiary|0|0 -helm|wikifact:symptoms_and_signs|0|0 -helm|wikifact:therapeutic_area|0|0 -helm|wikifact:time_of_discovery_or_invention|0|0 -helm|wikifact:twinned_administrative_body|0|0 -helm|wikifact:work_location|0|0 -helm|wikitext:103|0|0 -helm|wmt14:cs-en|0|0 -helm|wmt14:de-en|0|0 -helm|wmt14:fr-en|0|0 -helm|wmt14:hi-en|0|0 -helm|wmt14:ru-en|0|0 -lighteval|anli:r1|0|0 -lighteval|anli:r2|0|0 -lighteval|anli:r3|0|0 -lighteval|anli|0|0 -leaderboard|arc:challenge|0|0 -lighteval|arc:easy|0|0 -lighteval|arithmetic:1dc|0|0 -lighteval|arithmetic:2da|0|0 -lighteval|arithmetic:2dm|0|0 -lighteval|arithmetic:2ds|0|0 -lighteval|arithmetic:3da|0|0 -lighteval|arithmetic:3ds|0|0 -lighteval|arithmetic:4da|0|0 -lighteval|arithmetic:4ds|0|0 -lighteval|arithmetic:5da|0|0 -lighteval|arithmetic:5ds|0|0 -lighteval|asdiv|0|0 -lighteval|blimp:adjunct_island|0|0 -lighteval|blimp:anaphor_gender_agreement|0|0 -lighteval|blimp:anaphor_number_agreement|0|0 -lighteval|blimp:animate_subject_passive|0|0 -lighteval|blimp:animate_subject_trans|0|0 -lighteval|blimp:causative|0|0 
-lighteval|blimp:complex_NP_island|0|0 -lighteval|blimp:coordinate_structure_constraint_complex_left_branch|0|0 -lighteval|blimp:coordinate_structure_constraint_object_extraction|0|0 -lighteval|blimp:determiner_noun_agreement_1|0|0 -lighteval|blimp:determiner_noun_agreement_2|0|0 -lighteval|blimp:determiner_noun_agreement_irregular_1|0|0 -lighteval|blimp:determiner_noun_agreement_irregular_2|0|0 -lighteval|blimp:determiner_noun_agreement_with_adj_2|0|0 -lighteval|blimp:determiner_noun_agreement_with_adj_irregular_1|0|0 -lighteval|blimp:determiner_noun_agreement_with_adj_irregular_2|0|0 -lighteval|blimp:determiner_noun_agreement_with_adjective_1|0|0 -lighteval|blimp:distractor_agreement_relational_noun|0|0 -lighteval|blimp:distractor_agreement_relative_clause|0|0 -lighteval|blimp:drop_argument|0|0 -lighteval|blimp:ellipsis_n_bar_1|0|0 -lighteval|blimp:ellipsis_n_bar_2|0|0 -lighteval|blimp:existential_there_object_raising|0|0 -lighteval|blimp:existential_there_quantifiers_1|0|0 -lighteval|blimp:existential_there_quantifiers_2|0|0 -lighteval|blimp:existential_there_subject_raising|0|0 -lighteval|blimp:expletive_it_object_raising|0|0 -lighteval|blimp:inchoative|0|0 -lighteval|blimp:intransitive|0|0 -lighteval|blimp:irregular_past_participle_adjectives|0|0 -lighteval|blimp:irregular_past_participle_verbs|0|0 -lighteval|blimp:irregular_plural_subject_verb_agreement_1|0|0 -lighteval|blimp:irregular_plural_subject_verb_agreement_2|0|0 -lighteval|blimp:left_branch_island_echo_question|0|0 -lighteval|blimp:left_branch_island_simple_question|0|0 -lighteval|blimp:matrix_question_npi_licensor_present|0|0 -lighteval|blimp:npi_present_1|0|0 -lighteval|blimp:npi_present_2|0|0 -lighteval|blimp:only_npi_licensor_present|0|0 -lighteval|blimp:only_npi_scope|0|0 -lighteval|blimp:passive_1|0|0 -lighteval|blimp:passive_2|0|0 -lighteval|blimp:principle_A_c_command|0|0 -lighteval|blimp:principle_A_case_1|0|0 -lighteval|blimp:principle_A_case_2|0|0 -lighteval|blimp:principle_A_domain_1|0|0 -lighteval|blimp:principle_A_domain_2|0|0 -lighteval|blimp:principle_A_domain_3|0|0 -lighteval|blimp:principle_A_reconstruction|0|0 -lighteval|blimp:regular_plural_subject_verb_agreement_1|0|0 -lighteval|blimp:regular_plural_subject_verb_agreement_2|0|0 -lighteval|blimp:sentential_negation_npi_licensor_present|0|0 -lighteval|blimp:sentential_negation_npi_scope|0|0 -lighteval|blimp:sentential_subject_island|0|0 -lighteval|blimp:superlative_quantifiers_1|0|0 -lighteval|blimp:superlative_quantifiers_2|0|0 -lighteval|blimp:tough_vs_raising_1|0|0 -lighteval|blimp:tough_vs_raising_2|0|0 -lighteval|blimp:transitive|0|0 -lighteval|blimp:wh_island|0|0 -lighteval|blimp:wh_questions_object_gap|0|0 -lighteval|blimp:wh_questions_subject_gap_long_distance|0|0 -lighteval|blimp:wh_questions_subject_gap|0|0 -lighteval|blimp:wh_vs_that_no_gap_long_distance|0|0 -lighteval|blimp:wh_vs_that_no_gap|0|0 -lighteval|blimp:wh_vs_that_with_gap_long_distance|0|0 -lighteval|blimp:wh_vs_that_with_gap|0|0 -lighteval|coqa_bb|0|0 -lighteval|coqa|0|0 -lighteval|drop|0|0 -lighteval|ethics:commonsense|0|0 -lighteval|ethics:deontology|0|0 -lighteval|ethics:justice|0|0 -lighteval|ethics:utilitarianism|0|0 -lighteval|ethics:virtue|0|0 -lighteval|glue:cola|0|0 -lighteval|glue:mnli_mismatched|0|0 -lighteval|glue:mnli|0|0 -lighteval|glue:mrpc|0|0 -lighteval|glue:qnli|0|0 -lighteval|glue:qqp|0|0 -lighteval|glue:rte|0|0 -lighteval|glue:sst2|0|0 -lighteval|glue:stsb|0|0 -lighteval|glue:wnli|0|0 -leaderboard|gsm8k|0|0 -lighteval|headqa:en|0|0 -lighteval|headqa:es|0|0 
-leaderboard|hellaswag|0|0 -lighteval|iwslt17:ar-en|0|0 -lighteval|iwslt17:de-en|0|0 -lighteval|iwslt17:en-ar|0|0 -lighteval|iwslt17:en-de|0|0 -lighteval|iwslt17:en-fr|0|0 -lighteval|iwslt17:en-ja|0|0 -lighteval|iwslt17:en-ko|0|0 -lighteval|iwslt17:en-zh|0|0 -lighteval|iwslt17:fr-en|0|0 -lighteval|iwslt17:ja-en|0|0 -lighteval|iwslt17:ko-en|0|0 -lighteval|iwslt17:zh-en|0|0 -lighteval|lambada:openai:de|0|0 -lighteval|lambada:openai:en|0|0 -lighteval|lambada:openai:es|0|0 -lighteval|lambada:openai:fr|0|0 -lighteval|lambada:openai:it|0|0 -lighteval|lambada:openai_cloze|0|0 -lighteval|lambada:openai|0|0 -lighteval|lambada:standard_cloze|0|0 -lighteval|lambada:standard|0|0 -lighteval|logiqa|0|0 -lighteval|math:algebra|0|0 -lighteval|math:counting_and_probability|0|0 -lighteval|math:geometry|0|0 -lighteval|math:intermediate_algebra|0|0 -lighteval|math:number_theory|0|0 -lighteval|math:prealgebra|0|0 -lighteval|math:precalculus|0|0 -lighteval|mathqa|0|0 -lighteval|mgsm:bn|0|0 -lighteval|mgsm:de|0|0 -lighteval|mgsm:en|0|0 -lighteval|mgsm:es|0|0 -lighteval|mgsm:fr|0|0 -lighteval|mgsm:ja|0|0 -lighteval|mgsm:ru|0|0 -lighteval|mgsm:sw|0|0 -lighteval|mgsm:te|0|0 -lighteval|mgsm:th|0|0 -lighteval|mgsm:zh|0|0 -leaderboard|mmlu:abstract_algebra|0|0 -leaderboard|mmlu:anatomy|0|0 -leaderboard|mmlu:astronomy|0|0 -leaderboard|mmlu:business_ethics|0|0 -leaderboard|mmlu:clinical_knowledge|0|0 -leaderboard|mmlu:college_biology|0|0 -leaderboard|mmlu:college_chemistry|0|0 -leaderboard|mmlu:college_computer_science|0|0 -leaderboard|mmlu:college_mathematics|0|0 -leaderboard|mmlu:college_medicine|0|0 -leaderboard|mmlu:college_physics|0|0 -leaderboard|mmlu:computer_security|0|0 -leaderboard|mmlu:conceptual_physics|0|0 -leaderboard|mmlu:econometrics|0|0 -leaderboard|mmlu:electrical_engineering|0|0 -leaderboard|mmlu:elementary_mathematics|0|0 -leaderboard|mmlu:formal_logic|0|0 -leaderboard|mmlu:global_facts|0|0 -leaderboard|mmlu:high_school_biology|0|0 -leaderboard|mmlu:high_school_chemistry|0|0 -leaderboard|mmlu:high_school_computer_science|0|0 -leaderboard|mmlu:high_school_european_history|0|0 -leaderboard|mmlu:high_school_geography|0|0 -leaderboard|mmlu:high_school_government_and_politics|0|0 -leaderboard|mmlu:high_school_macroeconomics|0|0 -leaderboard|mmlu:high_school_mathematics|0|0 -leaderboard|mmlu:high_school_microeconomics|0|0 -leaderboard|mmlu:high_school_physics|0|0 -leaderboard|mmlu:high_school_psychology|0|0 -leaderboard|mmlu:high_school_statistics|0|0 -leaderboard|mmlu:high_school_us_history|0|0 -leaderboard|mmlu:high_school_world_history|0|0 -leaderboard|mmlu:human_aging|0|0 -leaderboard|mmlu:human_sexuality|0|0 -leaderboard|mmlu:international_law|0|0 -leaderboard|mmlu:jurisprudence|0|0 -leaderboard|mmlu:logical_fallacies|0|0 -leaderboard|mmlu:machine_learning|0|0 -leaderboard|mmlu:management|0|0 -leaderboard|mmlu:marketing|0|0 -leaderboard|mmlu:medical_genetics|0|0 -leaderboard|mmlu:miscellaneous|0|0 -leaderboard|mmlu:moral_disputes|0|0 -leaderboard|mmlu:moral_scenarios|0|0 -leaderboard|mmlu:nutrition|0|0 -leaderboard|mmlu:philosophy|0|0 -leaderboard|mmlu:prehistory|0|0 -leaderboard|mmlu:professional_accounting|0|0 -leaderboard|mmlu:professional_law|0|0 -leaderboard|mmlu:professional_medicine|0|0 -leaderboard|mmlu:professional_psychology|0|0 -leaderboard|mmlu:public_relations|0|0 -leaderboard|mmlu:security_studies|0|0 -leaderboard|mmlu:sociology|0|0 -leaderboard|mmlu:us_foreign_policy|0|0 -leaderboard|mmlu:virology|0|0 -leaderboard|mmlu:world_religions|0|0 -lighteval|mtnt2019:en-fr|0|0 
-lighteval|mtnt2019:en-ja|0|0 -lighteval|mtnt2019:fr-en|0|0 -lighteval|mtnt2019:ja-en|0|0 -lighteval|mutual_plus|0|0 -lighteval|mutual|0|0 -lighteval|openbookqa|0|0 -lighteval|piqa|0|0 -lighteval|prost|0|0 -lighteval|pubmedqa|0|0 -lighteval|qa4mre:2011|0|0 -lighteval|qa4mre:2012|0|0 -lighteval|qa4mre:2013|0|0 -lighteval|qasper_ll|0|0 -lighteval|qasper|0|0 -lighteval|race:high|0|0 -lighteval|sciq|0|0 -lighteval|storycloze:2016|0|0 -lighteval|storycloze:2018|0|0 -lighteval|super_glue:boolq|0|0 -lighteval|super_glue:cb|0|0 -lighteval|super_glue:copa|0|0 -lighteval|super_glue:multirc|0|0 -lighteval|super_glue:record|0|0 -lighteval|super_glue:rte|0|0 -lighteval|super_glue:wic|0|0 -lighteval|super_glue:wsc|0|0 -lighteval|swag|0|0 -lighteval|the_pile:arxiv|0|0 -lighteval|the_pile:bookcorpus2|0|0 -lighteval|the_pile:books3|0|0 -lighteval|the_pile:dm-mathematics|0|0 -lighteval|the_pile:enron|0|0 -lighteval|the_pile:europarl|0|0 -lighteval|the_pile:freelaw|0|0 -lighteval|the_pile:github|0|0 -lighteval|the_pile:gutenberg|0|0 -lighteval|the_pile:hackernews|0|0 -lighteval|the_pile:nih-exporter|0|0 -lighteval|the_pile:opensubtitles|0|0 -lighteval|the_pile:openwebtext2|0|0 -lighteval|the_pile:philpapers|0|0 -lighteval|the_pile:pile-cc|0|0 -lighteval|the_pile:pubmed-abstracts|0|0 -lighteval|the_pile:pubmed-central|0|0 -lighteval|the_pile:stackexchange|0|0 -lighteval|the_pile:ubuntu-irc|0|0 -lighteval|the_pile:uspto|0|0 -lighteval|the_pile:wikipedia|0|0 -lighteval|the_pile:youtubesubtitles|0|0 -lighteval|toxigen|0|0 -lighteval|triviaqa|0|0 -lighteval|truthfulqa:gen|0|0 -leaderboard|truthfulqa:mc|0|0 -lighteval|unscramble:anagrams1|0|0 -lighteval|unscramble:anagrams2|0|0 -lighteval|unscramble:cycle_letters|0|0 -lighteval|unscramble:random_insertion|0|0 -lighteval|unscramble:reversed_words|0|0 -lighteval|webqs|0|0 -lighteval|wikitext|0|0 -leaderboard|winogrande|0|0 -lighteval|wmt08:cs-en|0|0 -lighteval|wmt08:de-en|0|0 -lighteval|wmt08:en-cs|0|0 -lighteval|wmt08:en-de|0|0 -lighteval|wmt08:en-es|0|0 -lighteval|wmt08:en-fr|0|0 -lighteval|wmt08:en-hu|0|0 -lighteval|wmt08:es-en|0|0 -lighteval|wmt08:fr-en|0|0 -lighteval|wmt08:hu-en|0|0 -lighteval|wmt09:cs-en|0|0 -lighteval|wmt09:de-en|0|0 -lighteval|wmt09:en-cs|0|0 -lighteval|wmt09:en-de|0|0 -lighteval|wmt09:en-es|0|0 -lighteval|wmt09:en-fr|0|0 -lighteval|wmt09:en-hu|0|0 -lighteval|wmt09:en-it|0|0 -lighteval|wmt09:es-en|0|0 -lighteval|wmt09:fr-en|0|0 -lighteval|wmt09:hu-en|0|0 -lighteval|wmt09:it-en|0|0 -lighteval|wmt10:cs-en|0|0 -lighteval|wmt10:de-en|0|0 -lighteval|wmt10:en-cs|0|0 -lighteval|wmt10:en-de|0|0 -lighteval|wmt10:en-es|0|0 -lighteval|wmt10:en-fr|0|0 -lighteval|wmt10:es-en|0|0 -lighteval|wmt10:fr-en|0|0 -lighteval|wmt11:cs-en|0|0 -lighteval|wmt11:de-en|0|0 -lighteval|wmt11:en-cs|0|0 -lighteval|wmt11:en-de|0|0 -lighteval|wmt11:en-es|0|0 -lighteval|wmt11:en-fr|0|0 -lighteval|wmt11:es-en|0|0 -lighteval|wmt11:fr-en|0|0 -lighteval|wmt12:cs-en|0|0 -lighteval|wmt12:de-en|0|0 -lighteval|wmt12:en-cs|0|0 -lighteval|wmt12:en-de|0|0 -lighteval|wmt12:en-es|0|0 -lighteval|wmt12:en-fr|0|0 -lighteval|wmt12:es-en|0|0 -lighteval|wmt12:fr-en|0|0 -lighteval|wmt13:cs-en|0|0 -lighteval|wmt13:de-en|0|0 -lighteval|wmt13:en-cs|0|0 -lighteval|wmt13:en-de|0|0 -lighteval|wmt13:en-es|0|0 -lighteval|wmt13:en-fr|0|0 -lighteval|wmt13:en-ru|0|0 -lighteval|wmt13:es-en|0|0 -lighteval|wmt13:fr-en|0|0 -lighteval|wmt13:ru-en|0|0 -lighteval|wmt14:cs-en|0|0 -lighteval|wmt14:de-en|0|0 -lighteval|wmt14:en-cs|0|0 -lighteval|wmt14:en-de|0|0 -lighteval|wmt14:en-fr|0|0 -lighteval|wmt14:en-fr|0|0 
-lighteval|wmt14:en-hi|0|0 -lighteval|wmt14:en-ru|0|0 -lighteval|wmt14:fr-en|0|0 -lighteval|wmt14:fr-en|0|0 -lighteval|wmt14:hi-en|0|0 -lighteval|wmt14:ru-en|0|0 -lighteval|wmt15:cs-en|0|0 -lighteval|wmt15:de-en|0|0 -lighteval|wmt15:en-cs|0|0 -lighteval|wmt15:en-de|0|0 -lighteval|wmt15:en-fi|0|0 -lighteval|wmt15:en-fr|0|0 -lighteval|wmt15:en-ru|0|0 -lighteval|wmt15:fi-en|0|0 -lighteval|wmt15:fr-en|0|0 -lighteval|wmt15:ru-en|0|0 -lighteval|wmt16:cs-en|0|0 -lighteval|wmt16:de-en|0|0 -lighteval|wmt16:de-en|0|0 -lighteval|wmt16:en-cs|0|0 -lighteval|wmt16:en-de|0|0 -lighteval|wmt16:en-de|0|0 -lighteval|wmt16:en-fi|0|0 -lighteval|wmt16:en-ro|0|0 -lighteval|wmt16:en-ro|0|0 -lighteval|wmt16:en-ru|0|0 -lighteval|wmt16:en-tr|0|0 -lighteval|wmt16:fi-en|0|0 -lighteval|wmt16:ro-en|0|0 -lighteval|wmt16:ro-en|0|0 -lighteval|wmt16:ru-en|0|0 -lighteval|wmt16:tr-en|0|0 -lighteval|wmt17:cs-en|0|0 -lighteval|wmt17:de-en|0|0 -lighteval|wmt17:en-cs|0|0 -lighteval|wmt17:en-de|0|0 -lighteval|wmt17:en-fi|0|0 -lighteval|wmt17:en-lv|0|0 -lighteval|wmt17:en-ru|0|0 -lighteval|wmt17:en-tr|0|0 -lighteval|wmt17:en-zh|0|0 -lighteval|wmt17:fi-en|0|0 -lighteval|wmt17:lv-en|0|0 -lighteval|wmt17:ru-en|0|0 -lighteval|wmt17:tr-en|0|0 -lighteval|wmt17:zh-en|0|0 -lighteval|wmt18:cs-en|0|0 -lighteval|wmt18:de-en|0|0 -lighteval|wmt18:en-cs|0|0 -lighteval|wmt18:en-de|0|0 -lighteval|wmt18:en-et|0|0 -lighteval|wmt18:en-fi|0|0 -lighteval|wmt18:en-ru|0|0 -lighteval|wmt18:en-tr|0|0 -lighteval|wmt18:en-zh|0|0 -lighteval|wmt18:et-en|0|0 -lighteval|wmt18:fi-en|0|0 -lighteval|wmt18:ru-en|0|0 -lighteval|wmt18:tr-en|0|0 -lighteval|wmt18:zh-en|0|0 -lighteval|wmt19:cs-de|0|0 -lighteval|wmt19:de-cs|0|0 -lighteval|wmt19:de-en|0|0 -lighteval|wmt19:de-fr|0|0 -lighteval|wmt19:en-cs|0|0 -lighteval|wmt19:en-de|0|0 -lighteval|wmt19:en-fi|0|0 -lighteval|wmt19:en-gu|0|0 -lighteval|wmt19:en-kk|0|0 -lighteval|wmt19:en-lt|0|0 -lighteval|wmt19:en-ru|0|0 -lighteval|wmt19:en-zh|0|0 -lighteval|wmt19:fi-en|0|0 -lighteval|wmt19:fr-de|0|0 -lighteval|wmt19:gu-en|0|0 -lighteval|wmt19:kk-en|0|0 -lighteval|wmt19:lt-en|0|0 -lighteval|wmt19:ru-en|0|0 -lighteval|wmt19:zh-en|0|0 -lighteval|wmt20:cs-en|0|0 -lighteval|wmt20:de-en|0|0 -lighteval|wmt20:de-fr|0|0 -lighteval|wmt20:en-cs|0|0 -lighteval|wmt20:en-de|0|0 -lighteval|wmt20:en-iu|0|0 -lighteval|wmt20:en-ja|0|0 -lighteval|wmt20:en-km|0|0 -lighteval|wmt20:en-pl|0|0 -lighteval|wmt20:en-ps|0|0 -lighteval|wmt20:en-ru|0|0 -lighteval|wmt20:en-ta|0|0 -lighteval|wmt20:en-zh|0|0 -lighteval|wmt20:fr-de|0|0 -lighteval|wmt20:iu-en|0|0 -lighteval|wmt20:ja-en|0|0 -lighteval|wmt20:km-en|0|0 -lighteval|wmt20:pl-en|0|0 -lighteval|wmt20:ps-en|0|0 -lighteval|wmt20:ru-en|0|0 -lighteval|wmt20:ta-en|0|0 -lighteval|wmt20:zh-en|0|0 -lighteval|wsc273|0|0 -lighteval|xcopa:en|0|0 -lighteval|xcopa:et|0|0 -lighteval|xcopa:ht|0|0 -lighteval|xcopa:id|0|0 -lighteval|xcopa:it|0|0 -lighteval|xcopa:qu|0|0 -lighteval|xcopa:sw|0|0 -lighteval|xcopa:ta|0|0 -lighteval|xcopa:th|0|0 -lighteval|xcopa:tr|0|0 -lighteval|xcopa:vi|0|0 -lighteval|xcopa:zh|0|0 -lighteval|xstory_cloze:ar|0|0 -lighteval|xstory_cloze:en|0|0 -lighteval|xstory_cloze:es|0|0 -lighteval|xstory_cloze:eu|0|0 -lighteval|xstory_cloze:hi|0|0 -lighteval|xstory_cloze:id|0|0 -lighteval|xstory_cloze:my|0|0 -lighteval|xstory_cloze:ru|0|0 -lighteval|xstory_cloze:sw|0|0 -lighteval|xstory_cloze:te|0|0 -lighteval|xstory_cloze:zh|0|0 -lighteval|xwinograd:en|0|0 -lighteval|xwinograd:fr|0|0 -lighteval|xwinograd:jp|0|0 -lighteval|xwinograd:pt|0|0 -lighteval|xwinograd:ru|0|0 -lighteval|xwinograd:zh|0|0 
-original|arc:c:letters|0|0 -original|arc:c:options|0|0 -original|arc:c:simple|0|0 -original|mmlu:abstract_algebra|0|0 -original|mmlu:anatomy|0|0 -original|mmlu:astronomy|0|0 -original|mmlu:business_ethics|0|0 -original|mmlu:clinical_knowledge|0|0 -original|mmlu:college_biology|0|0 -original|mmlu:college_chemistry|0|0 -original|mmlu:college_computer_science|0|0 -original|mmlu:college_mathematics|0|0 -original|mmlu:college_medicine|0|0 -original|mmlu:college_physics|0|0 -original|mmlu:computer_security|0|0 -original|mmlu:conceptual_physics|0|0 -original|mmlu:econometrics|0|0 -original|mmlu:electrical_engineering|0|0 -original|mmlu:elementary_mathematics|0|0 -original|mmlu:formal_logic|0|0 -original|mmlu:global_facts|0|0 -original|mmlu:high_school_biology|0|0 -original|mmlu:high_school_chemistry|0|0 -original|mmlu:high_school_computer_science|0|0 -original|mmlu:high_school_european_history|0|0 -original|mmlu:high_school_geography|0|0 -original|mmlu:high_school_government_and_politics|0|0 -original|mmlu:high_school_macroeconomics|0|0 -original|mmlu:high_school_mathematics|0|0 -original|mmlu:high_school_microeconomics|0|0 -original|mmlu:high_school_physics|0|0 -original|mmlu:high_school_psychology|0|0 -original|mmlu:high_school_statistics|0|0 -original|mmlu:high_school_us_history|0|0 -original|mmlu:high_school_world_history|0|0 -original|mmlu:human_aging|0|0 -original|mmlu:human_sexuality|0|0 -original|mmlu:international_law|0|0 -original|mmlu:jurisprudence|0|0 -original|mmlu:logical_fallacies|0|0 -original|mmlu:machine_learning|0|0 -original|mmlu:management|0|0 -original|mmlu:marketing|0|0 -original|mmlu:medical_genetics|0|0 -original|mmlu:miscellaneous|0|0 -original|mmlu:moral_disputes|0|0 -original|mmlu:moral_scenarios|0|0 -original|mmlu:nutrition|0|0 -original|mmlu:philosophy|0|0 -original|mmlu:prehistory|0|0 -original|mmlu:professional_accounting|0|0 -original|mmlu:professional_law|0|0 -original|mmlu:professional_medicine|0|0 -original|mmlu:professional_psychology|0|0 -original|mmlu:public_relations|0|0 -original|mmlu:security_studies|0|0 -original|mmlu:sociology|0|0 -original|mmlu:us_foreign_policy|0|0 -original|mmlu:virology|0|0 -original|mmlu:world_religions|0|0 -original|mmlu|0|0 +bigbench|abstract_narrative_understanding|0 +bigbench|anachronisms|0 +bigbench|analogical_similarity|0 +bigbench|analytic_entailment|0 +bigbench|arithmetic_bb|0 +bigbench|ascii_word_recognition|0 +bigbench|authorship_verification|0 +bigbench|auto_categorization|0 +bigbench|auto_debugging|0 +bigbench|bbq_lite_json|0 +bigbench|bridging_anaphora_resolution_barqa|0 +bigbench|causal_judgment|0 +bigbench|cause_and_effect|0 +bigbench|checkmate_in_one|0 +bigbench|chess_state_tracking|0 +bigbench|chinese_remainder_theorem|0 +bigbench|cifar10_classification|0 +bigbench|code_line_description|0 +bigbench|codenames|0 +bigbench|color|0 +bigbench|common_morpheme|0 +bigbench|conceptual_combinations|0 +bigbench|conlang_translation|0 +bigbench|contextual_parametric_knowledge_conflicts|0 +bigbench|crash_blossom|0 +bigbench|crass_ai|0 +bigbench|cryobiology_spanish|0 +bigbench|cryptonite|0 +bigbench|cs_algorithms|0 +bigbench|dark_humor_detection|0 +bigbench|date_understanding|0 +bigbench|disambiguation_qa|0 +bigbench|discourse_marker_prediction|0 +bigbench|disfl_qa|0 +bigbench|dyck_languages|0 +bigbench|elementary_math_qa|0 +bigbench|emoji_movie|0 +bigbench|emojis_emotion_prediction|0 +bigbench|empirical_judgments|0 +bigbench|english_proverbs|0 +bigbench|english_russian_proverbs|0 +bigbench|entailed_polarity_hindi|0 
+bigbench|entailed_polarity|0 +bigbench|epistemic_reasoning|0 +bigbench|evaluating_information_essentiality|0 +bigbench|fact_checker|0 +bigbench|fantasy_reasoning|0 +bigbench|few_shot_nlg|0 +bigbench|figure_of_speech_detection|0 +bigbench|formal_fallacies_syllogisms_negation|0 +bigbench|gem|0 +bigbench|gender_inclusive_sentences_german|0 +bigbench|general_knowledge|0 +bigbench|geometric_shapes|0 +bigbench|goal_step_wikihow|0 +bigbench|gre_reading_comprehension|0 +bigbench|hhh_alignment|0 +bigbench|hindi_question_answering|0 +bigbench|hindu_knowledge|0 +bigbench|hinglish_toxicity|0 +bigbench|human_organs_senses|0 +bigbench|hyperbaton|0 +bigbench|identify_math_theorems|0 +bigbench|identify_odd_metaphor|0 +bigbench|implicatures|0 +bigbench|implicit_relations|0 +bigbench|intent_recognition|0 +bigbench|international_phonetic_alphabet_nli|0 +bigbench|international_phonetic_alphabet_transliterate|0 +bigbench|intersect_geometry|0 +bigbench|irony_identification|0 +bigbench|kanji_ascii|0 +bigbench|kannada|0 +bigbench|key_value_maps|0 +bigbench|known_unknowns|0 +bigbench|language_games|0 +bigbench|language_identification|0 +bigbench|linguistic_mappings|0 +bigbench|linguistics_puzzles|0 +bigbench|logic_grid_puzzle|0 +bigbench|logical_args|0 +bigbench|logical_deduction|0 +bigbench|logical_fallacy_detection|0 +bigbench|logical_sequence|0 +bigbench|mathematical_induction|0 +bigbench|matrixshapes|0 +bigbench|metaphor_boolean|0 +bigbench|metaphor_understanding|0 +bigbench|minute_mysteries_qa|0 +bigbench|misconceptions_russian|0 +bigbench|misconceptions|0 +bigbench|mnist_ascii|0 +bigbench|modified_arithmetic|0 +bigbench|moral_permissibility|0 +bigbench|movie_dialog_same_or_different|0 +bigbench|movie_recommendation|0 +bigbench|mult_data_wrangling|0 +bigbench|multiemo|0 +bigbench|natural_instructions|0 +bigbench|navigate|0 +bigbench|nonsense_words_grammar|0 +bigbench|novel_concepts|0 +bigbench|object_counting|0 +bigbench|odd_one_out|0 +bigbench|operators|0 +bigbench|paragraph_segmentation|0 +bigbench|parsinlu_qa|0 +bigbench|parsinlu_reading_comprehension|0 +bigbench|penguins_in_a_table|0 +bigbench|periodic_elements|0 +bigbench|persian_idioms|0 +bigbench|phrase_relatedness|0 +bigbench|physical_intuition|0 +bigbench|physics_questions|0 +bigbench|physics|0 +bigbench|play_dialog_same_or_different|0 +bigbench|polish_sequence_labeling|0 +bigbench|presuppositions_as_nli|0 +bigbench|qa_wikidata|0 +bigbench|question_selection|0 +bigbench|real_or_fake_text|0 +bigbench|reasoning_about_colored_objects|0 +bigbench|repeat_copy_logic|0 +bigbench|rephrase|0 +bigbench|rhyming|0 +bigbench|riddle_sense|0 +bigbench|ruin_names|0 +bigbench|salient_translation_error_detection|0 +bigbench|scientific_press_release|0 +bigbench|semantic_parsing_in_context_sparc|0 +bigbench|semantic_parsing_spider|0 +bigbench|sentence_ambiguity|0 +bigbench|similarities_abstraction|0 +bigbench|simp_turing_concept|0 +bigbench|simple_arithmetic_json_multiple_choice|0 +bigbench|simple_arithmetic_json_subtasks|0 +bigbench|simple_arithmetic_json|0 +bigbench|simple_arithmetic_multiple_targets_json|0 +bigbench|simple_ethical_questions|0 +bigbench|simple_text_editing|0 +bigbench|snarks|0 +bigbench|social_iqa|0 +bigbench|social_support|0 +bigbench|sports_understanding|0 +bigbench|strange_stories|0 +bigbench|strategyqa|0 +bigbench|sufficient_information|0 +bigbench|suicide_risk|0 +bigbench|swahili_english_proverbs|0 +bigbench|swedish_to_german_proverbs|0 +bigbench|symbol_interpretation|0 +bigbench|tellmewhy|0 +bigbench|temporal_sequences|0 +bigbench|tense|0 
+bigbench|timedial|0 +bigbench|topical_chat|0 +bigbench|tracking_shuffled_objects|0 +bigbench|understanding_fables|0 +bigbench|undo_permutation|0 +bigbench|unit_conversion|0 +bigbench|unit_interpretation|0 +bigbench|unnatural_in_context_learning|0 +bigbench|vitaminc_fact_verification|0 +bigbench|what_is_the_tao|0 +bigbench|which_wiki_edit|0 +bigbench|wino_x_german|0 +bigbench|winowhy|0 +bigbench|word_sorting|0 +bigbench|word_unscrambling|0 +helm|babi_qa|0 +helm|bbq:Age|0 +helm|bbq:Disability_status|0 +helm|bbq:Gender_identity|0 +helm|bbq:Nationality|0 +helm|bbq:Physical_appearance|0 +helm|bbq:Race_ethnicity|0 +helm|bbq:Race_x_SES|0 +helm|bbq:Race_x_gender|0 +helm|bbq:Religion|0 +helm|bbq:SES|0 +helm|bbq:Sexual_orientation|0 +helm|bbq|0 +helm|bigbench:auto_debugging|0 +helm|bigbench:bbq_lite_json:age_ambig|0 +helm|bigbench:bbq_lite_json:age_disambig|0 +helm|bigbench:bbq_lite_json:disability_status_ambig|0 +helm|bigbench:bbq_lite_json:disability_status_disambig|0 +helm|bigbench:bbq_lite_json:gender_identity_ambig|0 +helm|bigbench:bbq_lite_json:gender_identity_disambig|0 +helm|bigbench:bbq_lite_json:nationality_ambig|0 +helm|bigbench:bbq_lite_json:nationality_disambig|0 +helm|bigbench:bbq_lite_json:physical_appearance_ambig|0 +helm|bigbench:bbq_lite_json:physical_appearance_disambig|0 +helm|bigbench:bbq_lite_json:race_ethnicity_ambig|0 +helm|bigbench:bbq_lite_json:race_ethnicity_disambig|0 +helm|bigbench:bbq_lite_json:religion_ambig|0 +helm|bigbench:bbq_lite_json:religion_disambig|0 +helm|bigbench:bbq_lite_json:ses_ambig|0 +helm|bigbench:bbq_lite_json:ses_disambig|0 +helm|bigbench:bbq_lite_json:sexual_orientation_ambig|0 +helm|bigbench:bbq_lite_json:sexual_orientation_disambig|0 +helm|bigbench:code_line_description|0 +helm|bigbench:conceptual_combinations:contradictions|0 +helm|bigbench:conceptual_combinations:emergent_properties|0 +helm|bigbench:conceptual_combinations:fanciful_fictional_combinations|0 +helm|bigbench:conceptual_combinations:homonyms|0 +helm|bigbench:conceptual_combinations:invented_words|0 +helm|bigbench:conlang_translation:adna_from|0 +helm|bigbench:conlang_translation:adna_to|0 +helm|bigbench:conlang_translation:atikampe_from|0 +helm|bigbench:conlang_translation:atikampe_to|0 +helm|bigbench:conlang_translation:gornam_from|0 +helm|bigbench:conlang_translation:gornam_to|0 +helm|bigbench:conlang_translation:holuan_from|0 +helm|bigbench:conlang_translation:holuan_to|0 +helm|bigbench:conlang_translation:mkafala_from|0 +helm|bigbench:conlang_translation:mkafala_to|0 +helm|bigbench:conlang_translation:postpositive_english_from|0 +helm|bigbench:conlang_translation:postpositive_english_to|0 +helm|bigbench:conlang_translation:unapuri_from|0 +helm|bigbench:conlang_translation:unapuri_to|0 +helm|bigbench:conlang_translation:vaomi_from|0 +helm|bigbench:conlang_translation:vaomi_to|0 +helm|bigbench:emoji_movie|0 +helm|bigbench:formal_fallacies_syllogisms_negation|0 +helm|bigbench:hindu_knowledge|0 +helm|bigbench:known_unknowns|0 +helm|bigbench:language_identification|0 +helm|bigbench:linguistics_puzzles|0 +helm|bigbench:logic_grid_puzzle|0 +helm|bigbench:logical_deduction-five_objects|0 +helm|bigbench:logical_deduction-seven_objects|0 +helm|bigbench:logical_deduction-three_objects|0 +helm|bigbench:misconceptions_russian|0 +helm|bigbench:novel_concepts|0 +helm|bigbench:operators|0 +helm|bigbench:parsinlu_reading_comprehension|0 +helm|bigbench:play_dialog_same_or_different|0 +helm|bigbench:repeat_copy_logic|0 +helm|bigbench:strange_stories-boolean|0 
+helm|bigbench:strange_stories-multiple_choice|0 +helm|bigbench:strategyqa|0 +helm|bigbench:symbol_interpretation-adversarial|0 +helm|bigbench:symbol_interpretation-emoji_agnostic|0 +helm|bigbench:symbol_interpretation-name_agnostic|0 +helm|bigbench:symbol_interpretation-plain|0 +helm|bigbench:symbol_interpretation-tricky|0 +helm|bigbench:vitaminc_fact_verification|0 +helm|bigbench:winowhy|0 +helm|blimp:adjunct_island|0 +helm|blimp:anaphor_gender_agreement|0 +helm|blimp:anaphor_number_agreement|0 +helm|blimp:animate_subject_passive|0 +helm|blimp:animate_subject_trans|0 +helm|blimp:causative|0 +helm|blimp:complex_NP_island|0 +helm|blimp:coordinate_structure_constraint_complex_left_branch|0 +helm|blimp:coordinate_structure_constraint_object_extraction|0 +helm|blimp:determiner_noun_agreement_1|0 +helm|blimp:determiner_noun_agreement_2|0 +helm|blimp:determiner_noun_agreement_irregular_1|0 +helm|blimp:determiner_noun_agreement_irregular_2|0 +helm|blimp:determiner_noun_agreement_with_adj_2|0 +helm|blimp:determiner_noun_agreement_with_adj_irregular_1|0 +helm|blimp:determiner_noun_agreement_with_adj_irregular_2|0 +helm|blimp:determiner_noun_agreement_with_adjective_1|0 +helm|blimp:distractor_agreement_relational_noun|0 +helm|blimp:distractor_agreement_relative_clause|0 +helm|blimp:drop_argument|0 +helm|blimp:ellipsis_n_bar_1|0 +helm|blimp:ellipsis_n_bar_2|0 +helm|blimp:existential_there_object_raising|0 +helm|blimp:existential_there_quantifiers_1|0 +helm|blimp:existential_there_quantifiers_2|0 +helm|blimp:existential_there_subject_raising|0 +helm|blimp:expletive_it_object_raising|0 +helm|blimp:inchoative|0 +helm|blimp:intransitive|0 +helm|blimp:irregular_past_participle_adjectives|0 +helm|blimp:irregular_past_participle_verbs|0 +helm|blimp:irregular_plural_subject_verb_agreement_1|0 +helm|blimp:irregular_plural_subject_verb_agreement_2|0 +helm|blimp:left_branch_island_echo_question|0 +helm|blimp:left_branch_island_simple_question|0 +helm|blimp:matrix_question_npi_licensor_present|0 +helm|blimp:npi_present_1|0 +helm|blimp:npi_present_2|0 +helm|blimp:only_npi_licensor_present|0 +helm|blimp:only_npi_scope|0 +helm|blimp:passive_1|0 +helm|blimp:passive_2|0 +helm|blimp:principle_A_c_command|0 +helm|blimp:principle_A_case_1|0 +helm|blimp:principle_A_case_2|0 +helm|blimp:principle_A_domain_1|0 +helm|blimp:principle_A_domain_2|0 +helm|blimp:principle_A_domain_3|0 +helm|blimp:principle_A_reconstruction|0 +helm|blimp:regular_plural_subject_verb_agreement_1|0 +helm|blimp:regular_plural_subject_verb_agreement_2|0 +helm|blimp:sentential_negation_npi_licensor_present|0 +helm|blimp:sentential_negation_npi_scope|0 +helm|blimp:sentential_subject_island|0 +helm|blimp:superlative_quantifiers_1|0 +helm|blimp:superlative_quantifiers_2|0 +helm|blimp:tough_vs_raising_1|0 +helm|blimp:tough_vs_raising_2|0 +helm|blimp:transitive|0 +helm|blimp:wh_island|0 +helm|blimp:wh_questions_object_gap|0 +helm|blimp:wh_questions_subject_gap_long_distance|0 +helm|blimp:wh_questions_subject_gap|0 +helm|blimp:wh_vs_that_no_gap_long_distance|0 +helm|blimp:wh_vs_that_no_gap|0 +helm|blimp:wh_vs_that_with_gap_long_distance|0 +helm|blimp:wh_vs_that_with_gap|0 +helm|bold:gender|0 +helm|bold:political_ideology|0 +helm|bold:profession|0 +helm|bold:race|0 +helm|bold:religious_ideology|0 +helm|bold|0 +helm|boolq:contrastset|0 +helm|boolq|0 +helm|civil_comments:LGBTQ|0 +helm|civil_comments:black|0 +helm|civil_comments:christian|0 +helm|civil_comments:female|0 +helm|civil_comments:male|0 +helm|civil_comments:muslim|0 
+helm|civil_comments:other_religions|0 +helm|civil_comments:white|0 +helm|civil_comments|0 +helm|commonsenseqa|0 +helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_125|0 +helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_25|0 +helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_5|0 +helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_125|0 +helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_25|0 +helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_5|0 +helm|copyright:oh_the_places|0 +helm|copyright:pilot|0 +helm|copyright:popular_books-prefix_length_10|0 +helm|copyright:popular_books-prefix_length_125|0 +helm|copyright:popular_books-prefix_length_250|0 +helm|copyright:popular_books-prefix_length_25|0 +helm|copyright:popular_books-prefix_length_50|0 +helm|copyright:popular_books-prefix_length_5|0 +helm|copyright:prompt_num_line_1-min_lines_20|0 +helm|copyright:prompt_num_line_10-min_lines_20|0 +helm|copyright:prompt_num_line_5-min_lines_20|0 +helm|covid_dialogue|0 +helm|dyck_language:2|0 +helm|dyck_language:3|0 +helm|dyck_language:4|0 +helm|entity_data_imputation:Buy|0 +helm|entity_data_imputation:Restaurant|0 +helm|entity_matching:Abt_Buy|0 +helm|entity_matching:Amazon_Google|0 +helm|entity_matching:Beer|0 +helm|entity_matching:Company|0 +helm|entity_matching:DBLP_ACM|0 +helm|entity_matching:DBLP_GoogleScholar|0 +helm|entity_matching:Dirty_DBLP_ACM|0 +helm|entity_matching:Dirty_DBLP_GoogleScholar|0 +helm|entity_matching:Dirty_Walmart_Amazon|0 +helm|entity_matching:Dirty_iTunes_Amazon|0 +helm|entity_matching:Fodors_Zagats|0 +helm|entity_matching:Walmart_Amazon|0 +helm|entity_matching:iTunes_Amazon|0 +helm|hellaswag|0 +helm|humaneval|0 +helm|imdb:contrastset|0 +helm|imdb|0 +helm|interactive_qa_mmlu:abstract_algebra|0 +helm|interactive_qa_mmlu:college_chemistry|0 +helm|interactive_qa_mmlu:global_facts|0 +helm|interactive_qa_mmlu:miscellaneous|0 +helm|interactive_qa_mmlu:nutrition|0 +helm|interactive_qa_mmlu:us_foreign_policy|0 +helm|legal_summarization:billsum|0 +helm|legal_summarization:eurlexsum|0 +helm|legal_summarization:multilexsum|0 +helm|legalsupport|0 +helm|lexglue:case_hold|0 +helm|lexglue:ecthr_a|0 +helm|lexglue:ecthr_b|0 +helm|lexglue:eurlex|0 +helm|lexglue:ledgar|0 +helm|lexglue:scotus|0 +helm|lexglue:unfair_tos|0 +helm|lextreme:brazilian_court_decisions_judgment|0 +helm|lextreme:brazilian_court_decisions_unanimity|0 +helm|lextreme:covid19_emergency_event|0 +helm|lextreme:german_argument_mining|0 +helm|lextreme:greek_legal_code_chapter|0 +helm|lextreme:greek_legal_code_subject|0 +helm|lextreme:greek_legal_code_volume|0 +helm|lextreme:greek_legal_ner|0 +helm|lextreme:legalnero|0 +helm|lextreme:lener_br|0 +helm|lextreme:mapa_coarse|0 +helm|lextreme:mapa_fine|0 +helm|lextreme:multi_eurlex_level_1|0 +helm|lextreme:multi_eurlex_level_2|0 +helm|lextreme:multi_eurlex_level_3|0 +helm|lextreme:online_terms_of_service_clause_topics|0 +helm|lextreme:online_terms_of_service_unfairness_levels|0 +helm|lextreme:swiss_judgment_prediction|0 +helm|lsat_qa:assignment|0 +helm|lsat_qa:grouping|0 +helm|lsat_qa:miscellaneous|0 +helm|lsat_qa:ordering|0 +helm|lsat_qa|0 +helm|me_q_sum|0 +helm|med_dialog:healthcaremagic|0 +helm|med_dialog:icliniq|0 +helm|med_mcqa|0 +helm|med_paragraph_simplification|0 +helm|med_qa|0 +helm|mmlu:abstract_algebra|0 +helm|mmlu:anatomy|0 +helm|mmlu:astronomy|0 +helm|mmlu:business_ethics|0 +helm|mmlu:clinical_knowledge|0 +helm|mmlu:college_biology|0 +helm|mmlu:college_chemistry|0 
+helm|mmlu:college_computer_science|0 +helm|mmlu:college_mathematics|0 +helm|mmlu:college_medicine|0 +helm|mmlu:college_physics|0 +helm|mmlu:computer_security|0 +helm|mmlu:conceptual_physics|0 +helm|mmlu:econometrics|0 +helm|mmlu:electrical_engineering|0 +helm|mmlu:elementary_mathematics|0 +helm|mmlu:formal_logic|0 +helm|mmlu:global_facts|0 +helm|mmlu:high_school_biology|0 +helm|mmlu:high_school_chemistry|0 +helm|mmlu:high_school_computer_science|0 +helm|mmlu:high_school_european_history|0 +helm|mmlu:high_school_geography|0 +helm|mmlu:high_school_government_and_politics|0 +helm|mmlu:high_school_macroeconomics|0 +helm|mmlu:high_school_mathematics|0 +helm|mmlu:high_school_microeconomics|0 +helm|mmlu:high_school_physics|0 +helm|mmlu:high_school_psychology|0 +helm|mmlu:high_school_statistics|0 +helm|mmlu:high_school_us_history|0 +helm|mmlu:high_school_world_history|0 +helm|mmlu:human_aging|0 +helm|mmlu:human_sexuality|0 +helm|mmlu:international_law|0 +helm|mmlu:jurisprudence|0 +helm|mmlu:logical_fallacies|0 +helm|mmlu:machine_learning|0 +helm|mmlu:management|0 +helm|mmlu:marketing|0 +helm|mmlu:medical_genetics|0 +helm|mmlu:miscellaneous|0 +helm|mmlu:moral_disputes|0 +helm|mmlu:moral_scenarios|0 +helm|mmlu:nutrition|0 +helm|mmlu:philosophy|0 +helm|mmlu:prehistory|0 +helm|mmlu:professional_accounting|0 +helm|mmlu:professional_law|0 +helm|mmlu:professional_medicine|0 +helm|mmlu:professional_psychology|0 +helm|mmlu:public_relations|0 +helm|mmlu:security_studies|0 +helm|mmlu:sociology|0 +helm|mmlu:us_foreign_policy|0 +helm|mmlu:virology|0 +helm|mmlu:world_religions|0 +helm|mmlu|0 +helm|narrativeqa|0 +helm|numeracy:linear_example|0 +helm|numeracy:linear_standard|0 +helm|numeracy:parabola_example|0 +helm|numeracy:parabola_standard|0 +helm|numeracy:paraboloid_example|0 +helm|numeracy:paraboloid_standard|0 +helm|numeracy:plane_example|0 +helm|numeracy:plane_standard|0 +helm|openbookqa|0 +helm|piqa|0 +helm|pubmedqa|0 +helm|quac|0 +helm|raft:ade_corpus_v2|0 +helm|raft:banking_77|0 +helm|raft:neurips_impact_statement_risks|0 +helm|raft:one_stop_english|0 +helm|raft:overruling|0 +helm|raft:semiconductor_org_types|0 +helm|raft:systematic_review_inclusion|0 +helm|raft:tai_safety_research|0 +helm|raft:terms_of_service|0 +helm|raft:tweet_eval_hate|0 +helm|raft:twitter_complaints|0 +helm|real_toxicity_prompts|0 +helm|siqa|0 +helm|summarization:cnn-dm|0 +helm|summarization:xsum-sampled|0 +helm|summarization:xsum|0 +helm|synthetic_reasoning:induction|0 +helm|synthetic_reasoning:natural_easy|0 +helm|synthetic_reasoning:natural_hard|0 +helm|synthetic_reasoning:pattern_match|0 +helm|synthetic_reasoning:variable_substitution|0 +helm|the_pile:arxiv|0 +helm|the_pile:bibliotik|0 +helm|the_pile:commoncrawl|0 +helm|the_pile:dm-mathematics|0 +helm|the_pile:enron|0 +helm|the_pile:europarl|0 +helm|the_pile:freelaw|0 +helm|the_pile:github|0 +helm|the_pile:gutenberg|0 +helm|the_pile:hackernews|0 +helm|the_pile:nih-exporter|0 +helm|the_pile:opensubtitles|0 +helm|the_pile:openwebtext2|0 +helm|the_pile:pubmed-abstracts|0 +helm|the_pile:pubmed-central|0 +helm|the_pile:stackexchange|0 +helm|the_pile:upsto|0 +helm|the_pile:wikipedia|0 +helm|the_pile:youtubesubtitles|0 +helm|truthfulqa|0 +helm|twitterAAE:aa|0 +helm|twitterAAE:white|0 +helm|wikifact:applies_to_jurisdiction|0 +helm|wikifact:atomic_number|0 +helm|wikifact:author|0 +helm|wikifact:award_received|0 +helm|wikifact:basic_form_of_government|0 +helm|wikifact:capital_of|0 +helm|wikifact:capital|0 +helm|wikifact:central_bank|0 +helm|wikifact:composer|0 +helm|wikifact:continent|0 
+helm|wikifact:country_of_citizenship|0 +helm|wikifact:country_of_origin|0 +helm|wikifact:country|0 +helm|wikifact:creator|0 +helm|wikifact:currency|0 +helm|wikifact:defendant|0 +helm|wikifact:developer|0 +helm|wikifact:diplomatic_relation|0 +helm|wikifact:director|0 +helm|wikifact:discoverer_or_inventor|0 +helm|wikifact:drug_or_therapy_used_for_treatment|0 +helm|wikifact:educated_at|0 +helm|wikifact:electron_configuration|0 +helm|wikifact:employer|0 +helm|wikifact:field_of_work|0 +helm|wikifact:file_extension|0 +helm|wikifact:genetic_association|0 +helm|wikifact:genre|0 +helm|wikifact:has_part|0 +helm|wikifact:head_of_government|0 +helm|wikifact:head_of_state|0 +helm|wikifact:headquarters_location|0 +helm|wikifact:industry|0 +helm|wikifact:influenced_by|0 +helm|wikifact:instance_of|0 +helm|wikifact:instrument|0 +helm|wikifact:language_of_work_or_name|0 +helm|wikifact:languages_spoken_written_or_signed|0 +helm|wikifact:laws_applied|0 +helm|wikifact:located_in_the_administrative_territorial_entity|0 +helm|wikifact:location_of_discovery|0 +helm|wikifact:location_of_formation|0 +helm|wikifact:location|0 +helm|wikifact:majority_opinion_by|0 +helm|wikifact:manufacturer|0 +helm|wikifact:measured_physical_quantity|0 +helm|wikifact:medical_condition_treated|0 +helm|wikifact:member_of_political_party|0 +helm|wikifact:member_of_sports_team|0 +helm|wikifact:member_of|0 +helm|wikifact:movement|0 +helm|wikifact:named_after|0 +helm|wikifact:native_language|0 +helm|wikifact:number_of_processor_cores|0 +helm|wikifact:occupation|0 +helm|wikifact:office_held_by_head_of_government|0 +helm|wikifact:office_held_by_head_of_state|0 +helm|wikifact:official_language|0 +helm|wikifact:operating_system|0 +helm|wikifact:original_language_of_film_or_TV_show|0 +helm|wikifact:original_network|0 +helm|wikifact:overrules|0 +helm|wikifact:owned_by|0 +helm|wikifact:part_of|0 +helm|wikifact:participating_team|0 +helm|wikifact:place_of_birth|0 +helm|wikifact:place_of_death|0 +helm|wikifact:plaintiff|0 +helm|wikifact:position_held|0 +helm|wikifact:position_played_on_team|0 +helm|wikifact:programming_language|0 +helm|wikifact:recommended_unit_of_measurement|0 +helm|wikifact:record_label|0 +helm|wikifact:religion|0 +helm|wikifact:repealed_by|0 +helm|wikifact:shares_border_with|0 +helm|wikifact:solved_by|0 +helm|wikifact:statement_describes|0 +helm|wikifact:stock_exchange|0 +helm|wikifact:subclass_of|0 +helm|wikifact:subsidiary|0 +helm|wikifact:symptoms_and_signs|0 +helm|wikifact:therapeutic_area|0 +helm|wikifact:time_of_discovery_or_invention|0 +helm|wikifact:twinned_administrative_body|0 +helm|wikifact:work_location|0 +helm|wikitext:103|0 +helm|wmt14:cs-en|0 +helm|wmt14:de-en|0 +helm|wmt14:fr-en|0 +helm|wmt14:hi-en|0 +helm|wmt14:ru-en|0 +lighteval|anli:r1|0 +lighteval|anli:r2|0 +lighteval|anli:r3|0 +lighteval|anli|0 +leaderboard|arc:challenge|0 +lighteval|arc:easy|0 +lighteval|arithmetic:1dc|0 +lighteval|arithmetic:2da|0 +lighteval|arithmetic:2dm|0 +lighteval|arithmetic:2ds|0 +lighteval|arithmetic:3da|0 +lighteval|arithmetic:3ds|0 +lighteval|arithmetic:4da|0 +lighteval|arithmetic:4ds|0 +lighteval|arithmetic:5da|0 +lighteval|arithmetic:5ds|0 +lighteval|asdiv|0 +lighteval|blimp:adjunct_island|0 +lighteval|blimp:anaphor_gender_agreement|0 +lighteval|blimp:anaphor_number_agreement|0 +lighteval|blimp:animate_subject_passive|0 +lighteval|blimp:animate_subject_trans|0 +lighteval|blimp:causative|0 +lighteval|blimp:complex_NP_island|0 +lighteval|blimp:coordinate_structure_constraint_complex_left_branch|0 
+lighteval|blimp:coordinate_structure_constraint_object_extraction|0 +lighteval|blimp:determiner_noun_agreement_1|0 +lighteval|blimp:determiner_noun_agreement_2|0 +lighteval|blimp:determiner_noun_agreement_irregular_1|0 +lighteval|blimp:determiner_noun_agreement_irregular_2|0 +lighteval|blimp:determiner_noun_agreement_with_adj_2|0 +lighteval|blimp:determiner_noun_agreement_with_adj_irregular_1|0 +lighteval|blimp:determiner_noun_agreement_with_adj_irregular_2|0 +lighteval|blimp:determiner_noun_agreement_with_adjective_1|0 +lighteval|blimp:distractor_agreement_relational_noun|0 +lighteval|blimp:distractor_agreement_relative_clause|0 +lighteval|blimp:drop_argument|0 +lighteval|blimp:ellipsis_n_bar_1|0 +lighteval|blimp:ellipsis_n_bar_2|0 +lighteval|blimp:existential_there_object_raising|0 +lighteval|blimp:existential_there_quantifiers_1|0 +lighteval|blimp:existential_there_quantifiers_2|0 +lighteval|blimp:existential_there_subject_raising|0 +lighteval|blimp:expletive_it_object_raising|0 +lighteval|blimp:inchoative|0 +lighteval|blimp:intransitive|0 +lighteval|blimp:irregular_past_participle_adjectives|0 +lighteval|blimp:irregular_past_participle_verbs|0 +lighteval|blimp:irregular_plural_subject_verb_agreement_1|0 +lighteval|blimp:irregular_plural_subject_verb_agreement_2|0 +lighteval|blimp:left_branch_island_echo_question|0 +lighteval|blimp:left_branch_island_simple_question|0 +lighteval|blimp:matrix_question_npi_licensor_present|0 +lighteval|blimp:npi_present_1|0 +lighteval|blimp:npi_present_2|0 +lighteval|blimp:only_npi_licensor_present|0 +lighteval|blimp:only_npi_scope|0 +lighteval|blimp:passive_1|0 +lighteval|blimp:passive_2|0 +lighteval|blimp:principle_A_c_command|0 +lighteval|blimp:principle_A_case_1|0 +lighteval|blimp:principle_A_case_2|0 +lighteval|blimp:principle_A_domain_1|0 +lighteval|blimp:principle_A_domain_2|0 +lighteval|blimp:principle_A_domain_3|0 +lighteval|blimp:principle_A_reconstruction|0 +lighteval|blimp:regular_plural_subject_verb_agreement_1|0 +lighteval|blimp:regular_plural_subject_verb_agreement_2|0 +lighteval|blimp:sentential_negation_npi_licensor_present|0 +lighteval|blimp:sentential_negation_npi_scope|0 +lighteval|blimp:sentential_subject_island|0 +lighteval|blimp:superlative_quantifiers_1|0 +lighteval|blimp:superlative_quantifiers_2|0 +lighteval|blimp:tough_vs_raising_1|0 +lighteval|blimp:tough_vs_raising_2|0 +lighteval|blimp:transitive|0 +lighteval|blimp:wh_island|0 +lighteval|blimp:wh_questions_object_gap|0 +lighteval|blimp:wh_questions_subject_gap_long_distance|0 +lighteval|blimp:wh_questions_subject_gap|0 +lighteval|blimp:wh_vs_that_no_gap_long_distance|0 +lighteval|blimp:wh_vs_that_no_gap|0 +lighteval|blimp:wh_vs_that_with_gap_long_distance|0 +lighteval|blimp:wh_vs_that_with_gap|0 +lighteval|coqa_bb|0 +lighteval|coqa|0 +lighteval|drop|0 +lighteval|ethics:commonsense|0 +lighteval|ethics:deontology|0 +lighteval|ethics:justice|0 +lighteval|ethics:utilitarianism|0 +lighteval|ethics:virtue|0 +lighteval|glue:cola|0 +lighteval|glue:mnli_mismatched|0 +lighteval|glue:mnli|0 +lighteval|glue:mrpc|0 +lighteval|glue:qnli|0 +lighteval|glue:qqp|0 +lighteval|glue:rte|0 +lighteval|glue:sst2|0 +lighteval|glue:stsb|0 +lighteval|glue:wnli|0 +leaderboard|gsm8k|0 +lighteval|headqa:en|0 +lighteval|headqa:es|0 +leaderboard|hellaswag|0 +lighteval|iwslt17:ar-en|0 +lighteval|iwslt17:de-en|0 +lighteval|iwslt17:en-ar|0 +lighteval|iwslt17:en-de|0 +lighteval|iwslt17:en-fr|0 +lighteval|iwslt17:en-ja|0 +lighteval|iwslt17:en-ko|0 +lighteval|iwslt17:en-zh|0 +lighteval|iwslt17:fr-en|0 
+lighteval|iwslt17:ja-en|0 +lighteval|iwslt17:ko-en|0 +lighteval|iwslt17:zh-en|0 +lighteval|lambada:openai:de|0 +lighteval|lambada:openai:en|0 +lighteval|lambada:openai:es|0 +lighteval|lambada:openai:fr|0 +lighteval|lambada:openai:it|0 +lighteval|lambada:openai_cloze|0 +lighteval|lambada:openai|0 +lighteval|lambada:standard_cloze|0 +lighteval|lambada:standard|0 +lighteval|logiqa|0 +lighteval|math:algebra|0 +lighteval|math:counting_and_probability|0 +lighteval|math:geometry|0 +lighteval|math:intermediate_algebra|0 +lighteval|math:number_theory|0 +lighteval|math:prealgebra|0 +lighteval|math:precalculus|0 +lighteval|mathqa|0 +lighteval|mgsm:bn|0 +lighteval|mgsm:de|0 +lighteval|mgsm:en|0 +lighteval|mgsm:es|0 +lighteval|mgsm:fr|0 +lighteval|mgsm:ja|0 +lighteval|mgsm:ru|0 +lighteval|mgsm:sw|0 +lighteval|mgsm:te|0 +lighteval|mgsm:th|0 +lighteval|mgsm:zh|0 +leaderboard|mmlu:abstract_algebra|0 +leaderboard|mmlu:anatomy|0 +leaderboard|mmlu:astronomy|0 +leaderboard|mmlu:business_ethics|0 +leaderboard|mmlu:clinical_knowledge|0 +leaderboard|mmlu:college_biology|0 +leaderboard|mmlu:college_chemistry|0 +leaderboard|mmlu:college_computer_science|0 +leaderboard|mmlu:college_mathematics|0 +leaderboard|mmlu:college_medicine|0 +leaderboard|mmlu:college_physics|0 +leaderboard|mmlu:computer_security|0 +leaderboard|mmlu:conceptual_physics|0 +leaderboard|mmlu:econometrics|0 +leaderboard|mmlu:electrical_engineering|0 +leaderboard|mmlu:elementary_mathematics|0 +leaderboard|mmlu:formal_logic|0 +leaderboard|mmlu:global_facts|0 +leaderboard|mmlu:high_school_biology|0 +leaderboard|mmlu:high_school_chemistry|0 +leaderboard|mmlu:high_school_computer_science|0 +leaderboard|mmlu:high_school_european_history|0 +leaderboard|mmlu:high_school_geography|0 +leaderboard|mmlu:high_school_government_and_politics|0 +leaderboard|mmlu:high_school_macroeconomics|0 +leaderboard|mmlu:high_school_mathematics|0 +leaderboard|mmlu:high_school_microeconomics|0 +leaderboard|mmlu:high_school_physics|0 +leaderboard|mmlu:high_school_psychology|0 +leaderboard|mmlu:high_school_statistics|0 +leaderboard|mmlu:high_school_us_history|0 +leaderboard|mmlu:high_school_world_history|0 +leaderboard|mmlu:human_aging|0 +leaderboard|mmlu:human_sexuality|0 +leaderboard|mmlu:international_law|0 +leaderboard|mmlu:jurisprudence|0 +leaderboard|mmlu:logical_fallacies|0 +leaderboard|mmlu:machine_learning|0 +leaderboard|mmlu:management|0 +leaderboard|mmlu:marketing|0 +leaderboard|mmlu:medical_genetics|0 +leaderboard|mmlu:miscellaneous|0 +leaderboard|mmlu:moral_disputes|0 +leaderboard|mmlu:moral_scenarios|0 +leaderboard|mmlu:nutrition|0 +leaderboard|mmlu:philosophy|0 +leaderboard|mmlu:prehistory|0 +leaderboard|mmlu:professional_accounting|0 +leaderboard|mmlu:professional_law|0 +leaderboard|mmlu:professional_medicine|0 +leaderboard|mmlu:professional_psychology|0 +leaderboard|mmlu:public_relations|0 +leaderboard|mmlu:security_studies|0 +leaderboard|mmlu:sociology|0 +leaderboard|mmlu:us_foreign_policy|0 +leaderboard|mmlu:virology|0 +leaderboard|mmlu:world_religions|0 +lighteval|mtnt2019:en-fr|0 +lighteval|mtnt2019:en-ja|0 +lighteval|mtnt2019:fr-en|0 +lighteval|mtnt2019:ja-en|0 +lighteval|mutual_plus|0 +lighteval|mutual|0 +lighteval|openbookqa|0 +lighteval|piqa|0 +lighteval|prost|0 +lighteval|pubmedqa|0 +lighteval|qa4mre:2011|0 +lighteval|qa4mre:2012|0 +lighteval|qa4mre:2013|0 +lighteval|qasper_ll|0 +lighteval|qasper|0 +lighteval|race:high|0 +lighteval|sciq|0 +lighteval|storycloze:2016|0 +lighteval|storycloze:2018|0 +lighteval|super_glue:boolq|0 +lighteval|super_glue:cb|0 
+lighteval|super_glue:copa|0 +lighteval|super_glue:multirc|0 +lighteval|super_glue:record|0 +lighteval|super_glue:rte|0 +lighteval|super_glue:wic|0 +lighteval|super_glue:wsc|0 +lighteval|swag|0 +lighteval|the_pile:arxiv|0 +lighteval|the_pile:bookcorpus2|0 +lighteval|the_pile:books3|0 +lighteval|the_pile:dm-mathematics|0 +lighteval|the_pile:enron|0 +lighteval|the_pile:europarl|0 +lighteval|the_pile:freelaw|0 +lighteval|the_pile:github|0 +lighteval|the_pile:gutenberg|0 +lighteval|the_pile:hackernews|0 +lighteval|the_pile:nih-exporter|0 +lighteval|the_pile:opensubtitles|0 +lighteval|the_pile:openwebtext2|0 +lighteval|the_pile:philpapers|0 +lighteval|the_pile:pile-cc|0 +lighteval|the_pile:pubmed-abstracts|0 +lighteval|the_pile:pubmed-central|0 +lighteval|the_pile:stackexchange|0 +lighteval|the_pile:ubuntu-irc|0 +lighteval|the_pile:uspto|0 +lighteval|the_pile:wikipedia|0 +lighteval|the_pile:youtubesubtitles|0 +lighteval|toxigen|0 +lighteval|triviaqa|0 +lighteval|truthfulqa:gen|0 +leaderboard|truthfulqa:mc|0 +lighteval|unscramble:anagrams1|0 +lighteval|unscramble:anagrams2|0 +lighteval|unscramble:cycle_letters|0 +lighteval|unscramble:random_insertion|0 +lighteval|unscramble:reversed_words|0 +lighteval|webqs|0 +lighteval|wikitext|0 +leaderboard|winogrande|0 +lighteval|wmt08:cs-en|0 +lighteval|wmt08:de-en|0 +lighteval|wmt08:en-cs|0 +lighteval|wmt08:en-de|0 +lighteval|wmt08:en-es|0 +lighteval|wmt08:en-fr|0 +lighteval|wmt08:en-hu|0 +lighteval|wmt08:es-en|0 +lighteval|wmt08:fr-en|0 +lighteval|wmt08:hu-en|0 +lighteval|wmt09:cs-en|0 +lighteval|wmt09:de-en|0 +lighteval|wmt09:en-cs|0 +lighteval|wmt09:en-de|0 +lighteval|wmt09:en-es|0 +lighteval|wmt09:en-fr|0 +lighteval|wmt09:en-hu|0 +lighteval|wmt09:en-it|0 +lighteval|wmt09:es-en|0 +lighteval|wmt09:fr-en|0 +lighteval|wmt09:hu-en|0 +lighteval|wmt09:it-en|0 +lighteval|wmt10:cs-en|0 +lighteval|wmt10:de-en|0 +lighteval|wmt10:en-cs|0 +lighteval|wmt10:en-de|0 +lighteval|wmt10:en-es|0 +lighteval|wmt10:en-fr|0 +lighteval|wmt10:es-en|0 +lighteval|wmt10:fr-en|0 +lighteval|wmt11:cs-en|0 +lighteval|wmt11:de-en|0 +lighteval|wmt11:en-cs|0 +lighteval|wmt11:en-de|0 +lighteval|wmt11:en-es|0 +lighteval|wmt11:en-fr|0 +lighteval|wmt11:es-en|0 +lighteval|wmt11:fr-en|0 +lighteval|wmt12:cs-en|0 +lighteval|wmt12:de-en|0 +lighteval|wmt12:en-cs|0 +lighteval|wmt12:en-de|0 +lighteval|wmt12:en-es|0 +lighteval|wmt12:en-fr|0 +lighteval|wmt12:es-en|0 +lighteval|wmt12:fr-en|0 +lighteval|wmt13:cs-en|0 +lighteval|wmt13:de-en|0 +lighteval|wmt13:en-cs|0 +lighteval|wmt13:en-de|0 +lighteval|wmt13:en-es|0 +lighteval|wmt13:en-fr|0 +lighteval|wmt13:en-ru|0 +lighteval|wmt13:es-en|0 +lighteval|wmt13:fr-en|0 +lighteval|wmt13:ru-en|0 +lighteval|wmt14:cs-en|0 +lighteval|wmt14:de-en|0 +lighteval|wmt14:en-cs|0 +lighteval|wmt14:en-de|0 +lighteval|wmt14:en-fr|0 +lighteval|wmt14:en-fr|0 +lighteval|wmt14:en-hi|0 +lighteval|wmt14:en-ru|0 +lighteval|wmt14:fr-en|0 +lighteval|wmt14:fr-en|0 +lighteval|wmt14:hi-en|0 +lighteval|wmt14:ru-en|0 +lighteval|wmt15:cs-en|0 +lighteval|wmt15:de-en|0 +lighteval|wmt15:en-cs|0 +lighteval|wmt15:en-de|0 +lighteval|wmt15:en-fi|0 +lighteval|wmt15:en-fr|0 +lighteval|wmt15:en-ru|0 +lighteval|wmt15:fi-en|0 +lighteval|wmt15:fr-en|0 +lighteval|wmt15:ru-en|0 +lighteval|wmt16:cs-en|0 +lighteval|wmt16:de-en|0 +lighteval|wmt16:de-en|0 +lighteval|wmt16:en-cs|0 +lighteval|wmt16:en-de|0 +lighteval|wmt16:en-de|0 +lighteval|wmt16:en-fi|0 +lighteval|wmt16:en-ro|0 +lighteval|wmt16:en-ro|0 +lighteval|wmt16:en-ru|0 +lighteval|wmt16:en-tr|0 +lighteval|wmt16:fi-en|0 +lighteval|wmt16:ro-en|0 
+lighteval|wmt16:ro-en|0 +lighteval|wmt16:ru-en|0 +lighteval|wmt16:tr-en|0 +lighteval|wmt17:cs-en|0 +lighteval|wmt17:de-en|0 +lighteval|wmt17:en-cs|0 +lighteval|wmt17:en-de|0 +lighteval|wmt17:en-fi|0 +lighteval|wmt17:en-lv|0 +lighteval|wmt17:en-ru|0 +lighteval|wmt17:en-tr|0 +lighteval|wmt17:en-zh|0 +lighteval|wmt17:fi-en|0 +lighteval|wmt17:lv-en|0 +lighteval|wmt17:ru-en|0 +lighteval|wmt17:tr-en|0 +lighteval|wmt17:zh-en|0 +lighteval|wmt18:cs-en|0 +lighteval|wmt18:de-en|0 +lighteval|wmt18:en-cs|0 +lighteval|wmt18:en-de|0 +lighteval|wmt18:en-et|0 +lighteval|wmt18:en-fi|0 +lighteval|wmt18:en-ru|0 +lighteval|wmt18:en-tr|0 +lighteval|wmt18:en-zh|0 +lighteval|wmt18:et-en|0 +lighteval|wmt18:fi-en|0 +lighteval|wmt18:ru-en|0 +lighteval|wmt18:tr-en|0 +lighteval|wmt18:zh-en|0 +lighteval|wmt19:cs-de|0 +lighteval|wmt19:de-cs|0 +lighteval|wmt19:de-en|0 +lighteval|wmt19:de-fr|0 +lighteval|wmt19:en-cs|0 +lighteval|wmt19:en-de|0 +lighteval|wmt19:en-fi|0 +lighteval|wmt19:en-gu|0 +lighteval|wmt19:en-kk|0 +lighteval|wmt19:en-lt|0 +lighteval|wmt19:en-ru|0 +lighteval|wmt19:en-zh|0 +lighteval|wmt19:fi-en|0 +lighteval|wmt19:fr-de|0 +lighteval|wmt19:gu-en|0 +lighteval|wmt19:kk-en|0 +lighteval|wmt19:lt-en|0 +lighteval|wmt19:ru-en|0 +lighteval|wmt19:zh-en|0 +lighteval|wmt20:cs-en|0 +lighteval|wmt20:de-en|0 +lighteval|wmt20:de-fr|0 +lighteval|wmt20:en-cs|0 +lighteval|wmt20:en-de|0 +lighteval|wmt20:en-iu|0 +lighteval|wmt20:en-ja|0 +lighteval|wmt20:en-km|0 +lighteval|wmt20:en-pl|0 +lighteval|wmt20:en-ps|0 +lighteval|wmt20:en-ru|0 +lighteval|wmt20:en-ta|0 +lighteval|wmt20:en-zh|0 +lighteval|wmt20:fr-de|0 +lighteval|wmt20:iu-en|0 +lighteval|wmt20:ja-en|0 +lighteval|wmt20:km-en|0 +lighteval|wmt20:pl-en|0 +lighteval|wmt20:ps-en|0 +lighteval|wmt20:ru-en|0 +lighteval|wmt20:ta-en|0 +lighteval|wmt20:zh-en|0 +lighteval|wsc273|0 +lighteval|xcopa:en|0 +lighteval|xcopa:et|0 +lighteval|xcopa:ht|0 +lighteval|xcopa:id|0 +lighteval|xcopa:it|0 +lighteval|xcopa:qu|0 +lighteval|xcopa:sw|0 +lighteval|xcopa:ta|0 +lighteval|xcopa:th|0 +lighteval|xcopa:tr|0 +lighteval|xcopa:vi|0 +lighteval|xcopa:zh|0 +lighteval|xstory_cloze:ar|0 +lighteval|xstory_cloze:en|0 +lighteval|xstory_cloze:es|0 +lighteval|xstory_cloze:eu|0 +lighteval|xstory_cloze:hi|0 +lighteval|xstory_cloze:id|0 +lighteval|xstory_cloze:my|0 +lighteval|xstory_cloze:ru|0 +lighteval|xstory_cloze:sw|0 +lighteval|xstory_cloze:te|0 +lighteval|xstory_cloze:zh|0 +lighteval|xwinograd:en|0 +lighteval|xwinograd:fr|0 +lighteval|xwinograd:jp|0 +lighteval|xwinograd:pt|0 +lighteval|xwinograd:ru|0 +lighteval|xwinograd:zh|0 +original|arc:c:letters|0 +original|arc:c:options|0 +original|arc:c:simple|0 +original|mmlu:abstract_algebra|0 +original|mmlu:anatomy|0 +original|mmlu:astronomy|0 +original|mmlu:business_ethics|0 +original|mmlu:clinical_knowledge|0 +original|mmlu:college_biology|0 +original|mmlu:college_chemistry|0 +original|mmlu:college_computer_science|0 +original|mmlu:college_mathematics|0 +original|mmlu:college_medicine|0 +original|mmlu:college_physics|0 +original|mmlu:computer_security|0 +original|mmlu:conceptual_physics|0 +original|mmlu:econometrics|0 +original|mmlu:electrical_engineering|0 +original|mmlu:elementary_mathematics|0 +original|mmlu:formal_logic|0 +original|mmlu:global_facts|0 +original|mmlu:high_school_biology|0 +original|mmlu:high_school_chemistry|0 +original|mmlu:high_school_computer_science|0 +original|mmlu:high_school_european_history|0 +original|mmlu:high_school_geography|0 +original|mmlu:high_school_government_and_politics|0 +original|mmlu:high_school_macroeconomics|0 
+original|mmlu:high_school_mathematics|0 +original|mmlu:high_school_microeconomics|0 +original|mmlu:high_school_physics|0 +original|mmlu:high_school_psychology|0 +original|mmlu:high_school_statistics|0 +original|mmlu:high_school_us_history|0 +original|mmlu:high_school_world_history|0 +original|mmlu:human_aging|0 +original|mmlu:human_sexuality|0 +original|mmlu:international_law|0 +original|mmlu:jurisprudence|0 +original|mmlu:logical_fallacies|0 +original|mmlu:machine_learning|0 +original|mmlu:management|0 +original|mmlu:marketing|0 +original|mmlu:medical_genetics|0 +original|mmlu:miscellaneous|0 +original|mmlu:moral_disputes|0 +original|mmlu:moral_scenarios|0 +original|mmlu:nutrition|0 +original|mmlu:philosophy|0 +original|mmlu:prehistory|0 +original|mmlu:professional_accounting|0 +original|mmlu:professional_law|0 +original|mmlu:professional_medicine|0 +original|mmlu:professional_psychology|0 +original|mmlu:public_relations|0 +original|mmlu:security_studies|0 +original|mmlu:sociology|0 +original|mmlu:us_foreign_policy|0 +original|mmlu:virology|0 +original|mmlu:world_religions|0 +original|mmlu|0 diff --git a/examples/tasks/bbh.txt b/examples/tasks/bbh.txt index 6b90fa3ae..c12ff66c4 100644 --- a/examples/tasks/bbh.txt +++ b/examples/tasks/bbh.txt @@ -1,36 +1,36 @@ -lighteval|bigbench:causal_judgment|3|0 -lighteval|bigbench:date_understanding|3|0 -lighteval|bigbench:disambiguation_qa|3|0 -lighteval|bigbench:geometric_shapes|3|0 -lighteval|bigbench:logical_deduction_five_objects|3|0 -lighteval|bigbench:logical_deduction_seven_objects|3|0 -lighteval|bigbench:logical_deduction_three_objects|3|0 -lighteval|bigbench:movie_recommendation|3|0 -lighteval|bigbench:navigate|3|0 -lighteval|bigbench:reasoning_about_colored_objects|3|0 -lighteval|bigbench:ruin_names|3|0 -lighteval|bigbench:salient_translation_error_detection|3|0 -lighteval|bigbench:snarks|3|0 -lighteval|bigbench:sports_understanding|3|0 -lighteval|bigbench:temporal_sequences|3|0 -lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0 -lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0 -lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0 -harness|bigbench:causal_judgment|3|0 -harness|bigbench:date_understanding|3|0 -harness|bigbench:disambiguation_qa|3|0 -harness|bigbench:geometric_shapes|3|0 -harness|bigbench:logical_deduction_five_objects|3|0 -harness|bigbench:logical_deduction_seven_objects|3|0 -harness|bigbench:logical_deduction_three_objects|3|0 -harness|bigbench:movie_recommendation|3|0 -harness|bigbench:navigate|3|0 -harness|bigbench:reasoning_about_colored_objects|3|0 -harness|bigbench:ruin_names|3|0 -harness|bigbench:salient_translation_error_detection|3|0 -harness|bigbench:snarks|3|0 -harness|bigbench:sports_understanding|3|0 -harness|bigbench:temporal_sequences|3|0 -harness|bigbench:tracking_shuffled_objects_five_objects|3|0 -harness|bigbench:tracking_shuffled_objects_seven_objects|3|0 -harness|bigbench:tracking_shuffled_objects_three_objects|3|0 +lighteval|bigbench:causal_judgment|3 +lighteval|bigbench:date_understanding|3 +lighteval|bigbench:disambiguation_qa|3 +lighteval|bigbench:geometric_shapes|3 +lighteval|bigbench:logical_deduction_five_objects|3 +lighteval|bigbench:logical_deduction_seven_objects|3 +lighteval|bigbench:logical_deduction_three_objects|3 +lighteval|bigbench:movie_recommendation|3 +lighteval|bigbench:navigate|3 +lighteval|bigbench:reasoning_about_colored_objects|3 +lighteval|bigbench:ruin_names|3 +lighteval|bigbench:salient_translation_error_detection|3 
+lighteval|bigbench:snarks|3
+lighteval|bigbench:sports_understanding|3
+lighteval|bigbench:temporal_sequences|3
+lighteval|bigbench:tracking_shuffled_objects_five_objects|3
+lighteval|bigbench:tracking_shuffled_objects_seven_objects|3
+lighteval|bigbench:tracking_shuffled_objects_three_objects|3
+harness|bigbench:causal_judgment|3
+harness|bigbench:date_understanding|3
+harness|bigbench:disambiguation_qa|3
+harness|bigbench:geometric_shapes|3
+harness|bigbench:logical_deduction_five_objects|3
+harness|bigbench:logical_deduction_seven_objects|3
+harness|bigbench:logical_deduction_three_objects|3
+harness|bigbench:movie_recommendation|3
+harness|bigbench:navigate|3
+harness|bigbench:reasoning_about_colored_objects|3
+harness|bigbench:ruin_names|3
+harness|bigbench:salient_translation_error_detection|3
+harness|bigbench:snarks|3
+harness|bigbench:sports_understanding|3
+harness|bigbench:temporal_sequences|3
+harness|bigbench:tracking_shuffled_objects_five_objects|3
+harness|bigbench:tracking_shuffled_objects_seven_objects|3
+harness|bigbench:tracking_shuffled_objects_three_objects|3
diff --git a/examples/tasks/fine_tasks/cf/ar.txt b/examples/tasks/fine_tasks/cf/ar.txt
index 8e7bbe0b7..e9c025ecc 100644
--- a/examples/tasks/fine_tasks/cf/ar.txt
+++ b/examples/tasks/fine_tasks/cf/ar.txt
@@ -1,23 +1,23 @@
 # General Knowledge (GK)
-lighteval|exams_ara_cf|0|1
-lighteval|mmlu_ara_cf|0|1
-lighteval|alghafa_arc_ara_cf:easy|0|1
-lighteval|alghafa_sciqa_ara_cf|0|1
+lighteval|exams_ara_cf|0
+lighteval|mmlu_ara_cf|0
+lighteval|alghafa_arc_ara_cf:easy|0
+lighteval|alghafa_sciqa_ara_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_arb_Arab_cf|0|1
-lighteval|soqal_ara_cf|0|1
-lighteval|mlqa_ara|0|1
-lighteval|tydiqa_ara|0|1
-lighteval|alghafa_race_ara_cf|0|1
-lighteval|arcd_ara|0|1
+lighteval|belebele_arb_Arab_cf|0
+lighteval|soqal_ara_cf|0
+lighteval|mlqa_ara|0
+lighteval|tydiqa_ara|0
+lighteval|alghafa_race_ara_cf|0
+lighteval|arcd_ara|0

 # Reasoning (RES)
-lighteval|xcodah_ara_cf|0|1
-lighteval|alghafa_piqa_ara_cf|0|1
-lighteval|xcsqa_ara_cf|0|1
+lighteval|xcodah_ara_cf|0
+lighteval|alghafa_piqa_ara_cf|0
+lighteval|xcsqa_ara_cf|0

 # Natural Language Understanding (NLU)
-lighteval|xnli2.0_ara_cf|0|1
-lighteval|mlmm_hellaswag_ara_cf|0|1
-lighteval|xstory_cloze_ara_cf|0|1
+lighteval|xnli2.0_ara_cf|0
+lighteval|mlmm_hellaswag_ara_cf|0
+lighteval|xstory_cloze_ara_cf|0
diff --git a/examples/tasks/fine_tasks/cf/fr.txt b/examples/tasks/fine_tasks/cf/fr.txt
index e20a4808f..9e822cd2e 100644
--- a/examples/tasks/fine_tasks/cf/fr.txt
+++ b/examples/tasks/fine_tasks/cf/fr.txt
@@ -1,16 +1,16 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_fra_cf|0|1
-lighteval|mlmm_arc_fra_cf:challenge|0|1
-lighteval|mintaka_fra|0|1
+lighteval|meta_mmlu_fra_cf|0
+lighteval|mlmm_arc_fra_cf:challenge|0
+lighteval|mintaka_fra|0

 # Reading Comprehension (RC)
-lighteval|belebele_fra_Latn_cf|0|1
-lighteval|fquadv2_fra|0|1
+lighteval|belebele_fra_Latn_cf|0
+lighteval|fquadv2_fra|0

 # Reasoning (RES)
-lighteval|xcodah_fra_cf|0|1
-lighteval|xcsqa_fra_cf|0|1
+lighteval|xcodah_fra_cf|0
+lighteval|xcsqa_fra_cf|0

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_fra_cf|0|1
-lighteval|xnli2.0_fra_cf|0|1
+lighteval|mlmm_hellaswag_fra_cf|0
+lighteval|xnli2.0_fra_cf|0
diff --git a/examples/tasks/fine_tasks/cf/hi.txt b/examples/tasks/fine_tasks/cf/hi.txt
index 41aa6477e..7c7b565d4 100644
--- a/examples/tasks/fine_tasks/cf/hi.txt
+++ b/examples/tasks/fine_tasks/cf/hi.txt
@@ -1,17 +1,17 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_hin_cf|0|1
-lighteval|community_arc_hin_cf:easy|0|1
+lighteval|meta_mmlu_hin_cf|0
+lighteval|community_arc_hin_cf:easy|0

 # Reading Comprehension (RC)
-lighteval|belebele_hin_Deva_cf|0|1
-lighteval|indicqa_hin|0|1
+lighteval|belebele_hin_Deva_cf|0
+lighteval|indicqa_hin|0

 # Reasoning (RES)
-lighteval|xcodah_hin_cf|0|1
-lighteval|indicxcopa_hin_cf|0|1
-lighteval|xcsqa_hin_cf|0|1
+lighteval|xcodah_hin_cf|0
+lighteval|indicxcopa_hin_cf|0
+lighteval|xcsqa_hin_cf|0

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_hin_cf|0|1
-lighteval|indicnxnli_hin_cf|0|1
-lighteval|xstory_cloze_hin_cf|0|1
+lighteval|mlmm_hellaswag_hin_cf|0
+lighteval|indicnxnli_hin_cf|0
+lighteval|xstory_cloze_hin_cf|0
diff --git a/examples/tasks/fine_tasks/cf/ru.txt b/examples/tasks/fine_tasks/cf/ru.txt
index d16e07d72..3e37ec3c0 100644
--- a/examples/tasks/fine_tasks/cf/ru.txt
+++ b/examples/tasks/fine_tasks/cf/ru.txt
@@ -1,20 +1,20 @@
 # General Knowledge (GK)
-lighteval|mlmm_arc_rus_cf:challenge|0|1
-lighteval|rummlu_rus_cf|0|1
-lighteval|mera_openbookqa_rus_cf|0|1
+lighteval|mlmm_arc_rus_cf:challenge|0
+lighteval|rummlu_rus_cf|0
+lighteval|mera_openbookqa_rus_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_rus_Cyrl_cf|0|1
-lighteval|tydiqa_rus|0|1
-lighteval|sber_squad_rus|0|1
-lighteval|xquad_rus|0|1
+lighteval|belebele_rus_Cyrl_cf|0
+lighteval|tydiqa_rus|0
+lighteval|sber_squad_rus|0
+lighteval|xquad_rus|0

 # Reasoning (RES)
-lighteval|parus_rus_cf|0|1
-lighteval|xcodah_rus_cf|0|1
-lighteval|xcsqa_rus_cf|0|1
+lighteval|parus_rus_cf|0
+lighteval|xcodah_rus_cf|0
+lighteval|xcsqa_rus_cf|0

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_rus_cf|0|1
-lighteval|xnli2.0_rus_cf|0|1
-lighteval|xstory_cloze_rus_cf|0|1
+lighteval|mlmm_hellaswag_rus_cf|0
+lighteval|xnli2.0_rus_cf|0
+lighteval|xstory_cloze_rus_cf|0
diff --git a/examples/tasks/fine_tasks/cf/sw.txt b/examples/tasks/fine_tasks/cf/sw.txt
index 67406e2a1..e01ebde38 100644
--- a/examples/tasks/fine_tasks/cf/sw.txt
+++ b/examples/tasks/fine_tasks/cf/sw.txt
@@ -1,17 +1,17 @@
 # General Knowledge (GK)
-lighteval|community_arc_swa_cf:easy|0|1
-lighteval|m3exams_swa_cf|0|1
-lighteval|openai_mmlu_swa_cf|0|1
+lighteval|community_arc_swa_cf:easy|0
+lighteval|m3exams_swa_cf|0
+lighteval|openai_mmlu_swa_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_swh_Latn_cf|0|1
-lighteval|kenswquad_swa|0|1
-lighteval|tydiqa_swa|0|1
+lighteval|belebele_swh_Latn_cf|0
+lighteval|kenswquad_swa|0
+lighteval|tydiqa_swa|0

 # Reasoning (RES)
-lighteval|xcsqa_swa_cf|0|1
-lighteval|xcopa_swa_cf|0|1
+lighteval|xcsqa_swa_cf|0
+lighteval|xcopa_swa_cf|0

 # Natural Language Understanding (NLU)
-lighteval|xnli2.0_swa_cf|0|1
-lighteval|xstory_cloze_swa_cf|0|1
+lighteval|xnli2.0_swa_cf|0
+lighteval|xstory_cloze_swa_cf|0
diff --git a/examples/tasks/fine_tasks/cf/te.txt b/examples/tasks/fine_tasks/cf/te.txt
index 7b844868c..2c64cbc2b 100644
--- a/examples/tasks/fine_tasks/cf/te.txt
+++ b/examples/tasks/fine_tasks/cf/te.txt
@@ -1,14 +1,14 @@
 # General Knowledge (GK)
-lighteval|mlmm_mmlu_tel_cf|0|1
+lighteval|mlmm_mmlu_tel_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_tel_Telu_cf|0|1
-lighteval|indicqa_tel|0|1
+lighteval|belebele_tel_Telu_cf|0
+lighteval|indicqa_tel|0

 # Reasoning (RES)
-lighteval|indicxcopa_tel_cf|0|1
+lighteval|indicxcopa_tel_cf|0

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tel_cf|0|1
-lighteval|indicnxnli_tel_cf|0|1
-lighteval|xstory_cloze_tel_cf|0|1
+lighteval|community_hellaswag_tel_cf|0
+lighteval|indicnxnli_tel_cf|0
+lighteval|xstory_cloze_tel_cf|0
diff --git a/examples/tasks/fine_tasks/cf/th.txt b/examples/tasks/fine_tasks/cf/th.txt
index 16743e9af..89a895063 100644
--- a/examples/tasks/fine_tasks/cf/th.txt
+++ b/examples/tasks/fine_tasks/cf/th.txt
@@ -1,12 +1,12 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_tha_cf|0|1
-lighteval|m3exams_tha_cf|0|1
+lighteval|meta_mmlu_tha_cf|0
+lighteval|m3exams_tha_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_tha_Thai_cf|0|1
-lighteval|thaiqa_tha|0|1
-lighteval|xquad_tha|0|1
+lighteval|belebele_tha_Thai_cf|0
+lighteval|thaiqa_tha|0
+lighteval|xquad_tha|0

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tha_cf|0|1
-lighteval|xnli2.0_tha_cf|0|1
+lighteval|community_hellaswag_tha_cf|0
+lighteval|xnli2.0_tha_cf|0
diff --git a/examples/tasks/fine_tasks/cf/tr.txt b/examples/tasks/fine_tasks/cf/tr.txt
index d999be71c..5c31d63d5 100644
--- a/examples/tasks/fine_tasks/cf/tr.txt
+++ b/examples/tasks/fine_tasks/cf/tr.txt
@@ -1,16 +1,16 @@
 # General Knowledge (GK)
-lighteval|community_arc_tur_cf:easy|0|1
-lighteval|exams_tur_cf|0|1
-lighteval|community_mmlu_tur_cf|0|1
+lighteval|community_arc_tur_cf:easy|0
+lighteval|exams_tur_cf|0
+lighteval|community_mmlu_tur_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_tur_Latn_cf|0|1
-lighteval|tquadv2_tur|0|1
-lighteval|xquad_tur|0|1
+lighteval|belebele_tur_Latn_cf|0
+lighteval|tquadv2_tur|0
+lighteval|xquad_tur|0

 # Reasoning (RES)
-lighteval|xcopa_tur_cf|0|1
+lighteval|xcopa_tur_cf|0

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tur_cf|0|1
-lighteval|xnli2.0_tur_cf|0|1
+lighteval|community_hellaswag_tur_cf|0
+lighteval|xnli2.0_tur_cf|0
diff --git a/examples/tasks/fine_tasks/cf/zh.txt b/examples/tasks/fine_tasks/cf/zh.txt
index 76e7e28db..76d0ef068 100644
--- a/examples/tasks/fine_tasks/cf/zh.txt
+++ b/examples/tasks/fine_tasks/cf/zh.txt
@@ -1,22 +1,22 @@
 # General Knowledge (GK)
-lighteval|agieval_zho_cf|0|1
-lighteval|ceval_zho_cf|0|1
-lighteval|cmmlu_zho_cf|0|1
-lighteval|m3exams_zho_cf|0|1
+lighteval|agieval_zho_cf|0
+lighteval|ceval_zho_cf|0
+lighteval|cmmlu_zho_cf|0
+lighteval|m3exams_zho_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_zho_Hans_cf|0|1
-lighteval|c3_zho_cf|0|1
-lighteval|cmrc2018_zho|0|1
-lighteval|chinese_squad_zho|0|1
+lighteval|belebele_zho_Hans_cf|0
+lighteval|c3_zho_cf|0
+lighteval|cmrc2018_zho|0
+lighteval|chinese_squad_zho|0

 # Reasoning (RES)
-lighteval|xcodah_zho_cf|0|1
-lighteval|xcopa_zho_cf|0|1
-lighteval|xcsqa_zho_cf|0|1
+lighteval|xcodah_zho_cf|0
+lighteval|xcopa_zho_cf|0
+lighteval|xcsqa_zho_cf|0

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_zho_cf|0|1
-lighteval|ocnli_zho_cf|0|1
-lighteval|xwinograd_zho_cf|0|1
-lighteval|xstory_cloze_zho_cf|0|1
+lighteval|mlmm_hellaswag_zho_cf|0
+lighteval|ocnli_zho_cf|0
+lighteval|xwinograd_zho_cf|0
+lighteval|xstory_cloze_zho_cf|0
diff --git a/examples/tasks/fine_tasks/mcf/ar.txt b/examples/tasks/fine_tasks/mcf/ar.txt
index d772e653d..56e94cf00 100644
--- a/examples/tasks/fine_tasks/mcf/ar.txt
+++ b/examples/tasks/fine_tasks/mcf/ar.txt
@@ -1,23 +1,23 @@
 # General Knowledge (GK)
-lighteval|exams_ara_mcf|5|1
-lighteval|mmlu_ara_mcf|5|1
-lighteval|alghafa_arc_ara_mcf:easy|5|1
-lighteval|alghafa_sciqa_ara_mcf|5|1
+lighteval|exams_ara_mcf|5
+lighteval|mmlu_ara_mcf|5
+lighteval|alghafa_arc_ara_mcf:easy|5
+lighteval|alghafa_sciqa_ara_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_arb_Arab_mcf|5|1
-lighteval|soqal_ara_mcf|5|1
-lighteval|mlqa_ara|5|1
-lighteval|tydiqa_ara|5|1
-lighteval|alghafa_race_ara_mcf|5|1
-lighteval|arcd_ara|5|1
+lighteval|belebele_arb_Arab_mcf|5
+lighteval|soqal_ara_mcf|5
+lighteval|mlqa_ara|5
+lighteval|tydiqa_ara|5
+lighteval|alghafa_race_ara_mcf|5
+lighteval|arcd_ara|5

 # Reasoning (RES)
-lighteval|xcodah_ara_mcf|5|1
-lighteval|alghafa_piqa_ara_mcf|5|1
-lighteval|xcsqa_ara_mcf|5|1
+lighteval|xcodah_ara_mcf|5
+lighteval|alghafa_piqa_ara_mcf|5
+lighteval|xcsqa_ara_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|xnli2.0_ara_mcf|5|1
-lighteval|mlmm_hellaswag_ara_mcf|5|1
-lighteval|xstory_cloze_ara_mcf|5|1
+lighteval|xnli2.0_ara_mcf|5
+lighteval|mlmm_hellaswag_ara_mcf|5
+lighteval|xstory_cloze_ara_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/fr.txt b/examples/tasks/fine_tasks/mcf/fr.txt
index 4a7d04eac..e96e5bf49 100644
--- a/examples/tasks/fine_tasks/mcf/fr.txt
+++ b/examples/tasks/fine_tasks/mcf/fr.txt
@@ -1,16 +1,16 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_fra_mcf|5|1
-lighteval|mlmm_arc_fra_mcf:challenge|5|1
-lighteval|mintaka_fra|5|1
+lighteval|meta_mmlu_fra_mcf|5
+lighteval|mlmm_arc_fra_mcf:challenge|5
+lighteval|mintaka_fra|5

 # Reading Comprehension (RC)
-lighteval|belebele_fra_Latn_mcf|5|1
-lighteval|fquadv2_fra|5|1
+lighteval|belebele_fra_Latn_mcf|5
+lighteval|fquadv2_fra|5

 # Reasoning (RES)
-lighteval|xcodah_fra_mcf|5|1
-lighteval|xcsqa_fra_mcf|5|1
+lighteval|xcodah_fra_mcf|5
+lighteval|xcsqa_fra_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_fra_mcf|5|1
-lighteval|xnli2.0_fra_mcf|5|1
+lighteval|mlmm_hellaswag_fra_mcf|5
+lighteval|xnli2.0_fra_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/hi.txt b/examples/tasks/fine_tasks/mcf/hi.txt
index e7298ae70..5140ebc74 100644
--- a/examples/tasks/fine_tasks/mcf/hi.txt
+++ b/examples/tasks/fine_tasks/mcf/hi.txt
@@ -1,17 +1,17 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_hin_mcf|5|1
-lighteval|community_arc_hin_mcf:easy|5|1
+lighteval|meta_mmlu_hin_mcf|5
+lighteval|community_arc_hin_mcf:easy|5

 # Reading Comprehension (RC)
-lighteval|belebele_hin_Deva_mcf|5|1
-lighteval|indicqa_hin|5|1
+lighteval|belebele_hin_Deva_mcf|5
+lighteval|indicqa_hin|5

 # Reasoning (RES)
-lighteval|xcodah_hin_mcf|5|1
-lighteval|indicxcopa_hin_mcf|5|1
-lighteval|xcsqa_hin_mcf|5|1
+lighteval|xcodah_hin_mcf|5
+lighteval|indicxcopa_hin_mcf|5
+lighteval|xcsqa_hin_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_hin_mcf|5|1
-lighteval|indicnxnli_hin_mcf|5|1
-lighteval|xstory_cloze_hin_mcf|5|1
+lighteval|mlmm_hellaswag_hin_mcf|5
+lighteval|indicnxnli_hin_mcf|5
+lighteval|xstory_cloze_hin_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/ru.txt b/examples/tasks/fine_tasks/mcf/ru.txt
index 5598cee78..f6c14a842 100644
--- a/examples/tasks/fine_tasks/mcf/ru.txt
+++ b/examples/tasks/fine_tasks/mcf/ru.txt
@@ -1,20 +1,20 @@
 # General Knowledge (GK)
-lighteval|mlmm_arc_rus_mcf:challenge|5|1
-lighteval|rummlu_rus_mcf|5|1
-lighteval|mera_openbookqa_rus_mcf|5|1
+lighteval|mlmm_arc_rus_mcf:challenge|5
+lighteval|rummlu_rus_mcf|5
+lighteval|mera_openbookqa_rus_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_rus_Cyrl_mcf|5|1
-lighteval|tydiqa_rus|5|1
-lighteval|sber_squad_rus|5|1
-lighteval|xquad_rus|5|1
+lighteval|belebele_rus_Cyrl_mcf|5
+lighteval|tydiqa_rus|5
+lighteval|sber_squad_rus|5
+lighteval|xquad_rus|5

 # Reasoning (RES)
-lighteval|parus_rus_mcf|0|1
-lighteval|xcodah_rus_mcf|5|1
-lighteval|xcsqa_rus_mcf|5|1
+lighteval|parus_rus_mcf|0
+lighteval|xcodah_rus_mcf|5
+lighteval|xcsqa_rus_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_rus_mcf|0|1
-lighteval|xnli2.0_rus_mcf|5|1
-lighteval|xstory_cloze_rus_mcf|5|1
+lighteval|mlmm_hellaswag_rus_mcf|0
+lighteval|xnli2.0_rus_mcf|5
+lighteval|xstory_cloze_rus_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/sw.txt b/examples/tasks/fine_tasks/mcf/sw.txt
index acb53d364..a1f726954 100644
--- a/examples/tasks/fine_tasks/mcf/sw.txt
+++ b/examples/tasks/fine_tasks/mcf/sw.txt
@@ -1,17 +1,17 @@
 # General Knowledge (GK)
-lighteval|community_arc_swa_mcf:easy|5|1
-lighteval|m3exams_swa_mcf|5|1
-lighteval|openai_mmlu_swa_mcf|5|1
+lighteval|community_arc_swa_mcf:easy|5
+lighteval|m3exams_swa_mcf|5
+lighteval|openai_mmlu_swa_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_swh_Latn_mcf|5|1
-lighteval|kenswquad_swa|5|1
-lighteval|tydiqa_swa|5|1
+lighteval|belebele_swh_Latn_mcf|5
+lighteval|kenswquad_swa|5
+lighteval|tydiqa_swa|5

 # Reasoning (RES)
-lighteval|xcsqa_swa_mcf|5|1
-lighteval|xcopa_swa_mcf|5|1
+lighteval|xcsqa_swa_mcf|5
+lighteval|xcopa_swa_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|xnli2.0_swa_mcf|5|1
-lighteval|xstory_cloze_swa_mcf|5|1
+lighteval|xnli2.0_swa_mcf|5
+lighteval|xstory_cloze_swa_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/te.txt b/examples/tasks/fine_tasks/mcf/te.txt
index 07b609c29..c0d620686 100644
--- a/examples/tasks/fine_tasks/mcf/te.txt
+++ b/examples/tasks/fine_tasks/mcf/te.txt
@@ -1,14 +1,14 @@
 # General Knowledge (GK)
-lighteval|mlmm_mmlu_tel_mcf|5|1
+lighteval|mlmm_mmlu_tel_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_tel_Telu_mcf|5|1
-lighteval|indicqa_tel|5|1
+lighteval|belebele_tel_Telu_mcf|5
+lighteval|indicqa_tel|5

 # Reasoning (RES)
-lighteval|indicxcopa_tel_mcf|5|1
+lighteval|indicxcopa_tel_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tel_mcf|5|1
-lighteval|indicnxnli_tel_mcf|5|1
-lighteval|xstory_cloze_tel_mcf|5|1
+lighteval|community_hellaswag_tel_mcf|5
+lighteval|indicnxnli_tel_mcf|5
+lighteval|xstory_cloze_tel_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/th.txt b/examples/tasks/fine_tasks/mcf/th.txt
index 4a5acb214..5156a8ea1 100644
--- a/examples/tasks/fine_tasks/mcf/th.txt
+++ b/examples/tasks/fine_tasks/mcf/th.txt
@@ -1,12 +1,12 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_tha_mcf|5|1
-lighteval|m3exams_tha_mcf|5|1
+lighteval|meta_mmlu_tha_mcf|5
+lighteval|m3exams_tha_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_tha_Thai_mcf|5|1
-lighteval|thaiqa_tha|5|1
-lighteval|xquad_tha|5|1
+lighteval|belebele_tha_Thai_mcf|5
+lighteval|thaiqa_tha|5
+lighteval|xquad_tha|5

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tha_mcf|5|1
-lighteval|xnli2.0_tha_mcf|5|1
+lighteval|community_hellaswag_tha_mcf|5
+lighteval|xnli2.0_tha_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/tr.txt b/examples/tasks/fine_tasks/mcf/tr.txt
index 63ccd0b83..918ea5feb 100644
--- a/examples/tasks/fine_tasks/mcf/tr.txt
+++ b/examples/tasks/fine_tasks/mcf/tr.txt
@@ -1,16 +1,16 @@
 # General Knowledge (GK)
-lighteval|community_arc_tur_mcf:easy|5|1
-lighteval|exams_tur_mcf|5|1
-lighteval|community_mmlu_tur_mcf|5|1
+lighteval|community_arc_tur_mcf:easy|5
+lighteval|exams_tur_mcf|5
+lighteval|community_mmlu_tur_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_tur_Latn_mcf|5|1
-lighteval|tquadv2_tur|5|1
-lighteval|xquad_tur|5|1
+lighteval|belebele_tur_Latn_mcf|5
+lighteval|tquadv2_tur|5
+lighteval|xquad_tur|5

 # Reasoning (RES)
-lighteval|xcopa_tur_mcf|5|1
+lighteval|xcopa_tur_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tur_mcf|5|1
-lighteval|xnli2.0_tur_mcf|5|1
+lighteval|community_hellaswag_tur_mcf|5
+lighteval|xnli2.0_tur_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/zh.txt b/examples/tasks/fine_tasks/mcf/zh.txt
index a5799d82b..69817c144 100644
--- a/examples/tasks/fine_tasks/mcf/zh.txt
+++ b/examples/tasks/fine_tasks/mcf/zh.txt
@@ -1,22 +1,22 @@
 # General Knowledge (GK)
-lighteval|agieval_zho_mcf|5|1
-lighteval|ceval_zho_mcf|5|1
-lighteval|cmmlu_zho_mcf|5|1
-lighteval|m3exams_zho_mcf|5|1
+lighteval|agieval_zho_mcf|5
+lighteval|ceval_zho_mcf|5
+lighteval|cmmlu_zho_mcf|5
+lighteval|m3exams_zho_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_zho_Hans_mcf|5|1
-lighteval|c3_zho_mcf|5|1
-lighteval|cmrc2018_zho|5|1
-lighteval|chinese_squad_zho|5|1
+lighteval|belebele_zho_Hans_mcf|5
+lighteval|c3_zho_mcf|5
+lighteval|cmrc2018_zho|5
+lighteval|chinese_squad_zho|5

 # Reasoning (RES)
-lighteval|xcodah_zho_mcf|5|1
-lighteval|xcopa_zho_mcf|5|1
-lighteval|xcsqa_zho_mcf|5|1
+lighteval|xcodah_zho_mcf|5
+lighteval|xcopa_zho_mcf|5
+lighteval|xcsqa_zho_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_zho_mcf|5|1
-lighteval|ocnli_zho_mcf|5|1
-lighteval|xwinograd_zho_mcf|5|1
-lighteval|xstory_cloze_zho_mcf|5|1
+lighteval|mlmm_hellaswag_zho_mcf|5
+lighteval|ocnli_zho_mcf|5
+lighteval|xwinograd_zho_mcf|5
+lighteval|xstory_cloze_zho_mcf|5
diff --git a/examples/tasks/open_llm_leaderboard_tasks.txt b/examples/tasks/open_llm_leaderboard_tasks.txt
index 51de4f473..b87f7a191 100644
--- a/examples/tasks/open_llm_leaderboard_tasks.txt
+++ b/examples/tasks/open_llm_leaderboard_tasks.txt
@@ -1,68 +1,68 @@
 # ARC
-leaderboard|arc:challenge|25|0
+leaderboard|arc:challenge|25
 # HellaSwag
-leaderboard|hellaswag|10|0
+leaderboard|hellaswag|10
 # TruthfulQA
-leaderboard|truthfulqa:mc|0|0
+leaderboard|truthfulqa:mc|0
 # MMLU
-leaderboard|mmlu:abstract_algebra|5|0
-leaderboard|mmlu:anatomy|5|0
-leaderboard|mmlu:astronomy|5|0
-leaderboard|mmlu:business_ethics|5|0
-leaderboard|mmlu:clinical_knowledge|5|0
-leaderboard|mmlu:college_biology|5|0
-leaderboard|mmlu:college_chemistry|5|0
-leaderboard|mmlu:college_computer_science|5|0
-leaderboard|mmlu:college_mathematics|5|0
-leaderboard|mmlu:college_medicine|5|0
-leaderboard|mmlu:college_physics|5|0
-leaderboard|mmlu:computer_security|5|0
-leaderboard|mmlu:conceptual_physics|5|0
-leaderboard|mmlu:econometrics|5|0
-leaderboard|mmlu:electrical_engineering|5|0
-leaderboard|mmlu:elementary_mathematics|5|0
-leaderboard|mmlu:formal_logic|5|0
-leaderboard|mmlu:global_facts|5|0
-leaderboard|mmlu:high_school_biology|5|0
-leaderboard|mmlu:high_school_chemistry|5|0
-leaderboard|mmlu:high_school_computer_science|5|0
-leaderboard|mmlu:high_school_european_history|5|0
-leaderboard|mmlu:high_school_geography|5|0
-leaderboard|mmlu:high_school_government_and_politics|5|0
-leaderboard|mmlu:high_school_macroeconomics|5|0
-leaderboard|mmlu:high_school_mathematics|5|0
-leaderboard|mmlu:high_school_microeconomics|5|0
-leaderboard|mmlu:high_school_physics|5|0
-leaderboard|mmlu:high_school_psychology|5|0
-leaderboard|mmlu:high_school_statistics|5|0
-leaderboard|mmlu:high_school_us_history|5|0
-leaderboard|mmlu:high_school_world_history|5|0
-leaderboard|mmlu:human_aging|5|0
-leaderboard|mmlu:human_sexuality|5|0
-leaderboard|mmlu:international_law|5|0
-leaderboard|mmlu:jurisprudence|5|0
-leaderboard|mmlu:logical_fallacies|5|0
-leaderboard|mmlu:machine_learning|5|0
-leaderboard|mmlu:management|5|0
-leaderboard|mmlu:marketing|5|0
-leaderboard|mmlu:medical_genetics|5|0
-leaderboard|mmlu:miscellaneous|5|0
-leaderboard|mmlu:moral_disputes|5|0
-leaderboard|mmlu:moral_scenarios|5|0
-leaderboard|mmlu:nutrition|5|0
-leaderboard|mmlu:philosophy|5|0
-leaderboard|mmlu:prehistory|5|0
-leaderboard|mmlu:professional_accounting|5|0
-leaderboard|mmlu:professional_law|5|0
-leaderboard|mmlu:professional_medicine|5|0
-leaderboard|mmlu:professional_psychology|5|0
-leaderboard|mmlu:public_relations|5|0
-leaderboard|mmlu:security_studies|5|0
-leaderboard|mmlu:sociology|5|0
-leaderboard|mmlu:us_foreign_policy|5|0
-leaderboard|mmlu:virology|5|0
-leaderboard|mmlu:world_religions|5|0
+leaderboard|mmlu:abstract_algebra|5
+leaderboard|mmlu:anatomy|5
+leaderboard|mmlu:astronomy|5
+leaderboard|mmlu:business_ethics|5
+leaderboard|mmlu:clinical_knowledge|5
+leaderboard|mmlu:college_biology|5
+leaderboard|mmlu:college_chemistry|5
+leaderboard|mmlu:college_computer_science|5
+leaderboard|mmlu:college_mathematics|5
+leaderboard|mmlu:college_medicine|5
+leaderboard|mmlu:college_physics|5
+leaderboard|mmlu:computer_security|5
+leaderboard|mmlu:conceptual_physics|5
+leaderboard|mmlu:econometrics|5
+leaderboard|mmlu:electrical_engineering|5
+leaderboard|mmlu:elementary_mathematics|5
+leaderboard|mmlu:formal_logic|5
+leaderboard|mmlu:global_facts|5
+leaderboard|mmlu:high_school_biology|5
+leaderboard|mmlu:high_school_chemistry|5
+leaderboard|mmlu:high_school_computer_science|5
+leaderboard|mmlu:high_school_european_history|5
+leaderboard|mmlu:high_school_geography|5
+leaderboard|mmlu:high_school_government_and_politics|5
+leaderboard|mmlu:high_school_macroeconomics|5
+leaderboard|mmlu:high_school_mathematics|5
+leaderboard|mmlu:high_school_microeconomics|5
+leaderboard|mmlu:high_school_physics|5
+leaderboard|mmlu:high_school_psychology|5
+leaderboard|mmlu:high_school_statistics|5
+leaderboard|mmlu:high_school_us_history|5
+leaderboard|mmlu:high_school_world_history|5
+leaderboard|mmlu:human_aging|5
+leaderboard|mmlu:human_sexuality|5
+leaderboard|mmlu:international_law|5
+leaderboard|mmlu:jurisprudence|5
+leaderboard|mmlu:logical_fallacies|5
+leaderboard|mmlu:machine_learning|5
+leaderboard|mmlu:management|5
+leaderboard|mmlu:marketing|5
+leaderboard|mmlu:medical_genetics|5
+leaderboard|mmlu:miscellaneous|5
+leaderboard|mmlu:moral_disputes|5
+leaderboard|mmlu:moral_scenarios|5
+leaderboard|mmlu:nutrition|5
+leaderboard|mmlu:philosophy|5
+leaderboard|mmlu:prehistory|5
+leaderboard|mmlu:professional_accounting|5
+leaderboard|mmlu:professional_law|5
+leaderboard|mmlu:professional_medicine|5
+leaderboard|mmlu:professional_psychology|5
+leaderboard|mmlu:public_relations|5
+leaderboard|mmlu:security_studies|5
+leaderboard|mmlu:sociology|5
+leaderboard|mmlu:us_foreign_policy|5
+leaderboard|mmlu:virology|5
+leaderboard|mmlu:world_religions|5
 # WinoGrande
-leaderboard|winogrande|5|0
+leaderboard|winogrande|5
 # GSM8K
-leaderboard|gsm8k|5|0
+leaderboard|gsm8k|5
diff --git a/examples/tasks/recommended_set.txt b/examples/tasks/recommended_set.txt
index d1904e3cc..d55b10a9d 100644
--- a/examples/tasks/recommended_set.txt
+++ b/examples/tasks/recommended_set.txt
@@ -1,160 +1,160 @@
 # Commonsense-QA
-helm|commonsenseqa|0|0
-lighteval|ethics:commonsense|0|0
-lighteval|ethics:deontology|0|0
-lighteval|ethics:justice|0|0
-lighteval|ethics:utilitarianism|0|0
-lighteval|ethics:virtue|0|0
+helm|commonsenseqa|0
+lighteval|ethics:commonsense|0
+lighteval|ethics:deontology|0
+lighteval|ethics:justice|0
+lighteval|ethics:utilitarianism|0
+lighteval|ethics:virtue|0
 # MMLU
-leaderboard|mmlu:abstract_algebra|0|0
-leaderboard|mmlu:anatomy|0|0
-leaderboard|mmlu:astronomy|0|0
-leaderboard|mmlu:business_ethics|0|0
-leaderboard|mmlu:clinical_knowledge|0|0
-leaderboard|mmlu:college_biology|0|0
-leaderboard|mmlu:college_chemistry|0|0
-leaderboard|mmlu:college_computer_science|0|0
-leaderboard|mmlu:college_mathematics|0|0
-leaderboard|mmlu:college_medicine|0|0
-leaderboard|mmlu:college_physics|0|0
-leaderboard|mmlu:computer_security|0|0
-leaderboard|mmlu:conceptual_physics|0|0
-leaderboard|mmlu:econometrics|0|0
-leaderboard|mmlu:electrical_engineering|0|0
-leaderboard|mmlu:elementary_mathematics|0|0
-leaderboard|mmlu:formal_logic|0|0
-leaderboard|mmlu:global_facts|0|0
-leaderboard|mmlu:high_school_biology|0|0
-leaderboard|mmlu:high_school_chemistry|0|0
-leaderboard|mmlu:high_school_computer_science|0|0
-leaderboard|mmlu:high_school_european_history|0|0
-leaderboard|mmlu:high_school_geography|0|0
-leaderboard|mmlu:high_school_government_and_politics|0|0
-leaderboard|mmlu:high_school_macroeconomics|0|0
-leaderboard|mmlu:high_school_mathematics|0|0
-leaderboard|mmlu:high_school_microeconomics|0|0
-leaderboard|mmlu:high_school_physics|0|0
-leaderboard|mmlu:high_school_psychology|0|0
-leaderboard|mmlu:high_school_statistics|0|0
-leaderboard|mmlu:high_school_us_history|0|0
-leaderboard|mmlu:high_school_world_history|0|0
-leaderboard|mmlu:human_aging|0|0
-leaderboard|mmlu:human_sexuality|0|0
-leaderboard|mmlu:international_law|0|0
-leaderboard|mmlu:jurisprudence|0|0
-leaderboard|mmlu:logical_fallacies|0|0
-leaderboard|mmlu:machine_learning|0|0
-leaderboard|mmlu:management|0|0
-leaderboard|mmlu:marketing|0|0
-leaderboard|mmlu:medical_genetics|0|0
-leaderboard|mmlu:miscellaneous|0|0
-leaderboard|mmlu:moral_disputes|0|0
-leaderboard|mmlu:moral_scenarios|0|0
-leaderboard|mmlu:nutrition|0|0
-leaderboard|mmlu:philosophy|0|0
-leaderboard|mmlu:prehistory|0|0
-leaderboard|mmlu:professional_accounting|0|0
-leaderboard|mmlu:professional_law|0|0
-leaderboard|mmlu:professional_medicine|0|0
-leaderboard|mmlu:professional_psychology|0|0
-leaderboard|mmlu:public_relations|0|0
-leaderboard|mmlu:security_studies|0|0
-leaderboard|mmlu:sociology|0|0
-leaderboard|mmlu:us_foreign_policy|0|0
-leaderboard|mmlu:virology|0|0
-leaderboard|mmlu:world_religions|0|0
-original|mmlu:abstract_algebra|0|0
-original|mmlu:anatomy|0|0
-original|mmlu:astronomy|0|0
-original|mmlu:business_ethics|0|0
-original|mmlu:clinical_knowledge|0|0
-original|mmlu:college_biology|0|0
-original|mmlu:college_chemistry|0|0
-original|mmlu:college_computer_science|0|0
-original|mmlu:college_mathematics|0|0
-original|mmlu:college_medicine|0|0
-original|mmlu:college_physics|0|0
-original|mmlu:computer_security|0|0
-original|mmlu:conceptual_physics|0|0
-original|mmlu:econometrics|0|0
-original|mmlu:electrical_engineering|0|0
-original|mmlu:elementary_mathematics|0|0
-original|mmlu:formal_logic|0|0
-original|mmlu:global_facts|0|0
-original|mmlu:high_school_biology|0|0
-original|mmlu:high_school_chemistry|0|0
-original|mmlu:high_school_computer_science|0|0
-original|mmlu:high_school_european_history|0|0
-original|mmlu:high_school_geography|0|0
-original|mmlu:high_school_government_and_politics|0|0
-original|mmlu:high_school_macroeconomics|0|0
-original|mmlu:high_school_mathematics|0|0
-original|mmlu:high_school_microeconomics|0|0
-original|mmlu:high_school_physics|0|0
-original|mmlu:high_school_psychology|0|0
-original|mmlu:high_school_statistics|0|0
-original|mmlu:high_school_us_history|0|0
-original|mmlu:high_school_world_history|0|0
-original|mmlu:human_aging|0|0
-original|mmlu:human_sexuality|0|0
-original|mmlu:international_law|0|0
-original|mmlu:jurisprudence|0|0
-original|mmlu:logical_fallacies|0|0
-original|mmlu:machine_learning|0|0
-original|mmlu:management|0|0
-original|mmlu:marketing|0|0
-original|mmlu:medical_genetics|0|0
-original|mmlu:miscellaneous|0|0
-original|mmlu:moral_disputes|0|0
-original|mmlu:moral_scenarios|0|0
-original|mmlu:nutrition|0|0
-original|mmlu:philosophy|0|0
-original|mmlu:prehistory|0|0
-original|mmlu:professional_accounting|0|0
-original|mmlu:professional_law|0|0
-original|mmlu:professional_medicine|0|0
-original|mmlu:professional_psychology|0|0
-original|mmlu:public_relations|0|0
-original|mmlu:security_studies|0|0
-original|mmlu:sociology|0|0
-original|mmlu:us_foreign_policy|0|0
-original|mmlu:virology|0|0
-original|mmlu:world_religions|0|0
-original|mmlu|0|0
+leaderboard|mmlu:abstract_algebra|0
+leaderboard|mmlu:anatomy|0
+leaderboard|mmlu:astronomy|0
+leaderboard|mmlu:business_ethics|0
+leaderboard|mmlu:clinical_knowledge|0
+leaderboard|mmlu:college_biology|0
+leaderboard|mmlu:college_chemistry|0
+leaderboard|mmlu:college_computer_science|0
+leaderboard|mmlu:college_mathematics|0
+leaderboard|mmlu:college_medicine|0
+leaderboard|mmlu:college_physics|0
+leaderboard|mmlu:computer_security|0
+leaderboard|mmlu:conceptual_physics|0
+leaderboard|mmlu:econometrics|0
+leaderboard|mmlu:electrical_engineering|0
+leaderboard|mmlu:elementary_mathematics|0
+leaderboard|mmlu:formal_logic|0
+leaderboard|mmlu:global_facts|0
+leaderboard|mmlu:high_school_biology|0
+leaderboard|mmlu:high_school_chemistry|0
+leaderboard|mmlu:high_school_computer_science|0
+leaderboard|mmlu:high_school_european_history|0
+leaderboard|mmlu:high_school_geography|0
+leaderboard|mmlu:high_school_government_and_politics|0
+leaderboard|mmlu:high_school_macroeconomics|0
+leaderboard|mmlu:high_school_mathematics|0
+leaderboard|mmlu:high_school_microeconomics|0
+leaderboard|mmlu:high_school_physics|0
+leaderboard|mmlu:high_school_psychology|0
+leaderboard|mmlu:high_school_statistics|0
+leaderboard|mmlu:high_school_us_history|0
+leaderboard|mmlu:high_school_world_history|0
+leaderboard|mmlu:human_aging|0
+leaderboard|mmlu:human_sexuality|0
+leaderboard|mmlu:international_law|0
+leaderboard|mmlu:jurisprudence|0
+leaderboard|mmlu:logical_fallacies|0
+leaderboard|mmlu:machine_learning|0
+leaderboard|mmlu:management|0
+leaderboard|mmlu:marketing|0
+leaderboard|mmlu:medical_genetics|0
+leaderboard|mmlu:miscellaneous|0
+leaderboard|mmlu:moral_disputes|0
+leaderboard|mmlu:moral_scenarios|0
+leaderboard|mmlu:nutrition|0
+leaderboard|mmlu:philosophy|0
+leaderboard|mmlu:prehistory|0
+leaderboard|mmlu:professional_accounting|0
+leaderboard|mmlu:professional_law|0
+leaderboard|mmlu:professional_medicine|0
+leaderboard|mmlu:professional_psychology|0
+leaderboard|mmlu:public_relations|0
+leaderboard|mmlu:security_studies|0
+leaderboard|mmlu:sociology|0
+leaderboard|mmlu:us_foreign_policy|0
+leaderboard|mmlu:virology|0
+leaderboard|mmlu:world_religions|0
+original|mmlu:abstract_algebra|0
+original|mmlu:anatomy|0
+original|mmlu:astronomy|0
+original|mmlu:business_ethics|0
+original|mmlu:clinical_knowledge|0
+original|mmlu:college_biology|0
+original|mmlu:college_chemistry|0
+original|mmlu:college_computer_science|0
+original|mmlu:college_mathematics|0
+original|mmlu:college_medicine|0
+original|mmlu:college_physics|0
+original|mmlu:computer_security|0
+original|mmlu:conceptual_physics|0
+original|mmlu:econometrics|0
+original|mmlu:electrical_engineering|0
+original|mmlu:elementary_mathematics|0
+original|mmlu:formal_logic|0
+original|mmlu:global_facts|0
+original|mmlu:high_school_biology|0
+original|mmlu:high_school_chemistry|0
+original|mmlu:high_school_computer_science|0
+original|mmlu:high_school_european_history|0
+original|mmlu:high_school_geography|0
+original|mmlu:high_school_government_and_politics|0
+original|mmlu:high_school_macroeconomics|0
+original|mmlu:high_school_mathematics|0
+original|mmlu:high_school_microeconomics|0
+original|mmlu:high_school_physics|0
+original|mmlu:high_school_psychology|0
+original|mmlu:high_school_statistics|0
+original|mmlu:high_school_us_history|0
+original|mmlu:high_school_world_history|0
+original|mmlu:human_aging|0
+original|mmlu:human_sexuality|0
+original|mmlu:international_law|0
+original|mmlu:jurisprudence|0
+original|mmlu:logical_fallacies|0
+original|mmlu:machine_learning|0
+original|mmlu:management|0
+original|mmlu:marketing|0
+original|mmlu:medical_genetics|0
+original|mmlu:miscellaneous|0
+original|mmlu:moral_disputes|0
+original|mmlu:moral_scenarios|0
+original|mmlu:nutrition|0
+original|mmlu:philosophy|0
+original|mmlu:prehistory|0
+original|mmlu:professional_accounting|0
+original|mmlu:professional_law|0
+original|mmlu:professional_medicine|0
+original|mmlu:professional_psychology|0
+original|mmlu:public_relations|0
+original|mmlu:security_studies|0
+original|mmlu:sociology|0
+original|mmlu:us_foreign_policy|0
+original|mmlu:virology|0
+original|mmlu:world_religions|0
+original|mmlu|0
 # ARC
-leaderboard|arc:challenge|0|0
-lighteval|arc:easy|0|0
-original|arc:c:letters|0|0
-original|arc:c:options|0|0
-original|arc:c:simple|0|0
+leaderboard|arc:challenge|0
+lighteval|arc:easy|0
+original|arc:c:letters|0
+original|arc:c:options|0
+original|arc:c:simple|0
 # HellaSwag
-helm|hellaswag|0|0
-leaderboard|hellaswag|0|0
+helm|hellaswag|0
+leaderboard|hellaswag|0
 # PIQA
-helm|piqa|0|0
-lighteval|piqa|0|0
+helm|piqa|0
+lighteval|piqa|0
 # SIQA
-helm|siqa|0|0
+helm|siqa|0
 # WinoGrande
-leaderboard|winogrande|0|0
+leaderboard|winogrande|0
 # OpenBookQA
-lighteval|openbookqa|0|0
-helm|openbookqa|0|0
+lighteval|openbookqa|0
+helm|openbookqa|0
 # TriviaQA
-lighteval|triviaqa|0|0
+lighteval|triviaqa|0
 # BoolQ
-helm|boolq:contrastset|0|0
-helm|boolq|0|0
+helm|boolq:contrastset|0
+helm|boolq|0
 # QUAC
-helm|quac|0|0
+helm|quac|0
 # GSM8K
-leaderboard|gsm8k|0|0
+leaderboard|gsm8k|0
 # MATH
-lighteval|math:algebra|0|0
-lighteval|math:counting_and_probability|0|0
-lighteval|math:geometry|0|0
-lighteval|math:intermediate_algebra|0|0
-lighteval|math:number_theory|0|0
-lighteval|math:prealgebra|0|0
-lighteval|math:precalculus|0|0
+lighteval|math:algebra|0
+lighteval|math:counting_and_probability|0
+lighteval|math:geometry|0
+lighteval|math:intermediate_algebra|0
+lighteval|math:number_theory|0
+lighteval|math:prealgebra|0
+lighteval|math:precalculus|0
 # To add: NaturalQuestions, BBH, AGIEval
diff --git a/examples/tasks/serbian_task_group/sr_all_exclusive.txt b/examples/tasks/serbian_task_group/sr_all_exclusive.txt
index 7e0ced8eb..308743d46 100644
--- a/examples/tasks/serbian_task_group/sr_all_exclusive.txt
+++ b/examples/tasks/serbian_task_group/sr_all_exclusive.txt
@@ -1,75 +1,75 @@
 # Serbian Evaluations - ARC (AI2 Reasoning Challenge)
-community|serbian_evals:arc_easy|0|0
-community|serbian_evals:arc_challenge|0|0
+community|serbian_evals:arc_easy|0
+community|serbian_evals:arc_challenge|0
 # Commonsense Reasoning
-community|serbian_evals:hellaswag|0|0
-community|serbian_evals:piqa|0|0
-community|serbian_evals:winogrande|0|0
+community|serbian_evals:hellaswag|0
+community|serbian_evals:piqa|0
+community|serbian_evals:winogrande|0
 # Serbian Evaluations - Custom/Other Task
-community|serbian_evals:oz_eval|0|0
+community|serbian_evals:oz_eval|0
 # MMLU (Miscellaneous)
-community|serbian_evals:mmlu_anatomija|0|0
-community|serbian_evals:mmlu_astronomija|0|0
-community|serbian_evals:mmlu_poslovna_etika|0|0
-community|serbian_evals:mmlu_kliničko_znanje|0|0
-community|serbian_evals:mmlu_razno|0|0
-community|serbian_evals:mmlu_elektrotehnika|0|0
+community|serbian_evals:mmlu_anatomija|0
+community|serbian_evals:mmlu_astronomija|0
+community|serbian_evals:mmlu_poslovna_etika|0
+community|serbian_evals:mmlu_kliničko_znanje|0
+community|serbian_evals:mmlu_razno|0
+community|serbian_evals:mmlu_elektrotehnika|0
 # Serbian Evaluations - ARC (AI2 Reasoning Challenge)
-community|serbian_evals:arc_easy|0|0
-community|serbian_evals:arc_challenge|0|0
+community|serbian_evals:arc_easy|0
+community|serbian_evals:arc_challenge|0
 # Commonsense Reasoning
-community|serbian_evals:hellaswag|0|0
-community|serbian_evals:piqa|0|0
-community|serbian_evals:winogrande|0|0
+community|serbian_evals:hellaswag|0
+community|serbian_evals:piqa|0
+community|serbian_evals:winogrande|0
 # Serbian Evaluations - Custom/Other Task
-community|serbian_evals:oz_eval|0|0
+community|serbian_evals:oz_eval|0
 # MMLU (Miscellaneous)
-community|serbian_evals:mmlu_anatomija|0|0
-community|serbian_evals:mmlu_astronomija|0|0
-community|serbian_evals:mmlu_poslovna_etika|0|0
-community|serbian_evals:mmlu_kliničko_znanje|0|0
-community|serbian_evals:mmlu_razno|0|0
-community|serbian_evals:mmlu_elektrotehnika|0|0
+community|serbian_evals:mmlu_anatomija|0
+community|serbian_evals:mmlu_astronomija|0
+community|serbian_evals:mmlu_poslovna_etika|0
+community|serbian_evals:mmlu_kliničko_znanje|0
+community|serbian_evals:mmlu_razno|0
+community|serbian_evals:mmlu_elektrotehnika|0
 # MMLU (Business Professional)
-community|serbian_evals:mmlu_marketing|0|0
-community|serbian_evals:mmlu_manadzment|0|0
+community|serbian_evals:mmlu_marketing|0
+community|serbian_evals:mmlu_manadzment|0
 # MMLU (College Level Tasks)
-community|serbian_evals:mmlu_fakultet_biologija|0|0
-community|serbian_evals:mmlu_fakultet_hemija|0|0
-community|serbian_evals:mmlu_fakultet_racunari|0|0
-community|serbian_evals:mmlu_fakultet_matematika|0|0
-community|serbian_evals:mmlu_fakultet_medicina|0|0
-community|serbian_evals:mmlu_fakultet_fizika|0|0
-community|serbian_evals:mmlu_sigurnost_racunara|0|0
+community|serbian_evals:mmlu_fakultet_biologija|0
+community|serbian_evals:mmlu_fakultet_hemija|0
+community|serbian_evals:mmlu_fakultet_racunari|0
+community|serbian_evals:mmlu_fakultet_matematika|0
+community|serbian_evals:mmlu_fakultet_medicina|0
+community|serbian_evals:mmlu_fakultet_fizika|0
+community|serbian_evals:mmlu_sigurnost_racunara|0
 # MMLU (Ethics, Philosophy)
-community|serbian_evals:mmlu_moralni_sporovi|0|0
-community|serbian_evals:mmlu_moralne_dileme|0|0
-community|serbian_evals:mmlu_filozofija|0|0
-community|serbian_evals:mmlu_svetska_religija|0|0
+community|serbian_evals:mmlu_moralni_sporovi|0
+community|serbian_evals:mmlu_moralne_dileme|0
+community|serbian_evals:mmlu_filozofija|0
+community|serbian_evals:mmlu_svetska_religija|0
 # MMLU (High School Level Tasks)
-community|serbian_evals:mmlu_srednja_skola_biologija|0|0
-community|serbian_evals:mmlu_srednja_skola_hemija|0|0
-community|serbian_evals:mmlu_srednja_skola_racunari|0|0
-community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0|0
-community|serbian_evals:mmlu_srednja_skola_geografija|0|0
-community|serbian_evals:mmlu_srednja_skola_matematika|0|0
-community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0|0
-community|serbian_evals:mmlu_srednja_skola_fizika|0|0
-community|serbian_evals:mmlu_srednja_skola_psihologija|0|0
-community|serbian_evals:mmlu_srednja_skola_statistika|0|0
-community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0|0
+community|serbian_evals:mmlu_srednja_skola_biologija|0
+community|serbian_evals:mmlu_srednja_skola_hemija|0
+community|serbian_evals:mmlu_srednja_skola_racunari|0
+community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0
+community|serbian_evals:mmlu_srednja_skola_geografija|0
+community|serbian_evals:mmlu_srednja_skola_matematika|0
+community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0
+community|serbian_evals:mmlu_srednja_skola_fizika|0
+community|serbian_evals:mmlu_srednja_skola_psihologija|0
+community|serbian_evals:mmlu_srednja_skola_statistika|0
+community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0
 # MMLU (Math, Logic)
-community|serbian_evals:mmlu_abstract_algebra|0|0
-community|serbian_evals:mmlu_osnovna_matematika|0|0
-community|serbian_evals:mmlu_formalna_logika|0|0
-community|serbian_evals:mmlu_konceptualna_fizika|0|0
-community|serbian_evals:mmlu_metrika_ekonomije|0|0
-community|serbian_evals:mmlu_masinsko_ucenje|0|0
+community|serbian_evals:mmlu_abstract_algebra|0
+community|serbian_evals:mmlu_osnovna_matematika|0
+community|serbian_evals:mmlu_formalna_logika|0
+community|serbian_evals:mmlu_konceptualna_fizika|0
+community|serbian_evals:mmlu_metrika_ekonomije|0
+community|serbian_evals:mmlu_masinsko_ucenje|0
 # MMLU (Social Sciences)
-community|serbian_evals:mmlu_globalne_cinjenice|0|0
-community|serbian_evals:mmlu_logicke_zablude|0|0
-community|serbian_evals:mmlu_sociologija|0|0
-community|serbian_evals:mmlu_human_aging|0|0
+community|serbian_evals:mmlu_globalne_cinjenice|0
+community|serbian_evals:mmlu_logicke_zablude|0
+community|serbian_evals:mmlu_sociologija|0
+community|serbian_evals:mmlu_human_aging|0
 # Question Answering and Knowledge
-community|serbian_evals:boolq|0|0
-community|serbian_evals:openbook|0|0
+community|serbian_evals:boolq|0
+community|serbian_evals:openbook|0
diff --git a/examples/tasks/serbian_task_group/sr_all_inclusive.txt b/examples/tasks/serbian_task_group/sr_all_inclusive.txt
index 44e9ad760..659e6a2df 100644
--- a/examples/tasks/serbian_task_group/sr_all_inclusive.txt
+++ b/examples/tasks/serbian_task_group/sr_all_inclusive.txt
@@ -1,2 +1,2 @@
 # MMLU (All-inclusive Task Entry)
-community|serbian_evals:mmlu|0|0
+community|serbian_evals:mmlu|0
diff --git a/examples/tasks/serbian_task_group/sr_arc.txt b/examples/tasks/serbian_task_group/sr_arc.txt
index 3ac8a654f..e66500be1 100644
--- a/examples/tasks/serbian_task_group/sr_arc.txt
+++ b/examples/tasks/serbian_task_group/sr_arc.txt
@@ -1,3 +1,3 @@
 # Serbian Evaluations - ARC (AI2 Reasoning Challenge)
-community|serbian_evals:arc_easy|0|0
-community|serbian_evals:arc_challenge|0|0
+community|serbian_evals:arc_easy|0
+community|serbian_evals:arc_challenge|0
diff --git a/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt b/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt
index 4012f55ce..c93a237ce 100644
--- a/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt
+++ b/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt
@@ -1,4 +1,4 @@
 # Commonsense Reasoning
-community|serbian_evals:hellaswag|0|0
-community|serbian_evals:piqa|0|0
-community|serbian_evals:winogrande|0|0
+community|serbian_evals:hellaswag|0
+community|serbian_evals:piqa|0
+community|serbian_evals:winogrande|0
diff --git a/examples/tasks/serbian_task_group/sr_custom_task.txt b/examples/tasks/serbian_task_group/sr_custom_task.txt
index c3d98830d..284161d4b 100644
--- a/examples/tasks/serbian_task_group/sr_custom_task.txt
+++ b/examples/tasks/serbian_task_group/sr_custom_task.txt
@@ -1,2 +1,2 @@
 # Serbian Evaluations - Custom/Other Task
-community|serbian_evals:oz_eval|0|0
+community|serbian_evals:oz_eval|0
diff --git a/examples/tasks/serbian_task_group/sr_misc.txt b/examples/tasks/serbian_task_group/sr_misc.txt
index adfbefaaf..13628af3e 100644
--- a/examples/tasks/serbian_task_group/sr_misc.txt
+++ b/examples/tasks/serbian_task_group/sr_misc.txt
@@ -1,7 +1,7 @@
 # MMLU (Miscellaneous)
-community|serbian_evals:mmlu_anatomija|0|0
-community|serbian_evals:mmlu_astronomija|0|0
-community|serbian_evals:mmlu_poslovna_etika|0|0
-community|serbian_evals:mmlu_kliničko_znanje|0|0
-community|serbian_evals:mmlu_razno|0|0
-community|serbian_evals:mmlu_elektrotehnika|0|0
+community|serbian_evals:mmlu_anatomija|0
+community|serbian_evals:mmlu_astronomija|0
+community|serbian_evals:mmlu_poslovna_etika|0
+community|serbian_evals:mmlu_kliničko_znanje|0
+community|serbian_evals:mmlu_razno|0
+community|serbian_evals:mmlu_elektrotehnika|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt b/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt
index 1afedb8a9..f091fc15a 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt
@@ -1,3 +1,3 @@
 # MMLU (Business Professional)
-community|serbian_evals:mmlu_marketing|0|0
-community|serbian_evals:mmlu_manadzment|0|0
+community|serbian_evals:mmlu_marketing|0
+community|serbian_evals:mmlu_manadzment|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt b/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt
index 099db7de7..23533d56c 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt
@@ -1,8 +1,8 @@
 # MMLU (College Level Tasks)
-community|serbian_evals:mmlu_fakultet_biologija|0|0
-community|serbian_evals:mmlu_fakultet_hemija|0|0
-community|serbian_evals:mmlu_fakultet_racunari|0|0
-community|serbian_evals:mmlu_fakultet_matematika|0|0
-community|serbian_evals:mmlu_fakultet_medicina|0|0
-community|serbian_evals:mmlu_fakultet_fizika|0|0
-community|serbian_evals:mmlu_sigurnost_racunara|0|0
+community|serbian_evals:mmlu_fakultet_biologija|0
+community|serbian_evals:mmlu_fakultet_hemija|0
+community|serbian_evals:mmlu_fakultet_racunari|0
+community|serbian_evals:mmlu_fakultet_matematika|0
+community|serbian_evals:mmlu_fakultet_medicina|0
+community|serbian_evals:mmlu_fakultet_fizika|0
+community|serbian_evals:mmlu_sigurnost_racunara|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt b/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt
index 91abbd2f5..466b1fc74 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt
@@ -1,5 +1,5 @@
 # MMLU (Ethics, Philosophy)
-community|serbian_evals:mmlu_moralni_sporovi|0|0
-community|serbian_evals:mmlu_moralne_dileme|0|0
-community|serbian_evals:mmlu_filozofija|0|0
-community|serbian_evals:mmlu_svetska_religija|0|0
+community|serbian_evals:mmlu_moralni_sporovi|0
+community|serbian_evals:mmlu_moralne_dileme|0
+community|serbian_evals:mmlu_filozofija|0
+community|serbian_evals:mmlu_svetska_religija|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt b/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt
index 8f11e22a9..407a702c0 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt
@@ -1,12 +1,12 @@
 # MMLU (High School Level Tasks)
-community|serbian_evals:mmlu_srednja_skola_biologija|0|0
-community|serbian_evals:mmlu_srednja_skola_hemija|0|0
-community|serbian_evals:mmlu_srednja_skola_racunari|0|0
-community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0|0
-community|serbian_evals:mmlu_srednja_skola_geografija|0|0
-community|serbian_evals:mmlu_srednja_skola_matematika|0|0
-community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0|0
-community|serbian_evals:mmlu_srednja_skola_fizika|0|0
-community|serbian_evals:mmlu_srednja_skola_psihologija|0|0
-community|serbian_evals:mmlu_srednja_skola_statistika|0|0
-community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0|0
+community|serbian_evals:mmlu_srednja_skola_biologija|0
+community|serbian_evals:mmlu_srednja_skola_hemija|0
+community|serbian_evals:mmlu_srednja_skola_racunari|0
+community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0
+community|serbian_evals:mmlu_srednja_skola_geografija|0
+community|serbian_evals:mmlu_srednja_skola_matematika|0
+community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0
+community|serbian_evals:mmlu_srednja_skola_fizika|0
+community|serbian_evals:mmlu_srednja_skola_psihologija|0
+community|serbian_evals:mmlu_srednja_skola_statistika|0
+community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt b/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt
index 40f6caa5f..c3348e312 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt
@@ -1,7 +1,7 @@
 # MMLU (Math, Logic)
-community|serbian_evals:mmlu_abstract_algebra|0|0
-community|serbian_evals:mmlu_osnovna_matematika|0|0
-community|serbian_evals:mmlu_formalna_logika|0|0
-community|serbian_evals:mmlu_konceptualna_fizika|0|0
-community|serbian_evals:mmlu_metrika_ekonomije|0|0
-community|serbian_evals:mmlu_masinsko_ucenje|0|0
+community|serbian_evals:mmlu_abstract_algebra|0
+community|serbian_evals:mmlu_osnovna_matematika|0
+community|serbian_evals:mmlu_formalna_logika|0
+community|serbian_evals:mmlu_konceptualna_fizika|0
+community|serbian_evals:mmlu_metrika_ekonomije|0
+community|serbian_evals:mmlu_masinsko_ucenje|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt b/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt
index 8ee92e844..1501fc3dd 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt
@@ -1,5 +1,5 @@
 # MMLU (Social Sciences)
-community|serbian_evals:mmlu_globalne_cinjenice|0|0
-community|serbian_evals:mmlu_logicke_zablude|0|0
-community|serbian_evals:mmlu_sociologija|0|0
-community|serbian_evals:mmlu_human_aging|0|0
+community|serbian_evals:mmlu_globalne_cinjenice|0
+community|serbian_evals:mmlu_logicke_zablude|0
+community|serbian_evals:mmlu_sociologija|0
+community|serbian_evals:mmlu_human_aging|0
diff --git a/examples/tasks/serbian_task_group/sr_qa_knowledge.txt b/examples/tasks/serbian_task_group/sr_qa_knowledge.txt
index cdda84ea6..de4b00211 100644
--- a/examples/tasks/serbian_task_group/sr_qa_knowledge.txt
+++ b/examples/tasks/serbian_task_group/sr_qa_knowledge.txt
@@ -1,3 +1,3 @@
 # Question Answering and Knowledge
-community|serbian_evals:boolq|0|0
-community|serbian_evals:openbook|0|0
+community|serbian_evals:boolq|0
+community|serbian_evals:openbook|0
diff --git a/examples/test_tasks.txt b/examples/test_tasks.txt
index 7666c79e4..12c8662a9 100644
--- a/examples/test_tasks.txt
+++ b/examples/test_tasks.txt
@@ -1,27 +1,27 @@
-leaderboard|arc:challenge|25|0
-leaderboard|truthfulqa:mc|0|0
-leaderboard|hellaswag|10|0
-leaderboard|mmlu:college_chemistry|5|0
-leaderboard|mmlu:us_foreign_policy|5|0
-lighteval|agieval:aqua-rat|0|0
-lighteval|agieval:logiqa-en|0|0
-lighteval|agieval:lsat-ar|0|0
-lighteval|agieval:lsat-lr|0|0
-lighteval|agieval:lsat-rc|0|0
-lighteval|agieval:sat-en-without-passage|0|0
-lighteval|agieval:sat-en|0|0
-lighteval|bigbench:causal_judgment|3|0
-lighteval|bigbench:date_understanding|3|0
-lighteval|bigbench:disambiguation_qa|3|0
-lighteval|bigbench:geometric_shapes|3|0
-lighteval|bigbench:logical_deduction_five_objects|3|0
-lighteval|bigbench:logical_deduction_seven_objects|3|0
-lighteval|bigbench:movie_recommendation|3|0
-lighteval|bigbench:navigate|3|0
-lighteval|bigbench:ruin_names|3|0
-lighteval|bigbench:salient_translation_error_detection|3|0
-lighteval|bigbench:snarks|3|0
-lighteval|bigbench:temporal_sequences|3|0
-lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0
-lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0
-test|gsm8k|0|1
+leaderboard|arc:challenge|25
+leaderboard|truthfulqa:mc|0
+leaderboard|hellaswag|10
+leaderboard|mmlu:college_chemistry|5
+leaderboard|mmlu:us_foreign_policy|5
+lighteval|agieval:aqua-rat|0
+lighteval|agieval:logiqa-en|0
+lighteval|agieval:lsat-ar|0
+lighteval|agieval:lsat-lr|0
+lighteval|agieval:lsat-rc|0
+lighteval|agieval:sat-en-without-passage|0
+lighteval|agieval:sat-en|0
+lighteval|bigbench:causal_judgment|3
+lighteval|bigbench:date_understanding|3
+lighteval|bigbench:disambiguation_qa|3
+lighteval|bigbench:geometric_shapes|3
+lighteval|bigbench:logical_deduction_five_objects|3
+lighteval|bigbench:logical_deduction_seven_objects|3
+lighteval|bigbench:movie_recommendation|3
+lighteval|bigbench:navigate|3
+lighteval|bigbench:ruin_names|3
+lighteval|bigbench:salient_translation_error_detection|3
+lighteval|bigbench:snarks|3
+lighteval|bigbench:temporal_sequences|3
+lighteval|bigbench:tracking_shuffled_objects_five_objects|3
+lighteval|bigbench:tracking_shuffled_objects_seven_objects|3
+test|gsm8k|0
diff --git a/pyproject.toml b/pyproject.toml
index 04da22e55..97a1745d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -103,7 +103,7 @@ docs = ["hf-doc-builder", "watchdog"]
 extended_tasks = [
     "langdetect", # ifeval
     "openai>1.87", # llm as a judge using openai models
-    "tiktoken"
+    "tiktoken",
 ]
 s3 = ["s3fs"]
 multilingual = [
diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 5d82d3c38..055670657 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -169,11 +169,6 @@ class CompiledDetail:
         non_truncated (int): Total number of samples which did not need prompt truncation to fit the model context size for the current task.
         padded (int): Total number of samples which needed padding during the batching step for the current task.
         non_padded (int): Total number of samples which did not need padding during the batching step for the current task.
-        effective_few_shots (float): Average effective few shots across all samples for the current task.
-            effective few shot is the number of few shots actually used to fit the prompt in the model context
-            length while allowing model generation of the expected size.
-        num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task.
-
     """

     hashes: dict = field(default_factory=dict)
@@ -181,8 +176,6 @@ class CompiledDetail:
     non_truncated: int = 0
     padded: int = 0
    non_padded: int = 0
-    effective_few_shots: float = 0
-    num_truncated_few_shots: int = 0

 @dataclass
 class CompiledDetailOverAllTasks:
@@ -196,11 +189,6 @@ class CompiledDetailOverAllTasks:
         non_truncated (int): Total number of samples which did not need prompt truncation to fit the model context size across all tasks
         padded (int): Number of samples which needed padding during the batching step across all tasks.
         non_padded (int): Number of samples which did not need padding during the batching step across all tasks.
-        effective_few_shots (float): Average effective few shots across all samples across all tasks.
-            effective few shot is the number of few shots actually used to fit the prompt in the model context
-            length while allowing model generation of the expected size.
-        num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks.
-
     """

     hashes: dict = field(default_factory=dict)
@@ -208,7 +196,6 @@ class CompiledDetailOverAllTasks:
     non_truncated: int = 0
     padded: int = 0
     non_padded: int = 0
-    num_truncated_few_shots: int = 0

 @dataclass
 class Hash:
diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py
index 7d4d34248..2768a8a4d 100644
--- a/src/lighteval/main_baseline.py
+++ b/src/lighteval/main_baseline.py
@@ -51,15 +51,13 @@ def baseline(
     This baseline computation may not be suitable for all task types and should be used with caution.
""" from lighteval.logging.evaluation_tracker import EvaluationTracker - from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig + from lighteval.tasks.lighteval_task import LightevalTask from lighteval.tasks.registry import Registry from lighteval.tasks.requests import SamplingMethod from lighteval.utils.utils import as_list - registry = Registry(custom_tasks=custom_tasks) - - task_configs: list[LightevalTaskConfig] = registry.get_tasks_configs(tasks) - tasks_dict: dict[str, LightevalTask] = registry.get_tasks_from_configs(task_configs) + registry = Registry(tasks=tasks, custom_tasks=custom_tasks) + tasks_dict: dict[str, LightevalTask] = registry.load_tasks() evaluation_tracker = EvaluationTracker( output_dir=output_dir, diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py index 706dd1a06..0f3cd3df1 100644 --- a/src/lighteval/main_tasks.py +++ b/src/lighteval/main_tasks.py @@ -46,13 +46,12 @@ def inspect( from rich import print - from lighteval.tasks.registry import Registry, taskinfo_selector + from lighteval.tasks.registry import Registry - registry = Registry(custom_tasks=custom_tasks) + registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True) # Loading task - task_names_list, _ = taskinfo_selector(tasks, task_registry=registry) - task_dict = registry.get_task_dict(task_names_list) + task_dict = registry.load_tasks() for name, task in task_dict.items(): print("-" * 10, name, "-" * 10) if show_config: @@ -80,7 +79,7 @@ def list( """ from lighteval.tasks.registry import Registry - registry = Registry(custom_tasks=custom_tasks) + registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True) registry.print_all_tasks(suites=suites) diff --git a/src/lighteval/models/transformers/transformers_model.py b/src/lighteval/models/transformers/transformers_model.py index db2b68bd1..9645904fa 100644 --- a/src/lighteval/models/transformers/transformers_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -799,6 +799,8 @@ def _generate_padded( output_logits=returns_logits, renormalize_logits=True, ) + if num_samples == 1 and generation_config["temperature"] == 0: + generation_config["do_sample"] = False if num_samples > 1 and generation_config["temperature"] == 0: logger.warning("num_samples > 1 but temperature is set to 0, this will not sample different outputs.") diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 4cf1dbee2..71b1efd4a 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -39,7 +39,7 @@ from lighteval.models.model_output import ( ModelResponse, ) -from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig +from lighteval.tasks.lighteval_task import LightevalTask from lighteval.tasks.registry import Registry from lighteval.tasks.requests import SamplingMethod from lighteval.utils.imports import ( @@ -241,13 +241,10 @@ def _init_tasks_and_requests(self, tasks: str): logger.info("--- LOADING TASKS ---") # The registry contains all the potential tasks - registry = Registry( - custom_tasks=self.pipeline_parameters.custom_tasks_directory, - ) + registry = Registry(tasks=tasks, custom_tasks=self.pipeline_parameters.custom_tasks_directory) - # load the tasks fro the configs and their datasets - task_configs: list[LightevalTaskConfig] = registry.get_tasks_configs(tasks) - self.tasks_dict: dict[str, LightevalTask] = registry.get_tasks_from_configs(task_configs) + # load 
the tasks from the configs and their datasets + self.tasks_dict: dict[str, LightevalTask] = registry.load_tasks() LightevalTask.load_datasets(self.tasks_dict, self.pipeline_parameters.dataset_loading_processes) self.documents_dict = { task.full_name: task.get_docs(self.pipeline_parameters.max_samples) for _, task in self.tasks_dict.items() diff --git a/src/lighteval/tasks/extended/ifeval/instructions.py b/src/lighteval/tasks/extended/ifeval/instructions.py index ee9e7b88b..6e84b6ef4 100644 --- a/src/lighteval/tasks/extended/ifeval/instructions.py +++ b/src/lighteval/tasks/extended/ifeval/instructions.py @@ -20,7 +20,6 @@ import random import re import string -from typing import Dict, Optional, Sequence, Union import langdetect @@ -29,8 +28,6 @@ logger = logging.getLogger(__name__) -_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] - _LANGUAGES = instructions_util.LANGUAGE_CODES # The relational operation for comparison. diff --git a/src/lighteval/tasks/extended/ifeval/instructions_utils.py b/src/lighteval/tasks/extended/ifeval/instructions_utils.py index 7d995e42f..63e7a9231 100644 --- a/src/lighteval/tasks/extended/ifeval/instructions_utils.py +++ b/src/lighteval/tasks/extended/ifeval/instructions_utils.py @@ -1669,6 +1669,16 @@ def _get_sentence_tokenizer(): return nltk.data.load("nltk:tokenizers/punkt/english.pickle") +def count_stopwords(text): + """Counts the number of stopwords.""" + nltk.download("stopwords") + stopwords = nltk.corpus.stopwords.words("english") + tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") + tokens = tokenizer.tokenize(text) + num_stopwords = len([t for t in tokens if t.lower() in stopwords]) + return num_stopwords + + def count_sentences(text): """Count the number of sentences.""" tokenizer = _get_sentence_tokenizer() diff --git a/src/lighteval/tasks/extended/lcb/main.py b/src/lighteval/tasks/extended/lcb/main.py index ad49235fb..571f24787 100644 --- a/src/lighteval/tasks/extended/lcb/main.py +++ b/src/lighteval/tasks/extended/lcb/main.py @@ -22,11 +22,11 @@ """Usage: lighteval vllm \ "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.6,top_p:0.95}" \ - "extended|lcb:codegeneration|0|0" + "extended|lcb:codegeneration|0" lighteval vllm \ "pretrained=Qwen/Qwen2.5-Coder-3B-Instruct,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.2,top_p:0.95}" \ - "extended|lcb:codegeneration|0|0" + "extended|lcb:codegeneration|0" """ import json diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index bf65ac530..88dcfb95e 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -24,7 +24,7 @@ """ See https://github.com/felipemaiapolo/tinyBenchmarks/ for the original code. 
-Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0|0,extended|tiny:gsm8k|0|0,extended|tiny:hellaswag|0|0,extended|tiny:arc|0|0,extended|tiny:truthfulqa|0|0" --extended_tasks extended_tasks --output_dir "./evals"` +Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0,extended|tiny:gsm8k|0,extended|tiny:hellaswag|0,extended|tiny:arc|0,extended|tiny:truthfulqa|0" --extended_tasks extended_tasks --output_dir "./evals"` """ import os diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index d040925dd..8488755b9 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -100,7 +100,6 @@ class LightevalTaskConfig: must_remove_duplicate_docs: bool = False num_fewshots: int = 0 - truncate_fewshots: bool = False version: int = 0 @@ -114,7 +113,7 @@ def __post_init__(self): self.evaluation_splits = tuple(self.evaluation_splits) self.suite = tuple(self.suite) self.stop_sequence = self.stop_sequence if self.stop_sequence is not None else () - self.full_name = f"{self.name}|{self.num_fewshots}" + self.full_name = f"{self.name}|{self.num_fewshots}" # todo clefourrier: this is likely incorrect def print(self): md_writer = MarkdownTableWriter() diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 0a91c5554..13ba3d00b 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -36,7 +36,12 @@ import lighteval.tasks.default_tasks as default_tasks from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig -from lighteval.utils.imports import CANNOT_USE_EXTENDED_TASKS_MSG, can_load_extended_tasks +from lighteval.utils.imports import ( + CANNOT_USE_EXTENDED_TASKS_MSG, + CANNOT_USE_MULTILINGUAL_TASKS_MSG, + can_load_extended_tasks, + can_load_multilingual_tasks, +) # Import community tasks @@ -104,15 +109,20 @@ def load_community_tasks(): DEFAULT_SUITES = CORE_SUITES + OPTIONAL_SUITES -TRUNCATE_FEW_SHOTS_DEFAULTS = True - class Registry: """ The Registry class is used to manage the task registry and get task classes. """ - def __init__(self, custom_tasks: str | Path | ModuleType | None = None): + def __init__( + self, + tasks: str | Path | None = None, + custom_tasks: str | Path | ModuleType | None = None, + load_community: bool = False, + load_extended: bool = False, + load_multilingual: bool = False, + ): """ Initialize the Registry class. Registry is responsible for holding a dict of task and their config, initializing a LightevalTask instance when asked. @@ -131,51 +141,86 @@ def __init__(self, custom_tasks: str | Path | ModuleType | None = None): """ self._custom_tasks = custom_tasks - def get_tasks_from_configs(self, task_configs: list[LightevalTaskConfig]) -> dict[str, LightevalTask]: - return {f"{config.full_name}": LightevalTask(config=config) for config in task_configs} + if tasks is None: + logger.warning( + "You passed no task name. This should only occur if you are using the CLI to inspect tasks." 
+            )
+            self.tasks_list = []
+        else:
+            self.tasks_list = self._get_full_task_list_from_input_string(tasks)
 
+        # These parameters are dynamically set from the provided task names by `_activate_loading_of_optional_suite`,
+        # except in the `tasks` CLI command, which displays the full list
+        self._load_community = load_community
+        self._load_extended = load_extended
+        self._load_multilingual = load_multilingual
+        self._activate_loading_of_optional_suite()  # we dynamically set the loading parameters
 
-    def get_tasks_configs(self, task: str) -> list[LightevalTaskConfig]:
-        """
-        task is a string of the form "suite|task|few_shot|truncate_few_shots,suite|task|few_shot|truncate_few_shots"
+        # We then load the full task registry
+        self._task_registry = self._load_full_registry()
 
-        returns a LightevalTaskConfig object based on the task name and fewshot and truncate_few_shots values.
-        """
-        task_to_params = self.taskinfo_selector(task)
-        configs = []
+        self.task_to_configs = self._update_task_configs()
 
-        for task_name, task_param in task_to_params.items():
-            # We can have multiple versions of the same task running (for ex, different few shots, different metric params, etc)
-            for subtask_param in task_param:
-                config = self.task_registry.get(task_name)
-                if config is None:
-                    raise ValueError(f"Cannot find task {task_name} in task list or in custom task registry")
+    def _get_full_task_list_from_input_string(self, tasks: str | Path) -> list[str]:
+        """Converts an input string (either a path to a file containing a list of tasks, or a string of comma-separated tasks) into an actual list of task names"""
+        if os.path.exists(tasks):
+            with open(tasks, "r") as f:
+                tasks_list = [line.strip() for line in f if line.strip() and not line.startswith("#")]
+        else:
+            tasks_list = tasks.split(",")
 
-                config = copy.deepcopy(config)
-                config.num_fewshots = subtask_param["fewshots"]
-                config.truncate_fewshots = subtask_param["truncate_fewshots"]
-                config.full_name = f"{task_name}|{config.num_fewshots}"
-                # If some tasks are parametrizable and in cli, we set attributes here
-                for metric in [m for m in config.metrics if "@" in m.metric_name]:  # parametrizable metric
-                    for attribute, value in subtask_param["metric_params"].items():
-                        setattr(metric.sample_level_fn, attribute, value)
-                    required = getattr(metric.sample_level_fn, "attribute_must_be_set", [])
-                    for attribute in required:
-                        if getattr(metric.sample_level_fn, attribute) is None:
-                            raise ValueError(
-                                f"Metric {metric.metric_name} for task {task_name} "
-                                f"was not correctly parametrized. Forgot to set '{attribute}'."
-                            )
 
+        # We might have tasks provided as task groups in the custom tasks
+        # We load the whole task_groups mapping
+        if self._custom_tasks is None:
+            task_groups = {}
+        else:
+            custom_tasks_module = Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks)
+            tasks_group_dict = {}
+            if hasattr(custom_tasks_module, "TASKS_GROUPS"):
+                tasks_group_dict = custom_tasks_module.TASKS_GROUPS
 
-            configs.append(config)
+            # We should allow defining task groups as comma-separated strings or lists of tasks
+            task_groups = {k: v if isinstance(v, list) else v.split(",") for k, v in tasks_group_dict.items()}
 
-        return configs
+        # Then link actual task_group to task list if needed
+        # (At this point the strings are either task name/superset name or group names)
+        expanded_tasks_list: list[str] = []
+        for maybe_task_group in tasks_list:
+            # We either expand the group (in case it's a group name), or we keep it as is (in case it's a task name or superset name)
+            expanded_tasks = task_groups.get(maybe_task_group, [maybe_task_group])
+            if len(expanded_tasks) > 1:
+                logger.info(f"Expanding task group {maybe_task_group} to {expanded_tasks}")
+            expanded_tasks_list.extend(expanded_tasks)
 
-    @property
-    @lru_cache
-    def task_registry(self) -> dict[str, LightevalTaskConfig]:
+        # We remove exact duplicates
+        expanded_tasks_list = list(set(expanded_tasks_list))
+
+        return expanded_tasks_list
+
+    def _activate_loading_of_optional_suite(self) -> None:
+        """Dynamically selects which of the optional suites we want to load."""
+        suites = {task.split("|")[0] for task in self.tasks_list}
+
+        for suite_name in suites:
+            if suite_name not in DEFAULT_SUITES:
+                logger.warning(
+                    f"Suite {suite_name} unknown. This is not normal, unless you are testing the addition of new evaluations."
+                )
+
+        if "extended" in suites:
+            if not can_load_extended_tasks():
+                raise ImportError(CANNOT_USE_EXTENDED_TASKS_MSG)
+            self._load_extended = True
+        if "multilingual" in suites:
+            if not can_load_multilingual_tasks():
+                raise ImportError(CANNOT_USE_MULTILINGUAL_TASKS_MSG)
+            self._load_multilingual = True
+        if "community" in suites:
+            self._load_community = True
+
+    def _load_full_registry(self) -> dict[str, LightevalTaskConfig]:
         """
         Returns:
-            dict[str, LazyLightevalTask]: A dictionary mapping task names (suite|task) to their corresponding LightevalTask classes.
+            dict[str, LightevalTaskConfig]: A dictionary mapping task names (suite|task) to their corresponding LightevalTaskConfig objects.
         Example:
         {
@@ -188,30 +233,27 @@ def task_registry(self) -> dict[str, LightevalTaskConfig]:
         if self._custom_tasks is not None:
             custom_tasks_module.append(Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks))
 
-        if can_load_extended_tasks():
+
+        # Need to load extended tasks
+        if self._load_extended:
             for extended_task_module in AVAILABLE_EXTENDED_TASKS_MODULES:
                 custom_tasks_module.append(extended_task_module)
         else:
             logger.warning(CANNOT_USE_EXTENDED_TASKS_MSG)
 
-        # Load community tasks
-        community_modules = load_community_tasks()
-        for community_task_module in community_modules:
-            custom_tasks_module.append(community_task_module)
+        # Need to load community tasks
+        if self._load_community:
+            community_modules = load_community_tasks()
+            for community_task_module in community_modules:
+                custom_tasks_module.append(community_task_module)
 
-        # Load multilingual tasks
-        MULTILINGUAL_TASKS_AVAILABLE = False
-        multilingual_tasks = None
-        try:
+        # Need to load multilingual tasks
+        if self._load_multilingual:
             import lighteval.tasks.multilingual.tasks as multilingual_tasks
 
-            MULTILINGUAL_TASKS_AVAILABLE = True
-        except ImportError as e:
-            logger.warning(f"Could not load multilingual tasks: {e}. You may need to install additional dependencies.")
-
-        if MULTILINGUAL_TASKS_AVAILABLE and multilingual_tasks is not None:
             custom_tasks_module.append(multilingual_tasks)
 
+        # We load all the gathered task modules
         for module in custom_tasks_module:
             custom_task_configs.extend(module.TASKS_TABLE)
             logger.info(f"Found {len(module.TASKS_TABLE)} custom tasks in {module.__file__}")
@@ -230,84 +272,72 @@ def task_registry(self) -> dict[str, LightevalTaskConfig]:
 
         return {**default_tasks_registry, **custom_tasks_registry}
 
-    def taskinfo_selector(self, tasks: str) -> dict[str, list[dict]]:
+    def _update_task_configs(self) -> dict[str, list[LightevalTaskConfig]]:  # noqa: C901
         """
-        Converts a input string of tasks name to task information usable by lighteval.
-
-        Args:
-            tasks (str): A string containing a comma-separated list of tasks definitions in the
-                format: "task_definition", where it can be
-                containing a list of tasks.
-                where task_definition can be:
-                - path to a file containing a list of tasks (one per line)
-                - task group defined in TASKS_GROUPS dict in custom tasks file
-                - task name with few shot in format "suite|task|few_shot|truncate_few_shots"
-                - task superset in format "suite|task_superset|few_shot|truncate_few_shots" (superset will run all tasks with format "suite|task_superset:{subset}|few_shot|truncate_few_shots")
-
-
-        Returns:
-            tuple[list[str], dict[str, list[tuple[int, bool]]]]: A tuple containing:
-                - A sorted list of unique task names in the format "suite|task".
-                - A dictionary mapping each task name to a list of tuples representing the few_shot and truncate_few_shots values.
+        Updates each config depending on the input tasks (we override all provided params, like the few-shot number, sampling params, etc.)
         """
-        task_to_params = collections.defaultdict(list)
+        task_to_configs = collections.defaultdict(list)
 
-        # We can provide a path to a file with a list of tasks or a string of comma-separated tasks
-        if os.path.exists(tasks):
-            with open(tasks, "r") as f:
-                tasks_list = [line.strip() for line in f if line.strip() and not line.startswith("#")]
-        else:
-            tasks_list = tasks.split(",")
-
-        # At this point the strings are either task name/superset name or group names
-        # Here we deal with group names and map them to corresponding tasks
-        expanded_tasks_list: list[str] = []
-        for maybe_task_group in tasks_list:
-            # We either expand the group (in case it's a group name), or we keep it as is (in case it's a task name or superset name)
-            expanded_tasks = self.task_groups_dict.get(maybe_task_group, [maybe_task_group])
-            if len(expanded_tasks) > 1:
-                logger.info(f"Expanding task group {maybe_task_group} to {expanded_tasks}")
-            expanded_tasks_list.extend(expanded_tasks)
-
-        for task in expanded_tasks_list:
+        # We map all tasks to their parameters
+        for task in self.tasks_list:
             metric_params_dict = {}
             try:
-                suite_name, task_name, few_shot, truncate_few_shots = tuple(task.split("|"))
+                if task.count("|") == 3:
+                    logger.warning(
+                        "Deprecation warning: You provided 4 arguments in your task name, but we no longer support the `truncate_fewshot` option. We will ignore the parameter for now, but it will become an error in a couple of versions, so you should change your task name to `suite|task|num_fewshot`."
+                    )
+                    suite_name, task_name, few_shot, _ = tuple(task.split("|"))
+                else:
+                    suite_name, task_name, few_shot = tuple(task.split("|"))
                 if "@" in task_name:
-                    task_name, metric_params = task_name.split("@")
-                    # We convert k:v,k2:v2 to {"k": "v", "k2": "v2"}, then to correct type
-                    metric_params_dict = dict(item.split("=") for item in metric_params.split(",") if item)
+                    split_task_name = task_name.split("@")
+                    task_name, metric_params = split_task_name[0], split_task_name[1:]
+                    # We convert each k=v to {"k": "v"}, then to the correct type
+                    metric_params_dict = dict(item.split("=") for item in metric_params if item)
+
                     metric_params_dict = {k: ast.literal_eval(v) for k, v in metric_params_dict.items()}
+
                 few_shot = int(few_shot)
-                truncate_few_shots = int(truncate_few_shots)
             except ValueError:
-                raise ValueError(
-                    f"Cannot get task info from {task}. correct format is suite|task|few_shot|truncate_few_shots"
-                )
+                raise ValueError(f"Cannot get task info from {task}. Correct format is suite|task|few_shot")
 
-            if truncate_few_shots not in [0, 1]:
-                raise ValueError(f"TruncateFewShots must be 0 or 1, got {truncate_few_shots}")
+            # This adds support for task supersets (eg: mmlu -> all the mmlu tasks)
+            for expanded_task in self._expand_task_definition(f"{suite_name}|{task_name}"):
+                # todo: we likely want this expansion at the task-list setup step, not here
 
-            truncate_few_shots = bool(truncate_few_shots)
-            few_shot = int(few_shot)
+                # We load each config
+                config = self._task_registry.get(expanded_task)
+                if config is None:
+                    raise ValueError(f"Cannot find task {expanded_task} in task list or in custom task registry")
 
-            if suite_name not in DEFAULT_SUITES:
-                logger.warning(
-                    f"Suite {suite_name} unknown. This is not normal, unless you are testing adding new evaluations."
-                )
+                config = copy.deepcopy(config)
+                config.num_fewshots = few_shot
+                config.full_name = f"{expanded_task}|{config.num_fewshots}"
+                # If some tasks are parametrizable from the CLI, we set the attributes here
+                for metric in [m for m in config.metrics if "@" in m.metric_name]:  # parametrizable metric
+                    for attribute, value in metric_params_dict.items():
+                        setattr(metric.sample_level_fn, attribute, value)
+                    required = getattr(metric.sample_level_fn, "attribute_must_be_set", [])
+                    for attribute in required:
+                        if getattr(metric.sample_level_fn, attribute) is None:
+                            raise ValueError(
+                                f"Metric {metric.metric_name} for task {expanded_task} "
+                                f"was not correctly parametrized. Forgot to set '{attribute}'."
+                            )
 
-            # This adds support for task supersets (eg: mmlu -> all the mmlu tasks)
-            for expanded_task in self.expand_task_definition(f"{suite_name}|{task_name}"):
-                # Store few_shot info for each task name (suite|task)
-                task_to_params[expanded_task].append(
-                    {
-                        "fewshots": few_shot,
-                        "truncate_fewshots": truncate_few_shots,
-                        "metric_params": metric_params_dict,
-                    }
-                )
+                task_to_configs[expanded_task].append(config)
+
+        return task_to_configs
 
-        return task_to_params
+    def load_tasks(self) -> dict[str, LightevalTask]:
+        if len(self.task_to_configs) == 0:  # we're in the CLI to inspect tasks, so we return all tasks
+            return {f"{config.full_name}": LightevalTask(config=config) for config in self._task_registry.values()}
+
+        # We return only the tasks of interest
+        return {
+            f"{config.full_name}": LightevalTask(config=config)
+            for configs in self.task_to_configs.values()
+            for config in configs
+        }
 
     @property
     @lru_cache
@@ -323,34 +353,11 @@ def _task_superset_dict(self):
         """
         # Note: sorted before groupby is important as the python implementation of groupby does not
         # behave like sql groupby. For more info see the docs of itertools.groupby
-        superset_dict = {k: list(v) for k, v in groupby(sorted(self.task_registry.keys()), lambda x: x.split(":")[0])}
+        superset_dict = {k: list(v) for k, v in groupby(sorted(self._task_registry.keys()), lambda x: x.split(":")[0])}
         # Only consider supersets with more than one task
         return {k: v for k, v in superset_dict.items() if len(v) > 1}
 
-    @property
-    @lru_cache
-    def task_groups_dict(self) -> dict[str, list[str]]:
-        """
-        Returns:
-            dict[str, list[str]]: A dictionary where keys are task group names and values are lists of task names (suite|task).
-
-        Example:
-        {
-            "all_custom": ["custom|task1", "custom|task2", "custom|task3"],
-            "group1": ["custom|task1", "custom|task2"],
-        }
-        """
-        if self._custom_tasks is None:
-            return {}
-        custom_tasks_module = Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks)
-        tasks_group_dict = {}
-        if hasattr(custom_tasks_module, "TASKS_GROUPS"):
-            tasks_group_dict = custom_tasks_module.TASKS_GROUPS
-
-        # We should allow defining task groups as comma-separated strings or lists of tasks
-        return {k: v if isinstance(v, list) else v.split(",") for k, v in tasks_group_dict.items()}
-
-    def expand_task_definition(self, task_definition: str):
+    def _expand_task_definition(self, task_definition: str):
         """
         Args:
             task_definition (str): Task definition to expand. In format:
@@ -368,6 +375,55 @@ def expand_task_definition(self, task_definition: str):
 
         # Then it must be a single task
         return [task_definition]
 
+    @staticmethod
+    def create_custom_tasks_module(custom_tasks: str | Path | ModuleType) -> ModuleType:
+        """Creates a custom task module to load tasks defined by the user in their own file.
+
+        Args:
+            custom_tasks (str | Path | ModuleType): Path to the custom tasks file, the name of a module to import containing custom tasks, or the module itself
+
+        Returns:
+            ModuleType: The newly imported/created custom tasks module
+        """
+        if isinstance(custom_tasks, ModuleType):
+            return custom_tasks
+        if isinstance(custom_tasks, (str, Path)) and os.path.exists(custom_tasks):
+            module_name = os.path.splitext(os.path.basename(custom_tasks))[0]
+            spec = importlib.util.spec_from_file_location(module_name, custom_tasks)
+
+            if spec is None:
+                raise ValueError(f"Cannot find module {module_name} at {custom_tasks}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+            return module
+        if isinstance(custom_tasks, (str, Path)):
+            return importlib.import_module(str(custom_tasks))
+
+    @staticmethod
+    def create_task_config_dict(meta_table: list[LightevalTaskConfig] | None = None) -> dict[str, LightevalTaskConfig]:
+        """
+        Create task configurations based on the provided meta_table.
+
+        Args:
+            meta_table: meta_table containing the tasks
+                configurations. If not provided, it will be loaded from TABLE_PATH.
+
+        Returns:
+            Dict[str, LightevalTaskConfig]: A dictionary of task names mapped to their corresponding LightevalTaskConfig.
+        """
+
+        if meta_table is None:
+            meta_table = [config for config in vars(default_tasks).values() if isinstance(config, LightevalTaskConfig)]
+
+        tasks_with_config: dict[str, LightevalTaskConfig] = {}
+        for config in meta_table:
+            for suite in config.suite:
+                if suite in DEFAULT_SUITES:
+                    tasks_with_config[f"{suite}|{config.name}"] = config
+
+        return tasks_with_config
+
     def print_all_tasks(self, suites: str | None = None):
         """
         Print all the tasks in the task registry.
@@ -399,7 +455,7 @@ def print_all_tasks(self, suites: str | None = None):
             requested_suites.remove("multilingual")
 
         # Get all tasks and filter by requested suites
-        all_tasks = list(self.task_registry.keys())
+        all_tasks = list(self._task_registry.keys())
         tasks_names = [task for task in all_tasks if task.split("|")[0] in requested_suites]
 
         # Ensure all requested suites are present (even if empty)
@@ -428,59 +484,3 @@ def print_all_tasks(self, suites: str | None = None):
 
         # Print summary
         total_tasks = len([t for t in tasks_names if t.split("|")[1]])
         print(f"\nTotal tasks displayed: {total_tasks}")
-
-    @staticmethod
-    def create_custom_tasks_module(custom_tasks: str | Path | ModuleType) -> ModuleType:
-        """Creates a custom task module to load tasks defined by the user in their own file.
-
-        Args:
-            custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself
-
-        Returns:
-            ModuleType: The newly imported/created custom tasks modules
-        """
-        if isinstance(custom_tasks, ModuleType):
-            return custom_tasks
-        if isinstance(custom_tasks, (str, Path)) and os.path.exists(custom_tasks):
-            module_name = os.path.splitext(os.path.basename(custom_tasks))[0]
-            spec = importlib.util.spec_from_file_location(module_name, custom_tasks)
-
-            if spec is None:
-                raise ValueError(f"Cannot find module {module_name} at {custom_tasks}")
-
-            module = importlib.util.module_from_spec(spec)
-            spec.loader.exec_module(module)
-            return module
-        if isinstance(custom_tasks, (str, Path)):
-            return importlib.import_module(str(custom_tasks))
-
-    @staticmethod
-    def create_task_config_dict(meta_table: list[LightevalTaskConfig] | None = None) -> dict[str, LightevalTaskConfig]:
-        """
-        Create configuration tasks based on the provided meta_table.
-
-        Args:
-            meta_table: meta_table containing tasks
-                configurations. If not provided, it will be loaded from TABLE_PATH.
-            cache_dir: Directory to store cached data. If not
-                provided, the default cache directory will be used.
-
-        Returns:
-            Dict[str, LightevalTask]: A dictionary of task names mapped to their corresponding LightevalTask classes.
-        """
-
-        if meta_table is None:
-            meta_table = [config for config in vars(default_tasks).values() if isinstance(config, LightevalTaskConfig)]
-
-        tasks_with_config: dict[str, LightevalTaskConfig] = {}
-        for config in meta_table:
-            if not any(suite in config.suite for suite in DEFAULT_SUITES):
-                logger.warning(
-                    f"This evaluation is not in any known suite: {config.name} is in {config.suite}, not in {DEFAULT_SUITES}. Skipping."
-                )
-                continue
-            for suite in config.suite:
-                if suite in DEFAULT_SUITES:
-                    tasks_with_config[f"{suite}|{config.name}"] = config
-
-        return tasks_with_config
diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py
index 8829510b2..30da2adea 100644
--- a/src/lighteval/tasks/requests.py
+++ b/src/lighteval/tasks/requests.py
@@ -101,12 +101,6 @@ class Doc:
         Name of the task or benchmark this Doc belongs to.
 
     ## Few-shot Learning Parameters
-    num_asked_few_shots (int):
-        Number of few-shot examples requested for this instance.
-
-    num_effective_few_shots (int):
-        Actual number of few-shot examples used (may differ from requested).
-
     fewshot_samples (list):
         List of Doc objects representing few-shot examples. These examples
         are prepended to the main query to provide context.
@@ -212,8 +206,6 @@ class Doc:
     task_name: str = ""
 
     # Fewshots parameters
-    num_asked_few_shots: int = 0
-    num_effective_few_shots: int = 0
     fewshot_samples: list = field(default_factory=list)
     sampling_methods: list[SamplingMethod] = field(default_factory=list)
     fewshot_sorting_class: str | None = None  # class to use to select balanced few-shot samples
diff --git a/src/lighteval/utils/imports.py b/src/lighteval/utils/imports.py
index 182027099..2534cb52a 100644
--- a/src/lighteval/utils/imports.py
+++ b/src/lighteval/utils/imports.py
@@ -109,6 +109,18 @@ def can_load_extended_tasks() -> bool:
 CANNOT_USE_EXTENDED_TASKS_MSG = "If you want to use extended_tasks, make sure you installed their dependencies using `pip install -e .[extended_tasks]`."
 
 
+def can_load_multilingual_tasks() -> bool:
+    try:
+        import lighteval.tasks.multilingual.tasks  # noqa: F401
+
+        return True
+    except ImportError:
+        return False
+
+
+CANNOT_USE_MULTILINGUAL_TASKS_MSG = "If you want to use multilingual tasks, make sure you installed their dependencies using `pip install -e .[multilingual]`."
+
+
 def can_load_spacy_tokenizer(language: str) -> bool:
     imports = []
     packages = ["spacy", "stanza"]
diff --git a/tests/logging/test_evaluation_tracker.py b/tests/logging/test_evaluation_tracker.py
index ba4517245..45c5790d0 100644
--- a/tests/logging/test_evaluation_tracker.py
+++ b/tests/logging/test_evaluation_tracker.py
@@ -445,7 +445,6 @@ def test_default_property_with_different_model_configs(self):
                 "non_truncated": 0,
                 "padded": 0,
                 "non_padded": 0,
-                "num_truncated_few_shots": 0,
             },
         )
 
diff --git a/tests/metrics/test_metric_requests.py b/tests/metrics/test_metric_requests.py
index 7ceb94c68..e7f9ee473 100644
--- a/tests/metrics/test_metric_requests.py
+++ b/tests/metrics/test_metric_requests.py
@@ -43,7 +43,7 @@ def dummy_prompt_fc(line, task_name: str = ""):
 
 
 def get_pmi_task(metrics: list[Metric]):
-    return LightevalTaskConfig(
+    config = LightevalTaskConfig(
         name="pmi_test_task",
         metrics=metrics,
         suite=["test"],
@@ -52,6 +52,10 @@ def get_pmi_task(metrics: list[Metric]):
         hf_subset=xstory_cloze_en_lighteval.hf_subset,
         evaluation_splits=xstory_cloze_en_lighteval.evaluation_splits,
     )
+    # The full name is edited manually here when updating the config, and set in the post-init function
+    # - we need a more homogeneous system for naming...
+    config.full_name = "test|pmi_test_task|0"
+    return config
 
 
 def test_pmi_request():
@@ -72,9 +76,10 @@ def test_pmi_request():
     metric = LogLikelihoodAccMetric(normalization=LogProbPMINorm())
     pmi_test_config = get_pmi_task(metrics=[metric])
     task = LightevalTask(pmi_test_config)
-    result = fake_evaluate_task(task, fake_model, max_samples=1)["results"]["test:pmi_test_task:0"]
+    evaluation = fake_evaluate_task(task, fake_model, max_samples=1)
+    results = evaluation["results"]["test:pmi_test_task:0"]
     # Correct choice after norm should be the second one so 0 acc
-    assert result[metric.metric_name] == 0
+    assert results[metric.metric_name] == 0
 
 
 def test_pmi_request_with_logprob_metric():
diff --git a/tests/pipeline/test_reasoning_tags.py b/tests/pipeline/test_reasoning_tags.py
index dd131e838..84dfb9e7e 100644
--- a/tests/pipeline/test_reasoning_tags.py
+++ b/tests/pipeline/test_reasoning_tags.py
@@ -61,11 +61,13 @@ def setUp(self):
             stop_sequence=["\n"],
             num_fewshots=0,
         )
+        self.input_task_name = "test|test_reasoning_task|0"
+        self.task_config_name = self.task_config.full_name
 
         # Create test documents with reasoning tags in expected responses
        self.test_docs = [
             Doc(
-                task_name="test|test_reasoning_task|0",
+                task_name=self.input_task_name,
                 query="What is 2+2?",
                 choices=["4"],
                 gold_index=[0],
@@ -77,7 +79,7 @@ def setUp(self):
         # Mock dataset
         self.mock_dataset = {"test": self.test_docs}
 
-    def _mock_task_registry(self, task_config, task_docs, responses_with_reasoning_tags):
+    def _mock_task_registry(self, input_task_name, task_config, task_docs, responses_with_reasoning_tags):
         """Create a fake registry for testing."""
 
         class FakeTask(LightevalTask):
@@ -93,14 +95,15 @@ def download_dataset_worker(task) -> None:
                 return task._docs
 
         class FakeRegistry(Registry):
-            def __init__(self, custom_tasks: Optional[Union[str, Path, ModuleType]] = None):
-                super().__init__(custom_tasks=custom_tasks)
+            def __init__(
+                self, tasks: Optional[str] = None, custom_tasks: Optional[Union[str, Path, ModuleType]] = None
+            ):
+                self.tasks_list = [input_task_name]
+                # suite_name, task_name, few_shot = input_task_name.split("|")
+                self.task_to_configs = {input_task_name: [task_config]}
 
-            def get_tasks_configs(self, task: str):
-                return [task_config]
-
-            def get_tasks_from_configs(self, tasks_configs):
-                return {f"{task_config.suite[0]}|{task_config.full_name}": FakeTask(task_config)}
+            def load_tasks(self):
+                return {input_task_name: FakeTask(config=task_config)}
 
         # Create a DummyModel that returns responses with reasoning tags
         class TestDummyModel(DummyModel):
@@ -122,7 +125,7 @@ def test_remove_reasoning_tags_enabled(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -144,7 +147,7 @@ def test_remove_reasoning_tags_enabled(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
-            tasks="test|test_reasoning_task|0|0",
+            tasks="test|test_reasoning_task|0",
             pipeline_parameters=pipeline_params,
             evaluation_tracker=evaluation_tracker,
             model=model,
@@ -168,7 +171,7 @@ def test_remove_reasoning_tags_enabled_tags_as_string(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -190,7 +193,7 @@ def test_remove_reasoning_tags_enabled_tags_as_string(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
-            tasks="test|test_reasoning_task|0|0",
+            tasks="test|test_reasoning_task|0",
             pipeline_parameters=pipeline_params,
             evaluation_tracker=evaluation_tracker,
             model=model,
@@ -214,7 +217,7 @@ def test_remove_reasoning_tags_enabled_default_tags(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -233,7 +236,7 @@ def test_remove_reasoning_tags_enabled_default_tags(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
-            tasks="test|test_reasoning_task|0|0",
+            tasks="test|test_reasoning_task|0",
             pipeline_parameters=pipeline_params,
             evaluation_tracker=evaluation_tracker,
             model=model,
@@ -257,7 +260,7 @@ def test_remove_reasoning_tags_disabled(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -279,7 +282,7 @@ def test_remove_reasoning_tags_disabled(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
-            tasks="test|test_reasoning_task|0|0",
+            tasks="test|test_reasoning_task|0",
             pipeline_parameters=pipeline_params,
             evaluation_tracker=evaluation_tracker,
             model=model,
@@ -303,7 +306,7 @@ def test_custom_reasoning_tags(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -325,7 +328,7 @@ def test_custom_reasoning_tags(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
tasks="test|test_reasoning_task|0|0", + tasks="test|test_reasoning_task|0", pipeline_parameters=pipeline_params, evaluation_tracker=evaluation_tracker, model=model, @@ -349,7 +352,7 @@ def test_multiple_reasoning_tags(self): ] FakeRegistry, TestDummyModel = self._mock_task_registry( - self.task_config, self.test_docs, responses_with_reasoning + self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning ) # Initialize accelerator if available @@ -371,7 +374,7 @@ def test_multiple_reasoning_tags(self): model = TestDummyModel(DummyModelConfig(seed=42)) pipeline = Pipeline( - tasks="test|test|test_reasoning_task|0|0", + tasks="test|test|test_reasoning_task|0", pipeline_parameters=pipeline_params, evaluation_tracker=evaluation_tracker, model=model, diff --git a/tests/slow_tests/test_accelerate_vlm_model.py b/tests/slow_tests/test_accelerate_vlm_model.py index f9a8edfdc..57255758f 100644 --- a/tests/slow_tests/test_accelerate_vlm_model.py +++ b/tests/slow_tests/test_accelerate_vlm_model.py @@ -40,7 +40,7 @@ "results_file": "tests/reference_scores/Qwen2.5-VL-3B-Instruct-results-vlm.json", } ] -TASKS = "lighteval|mmmu_pro:standard-4|0|0" +TASKS = "lighteval|mmmu_pro:standard-4|0" ModelInput = Tuple[str, Callable[[], dict]] diff --git a/tests/tasks/test_registry.py b/tests/tasks/test_registry.py index caeb4e787..106708549 100644 --- a/tests/tasks/test_registry.py +++ b/tests/tasks/test_registry.py @@ -39,8 +39,8 @@ ] TASKS_GROUPS = { - "zero_and_one": "custom|test_task_revision|0|0,custom|test_task_revision|1|0", - "all_mmlu": "original|mmlu|3|0", + "zero_and_one": "custom|test_task_revision|0,custom|test_task_revision|1", + "all_mmlu": "original|mmlu|3", } @@ -48,123 +48,122 @@ def test_custom_task_groups(): """ Tests that task info selector correctly handles custom task groups. """ - registry = Registry(custom_tasks="tests.tasks.test_registry") - task_info = registry.taskinfo_selector("zero_and_one") + registry = Registry(tasks="zero_and_one", custom_tasks="tests.tasks.test_registry") - assert set(task_info.keys()) == {"custom|test_task_revision"} - assert task_info["custom|test_task_revision"] == [ - {"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}, - {"fewshots": 1, "truncate_fewshots": False, "metric_params": {}}, - ] + assert set(registry.tasks_list) == {"custom|test_task_revision|0", "custom|test_task_revision|1"} + + assert set(registry.task_to_configs.keys()) == {"custom|test_task_revision"} + + task_info: list[LightevalTaskConfig] = registry.task_to_configs["custom|test_task_revision"] + assert {task_info[0].num_fewshots, task_info[1].num_fewshots} == {0, 1} def test_custom_tasks(): """ Tests that task info selector correctly handles custom tasks. """ - registry = Registry(custom_tasks="tests.tasks.test_registry") - task_info = registry.taskinfo_selector("custom|test_task_revision|0|0") + registry = Registry(tasks="custom|test_task_revision|0", custom_tasks="tests.tasks.test_registry") + + assert registry.tasks_list == ["custom|test_task_revision|0"] + assert set(registry.task_to_configs.keys()) == {"custom|test_task_revision"} - assert list(task_info.keys()) == ["custom|test_task_revision"] - assert task_info["custom|test_task_revision"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}] + task_info: list[LightevalTaskConfig] = registry.task_to_configs["custom|test_task_revision"] + assert task_info[0].num_fewshots == 0 def test_superset_expansion(): """ Tests that task info selector correctly handles supersets. 
""" - registry = Registry() + registry = Registry(tasks="lighteval|storycloze|0") - task_info = registry.taskinfo_selector("lighteval|storycloze|0|0") + # The task list is saved as provided by the user + assert registry.tasks_list == ["lighteval|storycloze|0"] - assert list(task_info.keys()) == ["lighteval|storycloze:2016", "lighteval|storycloze:2018"] - assert task_info["lighteval|storycloze:2016"] == [ - {"fewshots": 0, "truncate_fewshots": False, "metric_params": {}} - ] and task_info["lighteval|storycloze:2018"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}] + # But we expand the superset when loading the configurations + assert set(registry.task_to_configs.keys()) == {"lighteval|storycloze:2016", "lighteval|storycloze:2018"} + + for task_name in {"lighteval|storycloze:2016", "lighteval|storycloze:2018"}: + task_info: list[LightevalTaskConfig] = registry.task_to_configs[task_name] + assert task_info[0].num_fewshots == 0 def test_superset_with_subset_task(): """ Tests that task info selector correctly handles if both superset and one of subset tasks are provided. """ - registry = Registry() - - task_info = registry.taskinfo_selector("original|mmlu|3|0,original|mmlu:abstract_algebra|5|0") + registry = Registry(tasks="original|mmlu|3,original|mmlu:abstract_algebra|5") # We have all mmlu tasks - assert len(task_info.keys()) == 57 - # Since it's defined twice - assert task_info["original|mmlu:abstract_algebra"] == [ - { - "fewshots": 3, - "truncate_fewshots": False, - "metric_params": {}, - }, - {"fewshots": 5, "truncate_fewshots": False, "metric_params": {}}, - ] + assert set(registry.tasks_list) == {"original|mmlu|3", "original|mmlu:abstract_algebra|5"} + assert len(registry.task_to_configs.keys()) == 57 + + task_info: list[LightevalTaskConfig] = registry.task_to_configs["original|mmlu:abstract_algebra"] + assert {task_info[0].num_fewshots, task_info[1].num_fewshots} == {3, 5} def test_cli_sampling_params(): """ Tests task setting the sampling parameters in CLI. """ - registry = Registry() + registry_no_sampling = Registry(tasks="lighteval|math_500|0") - task_info = registry.taskinfo_selector("lighteval|math_500@k=1|0|0") + task_info_no_sampling: list[LightevalTaskConfig] = registry_no_sampling.task_to_configs["lighteval|math_500"] + # Default values + assert task_info_no_sampling[0].metrics[0].sample_level_fn.k == 1 + assert task_info_no_sampling[0].metrics[0].sample_level_fn.n == 1 - assert list(task_info.keys()) == ["lighteval|math_500"] - assert task_info["lighteval|math_500"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {"k": 1}}] + registry = Registry(tasks="lighteval|math_500@k=2@n=10|0") + + task_info: list[LightevalTaskConfig] = registry.task_to_configs["lighteval|math_500"] + assert task_info[0].metrics[0].sample_level_fn.k == 2 + assert task_info[0].metrics[0].sample_level_fn.n == 10 def test_cli_sampling_params_fail(): """ Tests task setting the sampling parameters in CLI failure when args are wrong. """ - registry = Registry() - # creation of object should fail with pytest.raises(ValueError): - registry.get_tasks_configs("lighteval|math_500@plop|0|0") + Registry("lighteval|math_500@plop|0") def test_task_group_expansion_with_subset_expansion(): """ Tests that task info selector correctly handles a group with task superset is provided. 
""" - registry = Registry(custom_tasks="tests.tasks.test_registry") - - task_info = registry.taskinfo_selector("all_mmlu") + registry = Registry(tasks="all_mmlu", custom_tasks="tests.tasks.test_registry") - assert len(task_info.keys()) == 57 + # We have all mmlu tasks + assert len(registry.task_to_configs.keys()) == 57 def test_invalid_task_creation(): """ Tests that tasks info registry correctly raises errors for invalid tasks """ - registry = Registry() with pytest.raises(ValueError): - registry.get_tasks_configs("custom|task_revision") + Registry(tasks="custom|task_revision") def test_task_duplicates(): """ Tests that task info selector correctly handles if duplicate tasks are provided. """ - registry = Registry() - - task_info = registry.taskinfo_selector("custom|test_task_revision|0|0,custom|test_task_revision|0|0") + registry = Registry( + tasks="custom|test_task_revision|0,custom|test_task_revision|0", custom_tasks="tests.tasks.test_registry" + ) - assert list(task_info.keys()) == ["custom|test_task_revision"] + assert list(registry.tasks_list) == ["custom|test_task_revision|0"] def test_task_creation(): """ Tests that tasks registry correctly creates tasks """ - registry = Registry() - task_config = registry.get_tasks_configs("lighteval|storycloze:2016|0|0") - task = registry.get_tasks_from_configs(task_config)["lighteval|storycloze:2016|0"] + registry = Registry(tasks="lighteval|storycloze:2016|0") + task = registry.load_tasks()["lighteval|storycloze:2016|0"] assert isinstance(task, LightevalTask) assert task.name == "storycloze:2016" diff --git a/tests/utils.py b/tests/utils.py index 67714bceb..b44d27551 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -98,6 +98,7 @@ def fake_evaluate_task( # Mock the Registry.get_task_dict method task_name = f"{lighteval_task.suite[0]}|{lighteval_task.name}" + task_name_fs = f"{lighteval_task.suite[0]}|{lighteval_task.name}|{n_fewshot}" task_dict = {task_name: lighteval_task} evaluation_tracker = EvaluationTracker(output_dir="outputs") @@ -105,18 +106,12 @@ def fake_evaluate_task( # Create a mock Registry class class FakeRegistry(Registry): - def __init__(self, custom_tasks: Optional[Union[str, Path, ModuleType]] = None): - super().__init__(custom_tasks=custom_tasks) - - def get_task_dict(self, task_names: list[str]): - return task_dict - - def get_tasks_configs(self, task: str): - config = lighteval_task.config - config.num_fewshots = n_fewshot - config.truncate_fewshots = False - config.full_name = f"{task_name}|{config.num_fewshots}" - return [config] + def __init__(self, tasks: Optional[str], custom_tasks: Optional[Union[str, Path, ModuleType]] = None): + self.tasks_list = [task_name_fs] + self.task_to_configs = {task_name_fs: [lighteval_task.config]} + + def load_tasks(self): + return {task_name_fs: lighteval_task} # This is due to logger complaining we have no initialised the accelerator # It's hard to mock as it's global singleton