diff --git a/README.md b/README.md
index 44af6a8a5..47f4c07fd 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,7 @@ Here’s a quick command to evaluate using the Accelerate backend:
 ```shell
 lighteval accelerate \
     "model_name=gpt2" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 ## 🙏 Acknowledgements
diff --git a/community_tasks/custom_task_classification_grammar_task.py b/community_tasks/custom_task_classification_grammar_task.py
index f513cf0bf..5b248093b 100644
--- a/community_tasks/custom_task_classification_grammar_task.py
+++ b/community_tasks/custom_task_classification_grammar_task.py
@@ -32,7 +32,7 @@ Example usage:
 
     TGI endpoint evaluation:
     ```bash
-    uv run --active --extra litellm --extra tgi lighteval endpoint tgi examples/model_configs/tgi_model.yaml "custom|emotion_classification|0|0"
+    uv run --active --extra litellm --extra tgi lighteval endpoint tgi examples/model_configs/tgi_model.yaml "custom|emotion_classification|0"
     --custom-tasks examples/custom_tasks_templates/custom_task_classification_grammar_task.py
     --output-dir results
     --save-details
@@ -449,8 +449,8 @@ def get_emotion_classification_grammar() -> TextGenerationInputGrammarType:
 
     print("\nUsage Examples:")
     print(
-        f"  TGI: uv run lighteval endpoint tgi config/tgi/tgi.yaml 'custom|{task.name}|0|0' --custom-tasks {__file__} --output-dir results --override-batch-size 1 --use-chat-template --save-details --no-public-run --max-samples 10"
+        f"  TGI: uv run lighteval endpoint tgi config/tgi/tgi.yaml 'custom|{task.name}|0' --custom-tasks {__file__} --output-dir results --override-batch-size 1 --use-chat-template --save-details --no-public-run --max-samples 10"
     )
     print(
-        f"  Full: uv run lighteval endpoint tgi config/tgi/tgi.yaml 'custom|{task.name}|5|1' --custom-tasks {__file__} --output-dir results --override-batch-size 1 --use-chat-template --save-details --no-public-run"
+        f"  Full: uv run lighteval endpoint tgi config/tgi/tgi.yaml 'custom|{task.name}|5' --custom-tasks {__file__} --output-dir results --override-batch-size 1 --use-chat-template --save-details --no-public-run"
     )
diff --git a/community_tasks/filipino_evals.py b/community_tasks/filipino_evals.py
index 1ce362c1d..45011535e 100644
--- a/community_tasks/filipino_evals.py
+++ b/community_tasks/filipino_evals.py
@@ -42,7 +42,7 @@
 from langcodes import Language as LangCodeLanguage
 from langcodes import standardize_tag
 
-from lighteval.metrics.dynamic_metrics import loglikelihood_acc_metric
+from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
 from lighteval.metrics.metrics import Metrics
 from lighteval.metrics.normalizations import (
     LogProbCharNorm,
@@ -87,8 +87,8 @@
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
             ],
         ),
     )
@@ -117,8 +117,8 @@
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
             ],
         ),
     )
@@ -154,9 +154,9 @@
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         version=0,
@@ -191,9 +191,9 @@
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test"],
@@ -275,9 +275,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["train", "test"],
@@ -327,9 +327,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
     )
@@ -360,9 +360,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test"],
@@ -396,9 +396,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         hf_subset="default",
         evaluation_splits=["tl"],
         metrics=[
-            loglikelihood_acc_metric(normalization=None),
-            loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-            loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+            LogLikelihoodAccMetric(normalization=None),
+            LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+            LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
         ],
     )
     for formulation in [HybridFormulation(), MCFFormulation()]
@@ -427,9 +427,9 @@ def filipino_dengue_pfn(line, task_name: str) -> Doc:
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=None),
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=None),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
             ],
         ),
         trust_dataset=True,
@@ -509,9 +509,9 @@ def create_sib200_task(language: Language, formulation):
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test", "validation"],
@@ -565,9 +565,9 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test"],
@@ -595,9 +595,9 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         hf_avail_splits=["test"],
@@ -718,9 +718,9 @@ def prepare_stingray_semantic_appropriateness(line: dict[str, str]) -> dict[str,
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
            ],
         ),
         version=0,
@@ -762,9 +762,9 @@ def create_universalner_task(language: Language, formulation):
         metrics=get_metrics_for_formulation(
             formulation,
             [
-                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
-                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
-                loglikelihood_acc_metric(normalization=LogProbPMINorm()),
+                LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
+                LogLikelihoodAccMetric(normalization=LogProbPMINorm()),
             ],
         ),
         version=0,
diff --git a/community_tasks/serbian_eval.py b/community_tasks/serbian_eval.py
index 38e8b257e..d6d86ab00 100644
--- a/community_tasks/serbian_eval.py
+++ b/community_tasks/serbian_eval.py
@@ -298,7 +298,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.ARC_EASY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 arc_challenge = create_task_config(
@@ -306,7 +306,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.ARC_CHALLENGE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -318,14 +318,14 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.HELLASWAG.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 piqa = create_task_config(
     task_name="serbian_evals:piqa",
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.PIQA.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 winogrande = create_task_config(
@@ -333,7 +333,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.WINOGRANDE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -357,7 +357,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ANATOMY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_astronomy = create_task_config(
@@ -365,7 +365,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ASTRONOMY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_business_ethics = create_task_config(
@@ -373,7 +373,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_BUSINESS_ETHICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_clinical_knowledge = create_task_config(
@@ -381,7 +381,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_CLINICAL_KNOWLEDGE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_miscellaneous = create_task_config(
@@ -389,7 +389,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MISCELLANEOUS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_electrical_engineering = create_task_config(
@@ -397,7 +397,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ELECTRONIC_ENGINEERING.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -409,7 +409,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_SERBIAN_ALL.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -421,7 +421,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MARKETING.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_management = create_task_config(
@@ -429,7 +429,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MANAGEMENT.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -441,7 +441,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_BIOLOGY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_chemistry = create_task_config(
@@ -449,7 +449,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_CHEMISTRY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_computer_science = create_task_config(
@@ -457,7 +457,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_COMPUTER_SCIENCE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_mathematics = create_task_config(
@@ -465,7 +465,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_MATHEMATICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_medicine = create_task_config(
@@ -473,7 +473,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_MEDICINE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_college_physics = create_task_config(
@@ -481,7 +481,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_PHYSICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_computer_security = create_task_config(
@@ -489,7 +489,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_COLLEGE_COMPUTER_SECURITY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -501,7 +501,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MORAL_DISPUTES.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_moral_scenarios = create_task_config(
@@ -509,7 +509,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MORAL_SCENARIOS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_philosophy = create_task_config(
@@ -517,7 +517,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_PHILOSOPHY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_world_religions = create_task_config(
@@ -525,7 +525,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_WORLD_RELIGIONS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -537,7 +537,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_BIOLOGY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_chemistry = create_task_config(
@@ -545,7 +545,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_CHEMISTRY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_computer_science = create_task_config(
@@ -553,7 +553,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_COMPUTER_SCIENCE.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_european_history = create_task_config(
@@ -561,7 +561,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_EURO_HISTORY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_geography = create_task_config(
@@ -569,7 +569,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_GEOGRAPHY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_mathematics = create_task_config(
@@ -577,7 +577,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_MATHEMATICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_microeconomics = create_task_config(
@@ -585,7 +585,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_MICROECONOMICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_physics = create_task_config(
@@ -593,7 +593,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_PHYSICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_psychology = create_task_config(
@@ -601,7 +601,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_PSYCHOLOGY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_statistics = create_task_config(
@@ -609,7 +609,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_STATISTICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_high_school_world_history = create_task_config(
@@ -617,7 +617,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HIGH_SCHOOL_WORLD_HISTORY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -629,7 +629,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ABSTRACT_ALGEBRA.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_elementary_mathematics = create_task_config(
@@ -637,7 +637,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ELEMENTARY_MATHEMATICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_formal_logic = create_task_config(
@@ -645,7 +645,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_FORMAL_LOGIC.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_conceptual_physics = create_task_config(
@@ -653,7 +653,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_CONCEPTUAL_PHYSICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_econometrics = create_task_config(
@@ -661,7 +661,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_ECONOMETRICS.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_machine_learning = create_task_config(
@@ -669,7 +669,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_MACHINE_LEARNING.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -681,7 +681,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_GLOBAL_FACT.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_logical_fallacies = create_task_config(
@@ -689,7 +689,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_LOGICAL_FALLACIES.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_sociology = create_task_config(
@@ -697,7 +697,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_SOCIOLOGY.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 mmlu_human_aging = create_task_config(
@@ -705,7 +705,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.MMLU_HUMAN_AGING.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 # ============================================
@@ -717,7 +717,7 @@ def create_task_config(
     prompt_function=boolq_serbian,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.BOOLQ.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
 openbook_qa = create_task_config(
@@ -725,7 +725,7 @@ def create_task_config(
     prompt_function=serbian_eval_prompt,
     hf_repo=HFSubsets.HF_BASE_REPO.value,
     hf_subset=HFSubsets.OPENBOOK.value,
-    metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+    metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
 )
 
diff --git a/community_tasks/turkic_evals.py b/community_tasks/turkic_evals.py
index 9eae65d5b..242b25f81 100644
--- a/community_tasks/turkic_evals.py
+++ b/community_tasks/turkic_evals.py
@@ -123,7 +123,7 @@ def __init__(
             hf_subset=hf_subset,
             prompt_function=partial(tumlu_pfn, language=hf_subset),
             hf_repo="jafarisbarov/TUMLU-mini",
-            metric=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
+            metrics=[Metrics.loglikelihood_acc(sample_params={"logprob_normalization": LogProbCharNorm()})],
             hf_avail_splits=["test", "dev"],
             evaluation_splits=["test"],
             few_shots_split=["dev"],
diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx
index ae4076203..448941aa3 100644
--- a/docs/source/adding-a-custom-task.mdx
+++ b/docs/source/adding-a-custom-task.mdx
@@ -126,6 +126,6 @@ Once your file is created you can then run the evaluation with the following com
 ```bash
 lighteval accelerate \
     "model_name=HuggingFaceH4/zephyr-7b-beta" \
-    "community|{custom_task}|{fewshots}|{truncate_few_shot}" \
+    "community|{custom_task}|{fewshots}" \
     --custom-tasks {path_to_your_custom_task_file}
 ```
diff --git a/docs/source/evaluating-a-custom-model.mdx b/docs/source/evaluating-a-custom-model.mdx
index 1b055dedd..97a30aa53 100644
--- a/docs/source/evaluating-a-custom-model.mdx
+++ b/docs/source/evaluating-a-custom-model.mdx
@@ -56,7 +56,7 @@ You can evaluate your custom model using either the command line interface or th
 lighteval custom \
     "google-translate" \
     "examples/custom_models/google_translate_model.py" \
-    "lighteval|wmt20:fr-de|0|0" \
+    "lighteval|wmt20:fr-de|0" \
     --max-samples 10
 ```
 
@@ -91,7 +91,7 @@ model_config = CustomModelConfig(
 
 # Create and run the pipeline
 pipeline = Pipeline(
-    tasks="leaderboard|truthfulqa:mc|0|0",
+    tasks="leaderboard|truthfulqa:mc|0",
     pipeline_parameters=pipeline_params,
     evaluation_tracker=evaluation_tracker,
     model_config=model_config
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index de2059f49..682a08cda 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -27,7 +27,7 @@ To evaluate `GPT-2` on the Truthful QA benchmark with [🤗
 ```bash
 lighteval accelerate \
     "model_name=openai-community/gpt2" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 Here, we first choose a backend (either `accelerate`, `nanotron`, `endpoint`, or `vllm`), and then specify the model and task(s) to
 run.
@@ -38,12 +38,9 @@ Valid key-value pairs correspond with the backend configuration, and are detaile
 The syntax for the task specification might be a bit hard to grasp at first. The format is as follows:
 
 ```txt
-{suite}|{task}|{num_few_shot}|{0 for strict `num_few_shots`, or 1 to allow a truncation if context size is too small}
+{suite}|{task}|{num_few_shot}
 ```
 
-If the fourth value is set to 1, lighteval will check if the prompt (including the few-shot examples) is too long for the context size of the task or the model.
-If so, the number of few shot examples is automatically reduced.
-
 Tasks have a function applied at the sample level and one at the corpus level. For example,
 - an exact match can be applied per sample, then averaged over the corpus to give the final score
 - samples can be left untouched before applying Corpus BLEU at the corpus level
 etc.
 
 If the task you are looking at has a sample level function (`sample_level_fn`) which can be parametrized, you can pass parameters in the CLI. For example
 
 ```txt
-{suite}|{task}@{parameter_name1}={value1},{parameter_name2}={value2},...|0|0
+{suite}|{task}@{parameter_name1}={value1}@{parameter_name2}={value2},...|0
 ```
 
 All officially supported tasks can be found at the [tasks_list](available-tasks) and in the
@@ -71,7 +68,7 @@ When specifying a path to file, it should start with `./`.
 lighteval accelerate \
     "model_name=openai-community/gpt2" \
     ./path/to/lighteval/examples/tasks/recommended_set.txt
-# or, e.g., "leaderboard|truthfulqa:mc|0|0|,leaderboard|gsm8k|3|1"
+# or, e.g., "leaderboard|truthfulqa:mc|0,leaderboard|gsm8k|3"
 ```
 
 ## Evaluate a model on one or more GPUs
@@ -90,7 +87,7 @@ You can then evaluate a model using data parallelism on 8 GPUs like follows:
 ```bash
 accelerate launch --multi_gpu --num_processes=8 -m \
     lighteval accelerate \
     "model_name=openai-community/gpt2" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 Here, `--override_batch_size` defines the batch size per device, so the effective
@@ -103,7 +100,7 @@ To evaluate a model using pipeline parallelism on 2 or more GPUs, run:
 ```bash
 lighteval accelerate \
     "model_name=openai-community/gpt2,model_parallel=True" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 This will automatically use accelerate to distribute the model across the GPUs.
@@ -134,7 +131,7 @@ think tokens.
 ```bash
 lighteval vllm \
     "model_name=mistralai/Magistral-Small-2507,dtype=float16,data_parallel_size=4" \
-    "lighteval|aime24|0|0" \
+    "lighteval|aime24|0" \
     --remove-reasoning-tags \
     --reasoning-tags="[('[THINK]','[/THINK]')]"
 ```
diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx
index 4a6f379a1..a8a3fb2bc 100644
--- a/docs/source/saving-and-reading-results.mdx
+++ b/docs/source/saving-and-reading-results.mdx
@@ -203,12 +203,9 @@ The detail file contains the following columns:
         "hash_input_tokens": "29916e7afe5cb51d",
         "hash_cont_tokens": "37f91ce23ef6d435"
       },
-      "truncated": 2,
-      "non_truncated": 0,
       "padded": 0,
       "non_padded": 2,
-      "effective_few_shots": 0.0,
-      "num_truncated_few_shots": 0
+      "effective_few_shots": 0.0
     }
   },
   "summary_general": {
@@ -218,11 +215,8 @@ The detail file contains the following columns:
     "hash_input_tokens": "ac933feb14f96d7b",
     "hash_cont_tokens": "9d03fb26f8da7277"
   },
-  "truncated": 2,
-  "non_truncated": 0,
   "padded": 0,
-  "non_padded": 2,
-  "num_truncated_few_shots": 0
+  "non_padded": 2
   }
 }
 ```
diff --git a/docs/source/use-inference-providers-as-backend.mdx b/docs/source/use-inference-providers-as-backend.mdx
index 1e49e4931..70b436a8a 100644
--- a/docs/source/use-inference-providers-as-backend.mdx
+++ b/docs/source/use-inference-providers-as-backend.mdx
@@ -12,7 +12,7 @@ Lighteval allows to use Hugging Face's Inference Providers to evaluate llms on s
 ```bash
 lighteval endpoint inference-providers \
     "model_name=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
-    "lighteval|gsm8k|0|0"
+    "lighteval|gsm8k|0"
 ```
 
 ## Using a config file
@@ -22,7 +22,7 @@ You can use config files to define the model and the provider to use.
 ```bash
 lighteval endpoint inference-providers \
     examples/model_configs/inference_providers.yaml \
-    "lighteval|gsm8k|0|0"
+    "lighteval|gsm8k|0"
 ```
 
 with the following config file:
diff --git a/docs/source/use-litellm-as-backend.mdx b/docs/source/use-litellm-as-backend.mdx
index 1bcbae6bf..36ecf841d 100644
--- a/docs/source/use-litellm-as-backend.mdx
+++ b/docs/source/use-litellm-as-backend.mdx
@@ -11,7 +11,7 @@ Documentation for available APIs and compatible endpoints can be found [here](ht
 ```bash
 lighteval endpoint litellm \
     "provider=openai,model_name=gpt-3.5-turbo" \
-    "lighteval|gsm8k|0|0" \
+    "lighteval|gsm8k|0"
 ```
 
 ## Using a config file
diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/use-sglang-as-backend.mdx
index cfa4352eb..a13c1b82a 100644
--- a/docs/source/use-sglang-as-backend.mdx
+++ b/docs/source/use-sglang-as-backend.mdx
@@ -6,7 +6,7 @@ To use, simply change the `model_args` to reflect the arguments you want to pass
 ```bash
 lighteval sglang \
     "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 `sglang` is able to distribute the model across multiple GPUs using data
@@ -18,7 +18,7 @@ For example if you have 4 GPUs you can split it across using `tp_size`:
 ```bash
 lighteval sglang \
     "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tp_size=4" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 Or, if your model fits on a single GPU, you can use `dp_size` to speed up the evaluation:
@@ -26,7 +26,7 @@ Or, if your model fits on a single GPU, you can use `dp_size` to speed up the ev
 ```bash
 lighteval sglang \
     "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,dp_size=4" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 ## Use a config file
@@ -37,7 +37,7 @@ An example of a config file is shown below and can be found at `examples/model_c
 ```bash
 lighteval sglang \
     "examples/model_configs/sglang_model_config.yaml" \
-    "leaderboard|truthfulqa:mc|0|0"
+    "leaderboard|truthfulqa:mc|0"
 ```
 
 > [!TIP]
diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx
index 3a1dbfded..05ec1edde 100644
--- a/docs/source/use-vllm-as-backend.mdx
+++ b/docs/source/use-vllm-as-backend.mdx
@@ -10,7 +10,7 @@ To use, simply change the `model_args` to reflect the arguments you want to pass
 ```bash
 lighteval vllm \
     "model_name=HuggingFaceH4/zephyr-7b-beta" \
-    "extended|ifeval|0|0"
+    "extended|ifeval|0"
 ```
 
 `vllm` is able to distribute the model across multiple GPUs using data
@@ -22,7 +22,7 @@ For example if you have 4 GPUs you can split it across using `tensor_parallelism
 ```bash
 export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \
     "model_name=HuggingFaceH4/zephyr-7b-beta,tensor_parallel_size=4" \
-    "extended|ifeval|0|0"
+    "extended|ifeval|0"
 ```
 
 Or, if your model fits on a single GPU, you can use `data_parallelism` to speed up the evaluation:
@@ -30,7 +30,7 @@ Or, if your model fits on a single GPU, you can use `data_parallelism` to speed
 ```bash
 export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \
     "model_name=HuggingFaceH4/zephyr-7b-beta,data_parallel_size=4" \
-    "extended|ifeval|0|0"
+    "extended|ifeval|0"
 ```
 
 ## Use a config file
@@ -41,7 +41,7 @@ An example of a config file is shown below and can be found at `examples/model_c
 ```bash
 lighteval vllm \
     "examples/model_configs/vllm_model_config.yaml" \
-    "extended|ifeval|0|0"
+    "extended|ifeval|0"
 ```
 
 ```yaml
diff --git a/docs/source/using-the-python-api.mdx b/docs/source/using-the-python-api.mdx
index 81fb3e4d9..28927d8de 100644
--- a/docs/source/using-the-python-api.mdx
+++ b/docs/source/using-the-python-api.mdx
@@ -41,7 +41,7 @@ def main():
         dtype="float16",
     )
 
-    task = "helm|mmlu|5|1"
+    task = "helm|mmlu|5"
 
     pipeline = Pipeline(
         tasks=task,
diff --git a/examples/custom_models/local_mt_model.py b/examples/custom_models/local_mt_model.py
index 18b604a5e..5d74aa78c 100644
--- a/examples/custom_models/local_mt_model.py
+++ b/examples/custom_models/local_mt_model.py
@@ -69,7 +69,7 @@ class LocalMTClient(LightevalModel):
     where src and tgt are ISO language codes (2 or 3 letter codes supported).
 
     Example:
-    ```lighteval custom facebook/seamless-m4t-v2-large examples/custom_models/local_mt_model.py "lighteval|wmt20:fr-de|0|0" --max-samples 10 --save-details
+    ```lighteval custom facebook/seamless-m4t-v2-large examples/custom_models/local_mt_model.py "lighteval|wmt20:fr-de|0" --max-samples 10 --save-details
     ```
 
     Note:
diff --git a/examples/nanotron/custom_evaluation_tasks.py b/examples/nanotron/custom_evaluation_tasks.py
index 8430bee08..c0e166116 100644
--- a/examples/nanotron/custom_evaluation_tasks.py
+++ b/examples/nanotron/custom_evaluation_tasks.py
@@ -197,7 +197,7 @@ def preprocess(text):
 
 
 # 0 short for common sense
-COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0|1") for t in COMMON_SENSE_REASONING_TASKS]
+COMMON_SENSE_REASONING_STRING = [(t, f"custom|{t.name}|0") for t in COMMON_SENSE_REASONING_TASKS]
 _TASKS_STRINGS.extend(COMMON_SENSE_REASONING_STRING)
 _TASKS += COMMON_SENSE_REASONING_TASKS
 
@@ -239,8 +239,8 @@ def natural_questions_prompt(line, task_name: str = None):
 ]
 
 
-WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5|1") for t in WORLD_KNOWLEDGE_TASKS]
-# WORLD_KNOWLEDGE_STRING = {t: f'custom|{t.name}|0|1' for t in WORLD_KNOWLEDGE_TASKS}
+WORLD_KNOWLEDGE_STRING = [(t, f"custom|{t.name}|5") for t in WORLD_KNOWLEDGE_TASKS]
+# WORLD_KNOWLEDGE_STRING = {t: f'custom|{t.name}|0' for t in WORLD_KNOWLEDGE_TASKS}
 _TASKS_STRINGS.extend(WORLD_KNOWLEDGE_STRING)
 _TASKS += WORLD_KNOWLEDGE_TASKS
 
@@ -278,7 +278,7 @@ def boolq_prompt(line, task_name: str = None):
 ]
 
 
-READING_COMP_STRING = [(t, f"custom|{t.name}|0|1") for t in READING_COMP_TASKS]
+READING_COMP_STRING = [(t, f"custom|{t.name}|0") for t in READING_COMP_TASKS]
 _TASKS_STRINGS.extend(READING_COMP_STRING)
 _TASKS += READING_COMP_TASKS
 
@@ -342,8 +342,8 @@ def __init__(
     )
 
 
-MATH_STRING = [(t, f"custom|{t.name}|4|1") for t in MATH_TASKS]
-GSM8K_STRING = [(GSM8K, f"custom|{GSM8K.name}|8|1")]
+MATH_STRING = [(t, f"custom|{t.name}|4") for t in MATH_TASKS]
+GSM8K_STRING = [(GSM8K, f"custom|{GSM8K.name}|8")]
 _TASKS_STRINGS.extend(MATH_STRING)
 _TASKS_STRINGS.extend(GSM8K_STRING)
 _TASKS += MATH_TASKS + [GSM8K]
@@ -484,8 +484,8 @@ def __init__(
 ]
 
 
-# MMLU_STRING = {t: f'custom|{t.name}|5|1' for t in MMLU_TASKS}
-MMLU_STRING = [(t, f"custom|{t.name}|0|1") for t in MMLU_TASKS]
+# MMLU_STRING = {t: f'custom|{t.name}|5' for t in MMLU_TASKS}
+MMLU_STRING = [(t, f"custom|{t.name}|0") for t in MMLU_TASKS]
 _TASKS_STRINGS.extend(MMLU_STRING)
 _TASKS += MMLU_TASKS
 
@@ -571,8 +571,8 @@ def __init__(
 ]
 
 
-# BBH_STRING = {t: f'custom|{t.name}|3|1' for t in BBH_TASKS}
-BBH_STRING = [(t, f"custom|{t.name}|0|1") for t in BBH_TASKS]
+# BBH_STRING = {t: f'custom|{t.name}|3' for t in BBH_TASKS}
+BBH_STRING = [(t, f"custom|{t.name}|0") for t in BBH_TASKS]
 _TASKS_STRINGS.extend(BBH_STRING)
 _TASKS += BBH_TASKS
 
@@ -687,8 +687,8 @@ def __init__(
 ]
 
 
-# AGIEVAL_STRING = {t: f'custom|{t.name}|5|1' for t in AGIEVAL_TASKS}
-AGIEVAL_STRING = [(t, f"custom|{t.name}|0|1") for t in AGIEVAL_TASKS]
+# AGIEVAL_STRING = {t: f'custom|{t.name}|5' for t in AGIEVAL_TASKS}
+AGIEVAL_STRING = [(t, f"custom|{t.name}|0") for t in AGIEVAL_TASKS]
 _TASKS_STRINGS.extend(AGIEVAL_STRING)
 _TASKS += AGIEVAL_TASKS
 
diff --git a/examples/nanotron/lighteval_config_override_template.yaml b/examples/nanotron/lighteval_config_override_template.yaml
index 50886ced0..433498c34 100644
--- a/examples/nanotron/lighteval_config_override_template.yaml
+++ b/examples/nanotron/lighteval_config_override_template.yaml
@@ -20,4 +20,4 @@ tasks:
   max_samples: 10
   multichoice_continuations_start_space: null
   num_fewshot_seeds: null
-  tasks: lighteval|gsm8k|5|1
+  tasks: lighteval|gsm8k|5
diff --git a/examples/tasks/OALL_v1_tasks.txt b/examples/tasks/OALL_v1_tasks.txt
index 08e9a51cd..daecb62a7 100644
--- a/examples/tasks/OALL_v1_tasks.txt
+++ b/examples/tasks/OALL_v1_tasks.txt
@@ -1,136 +1,136 @@
-lighteval|xstory_cloze:ar|0|0
-community|arabic_mmlu_mt:abstract_algebra|0|0
-community|arabic_mmlu_mt:anatomy|0|0
-community|arabic_mmlu_mt:astronomy|0|0
-community|arabic_mmlu_mt:business_ethics|0|0
-community|arabic_mmlu_mt:clinical_knowledge|0|0
-community|arabic_mmlu_mt:college_biology|0|0
-community|arabic_mmlu_mt:college_chemistry|0|0
-community|arabic_mmlu_mt:college_computer_science|0|0
-community|arabic_mmlu_mt:college_mathematics|0|0
-community|arabic_mmlu_mt:college_medicine|0|0
-community|arabic_mmlu_mt:college_physics|0|0
-community|arabic_mmlu_mt:computer_security|0|0
-community|arabic_mmlu_mt:conceptual_physics|0|0
-community|arabic_mmlu_mt:econometrics|0|0
-community|arabic_mmlu_mt:electrical_engineering|0|0
-community|arabic_mmlu_mt:elementary_mathematics|0|0
-community|arabic_mmlu_mt:formal_logic|0|0
-community|arabic_mmlu_mt:global_facts|0|0
-community|arabic_mmlu_mt:high_school_biology|0|0
-community|arabic_mmlu_mt:high_school_chemistry|0|0
-community|arabic_mmlu_mt:high_school_computer_science|0|0
-community|arabic_mmlu_mt:high_school_european_history|0|0
-community|arabic_mmlu_mt:high_school_geography|0|0
-community|arabic_mmlu_mt:high_school_government_and_politics|0|0
-community|arabic_mmlu_mt:high_school_macroeconomics|0|0
-community|arabic_mmlu_mt:high_school_mathematics|0|0
-community|arabic_mmlu_mt:high_school_microeconomics|0|0
-community|arabic_mmlu_mt:high_school_physics|0|0
-community|arabic_mmlu_mt:high_school_psychology|0|0
-community|arabic_mmlu_mt:high_school_statistics|0|0
-community|arabic_mmlu_mt:high_school_us_history|0|0
-community|arabic_mmlu_mt:high_school_world_history|0|0
-community|arabic_mmlu_mt:human_aging|0|0
-community|arabic_mmlu_mt:human_sexuality|0|0
-community|arabic_mmlu_mt:international_law|0|0
-community|arabic_mmlu_mt:jurisprudence|0|0
-community|arabic_mmlu_mt:logical_fallacies|0|0
-community|arabic_mmlu_mt:machine_learning|0|0
-community|arabic_mmlu_mt:management|0|0
-community|arabic_mmlu_mt:marketing|0|0
-community|arabic_mmlu_mt:medical_genetics|0|0
-community|arabic_mmlu_mt:miscellaneous|0|0
-community|arabic_mmlu_mt:moral_disputes|0|0
-community|arabic_mmlu_mt:moral_scenarios|0|0
-community|arabic_mmlu_mt:nutrition|0|0
-community|arabic_mmlu_mt:philosophy|0|0
-community|arabic_mmlu_mt:prehistory|0|0
-community|arabic_mmlu_mt:professional_accounting|0|0
-community|arabic_mmlu_mt:professional_law|0|0
-community|arabic_mmlu_mt:professional_medicine|0|0
-community|arabic_mmlu_mt:professional_psychology|0|0
-community|arabic_mmlu_mt:public_relations|0|0
-community|arabic_mmlu_mt:security_studies|0|0
-community|arabic_mmlu_mt:sociology|0|0
-community|arabic_mmlu_mt:us_foreign_policy|0|0
-community|arabic_mmlu_mt:virology|0|0
-community|arabic_mmlu_mt:world_religions|0|0
-community|arabic_exams|0|0
-community|acva:Algeria|0|0
-community|acva:Ancient_Egypt|0|0
-community|acva:Arab_Empire|0|0
-community|acva:Arabic_Architecture|0|0
-community|acva:Arabic_Art|0|0
-community|acva:Arabic_Astronomy|0|0
-community|acva:Arabic_Calligraphy|0|0
-community|acva:Arabic_Ceremony|0|0
-community|acva:Arabic_Clothing|0|0
-community|acva:Arabic_Culture|0|0
-community|acva:Arabic_Food|0|0
-community|acva:Arabic_Funeral|0|0
-community|acva:Arabic_Geography|0|0
-community|acva:Arabic_History|0|0
-community|acva:Arabic_Language_Origin|0|0
-community|acva:Arabic_Literature|0|0
-community|acva:Arabic_Math|0|0
-community|acva:Arabic_Medicine|0|0
-community|acva:Arabic_Music|0|0
-community|acva:Arabic_Ornament|0|0
-community|acva:Arabic_Philosophy|0|0
-community|acva:Arabic_Physics_and_Chemistry|0|0
-community|acva:Arabic_Wedding|0|0
-community|acva:Bahrain|0|0
-community|acva:Comoros|0|0
-community|acva:Egypt_modern|0|0
-community|acva:InfluenceFromAncientEgypt|0|0
-community|acva:InfluenceFromByzantium|0|0
-community|acva:InfluenceFromChina|0|0
-community|acva:InfluenceFromGreece|0|0
-community|acva:InfluenceFromIslam|0|0
-community|acva:InfluenceFromPersia|0|0
-community|acva:InfluenceFromRome|0|0
-community|acva:Iraq|0|0
-community|acva:Islam_Education|0|0
-community|acva:Islam_branches_and_schools|0|0
-community|acva:Islamic_law_system|0|0
-community|acva:Jordan|0|0
-community|acva:Kuwait|0|0
-community|acva:Lebanon|0|0
-community|acva:Libya|0|0
-community|acva:Mauritania|0|0
-community|acva:Mesopotamia_civilization|0|0
-community|acva:Morocco|0|0
-community|acva:Oman|0|0
-community|acva:Palestine|0|0
-community|acva:Qatar|0|0
-community|acva:Saudi_Arabia|0|0
-community|acva:Somalia|0|0
-community|acva:Sudan|0|0
-community|acva:Syria|0|0
-community|acva:Tunisia|0|0
-community|acva:United_Arab_Emirates|0|0
-community|acva:Yemen|0|0
-community|acva:communication|0|0
-community|acva:computer_and_phone|0|0
-community|acva:daily_life|0|0
-community|acva:entertainment|0|0
-community|alghafa:mcq_exams_test_ar|0|0
-community|alghafa:meta_ar_dialects|0|0
-community|alghafa:meta_ar_msa|0|0
-community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0
-community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0
-community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0
-community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0
-community|alghafa:multiple_choice_rating_sentiment_task|0|0
-community|alghafa:multiple_choice_sentiment_task|0|0
-community|race_ar|0|0
-community|piqa_ar|0|0
-community|arc_easy_ar|0|0
-community|arc_challenge_okapi_ar|0|0
-community|openbook_qa_ext_ar|0|0
-community|boolq_ar|0|0
-community|copa_ext_ar|0|0
-community|hellaswag_okapi_ar|0|0
-community|toxigen_ar|0|0
-community|sciq_ar|0|0
+lighteval|xstory_cloze:ar|0
+community|arabic_mmlu_mt:abstract_algebra|0
+community|arabic_mmlu_mt:anatomy|0
+community|arabic_mmlu_mt:astronomy|0
+community|arabic_mmlu_mt:business_ethics|0
+community|arabic_mmlu_mt:clinical_knowledge|0
+community|arabic_mmlu_mt:college_biology|0
+community|arabic_mmlu_mt:college_chemistry|0
+community|arabic_mmlu_mt:college_computer_science|0
+community|arabic_mmlu_mt:college_mathematics|0
+community|arabic_mmlu_mt:college_medicine|0
+community|arabic_mmlu_mt:college_physics|0
+community|arabic_mmlu_mt:computer_security|0
+community|arabic_mmlu_mt:conceptual_physics|0
+community|arabic_mmlu_mt:econometrics|0
+community|arabic_mmlu_mt:electrical_engineering|0
+community|arabic_mmlu_mt:elementary_mathematics|0
+community|arabic_mmlu_mt:formal_logic|0
+community|arabic_mmlu_mt:global_facts|0
+community|arabic_mmlu_mt:high_school_biology|0
+community|arabic_mmlu_mt:high_school_chemistry|0
+community|arabic_mmlu_mt:high_school_computer_science|0
+community|arabic_mmlu_mt:high_school_european_history|0
+community|arabic_mmlu_mt:high_school_geography|0
+community|arabic_mmlu_mt:high_school_government_and_politics|0
+community|arabic_mmlu_mt:high_school_macroeconomics|0
+community|arabic_mmlu_mt:high_school_mathematics|0
+community|arabic_mmlu_mt:high_school_microeconomics|0
+community|arabic_mmlu_mt:high_school_physics|0
+community|arabic_mmlu_mt:high_school_psychology|0
+community|arabic_mmlu_mt:high_school_statistics|0
+community|arabic_mmlu_mt:high_school_us_history|0
+community|arabic_mmlu_mt:high_school_world_history|0
+community|arabic_mmlu_mt:human_aging|0
+community|arabic_mmlu_mt:human_sexuality|0
+community|arabic_mmlu_mt:international_law|0
+community|arabic_mmlu_mt:jurisprudence|0
+community|arabic_mmlu_mt:logical_fallacies|0
+community|arabic_mmlu_mt:machine_learning|0
+community|arabic_mmlu_mt:management|0
+community|arabic_mmlu_mt:marketing|0
+community|arabic_mmlu_mt:medical_genetics|0
+community|arabic_mmlu_mt:miscellaneous|0
+community|arabic_mmlu_mt:moral_disputes|0
+community|arabic_mmlu_mt:moral_scenarios|0
+community|arabic_mmlu_mt:nutrition|0
+community|arabic_mmlu_mt:philosophy|0
+community|arabic_mmlu_mt:prehistory|0
+community|arabic_mmlu_mt:professional_accounting|0
+community|arabic_mmlu_mt:professional_law|0
+community|arabic_mmlu_mt:professional_medicine|0
+community|arabic_mmlu_mt:professional_psychology|0
+community|arabic_mmlu_mt:public_relations|0
+community|arabic_mmlu_mt:security_studies|0
+community|arabic_mmlu_mt:sociology|0
+community|arabic_mmlu_mt:us_foreign_policy|0
+community|arabic_mmlu_mt:virology|0
+community|arabic_mmlu_mt:world_religions|0
+community|arabic_exams|0
+community|acva:Algeria|0
+community|acva:Ancient_Egypt|0
+community|acva:Arab_Empire|0
+community|acva:Arabic_Architecture|0
+community|acva:Arabic_Art|0
+community|acva:Arabic_Astronomy|0
+community|acva:Arabic_Calligraphy|0
+community|acva:Arabic_Ceremony|0
+community|acva:Arabic_Clothing|0
+community|acva:Arabic_Culture|0
+community|acva:Arabic_Food|0
+community|acva:Arabic_Funeral|0
+community|acva:Arabic_Geography|0
+community|acva:Arabic_History|0
+community|acva:Arabic_Language_Origin|0
+community|acva:Arabic_Literature|0
+community|acva:Arabic_Math|0
+community|acva:Arabic_Medicine|0
+community|acva:Arabic_Music|0
+community|acva:Arabic_Ornament|0
+community|acva:Arabic_Philosophy|0
+community|acva:Arabic_Physics_and_Chemistry|0
+community|acva:Arabic_Wedding|0
+community|acva:Bahrain|0
+community|acva:Comoros|0
+community|acva:Egypt_modern|0
+community|acva:InfluenceFromAncientEgypt|0
+community|acva:InfluenceFromByzantium|0
+community|acva:InfluenceFromChina|0
+community|acva:InfluenceFromGreece|0
+community|acva:InfluenceFromIslam|0
+community|acva:InfluenceFromPersia|0
+community|acva:InfluenceFromRome|0
+community|acva:Iraq|0
+community|acva:Islam_Education|0
+community|acva:Islam_branches_and_schools|0
+community|acva:Islamic_law_system|0
+community|acva:Jordan|0
+community|acva:Kuwait|0
+community|acva:Lebanon|0
+community|acva:Libya|0
+community|acva:Mauritania|0
+community|acva:Mesopotamia_civilization|0
+community|acva:Morocco|0
+community|acva:Oman|0
+community|acva:Palestine|0
+community|acva:Qatar|0
+community|acva:Saudi_Arabia|0
+community|acva:Somalia|0
+community|acva:Sudan|0
+community|acva:Syria|0
+community|acva:Tunisia|0
+community|acva:United_Arab_Emirates|0
+community|acva:Yemen|0
+community|acva:communication|0
+community|acva:computer_and_phone|0
+community|acva:daily_life|0
+community|acva:entertainment|0
+community|alghafa:mcq_exams_test_ar|0
+community|alghafa:meta_ar_dialects|0
+community|alghafa:meta_ar_msa|0
+community|alghafa:multiple_choice_facts_truefalse_balanced_task|0
+community|alghafa:multiple_choice_grounded_statement_soqal_task|0
+community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0
+community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0
+community|alghafa:multiple_choice_rating_sentiment_task|0
+community|alghafa:multiple_choice_sentiment_task|0
+community|race_ar|0
+community|piqa_ar|0
+community|arc_easy_ar|0
+community|arc_challenge_okapi_ar|0
+community|openbook_qa_ext_ar|0
+community|boolq_ar|0
+community|copa_ext_ar|0
+community|hellaswag_okapi_ar|0
+community|toxigen_ar|0
+community|sciq_ar|0
diff --git a/examples/tasks/OALL_v2_tasks.txt b/examples/tasks/OALL_v2_tasks.txt
index 26dc78646..890c551c4 100644
--- a/examples/tasks/OALL_v2_tasks.txt
+++ b/examples/tasks/OALL_v2_tasks.txt
@@ -1,117 +1,117 @@
-community|alghafa:meta_ar_dialects|0|0
-community|alghafa:meta_ar_msa|0|0
-community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0
-community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0
-community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0
-community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0
-community|alghafa:multiple_choice_rating_sentiment_task|0|0
-community|alghafa:multiple_choice_sentiment_task|0|0
-community|arabic_exams|0|0
-community|arabic_mmlu:Islamic Studies|0|0
-community|arabic_mmlu:Islamic Studies (Middle School)|0|0
-community|arabic_mmlu:Islamic Studies (Primary School)|0|0
-community|arabic_mmlu:Islamic Studies (High School)|0|0
-community|arabic_mmlu:Driving Test|0|0
-community|arabic_mmlu:Natural Science (Middle School)|0|0
-community|arabic_mmlu:Natural Science (Primary School)|0|0
-community|arabic_mmlu:History (Middle School)|0|0
-community|arabic_mmlu:History (Primary School)|0|0
-community|arabic_mmlu:History (High School)|0|0
-community|arabic_mmlu:General Knowledge|0|0
-community|arabic_mmlu:General Knowledge (Middle School)|0|0
-community|arabic_mmlu:General Knowledge (Primary School)|0|0
-community|arabic_mmlu:Law (Professional)|0|0
-community|arabic_mmlu:Physics (High School)|0|0
-community|arabic_mmlu:Social Science (Middle School)|0|0
-community|arabic_mmlu:Social Science (Primary School)|0|0
-community|arabic_mmlu:Management (University)|0|0
-community|arabic_mmlu:Arabic Language (Middle School)|0|0
-community|arabic_mmlu:Arabic Language (Primary School)|0|0
-community|arabic_mmlu:Arabic Language (High School)|0|0
-community|arabic_mmlu:Political Science (University)|0|0
-community|arabic_mmlu:Philosophy (High School)|0|0
-community|arabic_mmlu:Accounting (University)|0|0
-community|arabic_mmlu:Computer Science (Middle School)|0|0
-community|arabic_mmlu:Computer Science (Primary School)|0|0
-community|arabic_mmlu:Computer Science (High School)|0|0
-community|arabic_mmlu:Computer Science (University)|0|0
-community|arabic_mmlu:Geography (Middle School)|0|0
-community|arabic_mmlu:Geography (Primary School)|0|0
-community|arabic_mmlu:Geography (High School)|0|0
-community|arabic_mmlu:Math (Primary School)|0|0
-community|arabic_mmlu:Biology (High School)|0|0
-community|arabic_mmlu:Economics (Middle School)|0|0
-community|arabic_mmlu:Economics (High School)|0|0
-community|arabic_mmlu:Economics (University)|0|0
-community|arabic_mmlu:Arabic Language (General)|0|0
-community|arabic_mmlu:Arabic Language (Grammar)|0|0
-community|arabic_mmlu:Civics (Middle School)|0|0
-community|arabic_mmlu:Civics (High School)|0|0
-community|madinah_qa:Arabic Language (General)|0|0
-community|madinah_qa:Arabic Language (Grammar)|0|0
-community|aratrust:Trustfulness|0|0 -community|aratrust:MentalHealth|0|0 -community|aratrust:PhysicalHealth|0|0 -community|aratrust:Offensive|0|0 -community|aratrust:Ethics|0|0 -community|aratrust:Privacy|0|0 -community|aratrust:Unfairness|0|0 -community|aratrust:Illegal|0|0 -community|arabic_mmlu_ht:abstract_algebra|0|0 -community|arabic_mmlu_ht:anatomy|0|0 -community|arabic_mmlu_ht:astronomy|0|0 -community|arabic_mmlu_ht:business_ethics|0|0 -community|arabic_mmlu_ht:clinical_knowledge|0|0 -community|arabic_mmlu_ht:college_biology|0|0 -community|arabic_mmlu_ht:college_chemistry|0|0 -community|arabic_mmlu_ht:college_computer_science|0|0 -community|arabic_mmlu_ht:college_mathematics|0|0 -community|arabic_mmlu_ht:college_medicine|0|0 -community|arabic_mmlu_ht:college_physics|0|0 -community|arabic_mmlu_ht:computer_security|0|0 -community|arabic_mmlu_ht:conceptual_physics|0|0 -community|arabic_mmlu_ht:econometrics|0|0 -community|arabic_mmlu_ht:electrical_engineering|0|0 -community|arabic_mmlu_ht:elementary_mathematics|0|0 -community|arabic_mmlu_ht:formal_logic|0|0 -community|arabic_mmlu_ht:global_facts|0|0 -community|arabic_mmlu_ht:high_school_biology|0|0 -community|arabic_mmlu_ht:high_school_chemistry|0|0 -community|arabic_mmlu_ht:high_school_computer_science|0|0 -community|arabic_mmlu_ht:high_school_european_history|0|0 -community|arabic_mmlu_ht:high_school_geography|0|0 -community|arabic_mmlu_ht:high_school_government_and_politics|0|0 -community|arabic_mmlu_ht:high_school_macroeconomics|0|0 -community|arabic_mmlu_ht:high_school_mathematics|0|0 -community|arabic_mmlu_ht:high_school_microeconomics|0|0 -community|arabic_mmlu_ht:high_school_physics|0|0 -community|arabic_mmlu_ht:high_school_psychology|0|0 -community|arabic_mmlu_ht:high_school_statistics|0|0 -community|arabic_mmlu_ht:high_school_us_history|0|0 -community|arabic_mmlu_ht:high_school_world_history|0|0 -community|arabic_mmlu_ht:human_aging|0|0 -community|arabic_mmlu_ht:human_sexuality|0|0 -community|arabic_mmlu_ht:international_law|0|0 -community|arabic_mmlu_ht:jurisprudence|0|0 -community|arabic_mmlu_ht:logical_fallacies|0|0 -community|arabic_mmlu_ht:machine_learning|0|0 -community|arabic_mmlu_ht:management|0|0 -community|arabic_mmlu_ht:marketing|0|0 -community|arabic_mmlu_ht:medical_genetics|0|0 -community|arabic_mmlu_ht:miscellaneous|0|0 -community|arabic_mmlu_ht:moral_disputes|0|0 -community|arabic_mmlu_ht:moral_scenarios|0|0 -community|arabic_mmlu_ht:nutrition|0|0 -community|arabic_mmlu_ht:philosophy|0|0 -community|arabic_mmlu_ht:prehistory|0|0 -community|arabic_mmlu_ht:professional_accounting|0|0 -community|arabic_mmlu_ht:professional_law|0|0 -community|arabic_mmlu_ht:professional_medicine|0|0 -community|arabic_mmlu_ht:professional_psychology|0|0 -community|arabic_mmlu_ht:public_relations|0|0 -community|arabic_mmlu_ht:security_studies|0|0 -community|arabic_mmlu_ht:sociology|0|0 -community|arabic_mmlu_ht:us_foreign_policy|0|0 -community|arabic_mmlu_ht:virology|0|0 -community|arabic_mmlu_ht:world_religions|0|0 -community|alrage_qa|0|0 +community|alghafa:meta_ar_dialects|0 +community|alghafa:meta_ar_msa|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0 +community|alghafa:multiple_choice_rating_sentiment_task|0 +community|alghafa:multiple_choice_sentiment_task|0 +community|arabic_exams|0 
+community|arabic_mmlu:Islamic Studies|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0 +community|arabic_mmlu:Islamic Studies (High School)|0 +community|arabic_mmlu:Driving Test|0 +community|arabic_mmlu:Natural Science (Middle School)|0 +community|arabic_mmlu:Natural Science (Primary School)|0 +community|arabic_mmlu:History (Middle School)|0 +community|arabic_mmlu:History (Primary School)|0 +community|arabic_mmlu:History (High School)|0 +community|arabic_mmlu:General Knowledge|0 +community|arabic_mmlu:General Knowledge (Middle School)|0 +community|arabic_mmlu:General Knowledge (Primary School)|0 +community|arabic_mmlu:Law (Professional)|0 +community|arabic_mmlu:Physics (High School)|0 +community|arabic_mmlu:Social Science (Middle School)|0 +community|arabic_mmlu:Social Science (Primary School)|0 +community|arabic_mmlu:Management (University)|0 +community|arabic_mmlu:Arabic Language (Middle School)|0 +community|arabic_mmlu:Arabic Language (Primary School)|0 +community|arabic_mmlu:Arabic Language (High School)|0 +community|arabic_mmlu:Political Science (University)|0 +community|arabic_mmlu:Philosophy (High School)|0 +community|arabic_mmlu:Accounting (University)|0 +community|arabic_mmlu:Computer Science (Middle School)|0 +community|arabic_mmlu:Computer Science (Primary School)|0 +community|arabic_mmlu:Computer Science (High School)|0 +community|arabic_mmlu:Computer Science (University)|0 +community|arabic_mmlu:Geography (Middle School)|0 +community|arabic_mmlu:Geography (Primary School)|0 +community|arabic_mmlu:Geography (High School)|0 +community|arabic_mmlu:Math (Primary School)|0 +community|arabic_mmlu:Biology (High School)|0 +community|arabic_mmlu:Economics (Middle School)|0 +community|arabic_mmlu:Economics (High School)|0 +community|arabic_mmlu:Economics (University)|0 +community|arabic_mmlu:Arabic Language (General)|0 +community|arabic_mmlu:Arabic Language (Grammar)|0 +community|arabic_mmlu:Civics (Middle School)|0 +community|arabic_mmlu:Civics (High School)|0 +community|madinah_qa:Arabic Language (General)|0 +community|madinah_qa:Arabic Language (Grammar)|0 +community|aratrust:Trustfulness|0 +community|aratrust:MentalHealth|0 +community|aratrust:PhysicalHealth|0 +community|aratrust:Offensive|0 +community|aratrust:Ethics|0 +community|aratrust:Privacy|0 +community|aratrust:Unfairness|0 +community|aratrust:Illegal|0 +community|arabic_mmlu_ht:abstract_algebra|0 +community|arabic_mmlu_ht:anatomy|0 +community|arabic_mmlu_ht:astronomy|0 +community|arabic_mmlu_ht:business_ethics|0 +community|arabic_mmlu_ht:clinical_knowledge|0 +community|arabic_mmlu_ht:college_biology|0 +community|arabic_mmlu_ht:college_chemistry|0 +community|arabic_mmlu_ht:college_computer_science|0 +community|arabic_mmlu_ht:college_mathematics|0 +community|arabic_mmlu_ht:college_medicine|0 +community|arabic_mmlu_ht:college_physics|0 +community|arabic_mmlu_ht:computer_security|0 +community|arabic_mmlu_ht:conceptual_physics|0 +community|arabic_mmlu_ht:econometrics|0 +community|arabic_mmlu_ht:electrical_engineering|0 +community|arabic_mmlu_ht:elementary_mathematics|0 +community|arabic_mmlu_ht:formal_logic|0 +community|arabic_mmlu_ht:global_facts|0 +community|arabic_mmlu_ht:high_school_biology|0 +community|arabic_mmlu_ht:high_school_chemistry|0 +community|arabic_mmlu_ht:high_school_computer_science|0 +community|arabic_mmlu_ht:high_school_european_history|0 +community|arabic_mmlu_ht:high_school_geography|0 
+community|arabic_mmlu_ht:high_school_government_and_politics|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0 +community|arabic_mmlu_ht:high_school_mathematics|0 +community|arabic_mmlu_ht:high_school_microeconomics|0 +community|arabic_mmlu_ht:high_school_physics|0 +community|arabic_mmlu_ht:high_school_psychology|0 +community|arabic_mmlu_ht:high_school_statistics|0 +community|arabic_mmlu_ht:high_school_us_history|0 +community|arabic_mmlu_ht:high_school_world_history|0 +community|arabic_mmlu_ht:human_aging|0 +community|arabic_mmlu_ht:human_sexuality|0 +community|arabic_mmlu_ht:international_law|0 +community|arabic_mmlu_ht:jurisprudence|0 +community|arabic_mmlu_ht:logical_fallacies|0 +community|arabic_mmlu_ht:machine_learning|0 +community|arabic_mmlu_ht:management|0 +community|arabic_mmlu_ht:marketing|0 +community|arabic_mmlu_ht:medical_genetics|0 +community|arabic_mmlu_ht:miscellaneous|0 +community|arabic_mmlu_ht:moral_disputes|0 +community|arabic_mmlu_ht:moral_scenarios|0 +community|arabic_mmlu_ht:nutrition|0 +community|arabic_mmlu_ht:philosophy|0 +community|arabic_mmlu_ht:prehistory|0 +community|arabic_mmlu_ht:professional_accounting|0 +community|arabic_mmlu_ht:professional_law|0 +community|arabic_mmlu_ht:professional_medicine|0 +community|arabic_mmlu_ht:professional_psychology|0 +community|arabic_mmlu_ht:public_relations|0 +community|arabic_mmlu_ht:security_studies|0 +community|arabic_mmlu_ht:sociology|0 +community|arabic_mmlu_ht:us_foreign_policy|0 +community|arabic_mmlu_ht:virology|0 +community|arabic_mmlu_ht:world_religions|0 +community|alrage_qa|0 diff --git a/examples/tasks/all_arabic_tasks.txt b/examples/tasks/all_arabic_tasks.txt index 8593fa2f8..f88738993 100644 --- a/examples/tasks/all_arabic_tasks.txt +++ b/examples/tasks/all_arabic_tasks.txt @@ -1,244 +1,244 @@ -lighteval|xstory_cloze:ar|0|0 -community|arabic_exams|0|0 -community|arabic_mmlu_mt:abstract_algebra|0|0 -community|arabic_mmlu_mt:anatomy|0|0 -community|arabic_mmlu_mt:astronomy|0|0 -community|arabic_mmlu_mt:business_ethics|0|0 -community|arabic_mmlu_mt:clinical_knowledge|0|0 -community|arabic_mmlu_mt:college_biology|0|0 -community|arabic_mmlu_mt:college_chemistry|0|0 -community|arabic_mmlu_mt:college_computer_science|0|0 -community|arabic_mmlu_mt:college_mathematics|0|0 -community|arabic_mmlu_mt:college_medicine|0|0 -community|arabic_mmlu_mt:college_physics|0|0 -community|arabic_mmlu_mt:computer_security|0|0 -community|arabic_mmlu_mt:conceptual_physics|0|0 -community|arabic_mmlu_mt:econometrics|0|0 -community|arabic_mmlu_mt:electrical_engineering|0|0 -community|arabic_mmlu_mt:elementary_mathematics|0|0 -community|arabic_mmlu_mt:formal_logic|0|0 -community|arabic_mmlu_mt:global_facts|0|0 -community|arabic_mmlu_mt:high_school_biology|0|0 -community|arabic_mmlu_mt:high_school_chemistry|0|0 -community|arabic_mmlu_mt:high_school_computer_science|0|0 -community|arabic_mmlu_mt:high_school_european_history|0|0 -community|arabic_mmlu_mt:high_school_geography|0|0 -community|arabic_mmlu_mt:high_school_government_and_politics|0|0 -community|arabic_mmlu_mt:high_school_macroeconomics|0|0 -community|arabic_mmlu_mt:high_school_mathematics|0|0 -community|arabic_mmlu_mt:high_school_microeconomics|0|0 -community|arabic_mmlu_mt:high_school_physics|0|0 -community|arabic_mmlu_mt:high_school_psychology|0|0 -community|arabic_mmlu_mt:high_school_statistics|0|0 -community|arabic_mmlu_mt:high_school_us_history|0|0 -community|arabic_mmlu_mt:high_school_world_history|0|0 -community|arabic_mmlu_mt:human_aging|0|0 
-community|arabic_mmlu_mt:human_sexuality|0|0 -community|arabic_mmlu_mt:international_law|0|0 -community|arabic_mmlu_mt:jurisprudence|0|0 -community|arabic_mmlu_mt:logical_fallacies|0|0 -community|arabic_mmlu_mt:machine_learning|0|0 -community|arabic_mmlu_mt:management|0|0 -community|arabic_mmlu_mt:marketing|0|0 -community|arabic_mmlu_mt:medical_genetics|0|0 -community|arabic_mmlu_mt:miscellaneous|0|0 -community|arabic_mmlu_mt:moral_disputes|0|0 -community|arabic_mmlu_mt:moral_scenarios|0|0 -community|arabic_mmlu_mt:nutrition|0|0 -community|arabic_mmlu_mt:philosophy|0|0 -community|arabic_mmlu_mt:prehistory|0|0 -community|arabic_mmlu_mt:professional_accounting|0|0 -community|arabic_mmlu_mt:professional_law|0|0 -community|arabic_mmlu_mt:professional_medicine|0|0 -community|arabic_mmlu_mt:professional_psychology|0|0 -community|arabic_mmlu_mt:public_relations|0|0 -community|arabic_mmlu_mt:security_studies|0|0 -community|arabic_mmlu_mt:sociology|0|0 -community|arabic_mmlu_mt:us_foreign_policy|0|0 -community|arabic_mmlu_mt:virology|0|0 -community|arabic_mmlu_mt:world_religions|0|0 -community|acva:Algeria|0|0 -community|acva:Ancient_Egypt|0|0 -community|acva:Arab_Empire|0|0 -community|acva:Arabic_Architecture|0|0 -community|acva:Arabic_Art|0|0 -community|acva:Arabic_Astronomy|0|0 -community|acva:Arabic_Calligraphy|0|0 -community|acva:Arabic_Ceremony|0|0 -community|acva:Arabic_Clothing|0|0 -community|acva:Arabic_Culture|0|0 -community|acva:Arabic_Food|0|0 -community|acva:Arabic_Funeral|0|0 -community|acva:Arabic_Geography|0|0 -community|acva:Arabic_History|0|0 -community|acva:Arabic_Language_Origin|0|0 -community|acva:Arabic_Literature|0|0 -community|acva:Arabic_Math|0|0 -community|acva:Arabic_Medicine|0|0 -community|acva:Arabic_Music|0|0 -community|acva:Arabic_Ornament|0|0 -community|acva:Arabic_Philosophy|0|0 -community|acva:Arabic_Physics_and_Chemistry|0|0 -community|acva:Arabic_Wedding|0|0 -community|acva:Bahrain|0|0 -community|acva:Comoros|0|0 -community|acva:Egypt_modern|0|0 -community|acva:InfluenceFromAncientEgypt|0|0 -community|acva:InfluenceFromByzantium|0|0 -community|acva:InfluenceFromChina|0|0 -community|acva:InfluenceFromGreece|0|0 -community|acva:InfluenceFromIslam|0|0 -community|acva:InfluenceFromPersia|0|0 -community|acva:InfluenceFromRome|0|0 -community|acva:Iraq|0|0 -community|acva:Islam_Education|0|0 -community|acva:Islam_branches_and_schools|0|0 -community|acva:Islamic_law_system|0|0 -community|acva:Jordan|0|0 -community|acva:Kuwait|0|0 -community|acva:Lebanon|0|0 -community|acva:Libya|0|0 -community|acva:Mauritania|0|0 -community|acva:Mesopotamia_civilization|0|0 -community|acva:Morocco|0|0 -community|acva:Oman|0|0 -community|acva:Palestine|0|0 -community|acva:Qatar|0|0 -community|acva:Saudi_Arabia|0|0 -community|acva:Somalia|0|0 -community|acva:Sudan|0|0 -community|acva:Syria|0|0 -community|acva:Tunisia|0|0 -community|acva:United_Arab_Emirates|0|0 -community|acva:Yemen|0|0 -community|acva:communication|0|0 -community|acva:computer_and_phone|0|0 -community|acva:daily_life|0|0 -community|acva:entertainment|0|0 -community|alghafa:mcq_exams_test_ar|0|0 -community|alghafa:meta_ar_dialects|0|0 -community|alghafa:meta_ar_msa|0|0 -community|alghafa:multiple_choice_facts_truefalse_balanced_task|0|0 -community|alghafa:multiple_choice_grounded_statement_soqal_task|0|0 -community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0|0 -community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0|0 -community|alghafa:multiple_choice_rating_sentiment_task|0|0 
-community|alghafa:multiple_choice_sentiment_task|0|0 -community|race_ar|0|0 -community|piqa_ar|0|0 -community|arc_easy_ar|0|0 -community|arc_challenge_okapi_ar|0|0 -community|mmlu_okapi_ar|0|0 -community|openbook_qa_ext_ar|0|0 -community|boolq_ar|0|0 -community|copa_ext_ar|0|0 -community|hellaswag_okapi_ar|0|0 -community|toxigen_ar|0|0 -community|sciq_ar|0|0 -community|arabic_mmlu_ht:abstract_algebra|0|0 -community|arabic_mmlu_ht:anatomy|0|0 -community|arabic_mmlu_ht:astronomy|0|0 -community|arabic_mmlu_ht:business_ethics|0|0 -community|arabic_mmlu_ht:clinical_knowledge|0|0 -community|arabic_mmlu_ht:college_biology|0|0 -community|arabic_mmlu_ht:college_chemistry|0|0 -community|arabic_mmlu_ht:college_computer_science|0|0 -community|arabic_mmlu_ht:college_mathematics|0|0 -community|arabic_mmlu_ht:college_medicine|0|0 -community|arabic_mmlu_ht:college_physics|0|0 -community|arabic_mmlu_ht:computer_security|0|0 -community|arabic_mmlu_ht:conceptual_physics|0|0 -community|arabic_mmlu_ht:econometrics|0|0 -community|arabic_mmlu_ht:electrical_engineering|0|0 -community|arabic_mmlu_ht:elementary_mathematics|0|0 -community|arabic_mmlu_ht:formal_logic|0|0 -community|arabic_mmlu_ht:global_facts|0|0 -community|arabic_mmlu_ht:high_school_biology|0|0 -community|arabic_mmlu_ht:high_school_chemistry|0|0 -community|arabic_mmlu_ht:high_school_computer_science|0|0 -community|arabic_mmlu_ht:high_school_european_history|0|0 -community|arabic_mmlu_ht:high_school_geography|0|0 -community|arabic_mmlu_ht:high_school_government_and_politics|0|0 -community|arabic_mmlu_ht:high_school_macroeconomics|0|0 -community|arabic_mmlu_ht:high_school_mathematics|0|0 -community|arabic_mmlu_ht:high_school_microeconomics|0|0 -community|arabic_mmlu_ht:high_school_physics|0|0 -community|arabic_mmlu_ht:high_school_psychology|0|0 -community|arabic_mmlu_ht:high_school_statistics|0|0 -community|arabic_mmlu_ht:high_school_us_history|0|0 -community|arabic_mmlu_ht:high_school_world_history|0|0 -community|arabic_mmlu_ht:human_aging|0|0 -community|arabic_mmlu_ht:human_sexuality|0|0 -community|arabic_mmlu_ht:international_law|0|0 -community|arabic_mmlu_ht:jurisprudence|0|0 -community|arabic_mmlu_ht:logical_fallacies|0|0 -community|arabic_mmlu_ht:machine_learning|0|0 -community|arabic_mmlu_ht:management|0|0 -community|arabic_mmlu_ht:marketing|0|0 -community|arabic_mmlu_ht:medical_genetics|0|0 -community|arabic_mmlu_ht:miscellaneous|0|0 -community|arabic_mmlu_ht:moral_disputes|0|0 -community|arabic_mmlu_ht:moral_scenarios|0|0 -community|arabic_mmlu_ht:nutrition|0|0 -community|arabic_mmlu_ht:philosophy|0|0 -community|arabic_mmlu_ht:prehistory|0|0 -community|arabic_mmlu_ht:professional_accounting|0|0 -community|arabic_mmlu_ht:professional_law|0|0 -community|arabic_mmlu_ht:professional_medicine|0|0 -community|arabic_mmlu_ht:professional_psychology|0|0 -community|arabic_mmlu_ht:public_relations|0|0 -community|arabic_mmlu_ht:security_studies|0|0 -community|arabic_mmlu_ht:sociology|0|0 -community|arabic_mmlu_ht:us_foreign_policy|0|0 -community|arabic_mmlu_ht:virology|0|0 -community|arabic_mmlu_ht:world_religions|0|0 -community|arabic_mmlu:Islamic Studies|0|0 -community|arabic_mmlu:Islamic Studies (Middle School)|0|0 -community|arabic_mmlu:Islamic Studies (Primary School)|0|0 -community|arabic_mmlu:Islamic Studies (High School)|0|0 -community|arabic_mmlu:Driving Test|0|0 -community|arabic_mmlu:Natural Science (Middle School)|0|0 -community|arabic_mmlu:Natural Science (Primary School)|0|0 -community|arabic_mmlu:History (Middle School)|0|0 
-community|arabic_mmlu:History (Primary School)|0|0 -community|arabic_mmlu:History (High School)|0|0 -community|arabic_mmlu:General Knowledge|0|0 -community|arabic_mmlu:General Knowledge (Middle School)|0|0 -community|arabic_mmlu:General Knowledge (Primary School)|0|0 -community|arabic_mmlu:Law (Professional)|0|0 -community|arabic_mmlu:Physics (High School)|0|0 -community|arabic_mmlu:Social Science (Middle School)|0|0 -community|arabic_mmlu:Social Science (Primary School)|0|0 -community|arabic_mmlu:Management (University)|0|0 -community|arabic_mmlu:Arabic Language (Middle School)|0|0 -community|arabic_mmlu:Arabic Language (Primary School)|0|0 -community|arabic_mmlu:Arabic Language (High School)|0|0 -community|arabic_mmlu:Political Science (University)|0|0 -community|arabic_mmlu:Philosophy (High School)|0|0 -community|arabic_mmlu:Accounting (University)|0|0 -community|arabic_mmlu:Computer Science (Middle School)|0|0 -community|arabic_mmlu:Computer Science (Primary School)|0|0 -community|arabic_mmlu:Computer Science (High School)|0|0 -community|arabic_mmlu:Computer Science (University)|0|0 -community|arabic_mmlu:Geography (Middle School)|0|0 -community|arabic_mmlu:Geography (Primary School)|0|0 -community|arabic_mmlu:Geography (High School)|0|0 -community|arabic_mmlu:Math (Primary School)|0|0 -community|arabic_mmlu:Biology (High School)|0|0 -community|arabic_mmlu:Economics (Middle School)|0|0 -community|arabic_mmlu:Economics (High School)|0|0 -community|arabic_mmlu:Economics (University)|0|0 -community|arabic_mmlu:Arabic Language (General)|0|0 -community|arabic_mmlu:Arabic Language (Grammar)|0|0 -community|arabic_mmlu:Civics (Middle School)|0|0 -community|arabic_mmlu:Civics (High School)|0|0 -community|madinah_qa:Arabic Language (General)|0|0 -community|madinah_qa:Arabic Language (Grammar)|0|0 -community|aratrust:Trustfulness|0|0 -community|aratrust:MentalHealth|0|0 -community|aratrust:PhysicalHealth|0|0 -community|aratrust:Offensive|0|0 -community|aratrust:Ethics|0|0 -community|aratrust:Privacy|0|0 -community|aratrust:Unfairness|0|0 -community|aratrust:Illegal|0|0 +lighteval|xstory_cloze:ar|0 +community|arabic_exams|0 +community|arabic_mmlu_mt:abstract_algebra|0 +community|arabic_mmlu_mt:anatomy|0 +community|arabic_mmlu_mt:astronomy|0 +community|arabic_mmlu_mt:business_ethics|0 +community|arabic_mmlu_mt:clinical_knowledge|0 +community|arabic_mmlu_mt:college_biology|0 +community|arabic_mmlu_mt:college_chemistry|0 +community|arabic_mmlu_mt:college_computer_science|0 +community|arabic_mmlu_mt:college_mathematics|0 +community|arabic_mmlu_mt:college_medicine|0 +community|arabic_mmlu_mt:college_physics|0 +community|arabic_mmlu_mt:computer_security|0 +community|arabic_mmlu_mt:conceptual_physics|0 +community|arabic_mmlu_mt:econometrics|0 +community|arabic_mmlu_mt:electrical_engineering|0 +community|arabic_mmlu_mt:elementary_mathematics|0 +community|arabic_mmlu_mt:formal_logic|0 +community|arabic_mmlu_mt:global_facts|0 +community|arabic_mmlu_mt:high_school_biology|0 +community|arabic_mmlu_mt:high_school_chemistry|0 +community|arabic_mmlu_mt:high_school_computer_science|0 +community|arabic_mmlu_mt:high_school_european_history|0 +community|arabic_mmlu_mt:high_school_geography|0 +community|arabic_mmlu_mt:high_school_government_and_politics|0 +community|arabic_mmlu_mt:high_school_macroeconomics|0 +community|arabic_mmlu_mt:high_school_mathematics|0 +community|arabic_mmlu_mt:high_school_microeconomics|0 +community|arabic_mmlu_mt:high_school_physics|0 +community|arabic_mmlu_mt:high_school_psychology|0 
+community|arabic_mmlu_mt:high_school_statistics|0 +community|arabic_mmlu_mt:high_school_us_history|0 +community|arabic_mmlu_mt:high_school_world_history|0 +community|arabic_mmlu_mt:human_aging|0 +community|arabic_mmlu_mt:human_sexuality|0 +community|arabic_mmlu_mt:international_law|0 +community|arabic_mmlu_mt:jurisprudence|0 +community|arabic_mmlu_mt:logical_fallacies|0 +community|arabic_mmlu_mt:machine_learning|0 +community|arabic_mmlu_mt:management|0 +community|arabic_mmlu_mt:marketing|0 +community|arabic_mmlu_mt:medical_genetics|0 +community|arabic_mmlu_mt:miscellaneous|0 +community|arabic_mmlu_mt:moral_disputes|0 +community|arabic_mmlu_mt:moral_scenarios|0 +community|arabic_mmlu_mt:nutrition|0 +community|arabic_mmlu_mt:philosophy|0 +community|arabic_mmlu_mt:prehistory|0 +community|arabic_mmlu_mt:professional_accounting|0 +community|arabic_mmlu_mt:professional_law|0 +community|arabic_mmlu_mt:professional_medicine|0 +community|arabic_mmlu_mt:professional_psychology|0 +community|arabic_mmlu_mt:public_relations|0 +community|arabic_mmlu_mt:security_studies|0 +community|arabic_mmlu_mt:sociology|0 +community|arabic_mmlu_mt:us_foreign_policy|0 +community|arabic_mmlu_mt:virology|0 +community|arabic_mmlu_mt:world_religions|0 +community|acva:Algeria|0 +community|acva:Ancient_Egypt|0 +community|acva:Arab_Empire|0 +community|acva:Arabic_Architecture|0 +community|acva:Arabic_Art|0 +community|acva:Arabic_Astronomy|0 +community|acva:Arabic_Calligraphy|0 +community|acva:Arabic_Ceremony|0 +community|acva:Arabic_Clothing|0 +community|acva:Arabic_Culture|0 +community|acva:Arabic_Food|0 +community|acva:Arabic_Funeral|0 +community|acva:Arabic_Geography|0 +community|acva:Arabic_History|0 +community|acva:Arabic_Language_Origin|0 +community|acva:Arabic_Literature|0 +community|acva:Arabic_Math|0 +community|acva:Arabic_Medicine|0 +community|acva:Arabic_Music|0 +community|acva:Arabic_Ornament|0 +community|acva:Arabic_Philosophy|0 +community|acva:Arabic_Physics_and_Chemistry|0 +community|acva:Arabic_Wedding|0 +community|acva:Bahrain|0 +community|acva:Comoros|0 +community|acva:Egypt_modern|0 +community|acva:InfluenceFromAncientEgypt|0 +community|acva:InfluenceFromByzantium|0 +community|acva:InfluenceFromChina|0 +community|acva:InfluenceFromGreece|0 +community|acva:InfluenceFromIslam|0 +community|acva:InfluenceFromPersia|0 +community|acva:InfluenceFromRome|0 +community|acva:Iraq|0 +community|acva:Islam_Education|0 +community|acva:Islam_branches_and_schools|0 +community|acva:Islamic_law_system|0 +community|acva:Jordan|0 +community|acva:Kuwait|0 +community|acva:Lebanon|0 +community|acva:Libya|0 +community|acva:Mauritania|0 +community|acva:Mesopotamia_civilization|0 +community|acva:Morocco|0 +community|acva:Oman|0 +community|acva:Palestine|0 +community|acva:Qatar|0 +community|acva:Saudi_Arabia|0 +community|acva:Somalia|0 +community|acva:Sudan|0 +community|acva:Syria|0 +community|acva:Tunisia|0 +community|acva:United_Arab_Emirates|0 +community|acva:Yemen|0 +community|acva:communication|0 +community|acva:computer_and_phone|0 +community|acva:daily_life|0 +community|acva:entertainment|0 +community|alghafa:mcq_exams_test_ar|0 +community|alghafa:meta_ar_dialects|0 +community|alghafa:meta_ar_msa|0 +community|alghafa:multiple_choice_facts_truefalse_balanced_task|0 +community|alghafa:multiple_choice_grounded_statement_soqal_task|0 +community|alghafa:multiple_choice_grounded_statement_xglue_mlqa_task|0 +community|alghafa:multiple_choice_rating_sentiment_no_neutral_task|0 
+community|alghafa:multiple_choice_rating_sentiment_task|0 +community|alghafa:multiple_choice_sentiment_task|0 +community|race_ar|0 +community|piqa_ar|0 +community|arc_easy_ar|0 +community|arc_challenge_okapi_ar|0 +community|mmlu_okapi_ar|0 +community|openbook_qa_ext_ar|0 +community|boolq_ar|0 +community|copa_ext_ar|0 +community|hellaswag_okapi_ar|0 +community|toxigen_ar|0 +community|sciq_ar|0 +community|arabic_mmlu_ht:abstract_algebra|0 +community|arabic_mmlu_ht:anatomy|0 +community|arabic_mmlu_ht:astronomy|0 +community|arabic_mmlu_ht:business_ethics|0 +community|arabic_mmlu_ht:clinical_knowledge|0 +community|arabic_mmlu_ht:college_biology|0 +community|arabic_mmlu_ht:college_chemistry|0 +community|arabic_mmlu_ht:college_computer_science|0 +community|arabic_mmlu_ht:college_mathematics|0 +community|arabic_mmlu_ht:college_medicine|0 +community|arabic_mmlu_ht:college_physics|0 +community|arabic_mmlu_ht:computer_security|0 +community|arabic_mmlu_ht:conceptual_physics|0 +community|arabic_mmlu_ht:econometrics|0 +community|arabic_mmlu_ht:electrical_engineering|0 +community|arabic_mmlu_ht:elementary_mathematics|0 +community|arabic_mmlu_ht:formal_logic|0 +community|arabic_mmlu_ht:global_facts|0 +community|arabic_mmlu_ht:high_school_biology|0 +community|arabic_mmlu_ht:high_school_chemistry|0 +community|arabic_mmlu_ht:high_school_computer_science|0 +community|arabic_mmlu_ht:high_school_european_history|0 +community|arabic_mmlu_ht:high_school_geography|0 +community|arabic_mmlu_ht:high_school_government_and_politics|0 +community|arabic_mmlu_ht:high_school_macroeconomics|0 +community|arabic_mmlu_ht:high_school_mathematics|0 +community|arabic_mmlu_ht:high_school_microeconomics|0 +community|arabic_mmlu_ht:high_school_physics|0 +community|arabic_mmlu_ht:high_school_psychology|0 +community|arabic_mmlu_ht:high_school_statistics|0 +community|arabic_mmlu_ht:high_school_us_history|0 +community|arabic_mmlu_ht:high_school_world_history|0 +community|arabic_mmlu_ht:human_aging|0 +community|arabic_mmlu_ht:human_sexuality|0 +community|arabic_mmlu_ht:international_law|0 +community|arabic_mmlu_ht:jurisprudence|0 +community|arabic_mmlu_ht:logical_fallacies|0 +community|arabic_mmlu_ht:machine_learning|0 +community|arabic_mmlu_ht:management|0 +community|arabic_mmlu_ht:marketing|0 +community|arabic_mmlu_ht:medical_genetics|0 +community|arabic_mmlu_ht:miscellaneous|0 +community|arabic_mmlu_ht:moral_disputes|0 +community|arabic_mmlu_ht:moral_scenarios|0 +community|arabic_mmlu_ht:nutrition|0 +community|arabic_mmlu_ht:philosophy|0 +community|arabic_mmlu_ht:prehistory|0 +community|arabic_mmlu_ht:professional_accounting|0 +community|arabic_mmlu_ht:professional_law|0 +community|arabic_mmlu_ht:professional_medicine|0 +community|arabic_mmlu_ht:professional_psychology|0 +community|arabic_mmlu_ht:public_relations|0 +community|arabic_mmlu_ht:security_studies|0 +community|arabic_mmlu_ht:sociology|0 +community|arabic_mmlu_ht:us_foreign_policy|0 +community|arabic_mmlu_ht:virology|0 +community|arabic_mmlu_ht:world_religions|0 +community|arabic_mmlu:Islamic Studies|0 +community|arabic_mmlu:Islamic Studies (Middle School)|0 +community|arabic_mmlu:Islamic Studies (Primary School)|0 +community|arabic_mmlu:Islamic Studies (High School)|0 +community|arabic_mmlu:Driving Test|0 +community|arabic_mmlu:Natural Science (Middle School)|0 +community|arabic_mmlu:Natural Science (Primary School)|0 +community|arabic_mmlu:History (Middle School)|0 +community|arabic_mmlu:History (Primary School)|0 +community|arabic_mmlu:History (High School)|0 
+community|arabic_mmlu:General Knowledge|0 +community|arabic_mmlu:General Knowledge (Middle School)|0 +community|arabic_mmlu:General Knowledge (Primary School)|0 +community|arabic_mmlu:Law (Professional)|0 +community|arabic_mmlu:Physics (High School)|0 +community|arabic_mmlu:Social Science (Middle School)|0 +community|arabic_mmlu:Social Science (Primary School)|0 +community|arabic_mmlu:Management (University)|0 +community|arabic_mmlu:Arabic Language (Middle School)|0 +community|arabic_mmlu:Arabic Language (Primary School)|0 +community|arabic_mmlu:Arabic Language (High School)|0 +community|arabic_mmlu:Political Science (University)|0 +community|arabic_mmlu:Philosophy (High School)|0 +community|arabic_mmlu:Accounting (University)|0 +community|arabic_mmlu:Computer Science (Middle School)|0 +community|arabic_mmlu:Computer Science (Primary School)|0 +community|arabic_mmlu:Computer Science (High School)|0 +community|arabic_mmlu:Computer Science (University)|0 +community|arabic_mmlu:Geography (Middle School)|0 +community|arabic_mmlu:Geography (Primary School)|0 +community|arabic_mmlu:Geography (High School)|0 +community|arabic_mmlu:Math (Primary School)|0 +community|arabic_mmlu:Biology (High School)|0 +community|arabic_mmlu:Economics (Middle School)|0 +community|arabic_mmlu:Economics (High School)|0 +community|arabic_mmlu:Economics (University)|0 +community|arabic_mmlu:Arabic Language (General)|0 +community|arabic_mmlu:Arabic Language (Grammar)|0 +community|arabic_mmlu:Civics (Middle School)|0 +community|arabic_mmlu:Civics (High School)|0 +community|madinah_qa:Arabic Language (General)|0 +community|madinah_qa:Arabic Language (Grammar)|0 +community|aratrust:Trustfulness|0 +community|aratrust:MentalHealth|0 +community|aratrust:PhysicalHealth|0 +community|aratrust:Offensive|0 +community|aratrust:Ethics|0 +community|aratrust:Privacy|0 +community|aratrust:Unfairness|0 +community|aratrust:Illegal|0 diff --git a/examples/tasks/all_filipino_tasks.txt b/examples/tasks/all_filipino_tasks.txt index 19f33917d..72bb13567 100644 --- a/examples/tasks/all_filipino_tasks.txt +++ b/examples/tasks/all_filipino_tasks.txt @@ -1,23 +1,23 @@ -community|readability_ceb_mcf|0|0 -community|kalahi_tgl_mcf|0|0 -community|kalahi_tgl_hybrid|0|0 -community|cebuaner_ceb_mcf|0|0 -community|universalner_tgl_mcf|0|0 -community|universalner_ceb_mcf|0|0 -community|tlunifiedner_tgl_mcf|0|0 -community|stingraybench_correctness_tgl_mcf|0|0 -community|stingraybench_semantic_appropriateness_tgl_mcf|0|0 -community|tatoeba_ceb|0|0 -community|tatoeba_tgl|0|0 -community|ntrex128_fil|0|0 -community|tico19_tgl|0|0 -community|dengue_filipino_fil|0|0 -community|include_tgl_mcf|0|0 -community|newsphnli_fil_mcf|0|0 -community|belebele_ceb_mcf|0|0 -community|belebele_fil_mcf|0|0 -community|sib200_ceb_mcf|0|0 -community|sib200_tgl_mcf|0|0 -community|firecs_fil_mcf|0|0 -community|global_mmlu_all_tgl_mcf|0|0 -community|balita_tgl_mcf|0|0 +community|readability_ceb_mcf|0 +community|kalahi_tgl_mcf|0 +community|kalahi_tgl_hybrid|0 +community|cebuaner_ceb_mcf|0 +community|universalner_tgl_mcf|0 +community|universalner_ceb_mcf|0 +community|tlunifiedner_tgl_mcf|0 +community|stingraybench_correctness_tgl_mcf|0 +community|stingraybench_semantic_appropriateness_tgl_mcf|0 +community|tatoeba_ceb|0 +community|tatoeba_tgl|0 +community|ntrex128_fil|0 +community|tico19_tgl|0 +community|dengue_filipino_fil|0 +community|include_tgl_mcf|0 +community|newsphnli_fil_mcf|0 +community|belebele_ceb_mcf|0 +community|belebele_fil_mcf|0 +community|sib200_ceb_mcf|0 
+community|sib200_tgl_mcf|0 +community|firecs_fil_mcf|0 +community|global_mmlu_all_tgl_mcf|0 +community|balita_tgl_mcf|0 diff --git a/examples/tasks/all_german_rag_evals.txt b/examples/tasks/all_german_rag_evals.txt index 29fec52bd..96ca24102 100644 --- a/examples/tasks/all_german_rag_evals.txt +++ b/examples/tasks/all_german_rag_evals.txt @@ -1,4 +1,4 @@ -community|german_rag_eval:choose_question_by_context|0|0 -community|german_rag_eval:choose_context_by_question|0|0 -community|german_rag_eval:question_answer_match|0|0 -community|german_rag_eval:context_question_match|0|0 +community|german_rag_eval:choose_question_by_context|0 +community|german_rag_eval:choose_context_by_question|0 +community|german_rag_eval:question_answer_match|0 +community|german_rag_eval:context_question_match|0 diff --git a/examples/tasks/all_tasks.txt b/examples/tasks/all_tasks.txt index 894ffc6f9..56a026126 100644 --- a/examples/tasks/all_tasks.txt +++ b/examples/tasks/all_tasks.txt @@ -1,1146 +1,1146 @@ -bigbench|abstract_narrative_understanding|0|0 -bigbench|anachronisms|0|0 -bigbench|analogical_similarity|0|0 -bigbench|analytic_entailment|0|0 -bigbench|arithmetic_bb|0|0 -bigbench|ascii_word_recognition|0|0 -bigbench|authorship_verification|0|0 -bigbench|auto_categorization|0|0 -bigbench|auto_debugging|0|0 -bigbench|bbq_lite_json|0|0 -bigbench|bridging_anaphora_resolution_barqa|0|0 -bigbench|causal_judgment|0|0 -bigbench|cause_and_effect|0|0 -bigbench|checkmate_in_one|0|0 -bigbench|chess_state_tracking|0|0 -bigbench|chinese_remainder_theorem|0|0 -bigbench|cifar10_classification|0|0 -bigbench|code_line_description|0|0 -bigbench|codenames|0|0 -bigbench|color|0|0 -bigbench|common_morpheme|0|0 -bigbench|conceptual_combinations|0|0 -bigbench|conlang_translation|0|0 -bigbench|contextual_parametric_knowledge_conflicts|0|0 -bigbench|crash_blossom|0|0 -bigbench|crass_ai|0|0 -bigbench|cryobiology_spanish|0|0 -bigbench|cryptonite|0|0 -bigbench|cs_algorithms|0|0 -bigbench|dark_humor_detection|0|0 -bigbench|date_understanding|0|0 -bigbench|disambiguation_qa|0|0 -bigbench|discourse_marker_prediction|0|0 -bigbench|disfl_qa|0|0 -bigbench|dyck_languages|0|0 -bigbench|elementary_math_qa|0|0 -bigbench|emoji_movie|0|0 -bigbench|emojis_emotion_prediction|0|0 -bigbench|empirical_judgments|0|0 -bigbench|english_proverbs|0|0 -bigbench|english_russian_proverbs|0|0 -bigbench|entailed_polarity_hindi|0|0 -bigbench|entailed_polarity|0|0 -bigbench|epistemic_reasoning|0|0 -bigbench|evaluating_information_essentiality|0|0 -bigbench|fact_checker|0|0 -bigbench|fantasy_reasoning|0|0 -bigbench|few_shot_nlg|0|0 -bigbench|figure_of_speech_detection|0|0 -bigbench|formal_fallacies_syllogisms_negation|0|0 -bigbench|gem|0|0 -bigbench|gender_inclusive_sentences_german|0|0 -bigbench|general_knowledge|0|0 -bigbench|geometric_shapes|0|0 -bigbench|goal_step_wikihow|0|0 -bigbench|gre_reading_comprehension|0|0 -bigbench|hhh_alignment|0|0 -bigbench|hindi_question_answering|0|0 -bigbench|hindu_knowledge|0|0 -bigbench|hinglish_toxicity|0|0 -bigbench|human_organs_senses|0|0 -bigbench|hyperbaton|0|0 -bigbench|identify_math_theorems|0|0 -bigbench|identify_odd_metaphor|0|0 -bigbench|implicatures|0|0 -bigbench|implicit_relations|0|0 -bigbench|intent_recognition|0|0 -bigbench|international_phonetic_alphabet_nli|0|0 -bigbench|international_phonetic_alphabet_transliterate|0|0 -bigbench|intersect_geometry|0|0 -bigbench|irony_identification|0|0 -bigbench|kanji_ascii|0|0 -bigbench|kannada|0|0 -bigbench|key_value_maps|0|0 -bigbench|known_unknowns|0|0 
-bigbench|language_games|0|0 -bigbench|language_identification|0|0 -bigbench|linguistic_mappings|0|0 -bigbench|linguistics_puzzles|0|0 -bigbench|logic_grid_puzzle|0|0 -bigbench|logical_args|0|0 -bigbench|logical_deduction|0|0 -bigbench|logical_fallacy_detection|0|0 -bigbench|logical_sequence|0|0 -bigbench|mathematical_induction|0|0 -bigbench|matrixshapes|0|0 -bigbench|metaphor_boolean|0|0 -bigbench|metaphor_understanding|0|0 -bigbench|minute_mysteries_qa|0|0 -bigbench|misconceptions_russian|0|0 -bigbench|misconceptions|0|0 -bigbench|mnist_ascii|0|0 -bigbench|modified_arithmetic|0|0 -bigbench|moral_permissibility|0|0 -bigbench|movie_dialog_same_or_different|0|0 -bigbench|movie_recommendation|0|0 -bigbench|mult_data_wrangling|0|0 -bigbench|multiemo|0|0 -bigbench|natural_instructions|0|0 -bigbench|navigate|0|0 -bigbench|nonsense_words_grammar|0|0 -bigbench|novel_concepts|0|0 -bigbench|object_counting|0|0 -bigbench|odd_one_out|0|0 -bigbench|operators|0|0 -bigbench|paragraph_segmentation|0|0 -bigbench|parsinlu_qa|0|0 -bigbench|parsinlu_reading_comprehension|0|0 -bigbench|penguins_in_a_table|0|0 -bigbench|periodic_elements|0|0 -bigbench|persian_idioms|0|0 -bigbench|phrase_relatedness|0|0 -bigbench|physical_intuition|0|0 -bigbench|physics_questions|0|0 -bigbench|physics|0|0 -bigbench|play_dialog_same_or_different|0|0 -bigbench|polish_sequence_labeling|0|0 -bigbench|presuppositions_as_nli|0|0 -bigbench|qa_wikidata|0|0 -bigbench|question_selection|0|0 -bigbench|real_or_fake_text|0|0 -bigbench|reasoning_about_colored_objects|0|0 -bigbench|repeat_copy_logic|0|0 -bigbench|rephrase|0|0 -bigbench|rhyming|0|0 -bigbench|riddle_sense|0|0 -bigbench|ruin_names|0|0 -bigbench|salient_translation_error_detection|0|0 -bigbench|scientific_press_release|0|0 -bigbench|semantic_parsing_in_context_sparc|0|0 -bigbench|semantic_parsing_spider|0|0 -bigbench|sentence_ambiguity|0|0 -bigbench|similarities_abstraction|0|0 -bigbench|simp_turing_concept|0|0 -bigbench|simple_arithmetic_json_multiple_choice|0|0 -bigbench|simple_arithmetic_json_subtasks|0|0 -bigbench|simple_arithmetic_json|0|0 -bigbench|simple_arithmetic_multiple_targets_json|0|0 -bigbench|simple_ethical_questions|0|0 -bigbench|simple_text_editing|0|0 -bigbench|snarks|0|0 -bigbench|social_iqa|0|0 -bigbench|social_support|0|0 -bigbench|sports_understanding|0|0 -bigbench|strange_stories|0|0 -bigbench|strategyqa|0|0 -bigbench|sufficient_information|0|0 -bigbench|suicide_risk|0|0 -bigbench|swahili_english_proverbs|0|0 -bigbench|swedish_to_german_proverbs|0|0 -bigbench|symbol_interpretation|0|0 -bigbench|tellmewhy|0|0 -bigbench|temporal_sequences|0|0 -bigbench|tense|0|0 -bigbench|timedial|0|0 -bigbench|topical_chat|0|0 -bigbench|tracking_shuffled_objects|0|0 -bigbench|understanding_fables|0|0 -bigbench|undo_permutation|0|0 -bigbench|unit_conversion|0|0 -bigbench|unit_interpretation|0|0 -bigbench|unnatural_in_context_learning|0|0 -bigbench|vitaminc_fact_verification|0|0 -bigbench|what_is_the_tao|0|0 -bigbench|which_wiki_edit|0|0 -bigbench|wino_x_german|0|0 -bigbench|winowhy|0|0 -bigbench|word_sorting|0|0 -bigbench|word_unscrambling|0|0 -helm|babi_qa|0|0 -helm|bbq:Age|0|0 -helm|bbq:Disability_status|0|0 -helm|bbq:Gender_identity|0|0 -helm|bbq:Nationality|0|0 -helm|bbq:Physical_appearance|0|0 -helm|bbq:Race_ethnicity|0|0 -helm|bbq:Race_x_SES|0|0 -helm|bbq:Race_x_gender|0|0 -helm|bbq:Religion|0|0 -helm|bbq:SES|0|0 -helm|bbq:Sexual_orientation|0|0 -helm|bbq|0|0 -helm|bigbench:auto_debugging|0|0 -helm|bigbench:bbq_lite_json:age_ambig|0|0 
-helm|bigbench:bbq_lite_json:age_disambig|0|0 -helm|bigbench:bbq_lite_json:disability_status_ambig|0|0 -helm|bigbench:bbq_lite_json:disability_status_disambig|0|0 -helm|bigbench:bbq_lite_json:gender_identity_ambig|0|0 -helm|bigbench:bbq_lite_json:gender_identity_disambig|0|0 -helm|bigbench:bbq_lite_json:nationality_ambig|0|0 -helm|bigbench:bbq_lite_json:nationality_disambig|0|0 -helm|bigbench:bbq_lite_json:physical_appearance_ambig|0|0 -helm|bigbench:bbq_lite_json:physical_appearance_disambig|0|0 -helm|bigbench:bbq_lite_json:race_ethnicity_ambig|0|0 -helm|bigbench:bbq_lite_json:race_ethnicity_disambig|0|0 -helm|bigbench:bbq_lite_json:religion_ambig|0|0 -helm|bigbench:bbq_lite_json:religion_disambig|0|0 -helm|bigbench:bbq_lite_json:ses_ambig|0|0 -helm|bigbench:bbq_lite_json:ses_disambig|0|0 -helm|bigbench:bbq_lite_json:sexual_orientation_ambig|0|0 -helm|bigbench:bbq_lite_json:sexual_orientation_disambig|0|0 -helm|bigbench:code_line_description|0|0 -helm|bigbench:conceptual_combinations:contradictions|0|0 -helm|bigbench:conceptual_combinations:emergent_properties|0|0 -helm|bigbench:conceptual_combinations:fanciful_fictional_combinations|0|0 -helm|bigbench:conceptual_combinations:homonyms|0|0 -helm|bigbench:conceptual_combinations:invented_words|0|0 -helm|bigbench:conlang_translation:adna_from|0|0 -helm|bigbench:conlang_translation:adna_to|0|0 -helm|bigbench:conlang_translation:atikampe_from|0|0 -helm|bigbench:conlang_translation:atikampe_to|0|0 -helm|bigbench:conlang_translation:gornam_from|0|0 -helm|bigbench:conlang_translation:gornam_to|0|0 -helm|bigbench:conlang_translation:holuan_from|0|0 -helm|bigbench:conlang_translation:holuan_to|0|0 -helm|bigbench:conlang_translation:mkafala_from|0|0 -helm|bigbench:conlang_translation:mkafala_to|0|0 -helm|bigbench:conlang_translation:postpositive_english_from|0|0 -helm|bigbench:conlang_translation:postpositive_english_to|0|0 -helm|bigbench:conlang_translation:unapuri_from|0|0 -helm|bigbench:conlang_translation:unapuri_to|0|0 -helm|bigbench:conlang_translation:vaomi_from|0|0 -helm|bigbench:conlang_translation:vaomi_to|0|0 -helm|bigbench:emoji_movie|0|0 -helm|bigbench:formal_fallacies_syllogisms_negation|0|0 -helm|bigbench:hindu_knowledge|0|0 -helm|bigbench:known_unknowns|0|0 -helm|bigbench:language_identification|0|0 -helm|bigbench:linguistics_puzzles|0|0 -helm|bigbench:logic_grid_puzzle|0|0 -helm|bigbench:logical_deduction-five_objects|0|0 -helm|bigbench:logical_deduction-seven_objects|0|0 -helm|bigbench:logical_deduction-three_objects|0|0 -helm|bigbench:misconceptions_russian|0|0 -helm|bigbench:novel_concepts|0|0 -helm|bigbench:operators|0|0 -helm|bigbench:parsinlu_reading_comprehension|0|0 -helm|bigbench:play_dialog_same_or_different|0|0 -helm|bigbench:repeat_copy_logic|0|0 -helm|bigbench:strange_stories-boolean|0|0 -helm|bigbench:strange_stories-multiple_choice|0|0 -helm|bigbench:strategyqa|0|0 -helm|bigbench:symbol_interpretation-adversarial|0|0 -helm|bigbench:symbol_interpretation-emoji_agnostic|0|0 -helm|bigbench:symbol_interpretation-name_agnostic|0|0 -helm|bigbench:symbol_interpretation-plain|0|0 -helm|bigbench:symbol_interpretation-tricky|0|0 -helm|bigbench:vitaminc_fact_verification|0|0 -helm|bigbench:winowhy|0|0 -helm|blimp:adjunct_island|0|0 -helm|blimp:anaphor_gender_agreement|0|0 -helm|blimp:anaphor_number_agreement|0|0 -helm|blimp:animate_subject_passive|0|0 -helm|blimp:animate_subject_trans|0|0 -helm|blimp:causative|0|0 -helm|blimp:complex_NP_island|0|0 -helm|blimp:coordinate_structure_constraint_complex_left_branch|0|0 
-helm|blimp:coordinate_structure_constraint_object_extraction|0|0 -helm|blimp:determiner_noun_agreement_1|0|0 -helm|blimp:determiner_noun_agreement_2|0|0 -helm|blimp:determiner_noun_agreement_irregular_1|0|0 -helm|blimp:determiner_noun_agreement_irregular_2|0|0 -helm|blimp:determiner_noun_agreement_with_adj_2|0|0 -helm|blimp:determiner_noun_agreement_with_adj_irregular_1|0|0 -helm|blimp:determiner_noun_agreement_with_adj_irregular_2|0|0 -helm|blimp:determiner_noun_agreement_with_adjective_1|0|0 -helm|blimp:distractor_agreement_relational_noun|0|0 -helm|blimp:distractor_agreement_relative_clause|0|0 -helm|blimp:drop_argument|0|0 -helm|blimp:ellipsis_n_bar_1|0|0 -helm|blimp:ellipsis_n_bar_2|0|0 -helm|blimp:existential_there_object_raising|0|0 -helm|blimp:existential_there_quantifiers_1|0|0 -helm|blimp:existential_there_quantifiers_2|0|0 -helm|blimp:existential_there_subject_raising|0|0 -helm|blimp:expletive_it_object_raising|0|0 -helm|blimp:inchoative|0|0 -helm|blimp:intransitive|0|0 -helm|blimp:irregular_past_participle_adjectives|0|0 -helm|blimp:irregular_past_participle_verbs|0|0 -helm|blimp:irregular_plural_subject_verb_agreement_1|0|0 -helm|blimp:irregular_plural_subject_verb_agreement_2|0|0 -helm|blimp:left_branch_island_echo_question|0|0 -helm|blimp:left_branch_island_simple_question|0|0 -helm|blimp:matrix_question_npi_licensor_present|0|0 -helm|blimp:npi_present_1|0|0 -helm|blimp:npi_present_2|0|0 -helm|blimp:only_npi_licensor_present|0|0 -helm|blimp:only_npi_scope|0|0 -helm|blimp:passive_1|0|0 -helm|blimp:passive_2|0|0 -helm|blimp:principle_A_c_command|0|0 -helm|blimp:principle_A_case_1|0|0 -helm|blimp:principle_A_case_2|0|0 -helm|blimp:principle_A_domain_1|0|0 -helm|blimp:principle_A_domain_2|0|0 -helm|blimp:principle_A_domain_3|0|0 -helm|blimp:principle_A_reconstruction|0|0 -helm|blimp:regular_plural_subject_verb_agreement_1|0|0 -helm|blimp:regular_plural_subject_verb_agreement_2|0|0 -helm|blimp:sentential_negation_npi_licensor_present|0|0 -helm|blimp:sentential_negation_npi_scope|0|0 -helm|blimp:sentential_subject_island|0|0 -helm|blimp:superlative_quantifiers_1|0|0 -helm|blimp:superlative_quantifiers_2|0|0 -helm|blimp:tough_vs_raising_1|0|0 -helm|blimp:tough_vs_raising_2|0|0 -helm|blimp:transitive|0|0 -helm|blimp:wh_island|0|0 -helm|blimp:wh_questions_object_gap|0|0 -helm|blimp:wh_questions_subject_gap_long_distance|0|0 -helm|blimp:wh_questions_subject_gap|0|0 -helm|blimp:wh_vs_that_no_gap_long_distance|0|0 -helm|blimp:wh_vs_that_no_gap|0|0 -helm|blimp:wh_vs_that_with_gap_long_distance|0|0 -helm|blimp:wh_vs_that_with_gap|0|0 -helm|bold:gender|0|0 -helm|bold:political_ideology|0|0 -helm|bold:profession|0|0 -helm|bold:race|0|0 -helm|bold:religious_ideology|0|0 -helm|bold|0|0 -helm|boolq:contrastset|0|0 -helm|boolq|0|0 -helm|civil_comments:LGBTQ|0|0 -helm|civil_comments:black|0|0 -helm|civil_comments:christian|0|0 -helm|civil_comments:female|0|0 -helm|civil_comments:male|0|0 -helm|civil_comments:muslim|0|0 -helm|civil_comments:other_religions|0|0 -helm|civil_comments:white|0|0 -helm|civil_comments|0|0 -helm|commonsenseqa|0|0 -helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_125|0|0 -helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_25|0|0 -helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_5|0|0 -helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_125|0|0 -helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_25|0|0 -helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_5|0|0 
-helm|copyright:oh_the_places|0|0 -helm|copyright:pilot|0|0 -helm|copyright:popular_books-prefix_length_10|0|0 -helm|copyright:popular_books-prefix_length_125|0|0 -helm|copyright:popular_books-prefix_length_250|0|0 -helm|copyright:popular_books-prefix_length_25|0|0 -helm|copyright:popular_books-prefix_length_50|0|0 -helm|copyright:popular_books-prefix_length_5|0|0 -helm|copyright:prompt_num_line_1-min_lines_20|0|0 -helm|copyright:prompt_num_line_10-min_lines_20|0|0 -helm|copyright:prompt_num_line_5-min_lines_20|0|0 -helm|covid_dialogue|0|0 -helm|dyck_language:2|0|0 -helm|dyck_language:3|0|0 -helm|dyck_language:4|0|0 -helm|entity_data_imputation:Buy|0|0 -helm|entity_data_imputation:Restaurant|0|0 -helm|entity_matching:Abt_Buy|0|0 -helm|entity_matching:Amazon_Google|0|0 -helm|entity_matching:Beer|0|0 -helm|entity_matching:Company|0|0 -helm|entity_matching:DBLP_ACM|0|0 -helm|entity_matching:DBLP_GoogleScholar|0|0 -helm|entity_matching:Dirty_DBLP_ACM|0|0 -helm|entity_matching:Dirty_DBLP_GoogleScholar|0|0 -helm|entity_matching:Dirty_Walmart_Amazon|0|0 -helm|entity_matching:Dirty_iTunes_Amazon|0|0 -helm|entity_matching:Fodors_Zagats|0|0 -helm|entity_matching:Walmart_Amazon|0|0 -helm|entity_matching:iTunes_Amazon|0|0 -helm|hellaswag|0|0 -helm|humaneval|0|0 -helm|imdb:contrastset|0|0 -helm|imdb|0|0 -helm|interactive_qa_mmlu:abstract_algebra|0|0 -helm|interactive_qa_mmlu:college_chemistry|0|0 -helm|interactive_qa_mmlu:global_facts|0|0 -helm|interactive_qa_mmlu:miscellaneous|0|0 -helm|interactive_qa_mmlu:nutrition|0|0 -helm|interactive_qa_mmlu:us_foreign_policy|0|0 -helm|legal_summarization:billsum|0|0 -helm|legal_summarization:eurlexsum|0|0 -helm|legal_summarization:multilexsum|0|0 -helm|legalsupport|0|0 -helm|lexglue:case_hold|0|0 -helm|lexglue:ecthr_a|0|0 -helm|lexglue:ecthr_b|0|0 -helm|lexglue:eurlex|0|0 -helm|lexglue:ledgar|0|0 -helm|lexglue:scotus|0|0 -helm|lexglue:unfair_tos|0|0 -helm|lextreme:brazilian_court_decisions_judgment|0|0 -helm|lextreme:brazilian_court_decisions_unanimity|0|0 -helm|lextreme:covid19_emergency_event|0|0 -helm|lextreme:german_argument_mining|0|0 -helm|lextreme:greek_legal_code_chapter|0|0 -helm|lextreme:greek_legal_code_subject|0|0 -helm|lextreme:greek_legal_code_volume|0|0 -helm|lextreme:greek_legal_ner|0|0 -helm|lextreme:legalnero|0|0 -helm|lextreme:lener_br|0|0 -helm|lextreme:mapa_coarse|0|0 -helm|lextreme:mapa_fine|0|0 -helm|lextreme:multi_eurlex_level_1|0|0 -helm|lextreme:multi_eurlex_level_2|0|0 -helm|lextreme:multi_eurlex_level_3|0|0 -helm|lextreme:online_terms_of_service_clause_topics|0|0 -helm|lextreme:online_terms_of_service_unfairness_levels|0|0 -helm|lextreme:swiss_judgment_prediction|0|0 -helm|lsat_qa:assignment|0|0 -helm|lsat_qa:grouping|0|0 -helm|lsat_qa:miscellaneous|0|0 -helm|lsat_qa:ordering|0|0 -helm|lsat_qa|0|0 -helm|me_q_sum|0|0 -helm|med_dialog:healthcaremagic|0|0 -helm|med_dialog:icliniq|0|0 -helm|med_mcqa|0|0 -helm|med_paragraph_simplification|0|0 -helm|med_qa|0|0 -helm|mmlu:abstract_algebra|0|0 -helm|mmlu:anatomy|0|0 -helm|mmlu:astronomy|0|0 -helm|mmlu:business_ethics|0|0 -helm|mmlu:clinical_knowledge|0|0 -helm|mmlu:college_biology|0|0 -helm|mmlu:college_chemistry|0|0 -helm|mmlu:college_computer_science|0|0 -helm|mmlu:college_mathematics|0|0 -helm|mmlu:college_medicine|0|0 -helm|mmlu:college_physics|0|0 -helm|mmlu:computer_security|0|0 -helm|mmlu:conceptual_physics|0|0 -helm|mmlu:econometrics|0|0 -helm|mmlu:electrical_engineering|0|0 -helm|mmlu:elementary_mathematics|0|0 -helm|mmlu:formal_logic|0|0 -helm|mmlu:global_facts|0|0 
-helm|mmlu:high_school_biology|0|0 -helm|mmlu:high_school_chemistry|0|0 -helm|mmlu:high_school_computer_science|0|0 -helm|mmlu:high_school_european_history|0|0 -helm|mmlu:high_school_geography|0|0 -helm|mmlu:high_school_government_and_politics|0|0 -helm|mmlu:high_school_macroeconomics|0|0 -helm|mmlu:high_school_mathematics|0|0 -helm|mmlu:high_school_microeconomics|0|0 -helm|mmlu:high_school_physics|0|0 -helm|mmlu:high_school_psychology|0|0 -helm|mmlu:high_school_statistics|0|0 -helm|mmlu:high_school_us_history|0|0 -helm|mmlu:high_school_world_history|0|0 -helm|mmlu:human_aging|0|0 -helm|mmlu:human_sexuality|0|0 -helm|mmlu:international_law|0|0 -helm|mmlu:jurisprudence|0|0 -helm|mmlu:logical_fallacies|0|0 -helm|mmlu:machine_learning|0|0 -helm|mmlu:management|0|0 -helm|mmlu:marketing|0|0 -helm|mmlu:medical_genetics|0|0 -helm|mmlu:miscellaneous|0|0 -helm|mmlu:moral_disputes|0|0 -helm|mmlu:moral_scenarios|0|0 -helm|mmlu:nutrition|0|0 -helm|mmlu:philosophy|0|0 -helm|mmlu:prehistory|0|0 -helm|mmlu:professional_accounting|0|0 -helm|mmlu:professional_law|0|0 -helm|mmlu:professional_medicine|0|0 -helm|mmlu:professional_psychology|0|0 -helm|mmlu:public_relations|0|0 -helm|mmlu:security_studies|0|0 -helm|mmlu:sociology|0|0 -helm|mmlu:us_foreign_policy|0|0 -helm|mmlu:virology|0|0 -helm|mmlu:world_religions|0|0 -helm|mmlu|0|0 -helm|narrativeqa|0|0 -helm|numeracy:linear_example|0|0 -helm|numeracy:linear_standard|0|0 -helm|numeracy:parabola_example|0|0 -helm|numeracy:parabola_standard|0|0 -helm|numeracy:paraboloid_example|0|0 -helm|numeracy:paraboloid_standard|0|0 -helm|numeracy:plane_example|0|0 -helm|numeracy:plane_standard|0|0 -helm|openbookqa|0|0 -helm|piqa|0|0 -helm|pubmedqa|0|0 -helm|quac|0|0 -helm|raft:ade_corpus_v2|0|0 -helm|raft:banking_77|0|0 -helm|raft:neurips_impact_statement_risks|0|0 -helm|raft:one_stop_english|0|0 -helm|raft:overruling|0|0 -helm|raft:semiconductor_org_types|0|0 -helm|raft:systematic_review_inclusion|0|0 -helm|raft:tai_safety_research|0|0 -helm|raft:terms_of_service|0|0 -helm|raft:tweet_eval_hate|0|0 -helm|raft:twitter_complaints|0|0 -helm|real_toxicity_prompts|0|0 -helm|siqa|0|0 -helm|summarization:cnn-dm|0|0 -helm|summarization:xsum-sampled|0|0 -helm|summarization:xsum|0|0 -helm|synthetic_reasoning:induction|0|0 -helm|synthetic_reasoning:natural_easy|0|0 -helm|synthetic_reasoning:natural_hard|0|0 -helm|synthetic_reasoning:pattern_match|0|0 -helm|synthetic_reasoning:variable_substitution|0|0 -helm|the_pile:arxiv|0|0 -helm|the_pile:bibliotik|0|0 -helm|the_pile:commoncrawl|0|0 -helm|the_pile:dm-mathematics|0|0 -helm|the_pile:enron|0|0 -helm|the_pile:europarl|0|0 -helm|the_pile:freelaw|0|0 -helm|the_pile:github|0|0 -helm|the_pile:gutenberg|0|0 -helm|the_pile:hackernews|0|0 -helm|the_pile:nih-exporter|0|0 -helm|the_pile:opensubtitles|0|0 -helm|the_pile:openwebtext2|0|0 -helm|the_pile:pubmed-abstracts|0|0 -helm|the_pile:pubmed-central|0|0 -helm|the_pile:stackexchange|0|0 -helm|the_pile:upsto|0|0 -helm|the_pile:wikipedia|0|0 -helm|the_pile:youtubesubtitles|0|0 -helm|truthfulqa|0|0 -helm|twitterAAE:aa|0|0 -helm|twitterAAE:white|0|0 -helm|wikifact:applies_to_jurisdiction|0|0 -helm|wikifact:atomic_number|0|0 -helm|wikifact:author|0|0 -helm|wikifact:award_received|0|0 -helm|wikifact:basic_form_of_government|0|0 -helm|wikifact:capital_of|0|0 -helm|wikifact:capital|0|0 -helm|wikifact:central_bank|0|0 -helm|wikifact:composer|0|0 -helm|wikifact:continent|0|0 -helm|wikifact:country_of_citizenship|0|0 -helm|wikifact:country_of_origin|0|0 -helm|wikifact:country|0|0 
-helm|wikifact:creator|0|0 -helm|wikifact:currency|0|0 -helm|wikifact:defendant|0|0 -helm|wikifact:developer|0|0 -helm|wikifact:diplomatic_relation|0|0 -helm|wikifact:director|0|0 -helm|wikifact:discoverer_or_inventor|0|0 -helm|wikifact:drug_or_therapy_used_for_treatment|0|0 -helm|wikifact:educated_at|0|0 -helm|wikifact:electron_configuration|0|0 -helm|wikifact:employer|0|0 -helm|wikifact:field_of_work|0|0 -helm|wikifact:file_extension|0|0 -helm|wikifact:genetic_association|0|0 -helm|wikifact:genre|0|0 -helm|wikifact:has_part|0|0 -helm|wikifact:head_of_government|0|0 -helm|wikifact:head_of_state|0|0 -helm|wikifact:headquarters_location|0|0 -helm|wikifact:industry|0|0 -helm|wikifact:influenced_by|0|0 -helm|wikifact:instance_of|0|0 -helm|wikifact:instrument|0|0 -helm|wikifact:language_of_work_or_name|0|0 -helm|wikifact:languages_spoken_written_or_signed|0|0 -helm|wikifact:laws_applied|0|0 -helm|wikifact:located_in_the_administrative_territorial_entity|0|0 -helm|wikifact:location_of_discovery|0|0 -helm|wikifact:location_of_formation|0|0 -helm|wikifact:location|0|0 -helm|wikifact:majority_opinion_by|0|0 -helm|wikifact:manufacturer|0|0 -helm|wikifact:measured_physical_quantity|0|0 -helm|wikifact:medical_condition_treated|0|0 -helm|wikifact:member_of_political_party|0|0 -helm|wikifact:member_of_sports_team|0|0 -helm|wikifact:member_of|0|0 -helm|wikifact:movement|0|0 -helm|wikifact:named_after|0|0 -helm|wikifact:native_language|0|0 -helm|wikifact:number_of_processor_cores|0|0 -helm|wikifact:occupation|0|0 -helm|wikifact:office_held_by_head_of_government|0|0 -helm|wikifact:office_held_by_head_of_state|0|0 -helm|wikifact:official_language|0|0 -helm|wikifact:operating_system|0|0 -helm|wikifact:original_language_of_film_or_TV_show|0|0 -helm|wikifact:original_network|0|0 -helm|wikifact:overrules|0|0 -helm|wikifact:owned_by|0|0 -helm|wikifact:part_of|0|0 -helm|wikifact:participating_team|0|0 -helm|wikifact:place_of_birth|0|0 -helm|wikifact:place_of_death|0|0 -helm|wikifact:plaintiff|0|0 -helm|wikifact:position_held|0|0 -helm|wikifact:position_played_on_team|0|0 -helm|wikifact:programming_language|0|0 -helm|wikifact:recommended_unit_of_measurement|0|0 -helm|wikifact:record_label|0|0 -helm|wikifact:religion|0|0 -helm|wikifact:repealed_by|0|0 -helm|wikifact:shares_border_with|0|0 -helm|wikifact:solved_by|0|0 -helm|wikifact:statement_describes|0|0 -helm|wikifact:stock_exchange|0|0 -helm|wikifact:subclass_of|0|0 -helm|wikifact:subsidiary|0|0 -helm|wikifact:symptoms_and_signs|0|0 -helm|wikifact:therapeutic_area|0|0 -helm|wikifact:time_of_discovery_or_invention|0|0 -helm|wikifact:twinned_administrative_body|0|0 -helm|wikifact:work_location|0|0 -helm|wikitext:103|0|0 -helm|wmt14:cs-en|0|0 -helm|wmt14:de-en|0|0 -helm|wmt14:fr-en|0|0 -helm|wmt14:hi-en|0|0 -helm|wmt14:ru-en|0|0 -lighteval|anli:r1|0|0 -lighteval|anli:r2|0|0 -lighteval|anli:r3|0|0 -lighteval|anli|0|0 -leaderboard|arc:challenge|0|0 -lighteval|arc:easy|0|0 -lighteval|arithmetic:1dc|0|0 -lighteval|arithmetic:2da|0|0 -lighteval|arithmetic:2dm|0|0 -lighteval|arithmetic:2ds|0|0 -lighteval|arithmetic:3da|0|0 -lighteval|arithmetic:3ds|0|0 -lighteval|arithmetic:4da|0|0 -lighteval|arithmetic:4ds|0|0 -lighteval|arithmetic:5da|0|0 -lighteval|arithmetic:5ds|0|0 -lighteval|asdiv|0|0 -lighteval|blimp:adjunct_island|0|0 -lighteval|blimp:anaphor_gender_agreement|0|0 -lighteval|blimp:anaphor_number_agreement|0|0 -lighteval|blimp:animate_subject_passive|0|0 -lighteval|blimp:animate_subject_trans|0|0 -lighteval|blimp:causative|0|0 
-lighteval|blimp:complex_NP_island|0|0 -lighteval|blimp:coordinate_structure_constraint_complex_left_branch|0|0 -lighteval|blimp:coordinate_structure_constraint_object_extraction|0|0 -lighteval|blimp:determiner_noun_agreement_1|0|0 -lighteval|blimp:determiner_noun_agreement_2|0|0 -lighteval|blimp:determiner_noun_agreement_irregular_1|0|0 -lighteval|blimp:determiner_noun_agreement_irregular_2|0|0 -lighteval|blimp:determiner_noun_agreement_with_adj_2|0|0 -lighteval|blimp:determiner_noun_agreement_with_adj_irregular_1|0|0 -lighteval|blimp:determiner_noun_agreement_with_adj_irregular_2|0|0 -lighteval|blimp:determiner_noun_agreement_with_adjective_1|0|0 -lighteval|blimp:distractor_agreement_relational_noun|0|0 -lighteval|blimp:distractor_agreement_relative_clause|0|0 -lighteval|blimp:drop_argument|0|0 -lighteval|blimp:ellipsis_n_bar_1|0|0 -lighteval|blimp:ellipsis_n_bar_2|0|0 -lighteval|blimp:existential_there_object_raising|0|0 -lighteval|blimp:existential_there_quantifiers_1|0|0 -lighteval|blimp:existential_there_quantifiers_2|0|0 -lighteval|blimp:existential_there_subject_raising|0|0 -lighteval|blimp:expletive_it_object_raising|0|0 -lighteval|blimp:inchoative|0|0 -lighteval|blimp:intransitive|0|0 -lighteval|blimp:irregular_past_participle_adjectives|0|0 -lighteval|blimp:irregular_past_participle_verbs|0|0 -lighteval|blimp:irregular_plural_subject_verb_agreement_1|0|0 -lighteval|blimp:irregular_plural_subject_verb_agreement_2|0|0 -lighteval|blimp:left_branch_island_echo_question|0|0 -lighteval|blimp:left_branch_island_simple_question|0|0 -lighteval|blimp:matrix_question_npi_licensor_present|0|0 -lighteval|blimp:npi_present_1|0|0 -lighteval|blimp:npi_present_2|0|0 -lighteval|blimp:only_npi_licensor_present|0|0 -lighteval|blimp:only_npi_scope|0|0 -lighteval|blimp:passive_1|0|0 -lighteval|blimp:passive_2|0|0 -lighteval|blimp:principle_A_c_command|0|0 -lighteval|blimp:principle_A_case_1|0|0 -lighteval|blimp:principle_A_case_2|0|0 -lighteval|blimp:principle_A_domain_1|0|0 -lighteval|blimp:principle_A_domain_2|0|0 -lighteval|blimp:principle_A_domain_3|0|0 -lighteval|blimp:principle_A_reconstruction|0|0 -lighteval|blimp:regular_plural_subject_verb_agreement_1|0|0 -lighteval|blimp:regular_plural_subject_verb_agreement_2|0|0 -lighteval|blimp:sentential_negation_npi_licensor_present|0|0 -lighteval|blimp:sentential_negation_npi_scope|0|0 -lighteval|blimp:sentential_subject_island|0|0 -lighteval|blimp:superlative_quantifiers_1|0|0 -lighteval|blimp:superlative_quantifiers_2|0|0 -lighteval|blimp:tough_vs_raising_1|0|0 -lighteval|blimp:tough_vs_raising_2|0|0 -lighteval|blimp:transitive|0|0 -lighteval|blimp:wh_island|0|0 -lighteval|blimp:wh_questions_object_gap|0|0 -lighteval|blimp:wh_questions_subject_gap_long_distance|0|0 -lighteval|blimp:wh_questions_subject_gap|0|0 -lighteval|blimp:wh_vs_that_no_gap_long_distance|0|0 -lighteval|blimp:wh_vs_that_no_gap|0|0 -lighteval|blimp:wh_vs_that_with_gap_long_distance|0|0 -lighteval|blimp:wh_vs_that_with_gap|0|0 -lighteval|coqa_bb|0|0 -lighteval|coqa|0|0 -lighteval|drop|0|0 -lighteval|ethics:commonsense|0|0 -lighteval|ethics:deontology|0|0 -lighteval|ethics:justice|0|0 -lighteval|ethics:utilitarianism|0|0 -lighteval|ethics:virtue|0|0 -lighteval|glue:cola|0|0 -lighteval|glue:mnli_mismatched|0|0 -lighteval|glue:mnli|0|0 -lighteval|glue:mrpc|0|0 -lighteval|glue:qnli|0|0 -lighteval|glue:qqp|0|0 -lighteval|glue:rte|0|0 -lighteval|glue:sst2|0|0 -lighteval|glue:stsb|0|0 -lighteval|glue:wnli|0|0 -leaderboard|gsm8k|0|0 -lighteval|headqa:en|0|0 -lighteval|headqa:es|0|0 
-leaderboard|hellaswag|0|0 -lighteval|iwslt17:ar-en|0|0 -lighteval|iwslt17:de-en|0|0 -lighteval|iwslt17:en-ar|0|0 -lighteval|iwslt17:en-de|0|0 -lighteval|iwslt17:en-fr|0|0 -lighteval|iwslt17:en-ja|0|0 -lighteval|iwslt17:en-ko|0|0 -lighteval|iwslt17:en-zh|0|0 -lighteval|iwslt17:fr-en|0|0 -lighteval|iwslt17:ja-en|0|0 -lighteval|iwslt17:ko-en|0|0 -lighteval|iwslt17:zh-en|0|0 -lighteval|lambada:openai:de|0|0 -lighteval|lambada:openai:en|0|0 -lighteval|lambada:openai:es|0|0 -lighteval|lambada:openai:fr|0|0 -lighteval|lambada:openai:it|0|0 -lighteval|lambada:openai_cloze|0|0 -lighteval|lambada:openai|0|0 -lighteval|lambada:standard_cloze|0|0 -lighteval|lambada:standard|0|0 -lighteval|logiqa|0|0 -lighteval|math:algebra|0|0 -lighteval|math:counting_and_probability|0|0 -lighteval|math:geometry|0|0 -lighteval|math:intermediate_algebra|0|0 -lighteval|math:number_theory|0|0 -lighteval|math:prealgebra|0|0 -lighteval|math:precalculus|0|0 -lighteval|mathqa|0|0 -lighteval|mgsm:bn|0|0 -lighteval|mgsm:de|0|0 -lighteval|mgsm:en|0|0 -lighteval|mgsm:es|0|0 -lighteval|mgsm:fr|0|0 -lighteval|mgsm:ja|0|0 -lighteval|mgsm:ru|0|0 -lighteval|mgsm:sw|0|0 -lighteval|mgsm:te|0|0 -lighteval|mgsm:th|0|0 -lighteval|mgsm:zh|0|0 -leaderboard|mmlu:abstract_algebra|0|0 -leaderboard|mmlu:anatomy|0|0 -leaderboard|mmlu:astronomy|0|0 -leaderboard|mmlu:business_ethics|0|0 -leaderboard|mmlu:clinical_knowledge|0|0 -leaderboard|mmlu:college_biology|0|0 -leaderboard|mmlu:college_chemistry|0|0 -leaderboard|mmlu:college_computer_science|0|0 -leaderboard|mmlu:college_mathematics|0|0 -leaderboard|mmlu:college_medicine|0|0 -leaderboard|mmlu:college_physics|0|0 -leaderboard|mmlu:computer_security|0|0 -leaderboard|mmlu:conceptual_physics|0|0 -leaderboard|mmlu:econometrics|0|0 -leaderboard|mmlu:electrical_engineering|0|0 -leaderboard|mmlu:elementary_mathematics|0|0 -leaderboard|mmlu:formal_logic|0|0 -leaderboard|mmlu:global_facts|0|0 -leaderboard|mmlu:high_school_biology|0|0 -leaderboard|mmlu:high_school_chemistry|0|0 -leaderboard|mmlu:high_school_computer_science|0|0 -leaderboard|mmlu:high_school_european_history|0|0 -leaderboard|mmlu:high_school_geography|0|0 -leaderboard|mmlu:high_school_government_and_politics|0|0 -leaderboard|mmlu:high_school_macroeconomics|0|0 -leaderboard|mmlu:high_school_mathematics|0|0 -leaderboard|mmlu:high_school_microeconomics|0|0 -leaderboard|mmlu:high_school_physics|0|0 -leaderboard|mmlu:high_school_psychology|0|0 -leaderboard|mmlu:high_school_statistics|0|0 -leaderboard|mmlu:high_school_us_history|0|0 -leaderboard|mmlu:high_school_world_history|0|0 -leaderboard|mmlu:human_aging|0|0 -leaderboard|mmlu:human_sexuality|0|0 -leaderboard|mmlu:international_law|0|0 -leaderboard|mmlu:jurisprudence|0|0 -leaderboard|mmlu:logical_fallacies|0|0 -leaderboard|mmlu:machine_learning|0|0 -leaderboard|mmlu:management|0|0 -leaderboard|mmlu:marketing|0|0 -leaderboard|mmlu:medical_genetics|0|0 -leaderboard|mmlu:miscellaneous|0|0 -leaderboard|mmlu:moral_disputes|0|0 -leaderboard|mmlu:moral_scenarios|0|0 -leaderboard|mmlu:nutrition|0|0 -leaderboard|mmlu:philosophy|0|0 -leaderboard|mmlu:prehistory|0|0 -leaderboard|mmlu:professional_accounting|0|0 -leaderboard|mmlu:professional_law|0|0 -leaderboard|mmlu:professional_medicine|0|0 -leaderboard|mmlu:professional_psychology|0|0 -leaderboard|mmlu:public_relations|0|0 -leaderboard|mmlu:security_studies|0|0 -leaderboard|mmlu:sociology|0|0 -leaderboard|mmlu:us_foreign_policy|0|0 -leaderboard|mmlu:virology|0|0 -leaderboard|mmlu:world_religions|0|0 -lighteval|mtnt2019:en-fr|0|0 
-lighteval|mtnt2019:en-ja|0|0 -lighteval|mtnt2019:fr-en|0|0 -lighteval|mtnt2019:ja-en|0|0 -lighteval|mutual_plus|0|0 -lighteval|mutual|0|0 -lighteval|openbookqa|0|0 -lighteval|piqa|0|0 -lighteval|prost|0|0 -lighteval|pubmedqa|0|0 -lighteval|qa4mre:2011|0|0 -lighteval|qa4mre:2012|0|0 -lighteval|qa4mre:2013|0|0 -lighteval|qasper_ll|0|0 -lighteval|qasper|0|0 -lighteval|race:high|0|0 -lighteval|sciq|0|0 -lighteval|storycloze:2016|0|0 -lighteval|storycloze:2018|0|0 -lighteval|super_glue:boolq|0|0 -lighteval|super_glue:cb|0|0 -lighteval|super_glue:copa|0|0 -lighteval|super_glue:multirc|0|0 -lighteval|super_glue:record|0|0 -lighteval|super_glue:rte|0|0 -lighteval|super_glue:wic|0|0 -lighteval|super_glue:wsc|0|0 -lighteval|swag|0|0 -lighteval|the_pile:arxiv|0|0 -lighteval|the_pile:bookcorpus2|0|0 -lighteval|the_pile:books3|0|0 -lighteval|the_pile:dm-mathematics|0|0 -lighteval|the_pile:enron|0|0 -lighteval|the_pile:europarl|0|0 -lighteval|the_pile:freelaw|0|0 -lighteval|the_pile:github|0|0 -lighteval|the_pile:gutenberg|0|0 -lighteval|the_pile:hackernews|0|0 -lighteval|the_pile:nih-exporter|0|0 -lighteval|the_pile:opensubtitles|0|0 -lighteval|the_pile:openwebtext2|0|0 -lighteval|the_pile:philpapers|0|0 -lighteval|the_pile:pile-cc|0|0 -lighteval|the_pile:pubmed-abstracts|0|0 -lighteval|the_pile:pubmed-central|0|0 -lighteval|the_pile:stackexchange|0|0 -lighteval|the_pile:ubuntu-irc|0|0 -lighteval|the_pile:uspto|0|0 -lighteval|the_pile:wikipedia|0|0 -lighteval|the_pile:youtubesubtitles|0|0 -lighteval|toxigen|0|0 -lighteval|triviaqa|0|0 -lighteval|truthfulqa:gen|0|0 -leaderboard|truthfulqa:mc|0|0 -lighteval|unscramble:anagrams1|0|0 -lighteval|unscramble:anagrams2|0|0 -lighteval|unscramble:cycle_letters|0|0 -lighteval|unscramble:random_insertion|0|0 -lighteval|unscramble:reversed_words|0|0 -lighteval|webqs|0|0 -lighteval|wikitext|0|0 -leaderboard|winogrande|0|0 -lighteval|wmt08:cs-en|0|0 -lighteval|wmt08:de-en|0|0 -lighteval|wmt08:en-cs|0|0 -lighteval|wmt08:en-de|0|0 -lighteval|wmt08:en-es|0|0 -lighteval|wmt08:en-fr|0|0 -lighteval|wmt08:en-hu|0|0 -lighteval|wmt08:es-en|0|0 -lighteval|wmt08:fr-en|0|0 -lighteval|wmt08:hu-en|0|0 -lighteval|wmt09:cs-en|0|0 -lighteval|wmt09:de-en|0|0 -lighteval|wmt09:en-cs|0|0 -lighteval|wmt09:en-de|0|0 -lighteval|wmt09:en-es|0|0 -lighteval|wmt09:en-fr|0|0 -lighteval|wmt09:en-hu|0|0 -lighteval|wmt09:en-it|0|0 -lighteval|wmt09:es-en|0|0 -lighteval|wmt09:fr-en|0|0 -lighteval|wmt09:hu-en|0|0 -lighteval|wmt09:it-en|0|0 -lighteval|wmt10:cs-en|0|0 -lighteval|wmt10:de-en|0|0 -lighteval|wmt10:en-cs|0|0 -lighteval|wmt10:en-de|0|0 -lighteval|wmt10:en-es|0|0 -lighteval|wmt10:en-fr|0|0 -lighteval|wmt10:es-en|0|0 -lighteval|wmt10:fr-en|0|0 -lighteval|wmt11:cs-en|0|0 -lighteval|wmt11:de-en|0|0 -lighteval|wmt11:en-cs|0|0 -lighteval|wmt11:en-de|0|0 -lighteval|wmt11:en-es|0|0 -lighteval|wmt11:en-fr|0|0 -lighteval|wmt11:es-en|0|0 -lighteval|wmt11:fr-en|0|0 -lighteval|wmt12:cs-en|0|0 -lighteval|wmt12:de-en|0|0 -lighteval|wmt12:en-cs|0|0 -lighteval|wmt12:en-de|0|0 -lighteval|wmt12:en-es|0|0 -lighteval|wmt12:en-fr|0|0 -lighteval|wmt12:es-en|0|0 -lighteval|wmt12:fr-en|0|0 -lighteval|wmt13:cs-en|0|0 -lighteval|wmt13:de-en|0|0 -lighteval|wmt13:en-cs|0|0 -lighteval|wmt13:en-de|0|0 -lighteval|wmt13:en-es|0|0 -lighteval|wmt13:en-fr|0|0 -lighteval|wmt13:en-ru|0|0 -lighteval|wmt13:es-en|0|0 -lighteval|wmt13:fr-en|0|0 -lighteval|wmt13:ru-en|0|0 -lighteval|wmt14:cs-en|0|0 -lighteval|wmt14:de-en|0|0 -lighteval|wmt14:en-cs|0|0 -lighteval|wmt14:en-de|0|0 -lighteval|wmt14:en-fr|0|0 -lighteval|wmt14:en-fr|0|0 
-lighteval|wmt14:en-hi|0|0 -lighteval|wmt14:en-ru|0|0 -lighteval|wmt14:fr-en|0|0 -lighteval|wmt14:fr-en|0|0 -lighteval|wmt14:hi-en|0|0 -lighteval|wmt14:ru-en|0|0 -lighteval|wmt15:cs-en|0|0 -lighteval|wmt15:de-en|0|0 -lighteval|wmt15:en-cs|0|0 -lighteval|wmt15:en-de|0|0 -lighteval|wmt15:en-fi|0|0 -lighteval|wmt15:en-fr|0|0 -lighteval|wmt15:en-ru|0|0 -lighteval|wmt15:fi-en|0|0 -lighteval|wmt15:fr-en|0|0 -lighteval|wmt15:ru-en|0|0 -lighteval|wmt16:cs-en|0|0 -lighteval|wmt16:de-en|0|0 -lighteval|wmt16:de-en|0|0 -lighteval|wmt16:en-cs|0|0 -lighteval|wmt16:en-de|0|0 -lighteval|wmt16:en-de|0|0 -lighteval|wmt16:en-fi|0|0 -lighteval|wmt16:en-ro|0|0 -lighteval|wmt16:en-ro|0|0 -lighteval|wmt16:en-ru|0|0 -lighteval|wmt16:en-tr|0|0 -lighteval|wmt16:fi-en|0|0 -lighteval|wmt16:ro-en|0|0 -lighteval|wmt16:ro-en|0|0 -lighteval|wmt16:ru-en|0|0 -lighteval|wmt16:tr-en|0|0 -lighteval|wmt17:cs-en|0|0 -lighteval|wmt17:de-en|0|0 -lighteval|wmt17:en-cs|0|0 -lighteval|wmt17:en-de|0|0 -lighteval|wmt17:en-fi|0|0 -lighteval|wmt17:en-lv|0|0 -lighteval|wmt17:en-ru|0|0 -lighteval|wmt17:en-tr|0|0 -lighteval|wmt17:en-zh|0|0 -lighteval|wmt17:fi-en|0|0 -lighteval|wmt17:lv-en|0|0 -lighteval|wmt17:ru-en|0|0 -lighteval|wmt17:tr-en|0|0 -lighteval|wmt17:zh-en|0|0 -lighteval|wmt18:cs-en|0|0 -lighteval|wmt18:de-en|0|0 -lighteval|wmt18:en-cs|0|0 -lighteval|wmt18:en-de|0|0 -lighteval|wmt18:en-et|0|0 -lighteval|wmt18:en-fi|0|0 -lighteval|wmt18:en-ru|0|0 -lighteval|wmt18:en-tr|0|0 -lighteval|wmt18:en-zh|0|0 -lighteval|wmt18:et-en|0|0 -lighteval|wmt18:fi-en|0|0 -lighteval|wmt18:ru-en|0|0 -lighteval|wmt18:tr-en|0|0 -lighteval|wmt18:zh-en|0|0 -lighteval|wmt19:cs-de|0|0 -lighteval|wmt19:de-cs|0|0 -lighteval|wmt19:de-en|0|0 -lighteval|wmt19:de-fr|0|0 -lighteval|wmt19:en-cs|0|0 -lighteval|wmt19:en-de|0|0 -lighteval|wmt19:en-fi|0|0 -lighteval|wmt19:en-gu|0|0 -lighteval|wmt19:en-kk|0|0 -lighteval|wmt19:en-lt|0|0 -lighteval|wmt19:en-ru|0|0 -lighteval|wmt19:en-zh|0|0 -lighteval|wmt19:fi-en|0|0 -lighteval|wmt19:fr-de|0|0 -lighteval|wmt19:gu-en|0|0 -lighteval|wmt19:kk-en|0|0 -lighteval|wmt19:lt-en|0|0 -lighteval|wmt19:ru-en|0|0 -lighteval|wmt19:zh-en|0|0 -lighteval|wmt20:cs-en|0|0 -lighteval|wmt20:de-en|0|0 -lighteval|wmt20:de-fr|0|0 -lighteval|wmt20:en-cs|0|0 -lighteval|wmt20:en-de|0|0 -lighteval|wmt20:en-iu|0|0 -lighteval|wmt20:en-ja|0|0 -lighteval|wmt20:en-km|0|0 -lighteval|wmt20:en-pl|0|0 -lighteval|wmt20:en-ps|0|0 -lighteval|wmt20:en-ru|0|0 -lighteval|wmt20:en-ta|0|0 -lighteval|wmt20:en-zh|0|0 -lighteval|wmt20:fr-de|0|0 -lighteval|wmt20:iu-en|0|0 -lighteval|wmt20:ja-en|0|0 -lighteval|wmt20:km-en|0|0 -lighteval|wmt20:pl-en|0|0 -lighteval|wmt20:ps-en|0|0 -lighteval|wmt20:ru-en|0|0 -lighteval|wmt20:ta-en|0|0 -lighteval|wmt20:zh-en|0|0 -lighteval|wsc273|0|0 -lighteval|xcopa:en|0|0 -lighteval|xcopa:et|0|0 -lighteval|xcopa:ht|0|0 -lighteval|xcopa:id|0|0 -lighteval|xcopa:it|0|0 -lighteval|xcopa:qu|0|0 -lighteval|xcopa:sw|0|0 -lighteval|xcopa:ta|0|0 -lighteval|xcopa:th|0|0 -lighteval|xcopa:tr|0|0 -lighteval|xcopa:vi|0|0 -lighteval|xcopa:zh|0|0 -lighteval|xstory_cloze:ar|0|0 -lighteval|xstory_cloze:en|0|0 -lighteval|xstory_cloze:es|0|0 -lighteval|xstory_cloze:eu|0|0 -lighteval|xstory_cloze:hi|0|0 -lighteval|xstory_cloze:id|0|0 -lighteval|xstory_cloze:my|0|0 -lighteval|xstory_cloze:ru|0|0 -lighteval|xstory_cloze:sw|0|0 -lighteval|xstory_cloze:te|0|0 -lighteval|xstory_cloze:zh|0|0 -lighteval|xwinograd:en|0|0 -lighteval|xwinograd:fr|0|0 -lighteval|xwinograd:jp|0|0 -lighteval|xwinograd:pt|0|0 -lighteval|xwinograd:ru|0|0 -lighteval|xwinograd:zh|0|0 
-original|arc:c:letters|0|0 -original|arc:c:options|0|0 -original|arc:c:simple|0|0 -original|mmlu:abstract_algebra|0|0 -original|mmlu:anatomy|0|0 -original|mmlu:astronomy|0|0 -original|mmlu:business_ethics|0|0 -original|mmlu:clinical_knowledge|0|0 -original|mmlu:college_biology|0|0 -original|mmlu:college_chemistry|0|0 -original|mmlu:college_computer_science|0|0 -original|mmlu:college_mathematics|0|0 -original|mmlu:college_medicine|0|0 -original|mmlu:college_physics|0|0 -original|mmlu:computer_security|0|0 -original|mmlu:conceptual_physics|0|0 -original|mmlu:econometrics|0|0 -original|mmlu:electrical_engineering|0|0 -original|mmlu:elementary_mathematics|0|0 -original|mmlu:formal_logic|0|0 -original|mmlu:global_facts|0|0 -original|mmlu:high_school_biology|0|0 -original|mmlu:high_school_chemistry|0|0 -original|mmlu:high_school_computer_science|0|0 -original|mmlu:high_school_european_history|0|0 -original|mmlu:high_school_geography|0|0 -original|mmlu:high_school_government_and_politics|0|0 -original|mmlu:high_school_macroeconomics|0|0 -original|mmlu:high_school_mathematics|0|0 -original|mmlu:high_school_microeconomics|0|0 -original|mmlu:high_school_physics|0|0 -original|mmlu:high_school_psychology|0|0 -original|mmlu:high_school_statistics|0|0 -original|mmlu:high_school_us_history|0|0 -original|mmlu:high_school_world_history|0|0 -original|mmlu:human_aging|0|0 -original|mmlu:human_sexuality|0|0 -original|mmlu:international_law|0|0 -original|mmlu:jurisprudence|0|0 -original|mmlu:logical_fallacies|0|0 -original|mmlu:machine_learning|0|0 -original|mmlu:management|0|0 -original|mmlu:marketing|0|0 -original|mmlu:medical_genetics|0|0 -original|mmlu:miscellaneous|0|0 -original|mmlu:moral_disputes|0|0 -original|mmlu:moral_scenarios|0|0 -original|mmlu:nutrition|0|0 -original|mmlu:philosophy|0|0 -original|mmlu:prehistory|0|0 -original|mmlu:professional_accounting|0|0 -original|mmlu:professional_law|0|0 -original|mmlu:professional_medicine|0|0 -original|mmlu:professional_psychology|0|0 -original|mmlu:public_relations|0|0 -original|mmlu:security_studies|0|0 -original|mmlu:sociology|0|0 -original|mmlu:us_foreign_policy|0|0 -original|mmlu:virology|0|0 -original|mmlu:world_religions|0|0 -original|mmlu|0|0 +bigbench|abstract_narrative_understanding|0 +bigbench|anachronisms|0 +bigbench|analogical_similarity|0 +bigbench|analytic_entailment|0 +bigbench|arithmetic_bb|0 +bigbench|ascii_word_recognition|0 +bigbench|authorship_verification|0 +bigbench|auto_categorization|0 +bigbench|auto_debugging|0 +bigbench|bbq_lite_json|0 +bigbench|bridging_anaphora_resolution_barqa|0 +bigbench|causal_judgment|0 +bigbench|cause_and_effect|0 +bigbench|checkmate_in_one|0 +bigbench|chess_state_tracking|0 +bigbench|chinese_remainder_theorem|0 +bigbench|cifar10_classification|0 +bigbench|code_line_description|0 +bigbench|codenames|0 +bigbench|color|0 +bigbench|common_morpheme|0 +bigbench|conceptual_combinations|0 +bigbench|conlang_translation|0 +bigbench|contextual_parametric_knowledge_conflicts|0 +bigbench|crash_blossom|0 +bigbench|crass_ai|0 +bigbench|cryobiology_spanish|0 +bigbench|cryptonite|0 +bigbench|cs_algorithms|0 +bigbench|dark_humor_detection|0 +bigbench|date_understanding|0 +bigbench|disambiguation_qa|0 +bigbench|discourse_marker_prediction|0 +bigbench|disfl_qa|0 +bigbench|dyck_languages|0 +bigbench|elementary_math_qa|0 +bigbench|emoji_movie|0 +bigbench|emojis_emotion_prediction|0 +bigbench|empirical_judgments|0 +bigbench|english_proverbs|0 +bigbench|english_russian_proverbs|0 +bigbench|entailed_polarity_hindi|0 
+bigbench|entailed_polarity|0 +bigbench|epistemic_reasoning|0 +bigbench|evaluating_information_essentiality|0 +bigbench|fact_checker|0 +bigbench|fantasy_reasoning|0 +bigbench|few_shot_nlg|0 +bigbench|figure_of_speech_detection|0 +bigbench|formal_fallacies_syllogisms_negation|0 +bigbench|gem|0 +bigbench|gender_inclusive_sentences_german|0 +bigbench|general_knowledge|0 +bigbench|geometric_shapes|0 +bigbench|goal_step_wikihow|0 +bigbench|gre_reading_comprehension|0 +bigbench|hhh_alignment|0 +bigbench|hindi_question_answering|0 +bigbench|hindu_knowledge|0 +bigbench|hinglish_toxicity|0 +bigbench|human_organs_senses|0 +bigbench|hyperbaton|0 +bigbench|identify_math_theorems|0 +bigbench|identify_odd_metaphor|0 +bigbench|implicatures|0 +bigbench|implicit_relations|0 +bigbench|intent_recognition|0 +bigbench|international_phonetic_alphabet_nli|0 +bigbench|international_phonetic_alphabet_transliterate|0 +bigbench|intersect_geometry|0 +bigbench|irony_identification|0 +bigbench|kanji_ascii|0 +bigbench|kannada|0 +bigbench|key_value_maps|0 +bigbench|known_unknowns|0 +bigbench|language_games|0 +bigbench|language_identification|0 +bigbench|linguistic_mappings|0 +bigbench|linguistics_puzzles|0 +bigbench|logic_grid_puzzle|0 +bigbench|logical_args|0 +bigbench|logical_deduction|0 +bigbench|logical_fallacy_detection|0 +bigbench|logical_sequence|0 +bigbench|mathematical_induction|0 +bigbench|matrixshapes|0 +bigbench|metaphor_boolean|0 +bigbench|metaphor_understanding|0 +bigbench|minute_mysteries_qa|0 +bigbench|misconceptions_russian|0 +bigbench|misconceptions|0 +bigbench|mnist_ascii|0 +bigbench|modified_arithmetic|0 +bigbench|moral_permissibility|0 +bigbench|movie_dialog_same_or_different|0 +bigbench|movie_recommendation|0 +bigbench|mult_data_wrangling|0 +bigbench|multiemo|0 +bigbench|natural_instructions|0 +bigbench|navigate|0 +bigbench|nonsense_words_grammar|0 +bigbench|novel_concepts|0 +bigbench|object_counting|0 +bigbench|odd_one_out|0 +bigbench|operators|0 +bigbench|paragraph_segmentation|0 +bigbench|parsinlu_qa|0 +bigbench|parsinlu_reading_comprehension|0 +bigbench|penguins_in_a_table|0 +bigbench|periodic_elements|0 +bigbench|persian_idioms|0 +bigbench|phrase_relatedness|0 +bigbench|physical_intuition|0 +bigbench|physics_questions|0 +bigbench|physics|0 +bigbench|play_dialog_same_or_different|0 +bigbench|polish_sequence_labeling|0 +bigbench|presuppositions_as_nli|0 +bigbench|qa_wikidata|0 +bigbench|question_selection|0 +bigbench|real_or_fake_text|0 +bigbench|reasoning_about_colored_objects|0 +bigbench|repeat_copy_logic|0 +bigbench|rephrase|0 +bigbench|rhyming|0 +bigbench|riddle_sense|0 +bigbench|ruin_names|0 +bigbench|salient_translation_error_detection|0 +bigbench|scientific_press_release|0 +bigbench|semantic_parsing_in_context_sparc|0 +bigbench|semantic_parsing_spider|0 +bigbench|sentence_ambiguity|0 +bigbench|similarities_abstraction|0 +bigbench|simp_turing_concept|0 +bigbench|simple_arithmetic_json_multiple_choice|0 +bigbench|simple_arithmetic_json_subtasks|0 +bigbench|simple_arithmetic_json|0 +bigbench|simple_arithmetic_multiple_targets_json|0 +bigbench|simple_ethical_questions|0 +bigbench|simple_text_editing|0 +bigbench|snarks|0 +bigbench|social_iqa|0 +bigbench|social_support|0 +bigbench|sports_understanding|0 +bigbench|strange_stories|0 +bigbench|strategyqa|0 +bigbench|sufficient_information|0 +bigbench|suicide_risk|0 +bigbench|swahili_english_proverbs|0 +bigbench|swedish_to_german_proverbs|0 +bigbench|symbol_interpretation|0 +bigbench|tellmewhy|0 +bigbench|temporal_sequences|0 +bigbench|tense|0 
+bigbench|timedial|0 +bigbench|topical_chat|0 +bigbench|tracking_shuffled_objects|0 +bigbench|understanding_fables|0 +bigbench|undo_permutation|0 +bigbench|unit_conversion|0 +bigbench|unit_interpretation|0 +bigbench|unnatural_in_context_learning|0 +bigbench|vitaminc_fact_verification|0 +bigbench|what_is_the_tao|0 +bigbench|which_wiki_edit|0 +bigbench|wino_x_german|0 +bigbench|winowhy|0 +bigbench|word_sorting|0 +bigbench|word_unscrambling|0 +helm|babi_qa|0 +helm|bbq:Age|0 +helm|bbq:Disability_status|0 +helm|bbq:Gender_identity|0 +helm|bbq:Nationality|0 +helm|bbq:Physical_appearance|0 +helm|bbq:Race_ethnicity|0 +helm|bbq:Race_x_SES|0 +helm|bbq:Race_x_gender|0 +helm|bbq:Religion|0 +helm|bbq:SES|0 +helm|bbq:Sexual_orientation|0 +helm|bbq|0 +helm|bigbench:auto_debugging|0 +helm|bigbench:bbq_lite_json:age_ambig|0 +helm|bigbench:bbq_lite_json:age_disambig|0 +helm|bigbench:bbq_lite_json:disability_status_ambig|0 +helm|bigbench:bbq_lite_json:disability_status_disambig|0 +helm|bigbench:bbq_lite_json:gender_identity_ambig|0 +helm|bigbench:bbq_lite_json:gender_identity_disambig|0 +helm|bigbench:bbq_lite_json:nationality_ambig|0 +helm|bigbench:bbq_lite_json:nationality_disambig|0 +helm|bigbench:bbq_lite_json:physical_appearance_ambig|0 +helm|bigbench:bbq_lite_json:physical_appearance_disambig|0 +helm|bigbench:bbq_lite_json:race_ethnicity_ambig|0 +helm|bigbench:bbq_lite_json:race_ethnicity_disambig|0 +helm|bigbench:bbq_lite_json:religion_ambig|0 +helm|bigbench:bbq_lite_json:religion_disambig|0 +helm|bigbench:bbq_lite_json:ses_ambig|0 +helm|bigbench:bbq_lite_json:ses_disambig|0 +helm|bigbench:bbq_lite_json:sexual_orientation_ambig|0 +helm|bigbench:bbq_lite_json:sexual_orientation_disambig|0 +helm|bigbench:code_line_description|0 +helm|bigbench:conceptual_combinations:contradictions|0 +helm|bigbench:conceptual_combinations:emergent_properties|0 +helm|bigbench:conceptual_combinations:fanciful_fictional_combinations|0 +helm|bigbench:conceptual_combinations:homonyms|0 +helm|bigbench:conceptual_combinations:invented_words|0 +helm|bigbench:conlang_translation:adna_from|0 +helm|bigbench:conlang_translation:adna_to|0 +helm|bigbench:conlang_translation:atikampe_from|0 +helm|bigbench:conlang_translation:atikampe_to|0 +helm|bigbench:conlang_translation:gornam_from|0 +helm|bigbench:conlang_translation:gornam_to|0 +helm|bigbench:conlang_translation:holuan_from|0 +helm|bigbench:conlang_translation:holuan_to|0 +helm|bigbench:conlang_translation:mkafala_from|0 +helm|bigbench:conlang_translation:mkafala_to|0 +helm|bigbench:conlang_translation:postpositive_english_from|0 +helm|bigbench:conlang_translation:postpositive_english_to|0 +helm|bigbench:conlang_translation:unapuri_from|0 +helm|bigbench:conlang_translation:unapuri_to|0 +helm|bigbench:conlang_translation:vaomi_from|0 +helm|bigbench:conlang_translation:vaomi_to|0 +helm|bigbench:emoji_movie|0 +helm|bigbench:formal_fallacies_syllogisms_negation|0 +helm|bigbench:hindu_knowledge|0 +helm|bigbench:known_unknowns|0 +helm|bigbench:language_identification|0 +helm|bigbench:linguistics_puzzles|0 +helm|bigbench:logic_grid_puzzle|0 +helm|bigbench:logical_deduction-five_objects|0 +helm|bigbench:logical_deduction-seven_objects|0 +helm|bigbench:logical_deduction-three_objects|0 +helm|bigbench:misconceptions_russian|0 +helm|bigbench:novel_concepts|0 +helm|bigbench:operators|0 +helm|bigbench:parsinlu_reading_comprehension|0 +helm|bigbench:play_dialog_same_or_different|0 +helm|bigbench:repeat_copy_logic|0 +helm|bigbench:strange_stories-boolean|0 
+helm|bigbench:strange_stories-multiple_choice|0 +helm|bigbench:strategyqa|0 +helm|bigbench:symbol_interpretation-adversarial|0 +helm|bigbench:symbol_interpretation-emoji_agnostic|0 +helm|bigbench:symbol_interpretation-name_agnostic|0 +helm|bigbench:symbol_interpretation-plain|0 +helm|bigbench:symbol_interpretation-tricky|0 +helm|bigbench:vitaminc_fact_verification|0 +helm|bigbench:winowhy|0 +helm|blimp:adjunct_island|0 +helm|blimp:anaphor_gender_agreement|0 +helm|blimp:anaphor_number_agreement|0 +helm|blimp:animate_subject_passive|0 +helm|blimp:animate_subject_trans|0 +helm|blimp:causative|0 +helm|blimp:complex_NP_island|0 +helm|blimp:coordinate_structure_constraint_complex_left_branch|0 +helm|blimp:coordinate_structure_constraint_object_extraction|0 +helm|blimp:determiner_noun_agreement_1|0 +helm|blimp:determiner_noun_agreement_2|0 +helm|blimp:determiner_noun_agreement_irregular_1|0 +helm|blimp:determiner_noun_agreement_irregular_2|0 +helm|blimp:determiner_noun_agreement_with_adj_2|0 +helm|blimp:determiner_noun_agreement_with_adj_irregular_1|0 +helm|blimp:determiner_noun_agreement_with_adj_irregular_2|0 +helm|blimp:determiner_noun_agreement_with_adjective_1|0 +helm|blimp:distractor_agreement_relational_noun|0 +helm|blimp:distractor_agreement_relative_clause|0 +helm|blimp:drop_argument|0 +helm|blimp:ellipsis_n_bar_1|0 +helm|blimp:ellipsis_n_bar_2|0 +helm|blimp:existential_there_object_raising|0 +helm|blimp:existential_there_quantifiers_1|0 +helm|blimp:existential_there_quantifiers_2|0 +helm|blimp:existential_there_subject_raising|0 +helm|blimp:expletive_it_object_raising|0 +helm|blimp:inchoative|0 +helm|blimp:intransitive|0 +helm|blimp:irregular_past_participle_adjectives|0 +helm|blimp:irregular_past_participle_verbs|0 +helm|blimp:irregular_plural_subject_verb_agreement_1|0 +helm|blimp:irregular_plural_subject_verb_agreement_2|0 +helm|blimp:left_branch_island_echo_question|0 +helm|blimp:left_branch_island_simple_question|0 +helm|blimp:matrix_question_npi_licensor_present|0 +helm|blimp:npi_present_1|0 +helm|blimp:npi_present_2|0 +helm|blimp:only_npi_licensor_present|0 +helm|blimp:only_npi_scope|0 +helm|blimp:passive_1|0 +helm|blimp:passive_2|0 +helm|blimp:principle_A_c_command|0 +helm|blimp:principle_A_case_1|0 +helm|blimp:principle_A_case_2|0 +helm|blimp:principle_A_domain_1|0 +helm|blimp:principle_A_domain_2|0 +helm|blimp:principle_A_domain_3|0 +helm|blimp:principle_A_reconstruction|0 +helm|blimp:regular_plural_subject_verb_agreement_1|0 +helm|blimp:regular_plural_subject_verb_agreement_2|0 +helm|blimp:sentential_negation_npi_licensor_present|0 +helm|blimp:sentential_negation_npi_scope|0 +helm|blimp:sentential_subject_island|0 +helm|blimp:superlative_quantifiers_1|0 +helm|blimp:superlative_quantifiers_2|0 +helm|blimp:tough_vs_raising_1|0 +helm|blimp:tough_vs_raising_2|0 +helm|blimp:transitive|0 +helm|blimp:wh_island|0 +helm|blimp:wh_questions_object_gap|0 +helm|blimp:wh_questions_subject_gap_long_distance|0 +helm|blimp:wh_questions_subject_gap|0 +helm|blimp:wh_vs_that_no_gap_long_distance|0 +helm|blimp:wh_vs_that_no_gap|0 +helm|blimp:wh_vs_that_with_gap_long_distance|0 +helm|blimp:wh_vs_that_with_gap|0 +helm|bold:gender|0 +helm|bold:political_ideology|0 +helm|bold:profession|0 +helm|bold:race|0 +helm|bold:religious_ideology|0 +helm|bold|0 +helm|boolq:contrastset|0 +helm|boolq|0 +helm|civil_comments:LGBTQ|0 +helm|civil_comments:black|0 +helm|civil_comments:christian|0 +helm|civil_comments:female|0 +helm|civil_comments:male|0 +helm|civil_comments:muslim|0 
+helm|civil_comments:other_religions|0 +helm|civil_comments:white|0 +helm|civil_comments|0 +helm|commonsenseqa|0 +helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_125|0 +helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_25|0 +helm|copyright:n_books_1000-extractions_per_book_1-prefix_length_5|0 +helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_125|0 +helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_25|0 +helm|copyright:n_books_1000-extractions_per_book_3-prefix_length_5|0 +helm|copyright:oh_the_places|0 +helm|copyright:pilot|0 +helm|copyright:popular_books-prefix_length_10|0 +helm|copyright:popular_books-prefix_length_125|0 +helm|copyright:popular_books-prefix_length_250|0 +helm|copyright:popular_books-prefix_length_25|0 +helm|copyright:popular_books-prefix_length_50|0 +helm|copyright:popular_books-prefix_length_5|0 +helm|copyright:prompt_num_line_1-min_lines_20|0 +helm|copyright:prompt_num_line_10-min_lines_20|0 +helm|copyright:prompt_num_line_5-min_lines_20|0 +helm|covid_dialogue|0 +helm|dyck_language:2|0 +helm|dyck_language:3|0 +helm|dyck_language:4|0 +helm|entity_data_imputation:Buy|0 +helm|entity_data_imputation:Restaurant|0 +helm|entity_matching:Abt_Buy|0 +helm|entity_matching:Amazon_Google|0 +helm|entity_matching:Beer|0 +helm|entity_matching:Company|0 +helm|entity_matching:DBLP_ACM|0 +helm|entity_matching:DBLP_GoogleScholar|0 +helm|entity_matching:Dirty_DBLP_ACM|0 +helm|entity_matching:Dirty_DBLP_GoogleScholar|0 +helm|entity_matching:Dirty_Walmart_Amazon|0 +helm|entity_matching:Dirty_iTunes_Amazon|0 +helm|entity_matching:Fodors_Zagats|0 +helm|entity_matching:Walmart_Amazon|0 +helm|entity_matching:iTunes_Amazon|0 +helm|hellaswag|0 +helm|humaneval|0 +helm|imdb:contrastset|0 +helm|imdb|0 +helm|interactive_qa_mmlu:abstract_algebra|0 +helm|interactive_qa_mmlu:college_chemistry|0 +helm|interactive_qa_mmlu:global_facts|0 +helm|interactive_qa_mmlu:miscellaneous|0 +helm|interactive_qa_mmlu:nutrition|0 +helm|interactive_qa_mmlu:us_foreign_policy|0 +helm|legal_summarization:billsum|0 +helm|legal_summarization:eurlexsum|0 +helm|legal_summarization:multilexsum|0 +helm|legalsupport|0 +helm|lexglue:case_hold|0 +helm|lexglue:ecthr_a|0 +helm|lexglue:ecthr_b|0 +helm|lexglue:eurlex|0 +helm|lexglue:ledgar|0 +helm|lexglue:scotus|0 +helm|lexglue:unfair_tos|0 +helm|lextreme:brazilian_court_decisions_judgment|0 +helm|lextreme:brazilian_court_decisions_unanimity|0 +helm|lextreme:covid19_emergency_event|0 +helm|lextreme:german_argument_mining|0 +helm|lextreme:greek_legal_code_chapter|0 +helm|lextreme:greek_legal_code_subject|0 +helm|lextreme:greek_legal_code_volume|0 +helm|lextreme:greek_legal_ner|0 +helm|lextreme:legalnero|0 +helm|lextreme:lener_br|0 +helm|lextreme:mapa_coarse|0 +helm|lextreme:mapa_fine|0 +helm|lextreme:multi_eurlex_level_1|0 +helm|lextreme:multi_eurlex_level_2|0 +helm|lextreme:multi_eurlex_level_3|0 +helm|lextreme:online_terms_of_service_clause_topics|0 +helm|lextreme:online_terms_of_service_unfairness_levels|0 +helm|lextreme:swiss_judgment_prediction|0 +helm|lsat_qa:assignment|0 +helm|lsat_qa:grouping|0 +helm|lsat_qa:miscellaneous|0 +helm|lsat_qa:ordering|0 +helm|lsat_qa|0 +helm|me_q_sum|0 +helm|med_dialog:healthcaremagic|0 +helm|med_dialog:icliniq|0 +helm|med_mcqa|0 +helm|med_paragraph_simplification|0 +helm|med_qa|0 +helm|mmlu:abstract_algebra|0 +helm|mmlu:anatomy|0 +helm|mmlu:astronomy|0 +helm|mmlu:business_ethics|0 +helm|mmlu:clinical_knowledge|0 +helm|mmlu:college_biology|0 +helm|mmlu:college_chemistry|0 
+helm|mmlu:college_computer_science|0 +helm|mmlu:college_mathematics|0 +helm|mmlu:college_medicine|0 +helm|mmlu:college_physics|0 +helm|mmlu:computer_security|0 +helm|mmlu:conceptual_physics|0 +helm|mmlu:econometrics|0 +helm|mmlu:electrical_engineering|0 +helm|mmlu:elementary_mathematics|0 +helm|mmlu:formal_logic|0 +helm|mmlu:global_facts|0 +helm|mmlu:high_school_biology|0 +helm|mmlu:high_school_chemistry|0 +helm|mmlu:high_school_computer_science|0 +helm|mmlu:high_school_european_history|0 +helm|mmlu:high_school_geography|0 +helm|mmlu:high_school_government_and_politics|0 +helm|mmlu:high_school_macroeconomics|0 +helm|mmlu:high_school_mathematics|0 +helm|mmlu:high_school_microeconomics|0 +helm|mmlu:high_school_physics|0 +helm|mmlu:high_school_psychology|0 +helm|mmlu:high_school_statistics|0 +helm|mmlu:high_school_us_history|0 +helm|mmlu:high_school_world_history|0 +helm|mmlu:human_aging|0 +helm|mmlu:human_sexuality|0 +helm|mmlu:international_law|0 +helm|mmlu:jurisprudence|0 +helm|mmlu:logical_fallacies|0 +helm|mmlu:machine_learning|0 +helm|mmlu:management|0 +helm|mmlu:marketing|0 +helm|mmlu:medical_genetics|0 +helm|mmlu:miscellaneous|0 +helm|mmlu:moral_disputes|0 +helm|mmlu:moral_scenarios|0 +helm|mmlu:nutrition|0 +helm|mmlu:philosophy|0 +helm|mmlu:prehistory|0 +helm|mmlu:professional_accounting|0 +helm|mmlu:professional_law|0 +helm|mmlu:professional_medicine|0 +helm|mmlu:professional_psychology|0 +helm|mmlu:public_relations|0 +helm|mmlu:security_studies|0 +helm|mmlu:sociology|0 +helm|mmlu:us_foreign_policy|0 +helm|mmlu:virology|0 +helm|mmlu:world_religions|0 +helm|mmlu|0 +helm|narrativeqa|0 +helm|numeracy:linear_example|0 +helm|numeracy:linear_standard|0 +helm|numeracy:parabola_example|0 +helm|numeracy:parabola_standard|0 +helm|numeracy:paraboloid_example|0 +helm|numeracy:paraboloid_standard|0 +helm|numeracy:plane_example|0 +helm|numeracy:plane_standard|0 +helm|openbookqa|0 +helm|piqa|0 +helm|pubmedqa|0 +helm|quac|0 +helm|raft:ade_corpus_v2|0 +helm|raft:banking_77|0 +helm|raft:neurips_impact_statement_risks|0 +helm|raft:one_stop_english|0 +helm|raft:overruling|0 +helm|raft:semiconductor_org_types|0 +helm|raft:systematic_review_inclusion|0 +helm|raft:tai_safety_research|0 +helm|raft:terms_of_service|0 +helm|raft:tweet_eval_hate|0 +helm|raft:twitter_complaints|0 +helm|real_toxicity_prompts|0 +helm|siqa|0 +helm|summarization:cnn-dm|0 +helm|summarization:xsum-sampled|0 +helm|summarization:xsum|0 +helm|synthetic_reasoning:induction|0 +helm|synthetic_reasoning:natural_easy|0 +helm|synthetic_reasoning:natural_hard|0 +helm|synthetic_reasoning:pattern_match|0 +helm|synthetic_reasoning:variable_substitution|0 +helm|the_pile:arxiv|0 +helm|the_pile:bibliotik|0 +helm|the_pile:commoncrawl|0 +helm|the_pile:dm-mathematics|0 +helm|the_pile:enron|0 +helm|the_pile:europarl|0 +helm|the_pile:freelaw|0 +helm|the_pile:github|0 +helm|the_pile:gutenberg|0 +helm|the_pile:hackernews|0 +helm|the_pile:nih-exporter|0 +helm|the_pile:opensubtitles|0 +helm|the_pile:openwebtext2|0 +helm|the_pile:pubmed-abstracts|0 +helm|the_pile:pubmed-central|0 +helm|the_pile:stackexchange|0 +helm|the_pile:upsto|0 +helm|the_pile:wikipedia|0 +helm|the_pile:youtubesubtitles|0 +helm|truthfulqa|0 +helm|twitterAAE:aa|0 +helm|twitterAAE:white|0 +helm|wikifact:applies_to_jurisdiction|0 +helm|wikifact:atomic_number|0 +helm|wikifact:author|0 +helm|wikifact:award_received|0 +helm|wikifact:basic_form_of_government|0 +helm|wikifact:capital_of|0 +helm|wikifact:capital|0 +helm|wikifact:central_bank|0 +helm|wikifact:composer|0 +helm|wikifact:continent|0 
+helm|wikifact:country_of_citizenship|0 +helm|wikifact:country_of_origin|0 +helm|wikifact:country|0 +helm|wikifact:creator|0 +helm|wikifact:currency|0 +helm|wikifact:defendant|0 +helm|wikifact:developer|0 +helm|wikifact:diplomatic_relation|0 +helm|wikifact:director|0 +helm|wikifact:discoverer_or_inventor|0 +helm|wikifact:drug_or_therapy_used_for_treatment|0 +helm|wikifact:educated_at|0 +helm|wikifact:electron_configuration|0 +helm|wikifact:employer|0 +helm|wikifact:field_of_work|0 +helm|wikifact:file_extension|0 +helm|wikifact:genetic_association|0 +helm|wikifact:genre|0 +helm|wikifact:has_part|0 +helm|wikifact:head_of_government|0 +helm|wikifact:head_of_state|0 +helm|wikifact:headquarters_location|0 +helm|wikifact:industry|0 +helm|wikifact:influenced_by|0 +helm|wikifact:instance_of|0 +helm|wikifact:instrument|0 +helm|wikifact:language_of_work_or_name|0 +helm|wikifact:languages_spoken_written_or_signed|0 +helm|wikifact:laws_applied|0 +helm|wikifact:located_in_the_administrative_territorial_entity|0 +helm|wikifact:location_of_discovery|0 +helm|wikifact:location_of_formation|0 +helm|wikifact:location|0 +helm|wikifact:majority_opinion_by|0 +helm|wikifact:manufacturer|0 +helm|wikifact:measured_physical_quantity|0 +helm|wikifact:medical_condition_treated|0 +helm|wikifact:member_of_political_party|0 +helm|wikifact:member_of_sports_team|0 +helm|wikifact:member_of|0 +helm|wikifact:movement|0 +helm|wikifact:named_after|0 +helm|wikifact:native_language|0 +helm|wikifact:number_of_processor_cores|0 +helm|wikifact:occupation|0 +helm|wikifact:office_held_by_head_of_government|0 +helm|wikifact:office_held_by_head_of_state|0 +helm|wikifact:official_language|0 +helm|wikifact:operating_system|0 +helm|wikifact:original_language_of_film_or_TV_show|0 +helm|wikifact:original_network|0 +helm|wikifact:overrules|0 +helm|wikifact:owned_by|0 +helm|wikifact:part_of|0 +helm|wikifact:participating_team|0 +helm|wikifact:place_of_birth|0 +helm|wikifact:place_of_death|0 +helm|wikifact:plaintiff|0 +helm|wikifact:position_held|0 +helm|wikifact:position_played_on_team|0 +helm|wikifact:programming_language|0 +helm|wikifact:recommended_unit_of_measurement|0 +helm|wikifact:record_label|0 +helm|wikifact:religion|0 +helm|wikifact:repealed_by|0 +helm|wikifact:shares_border_with|0 +helm|wikifact:solved_by|0 +helm|wikifact:statement_describes|0 +helm|wikifact:stock_exchange|0 +helm|wikifact:subclass_of|0 +helm|wikifact:subsidiary|0 +helm|wikifact:symptoms_and_signs|0 +helm|wikifact:therapeutic_area|0 +helm|wikifact:time_of_discovery_or_invention|0 +helm|wikifact:twinned_administrative_body|0 +helm|wikifact:work_location|0 +helm|wikitext:103|0 +helm|wmt14:cs-en|0 +helm|wmt14:de-en|0 +helm|wmt14:fr-en|0 +helm|wmt14:hi-en|0 +helm|wmt14:ru-en|0 +lighteval|anli:r1|0 +lighteval|anli:r2|0 +lighteval|anli:r3|0 +lighteval|anli|0 +leaderboard|arc:challenge|0 +lighteval|arc:easy|0 +lighteval|arithmetic:1dc|0 +lighteval|arithmetic:2da|0 +lighteval|arithmetic:2dm|0 +lighteval|arithmetic:2ds|0 +lighteval|arithmetic:3da|0 +lighteval|arithmetic:3ds|0 +lighteval|arithmetic:4da|0 +lighteval|arithmetic:4ds|0 +lighteval|arithmetic:5da|0 +lighteval|arithmetic:5ds|0 +lighteval|asdiv|0 +lighteval|blimp:adjunct_island|0 +lighteval|blimp:anaphor_gender_agreement|0 +lighteval|blimp:anaphor_number_agreement|0 +lighteval|blimp:animate_subject_passive|0 +lighteval|blimp:animate_subject_trans|0 +lighteval|blimp:causative|0 +lighteval|blimp:complex_NP_island|0 +lighteval|blimp:coordinate_structure_constraint_complex_left_branch|0 
+lighteval|blimp:coordinate_structure_constraint_object_extraction|0 +lighteval|blimp:determiner_noun_agreement_1|0 +lighteval|blimp:determiner_noun_agreement_2|0 +lighteval|blimp:determiner_noun_agreement_irregular_1|0 +lighteval|blimp:determiner_noun_agreement_irregular_2|0 +lighteval|blimp:determiner_noun_agreement_with_adj_2|0 +lighteval|blimp:determiner_noun_agreement_with_adj_irregular_1|0 +lighteval|blimp:determiner_noun_agreement_with_adj_irregular_2|0 +lighteval|blimp:determiner_noun_agreement_with_adjective_1|0 +lighteval|blimp:distractor_agreement_relational_noun|0 +lighteval|blimp:distractor_agreement_relative_clause|0 +lighteval|blimp:drop_argument|0 +lighteval|blimp:ellipsis_n_bar_1|0 +lighteval|blimp:ellipsis_n_bar_2|0 +lighteval|blimp:existential_there_object_raising|0 +lighteval|blimp:existential_there_quantifiers_1|0 +lighteval|blimp:existential_there_quantifiers_2|0 +lighteval|blimp:existential_there_subject_raising|0 +lighteval|blimp:expletive_it_object_raising|0 +lighteval|blimp:inchoative|0 +lighteval|blimp:intransitive|0 +lighteval|blimp:irregular_past_participle_adjectives|0 +lighteval|blimp:irregular_past_participle_verbs|0 +lighteval|blimp:irregular_plural_subject_verb_agreement_1|0 +lighteval|blimp:irregular_plural_subject_verb_agreement_2|0 +lighteval|blimp:left_branch_island_echo_question|0 +lighteval|blimp:left_branch_island_simple_question|0 +lighteval|blimp:matrix_question_npi_licensor_present|0 +lighteval|blimp:npi_present_1|0 +lighteval|blimp:npi_present_2|0 +lighteval|blimp:only_npi_licensor_present|0 +lighteval|blimp:only_npi_scope|0 +lighteval|blimp:passive_1|0 +lighteval|blimp:passive_2|0 +lighteval|blimp:principle_A_c_command|0 +lighteval|blimp:principle_A_case_1|0 +lighteval|blimp:principle_A_case_2|0 +lighteval|blimp:principle_A_domain_1|0 +lighteval|blimp:principle_A_domain_2|0 +lighteval|blimp:principle_A_domain_3|0 +lighteval|blimp:principle_A_reconstruction|0 +lighteval|blimp:regular_plural_subject_verb_agreement_1|0 +lighteval|blimp:regular_plural_subject_verb_agreement_2|0 +lighteval|blimp:sentential_negation_npi_licensor_present|0 +lighteval|blimp:sentential_negation_npi_scope|0 +lighteval|blimp:sentential_subject_island|0 +lighteval|blimp:superlative_quantifiers_1|0 +lighteval|blimp:superlative_quantifiers_2|0 +lighteval|blimp:tough_vs_raising_1|0 +lighteval|blimp:tough_vs_raising_2|0 +lighteval|blimp:transitive|0 +lighteval|blimp:wh_island|0 +lighteval|blimp:wh_questions_object_gap|0 +lighteval|blimp:wh_questions_subject_gap_long_distance|0 +lighteval|blimp:wh_questions_subject_gap|0 +lighteval|blimp:wh_vs_that_no_gap_long_distance|0 +lighteval|blimp:wh_vs_that_no_gap|0 +lighteval|blimp:wh_vs_that_with_gap_long_distance|0 +lighteval|blimp:wh_vs_that_with_gap|0 +lighteval|coqa_bb|0 +lighteval|coqa|0 +lighteval|drop|0 +lighteval|ethics:commonsense|0 +lighteval|ethics:deontology|0 +lighteval|ethics:justice|0 +lighteval|ethics:utilitarianism|0 +lighteval|ethics:virtue|0 +lighteval|glue:cola|0 +lighteval|glue:mnli_mismatched|0 +lighteval|glue:mnli|0 +lighteval|glue:mrpc|0 +lighteval|glue:qnli|0 +lighteval|glue:qqp|0 +lighteval|glue:rte|0 +lighteval|glue:sst2|0 +lighteval|glue:stsb|0 +lighteval|glue:wnli|0 +leaderboard|gsm8k|0 +lighteval|headqa:en|0 +lighteval|headqa:es|0 +leaderboard|hellaswag|0 +lighteval|iwslt17:ar-en|0 +lighteval|iwslt17:de-en|0 +lighteval|iwslt17:en-ar|0 +lighteval|iwslt17:en-de|0 +lighteval|iwslt17:en-fr|0 +lighteval|iwslt17:en-ja|0 +lighteval|iwslt17:en-ko|0 +lighteval|iwslt17:en-zh|0 +lighteval|iwslt17:fr-en|0 
+lighteval|iwslt17:ja-en|0 +lighteval|iwslt17:ko-en|0 +lighteval|iwslt17:zh-en|0 +lighteval|lambada:openai:de|0 +lighteval|lambada:openai:en|0 +lighteval|lambada:openai:es|0 +lighteval|lambada:openai:fr|0 +lighteval|lambada:openai:it|0 +lighteval|lambada:openai_cloze|0 +lighteval|lambada:openai|0 +lighteval|lambada:standard_cloze|0 +lighteval|lambada:standard|0 +lighteval|logiqa|0 +lighteval|math:algebra|0 +lighteval|math:counting_and_probability|0 +lighteval|math:geometry|0 +lighteval|math:intermediate_algebra|0 +lighteval|math:number_theory|0 +lighteval|math:prealgebra|0 +lighteval|math:precalculus|0 +lighteval|mathqa|0 +lighteval|mgsm:bn|0 +lighteval|mgsm:de|0 +lighteval|mgsm:en|0 +lighteval|mgsm:es|0 +lighteval|mgsm:fr|0 +lighteval|mgsm:ja|0 +lighteval|mgsm:ru|0 +lighteval|mgsm:sw|0 +lighteval|mgsm:te|0 +lighteval|mgsm:th|0 +lighteval|mgsm:zh|0 +leaderboard|mmlu:abstract_algebra|0 +leaderboard|mmlu:anatomy|0 +leaderboard|mmlu:astronomy|0 +leaderboard|mmlu:business_ethics|0 +leaderboard|mmlu:clinical_knowledge|0 +leaderboard|mmlu:college_biology|0 +leaderboard|mmlu:college_chemistry|0 +leaderboard|mmlu:college_computer_science|0 +leaderboard|mmlu:college_mathematics|0 +leaderboard|mmlu:college_medicine|0 +leaderboard|mmlu:college_physics|0 +leaderboard|mmlu:computer_security|0 +leaderboard|mmlu:conceptual_physics|0 +leaderboard|mmlu:econometrics|0 +leaderboard|mmlu:electrical_engineering|0 +leaderboard|mmlu:elementary_mathematics|0 +leaderboard|mmlu:formal_logic|0 +leaderboard|mmlu:global_facts|0 +leaderboard|mmlu:high_school_biology|0 +leaderboard|mmlu:high_school_chemistry|0 +leaderboard|mmlu:high_school_computer_science|0 +leaderboard|mmlu:high_school_european_history|0 +leaderboard|mmlu:high_school_geography|0 +leaderboard|mmlu:high_school_government_and_politics|0 +leaderboard|mmlu:high_school_macroeconomics|0 +leaderboard|mmlu:high_school_mathematics|0 +leaderboard|mmlu:high_school_microeconomics|0 +leaderboard|mmlu:high_school_physics|0 +leaderboard|mmlu:high_school_psychology|0 +leaderboard|mmlu:high_school_statistics|0 +leaderboard|mmlu:high_school_us_history|0 +leaderboard|mmlu:high_school_world_history|0 +leaderboard|mmlu:human_aging|0 +leaderboard|mmlu:human_sexuality|0 +leaderboard|mmlu:international_law|0 +leaderboard|mmlu:jurisprudence|0 +leaderboard|mmlu:logical_fallacies|0 +leaderboard|mmlu:machine_learning|0 +leaderboard|mmlu:management|0 +leaderboard|mmlu:marketing|0 +leaderboard|mmlu:medical_genetics|0 +leaderboard|mmlu:miscellaneous|0 +leaderboard|mmlu:moral_disputes|0 +leaderboard|mmlu:moral_scenarios|0 +leaderboard|mmlu:nutrition|0 +leaderboard|mmlu:philosophy|0 +leaderboard|mmlu:prehistory|0 +leaderboard|mmlu:professional_accounting|0 +leaderboard|mmlu:professional_law|0 +leaderboard|mmlu:professional_medicine|0 +leaderboard|mmlu:professional_psychology|0 +leaderboard|mmlu:public_relations|0 +leaderboard|mmlu:security_studies|0 +leaderboard|mmlu:sociology|0 +leaderboard|mmlu:us_foreign_policy|0 +leaderboard|mmlu:virology|0 +leaderboard|mmlu:world_religions|0 +lighteval|mtnt2019:en-fr|0 +lighteval|mtnt2019:en-ja|0 +lighteval|mtnt2019:fr-en|0 +lighteval|mtnt2019:ja-en|0 +lighteval|mutual_plus|0 +lighteval|mutual|0 +lighteval|openbookqa|0 +lighteval|piqa|0 +lighteval|prost|0 +lighteval|pubmedqa|0 +lighteval|qa4mre:2011|0 +lighteval|qa4mre:2012|0 +lighteval|qa4mre:2013|0 +lighteval|qasper_ll|0 +lighteval|qasper|0 +lighteval|race:high|0 +lighteval|sciq|0 +lighteval|storycloze:2016|0 +lighteval|storycloze:2018|0 +lighteval|super_glue:boolq|0 +lighteval|super_glue:cb|0 
+lighteval|super_glue:copa|0 +lighteval|super_glue:multirc|0 +lighteval|super_glue:record|0 +lighteval|super_glue:rte|0 +lighteval|super_glue:wic|0 +lighteval|super_glue:wsc|0 +lighteval|swag|0 +lighteval|the_pile:arxiv|0 +lighteval|the_pile:bookcorpus2|0 +lighteval|the_pile:books3|0 +lighteval|the_pile:dm-mathematics|0 +lighteval|the_pile:enron|0 +lighteval|the_pile:europarl|0 +lighteval|the_pile:freelaw|0 +lighteval|the_pile:github|0 +lighteval|the_pile:gutenberg|0 +lighteval|the_pile:hackernews|0 +lighteval|the_pile:nih-exporter|0 +lighteval|the_pile:opensubtitles|0 +lighteval|the_pile:openwebtext2|0 +lighteval|the_pile:philpapers|0 +lighteval|the_pile:pile-cc|0 +lighteval|the_pile:pubmed-abstracts|0 +lighteval|the_pile:pubmed-central|0 +lighteval|the_pile:stackexchange|0 +lighteval|the_pile:ubuntu-irc|0 +lighteval|the_pile:uspto|0 +lighteval|the_pile:wikipedia|0 +lighteval|the_pile:youtubesubtitles|0 +lighteval|toxigen|0 +lighteval|triviaqa|0 +lighteval|truthfulqa:gen|0 +leaderboard|truthfulqa:mc|0 +lighteval|unscramble:anagrams1|0 +lighteval|unscramble:anagrams2|0 +lighteval|unscramble:cycle_letters|0 +lighteval|unscramble:random_insertion|0 +lighteval|unscramble:reversed_words|0 +lighteval|webqs|0 +lighteval|wikitext|0 +leaderboard|winogrande|0 +lighteval|wmt08:cs-en|0 +lighteval|wmt08:de-en|0 +lighteval|wmt08:en-cs|0 +lighteval|wmt08:en-de|0 +lighteval|wmt08:en-es|0 +lighteval|wmt08:en-fr|0 +lighteval|wmt08:en-hu|0 +lighteval|wmt08:es-en|0 +lighteval|wmt08:fr-en|0 +lighteval|wmt08:hu-en|0 +lighteval|wmt09:cs-en|0 +lighteval|wmt09:de-en|0 +lighteval|wmt09:en-cs|0 +lighteval|wmt09:en-de|0 +lighteval|wmt09:en-es|0 +lighteval|wmt09:en-fr|0 +lighteval|wmt09:en-hu|0 +lighteval|wmt09:en-it|0 +lighteval|wmt09:es-en|0 +lighteval|wmt09:fr-en|0 +lighteval|wmt09:hu-en|0 +lighteval|wmt09:it-en|0 +lighteval|wmt10:cs-en|0 +lighteval|wmt10:de-en|0 +lighteval|wmt10:en-cs|0 +lighteval|wmt10:en-de|0 +lighteval|wmt10:en-es|0 +lighteval|wmt10:en-fr|0 +lighteval|wmt10:es-en|0 +lighteval|wmt10:fr-en|0 +lighteval|wmt11:cs-en|0 +lighteval|wmt11:de-en|0 +lighteval|wmt11:en-cs|0 +lighteval|wmt11:en-de|0 +lighteval|wmt11:en-es|0 +lighteval|wmt11:en-fr|0 +lighteval|wmt11:es-en|0 +lighteval|wmt11:fr-en|0 +lighteval|wmt12:cs-en|0 +lighteval|wmt12:de-en|0 +lighteval|wmt12:en-cs|0 +lighteval|wmt12:en-de|0 +lighteval|wmt12:en-es|0 +lighteval|wmt12:en-fr|0 +lighteval|wmt12:es-en|0 +lighteval|wmt12:fr-en|0 +lighteval|wmt13:cs-en|0 +lighteval|wmt13:de-en|0 +lighteval|wmt13:en-cs|0 +lighteval|wmt13:en-de|0 +lighteval|wmt13:en-es|0 +lighteval|wmt13:en-fr|0 +lighteval|wmt13:en-ru|0 +lighteval|wmt13:es-en|0 +lighteval|wmt13:fr-en|0 +lighteval|wmt13:ru-en|0 +lighteval|wmt14:cs-en|0 +lighteval|wmt14:de-en|0 +lighteval|wmt14:en-cs|0 +lighteval|wmt14:en-de|0 +lighteval|wmt14:en-fr|0 +lighteval|wmt14:en-fr|0 +lighteval|wmt14:en-hi|0 +lighteval|wmt14:en-ru|0 +lighteval|wmt14:fr-en|0 +lighteval|wmt14:fr-en|0 +lighteval|wmt14:hi-en|0 +lighteval|wmt14:ru-en|0 +lighteval|wmt15:cs-en|0 +lighteval|wmt15:de-en|0 +lighteval|wmt15:en-cs|0 +lighteval|wmt15:en-de|0 +lighteval|wmt15:en-fi|0 +lighteval|wmt15:en-fr|0 +lighteval|wmt15:en-ru|0 +lighteval|wmt15:fi-en|0 +lighteval|wmt15:fr-en|0 +lighteval|wmt15:ru-en|0 +lighteval|wmt16:cs-en|0 +lighteval|wmt16:de-en|0 +lighteval|wmt16:de-en|0 +lighteval|wmt16:en-cs|0 +lighteval|wmt16:en-de|0 +lighteval|wmt16:en-de|0 +lighteval|wmt16:en-fi|0 +lighteval|wmt16:en-ro|0 +lighteval|wmt16:en-ro|0 +lighteval|wmt16:en-ru|0 +lighteval|wmt16:en-tr|0 +lighteval|wmt16:fi-en|0 +lighteval|wmt16:ro-en|0 
+lighteval|wmt16:ro-en|0 +lighteval|wmt16:ru-en|0 +lighteval|wmt16:tr-en|0 +lighteval|wmt17:cs-en|0 +lighteval|wmt17:de-en|0 +lighteval|wmt17:en-cs|0 +lighteval|wmt17:en-de|0 +lighteval|wmt17:en-fi|0 +lighteval|wmt17:en-lv|0 +lighteval|wmt17:en-ru|0 +lighteval|wmt17:en-tr|0 +lighteval|wmt17:en-zh|0 +lighteval|wmt17:fi-en|0 +lighteval|wmt17:lv-en|0 +lighteval|wmt17:ru-en|0 +lighteval|wmt17:tr-en|0 +lighteval|wmt17:zh-en|0 +lighteval|wmt18:cs-en|0 +lighteval|wmt18:de-en|0 +lighteval|wmt18:en-cs|0 +lighteval|wmt18:en-de|0 +lighteval|wmt18:en-et|0 +lighteval|wmt18:en-fi|0 +lighteval|wmt18:en-ru|0 +lighteval|wmt18:en-tr|0 +lighteval|wmt18:en-zh|0 +lighteval|wmt18:et-en|0 +lighteval|wmt18:fi-en|0 +lighteval|wmt18:ru-en|0 +lighteval|wmt18:tr-en|0 +lighteval|wmt18:zh-en|0 +lighteval|wmt19:cs-de|0 +lighteval|wmt19:de-cs|0 +lighteval|wmt19:de-en|0 +lighteval|wmt19:de-fr|0 +lighteval|wmt19:en-cs|0 +lighteval|wmt19:en-de|0 +lighteval|wmt19:en-fi|0 +lighteval|wmt19:en-gu|0 +lighteval|wmt19:en-kk|0 +lighteval|wmt19:en-lt|0 +lighteval|wmt19:en-ru|0 +lighteval|wmt19:en-zh|0 +lighteval|wmt19:fi-en|0 +lighteval|wmt19:fr-de|0 +lighteval|wmt19:gu-en|0 +lighteval|wmt19:kk-en|0 +lighteval|wmt19:lt-en|0 +lighteval|wmt19:ru-en|0 +lighteval|wmt19:zh-en|0 +lighteval|wmt20:cs-en|0 +lighteval|wmt20:de-en|0 +lighteval|wmt20:de-fr|0 +lighteval|wmt20:en-cs|0 +lighteval|wmt20:en-de|0 +lighteval|wmt20:en-iu|0 +lighteval|wmt20:en-ja|0 +lighteval|wmt20:en-km|0 +lighteval|wmt20:en-pl|0 +lighteval|wmt20:en-ps|0 +lighteval|wmt20:en-ru|0 +lighteval|wmt20:en-ta|0 +lighteval|wmt20:en-zh|0 +lighteval|wmt20:fr-de|0 +lighteval|wmt20:iu-en|0 +lighteval|wmt20:ja-en|0 +lighteval|wmt20:km-en|0 +lighteval|wmt20:pl-en|0 +lighteval|wmt20:ps-en|0 +lighteval|wmt20:ru-en|0 +lighteval|wmt20:ta-en|0 +lighteval|wmt20:zh-en|0 +lighteval|wsc273|0 +lighteval|xcopa:en|0 +lighteval|xcopa:et|0 +lighteval|xcopa:ht|0 +lighteval|xcopa:id|0 +lighteval|xcopa:it|0 +lighteval|xcopa:qu|0 +lighteval|xcopa:sw|0 +lighteval|xcopa:ta|0 +lighteval|xcopa:th|0 +lighteval|xcopa:tr|0 +lighteval|xcopa:vi|0 +lighteval|xcopa:zh|0 +lighteval|xstory_cloze:ar|0 +lighteval|xstory_cloze:en|0 +lighteval|xstory_cloze:es|0 +lighteval|xstory_cloze:eu|0 +lighteval|xstory_cloze:hi|0 +lighteval|xstory_cloze:id|0 +lighteval|xstory_cloze:my|0 +lighteval|xstory_cloze:ru|0 +lighteval|xstory_cloze:sw|0 +lighteval|xstory_cloze:te|0 +lighteval|xstory_cloze:zh|0 +lighteval|xwinograd:en|0 +lighteval|xwinograd:fr|0 +lighteval|xwinograd:jp|0 +lighteval|xwinograd:pt|0 +lighteval|xwinograd:ru|0 +lighteval|xwinograd:zh|0 +original|arc:c:letters|0 +original|arc:c:options|0 +original|arc:c:simple|0 +original|mmlu:abstract_algebra|0 +original|mmlu:anatomy|0 +original|mmlu:astronomy|0 +original|mmlu:business_ethics|0 +original|mmlu:clinical_knowledge|0 +original|mmlu:college_biology|0 +original|mmlu:college_chemistry|0 +original|mmlu:college_computer_science|0 +original|mmlu:college_mathematics|0 +original|mmlu:college_medicine|0 +original|mmlu:college_physics|0 +original|mmlu:computer_security|0 +original|mmlu:conceptual_physics|0 +original|mmlu:econometrics|0 +original|mmlu:electrical_engineering|0 +original|mmlu:elementary_mathematics|0 +original|mmlu:formal_logic|0 +original|mmlu:global_facts|0 +original|mmlu:high_school_biology|0 +original|mmlu:high_school_chemistry|0 +original|mmlu:high_school_computer_science|0 +original|mmlu:high_school_european_history|0 +original|mmlu:high_school_geography|0 +original|mmlu:high_school_government_and_politics|0 +original|mmlu:high_school_macroeconomics|0 
+original|mmlu:high_school_mathematics|0 +original|mmlu:high_school_microeconomics|0 +original|mmlu:high_school_physics|0 +original|mmlu:high_school_psychology|0 +original|mmlu:high_school_statistics|0 +original|mmlu:high_school_us_history|0 +original|mmlu:high_school_world_history|0 +original|mmlu:human_aging|0 +original|mmlu:human_sexuality|0 +original|mmlu:international_law|0 +original|mmlu:jurisprudence|0 +original|mmlu:logical_fallacies|0 +original|mmlu:machine_learning|0 +original|mmlu:management|0 +original|mmlu:marketing|0 +original|mmlu:medical_genetics|0 +original|mmlu:miscellaneous|0 +original|mmlu:moral_disputes|0 +original|mmlu:moral_scenarios|0 +original|mmlu:nutrition|0 +original|mmlu:philosophy|0 +original|mmlu:prehistory|0 +original|mmlu:professional_accounting|0 +original|mmlu:professional_law|0 +original|mmlu:professional_medicine|0 +original|mmlu:professional_psychology|0 +original|mmlu:public_relations|0 +original|mmlu:security_studies|0 +original|mmlu:sociology|0 +original|mmlu:us_foreign_policy|0 +original|mmlu:virology|0 +original|mmlu:world_religions|0 +original|mmlu|0 diff --git a/examples/tasks/bbh.txt b/examples/tasks/bbh.txt index 6b90fa3ae..c12ff66c4 100644 --- a/examples/tasks/bbh.txt +++ b/examples/tasks/bbh.txt @@ -1,36 +1,36 @@ -lighteval|bigbench:causal_judgment|3|0 -lighteval|bigbench:date_understanding|3|0 -lighteval|bigbench:disambiguation_qa|3|0 -lighteval|bigbench:geometric_shapes|3|0 -lighteval|bigbench:logical_deduction_five_objects|3|0 -lighteval|bigbench:logical_deduction_seven_objects|3|0 -lighteval|bigbench:logical_deduction_three_objects|3|0 -lighteval|bigbench:movie_recommendation|3|0 -lighteval|bigbench:navigate|3|0 -lighteval|bigbench:reasoning_about_colored_objects|3|0 -lighteval|bigbench:ruin_names|3|0 -lighteval|bigbench:salient_translation_error_detection|3|0 -lighteval|bigbench:snarks|3|0 -lighteval|bigbench:sports_understanding|3|0 -lighteval|bigbench:temporal_sequences|3|0 -lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0 -lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0 -lighteval|bigbench:tracking_shuffled_objects_three_objects|3|0 -harness|bigbench:causal_judgment|3|0 -harness|bigbench:date_understanding|3|0 -harness|bigbench:disambiguation_qa|3|0 -harness|bigbench:geometric_shapes|3|0 -harness|bigbench:logical_deduction_five_objects|3|0 -harness|bigbench:logical_deduction_seven_objects|3|0 -harness|bigbench:logical_deduction_three_objects|3|0 -harness|bigbench:movie_recommendation|3|0 -harness|bigbench:navigate|3|0 -harness|bigbench:reasoning_about_colored_objects|3|0 -harness|bigbench:ruin_names|3|0 -harness|bigbench:salient_translation_error_detection|3|0 -harness|bigbench:snarks|3|0 -harness|bigbench:sports_understanding|3|0 -harness|bigbench:temporal_sequences|3|0 -harness|bigbench:tracking_shuffled_objects_five_objects|3|0 -harness|bigbench:tracking_shuffled_objects_seven_objects|3|0 -harness|bigbench:tracking_shuffled_objects_three_objects|3|0 +lighteval|bigbench:causal_judgment|3 +lighteval|bigbench:date_understanding|3 +lighteval|bigbench:disambiguation_qa|3 +lighteval|bigbench:geometric_shapes|3 +lighteval|bigbench:logical_deduction_five_objects|3 +lighteval|bigbench:logical_deduction_seven_objects|3 +lighteval|bigbench:logical_deduction_three_objects|3 +lighteval|bigbench:movie_recommendation|3 +lighteval|bigbench:navigate|3 +lighteval|bigbench:reasoning_about_colored_objects|3 +lighteval|bigbench:ruin_names|3 +lighteval|bigbench:salient_translation_error_detection|3 
+lighteval|bigbench:snarks|3
+lighteval|bigbench:sports_understanding|3
+lighteval|bigbench:temporal_sequences|3
+lighteval|bigbench:tracking_shuffled_objects_five_objects|3
+lighteval|bigbench:tracking_shuffled_objects_seven_objects|3
+lighteval|bigbench:tracking_shuffled_objects_three_objects|3
+harness|bigbench:causal_judgment|3
+harness|bigbench:date_understanding|3
+harness|bigbench:disambiguation_qa|3
+harness|bigbench:geometric_shapes|3
+harness|bigbench:logical_deduction_five_objects|3
+harness|bigbench:logical_deduction_seven_objects|3
+harness|bigbench:logical_deduction_three_objects|3
+harness|bigbench:movie_recommendation|3
+harness|bigbench:navigate|3
+harness|bigbench:reasoning_about_colored_objects|3
+harness|bigbench:ruin_names|3
+harness|bigbench:salient_translation_error_detection|3
+harness|bigbench:snarks|3
+harness|bigbench:sports_understanding|3
+harness|bigbench:temporal_sequences|3
+harness|bigbench:tracking_shuffled_objects_five_objects|3
+harness|bigbench:tracking_shuffled_objects_seven_objects|3
+harness|bigbench:tracking_shuffled_objects_three_objects|3
diff --git a/examples/tasks/fine_tasks/cf/ar.txt b/examples/tasks/fine_tasks/cf/ar.txt
index 8e7bbe0b7..e9c025ecc 100644
--- a/examples/tasks/fine_tasks/cf/ar.txt
+++ b/examples/tasks/fine_tasks/cf/ar.txt
@@ -1,23 +1,23 @@
 # General Knowledge (GK)
-lighteval|exams_ara_cf|0|1
-lighteval|mmlu_ara_cf|0|1
-lighteval|alghafa_arc_ara_cf:easy|0|1
-lighteval|alghafa_sciqa_ara_cf|0|1
+lighteval|exams_ara_cf|0
+lighteval|mmlu_ara_cf|0
+lighteval|alghafa_arc_ara_cf:easy|0
+lighteval|alghafa_sciqa_ara_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_arb_Arab_cf|0|1
-lighteval|soqal_ara_cf|0|1
-lighteval|mlqa_ara|0|1
-lighteval|tydiqa_ara|0|1
-lighteval|alghafa_race_ara_cf|0|1
-lighteval|arcd_ara|0|1
+lighteval|belebele_arb_Arab_cf|0
+lighteval|soqal_ara_cf|0
+lighteval|mlqa_ara|0
+lighteval|tydiqa_ara|0
+lighteval|alghafa_race_ara_cf|0
+lighteval|arcd_ara|0

 # Reasoning (RES)
-lighteval|xcodah_ara_cf|0|1
-lighteval|alghafa_piqa_ara_cf|0|1
-lighteval|xcsqa_ara_cf|0|1
+lighteval|xcodah_ara_cf|0
+lighteval|alghafa_piqa_ara_cf|0
+lighteval|xcsqa_ara_cf|0

 # Natural Language Understanding (NLU)
-lighteval|xnli2.0_ara_cf|0|1
-lighteval|mlmm_hellaswag_ara_cf|0|1
-lighteval|xstory_cloze_ara_cf|0|1
+lighteval|xnli2.0_ara_cf|0
+lighteval|mlmm_hellaswag_ara_cf|0
+lighteval|xstory_cloze_ara_cf|0
diff --git a/examples/tasks/fine_tasks/cf/fr.txt b/examples/tasks/fine_tasks/cf/fr.txt
index e20a4808f..9e822cd2e 100644
--- a/examples/tasks/fine_tasks/cf/fr.txt
+++ b/examples/tasks/fine_tasks/cf/fr.txt
@@ -1,16 +1,16 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_fra_cf|0|1
-lighteval|mlmm_arc_fra_cf:challenge|0|1
-lighteval|mintaka_fra|0|1
+lighteval|meta_mmlu_fra_cf|0
+lighteval|mlmm_arc_fra_cf:challenge|0
+lighteval|mintaka_fra|0

 # Reading Comprehension (RC)
-lighteval|belebele_fra_Latn_cf|0|1
-lighteval|fquadv2_fra|0|1
+lighteval|belebele_fra_Latn_cf|0
+lighteval|fquadv2_fra|0

 # Reasoning (RES)
-lighteval|xcodah_fra_cf|0|1
-lighteval|xcsqa_fra_cf|0|1
+lighteval|xcodah_fra_cf|0
+lighteval|xcsqa_fra_cf|0

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_fra_cf|0|1
-lighteval|xnli2.0_fra_cf|0|1
+lighteval|mlmm_hellaswag_fra_cf|0
+lighteval|xnli2.0_fra_cf|0
diff --git a/examples/tasks/fine_tasks/cf/hi.txt b/examples/tasks/fine_tasks/cf/hi.txt
index 41aa6477e..7c7b565d4 100644
--- a/examples/tasks/fine_tasks/cf/hi.txt
+++ b/examples/tasks/fine_tasks/cf/hi.txt
@@ -1,17 +1,17 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_hin_cf|0|1
-lighteval|community_arc_hin_cf:easy|0|1
+lighteval|meta_mmlu_hin_cf|0
+lighteval|community_arc_hin_cf:easy|0

 # Reading Comprehension (RC)
-lighteval|belebele_hin_Deva_cf|0|1
-lighteval|indicqa_hin|0|1
+lighteval|belebele_hin_Deva_cf|0
+lighteval|indicqa_hin|0

 # Reasoning (RES)
-lighteval|xcodah_hin_cf|0|1
-lighteval|indicxcopa_hin_cf|0|1
-lighteval|xcsqa_hin_cf|0|1
+lighteval|xcodah_hin_cf|0
+lighteval|indicxcopa_hin_cf|0
+lighteval|xcsqa_hin_cf|0

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_hin_cf|0|1
-lighteval|indicnxnli_hin_cf|0|1
-lighteval|xstory_cloze_hin_cf|0|1
+lighteval|mlmm_hellaswag_hin_cf|0
+lighteval|indicnxnli_hin_cf|0
+lighteval|xstory_cloze_hin_cf|0
diff --git a/examples/tasks/fine_tasks/cf/ru.txt b/examples/tasks/fine_tasks/cf/ru.txt
index d16e07d72..3e37ec3c0 100644
--- a/examples/tasks/fine_tasks/cf/ru.txt
+++ b/examples/tasks/fine_tasks/cf/ru.txt
@@ -1,20 +1,20 @@
 # General Knowledge (GK)
-lighteval|mlmm_arc_rus_cf:challenge|0|1
-lighteval|rummlu_rus_cf|0|1
-lighteval|mera_openbookqa_rus_cf|0|1
+lighteval|mlmm_arc_rus_cf:challenge|0
+lighteval|rummlu_rus_cf|0
+lighteval|mera_openbookqa_rus_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_rus_Cyrl_cf|0|1
-lighteval|tydiqa_rus|0|1
-lighteval|sber_squad_rus|0|1
-lighteval|xquad_rus|0|1
+lighteval|belebele_rus_Cyrl_cf|0
+lighteval|tydiqa_rus|0
+lighteval|sber_squad_rus|0
+lighteval|xquad_rus|0

 # Reasoning (RES)
-lighteval|parus_rus_cf|0|1
-lighteval|xcodah_rus_cf|0|1
-lighteval|xcsqa_rus_cf|0|1
+lighteval|parus_rus_cf|0
+lighteval|xcodah_rus_cf|0
+lighteval|xcsqa_rus_cf|0

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_rus_cf|0|1
-lighteval|xnli2.0_rus_cf|0|1
-lighteval|xstory_cloze_rus_cf|0|1
+lighteval|mlmm_hellaswag_rus_cf|0
+lighteval|xnli2.0_rus_cf|0
+lighteval|xstory_cloze_rus_cf|0
diff --git a/examples/tasks/fine_tasks/cf/sw.txt b/examples/tasks/fine_tasks/cf/sw.txt
index 67406e2a1..e01ebde38 100644
--- a/examples/tasks/fine_tasks/cf/sw.txt
+++ b/examples/tasks/fine_tasks/cf/sw.txt
@@ -1,17 +1,17 @@
 # General Knowledge (GK)
-lighteval|community_arc_swa_cf:easy|0|1
-lighteval|m3exams_swa_cf|0|1
-lighteval|openai_mmlu_swa_cf|0|1
+lighteval|community_arc_swa_cf:easy|0
+lighteval|m3exams_swa_cf|0
+lighteval|openai_mmlu_swa_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_swh_Latn_cf|0|1
-lighteval|kenswquad_swa|0|1
-lighteval|tydiqa_swa|0|1
+lighteval|belebele_swh_Latn_cf|0
+lighteval|kenswquad_swa|0
+lighteval|tydiqa_swa|0

 # Reasoning (RES)
-lighteval|xcsqa_swa_cf|0|1
-lighteval|xcopa_swa_cf|0|1
+lighteval|xcsqa_swa_cf|0
+lighteval|xcopa_swa_cf|0

 # Natural Language Understanding (NLU)
-lighteval|xnli2.0_swa_cf|0|1
-lighteval|xstory_cloze_swa_cf|0|1
+lighteval|xnli2.0_swa_cf|0
+lighteval|xstory_cloze_swa_cf|0
diff --git a/examples/tasks/fine_tasks/cf/te.txt b/examples/tasks/fine_tasks/cf/te.txt
index 7b844868c..2c64cbc2b 100644
--- a/examples/tasks/fine_tasks/cf/te.txt
+++ b/examples/tasks/fine_tasks/cf/te.txt
@@ -1,14 +1,14 @@
 # General Knowledge (GK)
-lighteval|mlmm_mmlu_tel_cf|0|1
+lighteval|mlmm_mmlu_tel_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_tel_Telu_cf|0|1
-lighteval|indicqa_tel|0|1
+lighteval|belebele_tel_Telu_cf|0
+lighteval|indicqa_tel|0

 # Reasoning (RES)
-lighteval|indicxcopa_tel_cf|0|1
+lighteval|indicxcopa_tel_cf|0

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tel_cf|0|1
-lighteval|indicnxnli_tel_cf|0|1
-lighteval|xstory_cloze_tel_cf|0|1
+lighteval|community_hellaswag_tel_cf|0
+lighteval|indicnxnli_tel_cf|0
+lighteval|xstory_cloze_tel_cf|0
diff --git a/examples/tasks/fine_tasks/cf/th.txt b/examples/tasks/fine_tasks/cf/th.txt
index 16743e9af..89a895063 100644
--- a/examples/tasks/fine_tasks/cf/th.txt
+++ b/examples/tasks/fine_tasks/cf/th.txt
@@ -1,12 +1,12 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_tha_cf|0|1
-lighteval|m3exams_tha_cf|0|1
+lighteval|meta_mmlu_tha_cf|0
+lighteval|m3exams_tha_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_tha_Thai_cf|0|1
-lighteval|thaiqa_tha|0|1
-lighteval|xquad_tha|0|1
+lighteval|belebele_tha_Thai_cf|0
+lighteval|thaiqa_tha|0
+lighteval|xquad_tha|0

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tha_cf|0|1
-lighteval|xnli2.0_tha_cf|0|1
+lighteval|community_hellaswag_tha_cf|0
+lighteval|xnli2.0_tha_cf|0
diff --git a/examples/tasks/fine_tasks/cf/tr.txt b/examples/tasks/fine_tasks/cf/tr.txt
index d999be71c..5c31d63d5 100644
--- a/examples/tasks/fine_tasks/cf/tr.txt
+++ b/examples/tasks/fine_tasks/cf/tr.txt
@@ -1,16 +1,16 @@
 # General Knowledge (GK)
-lighteval|community_arc_tur_cf:easy|0|1
-lighteval|exams_tur_cf|0|1
-lighteval|community_mmlu_tur_cf|0|1
+lighteval|community_arc_tur_cf:easy|0
+lighteval|exams_tur_cf|0
+lighteval|community_mmlu_tur_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_tur_Latn_cf|0|1
-lighteval|tquadv2_tur|0|1
-lighteval|xquad_tur|0|1
+lighteval|belebele_tur_Latn_cf|0
+lighteval|tquadv2_tur|0
+lighteval|xquad_tur|0

 # Reasoning (RES)
-lighteval|xcopa_tur_cf|0|1
+lighteval|xcopa_tur_cf|0

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tur_cf|0|1
-lighteval|xnli2.0_tur_cf|0|1
+lighteval|community_hellaswag_tur_cf|0
+lighteval|xnli2.0_tur_cf|0
diff --git a/examples/tasks/fine_tasks/cf/zh.txt b/examples/tasks/fine_tasks/cf/zh.txt
index 76e7e28db..76d0ef068 100644
--- a/examples/tasks/fine_tasks/cf/zh.txt
+++ b/examples/tasks/fine_tasks/cf/zh.txt
@@ -1,22 +1,22 @@
 # General Knowledge (GK)
-lighteval|agieval_zho_cf|0|1
-lighteval|ceval_zho_cf|0|1
-lighteval|cmmlu_zho_cf|0|1
-lighteval|m3exams_zho_cf|0|1
+lighteval|agieval_zho_cf|0
+lighteval|ceval_zho_cf|0
+lighteval|cmmlu_zho_cf|0
+lighteval|m3exams_zho_cf|0

 # Reading Comprehension (RC)
-lighteval|belebele_zho_Hans_cf|0|1
-lighteval|c3_zho_cf|0|1
-lighteval|cmrc2018_zho|0|1
-lighteval|chinese_squad_zho|0|1
+lighteval|belebele_zho_Hans_cf|0
+lighteval|c3_zho_cf|0
+lighteval|cmrc2018_zho|0
+lighteval|chinese_squad_zho|0

 # Reasoning (RES)
-lighteval|xcodah_zho_cf|0|1
-lighteval|xcopa_zho_cf|0|1
-lighteval|xcsqa_zho_cf|0|1
+lighteval|xcodah_zho_cf|0
+lighteval|xcopa_zho_cf|0
+lighteval|xcsqa_zho_cf|0

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_zho_cf|0|1
-lighteval|ocnli_zho_cf|0|1
-lighteval|xwinograd_zho_cf|0|1
-lighteval|xstory_cloze_zho_cf|0|1
+lighteval|mlmm_hellaswag_zho_cf|0
+lighteval|ocnli_zho_cf|0
+lighteval|xwinograd_zho_cf|0
+lighteval|xstory_cloze_zho_cf|0
diff --git a/examples/tasks/fine_tasks/mcf/ar.txt b/examples/tasks/fine_tasks/mcf/ar.txt
index d772e653d..56e94cf00 100644
--- a/examples/tasks/fine_tasks/mcf/ar.txt
+++ b/examples/tasks/fine_tasks/mcf/ar.txt
@@ -1,23 +1,23 @@
 # General Knowledge (GK)
-lighteval|exams_ara_mcf|5|1
-lighteval|mmlu_ara_mcf|5|1
-lighteval|alghafa_arc_ara_mcf:easy|5|1
-lighteval|alghafa_sciqa_ara_mcf|5|1
+lighteval|exams_ara_mcf|5
+lighteval|mmlu_ara_mcf|5
+lighteval|alghafa_arc_ara_mcf:easy|5
+lighteval|alghafa_sciqa_ara_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_arb_Arab_mcf|5|1
-lighteval|soqal_ara_mcf|5|1
-lighteval|mlqa_ara|5|1
-lighteval|tydiqa_ara|5|1
-lighteval|alghafa_race_ara_mcf|5|1
-lighteval|arcd_ara|5|1
+lighteval|belebele_arb_Arab_mcf|5
+lighteval|soqal_ara_mcf|5
+lighteval|mlqa_ara|5
+lighteval|tydiqa_ara|5
+lighteval|alghafa_race_ara_mcf|5
+lighteval|arcd_ara|5

 # Reasoning (RES)
-lighteval|xcodah_ara_mcf|5|1
-lighteval|alghafa_piqa_ara_mcf|5|1
-lighteval|xcsqa_ara_mcf|5|1
+lighteval|xcodah_ara_mcf|5
+lighteval|alghafa_piqa_ara_mcf|5
+lighteval|xcsqa_ara_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|xnli2.0_ara_mcf|5|1
-lighteval|mlmm_hellaswag_ara_mcf|5|1
-lighteval|xstory_cloze_ara_mcf|5|1
+lighteval|xnli2.0_ara_mcf|5
+lighteval|mlmm_hellaswag_ara_mcf|5
+lighteval|xstory_cloze_ara_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/fr.txt b/examples/tasks/fine_tasks/mcf/fr.txt
index 4a7d04eac..e96e5bf49 100644
--- a/examples/tasks/fine_tasks/mcf/fr.txt
+++ b/examples/tasks/fine_tasks/mcf/fr.txt
@@ -1,16 +1,16 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_fra_mcf|5|1
-lighteval|mlmm_arc_fra_mcf:challenge|5|1
-lighteval|mintaka_fra|5|1
+lighteval|meta_mmlu_fra_mcf|5
+lighteval|mlmm_arc_fra_mcf:challenge|5
+lighteval|mintaka_fra|5

 # Reading Comprehension (RC)
-lighteval|belebele_fra_Latn_mcf|5|1
-lighteval|fquadv2_fra|5|1
+lighteval|belebele_fra_Latn_mcf|5
+lighteval|fquadv2_fra|5

 # Reasoning (RES)
-lighteval|xcodah_fra_mcf|5|1
-lighteval|xcsqa_fra_mcf|5|1
+lighteval|xcodah_fra_mcf|5
+lighteval|xcsqa_fra_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_fra_mcf|5|1
-lighteval|xnli2.0_fra_mcf|5|1
+lighteval|mlmm_hellaswag_fra_mcf|5
+lighteval|xnli2.0_fra_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/hi.txt b/examples/tasks/fine_tasks/mcf/hi.txt
index e7298ae70..5140ebc74 100644
--- a/examples/tasks/fine_tasks/mcf/hi.txt
+++ b/examples/tasks/fine_tasks/mcf/hi.txt
@@ -1,17 +1,17 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_hin_mcf|5|1
-lighteval|community_arc_hin_mcf:easy|5|1
+lighteval|meta_mmlu_hin_mcf|5
+lighteval|community_arc_hin_mcf:easy|5

 # Reading Comprehension (RC)
-lighteval|belebele_hin_Deva_mcf|5|1
-lighteval|indicqa_hin|5|1
+lighteval|belebele_hin_Deva_mcf|5
+lighteval|indicqa_hin|5

 # Reasoning (RES)
-lighteval|xcodah_hin_mcf|5|1
-lighteval|indicxcopa_hin_mcf|5|1
-lighteval|xcsqa_hin_mcf|5|1
+lighteval|xcodah_hin_mcf|5
+lighteval|indicxcopa_hin_mcf|5
+lighteval|xcsqa_hin_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_hin_mcf|5|1
-lighteval|indicnxnli_hin_mcf|5|1
-lighteval|xstory_cloze_hin_mcf|5|1
+lighteval|mlmm_hellaswag_hin_mcf|5
+lighteval|indicnxnli_hin_mcf|5
+lighteval|xstory_cloze_hin_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/ru.txt b/examples/tasks/fine_tasks/mcf/ru.txt
index 5598cee78..f6c14a842 100644
--- a/examples/tasks/fine_tasks/mcf/ru.txt
+++ b/examples/tasks/fine_tasks/mcf/ru.txt
@@ -1,20 +1,20 @@
 # General Knowledge (GK)
-lighteval|mlmm_arc_rus_mcf:challenge|5|1
-lighteval|rummlu_rus_mcf|5|1
-lighteval|mera_openbookqa_rus_mcf|5|1
+lighteval|mlmm_arc_rus_mcf:challenge|5
+lighteval|rummlu_rus_mcf|5
+lighteval|mera_openbookqa_rus_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_rus_Cyrl_mcf|5|1
-lighteval|tydiqa_rus|5|1
-lighteval|sber_squad_rus|5|1
-lighteval|xquad_rus|5|1
+lighteval|belebele_rus_Cyrl_mcf|5
+lighteval|tydiqa_rus|5
+lighteval|sber_squad_rus|5
+lighteval|xquad_rus|5

 # Reasoning (RES)
-lighteval|parus_rus_mcf|0|1
-lighteval|xcodah_rus_mcf|5|1
-lighteval|xcsqa_rus_mcf|5|1
+lighteval|parus_rus_mcf|0
+lighteval|xcodah_rus_mcf|5
+lighteval|xcsqa_rus_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_rus_mcf|0|1
-lighteval|xnli2.0_rus_mcf|5|1
-lighteval|xstory_cloze_rus_mcf|5|1
+lighteval|mlmm_hellaswag_rus_mcf|0
+lighteval|xnli2.0_rus_mcf|5
+lighteval|xstory_cloze_rus_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/sw.txt b/examples/tasks/fine_tasks/mcf/sw.txt
index acb53d364..a1f726954 100644
--- a/examples/tasks/fine_tasks/mcf/sw.txt
+++ b/examples/tasks/fine_tasks/mcf/sw.txt
@@ -1,17 +1,17 @@
 # General Knowledge (GK)
-lighteval|community_arc_swa_mcf:easy|5|1
-lighteval|m3exams_swa_mcf|5|1
-lighteval|openai_mmlu_swa_mcf|5|1
+lighteval|community_arc_swa_mcf:easy|5
+lighteval|m3exams_swa_mcf|5
+lighteval|openai_mmlu_swa_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_swh_Latn_mcf|5|1
-lighteval|kenswquad_swa|5|1
-lighteval|tydiqa_swa|5|1
+lighteval|belebele_swh_Latn_mcf|5
+lighteval|kenswquad_swa|5
+lighteval|tydiqa_swa|5

 # Reasoning (RES)
-lighteval|xcsqa_swa_mcf|5|1
-lighteval|xcopa_swa_mcf|5|1
+lighteval|xcsqa_swa_mcf|5
+lighteval|xcopa_swa_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|xnli2.0_swa_mcf|5|1
-lighteval|xstory_cloze_swa_mcf|5|1
+lighteval|xnli2.0_swa_mcf|5
+lighteval|xstory_cloze_swa_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/te.txt b/examples/tasks/fine_tasks/mcf/te.txt
index 07b609c29..c0d620686 100644
--- a/examples/tasks/fine_tasks/mcf/te.txt
+++ b/examples/tasks/fine_tasks/mcf/te.txt
@@ -1,14 +1,14 @@
 # General Knowledge (GK)
-lighteval|mlmm_mmlu_tel_mcf|5|1
+lighteval|mlmm_mmlu_tel_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_tel_Telu_mcf|5|1
-lighteval|indicqa_tel|5|1
+lighteval|belebele_tel_Telu_mcf|5
+lighteval|indicqa_tel|5

 # Reasoning (RES)
-lighteval|indicxcopa_tel_mcf|5|1
+lighteval|indicxcopa_tel_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tel_mcf|5|1
-lighteval|indicnxnli_tel_mcf|5|1
-lighteval|xstory_cloze_tel_mcf|5|1
+lighteval|community_hellaswag_tel_mcf|5
+lighteval|indicnxnli_tel_mcf|5
+lighteval|xstory_cloze_tel_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/th.txt b/examples/tasks/fine_tasks/mcf/th.txt
index 4a5acb214..5156a8ea1 100644
--- a/examples/tasks/fine_tasks/mcf/th.txt
+++ b/examples/tasks/fine_tasks/mcf/th.txt
@@ -1,12 +1,12 @@
 # General Knowledge (GK)
-lighteval|meta_mmlu_tha_mcf|5|1
-lighteval|m3exams_tha_mcf|5|1
+lighteval|meta_mmlu_tha_mcf|5
+lighteval|m3exams_tha_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_tha_Thai_mcf|5|1
-lighteval|thaiqa_tha|5|1
-lighteval|xquad_tha|5|1
+lighteval|belebele_tha_Thai_mcf|5
+lighteval|thaiqa_tha|5
+lighteval|xquad_tha|5

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tha_mcf|5|1
-lighteval|xnli2.0_tha_mcf|5|1
+lighteval|community_hellaswag_tha_mcf|5
+lighteval|xnli2.0_tha_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/tr.txt b/examples/tasks/fine_tasks/mcf/tr.txt
index 63ccd0b83..918ea5feb 100644
--- a/examples/tasks/fine_tasks/mcf/tr.txt
+++ b/examples/tasks/fine_tasks/mcf/tr.txt
@@ -1,16 +1,16 @@
 # General Knowledge (GK)
-lighteval|community_arc_tur_mcf:easy|5|1
-lighteval|exams_tur_mcf|5|1
-lighteval|community_mmlu_tur_mcf|5|1
+lighteval|community_arc_tur_mcf:easy|5
+lighteval|exams_tur_mcf|5
+lighteval|community_mmlu_tur_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_tur_Latn_mcf|5|1
-lighteval|tquadv2_tur|5|1
-lighteval|xquad_tur|5|1
+lighteval|belebele_tur_Latn_mcf|5
+lighteval|tquadv2_tur|5
+lighteval|xquad_tur|5

 # Reasoning (RES)
-lighteval|xcopa_tur_mcf|5|1
+lighteval|xcopa_tur_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|community_hellaswag_tur_mcf|5|1
-lighteval|xnli2.0_tur_mcf|5|1
+lighteval|community_hellaswag_tur_mcf|5
+lighteval|xnli2.0_tur_mcf|5
diff --git a/examples/tasks/fine_tasks/mcf/zh.txt b/examples/tasks/fine_tasks/mcf/zh.txt
index a5799d82b..69817c144 100644
--- a/examples/tasks/fine_tasks/mcf/zh.txt
+++ b/examples/tasks/fine_tasks/mcf/zh.txt
@@ -1,22 +1,22 @@
 # General Knowledge (GK)
-lighteval|agieval_zho_mcf|5|1
-lighteval|ceval_zho_mcf|5|1
-lighteval|cmmlu_zho_mcf|5|1
-lighteval|m3exams_zho_mcf|5|1
+lighteval|agieval_zho_mcf|5
+lighteval|ceval_zho_mcf|5
+lighteval|cmmlu_zho_mcf|5
+lighteval|m3exams_zho_mcf|5

 # Reading Comprehension (RC)
-lighteval|belebele_zho_Hans_mcf|5|1
-lighteval|c3_zho_mcf|5|1
-lighteval|cmrc2018_zho|5|1
-lighteval|chinese_squad_zho|5|1
+lighteval|belebele_zho_Hans_mcf|5
+lighteval|c3_zho_mcf|5
+lighteval|cmrc2018_zho|5
+lighteval|chinese_squad_zho|5

 # Reasoning (RES)
-lighteval|xcodah_zho_mcf|5|1
-lighteval|xcopa_zho_mcf|5|1
-lighteval|xcsqa_zho_mcf|5|1
+lighteval|xcodah_zho_mcf|5
+lighteval|xcopa_zho_mcf|5
+lighteval|xcsqa_zho_mcf|5

 # Natural Language Understanding (NLU)
-lighteval|mlmm_hellaswag_zho_mcf|5|1
-lighteval|ocnli_zho_mcf|5|1
-lighteval|xwinograd_zho_mcf|5|1
-lighteval|xstory_cloze_zho_mcf|5|1
+lighteval|mlmm_hellaswag_zho_mcf|5
+lighteval|ocnli_zho_mcf|5
+lighteval|xwinograd_zho_mcf|5
+lighteval|xstory_cloze_zho_mcf|5
diff --git a/examples/tasks/open_llm_leaderboard_tasks.txt b/examples/tasks/open_llm_leaderboard_tasks.txt
index 51de4f473..b87f7a191 100644
--- a/examples/tasks/open_llm_leaderboard_tasks.txt
+++ b/examples/tasks/open_llm_leaderboard_tasks.txt
@@ -1,68 +1,68 @@
 # ARC
-leaderboard|arc:challenge|25|0
+leaderboard|arc:challenge|25
 # HellaSwag
-leaderboard|hellaswag|10|0
+leaderboard|hellaswag|10
 # TruthfulQA
-leaderboard|truthfulqa:mc|0|0
+leaderboard|truthfulqa:mc|0
 # MMLU
-leaderboard|mmlu:abstract_algebra|5|0
-leaderboard|mmlu:anatomy|5|0
-leaderboard|mmlu:astronomy|5|0
-leaderboard|mmlu:business_ethics|5|0
-leaderboard|mmlu:clinical_knowledge|5|0
-leaderboard|mmlu:college_biology|5|0
-leaderboard|mmlu:college_chemistry|5|0
-leaderboard|mmlu:college_computer_science|5|0
-leaderboard|mmlu:college_mathematics|5|0
-leaderboard|mmlu:college_medicine|5|0
-leaderboard|mmlu:college_physics|5|0
-leaderboard|mmlu:computer_security|5|0
-leaderboard|mmlu:conceptual_physics|5|0
-leaderboard|mmlu:econometrics|5|0
-leaderboard|mmlu:electrical_engineering|5|0
-leaderboard|mmlu:elementary_mathematics|5|0
-leaderboard|mmlu:formal_logic|5|0
-leaderboard|mmlu:global_facts|5|0
-leaderboard|mmlu:high_school_biology|5|0
-leaderboard|mmlu:high_school_chemistry|5|0
-leaderboard|mmlu:high_school_computer_science|5|0
-leaderboard|mmlu:high_school_european_history|5|0
-leaderboard|mmlu:high_school_geography|5|0
-leaderboard|mmlu:high_school_government_and_politics|5|0
-leaderboard|mmlu:high_school_macroeconomics|5|0
-leaderboard|mmlu:high_school_mathematics|5|0
-leaderboard|mmlu:high_school_microeconomics|5|0
-leaderboard|mmlu:high_school_physics|5|0
-leaderboard|mmlu:high_school_psychology|5|0
-leaderboard|mmlu:high_school_statistics|5|0
-leaderboard|mmlu:high_school_us_history|5|0
-leaderboard|mmlu:high_school_world_history|5|0
-leaderboard|mmlu:human_aging|5|0
-leaderboard|mmlu:human_sexuality|5|0
-leaderboard|mmlu:international_law|5|0
-leaderboard|mmlu:jurisprudence|5|0
-leaderboard|mmlu:logical_fallacies|5|0
-leaderboard|mmlu:machine_learning|5|0
-leaderboard|mmlu:management|5|0
-leaderboard|mmlu:marketing|5|0
-leaderboard|mmlu:medical_genetics|5|0
-leaderboard|mmlu:miscellaneous|5|0
-leaderboard|mmlu:moral_disputes|5|0
-leaderboard|mmlu:moral_scenarios|5|0
-leaderboard|mmlu:nutrition|5|0
-leaderboard|mmlu:philosophy|5|0
-leaderboard|mmlu:prehistory|5|0
-leaderboard|mmlu:professional_accounting|5|0
-leaderboard|mmlu:professional_law|5|0
-leaderboard|mmlu:professional_medicine|5|0
-leaderboard|mmlu:professional_psychology|5|0
-leaderboard|mmlu:public_relations|5|0
-leaderboard|mmlu:security_studies|5|0
-leaderboard|mmlu:sociology|5|0
-leaderboard|mmlu:us_foreign_policy|5|0
-leaderboard|mmlu:virology|5|0
-leaderboard|mmlu:world_religions|5|0
+leaderboard|mmlu:abstract_algebra|5
+leaderboard|mmlu:anatomy|5
+leaderboard|mmlu:astronomy|5
+leaderboard|mmlu:business_ethics|5
+leaderboard|mmlu:clinical_knowledge|5
+leaderboard|mmlu:college_biology|5
+leaderboard|mmlu:college_chemistry|5
+leaderboard|mmlu:college_computer_science|5
+leaderboard|mmlu:college_mathematics|5
+leaderboard|mmlu:college_medicine|5
+leaderboard|mmlu:college_physics|5
+leaderboard|mmlu:computer_security|5
+leaderboard|mmlu:conceptual_physics|5
+leaderboard|mmlu:econometrics|5
+leaderboard|mmlu:electrical_engineering|5
+leaderboard|mmlu:elementary_mathematics|5
+leaderboard|mmlu:formal_logic|5
+leaderboard|mmlu:global_facts|5
+leaderboard|mmlu:high_school_biology|5
+leaderboard|mmlu:high_school_chemistry|5
+leaderboard|mmlu:high_school_computer_science|5
+leaderboard|mmlu:high_school_european_history|5
+leaderboard|mmlu:high_school_geography|5
+leaderboard|mmlu:high_school_government_and_politics|5
+leaderboard|mmlu:high_school_macroeconomics|5
+leaderboard|mmlu:high_school_mathematics|5
+leaderboard|mmlu:high_school_microeconomics|5
+leaderboard|mmlu:high_school_physics|5
+leaderboard|mmlu:high_school_psychology|5
+leaderboard|mmlu:high_school_statistics|5
+leaderboard|mmlu:high_school_us_history|5
+leaderboard|mmlu:high_school_world_history|5
+leaderboard|mmlu:human_aging|5
+leaderboard|mmlu:human_sexuality|5
+leaderboard|mmlu:international_law|5
+leaderboard|mmlu:jurisprudence|5
+leaderboard|mmlu:logical_fallacies|5
+leaderboard|mmlu:machine_learning|5
+leaderboard|mmlu:management|5
+leaderboard|mmlu:marketing|5
+leaderboard|mmlu:medical_genetics|5
+leaderboard|mmlu:miscellaneous|5
+leaderboard|mmlu:moral_disputes|5
+leaderboard|mmlu:moral_scenarios|5
+leaderboard|mmlu:nutrition|5
+leaderboard|mmlu:philosophy|5
+leaderboard|mmlu:prehistory|5
+leaderboard|mmlu:professional_accounting|5
+leaderboard|mmlu:professional_law|5
+leaderboard|mmlu:professional_medicine|5
+leaderboard|mmlu:professional_psychology|5
+leaderboard|mmlu:public_relations|5
+leaderboard|mmlu:security_studies|5
+leaderboard|mmlu:sociology|5
+leaderboard|mmlu:us_foreign_policy|5
+leaderboard|mmlu:virology|5
+leaderboard|mmlu:world_religions|5
 # WinoGrande
-leaderboard|winogrande|5|0
+leaderboard|winogrande|5
 # GSM8K
-leaderboard|gsm8k|5|0
+leaderboard|gsm8k|5
diff --git a/examples/tasks/recommended_set.txt b/examples/tasks/recommended_set.txt
index d1904e3cc..d55b10a9d 100644
--- a/examples/tasks/recommended_set.txt
+++ b/examples/tasks/recommended_set.txt
@@ -1,160 +1,160 @@
 # Commonsense-QA
-helm|commonsenseqa|0|0
-lighteval|ethics:commonsense|0|0
-lighteval|ethics:deontology|0|0
-lighteval|ethics:justice|0|0
-lighteval|ethics:utilitarianism|0|0
-lighteval|ethics:virtue|0|0
+helm|commonsenseqa|0
+lighteval|ethics:commonsense|0
+lighteval|ethics:deontology|0
+lighteval|ethics:justice|0
+lighteval|ethics:utilitarianism|0
+lighteval|ethics:virtue|0
 # MMLU
-leaderboard|mmlu:abstract_algebra|0|0
-leaderboard|mmlu:anatomy|0|0
-leaderboard|mmlu:astronomy|0|0
-leaderboard|mmlu:business_ethics|0|0
-leaderboard|mmlu:clinical_knowledge|0|0
-leaderboard|mmlu:college_biology|0|0
-leaderboard|mmlu:college_chemistry|0|0
-leaderboard|mmlu:college_computer_science|0|0
-leaderboard|mmlu:college_mathematics|0|0
-leaderboard|mmlu:college_medicine|0|0
-leaderboard|mmlu:college_physics|0|0
-leaderboard|mmlu:computer_security|0|0
-leaderboard|mmlu:conceptual_physics|0|0
-leaderboard|mmlu:econometrics|0|0
-leaderboard|mmlu:electrical_engineering|0|0
-leaderboard|mmlu:elementary_mathematics|0|0
-leaderboard|mmlu:formal_logic|0|0
-leaderboard|mmlu:global_facts|0|0
-leaderboard|mmlu:high_school_biology|0|0
-leaderboard|mmlu:high_school_chemistry|0|0
-leaderboard|mmlu:high_school_computer_science|0|0
-leaderboard|mmlu:high_school_european_history|0|0
-leaderboard|mmlu:high_school_geography|0|0
-leaderboard|mmlu:high_school_government_and_politics|0|0
-leaderboard|mmlu:high_school_macroeconomics|0|0
-leaderboard|mmlu:high_school_mathematics|0|0
-leaderboard|mmlu:high_school_microeconomics|0|0
-leaderboard|mmlu:high_school_physics|0|0
-leaderboard|mmlu:high_school_psychology|0|0
-leaderboard|mmlu:high_school_statistics|0|0
-leaderboard|mmlu:high_school_us_history|0|0
-leaderboard|mmlu:high_school_world_history|0|0
-leaderboard|mmlu:human_aging|0|0
-leaderboard|mmlu:human_sexuality|0|0
-leaderboard|mmlu:international_law|0|0
-leaderboard|mmlu:jurisprudence|0|0
-leaderboard|mmlu:logical_fallacies|0|0
-leaderboard|mmlu:machine_learning|0|0
-leaderboard|mmlu:management|0|0
-leaderboard|mmlu:marketing|0|0
-leaderboard|mmlu:medical_genetics|0|0
-leaderboard|mmlu:miscellaneous|0|0
-leaderboard|mmlu:moral_disputes|0|0
-leaderboard|mmlu:moral_scenarios|0|0
-leaderboard|mmlu:nutrition|0|0
-leaderboard|mmlu:philosophy|0|0
-leaderboard|mmlu:prehistory|0|0
-leaderboard|mmlu:professional_accounting|0|0
-leaderboard|mmlu:professional_law|0|0
-leaderboard|mmlu:professional_medicine|0|0
-leaderboard|mmlu:professional_psychology|0|0
-leaderboard|mmlu:public_relations|0|0
-leaderboard|mmlu:security_studies|0|0
-leaderboard|mmlu:sociology|0|0
-leaderboard|mmlu:us_foreign_policy|0|0
-leaderboard|mmlu:virology|0|0
-leaderboard|mmlu:world_religions|0|0
-original|mmlu:abstract_algebra|0|0
-original|mmlu:anatomy|0|0
-original|mmlu:astronomy|0|0
-original|mmlu:business_ethics|0|0
-original|mmlu:clinical_knowledge|0|0
-original|mmlu:college_biology|0|0
-original|mmlu:college_chemistry|0|0
-original|mmlu:college_computer_science|0|0
-original|mmlu:college_mathematics|0|0
-original|mmlu:college_medicine|0|0
-original|mmlu:college_physics|0|0
-original|mmlu:computer_security|0|0
-original|mmlu:conceptual_physics|0|0
-original|mmlu:econometrics|0|0
-original|mmlu:electrical_engineering|0|0
-original|mmlu:elementary_mathematics|0|0
-original|mmlu:formal_logic|0|0
-original|mmlu:global_facts|0|0
-original|mmlu:high_school_biology|0|0
-original|mmlu:high_school_chemistry|0|0
-original|mmlu:high_school_computer_science|0|0
-original|mmlu:high_school_european_history|0|0
-original|mmlu:high_school_geography|0|0
-original|mmlu:high_school_government_and_politics|0|0
-original|mmlu:high_school_macroeconomics|0|0
-original|mmlu:high_school_mathematics|0|0
-original|mmlu:high_school_microeconomics|0|0
-original|mmlu:high_school_physics|0|0
-original|mmlu:high_school_psychology|0|0
-original|mmlu:high_school_statistics|0|0
-original|mmlu:high_school_us_history|0|0
-original|mmlu:high_school_world_history|0|0
-original|mmlu:human_aging|0|0
-original|mmlu:human_sexuality|0|0
-original|mmlu:international_law|0|0
-original|mmlu:jurisprudence|0|0
-original|mmlu:logical_fallacies|0|0
-original|mmlu:machine_learning|0|0
-original|mmlu:management|0|0
-original|mmlu:marketing|0|0
-original|mmlu:medical_genetics|0|0
-original|mmlu:miscellaneous|0|0
-original|mmlu:moral_disputes|0|0
-original|mmlu:moral_scenarios|0|0
-original|mmlu:nutrition|0|0
-original|mmlu:philosophy|0|0
-original|mmlu:prehistory|0|0
-original|mmlu:professional_accounting|0|0
-original|mmlu:professional_law|0|0
-original|mmlu:professional_medicine|0|0
-original|mmlu:professional_psychology|0|0
-original|mmlu:public_relations|0|0
-original|mmlu:security_studies|0|0
-original|mmlu:sociology|0|0
-original|mmlu:us_foreign_policy|0|0
-original|mmlu:virology|0|0
-original|mmlu:world_religions|0|0
-original|mmlu|0|0
+leaderboard|mmlu:abstract_algebra|0
+leaderboard|mmlu:anatomy|0
+leaderboard|mmlu:astronomy|0
+leaderboard|mmlu:business_ethics|0
+leaderboard|mmlu:clinical_knowledge|0
+leaderboard|mmlu:college_biology|0
+leaderboard|mmlu:college_chemistry|0
+leaderboard|mmlu:college_computer_science|0
+leaderboard|mmlu:college_mathematics|0
+leaderboard|mmlu:college_medicine|0
+leaderboard|mmlu:college_physics|0
+leaderboard|mmlu:computer_security|0
+leaderboard|mmlu:conceptual_physics|0
+leaderboard|mmlu:econometrics|0
+leaderboard|mmlu:electrical_engineering|0
+leaderboard|mmlu:elementary_mathematics|0
+leaderboard|mmlu:formal_logic|0
+leaderboard|mmlu:global_facts|0
+leaderboard|mmlu:high_school_biology|0
+leaderboard|mmlu:high_school_chemistry|0
+leaderboard|mmlu:high_school_computer_science|0
+leaderboard|mmlu:high_school_european_history|0
+leaderboard|mmlu:high_school_geography|0
+leaderboard|mmlu:high_school_government_and_politics|0
+leaderboard|mmlu:high_school_macroeconomics|0
+leaderboard|mmlu:high_school_mathematics|0
+leaderboard|mmlu:high_school_microeconomics|0
+leaderboard|mmlu:high_school_physics|0
+leaderboard|mmlu:high_school_psychology|0
+leaderboard|mmlu:high_school_statistics|0
+leaderboard|mmlu:high_school_us_history|0
+leaderboard|mmlu:high_school_world_history|0
+leaderboard|mmlu:human_aging|0
+leaderboard|mmlu:human_sexuality|0
+leaderboard|mmlu:international_law|0
+leaderboard|mmlu:jurisprudence|0
+leaderboard|mmlu:logical_fallacies|0
+leaderboard|mmlu:machine_learning|0
+leaderboard|mmlu:management|0
+leaderboard|mmlu:marketing|0
+leaderboard|mmlu:medical_genetics|0
+leaderboard|mmlu:miscellaneous|0
+leaderboard|mmlu:moral_disputes|0
+leaderboard|mmlu:moral_scenarios|0
+leaderboard|mmlu:nutrition|0
+leaderboard|mmlu:philosophy|0
+leaderboard|mmlu:prehistory|0
+leaderboard|mmlu:professional_accounting|0
+leaderboard|mmlu:professional_law|0
+leaderboard|mmlu:professional_medicine|0
+leaderboard|mmlu:professional_psychology|0
+leaderboard|mmlu:public_relations|0
+leaderboard|mmlu:security_studies|0
+leaderboard|mmlu:sociology|0
+leaderboard|mmlu:us_foreign_policy|0
+leaderboard|mmlu:virology|0
+leaderboard|mmlu:world_religions|0
+original|mmlu:abstract_algebra|0
+original|mmlu:anatomy|0
+original|mmlu:astronomy|0
+original|mmlu:business_ethics|0
+original|mmlu:clinical_knowledge|0
+original|mmlu:college_biology|0
+original|mmlu:college_chemistry|0
+original|mmlu:college_computer_science|0
+original|mmlu:college_mathematics|0
+original|mmlu:college_medicine|0
+original|mmlu:college_physics|0
+original|mmlu:computer_security|0
+original|mmlu:conceptual_physics|0
+original|mmlu:econometrics|0
+original|mmlu:electrical_engineering|0
+original|mmlu:elementary_mathematics|0
+original|mmlu:formal_logic|0
+original|mmlu:global_facts|0
+original|mmlu:high_school_biology|0
+original|mmlu:high_school_chemistry|0
+original|mmlu:high_school_computer_science|0
+original|mmlu:high_school_european_history|0
+original|mmlu:high_school_geography|0
+original|mmlu:high_school_government_and_politics|0
+original|mmlu:high_school_macroeconomics|0
+original|mmlu:high_school_mathematics|0
+original|mmlu:high_school_microeconomics|0
+original|mmlu:high_school_physics|0
+original|mmlu:high_school_psychology|0
+original|mmlu:high_school_statistics|0
+original|mmlu:high_school_us_history|0
+original|mmlu:high_school_world_history|0
+original|mmlu:human_aging|0
+original|mmlu:human_sexuality|0
+original|mmlu:international_law|0
+original|mmlu:jurisprudence|0
+original|mmlu:logical_fallacies|0
+original|mmlu:machine_learning|0
+original|mmlu:management|0
+original|mmlu:marketing|0
+original|mmlu:medical_genetics|0
+original|mmlu:miscellaneous|0
+original|mmlu:moral_disputes|0
+original|mmlu:moral_scenarios|0
+original|mmlu:nutrition|0
+original|mmlu:philosophy|0
+original|mmlu:prehistory|0
+original|mmlu:professional_accounting|0
+original|mmlu:professional_law|0
+original|mmlu:professional_medicine|0
+original|mmlu:professional_psychology|0
+original|mmlu:public_relations|0
+original|mmlu:security_studies|0
+original|mmlu:sociology|0
+original|mmlu:us_foreign_policy|0
+original|mmlu:virology|0
+original|mmlu:world_religions|0
+original|mmlu|0
 # ARC
-leaderboard|arc:challenge|0|0
-lighteval|arc:easy|0|0
-original|arc:c:letters|0|0
-original|arc:c:options|0|0
-original|arc:c:simple|0|0
+leaderboard|arc:challenge|0
+lighteval|arc:easy|0
+original|arc:c:letters|0
+original|arc:c:options|0
+original|arc:c:simple|0
 # HellaSwag
-helm|hellaswag|0|0
-leaderboard|hellaswag|0|0
+helm|hellaswag|0
+leaderboard|hellaswag|0
 # PIQA
-helm|piqa|0|0
-lighteval|piqa|0|0
+helm|piqa|0
+lighteval|piqa|0
 # SIQA
-helm|siqa|0|0
+helm|siqa|0
 # WinoGrande
-leaderboard|winogrande|0|0
+leaderboard|winogrande|0
 # OpenBookQA
-lighteval|openbookqa|0|0
-helm|openbookqa|0|0
+lighteval|openbookqa|0
+helm|openbookqa|0
 # TriviaQA
-lighteval|triviaqa|0|0
+lighteval|triviaqa|0
 # BoolQ
-helm|boolq:contrastset|0|0
-helm|boolq|0|0
+helm|boolq:contrastset|0
+helm|boolq|0
 # QUAC
-helm|quac|0|0
+helm|quac|0
 # GSM8K
-leaderboard|gsm8k|0|0
+leaderboard|gsm8k|0
 # MATH
-lighteval|math:algebra|0|0
-lighteval|math:counting_and_probability|0|0
-lighteval|math:geometry|0|0
-lighteval|math:intermediate_algebra|0|0
-lighteval|math:number_theory|0|0
-lighteval|math:prealgebra|0|0
-lighteval|math:precalculus|0|0
+lighteval|math:algebra|0
+lighteval|math:counting_and_probability|0
+lighteval|math:geometry|0
+lighteval|math:intermediate_algebra|0
+lighteval|math:number_theory|0
+lighteval|math:prealgebra|0
+lighteval|math:precalculus|0
 # To add: NaturalQuestions, BBH, AGIEval
diff --git a/examples/tasks/serbian_task_group/sr_all_exclusive.txt b/examples/tasks/serbian_task_group/sr_all_exclusive.txt
index 7e0ced8eb..308743d46 100644
--- a/examples/tasks/serbian_task_group/sr_all_exclusive.txt
+++ b/examples/tasks/serbian_task_group/sr_all_exclusive.txt
@@ -1,75 +1,75 @@
 # Serbian Evaluations - ARC (AI2 Reasoning Challenge)
-community|serbian_evals:arc_easy|0|0
-community|serbian_evals:arc_challenge|0|0
+community|serbian_evals:arc_easy|0
+community|serbian_evals:arc_challenge|0
 # Commonsense Reasoning
-community|serbian_evals:hellaswag|0|0
-community|serbian_evals:piqa|0|0
-community|serbian_evals:winogrande|0|0
+community|serbian_evals:hellaswag|0
+community|serbian_evals:piqa|0
+community|serbian_evals:winogrande|0
 # Serbian Evaluations - Custom/Other Task
-community|serbian_evals:oz_eval|0|0
+community|serbian_evals:oz_eval|0
 # MMLU (Miscellaneous)
-community|serbian_evals:mmlu_anatomija|0|0
-community|serbian_evals:mmlu_astronomija|0|0
-community|serbian_evals:mmlu_poslovna_etika|0|0
-community|serbian_evals:mmlu_kliničko_znanje|0|0
-community|serbian_evals:mmlu_razno|0|0
-community|serbian_evals:mmlu_elektrotehnika|0|0
+community|serbian_evals:mmlu_anatomija|0
+community|serbian_evals:mmlu_astronomija|0
+community|serbian_evals:mmlu_poslovna_etika|0
+community|serbian_evals:mmlu_kliničko_znanje|0
+community|serbian_evals:mmlu_razno|0
+community|serbian_evals:mmlu_elektrotehnika|0
 # Serbian Evaluations - ARC (AI2 Reasoning Challenge)
-community|serbian_evals:arc_easy|0|0
-community|serbian_evals:arc_challenge|0|0
+community|serbian_evals:arc_easy|0
+community|serbian_evals:arc_challenge|0
 # Commonsense Reasoning
-community|serbian_evals:hellaswag|0|0
-community|serbian_evals:piqa|0|0
-community|serbian_evals:winogrande|0|0
+community|serbian_evals:hellaswag|0
+community|serbian_evals:piqa|0
+community|serbian_evals:winogrande|0
 # Serbian Evaluations - Custom/Other Task
-community|serbian_evals:oz_eval|0|0
+community|serbian_evals:oz_eval|0
 # MMLU (Miscellaneous)
-community|serbian_evals:mmlu_anatomija|0|0
-community|serbian_evals:mmlu_astronomija|0|0
-community|serbian_evals:mmlu_poslovna_etika|0|0
-community|serbian_evals:mmlu_kliničko_znanje|0|0
-community|serbian_evals:mmlu_razno|0|0
-community|serbian_evals:mmlu_elektrotehnika|0|0
+community|serbian_evals:mmlu_anatomija|0
+community|serbian_evals:mmlu_astronomija|0
+community|serbian_evals:mmlu_poslovna_etika|0
+community|serbian_evals:mmlu_kliničko_znanje|0
+community|serbian_evals:mmlu_razno|0
+community|serbian_evals:mmlu_elektrotehnika|0
 # MMLU (Business Professional)
-community|serbian_evals:mmlu_marketing|0|0
-community|serbian_evals:mmlu_manadzment|0|0
+community|serbian_evals:mmlu_marketing|0
+community|serbian_evals:mmlu_manadzment|0
 # MMLU (College Level Tasks)
-community|serbian_evals:mmlu_fakultet_biologija|0|0
-community|serbian_evals:mmlu_fakultet_hemija|0|0
-community|serbian_evals:mmlu_fakultet_racunari|0|0
-community|serbian_evals:mmlu_fakultet_matematika|0|0
-community|serbian_evals:mmlu_fakultet_medicina|0|0
-community|serbian_evals:mmlu_fakultet_fizika|0|0
-community|serbian_evals:mmlu_sigurnost_racunara|0|0
+community|serbian_evals:mmlu_fakultet_biologija|0
+community|serbian_evals:mmlu_fakultet_hemija|0
+community|serbian_evals:mmlu_fakultet_racunari|0
+community|serbian_evals:mmlu_fakultet_matematika|0
+community|serbian_evals:mmlu_fakultet_medicina|0
+community|serbian_evals:mmlu_fakultet_fizika|0
+community|serbian_evals:mmlu_sigurnost_racunara|0
 # MMLU (Ethics, Philosophy)
-community|serbian_evals:mmlu_moralni_sporovi|0|0
-community|serbian_evals:mmlu_moralne_dileme|0|0
-community|serbian_evals:mmlu_filozofija|0|0
-community|serbian_evals:mmlu_svetska_religija|0|0
+community|serbian_evals:mmlu_moralni_sporovi|0
+community|serbian_evals:mmlu_moralne_dileme|0
+community|serbian_evals:mmlu_filozofija|0
+community|serbian_evals:mmlu_svetska_religija|0
 # MMLU (High School Level Tasks)
-community|serbian_evals:mmlu_srednja_skola_biologija|0|0
-community|serbian_evals:mmlu_srednja_skola_hemija|0|0
-community|serbian_evals:mmlu_srednja_skola_racunari|0|0
-community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0|0
-community|serbian_evals:mmlu_srednja_skola_geografija|0|0
-community|serbian_evals:mmlu_srednja_skola_matematika|0|0
-community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0|0
-community|serbian_evals:mmlu_srednja_skola_fizika|0|0
-community|serbian_evals:mmlu_srednja_skola_psihologija|0|0
-community|serbian_evals:mmlu_srednja_skola_statistika|0|0
-community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0|0
+community|serbian_evals:mmlu_srednja_skola_biologija|0
+community|serbian_evals:mmlu_srednja_skola_hemija|0
+community|serbian_evals:mmlu_srednja_skola_racunari|0
+community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0
+community|serbian_evals:mmlu_srednja_skola_geografija|0
+community|serbian_evals:mmlu_srednja_skola_matematika|0
+community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0
+community|serbian_evals:mmlu_srednja_skola_fizika|0
+community|serbian_evals:mmlu_srednja_skola_psihologija|0
+community|serbian_evals:mmlu_srednja_skola_statistika|0
+community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0
 # MMLU (Math, Logic)
-community|serbian_evals:mmlu_abstract_algebra|0|0
-community|serbian_evals:mmlu_osnovna_matematika|0|0
-community|serbian_evals:mmlu_formalna_logika|0|0
-community|serbian_evals:mmlu_konceptualna_fizika|0|0
-community|serbian_evals:mmlu_metrika_ekonomije|0|0
-community|serbian_evals:mmlu_masinsko_ucenje|0|0
+community|serbian_evals:mmlu_abstract_algebra|0
+community|serbian_evals:mmlu_osnovna_matematika|0
+community|serbian_evals:mmlu_formalna_logika|0
+community|serbian_evals:mmlu_konceptualna_fizika|0
+community|serbian_evals:mmlu_metrika_ekonomije|0
+community|serbian_evals:mmlu_masinsko_ucenje|0
 # MMLU (Social Sciences)
-community|serbian_evals:mmlu_globalne_cinjenice|0|0
-community|serbian_evals:mmlu_logicke_zablude|0|0
-community|serbian_evals:mmlu_sociologija|0|0
-community|serbian_evals:mmlu_human_aging|0|0
+community|serbian_evals:mmlu_globalne_cinjenice|0
+community|serbian_evals:mmlu_logicke_zablude|0
+community|serbian_evals:mmlu_sociologija|0
+community|serbian_evals:mmlu_human_aging|0
 # Question Answering and Knowledge
-community|serbian_evals:boolq|0|0
-community|serbian_evals:openbook|0|0
+community|serbian_evals:boolq|0
+community|serbian_evals:openbook|0
diff --git a/examples/tasks/serbian_task_group/sr_all_inclusive.txt b/examples/tasks/serbian_task_group/sr_all_inclusive.txt
index 44e9ad760..659e6a2df 100644
--- a/examples/tasks/serbian_task_group/sr_all_inclusive.txt
+++ b/examples/tasks/serbian_task_group/sr_all_inclusive.txt
@@ -1,2 +1,2 @@
 # MMLU (All-inclusive Task Entry)
-community|serbian_evals:mmlu|0|0
+community|serbian_evals:mmlu|0
diff --git a/examples/tasks/serbian_task_group/sr_arc.txt b/examples/tasks/serbian_task_group/sr_arc.txt
index 3ac8a654f..e66500be1 100644
--- a/examples/tasks/serbian_task_group/sr_arc.txt
+++ b/examples/tasks/serbian_task_group/sr_arc.txt
@@ -1,3 +1,3 @@
 # Serbian Evaluations - ARC (AI2 Reasoning Challenge)
-community|serbian_evals:arc_easy|0|0
-community|serbian_evals:arc_challenge|0|0
+community|serbian_evals:arc_easy|0
+community|serbian_evals:arc_challenge|0
diff --git a/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt b/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt
index 4012f55ce..c93a237ce 100644
--- a/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt
+++ b/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt
@@ -1,4 +1,4 @@
 # Commonsense Reasoning
-community|serbian_evals:hellaswag|0|0
-community|serbian_evals:piqa|0|0
-community|serbian_evals:winogrande|0|0
+community|serbian_evals:hellaswag|0
+community|serbian_evals:piqa|0
+community|serbian_evals:winogrande|0
diff --git a/examples/tasks/serbian_task_group/sr_custom_task.txt b/examples/tasks/serbian_task_group/sr_custom_task.txt
index c3d98830d..284161d4b 100644
--- a/examples/tasks/serbian_task_group/sr_custom_task.txt
+++ b/examples/tasks/serbian_task_group/sr_custom_task.txt
@@ -1,2 +1,2 @@
 # Serbian Evaluations - Custom/Other Task
-community|serbian_evals:oz_eval|0|0
+community|serbian_evals:oz_eval|0
diff --git a/examples/tasks/serbian_task_group/sr_misc.txt b/examples/tasks/serbian_task_group/sr_misc.txt
index adfbefaaf..13628af3e 100644
--- a/examples/tasks/serbian_task_group/sr_misc.txt
+++ b/examples/tasks/serbian_task_group/sr_misc.txt
@@ -1,7 +1,7 @@
 # MMLU (Miscellaneous)
-community|serbian_evals:mmlu_anatomija|0|0
-community|serbian_evals:mmlu_astronomija|0|0
-community|serbian_evals:mmlu_poslovna_etika|0|0
-community|serbian_evals:mmlu_kliničko_znanje|0|0
-community|serbian_evals:mmlu_razno|0|0
-community|serbian_evals:mmlu_elektrotehnika|0|0
+community|serbian_evals:mmlu_anatomija|0
+community|serbian_evals:mmlu_astronomija|0
+community|serbian_evals:mmlu_poslovna_etika|0
+community|serbian_evals:mmlu_kliničko_znanje|0
+community|serbian_evals:mmlu_razno|0
+community|serbian_evals:mmlu_elektrotehnika|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt b/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt
index 1afedb8a9..f091fc15a 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt
@@ -1,3 +1,3 @@
 # MMLU (Business Professional)
-community|serbian_evals:mmlu_marketing|0|0
-community|serbian_evals:mmlu_manadzment|0|0
+community|serbian_evals:mmlu_marketing|0
+community|serbian_evals:mmlu_manadzment|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt b/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt
index 099db7de7..23533d56c 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt
@@ -1,8 +1,8 @@
 # MMLU (College Level Tasks)
-community|serbian_evals:mmlu_fakultet_biologija|0|0
-community|serbian_evals:mmlu_fakultet_hemija|0|0
-community|serbian_evals:mmlu_fakultet_racunari|0|0
-community|serbian_evals:mmlu_fakultet_matematika|0|0
-community|serbian_evals:mmlu_fakultet_medicina|0|0
-community|serbian_evals:mmlu_fakultet_fizika|0|0
-community|serbian_evals:mmlu_sigurnost_racunara|0|0
+community|serbian_evals:mmlu_fakultet_biologija|0
+community|serbian_evals:mmlu_fakultet_hemija|0
+community|serbian_evals:mmlu_fakultet_racunari|0
+community|serbian_evals:mmlu_fakultet_matematika|0
+community|serbian_evals:mmlu_fakultet_medicina|0
+community|serbian_evals:mmlu_fakultet_fizika|0
+community|serbian_evals:mmlu_sigurnost_racunara|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt b/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt
index 91abbd2f5..466b1fc74 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt
@@ -1,5 +1,5 @@
 # MMLU (Ethics, Philosophy)
-community|serbian_evals:mmlu_moralni_sporovi|0|0
-community|serbian_evals:mmlu_moralne_dileme|0|0
-community|serbian_evals:mmlu_filozofija|0|0
-community|serbian_evals:mmlu_svetska_religija|0|0
+community|serbian_evals:mmlu_moralni_sporovi|0
+community|serbian_evals:mmlu_moralne_dileme|0
+community|serbian_evals:mmlu_filozofija|0
+community|serbian_evals:mmlu_svetska_religija|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt b/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt
index 8f11e22a9..407a702c0 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt
@@ -1,12 +1,12 @@
 # MMLU (High School Level Tasks)
-community|serbian_evals:mmlu_srednja_skola_biologija|0|0
-community|serbian_evals:mmlu_srednja_skola_hemija|0|0
-community|serbian_evals:mmlu_srednja_skola_racunari|0|0
-community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0|0
-community|serbian_evals:mmlu_srednja_skola_geografija|0|0
-community|serbian_evals:mmlu_srednja_skola_matematika|0|0
-community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0|0
-community|serbian_evals:mmlu_srednja_skola_fizika|0|0
-community|serbian_evals:mmlu_srednja_skola_psihologija|0|0
-community|serbian_evals:mmlu_srednja_skola_statistika|0|0
-community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0|0
+community|serbian_evals:mmlu_srednja_skola_biologija|0
+community|serbian_evals:mmlu_srednja_skola_hemija|0
+community|serbian_evals:mmlu_srednja_skola_racunari|0
+community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0
+community|serbian_evals:mmlu_srednja_skola_geografija|0
+community|serbian_evals:mmlu_srednja_skola_matematika|0
+community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0
+community|serbian_evals:mmlu_srednja_skola_fizika|0
+community|serbian_evals:mmlu_srednja_skola_psihologija|0
+community|serbian_evals:mmlu_srednja_skola_statistika|0
+community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt b/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt
index 40f6caa5f..c3348e312 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt
@@ -1,7 +1,7 @@
 # MMLU (Math, Logic)
-community|serbian_evals:mmlu_abstract_algebra|0|0
-community|serbian_evals:mmlu_osnovna_matematika|0|0
-community|serbian_evals:mmlu_formalna_logika|0|0
-community|serbian_evals:mmlu_konceptualna_fizika|0|0
-community|serbian_evals:mmlu_metrika_ekonomije|0|0
-community|serbian_evals:mmlu_masinsko_ucenje|0|0
+community|serbian_evals:mmlu_abstract_algebra|0
+community|serbian_evals:mmlu_osnovna_matematika|0
+community|serbian_evals:mmlu_formalna_logika|0
+community|serbian_evals:mmlu_konceptualna_fizika|0
+community|serbian_evals:mmlu_metrika_ekonomije|0
+community|serbian_evals:mmlu_masinsko_ucenje|0
diff --git a/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt b/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt
index 8ee92e844..1501fc3dd 100644
--- a/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt
+++ b/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt
@@ -1,5 +1,5 @@
 # MMLU (Social Sciences)
-community|serbian_evals:mmlu_globalne_cinjenice|0|0
-community|serbian_evals:mmlu_logicke_zablude|0|0
-community|serbian_evals:mmlu_sociologija|0|0
-community|serbian_evals:mmlu_human_aging|0|0
+community|serbian_evals:mmlu_globalne_cinjenice|0
+community|serbian_evals:mmlu_logicke_zablude|0
+community|serbian_evals:mmlu_sociologija|0
+community|serbian_evals:mmlu_human_aging|0
diff --git a/examples/tasks/serbian_task_group/sr_qa_knowledge.txt b/examples/tasks/serbian_task_group/sr_qa_knowledge.txt
index cdda84ea6..de4b00211 100644
--- a/examples/tasks/serbian_task_group/sr_qa_knowledge.txt
+++ b/examples/tasks/serbian_task_group/sr_qa_knowledge.txt
@@ -1,3 +1,3 @@
 # Question Answering and Knowledge
-community|serbian_evals:boolq|0|0
-community|serbian_evals:openbook|0|0
+community|serbian_evals:boolq|0
+community|serbian_evals:openbook|0
diff --git a/examples/test_tasks.txt b/examples/test_tasks.txt
index 7666c79e4..12c8662a9 100644
--- a/examples/test_tasks.txt
+++ b/examples/test_tasks.txt
@@ -1,27 +1,27 @@
-leaderboard|arc:challenge|25|0
-leaderboard|truthfulqa:mc|0|0
-leaderboard|hellaswag|10|0
-leaderboard|mmlu:college_chemistry|5|0
-leaderboard|mmlu:us_foreign_policy|5|0
-lighteval|agieval:aqua-rat|0|0
-lighteval|agieval:logiqa-en|0|0
-lighteval|agieval:lsat-ar|0|0
-lighteval|agieval:lsat-lr|0|0
-lighteval|agieval:lsat-rc|0|0
-lighteval|agieval:sat-en-without-passage|0|0
-lighteval|agieval:sat-en|0|0
-lighteval|bigbench:causal_judgment|3|0
-lighteval|bigbench:date_understanding|3|0
-lighteval|bigbench:disambiguation_qa|3|0
-lighteval|bigbench:geometric_shapes|3|0
-lighteval|bigbench:logical_deduction_five_objects|3|0
-lighteval|bigbench:logical_deduction_seven_objects|3|0
-lighteval|bigbench:movie_recommendation|3|0
-lighteval|bigbench:navigate|3|0
-lighteval|bigbench:ruin_names|3|0
-lighteval|bigbench:salient_translation_error_detection|3|0
-lighteval|bigbench:snarks|3|0
-lighteval|bigbench:temporal_sequences|3|0
-lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0
-lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0
-test|gsm8k|0|1
+leaderboard|arc:challenge|25
+leaderboard|truthfulqa:mc|0
+leaderboard|hellaswag|10
+leaderboard|mmlu:college_chemistry|5
+leaderboard|mmlu:us_foreign_policy|5
+lighteval|agieval:aqua-rat|0
+lighteval|agieval:logiqa-en|0
+lighteval|agieval:lsat-ar|0
+lighteval|agieval:lsat-lr|0
+lighteval|agieval:lsat-rc|0
+lighteval|agieval:sat-en-without-passage|0
+lighteval|agieval:sat-en|0
+lighteval|bigbench:causal_judgment|3
+lighteval|bigbench:date_understanding|3
+lighteval|bigbench:disambiguation_qa|3
+lighteval|bigbench:geometric_shapes|3
+lighteval|bigbench:logical_deduction_five_objects|3
+lighteval|bigbench:logical_deduction_seven_objects|3
+lighteval|bigbench:movie_recommendation|3
+lighteval|bigbench:navigate|3
+lighteval|bigbench:ruin_names|3
+lighteval|bigbench:salient_translation_error_detection|3
+lighteval|bigbench:snarks|3
+lighteval|bigbench:temporal_sequences|3
+lighteval|bigbench:tracking_shuffled_objects_five_objects|3
+lighteval|bigbench:tracking_shuffled_objects_seven_objects|3
+test|gsm8k|0
diff --git a/pyproject.toml b/pyproject.toml
index 04da22e55..97a1745d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -103,7 +103,7 @@ docs = ["hf-doc-builder", "watchdog"]
 extended_tasks = [
     "langdetect", # ifeval
     "openai>1.87", # llm as a judge using openai models
-    "tiktoken"
+    "tiktoken",
 ]
 s3 = ["s3fs"]
 multilingual = [
diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py
index 5d82d3c38..055670657 100644
--- a/src/lighteval/logging/info_loggers.py
+++ b/src/lighteval/logging/info_loggers.py
@@ -169,11 +169,6 @@ class CompiledDetail:
         non_truncated (int): Total number of samples which did not need prompt truncation to fit the model context size for the current task.
         padded (int): Total number of samples which needed padding during the batching step for the current task.
         non_padded (int): Total number of samples which did not need padding during the batching step for the current task.
-        effective_few_shots (float): Average effective few shots across all samples for the current task.
-            effective few shot is the number of few shots actually used to fit the prompt in the model context
-            length while allowing model generation of the expected size.
-        num_truncated_few_shots (int): Total number of samples which required truncated prompts to fit the model size for the current task.
-
     """

     hashes: dict = field(default_factory=dict)
@@ -181,8 +176,6 @@ class CompiledDetail:
     non_truncated: int = 0
     padded: int = 0
    non_padded: int = 0
-    effective_few_shots: float = 0
-    num_truncated_few_shots: int = 0

 @dataclass
 class CompiledDetailOverAllTasks:
@@ -196,11 +189,6 @@ class CompiledDetailOverAllTasks:
         non_truncated (int): Total number of samples which did not need prompt truncation to fit the model context size across all tasks
         padded (int): Number of samples which needed padding during the batching step across all tasks.
         non_padded (int): Number of samples which did not need padding during the batching step across all tasks.
-        effective_few_shots (float): Average effective few shots across all samples across all tasks.
-            effective few shot is the number of few shots actually used to fit the prompt in the model context
-            length while allowing model generation of the expected size.
-        num_truncated_few_shots (int): Number of samples which required truncated prompts to fit the model size across all tasks.
-
     """

     hashes: dict = field(default_factory=dict)
@@ -208,7 +196,6 @@ class CompiledDetailOverAllTasks:
     non_truncated: int = 0
     padded: int = 0
     non_padded: int = 0
-    num_truncated_few_shots: int = 0

 @dataclass
 class Hash:
diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py
index 7d4d34248..2768a8a4d 100644
--- a/src/lighteval/main_baseline.py
+++ b/src/lighteval/main_baseline.py
@@ -51,15 +51,13 @@ def baseline(
     This baseline computation may not be suitable for all task types and should be used with caution.
""" from lighteval.logging.evaluation_tracker import EvaluationTracker - from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig + from lighteval.tasks.lighteval_task import LightevalTask from lighteval.tasks.registry import Registry from lighteval.tasks.requests import SamplingMethod from lighteval.utils.utils import as_list - registry = Registry(custom_tasks=custom_tasks) - - task_configs: list[LightevalTaskConfig] = registry.get_tasks_configs(tasks) - tasks_dict: dict[str, LightevalTask] = registry.get_tasks_from_configs(task_configs) + registry = Registry(tasks=tasks, custom_tasks=custom_tasks) + tasks_dict: dict[str, LightevalTask] = registry.load_tasks() evaluation_tracker = EvaluationTracker( output_dir=output_dir, diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py index 706dd1a06..0f3cd3df1 100644 --- a/src/lighteval/main_tasks.py +++ b/src/lighteval/main_tasks.py @@ -46,13 +46,12 @@ def inspect( from rich import print - from lighteval.tasks.registry import Registry, taskinfo_selector + from lighteval.tasks.registry import Registry - registry = Registry(custom_tasks=custom_tasks) + registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True) # Loading task - task_names_list, _ = taskinfo_selector(tasks, task_registry=registry) - task_dict = registry.get_task_dict(task_names_list) + task_dict = registry.load_tasks() for name, task in task_dict.items(): print("-" * 10, name, "-" * 10) if show_config: @@ -80,7 +79,7 @@ def list( """ from lighteval.tasks.registry import Registry - registry = Registry(custom_tasks=custom_tasks) + registry = Registry(custom_tasks=custom_tasks, load_community=True, load_extended=True, load_multilingual=True) registry.print_all_tasks(suites=suites) diff --git a/src/lighteval/models/transformers/transformers_model.py b/src/lighteval/models/transformers/transformers_model.py index db2b68bd1..9645904fa 100644 --- a/src/lighteval/models/transformers/transformers_model.py +++ b/src/lighteval/models/transformers/transformers_model.py @@ -799,6 +799,8 @@ def _generate_padded( output_logits=returns_logits, renormalize_logits=True, ) + if num_samples == 1 and generation_config["temperature"] == 0: + generation_config["do_sample"] = False if num_samples > 1 and generation_config["temperature"] == 0: logger.warning("num_samples > 1 but temperature is set to 0, this will not sample different outputs.") diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 4cf1dbee2..71b1efd4a 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -39,7 +39,7 @@ from lighteval.models.model_output import ( ModelResponse, ) -from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig +from lighteval.tasks.lighteval_task import LightevalTask from lighteval.tasks.registry import Registry from lighteval.tasks.requests import SamplingMethod from lighteval.utils.imports import ( @@ -241,13 +241,10 @@ def _init_tasks_and_requests(self, tasks: str): logger.info("--- LOADING TASKS ---") # The registry contains all the potential tasks - registry = Registry( - custom_tasks=self.pipeline_parameters.custom_tasks_directory, - ) + registry = Registry(tasks=tasks, custom_tasks=self.pipeline_parameters.custom_tasks_directory) - # load the tasks fro the configs and their datasets - task_configs: list[LightevalTaskConfig] = registry.get_tasks_configs(tasks) - self.tasks_dict: dict[str, LightevalTask] = registry.get_tasks_from_configs(task_configs) + # load 
the tasks from the configs and their datasets + self.tasks_dict: dict[str, LightevalTask] = registry.load_tasks() LightevalTask.load_datasets(self.tasks_dict, self.pipeline_parameters.dataset_loading_processes) self.documents_dict = { task.full_name: task.get_docs(self.pipeline_parameters.max_samples) for _, task in self.tasks_dict.items() diff --git a/src/lighteval/tasks/extended/ifeval/instructions.py b/src/lighteval/tasks/extended/ifeval/instructions.py index ee9e7b88b..6e84b6ef4 100644 --- a/src/lighteval/tasks/extended/ifeval/instructions.py +++ b/src/lighteval/tasks/extended/ifeval/instructions.py @@ -20,7 +20,6 @@ import random import re import string -from typing import Dict, Optional, Sequence, Union import langdetect @@ -29,8 +28,6 @@ logger = logging.getLogger(__name__) -_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] - _LANGUAGES = instructions_util.LANGUAGE_CODES # The relational operation for comparison. diff --git a/src/lighteval/tasks/extended/ifeval/instructions_utils.py b/src/lighteval/tasks/extended/ifeval/instructions_utils.py index 7d995e42f..63e7a9231 100644 --- a/src/lighteval/tasks/extended/ifeval/instructions_utils.py +++ b/src/lighteval/tasks/extended/ifeval/instructions_utils.py @@ -1669,6 +1669,16 @@ def _get_sentence_tokenizer(): return nltk.data.load("nltk:tokenizers/punkt/english.pickle") +def count_stopwords(text): + """Counts the number of stopwords.""" + nltk.download("stopwords") + stopwords = nltk.corpus.stopwords.words("english") + tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") + tokens = tokenizer.tokenize(text) + num_stopwords = len([t for t in tokens if t.lower() in stopwords]) + return num_stopwords + + def count_sentences(text): """Count the number of sentences.""" tokenizer = _get_sentence_tokenizer() diff --git a/src/lighteval/tasks/extended/lcb/main.py b/src/lighteval/tasks/extended/lcb/main.py index ad49235fb..571f24787 100644 --- a/src/lighteval/tasks/extended/lcb/main.py +++ b/src/lighteval/tasks/extended/lcb/main.py @@ -22,11 +22,11 @@ """Usage: lighteval vllm \ "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.6,top_p:0.95}" \ - "extended|lcb:codegeneration|0|0" + "extended|lcb:codegeneration|0" lighteval vllm \ "pretrained=Qwen/Qwen2.5-Coder-3B-Instruct,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.2,top_p:0.95}" \ - "extended|lcb:codegeneration|0|0" + "extended|lcb:codegeneration|0" """ import json diff --git a/src/lighteval/tasks/extended/tiny_benchmarks/main.py b/src/lighteval/tasks/extended/tiny_benchmarks/main.py index bf65ac530..88dcfb95e 100644 --- a/src/lighteval/tasks/extended/tiny_benchmarks/main.py +++ b/src/lighteval/tasks/extended/tiny_benchmarks/main.py @@ -24,7 +24,7 @@ """ See https://github.com/felipemaiapolo/tinyBenchmarks/ for the original code. 
-Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0|0,extended|tiny:gsm8k|0|0,extended|tiny:hellaswag|0|0,extended|tiny:arc|0|0,extended|tiny:truthfulqa|0|0" --extended_tasks extended_tasks --output_dir "./evals"` +Test with `python run_evals_accelerate.py --model_args "pretrained=EleutherAI/pythia-70m" --tasks "extended|tiny:winogrande|0,extended|tiny:gsm8k|0,extended|tiny:hellaswag|0,extended|tiny:arc|0,extended|tiny:truthfulqa|0" --extended_tasks extended_tasks --output_dir "./evals"` """ import os diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index d040925dd..8488755b9 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -100,7 +100,6 @@ class LightevalTaskConfig: must_remove_duplicate_docs: bool = False num_fewshots: int = 0 - truncate_fewshots: bool = False version: int = 0 @@ -114,7 +113,7 @@ def __post_init__(self): self.evaluation_splits = tuple(self.evaluation_splits) self.suite = tuple(self.suite) self.stop_sequence = self.stop_sequence if self.stop_sequence is not None else () - self.full_name = f"{self.name}|{self.num_fewshots}" + self.full_name = f"{self.name}|{self.num_fewshots}" # todo clefourrier: this is likely incorrect def print(self): md_writer = MarkdownTableWriter() diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 0a91c5554..13ba3d00b 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -36,7 +36,12 @@ import lighteval.tasks.default_tasks as default_tasks from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig -from lighteval.utils.imports import CANNOT_USE_EXTENDED_TASKS_MSG, can_load_extended_tasks +from lighteval.utils.imports import ( + CANNOT_USE_EXTENDED_TASKS_MSG, + CANNOT_USE_MULTILINGUAL_TASKS_MSG, + can_load_extended_tasks, + can_load_multilingual_tasks, +) # Import community tasks @@ -104,15 +109,20 @@ def load_community_tasks(): DEFAULT_SUITES = CORE_SUITES + OPTIONAL_SUITES -TRUNCATE_FEW_SHOTS_DEFAULTS = True - class Registry: """ The Registry class is used to manage the task registry and get task classes. """ - def __init__(self, custom_tasks: str | Path | ModuleType | None = None): + def __init__( + self, + tasks: str | Path | None = None, + custom_tasks: str | Path | ModuleType | None = None, + load_community: bool = False, + load_extended: bool = False, + load_multilingual: bool = False, + ): """ Initialize the Registry class. Registry is responsible for holding a dict of task and their config, initializing a LightevalTask instance when asked. @@ -131,51 +141,86 @@ def __init__(self, custom_tasks: str | Path | ModuleType | None = None): """ self._custom_tasks = custom_tasks - def get_tasks_from_configs(self, task_configs: list[LightevalTaskConfig]) -> dict[str, LightevalTask]: - return {f"{config.full_name}": LightevalTask(config=config) for config in task_configs} + if tasks is None: + logger.warning( + "You passed no task name. This should only occur if you are using the CLI to inspect tasks." 
+            )
+            self.tasks_list = []
+        else:
+            self.tasks_list = self._get_full_task_list_from_input_string(tasks)
 
+        # These parameters are dynamically set from the provided task names by `_activate_loading_of_optional_suite`,
+        # except in the `tasks` CLI command, which displays the full list
+        self._load_community = load_community
+        self._load_extended = load_extended
+        self._load_multilingual = load_multilingual
+        self._activate_loading_of_optional_suite()  # we dynamically set the loading parameters
 
-    def get_tasks_configs(self, task: str) -> list[LightevalTaskConfig]:
-        """
-        task is a string of the form "suite|task|few_shot|truncate_few_shots,suite|task|few_shot|truncate_few_shots"
+        # We then load the full task registry
+        self._task_registry = self._load_full_registry()
 
-        returns a LightevalTaskConfig object based on the task name and fewshot and truncate_few_shots values.
-        """
-        task_to_params = self.taskinfo_selector(task)
-        configs = []
+        self.task_to_configs = self._update_task_configs()
 
-        for task_name, task_param in task_to_params.items():
-            # We can have multiple versions of the same task running (for ex, different few shots, different metric params, etc)
-            for subtask_param in task_param:
-                config = self.task_registry.get(task_name)
-                if config is None:
-                    raise ValueError(f"Cannot find task {task_name} in task list or in custom task registry")
+    def _get_full_task_list_from_input_string(self, tasks: str | Path) -> list[str]:
+        """Converts an input string (either a path to a file containing a list of tasks, or a string of comma-separated tasks) into an actual list of task names"""
+        if os.path.exists(tasks):
+            with open(tasks, "r") as f:
+                tasks_list = [line.strip() for line in f if line.strip() and not line.startswith("#")]
+        else:
+            tasks_list = tasks.split(",")
 
-                config = copy.deepcopy(config)
-                config.num_fewshots = subtask_param["fewshots"]
-                config.truncate_fewshots = subtask_param["truncate_fewshots"]
-                config.full_name = f"{task_name}|{config.num_fewshots}"
-                # If some tasks are parametrizable and in cli, we set attributes here
-                for metric in [m for m in config.metrics if "@" in m.metric_name]:  # parametrizable metric
-                    for attribute, value in subtask_param["metric_params"].items():
-                        setattr(metric.sample_level_fn, attribute, value)
-                    required = getattr(metric.sample_level_fn, "attribute_must_be_set", [])
-                    for attribute in required:
-                        if getattr(metric.sample_level_fn, attribute) is None:
-                            raise ValueError(
-                                f"Metric {metric.metric_name} for task {task_name} "
-                                f"was not correctly parametrized. Forgot to set '{attribute}'."
-                            )
 
+        # We might have tasks provided as task groups in the custom tasks
+        # We load the whole task_groups mapping
+        if self._custom_tasks is None:
+            task_groups = {}
+        else:
+            custom_tasks_module = Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks)
+            tasks_group_dict = {}
+            if hasattr(custom_tasks_module, "TASKS_GROUPS"):
+                tasks_group_dict = custom_tasks_module.TASKS_GROUPS
 
-            configs.append(config)
+            # We should allow defining task groups as comma-separated strings or lists of tasks
+            task_groups = {k: v if isinstance(v, list) else v.split(",") for k, v in tasks_group_dict.items()}
 
-        return configs
+        # Then link actual task_group to task list if needed
+        # (At this point the strings are either task name/superset name or group names)
+        expanded_tasks_list: list[str] = []
+        for maybe_task_group in tasks_list:
+            # We either expand the group (in case it's a group name), or we keep it as is (in case it's a task name or superset name)
+            expanded_tasks = task_groups.get(maybe_task_group, [maybe_task_group])
+            if len(expanded_tasks) > 1:
+                logger.info(f"Expanding task group {maybe_task_group} to {expanded_tasks}")
+            expanded_tasks_list.extend(expanded_tasks)
 
-    @property
-    @lru_cache
-    def task_registry(self) -> dict[str, LightevalTaskConfig]:
+        # We remove exact duplicates
+        expanded_tasks_list = list(set(expanded_tasks_list))
+
+        return expanded_tasks_list
+
+    def _activate_loading_of_optional_suite(self) -> None:
+        """Dynamically selects which of the optional suites we want to load."""
+        suites = {task.split("|")[0] for task in self.tasks_list}
+
+        for suite_name in suites:
+            if suite_name not in DEFAULT_SUITES:
+                logger.warning(
+                    f"Suite {suite_name} unknown. This is not normal, unless you are testing the addition of new evaluations."
+                )
+
+        if "extended" in suites:
+            if not can_load_extended_tasks():
+                raise ImportError(CANNOT_USE_EXTENDED_TASKS_MSG)
+            self._load_extended = True
+        if "multilingual" in suites:
+            if not can_load_multilingual_tasks():
+                raise ImportError(CANNOT_USE_MULTILINGUAL_TASKS_MSG)
+            self._load_multilingual = True
+        if "community" in suites:
+            self._load_community = True
+
+    def _load_full_registry(self) -> dict[str, LightevalTaskConfig]:
         """
         Returns:
-            dict[str, LazyLightevalTask]: A dictionary mapping task names (suite|task) to their corresponding LightevalTask classes.
+            dict[str, LightevalTaskConfig]: A dictionary mapping task names (suite|task) to their corresponding LightevalTaskConfig objects.
         Example:
         {
@@ -188,30 +233,27 @@ def task_registry(self) -> dict[str, LightevalTaskConfig]:
         if self._custom_tasks is not None:
             custom_tasks_module.append(Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks))
 
-        if can_load_extended_tasks():
+
+        # Need to load extended tasks
+        if self._load_extended:
             for extended_task_module in AVAILABLE_EXTENDED_TASKS_MODULES:
                 custom_tasks_module.append(extended_task_module)
         else:
             logger.warning(CANNOT_USE_EXTENDED_TASKS_MSG)
 
-        # Load community tasks
-        community_modules = load_community_tasks()
-        for community_task_module in community_modules:
-            custom_tasks_module.append(community_task_module)
+        # Need to load community tasks
+        if self._load_community:
+            community_modules = load_community_tasks()
+            for community_task_module in community_modules:
+                custom_tasks_module.append(community_task_module)
 
-        # Load multilingual tasks
-        MULTILINGUAL_TASKS_AVAILABLE = False
-        multilingual_tasks = None
-        try:
+        # Need to load multilingual tasks
+        if self._load_multilingual:
             import lighteval.tasks.multilingual.tasks as multilingual_tasks
 
-            MULTILINGUAL_TASKS_AVAILABLE = True
-        except ImportError as e:
-            logger.warning(f"Could not load multilingual tasks: {e}. You may need to install additional dependencies.")
-
-        if MULTILINGUAL_TASKS_AVAILABLE and multilingual_tasks is not None:
             custom_tasks_module.append(multilingual_tasks)
 
+        # We load all the gathered task modules
         for module in custom_tasks_module:
             custom_task_configs.extend(module.TASKS_TABLE)
             logger.info(f"Found {len(module.TASKS_TABLE)} custom tasks in {module.__file__}")
@@ -230,84 +272,72 @@ def task_registry(self) -> dict[str, LightevalTaskConfig]:
 
         return {**default_tasks_registry, **custom_tasks_registry}
 
-    def taskinfo_selector(self, tasks: str) -> dict[str, list[dict]]:
+    def _update_task_configs(self) -> dict[str, list[LightevalTaskConfig]]:  # noqa: C901
         """
-        Converts a input string of tasks name to task information usable by lighteval.
-
-        Args:
-            tasks (str): A string containing a comma-separated list of tasks definitions in the
-                format: "task_definition", where it can be
-                containing a list of tasks.
-                where task_definition can be:
-                - path to a file containing a list of tasks (one per line)
-                - task group defined in TASKS_GROUPS dict in custom tasks file
-                - task name with few shot in format "suite|task|few_shot|truncate_few_shots"
-                - task superset in format "suite|task_superset|few_shot|truncate_few_shots" (superset will run all tasks with format "suite|task_superset:{subset}|few_shot|truncate_few_shots")
-
-
-        Returns:
-            tuple[list[str], dict[str, list[tuple[int, bool]]]]: A tuple containing:
-                - A sorted list of unique task names in the format "suite|task".
-                - A dictionary mapping each task name to a list of tuples representing the few_shot and truncate_few_shots values.
+        Updates each config depending on the input tasks (we override all provided params, like the few-shot number, sampling params, etc.)
         """
-        task_to_params = collections.defaultdict(list)
+        task_to_configs = collections.defaultdict(list)
 
-        # We can provide a path to a file with a list of tasks or a string of comma-separated tasks
-        if os.path.exists(tasks):
-            with open(tasks, "r") as f:
-                tasks_list = [line.strip() for line in f if line.strip() and not line.startswith("#")]
-        else:
-            tasks_list = tasks.split(",")
-
-        # At this point the strings are either task name/superset name or group names
-        # Here we deal with group names and map them to corresponding tasks
-        expanded_tasks_list: list[str] = []
-        for maybe_task_group in tasks_list:
-            # We either expand the group (in case it's a group name), or we keep it as is (in case it's a task name or superset name)
-            expanded_tasks = self.task_groups_dict.get(maybe_task_group, [maybe_task_group])
-            if len(expanded_tasks) > 1:
-                logger.info(f"Expanding task group {maybe_task_group} to {expanded_tasks}")
-            expanded_tasks_list.extend(expanded_tasks)
-
-        for task in expanded_tasks_list:
+        # We map all tasks to their parameters
+        for task in self.tasks_list:
             metric_params_dict = {}
             try:
-                suite_name, task_name, few_shot, truncate_few_shots = tuple(task.split("|"))
+                if task.count("|") == 3:
+                    logger.warning(
+                        "Deprecation warning: You provided 4 arguments in your task name, but we no longer support the `truncate_fewshot` option. We will ignore the parameter for now, but it will become an error in a couple of versions, so you should change your task name to `suite|task|num_fewshot`."
+                    )
+                    suite_name, task_name, few_shot, _ = tuple(task.split("|"))
+                else:
+                    suite_name, task_name, few_shot = tuple(task.split("|"))
                 if "@" in task_name:
-                    task_name, metric_params = task_name.split("@")
-                    # We convert k:v,k2:v2 to {"k": "v", "k2": "v2"}, then to correct type
-                    metric_params_dict = dict(item.split("=") for item in metric_params.split(",") if item)
+                    split_task_name = task_name.split("@")
+                    task_name, metric_params = split_task_name[0], split_task_name[1:]
+                    # We convert each k=v to {"k": "v"}, then to the correct type
+                    metric_params_dict = dict(item.split("=") for item in metric_params if item)
+
                     metric_params_dict = {k: ast.literal_eval(v) for k, v in metric_params_dict.items()}
+
                 few_shot = int(few_shot)
-                truncate_few_shots = int(truncate_few_shots)
             except ValueError:
-                raise ValueError(
-                    f"Cannot get task info from {task}. correct format is suite|task|few_shot|truncate_few_shots"
-                )
+                raise ValueError(f"Cannot get task info from {task}. Correct format is suite|task|few_shot")
 
-            if truncate_few_shots not in [0, 1]:
-                raise ValueError(f"TruncateFewShots must be 0 or 1, got {truncate_few_shots}")
+            # This adds support for task supersets (eg: mmlu -> all the mmlu tasks)
+            for expanded_task in self._expand_task_definition(f"{suite_name}|{task_name}"):
+                # todo: we likely want this expansion at the task-list setup step, not here
 
-            truncate_few_shots = bool(truncate_few_shots)
-            few_shot = int(few_shot)
+                # We load each config
+                config = self._task_registry.get(expanded_task)
+                if config is None:
+                    raise ValueError(f"Cannot find task {expanded_task} in task list or in custom task registry")
 
-            if suite_name not in DEFAULT_SUITES:
-                logger.warning(
-                    f"Suite {suite_name} unknown. This is not normal, unless you are testing adding new evaluations."
-                )
+                config = copy.deepcopy(config)
+                config.num_fewshots = few_shot
+                config.full_name = f"{expanded_task}|{config.num_fewshots}"
+                # If some tasks are parametrizable from the CLI, we set the attributes here
+                for metric in [m for m in config.metrics if "@" in m.metric_name]:  # parametrizable metric
+                    for attribute, value in metric_params_dict.items():
+                        setattr(metric.sample_level_fn, attribute, value)
+                    required = getattr(metric.sample_level_fn, "attribute_must_be_set", [])
+                    for attribute in required:
+                        if getattr(metric.sample_level_fn, attribute) is None:
+                            raise ValueError(
+                                f"Metric {metric.metric_name} for task {expanded_task} "
+                                f"was not correctly parametrized. Forgot to set '{attribute}'."
+                            )
 
-            # This adds support for task supersets (eg: mmlu -> all the mmlu tasks)
-            for expanded_task in self.expand_task_definition(f"{suite_name}|{task_name}"):
-                # Store few_shot info for each task name (suite|task)
-                task_to_params[expanded_task].append(
-                    {
-                        "fewshots": few_shot,
-                        "truncate_fewshots": truncate_few_shots,
-                        "metric_params": metric_params_dict,
-                    }
-                )
+                task_to_configs[expanded_task].append(config)
+
+        return task_to_configs
 
-        return task_to_params
+    def load_tasks(self) -> dict[str, LightevalTask]:
+        if len(self.task_to_configs) == 0:  # we're in the CLI to inspect tasks, so we return all tasks
+            return {f"{config.full_name}": LightevalTask(config=config) for config in self._task_registry.values()}
+
+        # We return only the tasks of interest
+        return {
+            f"{config.full_name}": LightevalTask(config=config)
+            for configs in self.task_to_configs.values()
+            for config in configs
+        }
 
     @property
     @lru_cache
@@ -323,34 +353,11 @@ def _task_superset_dict(self):
         """
         # Note: sorted before groupby is important as the python implementation of groupby does not
         # behave like sql groupby. For more info see the docs of itertools.groupby
-        superset_dict = {k: list(v) for k, v in groupby(sorted(self.task_registry.keys()), lambda x: x.split(":")[0])}
+        superset_dict = {k: list(v) for k, v in groupby(sorted(self._task_registry.keys()), lambda x: x.split(":")[0])}
         # Only consider supersets with more than one task
         return {k: v for k, v in superset_dict.items() if len(v) > 1}
 
-    @property
-    @lru_cache
-    def task_groups_dict(self) -> dict[str, list[str]]:
-        """
-        Returns:
-            dict[str, list[str]]: A dictionary where keys are task group names and values are lists of task names (suite|task).
-
-        Example:
-        {
-            "all_custom": ["custom|task1", "custom|task2", "custom|task3"],
-            "group1": ["custom|task1", "custom|task2"],
-        }
-        """
-        if self._custom_tasks is None:
-            return {}
-        custom_tasks_module = Registry.create_custom_tasks_module(custom_tasks=self._custom_tasks)
-        tasks_group_dict = {}
-        if hasattr(custom_tasks_module, "TASKS_GROUPS"):
-            tasks_group_dict = custom_tasks_module.TASKS_GROUPS
-
-        # We should allow defining task groups as comma-separated strings or lists of tasks
-        return {k: v if isinstance(v, list) else v.split(",") for k, v in tasks_group_dict.items()}
-
-    def expand_task_definition(self, task_definition: str):
+    def _expand_task_definition(self, task_definition: str):
         """
         Args:
             task_definition (str): Task definition to expand. In format:
@@ -368,6 +375,55 @@ def expand_task_definition(self, task_definition: str):
 
         # Then it must be a single task
         return [task_definition]
 
+    @staticmethod
+    def create_custom_tasks_module(custom_tasks: str | Path | ModuleType) -> ModuleType:
+        """Creates a custom task module to load tasks defined by the user in their own file.
+
+        Args:
+            custom_tasks (str | Path | ModuleType): Path to the custom tasks file, the name of a module to import containing custom tasks, or the module itself
+
+        Returns:
+            ModuleType: The newly imported/created custom tasks module
+        """
+        if isinstance(custom_tasks, ModuleType):
+            return custom_tasks
+        if isinstance(custom_tasks, (str, Path)) and os.path.exists(custom_tasks):
+            module_name = os.path.splitext(os.path.basename(custom_tasks))[0]
+            spec = importlib.util.spec_from_file_location(module_name, custom_tasks)
+
+            if spec is None:
+                raise ValueError(f"Cannot find module {module_name} at {custom_tasks}")
+
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+            return module
+        if isinstance(custom_tasks, (str, Path)):
+            return importlib.import_module(str(custom_tasks))
+
+    @staticmethod
+    def create_task_config_dict(meta_table: list[LightevalTaskConfig] | None = None) -> dict[str, LightevalTaskConfig]:
+        """
+        Create task configurations based on the provided meta_table.
+
+        Args:
+            meta_table: meta_table containing the tasks
+                configurations. If not provided, it will be loaded from TABLE_PATH.
+
+        Returns:
+            Dict[str, LightevalTaskConfig]: A dictionary of task names mapped to their corresponding LightevalTaskConfig.
+        """
+
+        if meta_table is None:
+            meta_table = [config for config in vars(default_tasks).values() if isinstance(config, LightevalTaskConfig)]
+
+        tasks_with_config: dict[str, LightevalTaskConfig] = {}
+        for config in meta_table:
+            for suite in config.suite:
+                if suite in DEFAULT_SUITES:
+                    tasks_with_config[f"{suite}|{config.name}"] = config
+
+        return tasks_with_config
+
     def print_all_tasks(self, suites: str | None = None):
         """
         Print all the tasks in the task registry.
@@ -399,7 +455,7 @@ def print_all_tasks(self, suites: str | None = None):
             requested_suites.remove("multilingual")
 
         # Get all tasks and filter by requested suites
-        all_tasks = list(self.task_registry.keys())
+        all_tasks = list(self._task_registry.keys())
         tasks_names = [task for task in all_tasks if task.split("|")[0] in requested_suites]
 
         # Ensure all requested suites are present (even if empty)
@@ -428,59 +484,3 @@ def print_all_tasks(self, suites: str | None = None):
 
         # Print summary
         total_tasks = len([t for t in tasks_names if t.split("|")[1]])
         print(f"\nTotal tasks displayed: {total_tasks}")
-
-    @staticmethod
-    def create_custom_tasks_module(custom_tasks: str | Path | ModuleType) -> ModuleType:
-        """Creates a custom task module to load tasks defined by the user in their own file.
-
-        Args:
-            custom_tasks (Optional[Union[str, ModuleType]]): Path to the custom tasks file or name of a module to import containing custom tasks or the module itself
-
-        Returns:
-            ModuleType: The newly imported/created custom tasks modules
-        """
-        if isinstance(custom_tasks, ModuleType):
-            return custom_tasks
-        if isinstance(custom_tasks, (str, Path)) and os.path.exists(custom_tasks):
-            module_name = os.path.splitext(os.path.basename(custom_tasks))[0]
-            spec = importlib.util.spec_from_file_location(module_name, custom_tasks)
-
-            if spec is None:
-                raise ValueError(f"Cannot find module {module_name} at {custom_tasks}")
-
-            module = importlib.util.module_from_spec(spec)
-            spec.loader.exec_module(module)
-            return module
-        if isinstance(custom_tasks, (str, Path)):
-            return importlib.import_module(str(custom_tasks))
-
-    @staticmethod
-    def create_task_config_dict(meta_table: list[LightevalTaskConfig] | None = None) -> dict[str, LightevalTaskConfig]:
-        """
-        Create configuration tasks based on the provided meta_table.
-
-        Args:
-            meta_table: meta_table containing tasks
-                configurations. If not provided, it will be loaded from TABLE_PATH.
-            cache_dir: Directory to store cached data. If not
-                provided, the default cache directory will be used.
-
-        Returns:
-            Dict[str, LightevalTask]: A dictionary of task names mapped to their corresponding LightevalTask classes.
-        """
-
-        if meta_table is None:
-            meta_table = [config for config in vars(default_tasks).values() if isinstance(config, LightevalTaskConfig)]
-
-        tasks_with_config: dict[str, LightevalTaskConfig] = {}
-        for config in meta_table:
-            if not any(suite in config.suite for suite in DEFAULT_SUITES):
-                logger.warning(
-                    f"This evaluation is not in any known suite: {config.name} is in {config.suite}, not in {DEFAULT_SUITES}. Skipping."
-                )
-                continue
-            for suite in config.suite:
-                if suite in DEFAULT_SUITES:
-                    tasks_with_config[f"{suite}|{config.name}"] = config
-
-        return tasks_with_config
diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py
index 8829510b2..30da2adea 100644
--- a/src/lighteval/tasks/requests.py
+++ b/src/lighteval/tasks/requests.py
@@ -101,12 +101,6 @@ class Doc:
         Name of the task or benchmark this Doc belongs to.
 
     ## Few-shot Learning Parameters
-    num_asked_few_shots (int):
-        Number of few-shot examples requested for this instance.
-
-    num_effective_few_shots (int):
-        Actual number of few-shot examples used (may differ from requested).
-
     fewshot_samples (list):
         List of Doc objects representing few-shot examples. These examples
         are prepended to the main query to provide context.
@@ -212,8 +206,6 @@ class Doc:
     task_name: str = ""
 
     # Fewshots parameters
-    num_asked_few_shots: int = 0
-    num_effective_few_shots: int = 0
     fewshot_samples: list = field(default_factory=list)
     sampling_methods: list[SamplingMethod] = field(default_factory=list)
     fewshot_sorting_class: str | None = None  # class to use to select balanced few-shot samples
diff --git a/src/lighteval/utils/imports.py b/src/lighteval/utils/imports.py
index 182027099..2534cb52a 100644
--- a/src/lighteval/utils/imports.py
+++ b/src/lighteval/utils/imports.py
@@ -109,6 +109,18 @@ def can_load_extended_tasks() -> bool:
 CANNOT_USE_EXTENDED_TASKS_MSG = "If you want to use extended_tasks, make sure you installed their dependencies using `pip install -e .[extended_tasks]`."
 
 
+def can_load_multilingual_tasks() -> bool:
+    try:
+        import lighteval.tasks.multilingual.tasks  # noqa: F401
+
+        return True
+    except ImportError:
+        return False
+
+
+CANNOT_USE_MULTILINGUAL_TASKS_MSG = "If you want to use multilingual tasks, make sure you installed their dependencies using `pip install -e .[multilingual]`."
+
+
 def can_load_spacy_tokenizer(language: str) -> bool:
     imports = []
     packages = ["spacy", "stanza"]
diff --git a/tests/logging/test_evaluation_tracker.py b/tests/logging/test_evaluation_tracker.py
index ba4517245..45c5790d0 100644
--- a/tests/logging/test_evaluation_tracker.py
+++ b/tests/logging/test_evaluation_tracker.py
@@ -445,7 +445,6 @@ def test_default_property_with_different_model_configs(self):
                 "non_truncated": 0,
                 "padded": 0,
                 "non_padded": 0,
-                "num_truncated_few_shots": 0,
             },
         )
 
diff --git a/tests/metrics/test_metric_requests.py b/tests/metrics/test_metric_requests.py
index 7ceb94c68..e7f9ee473 100644
--- a/tests/metrics/test_metric_requests.py
+++ b/tests/metrics/test_metric_requests.py
@@ -43,7 +43,7 @@ def dummy_prompt_fc(line, task_name: str = ""):
 
 
 def get_pmi_task(metrics: list[Metric]):
-    return LightevalTaskConfig(
+    config = LightevalTaskConfig(
         name="pmi_test_task",
         metrics=metrics,
         suite=["test"],
@@ -52,6 +52,10 @@ def get_pmi_task(metrics: list[Metric]):
         hf_subset=xstory_cloze_en_lighteval.hf_subset,
         evaluation_splits=xstory_cloze_en_lighteval.evaluation_splits,
     )
+    # The full name is edited manually here when updating the config, and set in the post-init function
+    # - we need a more homogeneous system for naming...
+    config.full_name = "test|pmi_test_task|0"
+    return config
 
 
 def test_pmi_request():
@@ -72,9 +76,10 @@ def test_pmi_request():
     metric = LogLikelihoodAccMetric(normalization=LogProbPMINorm())
     pmi_test_config = get_pmi_task(metrics=[metric])
     task = LightevalTask(pmi_test_config)
-    result = fake_evaluate_task(task, fake_model, max_samples=1)["results"]["test:pmi_test_task:0"]
+    evaluation = fake_evaluate_task(task, fake_model, max_samples=1)
+    results = evaluation["results"]["test:pmi_test_task:0"]
     # Correct choice after norm should be the second one so 0 acc
-    assert result[metric.metric_name] == 0
+    assert results[metric.metric_name] == 0
 
 
 def test_pmi_request_with_logprob_metric():
diff --git a/tests/pipeline/test_reasoning_tags.py b/tests/pipeline/test_reasoning_tags.py
index dd131e838..84dfb9e7e 100644
--- a/tests/pipeline/test_reasoning_tags.py
+++ b/tests/pipeline/test_reasoning_tags.py
@@ -61,11 +61,13 @@ def setUp(self):
             stop_sequence=["\n"],
             num_fewshots=0,
         )
+        self.input_task_name = "test|test_reasoning_task|0"
+        self.task_config_name = self.task_config.full_name
 
         # Create test documents with reasoning tags in expected responses
        self.test_docs = [
             Doc(
-                task_name="test|test_reasoning_task|0",
+                task_name=self.input_task_name,
                 query="What is 2+2?",
                 choices=["4"],
                 gold_index=[0],
@@ -77,7 +79,7 @@ def setUp(self):
         # Mock dataset
         self.mock_dataset = {"test": self.test_docs}
 
-    def _mock_task_registry(self, task_config, task_docs, responses_with_reasoning_tags):
+    def _mock_task_registry(self, input_task_name, task_config, task_docs, responses_with_reasoning_tags):
         """Create a fake registry for testing."""
 
         class FakeTask(LightevalTask):
@@ -93,14 +95,15 @@ def download_dataset_worker(task) -> None:
                 return task._docs
 
         class FakeRegistry(Registry):
-            def __init__(self, custom_tasks: Optional[Union[str, Path, ModuleType]] = None):
-                super().__init__(custom_tasks=custom_tasks)
+            def __init__(
+                self, tasks: Optional[str] = None, custom_tasks: Optional[Union[str, Path, ModuleType]] = None
+            ):
+                self.tasks_list = [input_task_name]
+                # suite_name, task_name, few_shot = input_task_name.split("|")
+                self.task_to_configs = {input_task_name: [task_config]}
 
-            def get_tasks_configs(self, task: str):
-                return [task_config]
-
-            def get_tasks_from_configs(self, tasks_configs):
-                return {f"{task_config.suite[0]}|{task_config.full_name}": FakeTask(task_config)}
+            def load_tasks(self):
+                return {input_task_name: FakeTask(config=task_config)}
 
         # Create a DummyModel that returns responses with reasoning tags
         class TestDummyModel(DummyModel):
@@ -122,7 +125,7 @@ def test_remove_reasoning_tags_enabled(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -144,7 +147,7 @@ def test_remove_reasoning_tags_enabled(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
-            tasks="test|test_reasoning_task|0|0",
+            tasks="test|test_reasoning_task|0",
             pipeline_parameters=pipeline_params,
             evaluation_tracker=evaluation_tracker,
             model=model,
@@ -168,7 +171,7 @@ def test_remove_reasoning_tags_enabled_tags_as_string(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -190,7 +193,7 @@ def test_remove_reasoning_tags_enabled_tags_as_string(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
-            tasks="test|test_reasoning_task|0|0",
+            tasks="test|test_reasoning_task|0",
             pipeline_parameters=pipeline_params,
             evaluation_tracker=evaluation_tracker,
             model=model,
@@ -214,7 +217,7 @@ def test_remove_reasoning_tags_enabled_default_tags(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -233,7 +236,7 @@ def test_remove_reasoning_tags_enabled_default_tags(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
-            tasks="test|test_reasoning_task|0|0",
+            tasks="test|test_reasoning_task|0",
             pipeline_parameters=pipeline_params,
             evaluation_tracker=evaluation_tracker,
             model=model,
@@ -257,7 +260,7 @@ def test_remove_reasoning_tags_disabled(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -279,7 +282,7 @@ def test_remove_reasoning_tags_disabled(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
-            tasks="test|test_reasoning_task|0|0",
+            tasks="test|test_reasoning_task|0",
             pipeline_parameters=pipeline_params,
             evaluation_tracker=evaluation_tracker,
             model=model,
@@ -303,7 +306,7 @@ def test_custom_reasoning_tags(self):
         ]
 
         FakeRegistry, TestDummyModel = self._mock_task_registry(
-            self.task_config, self.test_docs, responses_with_reasoning
+            self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning
         )
 
         # Initialize accelerator if available
@@ -325,7 +328,7 @@ def test_custom_reasoning_tags(self):
         model = TestDummyModel(DummyModelConfig(seed=42))
 
         pipeline = Pipeline(
tasks="test|test_reasoning_task|0|0", + tasks="test|test_reasoning_task|0", pipeline_parameters=pipeline_params, evaluation_tracker=evaluation_tracker, model=model, @@ -349,7 +352,7 @@ def test_multiple_reasoning_tags(self): ] FakeRegistry, TestDummyModel = self._mock_task_registry( - self.task_config, self.test_docs, responses_with_reasoning + self.input_task_name, self.task_config, self.test_docs, responses_with_reasoning ) # Initialize accelerator if available @@ -371,7 +374,7 @@ def test_multiple_reasoning_tags(self): model = TestDummyModel(DummyModelConfig(seed=42)) pipeline = Pipeline( - tasks="test|test|test_reasoning_task|0|0", + tasks="test|test|test_reasoning_task|0", pipeline_parameters=pipeline_params, evaluation_tracker=evaluation_tracker, model=model, diff --git a/tests/slow_tests/test_accelerate_vlm_model.py b/tests/slow_tests/test_accelerate_vlm_model.py index f9a8edfdc..57255758f 100644 --- a/tests/slow_tests/test_accelerate_vlm_model.py +++ b/tests/slow_tests/test_accelerate_vlm_model.py @@ -40,7 +40,7 @@ "results_file": "tests/reference_scores/Qwen2.5-VL-3B-Instruct-results-vlm.json", } ] -TASKS = "lighteval|mmmu_pro:standard-4|0|0" +TASKS = "lighteval|mmmu_pro:standard-4|0" ModelInput = Tuple[str, Callable[[], dict]] diff --git a/tests/tasks/test_registry.py b/tests/tasks/test_registry.py index caeb4e787..106708549 100644 --- a/tests/tasks/test_registry.py +++ b/tests/tasks/test_registry.py @@ -39,8 +39,8 @@ ] TASKS_GROUPS = { - "zero_and_one": "custom|test_task_revision|0|0,custom|test_task_revision|1|0", - "all_mmlu": "original|mmlu|3|0", + "zero_and_one": "custom|test_task_revision|0,custom|test_task_revision|1", + "all_mmlu": "original|mmlu|3", } @@ -48,123 +48,122 @@ def test_custom_task_groups(): """ Tests that task info selector correctly handles custom task groups. """ - registry = Registry(custom_tasks="tests.tasks.test_registry") - task_info = registry.taskinfo_selector("zero_and_one") + registry = Registry(tasks="zero_and_one", custom_tasks="tests.tasks.test_registry") - assert set(task_info.keys()) == {"custom|test_task_revision"} - assert task_info["custom|test_task_revision"] == [ - {"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}, - {"fewshots": 1, "truncate_fewshots": False, "metric_params": {}}, - ] + assert set(registry.tasks_list) == {"custom|test_task_revision|0", "custom|test_task_revision|1"} + + assert set(registry.task_to_configs.keys()) == {"custom|test_task_revision"} + + task_info: list[LightevalTaskConfig] = registry.task_to_configs["custom|test_task_revision"] + assert {task_info[0].num_fewshots, task_info[1].num_fewshots} == {0, 1} def test_custom_tasks(): """ Tests that task info selector correctly handles custom tasks. """ - registry = Registry(custom_tasks="tests.tasks.test_registry") - task_info = registry.taskinfo_selector("custom|test_task_revision|0|0") + registry = Registry(tasks="custom|test_task_revision|0", custom_tasks="tests.tasks.test_registry") + + assert registry.tasks_list == ["custom|test_task_revision|0"] + assert set(registry.task_to_configs.keys()) == {"custom|test_task_revision"} - assert list(task_info.keys()) == ["custom|test_task_revision"] - assert task_info["custom|test_task_revision"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}] + task_info: list[LightevalTaskConfig] = registry.task_to_configs["custom|test_task_revision"] + assert task_info[0].num_fewshots == 0 def test_superset_expansion(): """ Tests that task info selector correctly handles supersets. 
""" - registry = Registry() + registry = Registry(tasks="lighteval|storycloze|0") - task_info = registry.taskinfo_selector("lighteval|storycloze|0|0") + # The task list is saved as provided by the user + assert registry.tasks_list == ["lighteval|storycloze|0"] - assert list(task_info.keys()) == ["lighteval|storycloze:2016", "lighteval|storycloze:2018"] - assert task_info["lighteval|storycloze:2016"] == [ - {"fewshots": 0, "truncate_fewshots": False, "metric_params": {}} - ] and task_info["lighteval|storycloze:2018"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {}}] + # But we expand the superset when loading the configurations + assert set(registry.task_to_configs.keys()) == {"lighteval|storycloze:2016", "lighteval|storycloze:2018"} + + for task_name in {"lighteval|storycloze:2016", "lighteval|storycloze:2018"}: + task_info: list[LightevalTaskConfig] = registry.task_to_configs[task_name] + assert task_info[0].num_fewshots == 0 def test_superset_with_subset_task(): """ Tests that task info selector correctly handles if both superset and one of subset tasks are provided. """ - registry = Registry() - - task_info = registry.taskinfo_selector("original|mmlu|3|0,original|mmlu:abstract_algebra|5|0") + registry = Registry(tasks="original|mmlu|3,original|mmlu:abstract_algebra|5") # We have all mmlu tasks - assert len(task_info.keys()) == 57 - # Since it's defined twice - assert task_info["original|mmlu:abstract_algebra"] == [ - { - "fewshots": 3, - "truncate_fewshots": False, - "metric_params": {}, - }, - {"fewshots": 5, "truncate_fewshots": False, "metric_params": {}}, - ] + assert set(registry.tasks_list) == {"original|mmlu|3", "original|mmlu:abstract_algebra|5"} + assert len(registry.task_to_configs.keys()) == 57 + + task_info: list[LightevalTaskConfig] = registry.task_to_configs["original|mmlu:abstract_algebra"] + assert {task_info[0].num_fewshots, task_info[1].num_fewshots} == {3, 5} def test_cli_sampling_params(): """ Tests task setting the sampling parameters in CLI. """ - registry = Registry() + registry_no_sampling = Registry(tasks="lighteval|math_500|0") - task_info = registry.taskinfo_selector("lighteval|math_500@k=1|0|0") + task_info_no_sampling: list[LightevalTaskConfig] = registry_no_sampling.task_to_configs["lighteval|math_500"] + # Default values + assert task_info_no_sampling[0].metrics[0].sample_level_fn.k == 1 + assert task_info_no_sampling[0].metrics[0].sample_level_fn.n == 1 - assert list(task_info.keys()) == ["lighteval|math_500"] - assert task_info["lighteval|math_500"] == [{"fewshots": 0, "truncate_fewshots": False, "metric_params": {"k": 1}}] + registry = Registry(tasks="lighteval|math_500@k=2@n=10|0") + + task_info: list[LightevalTaskConfig] = registry.task_to_configs["lighteval|math_500"] + assert task_info[0].metrics[0].sample_level_fn.k == 2 + assert task_info[0].metrics[0].sample_level_fn.n == 10 def test_cli_sampling_params_fail(): """ Tests task setting the sampling parameters in CLI failure when args are wrong. """ - registry = Registry() - # creation of object should fail with pytest.raises(ValueError): - registry.get_tasks_configs("lighteval|math_500@plop|0|0") + Registry("lighteval|math_500@plop|0") def test_task_group_expansion_with_subset_expansion(): """ Tests that task info selector correctly handles a group with task superset is provided. 
""" - registry = Registry(custom_tasks="tests.tasks.test_registry") - - task_info = registry.taskinfo_selector("all_mmlu") + registry = Registry(tasks="all_mmlu", custom_tasks="tests.tasks.test_registry") - assert len(task_info.keys()) == 57 + # We have all mmlu tasks + assert len(registry.task_to_configs.keys()) == 57 def test_invalid_task_creation(): """ Tests that tasks info registry correctly raises errors for invalid tasks """ - registry = Registry() with pytest.raises(ValueError): - registry.get_tasks_configs("custom|task_revision") + Registry(tasks="custom|task_revision") def test_task_duplicates(): """ Tests that task info selector correctly handles if duplicate tasks are provided. """ - registry = Registry() - - task_info = registry.taskinfo_selector("custom|test_task_revision|0|0,custom|test_task_revision|0|0") + registry = Registry( + tasks="custom|test_task_revision|0,custom|test_task_revision|0", custom_tasks="tests.tasks.test_registry" + ) - assert list(task_info.keys()) == ["custom|test_task_revision"] + assert list(registry.tasks_list) == ["custom|test_task_revision|0"] def test_task_creation(): """ Tests that tasks registry correctly creates tasks """ - registry = Registry() - task_config = registry.get_tasks_configs("lighteval|storycloze:2016|0|0") - task = registry.get_tasks_from_configs(task_config)["lighteval|storycloze:2016|0"] + registry = Registry(tasks="lighteval|storycloze:2016|0") + task = registry.load_tasks()["lighteval|storycloze:2016|0"] assert isinstance(task, LightevalTask) assert task.name == "storycloze:2016" diff --git a/tests/utils.py b/tests/utils.py index 67714bceb..b44d27551 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -98,6 +98,7 @@ def fake_evaluate_task( # Mock the Registry.get_task_dict method task_name = f"{lighteval_task.suite[0]}|{lighteval_task.name}" + task_name_fs = f"{lighteval_task.suite[0]}|{lighteval_task.name}|{n_fewshot}" task_dict = {task_name: lighteval_task} evaluation_tracker = EvaluationTracker(output_dir="outputs") @@ -105,18 +106,12 @@ def fake_evaluate_task( # Create a mock Registry class class FakeRegistry(Registry): - def __init__(self, custom_tasks: Optional[Union[str, Path, ModuleType]] = None): - super().__init__(custom_tasks=custom_tasks) - - def get_task_dict(self, task_names: list[str]): - return task_dict - - def get_tasks_configs(self, task: str): - config = lighteval_task.config - config.num_fewshots = n_fewshot - config.truncate_fewshots = False - config.full_name = f"{task_name}|{config.num_fewshots}" - return [config] + def __init__(self, tasks: Optional[str], custom_tasks: Optional[Union[str, Path, ModuleType]] = None): + self.tasks_list = [task_name_fs] + self.task_to_configs = {task_name_fs: [lighteval_task.config]} + + def load_tasks(self): + return {task_name_fs: lighteval_task} # This is due to logger complaining we have no initialised the accelerator # It's hard to mock as it's global singleton