From b00a806fc073d035ab17ea69a9797c767b9c94cd Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Tue, 24 Oct 2023 22:24:15 -0400 Subject: [PATCH] remove --- scripts/eval/yamls/eval_gauntlet.yaml | 117 -------------------------- scripts/eval/yamls/tasks.yaml | 2 +- 2 files changed, 1 insertion(+), 118 deletions(-) diff --git a/scripts/eval/yamls/eval_gauntlet.yaml b/scripts/eval/yamls/eval_gauntlet.yaml index b361a64c38..f15768b327 100644 --- a/scripts/eval/yamls/eval_gauntlet.yaml +++ b/scripts/eval/yamls/eval_gauntlet.yaml @@ -11,19 +11,6 @@ eval_gauntlet: - reading_comprehension - safety - programming - lm_task_average: - - world_knowledge_lm_task_subscore - - commonsense_reasoning_lm_task_subscore - - language_understanding_lm_task_subscore - - symbolic_problem_solving_lm_task_subscore - - reading_comprehension_lm_task_subscore - lite_average: - - world_knowledge_lite - - commonsense_reasoning_lite - - language_understanding_lite - - symbolic_problem_solving_lite - - reading_comprehension_lite - - programming_lite categories: - name: world_knowledge benchmarks: @@ -213,107 +200,3 @@ eval_gauntlet: - name: human_eval_execution_prediction num_fewshot: 3 random_baseline: 0.0 - - name: world_knowledge_lm_task_subscore - benchmarks: - - name: jeopardy - num_fewshot: 10 - random_baseline: 0 - - name: bigbench_qa_wikidata - num_fewshot: 10 - random_baseline: 0 - - name: language_understanding_lm_task_subscore - benchmarks: - - name: lambada_openai - num_fewshot: 0 - random_baseline: 0.0 - - name: bigbench_conlang_translation - num_fewshot: 0 - random_baseline: 0.0 - - name: symbolic_problem_solving_lm_task_subscore - benchmarks: - - name: bigbench_dyck_languages - num_fewshot: 10 - random_baseline: 0 - - name: bigbench_cs_algorithms - num_fewshot: 10 - random_baseline: 0 - - name: bigbench_operators - num_fewshot: 10 - random_baseline: 0.0 - - name: bigbench_repeat_copy_logic - num_fewshot: 10 - random_baseline: 0.0 - - name: simple_arithmetic_withspaces - num_fewshot: 10 - random_baseline: 0.0 - - name: simple_arithmetic_nospaces - num_fewshot: 10 - random_baseline: 0.0 - - name: reading_comprehension_lm_task_subscore - benchmarks: - - name: pubmed_qa_labeled - num_fewshot: 10 - random_baseline: 0.0 - - name: squad - num_fewshot: 10 - random_baseline: 0 - - name: world_knowledge_lite - benchmarks: - - name: jeopardy - num_fewshot: 10 - random_baseline: 0 - - name: arc_challenge - num_fewshot: 10 - random_baseline: 0.25 - - name: commonsense_reasoning_lite - benchmarks: - - name: copa - num_fewshot: 0 - random_baseline: 0.5 - - name: piqa - num_fewshot: 10 - random_baseline: 0.5 - - name: language_understanding_lite - benchmarks: - - name: lambada_openai - num_fewshot: 0 - random_baseline: 0.0 - - name: hellaswag - num_fewshot: 10 - random_baseline: 0.25 - - name: winograd - num_fewshot: 0 - random_baseline: 0.5 - - name: symbolic_problem_solving_lite - benchmarks: - - name: bigbench_elementary_math_qa - num_fewshot: 10 - random_baseline: 0.25 - - name: bigbench_dyck_languages - num_fewshot: 10 - random_baseline: 0 - - name: bigbench_operators - num_fewshot: 10 - random_baseline: 0.0 - - name: bigbench_repeat_copy_logic - num_fewshot: 10 - random_baseline: 0.0 - - name: simple_arithmetic_withspaces - num_fewshot: 10 - random_baseline: 0.0 - - name: simple_arithmetic_nospaces - num_fewshot: 10 - random_baseline: 0.0 - - name: reading_comprehension_lite - benchmarks: - - name: pubmed_qa_labeled - num_fewshot: 10 - random_baseline: 0.0 - - name: squad - num_fewshot: 10 - random_baseline: 0 - - name: programming_lite - benchmarks: - - name: human_eval - num_fewshot: 0 - random_baseline: 0.0 diff --git a/scripts/eval/yamls/tasks.yaml b/scripts/eval/yamls/tasks.yaml index f199309bc6..e1f65f74c0 100644 --- a/scripts/eval/yamls/tasks.yaml +++ b/scripts/eval/yamls/tasks.yaml @@ -250,7 +250,7 @@ icl_tasks: icl_task_type: multiple_choice - label: winogender_mc_male - dataset_uri: eval/local_data/safety/winogender_mc_female.jsonl # ADD YOUR OWN DATASET URI + dataset_uri: eval/local_data/safety/winogender_mc_male.jsonl # ADD YOUR OWN DATASET URI num_fewshot: [10] icl_task_type: multiple_choice -