diff --git a/eval/chat_benchmarks/LiveBench/eval_instruct.py b/eval/chat_benchmarks/LiveBench/eval_instruct.py index 763145b..d853598 100644 --- a/eval/chat_benchmarks/LiveBench/eval_instruct.py +++ b/eval/chat_benchmarks/LiveBench/eval_instruct.py @@ -131,11 +131,6 @@ def get_question_list(self, model_name: str, release_set: set): else: raise ValueError(f"Bad question source {self.question_source}.") - # questions_all = [ - # q - # for q in questions_all - # if q[0]["livebench_removal_date"] == "" or q[0]["livebench_removal_date"] > self.release_date - # ] return questions_all def _get_model_name(self, model: LM) -> str: @@ -317,12 +312,6 @@ def evaluate_responses(self, results: Dict[str, Any]) -> Dict[str, float]: question_file, self.all_release_dates, self.question_begin, self.question_end ) - # questions = [ - # q - # for q in questions - # if q["livebench_removal_date"] == "" or q["livebench_removal_date"] > self.release_date - # ] - bench_name = os.path.dirname(question_file).replace(f"{self.data_path}/", "") output_file = f"{self.data_path}/{bench_name}/model_judgment/ground_truth_judgment.jsonl" diff --git a/eval/chat_benchmarks/LiveBench/livebench/common.py b/eval/chat_benchmarks/LiveBench/livebench/common.py index de9e319..8fcb9ea 100644 --- a/eval/chat_benchmarks/LiveBench/livebench/common.py +++ b/eval/chat_benchmarks/LiveBench/livebench/common.py @@ -647,8 +647,7 @@ def check_data(questions, model_answers, models): # check model answers for m in models: if not m in model_answers: - breakpoint() - # raise ValueError(f"Missing model answer for {m}") + raise ValueError(f"Missing model answer for {m}") m_answer = model_answers[m] for q in questions: assert q["question_id"] in m_answer, f"Missing model {m}'s answer to Question {q['question_id']}" diff --git a/eval/eval.py b/eval/eval.py index 2d55d52..3f6dc39 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -252,10 +252,10 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None: with open(args.config, "r") as file: tasks_yaml = yaml.safe_load(file) args.tasks = ",".join([t["task_name"] for t in tasks_yaml["tasks"]]) - batch_sizes_list = [t["batch_size"] for t in tasks_yaml["tasks"]] + batch_sizes_list = [int(t["batch_size"]) for t in tasks_yaml["tasks"]] args.annotator_model = tasks_yaml.get("annotator_model", args.annotator_model) else: - batch_sizes_list = [args.batch_size for _ in range(len(args.tasks.split(",")))] + batch_sizes_list = [int(args.batch_size) for _ in range(len(args.tasks.split(",")))] # Initialize evaluation tracker if args.output_path: