From de4a56d6660c480f35b488801e952cf722a56003 Mon Sep 17 00:00:00 2001
From: Max Tian
Date: Thu, 9 Nov 2023 09:54:03 -0500
Subject: [PATCH 01/19] save intermediate results

---
 bigcode_eval/evaluator.py  | 93 ++++++++++++++++++++++++++++++--------
 bigcode_eval/generation.py |  6 +--
 main.py                    |  6 +++
 3 files changed, 83 insertions(+), 22 deletions(-)

diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py
index 9307307dd..53d6c7745 100644
--- a/bigcode_eval/evaluator.py
+++ b/bigcode_eval/evaluator.py
@@ -3,6 +3,10 @@
 import os
 import warnings
 
+from typing import Any, Iterable, List
+
+from datasets import Dataset
+
 from bigcode_eval import tasks
 from bigcode_eval.generation import parallel_generations
 
@@ -24,6 +28,24 @@
 ################################################################################\
 """
 
+def chunk_list(item_list: List[Any], chunk_size: int = 32) -> List[List[Any]]:
+    """
+    Turn a list of items into a list of item chunks,
+    where each chunk has length at most `chunk_size`
+
+    Args:
+        item_list (List[Any]): a list of items to batchify
+        chunk_size (int): the size of each chunk
+
+    Returns:
+        a List[List[Any]] where each List[Any] is of at most length chunk_size
+        and the length of the list is ceiling(len(item_list)/chunk_size)
+    """
+    if chunk_size < 1:
+        raise ValueError("chunk_size must be >= 1")
+    if len(item_list) == 0:
+        raise ValueError("Must be a non-empty list")
+    return [item_list[i : i + chunk_size] for i in range(0, len(item_list), chunk_size)]
 
 class Evaluator:
     def __init__(self, accelerator, model, tokenizer, args):
@@ -38,6 +60,7 @@ def __init__(self, accelerator, model, tokenizer, args):
         # code evaluation permission
         self.allow_code_execution = args.allow_code_execution
 
+    # TODO (Max): add in the passed list of generations to start from an intermediate checkpoint
     def generate_text(self, task_name):
         task = tasks.get_task(task_name, self.args)
         dataset = task.get_dataset()
@@ -52,15 +75,40 @@ def generate_text(self, task_name):
             solutions = [[ref] for ref in references]
             return solutions, references
 
-        generations = parallel_generations(
-            task,
-            dataset,
-            self.accelerator,
-            self.model,
-            self.tokenizer,
-            n_tasks=n_tasks,
-            args=self.args,
-        )
+        generations = []
+
+        # TODO (Max): if intermediate generations file is passed
+        # Then append all the generations from that task to `generations`
+        # and only chunk data from generations onward
+        # Note: ALSO want to change `parallel_generations` so that we don't use self.args.limit_start if curr_iter isn't 0
+        # chunk data for saving intermediate generations and references
+        chunk_size = self.args.save_every_k_samples if self.args.save_every_k_samples >= 1 else len(references)
+        dataset_chunks = chunk_list(dataset, chunk_size)
+
+        intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}-intermediate.json"
+
+        for iter, data_chunk in enumerate(dataset_chunks):
+            generation_chunk = parallel_generations(
+                task,
+                Dataset.from_dict(data_chunk),
+                self.accelerator,
+                self.model,
+                self.tokenizer,
+                n_tasks=n_tasks,
+                args=self.args,
+                curr_iter=iter,  # Note: this is because we manually change limit_start to 0 if curr_iter > 0
+            )
+            generations.extend(generation_chunk)
+
+            # save intermediate results
+            if self.accelerator.is_main_process:
+                self.save_json_files(
+                    generations,
+                    references[:len(generations)],
+                    intermediate_save_generations_path,
+                    "references-intermediate.json"
+                )
+
         if len(generations[0]) > self.args.n_samples:
             generations = [l[: self.args.n_samples] for l in generations]
             warnings.warn(
@@ -77,16 +125,7 @@ def evaluate(self, task_name):
 
         if self.accelerator.is_main_process:
             if not self.args.load_generations_path:
-                if self.args.save_generations:
-                    with open(self.args.save_generations_path, "w") as fp:
-                        json.dump(generations, fp)
-                        print(
-                            f"generations were saved at {self.args.save_generations_path}"
-                        )
-                if self.args.save_references:
-                    with open("references.json", "w") as fp:
-                        json.dump(references, fp)
-                        print("references were saved at references.json")
+                self.save_json_files(generations, references, self.args.save_generations_path, "references.json")
 
         # make sure tokenizer plays nice with multiprocessing
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -95,3 +134,19 @@ def evaluate(self, task_name):
             print("Evaluating generations...")
         results = task.process_results(generations, references)
         return results
+
+    def save_json_files(
+        self,
+        generations: List[str],
+        references: List[str],
+        save_generations_path: str,
+        save_references_path: str,
+    ) -> None:
+        if self.args.save_generations:
+            with open(save_generations_path, "w") as fp:
+                json.dump(generations, fp)
+                print(f"generations were saved at {save_generations_path}")
+        if self.args.save_references:
+            with open(save_references_path, "w") as fp:
+                json.dump(references, fp)
+                print(f"references were saved at {save_references_path}")
diff --git a/bigcode_eval/generation.py b/bigcode_eval/generation.py
index bf2a24301..a1e8a9e13 100644
--- a/bigcode_eval/generation.py
+++ b/bigcode_eval/generation.py
@@ -37,7 +37,7 @@ def __call__(self, input_ids, scores, **kwargs):
         return input_ids.shape[1] > int(self.input_length * self.multiplier)
 
 
-def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args):
+def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args, curr_iter):
     if args.load_generations_path:
         # load generated code
         with open(args.load_generations_path) as fp:
@@ -100,7 +100,7 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,
             tokenizer,
             num_devices=accelerator.state.num_processes,
             max_length=args.max_length_generation,
-            limit_start=args.limit_start,
+            limit_start=args.limit_start if curr_iter == 0 else 0,
             n_tasks=n_tasks,
             n_copies=n_copies,
             prefix=args.prefix,
@@ -131,7 +131,7 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,
         tokenizer,
         ds_loader,
         n_tasks=n_tasks,
-        limit_start=args.limit_start,
+        limit_start=args.limit_start,
+        limit_start=args.limit_start if curr_iter == 0 else 0,
         batch_size=args.batch_size,
         prefix=args.prefix,
         instruction_tokens=instruction_tokens,
diff --git a/main.py b/main.py
index d1d16ec16..804c333a8 100644
--- a/main.py
+++ b/main.py
@@ -120,6 +120,12 @@ def parse_args():
         default=0,
         help="Optional offset to start from when limiting the number of samples",
     )
+    parser.add_argument(
+        "--save_every_k_samples",
+        type=int,
+        default=-1,
+        help="Optional saving after every k samples",
+    )
     parser.add_argument(
         "--postprocess",
         action="store_false",
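
The `chunk_list` helper added in PATCH 01 is self-contained and easy to sanity-check outside the harness. A minimal sketch (stdlib only; the sample data is made up):

```python
# Standalone check of chunk_list from PATCH 01: it returns
# ceiling(len(item_list) / chunk_size) chunks, each of at most chunk_size items.
from math import ceil
from typing import Any, List


def chunk_list(item_list: List[Any], chunk_size: int = 32) -> List[List[Any]]:
    if chunk_size < 1:
        raise ValueError("chunk_size must be >= 1")
    if len(item_list) == 0:
        raise ValueError("Must be a non-empty list")
    return [item_list[i : i + chunk_size] for i in range(0, len(item_list), chunk_size)]


items = list(range(10))
chunks = chunk_list(items, chunk_size=4)
assert chunks == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
assert len(chunks) == ceil(len(items) / 4)
```
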
From 1667834cd71997814d655349bec7f0f6b36343ed Mon Sep 17 00:00:00 2001
From: Max Tian
Date: Thu, 9 Nov 2023 10:16:04 -0500
Subject: [PATCH 02/19] fix indexing issue w/ generate code

---
 bigcode_eval/evaluator.py  | 2 ++
 bigcode_eval/generation.py | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py
index 53d6c7745..19cef924c 100644
--- a/bigcode_eval/evaluator.py
+++ b/bigcode_eval/evaluator.py
@@ -88,6 +88,7 @@ def generate_text(self, task_name):
intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}-intermediate.json" for iter, data_chunk in enumerate(dataset_chunks): + curr_sample_idx = len(generations) generation_chunk = parallel_generations( task, Dataset.from_dict(data_chunk), @@ -97,6 +98,7 @@ def generate_text(self, task_name): n_tasks=n_tasks, args=self.args, curr_iter=iter, # Note: this is because we manually change limit_start to 0 if curr_iter > 0 + curr_sample_idx=curr_sample_idx, # curr_sample_idx will be used in `complete_code` so we don't mess up indexing during post-process ) generations.extend(generation_chunk) diff --git a/bigcode_eval/generation.py b/bigcode_eval/generation.py index a1e8a9e13..5d9a7ae5e 100644 --- a/bigcode_eval/generation.py +++ b/bigcode_eval/generation.py @@ -37,7 +37,7 @@ def __call__(self, input_ids, scores, **kwargs): return input_ids.shape[1] > int(self.input_length * self.multiplier) -def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args, curr_iter): +def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args, curr_iter, curr_sample_idx): if args.load_generations_path: # load generated code with open(args.load_generations_path) as fp: @@ -131,7 +131,7 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, tokenizer, ds_loader, n_tasks=n_tasks, - limit_start=args.limit_start if curr_iter == 0 else 0, + limit_start=args.limit_start + curr_sample_idx, batch_size=args.batch_size, prefix=args.prefix, instruction_tokens=instruction_tokens, From b35f5d74db4c9c3974b1f8df31a38de835269799 Mon Sep 17 00:00:00 2001 From: Max Tian Date: Thu, 9 Nov 2023 11:06:27 -0500 Subject: [PATCH 03/19] save gen and ref per task --- bigcode_eval/evaluator.py | 7 ++++--- main.py | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py index 19cef924c..2748f7c77 100644 --- a/bigcode_eval/evaluator.py +++ b/bigcode_eval/evaluator.py @@ -85,7 +85,7 @@ def generate_text(self, task_name): chunk_size = self.args.save_every_k_samples if self.args.save_every_k_samples >= 1 else len(references) dataset_chunks = chunk_list(dataset, chunk_size) - intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}-intermediate.json" + intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}_intermediate.json" for iter, data_chunk in enumerate(dataset_chunks): curr_sample_idx = len(generations) @@ -108,7 +108,7 @@ def generate_text(self, task_name): generations, references[:len(generations)], intermediate_save_generations_path, - "references-intermediate.json" + f"references_{task_name}_intermediate.json" ) if len(generations[0]) > self.args.n_samples: @@ -127,7 +127,8 @@ def evaluate(self, task_name): if self.accelerator.is_main_process: if not self.args.load_generations_path: - self.save_json_files(generations, references, self.args.save_generations_path, "references.json") + save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}.json" + self.save_json_files(generations, references, save_generations_path, f"references_{task_name}.json") # make sure tokenizer plays nice with multiprocessing os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/main.py b/main.py index 804c333a8..c1e2fd117 100644 --- a/main.py +++ b/main.py @@ -337,6 +337,7 @@ def main(): print("generation mode only") generations, references = 
evaluator.generate_text(task) if accelerator.is_main_process: + # TODO (Max): refactor this with evaluator.save_json_files()? with open(args.save_generations_path, "w") as fp: json.dump(generations, fp) print(f"generations were saved at {args.save_generations_path}") From e17dd8670c40c376d632cde6c58527662ed70ae6 Mon Sep 17 00:00:00 2001 From: Max Tian Date: Thu, 9 Nov 2023 13:01:58 -0500 Subject: [PATCH 04/19] save intermediate code generations --- bigcode_eval/evaluator.py | 74 ++++++++++---------------------- bigcode_eval/generation.py | 6 ++- bigcode_eval/utils.py | 86 +++++++++++++++++++++++++++++++------- main.py | 26 ++++++++---- 4 files changed, 112 insertions(+), 80 deletions(-) diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py index 2748f7c77..fb0124b4b 100644 --- a/bigcode_eval/evaluator.py +++ b/bigcode_eval/evaluator.py @@ -28,25 +28,6 @@ ################################################################################\ """ -def chunk_list(item_list: List[Any], chunk_size: int = 32) -> List[List[Any]]: - """ - Turn an list of items into a list of item chunks - Where each chunk is at most of len `chunk_size` - - Args: - item_list (List[Any]): an list of items to batchify - chunk_size (int): the size of each chunk - - Returns: - a List[List[Any]] where each List[Any] is of at most length chunk_size - and the length of the list is ceiling(len(item_list)/chunk_size) - """ - if chunk_size < 1: - raise ValueError("chunk_size must be >= 1") - if len(item_list) == 0: - raise ValueError("Must be a non-empty list") - return [item_list[i : i + chunk_size] for i in range(0, len(item_list), chunk_size)] - class Evaluator: def __init__(self, accelerator, model, tokenizer, args): self.accelerator = accelerator @@ -61,7 +42,7 @@ def __init__(self, accelerator, model, tokenizer, args): self.allow_code_execution = args.allow_code_execution # TODO (Max): add in the passed list of generations to start from an intermediate checkpoint - def generate_text(self, task_name): + def generate_text(self, task_name): # TODO (Max): pass intermediate generations file here task = tasks.get_task(task_name, self.args) dataset = task.get_dataset() # if args.limit is None, use all samples @@ -75,41 +56,28 @@ def generate_text(self, task_name): solutions = [[ref] for ref in references] return solutions, references - generations = [] - - # TODO (Max): if intermediate generations file is passed - # Then append all the generations from that task to `generations` - # and only chunk data from generations onward - # Note: ALSO want to change `parallel_generations` so that we don't use self.args.limit_start if curr_iter isn't 0 - # chunk data for saving intermediate generations and references - chunk_size = self.args.save_every_k_samples if self.args.save_every_k_samples >= 1 else len(references) - dataset_chunks = chunk_list(dataset, chunk_size) + generations = [] # list[list[str | None]] (list of a list of generations) + # Note (Max): when passing an intermediate list of generations, the len would be the same as n_tasks + # so need to subset by + # generations = [gen for gen in loaded_generations if len(gen) > 0] or [gen for gen in loaded_generations if gen] intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}_intermediate.json" - for iter, data_chunk in enumerate(dataset_chunks): - curr_sample_idx = len(generations) - generation_chunk = parallel_generations( - task, - Dataset.from_dict(data_chunk), - self.accelerator, - self.model, - self.tokenizer, - 
n_tasks=n_tasks, - args=self.args, - curr_iter=iter, # Note: this is because we manually change limit_start to 0 if curr_iter > 0 - curr_sample_idx=curr_sample_idx, # curr_sample_idx will be used in `complete_code` so we don't mess up indexing during post-process - ) - generations.extend(generation_chunk) + curr_sample_idx = len(generations) + generation_chunk = parallel_generations( + task, + dataset, + self.accelerator, + self.model, + self.tokenizer, + n_tasks=n_tasks, + args=self.args, + curr_sample_idx=curr_sample_idx, # curr_sample_idx will added to limit_start to fix indexing + save_every_k_samples = self.args.save_every_k_samples, + intermediate_save_generations_path=intermediate_save_generations_path + ) + generations.extend(generation_chunk) - # save intermediate results - if self.accelerator.is_main_process: - self.save_json_files( - generations, - references[:len(generations)], - intermediate_save_generations_path, - f"references_{task_name}_intermediate.json" - ) if len(generations[0]) > self.args.n_samples: generations = [l[: self.args.n_samples] for l in generations] @@ -118,7 +86,7 @@ def generate_text(self, task_name): ) return generations, references - def evaluate(self, task_name): + def evaluate(self, task_name): # TODO (Max): pass intermediate generations file here task = tasks.get_task(task_name, self.args) if task.requires_execution and not self.allow_code_execution: raise ValueError(_WARNING) @@ -128,7 +96,7 @@ def evaluate(self, task_name): if self.accelerator.is_main_process: if not self.args.load_generations_path: save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}.json" - self.save_json_files(generations, references, save_generations_path, f"references_{task_name}.json") + self.save_json_files(self.args, generations, references, save_generations_path, f"references_{task_name}.json") # make sure tokenizer plays nice with multiprocessing os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/bigcode_eval/generation.py b/bigcode_eval/generation.py index 5d9a7ae5e..e1a003f2f 100644 --- a/bigcode_eval/generation.py +++ b/bigcode_eval/generation.py @@ -37,7 +37,7 @@ def __call__(self, input_ids, scores, **kwargs): return input_ids.shape[1] > int(self.input_length * self.multiplier) -def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args, curr_iter, curr_sample_idx): +def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args, curr_sample_idx, save_every_k_samples, intermediate_save_generations_path): if args.load_generations_path: # load generated code with open(args.load_generations_path) as fp: @@ -100,7 +100,7 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, tokenizer, num_devices=accelerator.state.num_processes, max_length=args.max_length_generation, - limit_start=args.limit_start if curr_iter == 0 else 0, + limit_start=args.limit_start + curr_sample_idx, n_tasks=n_tasks, n_copies=n_copies, prefix=args.prefix, @@ -137,6 +137,8 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, instruction_tokens=instruction_tokens, postprocess=args.postprocess, is_wrapped=is_loaded_in_8bit or is_loaded_in_4bit, + save_every_k_samples=save_every_k_samples, + intermediate_save_generations_path=intermediate_save_generations_path, **gen_kwargs, ) return generations diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py index 5a9f82629..9259e445a 100644 --- a/bigcode_eval/utils.py +++ b/bigcode_eval/utils.py @@ -1,6 
+1,9 @@ +import json import math +import re import warnings from collections import defaultdict +from typing import List, Optional import torch from torch.utils.data import IterableDataset @@ -49,7 +52,7 @@ def __iter__(self): prompts_encoder = [] infill = [] instruction = [] - for sample in range(self.limit_start, self.limit_start+self.n_tasks): + for sample in range(self.limit_start, self.limit_start + self.n_tasks): prompt_contents = self.task.get_prompt(self.dataset[sample]) if isinstance(prompt_contents, str): # Normal code completion mode @@ -111,8 +114,6 @@ def __iter__(self): return_token_type_ids=return_token_type_ids, ) - - if self.n_copies == 1 and self.n_tasks % self.num_devices != 0: self.n_copies = 2 warnings.warn( @@ -127,7 +128,9 @@ def __iter__(self): "ids_encoder": outputs_encoder.input_ids[sample], "task_id": sample, "input_len": outputs.attention_mask[sample].sum(), - "input_len_encoder": outputs_encoder.attention_mask[sample].sum(), + "input_len_encoder": outputs_encoder.attention_mask[ + sample + ].sum(), } else: yield { @@ -231,6 +234,8 @@ def complete_code( instruction_tokens=None, postprocess=True, is_wrapped=False, + save_every_k_samples: int = -1, + intermediate_save_generations_path: Optional[str] = None, **gen_kwargs, ): """Generate multiple codes for each task in the dataset using multiple GPUs with accelerate. @@ -238,7 +243,9 @@ def complete_code( [p_0_0, p_0_1, ..., p_0_nc-1, p_1_0, ..., p_nt-1_nc-1] where nc is the number of copies of the prompt, and nt is the number of tasks. nc is such that num_samples(for each task)= nc * batch_size """ - + # keep track of the list of generated codes + # where len(code_gens) = n_tasks and len(code_gens[0]) = number of generated code samples + code_gens: List[List[Optional[str]]] = [[] for _ in range(n_tasks)] gen_token_dict = defaultdict(list) # dict of list of generated tokens for step, batch in tqdm( enumerate(dataloader), @@ -251,12 +258,14 @@ def complete_code( # Set the start_length after which to check for stopping to be the longest input ignoring padding max_len = batch["input_len"].max().item() if "ids_encoder" in batch: - max_len += 1 # Add 1 for decoder_start_token_id + max_len += 1 # Add 1 for decoder_start_token_id gen_kwargs["stopping_criteria"][0].start_length = max_len if hasattr(task, "max_length_multiplier") and task.max_length_multiplier: idx = 1 if task.stop_words else 0 - gen_kwargs["stopping_criteria"][idx].input_length = batch["input_len"].max().item() - + gen_kwargs["stopping_criteria"][idx].input_length = ( + batch["input_len"].max().item() + ) + inputs = batch["ids"][:, : batch["input_len"]] if "ids_encoder" in batch: if is_wrapped: @@ -306,7 +315,57 @@ def complete_code( for sample, generated_tokens in zip(generated_tasks, generated_tokens): gen_token_dict[sample].append(generated_tokens) - code_gens = [[] for _ in range(n_tasks)] + if save_every_k_samples >= 1 and step % save_every_k_samples == 0: + # Note (Max): + # This should be fine since we iterate over each task at a time + # so all generations per task would be complete before saving + if not intermediate_save_generations_path: + raise ValueError( + "intermediate_save_generations_path cannot be empty!" 
+ ) + + code_gens = update_code_gens( + task, + tokenizer, + limit_start, + prefix, + instruction_tokens, + postprocess, + code_gens, + gen_token_dict, + ) + with open(intermediate_save_generations_path, "w") as fp: + json.dump(code_gens, fp) + print( + f"intermediate generations were saved at {intermediate_save_generations_path}" + ) + # reset gen_token_dict + gen_token_dict = defaultdict(list) + + code_gens = update_code_gens( + task, + tokenizer, + limit_start, + prefix, + instruction_tokens, + postprocess, + code_gens, + gen_token_dict, + ) + + return code_gens + + +def update_code_gens( + task, + tokenizer, + limit_start, + prefix, + instruction_tokens, + postprocess, + code_gens, + gen_token_dict, +): for sample, generated_tokens in gen_token_dict.items(): for s in generated_tokens: if INFILL_MODE or tokenizer.eos_token in task.stop_words: @@ -315,7 +374,7 @@ def complete_code( # Treat eos token as a regular stop word not removing it from the output # If it's removed it may have the effect of removing it in the middle of a # longer generation in case a batch size > 1 is used, which will result in - # a wrong generation as it won't be used for splitting lateron + # a wrong generation as it won't be used for splitting lateron gen_code = tokenizer.decode( s, skip_special_tokens=False, clean_up_tokenization_spaces=False ) @@ -339,11 +398,6 @@ def complete_code( ) code_gens[sample].append(gen_code) - return code_gens - - -import re - def remove_after_return(code): """ @@ -361,6 +415,6 @@ def remove_after_return(code): and start_match < len(code) and code[start_match].strip() != "" ): - return code[0:start_match] + return code[0: start_match] end_last_match = end_match return code diff --git a/main.py b/main.py index c1e2fd117..293a8b42b 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,4 @@ +import os import fnmatch import json import warnings @@ -164,6 +165,12 @@ def parse_args(): action="store_true", help="Whether to save code generations", ) + parser.add_argument( + "--save_generations_intermediate_path", + type=str, + default="generations.json", + help="Path for saving the intermediate code generations", + ) parser.add_argument( "--save_generations_path", type=str, @@ -335,18 +342,19 @@ def main(): if args.generation_only: if accelerator.is_main_process: print("generation mode only") - generations, references = evaluator.generate_text(task) + generations, references = evaluator.generate_text(task) # TODO (Max): pass intermediate generations file here if accelerator.is_main_process: # TODO (Max): refactor this with evaluator.save_json_files()? 
- with open(args.save_generations_path, "w") as fp: - json.dump(generations, fp) - print(f"generations were saved at {args.save_generations_path}") - if args.save_references: - with open("references.json", "w") as fp: - json.dump(references, fp) - print("references were saved") + save_generations_path = f"{os.path.splitext(args.save_generations_path)[0]}_{task}.json" + save_references_path = f"references_{task}.json" + evaluator.save_json_files( + generations, + references, + save_generations_path, + save_references_path, + ) else: - results[task] = evaluator.evaluate(task) + results[task] = evaluator.evaluate(task) # TODO (Max): pass intermediate generations file here # Save all args to config results["config"] = vars(args) From 0bf932ef574733d4443a98725c37e80aeb8107e4 Mon Sep 17 00:00:00 2001 From: Max Tian Date: Thu, 9 Nov 2023 13:44:20 -0500 Subject: [PATCH 05/19] add intermediate generations to continue generating from --- bigcode_eval/evaluator.py | 20 ++++++++++---------- bigcode_eval/generation.py | 13 ++++++++++++- main.py | 31 ++++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py index fb0124b4b..dc324f273 100644 --- a/bigcode_eval/evaluator.py +++ b/bigcode_eval/evaluator.py @@ -42,7 +42,7 @@ def __init__(self, accelerator, model, tokenizer, args): self.allow_code_execution = args.allow_code_execution # TODO (Max): add in the passed list of generations to start from an intermediate checkpoint - def generate_text(self, task_name): # TODO (Max): pass intermediate generations file here + def generate_text(self, task_name, intermediate_generations): # TODO (Max): pass intermediate generations file here task = tasks.get_task(task_name, self.args) dataset = task.get_dataset() # if args.limit is None, use all samples @@ -58,25 +58,25 @@ def generate_text(self, task_name): # TODO (Max): pass intermediate generations generations = [] # list[list[str | None]] (list of a list of generations) # Note (Max): when passing an intermediate list of generations, the len would be the same as n_tasks - # so need to subset by - # generations = [gen for gen in loaded_generations if len(gen) > 0] or [gen for gen in loaded_generations if gen] - + # so need to subset by [gen for gen in intermediate_generations if gen] + if intermediate_generations: + generations = [gen for gen in intermediate_generations if gen] intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}_intermediate.json" + curr_sample_idx = len(generations) - curr_sample_idx = len(generations) - generation_chunk = parallel_generations( + new_generations = parallel_generations( task, dataset, self.accelerator, self.model, self.tokenizer, n_tasks=n_tasks, - args=self.args, curr_sample_idx=curr_sample_idx, # curr_sample_idx will added to limit_start to fix indexing - save_every_k_samples = self.args.save_every_k_samples, - intermediate_save_generations_path=intermediate_save_generations_path + save_every_k_samples=self.args.save_every_k_samples, + intermediate_save_generations_path=intermediate_save_generations_path, + args=self.args, ) - generations.extend(generation_chunk) + generations.extend(new_generations) if len(generations[0]) > self.args.n_samples: diff --git a/bigcode_eval/generation.py b/bigcode_eval/generation.py index e1a003f2f..28c3b7a39 100644 --- a/bigcode_eval/generation.py +++ b/bigcode_eval/generation.py @@ -37,7 +37,18 @@ def __call__(self, input_ids, scores, **kwargs): return 
input_ids.shape[1] > int(self.input_length * self.multiplier) -def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args, curr_sample_idx, save_every_k_samples, intermediate_save_generations_path): +def parallel_generations( + task, + dataset, + accelerator, + model, + tokenizer, + n_tasks, + curr_sample_idx, + save_every_k_samples, + intermediate_save_generations_path, + args, +): if args.load_generations_path: # load generated code with open(args.load_generations_path) as fp: diff --git a/main.py b/main.py index 293a8b42b..8e1f1d812 100644 --- a/main.py +++ b/main.py @@ -3,6 +3,8 @@ import json import warnings +from typing import List, Optional + import datasets import torch import transformers @@ -166,10 +168,10 @@ def parse_args(): help="Whether to save code generations", ) parser.add_argument( - "--save_generations_intermediate_path", + "--load_generations_intermediate_paths", type=str, - default="generations.json", - help="Path for saving the intermediate code generations", + nargs="*", + help="List of paths for saving the intermediate code generations", ) parser.add_argument( "--save_generations_path", @@ -338,13 +340,28 @@ def main(): evaluator = Evaluator(accelerator, model, tokenizer, args) - for task in task_names: + if ( + args.load_generations_intermediate_paths + and len(args.load_generations_intermediate_paths) != len(task_names) + ): + raise ValueError( + "If passing --load_generations_intermediate_paths, \ + must pass equal number of files as number of tasks" + ) + + for idx, task in enumerate(task_names): + intermediate_generations = None + if args.load_generations_intermediate_paths: + with open(args.load_generations_intermediate_paths[idx], "r") as f_in: + # intermediate_generations: list[list[str | None]] of len n_tasks + # where list[i] = generated codes or empty + intermediate_generations = json.load(f_in) + if args.generation_only: if accelerator.is_main_process: print("generation mode only") - generations, references = evaluator.generate_text(task) # TODO (Max): pass intermediate generations file here + generations, references = evaluator.generate_text(task, intermediate_generations) # TODO (Max): pass intermediate generations file here if accelerator.is_main_process: - # TODO (Max): refactor this with evaluator.save_json_files()? 
save_generations_path = f"{os.path.splitext(args.save_generations_path)[0]}_{task}.json" save_references_path = f"references_{task}.json" evaluator.save_json_files( @@ -354,7 +371,7 @@ def main(): save_references_path, ) else: - results[task] = evaluator.evaluate(task) # TODO (Max): pass intermediate generations file here + results[task] = evaluator.evaluate(task, intermediate_generations) # TODO (Max): pass intermediate generations file here # Save all args to config results["config"] = vars(args) From 72337293c8f49f128448b191cab90a0a1b7a063d Mon Sep 17 00:00:00 2001 From: Max Tian Date: Tue, 14 Nov 2023 13:04:15 -0500 Subject: [PATCH 06/19] fix indexing issues --- bigcode_eval/evaluator.py | 20 ++++++++------------ bigcode_eval/utils.py | 8 +++----- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py index dc324f273..af425b9ac 100644 --- a/bigcode_eval/evaluator.py +++ b/bigcode_eval/evaluator.py @@ -3,9 +3,8 @@ import os import warnings -from typing import Any, Iterable, List +from typing import List -from datasets import Dataset from bigcode_eval import tasks from bigcode_eval.generation import parallel_generations @@ -41,8 +40,7 @@ def __init__(self, accelerator, model, tokenizer, args): # code evaluation permission self.allow_code_execution = args.allow_code_execution - # TODO (Max): add in the passed list of generations to start from an intermediate checkpoint - def generate_text(self, task_name, intermediate_generations): # TODO (Max): pass intermediate generations file here + def generate_text(self, task_name, intermediate_generations): task = tasks.get_task(task_name, self.args) dataset = task.get_dataset() # if args.limit is None, use all samples @@ -56,13 +54,12 @@ def generate_text(self, task_name, intermediate_generations): # TODO (Max): pas solutions = [[ref] for ref in references] return solutions, references - generations = [] # list[list[str | None]] (list of a list of generations) - # Note (Max): when passing an intermediate list of generations, the len would be the same as n_tasks - # so need to subset by [gen for gen in intermediate_generations if gen] + generations = [] # list[list[str | None] | None] if intermediate_generations: generations = [gen for gen in intermediate_generations if gen] + n_tasks -= len(generations) intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}_intermediate.json" - curr_sample_idx = len(generations) + curr_sample_idx = len(generations) - 1 new_generations = parallel_generations( task, @@ -78,7 +75,6 @@ def generate_text(self, task_name, intermediate_generations): # TODO (Max): pas ) generations.extend(new_generations) - if len(generations[0]) > self.args.n_samples: generations = [l[: self.args.n_samples] for l in generations] warnings.warn( @@ -86,17 +82,17 @@ def generate_text(self, task_name, intermediate_generations): # TODO (Max): pas ) return generations, references - def evaluate(self, task_name): # TODO (Max): pass intermediate generations file here + def evaluate(self, task_name, intermediate_generations): task = tasks.get_task(task_name, self.args) if task.requires_execution and not self.allow_code_execution: raise ValueError(_WARNING) - generations, references = self.generate_text(task_name) + generations, references = self.generate_text(task_name, intermediate_generations) if self.accelerator.is_main_process: if not self.args.load_generations_path: save_generations_path = 
f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}.json" - self.save_json_files(self.args, generations, references, save_generations_path, f"references_{task_name}.json") + self.save_json_files(generations, references, save_generations_path, f"references_{task_name}.json") # make sure tokenizer plays nice with multiprocessing os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py index 9259e445a..394aa44f5 100644 --- a/bigcode_eval/utils.py +++ b/bigcode_eval/utils.py @@ -315,10 +315,7 @@ def complete_code( for sample, generated_tokens in zip(generated_tasks, generated_tokens): gen_token_dict[sample].append(generated_tokens) - if save_every_k_samples >= 1 and step % save_every_k_samples == 0: - # Note (Max): - # This should be fine since we iterate over each task at a time - # so all generations per task would be complete before saving + if save_every_k_samples >= 1 and (step + 1) % save_every_k_samples == 0: if not intermediate_save_generations_path: raise ValueError( "intermediate_save_generations_path cannot be empty!" @@ -339,7 +336,7 @@ def complete_code( print( f"intermediate generations were saved at {intermediate_save_generations_path}" ) - # reset gen_token_dict + # reset gen_token_dict - prevent redundant decoding gen_token_dict = defaultdict(list) code_gens = update_code_gens( @@ -397,6 +394,7 @@ def update_code_gens( "model output is not postprocessed, this might lower evaluation scores" ) code_gens[sample].append(gen_code) + return code_gens def remove_after_return(code): From cd46f9a8b4e0dbc177b208aa5d6de5469c62335e Mon Sep 17 00:00:00 2001 From: Max Tian Date: Tue, 14 Nov 2023 13:54:38 -0500 Subject: [PATCH 07/19] fix out of bounds with args.limit_start --- bigcode_eval/evaluator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py index af425b9ac..fd2b4daa2 100644 --- a/bigcode_eval/evaluator.py +++ b/bigcode_eval/evaluator.py @@ -45,6 +45,10 @@ def generate_text(self, task_name, intermediate_generations): dataset = task.get_dataset() # if args.limit is None, use all samples n_tasks = self.args.limit if self.args.limit else len(dataset) + # when args.limit is None + # adjust n_tasks by args.limit_start to prevent out of bounds issues + if not self.args.limit: + n_tasks -= self.args.limit_start references = [task.get_reference(dataset[i]) for i in range(self.args.limit_start, self.args.limit_start+n_tasks)] if self.args.check_references: @@ -59,7 +63,7 @@ def generate_text(self, task_name, intermediate_generations): generations = [gen for gen in intermediate_generations if gen] n_tasks -= len(generations) intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}_intermediate.json" - curr_sample_idx = len(generations) - 1 + curr_sample_idx = len(generations) new_generations = parallel_generations( task, From 81c7e13fac822ec72e412b3c338512e5cc8ab4e0 Mon Sep 17 00:00:00 2001 From: Max Tian Date: Tue, 14 Nov 2023 15:24:24 -0500 Subject: [PATCH 08/19] pass intermediate_generations as kwarg --- bigcode_eval/evaluator.py | 6 +++--- main.py | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py index fd2b4daa2..34e2db61b 100644 --- a/bigcode_eval/evaluator.py +++ b/bigcode_eval/evaluator.py @@ -40,7 +40,7 @@ def __init__(self, accelerator, model, tokenizer, args): # code evaluation permission self.allow_code_execution = 
args.allow_code_execution - def generate_text(self, task_name, intermediate_generations): + def generate_text(self, task_name, intermediate_generations=None): task = tasks.get_task(task_name, self.args) dataset = task.get_dataset() # if args.limit is None, use all samples @@ -86,12 +86,12 @@ def generate_text(self, task_name, intermediate_generations): ) return generations, references - def evaluate(self, task_name, intermediate_generations): + def evaluate(self, task_name, intermediate_generations=None): task = tasks.get_task(task_name, self.args) if task.requires_execution and not self.allow_code_execution: raise ValueError(_WARNING) - generations, references = self.generate_text(task_name, intermediate_generations) + generations, references = self.generate_text(task_name, intermediate_generations=intermediate_generations) if self.accelerator.is_main_process: if not self.args.load_generations_path: diff --git a/main.py b/main.py index 8e1f1d812..b9931e348 100644 --- a/main.py +++ b/main.py @@ -360,7 +360,9 @@ def main(): if args.generation_only: if accelerator.is_main_process: print("generation mode only") - generations, references = evaluator.generate_text(task, intermediate_generations) # TODO (Max): pass intermediate generations file here + generations, references = evaluator.generate_text( + task, intermediate_generations=intermediate_generations + ) if accelerator.is_main_process: save_generations_path = f"{os.path.splitext(args.save_generations_path)[0]}_{task}.json" save_references_path = f"references_{task}.json" @@ -371,7 +373,9 @@ def main(): save_references_path, ) else: - results[task] = evaluator.evaluate(task, intermediate_generations) # TODO (Max): pass intermediate generations file here + results[task] = evaluator.evaluate( + task, intermediate_generations=intermediate_generations + ) # Save all args to config results["config"] = vars(args) From f35f9a470c49cf1f9ff25aa9b71ae505804d9aa2 Mon Sep 17 00:00:00 2001 From: Max Tian Date: Wed, 15 Nov 2023 13:46:22 -0500 Subject: [PATCH 09/19] add defaults to parallel_generations --- bigcode_eval/evaluator.py | 2 +- bigcode_eval/generation.py | 8 +++++--- main.py | 2 -- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py index 34e2db61b..36927aa24 100644 --- a/bigcode_eval/evaluator.py +++ b/bigcode_eval/evaluator.py @@ -72,10 +72,10 @@ def generate_text(self, task_name, intermediate_generations=None): self.model, self.tokenizer, n_tasks=n_tasks, + args=self.args, curr_sample_idx=curr_sample_idx, # curr_sample_idx will added to limit_start to fix indexing save_every_k_samples=self.args.save_every_k_samples, intermediate_save_generations_path=intermediate_save_generations_path, - args=self.args, ) generations.extend(new_generations) diff --git a/bigcode_eval/generation.py b/bigcode_eval/generation.py index 28c3b7a39..9fa41abf3 100644 --- a/bigcode_eval/generation.py +++ b/bigcode_eval/generation.py @@ -1,6 +1,8 @@ import json from math import ceil +from typing import Optional + from accelerate.utils import set_seed from torch.utils.data.dataloader import DataLoader from transformers import StoppingCriteria, StoppingCriteriaList @@ -44,10 +46,10 @@ def parallel_generations( model, tokenizer, n_tasks, - curr_sample_idx, - save_every_k_samples, - intermediate_save_generations_path, args, + curr_sample_idx: int = 0, + save_every_k_samples: int = -1, + intermediate_save_generations_path: Optional[str] = None, ): if args.load_generations_path: # load generated code diff 
--git a/main.py b/main.py index b9931e348..6be3d307a 100644 --- a/main.py +++ b/main.py @@ -3,8 +3,6 @@ import json import warnings -from typing import List, Optional - import datasets import torch import transformers From 7af4088d5f896d730bab692dda703bd01e1def98 Mon Sep 17 00:00:00 2001 From: Max Tian Date: Wed, 15 Nov 2023 13:51:47 -0500 Subject: [PATCH 10/19] add args to test --- tests/test_generation_evaluation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_generation_evaluation.py b/tests/test_generation_evaluation.py index 7d5b3ab49..78dd7e586 100644 --- a/tests/test_generation_evaluation.py +++ b/tests/test_generation_evaluation.py @@ -81,6 +81,7 @@ def load_generation_examples(task): def test_generation(): args.generation_only = True + args.save_every_k_samples = -1 evaluator = Evaluator(accelerator, model, tokenizer, args) for task in GEN_TASKS: print(f"testing task {task}") @@ -94,6 +95,7 @@ def test_generation(): def test_evaluation(): # TODO add scores for each task args.n_samples = 2 + args.save_every_k_samples = -1 for task in EVAL_TASKS: print(f"testing task {task}") # path to generation examples to evaluate From 9f600a3c5be8444b8e82a07c83067bfe00ee9d6f Mon Sep 17 00:00:00 2001 From: Max Tian Date: Thu, 16 Nov 2023 09:29:41 -0500 Subject: [PATCH 11/19] better naming convention --- bigcode_eval/evaluator.py | 2 +- bigcode_eval/generation.py | 4 ++-- main.py | 4 ++-- tests/test_generation_evaluation.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py index 36927aa24..3bba91495 100644 --- a/bigcode_eval/evaluator.py +++ b/bigcode_eval/evaluator.py @@ -74,7 +74,7 @@ def generate_text(self, task_name, intermediate_generations=None): n_tasks=n_tasks, args=self.args, curr_sample_idx=curr_sample_idx, # curr_sample_idx will added to limit_start to fix indexing - save_every_k_samples=self.args.save_every_k_samples, + save_every_k_tasks=self.args.save_every_k_tasks, intermediate_save_generations_path=intermediate_save_generations_path, ) generations.extend(new_generations) diff --git a/bigcode_eval/generation.py b/bigcode_eval/generation.py index 9fa41abf3..25a74561d 100644 --- a/bigcode_eval/generation.py +++ b/bigcode_eval/generation.py @@ -48,7 +48,7 @@ def parallel_generations( n_tasks, args, curr_sample_idx: int = 0, - save_every_k_samples: int = -1, + save_every_k_tasks: int = -1, intermediate_save_generations_path: Optional[str] = None, ): if args.load_generations_path: @@ -150,7 +150,7 @@ def parallel_generations( instruction_tokens=instruction_tokens, postprocess=args.postprocess, is_wrapped=is_loaded_in_8bit or is_loaded_in_4bit, - save_every_k_samples=save_every_k_samples, + save_every_k_tasks=save_every_k_tasks, intermediate_save_generations_path=intermediate_save_generations_path, **gen_kwargs, ) diff --git a/main.py b/main.py index 6be3d307a..a35328f9f 100644 --- a/main.py +++ b/main.py @@ -122,10 +122,10 @@ def parse_args(): help="Optional offset to start from when limiting the number of samples", ) parser.add_argument( - "--save_every_k_samples", + "--save_every_k_tasks", type=int, default=-1, - help="Optional saving after every k samples", + help="Optional saving after every k tasks", ) parser.add_argument( "--postprocess", diff --git a/tests/test_generation_evaluation.py b/tests/test_generation_evaluation.py index 78dd7e586..42ce10a27 100644 --- a/tests/test_generation_evaluation.py +++ b/tests/test_generation_evaluation.py @@ -81,7 +81,7 @@ def 
load_generation_examples(task): def test_generation(): args.generation_only = True - args.save_every_k_samples = -1 + args.save_every_k_tasks = -1 evaluator = Evaluator(accelerator, model, tokenizer, args) for task in GEN_TASKS: print(f"testing task {task}") @@ -95,7 +95,7 @@ def test_generation(): def test_evaluation(): # TODO add scores for each task args.n_samples = 2 - args.save_every_k_samples = -1 + args.save_every_k_tasks = -1 for task in EVAL_TASKS: print(f"testing task {task}") # path to generation examples to evaluate From b661667b58d08ba9088b06ea9882340a0b25ed4c Mon Sep 17 00:00:00 2001 From: Max Tian Date: Thu, 16 Nov 2023 09:30:38 -0500 Subject: [PATCH 12/19] better naming convention --- bigcode_eval/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py index 394aa44f5..995044202 100644 --- a/bigcode_eval/utils.py +++ b/bigcode_eval/utils.py @@ -234,7 +234,7 @@ def complete_code( instruction_tokens=None, postprocess=True, is_wrapped=False, - save_every_k_samples: int = -1, + save_every_k_tasks: int = -1, intermediate_save_generations_path: Optional[str] = None, **gen_kwargs, ): @@ -315,7 +315,7 @@ def complete_code( for sample, generated_tokens in zip(generated_tasks, generated_tokens): gen_token_dict[sample].append(generated_tokens) - if save_every_k_samples >= 1 and (step + 1) % save_every_k_samples == 0: + if save_every_k_tasks >= 1 and (step + 1) % save_every_k_tasks == 0: if not intermediate_save_generations_path: raise ValueError( "intermediate_save_generations_path cannot be empty!" From b54ee659638f46c9a513a191b58b72951826823c Mon Sep 17 00:00:00 2001 From: Max Tian Date: Wed, 3 Jan 2024 11:13:35 -0500 Subject: [PATCH 13/19] fix multiple iterations of saving intermediate outputs --- bigcode_eval/evaluator.py | 1 + bigcode_eval/generation.py | 4 +++- bigcode_eval/utils.py | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py index 3bba91495..eae157197 100644 --- a/bigcode_eval/evaluator.py +++ b/bigcode_eval/evaluator.py @@ -75,6 +75,7 @@ def generate_text(self, task_name, intermediate_generations=None): args=self.args, curr_sample_idx=curr_sample_idx, # curr_sample_idx will added to limit_start to fix indexing save_every_k_tasks=self.args.save_every_k_tasks, + intermediate_generations=generations, intermediate_save_generations_path=intermediate_save_generations_path, ) generations.extend(new_generations) diff --git a/bigcode_eval/generation.py b/bigcode_eval/generation.py index 25a74561d..98e15a7be 100644 --- a/bigcode_eval/generation.py +++ b/bigcode_eval/generation.py @@ -1,7 +1,7 @@ import json from math import ceil -from typing import Optional +from typing import List, Optional from accelerate.utils import set_seed from torch.utils.data.dataloader import DataLoader @@ -49,6 +49,7 @@ def parallel_generations( args, curr_sample_idx: int = 0, save_every_k_tasks: int = -1, + intermediate_generations: Optional[List[Optional[List[Optional[str]]]]] = None, intermediate_save_generations_path: Optional[str] = None, ): if args.load_generations_path: @@ -151,6 +152,7 @@ def parallel_generations( postprocess=args.postprocess, is_wrapped=is_loaded_in_8bit or is_loaded_in_4bit, save_every_k_tasks=save_every_k_tasks, + intermediate_generations=intermediate_generations, intermediate_save_generations_path=intermediate_save_generations_path, **gen_kwargs, ) diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py index 995044202..f19437e66 
100644
--- a/bigcode_eval/utils.py
+++ b/bigcode_eval/utils.py
@@ -235,6 +235,7 @@ def complete_code(
     postprocess=True,
     is_wrapped=False,
     save_every_k_tasks: int = -1,
+    intermediate_generations: Optional[List[Optional[List[Optional[str]]]]] = None,
     intermediate_save_generations_path: Optional[str] = None,
     **gen_kwargs,
 ):
@@ -246,6 +247,7 @@ def complete_code(
     # keep track of the list of generated codes
     # where len(code_gens) = n_tasks and len(code_gens[0]) = number of generated code samples
     code_gens: List[List[Optional[str]]] = [[] for _ in range(n_tasks)]
+    intermediate_generations = [] if not intermediate_generations else intermediate_generations
     gen_token_dict = defaultdict(list)  # dict of list of generated tokens
     for step, batch in tqdm(
         enumerate(dataloader),
@@ -332,7 +334,8 @@ def complete_code(
             gen_token_dict,
         )
         with open(intermediate_save_generations_path, "w") as fp:
-            json.dump(code_gens, fp)
+            intermediate_generations.extend(code_gens)
+            json.dump(intermediate_generations, fp)
             print(
                 f"intermediate generations were saved at {intermediate_save_generations_path}"
             )

From 0754793a4ba22eaaeff2c5381a480dfc36700cbe Mon Sep 17 00:00:00 2001
From: Max Tian
Date: Wed, 3 Jan 2024 16:49:04 -0500
Subject: [PATCH 14/19] minor optimization for preventing oob errors

---
 bigcode_eval/evaluator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py
index eae157197..67895a00f 100644
--- a/bigcode_eval/evaluator.py
+++ b/bigcode_eval/evaluator.py
@@ -44,7 +44,8 @@ def generate_text(self, task_name, intermediate_generations=None):
         task = tasks.get_task(task_name, self.args)
         dataset = task.get_dataset()
         # if args.limit is None, use all samples
-        n_tasks = self.args.limit if self.args.limit else len(dataset)
+        # if args.limit is used, make sure args.limit_start + args.limit <= len(dataset)
+        n_tasks = min(self.args.limit, len(dataset) - self.args.limit_start) if self.args.limit else len(dataset)
         # when args.limit is None
         # adjust n_tasks by args.limit_start to prevent out of bounds issues
         if not self.args.limit:
             n_tasks -= self.args.limit_start
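
The clamping that PATCH 14 introduces, together with the PATCH 07 adjustment, keeps `limit_start + n_tasks` within the dataset. A minimal sketch of the arithmetic with made-up sizes:

```python
# Sketch of the n_tasks bounds logic after PATCHes 07 and 14: with a limit,
# n_tasks is capped at len(dataset) - limit_start; without one, limit_start
# is subtracted so indexing never runs past the end of the dataset.
def compute_n_tasks(dataset_len: int, limit, limit_start: int) -> int:
    if limit:
        return min(limit, dataset_len - limit_start)
    return dataset_len - limit_start


assert compute_n_tasks(100, limit=20, limit_start=0) == 20
assert compute_n_tasks(100, limit=20, limit_start=90) == 10  # clamped near the end
assert compute_n_tasks(100, limit=None, limit_start=30) == 70
```
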
From 8cffbfd282f58217a7bb1082fe815beb6374951f Mon Sep 17 00:00:00 2001
From: Max Tian
Date: Thu, 4 Jan 2024 10:26:46 -0500
Subject: [PATCH 15/19] fix duplication issues

---
 bigcode_eval/evaluator.py | 13 ++++++-------
 bigcode_eval/utils.py     |  8 +++++---
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py
index 67895a00f..fa70000b9 100644
--- a/bigcode_eval/evaluator.py
+++ b/bigcode_eval/evaluator.py
@@ -59,14 +59,14 @@ def generate_text(self, task_name, intermediate_generations=None):
             solutions = [[ref] for ref in references]
             return solutions, references
 
-        generations = []  # list[list[str | None] | None]
+        curr_generations = []  # list[list[str | None] | None]
         if intermediate_generations:
-            generations = [gen for gen in intermediate_generations if gen]
-            n_tasks -= len(generations)
+            curr_generations = [gen for gen in intermediate_generations if gen]
+            n_tasks -= len(curr_generations)
         intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}_intermediate.json"
-        curr_sample_idx = len(generations)
+        curr_sample_idx = len(curr_generations)
 
-        new_generations = parallel_generations(
+        generations = parallel_generations(
             task,
             dataset,
             self.accelerator,
             self.model,
             self.tokenizer,
             n_tasks=n_tasks,
             args=self.args,
             curr_sample_idx=curr_sample_idx,  # curr_sample_idx will be added to limit_start to fix indexing
             save_every_k_tasks=self.args.save_every_k_tasks,
-            intermediate_generations=generations,
+            intermediate_generations=curr_generations,
             intermediate_save_generations_path=intermediate_save_generations_path,
         )
-        generations.extend(new_generations)
 
         if len(generations[0]) > self.args.n_samples:
             generations = [l[: self.args.n_samples] for l in generations]
diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py
index f19437e66..c923dc2e6 100644
--- a/bigcode_eval/utils.py
+++ b/bigcode_eval/utils.py
@@ -3,6 +3,7 @@
 import re
 import warnings
 from collections import defaultdict
+from copy import deepcopy
 from typing import List, Optional
 
 import torch
@@ -334,8 +335,9 @@ def complete_code(
             gen_token_dict,
         )
         with open(intermediate_save_generations_path, "w") as fp:
-            intermediate_generations.extend(code_gens)
-            json.dump(intermediate_generations, fp)
+            intermediate_save_generations = deepcopy(intermediate_generations)
+            intermediate_save_generations.extend(code_gens)
+            json.dump(intermediate_save_generations, fp)
             print(
                 f"intermediate generations were saved at {intermediate_save_generations_path}"
             )
@@ -353,7 +355,7 @@ def complete_code(
             gen_token_dict,
         )
 
-    return code_gens
+    return intermediate_generations.extend(code_gens)

From 88fec422a3c2d033713e49d72ea33d9039b5f0bd Mon Sep 17 00:00:00 2001
From: Max Tian
Date: Thu, 4 Jan 2024 10:42:09 -0500
Subject: [PATCH 16/19] fix return for complete_code

---
 bigcode_eval/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py
index c923dc2e6..30c53d995 100644
--- a/bigcode_eval/utils.py
+++ b/bigcode_eval/utils.py
@@ -355,7 +355,8 @@ def complete_code(
             gen_token_dict,
         )
 
-    return intermediate_generations.extend(code_gens)
+    intermediate_generations.extend(code_gens)
+    return intermediate_generations

From 96eb239f814dc4fd52aa49575b2e35731dd6b546 Mon Sep 17 00:00:00 2001
From: Max Tian
Date: Thu, 4 Jan 2024 11:14:11 -0500
Subject: [PATCH 17/19] update ci yml

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7c29b8c6d..0a7b2d5b2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,7 +26,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install flake8 pytest
-          pip install transformers==4.21.1 accelerate==0.13.2 datasets==2.6.1 evaluate==0.2.2 pyext==0.7 mosestokenizer==1.0.0 "fsspec<2023.10.0"
+          pip install transformers==4.21.1 accelerate==0.13.2 datasets==2.14.6 evaluate==0.2.2 pyext==0.7 mosestokenizer==1.0.0 "fsspec<2023.10.0"
       #- name: Lint with flake8
       #  run: |
       #    flake8 .
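
Before the final renaming patches below, it is worth seeing the resume bookkeeping from PATCHes 05, 13, and 15 in one place. A minimal sketch (the intermediate file contents are hypothetical; empty lists mark tasks that still need generations):

```python
# Sketch of the resume flow: completed per-task generation lists are kept,
# the remaining task count shrinks, and curr_sample_idx offsets limit_start
# so new generations line up with the right dataset rows.
# Note: this assumes the completed tasks form a prefix of the dataset,
# which is how the intermediate file is written (tasks are saved in order).
intermediate_generations = [["def add(a, b): return a + b"], ["def one(): return 1"], [], []]
n_tasks = len(intermediate_generations)

curr_generations = [gen for gen in intermediate_generations if gen]
n_tasks -= len(curr_generations)         # 2 tasks still to generate
curr_sample_idx = len(curr_generations)  # added to args.limit_start downstream

assert (n_tasks, curr_sample_idx) == (2, 2)
```
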
From 9e86dd5bfcea4f24d2ee7f2b230de0915050dae8 Mon Sep 17 00:00:00 2001 From: Max Tian Date: Thu, 4 Jan 2024 21:11:41 -0500 Subject: [PATCH 18/19] clean up variable naming --- bigcode_eval/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py index 30c53d995..f1532856e 100644 --- a/bigcode_eval/utils.py +++ b/bigcode_eval/utils.py @@ -248,7 +248,7 @@ def complete_code( # keep track of the list of generated codes # where len(code_gens) = n_tasks and len(code_gens[0]) = number of generated code samples code_gens: List[List[Optional[str]]] = [[] for _ in range(n_tasks)] - intermediate_generations = [] if not intermediate_generations else intermediate_generations + generations = [] if not intermediate_generations else intermediate_generations gen_token_dict = defaultdict(list) # dict of list of generated tokens for step, batch in tqdm( enumerate(dataloader), @@ -335,7 +335,7 @@ def complete_code( gen_token_dict, ) with open(intermediate_save_generations_path, "w") as fp: - intermediate_save_generations = deepcopy(intermediate_generations) + intermediate_save_generations = deepcopy(generations) intermediate_save_generations.extend(code_gens) json.dump(intermediate_save_generations, fp) print( @@ -355,8 +355,8 @@ def complete_code( gen_token_dict, ) - intermediate_generations.extend(code_gens) - return intermediate_generations + generations.extend(code_gens) + return generations def update_code_gens( From 6b18f1e0bec6288dd5a99bed832e36695771973d Mon Sep 17 00:00:00 2001 From: Max Tian Date: Mon, 8 Jan 2024 10:47:03 -0500 Subject: [PATCH 19/19] remove deepcopy --- bigcode_eval/utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py index f1532856e..f787b10ed 100644 --- a/bigcode_eval/utils.py +++ b/bigcode_eval/utils.py @@ -3,7 +3,6 @@ import re import warnings from collections import defaultdict -from copy import deepcopy from typing import List, Optional import torch @@ -335,9 +334,7 @@ def complete_code( gen_token_dict, ) with open(intermediate_save_generations_path, "w") as fp: - intermediate_save_generations = deepcopy(generations) - intermediate_save_generations.extend(code_gens) - json.dump(intermediate_save_generations, fp) + json.dump(generations + code_gens, fp) print( f"intermediate generations were saved at {intermediate_save_generations_path}" )
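
Taken as a whole, the series yields the following resume workflow in main.py terms. A sketch with hypothetical task and file names (the validation message is the one added in PATCH 05; the intermediate files would come from an earlier run with --save_every_k_tasks):

```python
# Hypothetical resumed run: one intermediate JSON per task, produced by an
# earlier interrupted run, then fed back via --load_generations_intermediate_paths.
import json

task_names = ["humaneval"]                                        # hypothetical
intermediate_paths = ["generations_humaneval_intermediate.json"]  # hypothetical

if intermediate_paths and len(intermediate_paths) != len(task_names):
    raise ValueError(
        "If passing --load_generations_intermediate_paths, "
        "must pass equal number of files as number of tasks"
    )

for task, path in zip(task_names, intermediate_paths):
    with open(path, "r") as f_in:
        # list[list[str | None]] of len n_tasks, empty where generation stopped
        intermediate_generations = json.load(f_in)
    # evaluator.evaluate(task, intermediate_generations=intermediate_generations)
```
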