diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7c29b8c6d..0a7b2d5b2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,7 +26,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest
-        pip install transformers==4.21.1 accelerate==0.13.2 datasets==2.6.1 evaluate==0.2.2 pyext==0.7 mosestokenizer==1.0.0 "fsspec<2023.10.0"
+        pip install transformers==4.21.1 accelerate==0.13.2 datasets==2.14.6 evaluate==0.2.2 pyext==0.7 mosestokenizer==1.0.0 "fsspec<2023.10.0"
     #- name: Lint with flake8
     #  run: |
     #    flake8 .
diff --git a/bigcode_eval/evaluator.py b/bigcode_eval/evaluator.py
index b281612f4..fa70000b9 100644
--- a/bigcode_eval/evaluator.py
+++ b/bigcode_eval/evaluator.py
@@ -3,6 +3,9 @@
 import os
 import warnings
 
+from typing import List
+
+
 from bigcode_eval import tasks
 from bigcode_eval.generation import parallel_generations
 
@@ -24,7 +27,6 @@
 ################################################################################\
 """
 
-
 class Evaluator:
     def __init__(self, accelerator, model, tokenizer, args):
         self.accelerator = accelerator
@@ -38,11 +40,16 @@ def __init__(self, accelerator, model, tokenizer, args):
         # code evaluation permission
         self.allow_code_execution = args.allow_code_execution
 
-    def generate_text(self, task_name):
+    def generate_text(self, task_name, intermediate_generations=None):
         task = tasks.get_task(task_name, self.args)
         dataset = task.get_dataset()
         # if args.limit is None, use all samples
-        n_tasks = self.args.limit if self.args.limit else len(dataset)
+        # if args.limit is used, make sure args.limit_start + args.limit <= len(dataset)
+        n_tasks = min(self.args.limit, len(dataset) - self.args.limit_start) if self.args.limit else len(dataset)
+        # when args.limit is None,
+        # adjust n_tasks by args.limit_start to prevent out-of-bounds issues
+        if not self.args.limit:
+            n_tasks -= self.args.limit_start
         references = [task.get_reference(dataset[i]) for i in range(self.args.limit_start, self.args.limit_start+n_tasks)]
 
         if self.args.check_references:
@@ -52,6 +59,13 @@ def generate_text(self, task_name):
             solutions = [[ref] for ref in references]
             return solutions, references
 
+        curr_generations = []  # list[list[str | None] | None]
+        if intermediate_generations:
+            curr_generations = [gen for gen in intermediate_generations if gen]
+            n_tasks -= len(curr_generations)
+        intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}_intermediate.json"
+        curr_sample_idx = len(curr_generations)
+
         generations = parallel_generations(
             task,
             dataset,
@@ -60,7 +74,12 @@ def generate_text(self, task_name):
             self.tokenizer,
             n_tasks=n_tasks,
             args=self.args,
+            curr_sample_idx=curr_sample_idx,  # curr_sample_idx will be added to limit_start to fix indexing
+            save_every_k_tasks=self.args.save_every_k_tasks,
+            intermediate_generations=curr_generations,
+            intermediate_save_generations_path=intermediate_save_generations_path,
         )
+
         if len(generations[0]) > self.args.n_samples:
             generations = [l[: self.args.n_samples] for l in generations]
             warnings.warn(
@@ -68,25 +87,17 @@ def generate_text(self, task_name):
             )
         return generations, references
 
-    def evaluate(self, task_name):
+    def evaluate(self, task_name, intermediate_generations=None):
         task = tasks.get_task(task_name, self.args)
         if task.requires_execution and not self.allow_code_execution:
             raise ValueError(_WARNING)
 
-        generations, references = self.generate_text(task_name)
+        generations, references = self.generate_text(task_name, intermediate_generations=intermediate_generations)
 
         if self.accelerator.is_main_process:
             if not self.args.load_generations_path:
-                if self.args.save_generations:
-                    with open(self.args.save_generations_path, "w") as fp:
-                        json.dump(generations, fp)
-                        print(
-                            f"generations were saved at {self.args.save_generations_path}"
-                        )
-                if self.args.save_references:
-                    with open(self.args.save_references_path, "w") as fp:
-                        json.dump(references, fp)
-                        print(f"references were saved at {self.args.save_references_path}")
+                save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}.json"
+                self.save_json_files(generations, references, save_generations_path, f"references_{task_name}.json")
 
             # make sure tokenizer plays nice with multiprocessing
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -95,3 +106,19 @@ def evaluate(self, task_name):
             print("Evaluating generations...")
             results = task.process_results(generations, references)
             return results
+
+    def save_json_files(
+        self,
+        generations: List[List[str]],
+        references: List[str],
+        save_generations_path: str,
+        save_references_path: str,
+    ) -> None:
+        if self.args.save_generations:
+            with open(save_generations_path, "w") as fp:
+                json.dump(generations, fp)
+                print(f"generations were saved at {save_generations_path}")
+        if self.args.save_references:
+            with open(save_references_path, "w") as fp:
+                json.dump(references, fp)
+                print(f"references were saved at {save_references_path}")
diff --git a/bigcode_eval/generation.py b/bigcode_eval/generation.py
index bf2a24301..98e15a7be 100644
--- a/bigcode_eval/generation.py
+++ b/bigcode_eval/generation.py
@@ -1,6 +1,8 @@
 import json
 from math import ceil
 
+from typing import List, Optional
+
 from accelerate.utils import set_seed
 from torch.utils.data.dataloader import DataLoader
 from transformers import StoppingCriteria, StoppingCriteriaList
@@ -37,7 +39,19 @@ def __call__(self, input_ids, scores, **kwargs):
         return input_ids.shape[1] > int(self.input_length * self.multiplier)
 
 
-def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args):
+def parallel_generations(
+    task,
+    dataset,
+    accelerator,
+    model,
+    tokenizer,
+    n_tasks,
+    args,
+    curr_sample_idx: int = 0,
+    save_every_k_tasks: int = -1,
+    intermediate_generations: Optional[List[Optional[List[Optional[str]]]]] = None,
+    intermediate_save_generations_path: Optional[str] = None,
+):
     if args.load_generations_path:
         # load generated code
         with open(args.load_generations_path) as fp:
@@ -100,7 +114,7 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,
         tokenizer,
         num_devices=accelerator.state.num_processes,
         max_length=args.max_length_generation,
-        limit_start=args.limit_start,
+        limit_start=args.limit_start + curr_sample_idx,
         n_tasks=n_tasks,
         n_copies=n_copies,
         prefix=args.prefix,
@@ -131,12 +145,15 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,
         tokenizer,
         ds_loader,
         n_tasks=n_tasks,
-        limit_start=args.limit_start,
+        limit_start=args.limit_start + curr_sample_idx,
         batch_size=args.batch_size,
         prefix=args.prefix,
         instruction_tokens=instruction_tokens,
         postprocess=args.postprocess,
         is_wrapped=is_loaded_in_8bit or is_loaded_in_4bit,
+        save_every_k_tasks=save_every_k_tasks,
+        intermediate_generations=intermediate_generations,
+        intermediate_save_generations_path=intermediate_save_generations_path,
         **gen_kwargs,
     )
     return generations
diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py
index 5a9f82629..f787b10ed 100644
--- a/bigcode_eval/utils.py
+++ b/bigcode_eval/utils.py
@@ -1,6 +1,9 @@
+import json
 import math
+import re
 import warnings
 from collections import defaultdict
+from typing import List, Optional
 
 import torch
 from torch.utils.data import IterableDataset
@@ -49,7 +52,7 @@ def __iter__(self):
         prompts_encoder = []
         infill = []
         instruction = []
-        for sample in range(self.limit_start, self.limit_start+self.n_tasks):
+        for sample in range(self.limit_start, self.limit_start + self.n_tasks):
             prompt_contents = self.task.get_prompt(self.dataset[sample])
             if isinstance(prompt_contents, str):
                 # Normal code completion mode
@@ -111,8 +114,6 @@ def __iter__(self):
                 return_token_type_ids=return_token_type_ids,
             )
 
-
-
         if self.n_copies == 1 and self.n_tasks % self.num_devices != 0:
             self.n_copies = 2
             warnings.warn(
@@ -127,7 +128,9 @@ def __iter__(self):
                     "ids_encoder": outputs_encoder.input_ids[sample],
                     "task_id": sample,
                     "input_len": outputs.attention_mask[sample].sum(),
-                    "input_len_encoder": outputs_encoder.attention_mask[sample].sum(),
+                    "input_len_encoder": outputs_encoder.attention_mask[
+                        sample
+                    ].sum(),
                 }
             else:
                 yield {
@@ -231,6 +234,9 @@ def complete_code(
     instruction_tokens=None,
     postprocess=True,
     is_wrapped=False,
+    save_every_k_tasks: int = -1,
+    intermediate_generations: Optional[List[Optional[List[Optional[str]]]]] = None,
+    intermediate_save_generations_path: Optional[str] = None,
     **gen_kwargs,
 ):
     """Generate multiple codes for each task in the dataset using multiple GPUs with accelerate.
@@ -238,7 +244,10 @@ def complete_code(
     [p_0_0, p_0_1, ..., p_0_nc-1, p_1_0, ..., p_nt-1_nc-1] where nc is the number of copies of the prompt,
     and nt is the number of tasks. nc is such that num_samples(for each task)= nc * batch_size
     """
-
+    # keep track of the list of generated codes
+    # where len(code_gens) = n_tasks and len(code_gens[0]) = number of generated code samples
+    code_gens: List[List[Optional[str]]] = [[] for _ in range(n_tasks)]
+    generations = [] if not intermediate_generations else intermediate_generations
     gen_token_dict = defaultdict(list)  # dict of list of generated tokens
     for step, batch in tqdm(
         enumerate(dataloader),
@@ -251,12 +260,14 @@ def complete_code(
                 # Set the start_length after which to check for stopping to be the longest input ignoring padding
                 max_len = batch["input_len"].max().item()
                 if "ids_encoder" in batch:
-                    max_len += 1 # Add 1 for decoder_start_token_id
+                    max_len += 1  # Add 1 for decoder_start_token_id
                 gen_kwargs["stopping_criteria"][0].start_length = max_len
             if hasattr(task, "max_length_multiplier") and task.max_length_multiplier:
                 idx = 1 if task.stop_words else 0
-                gen_kwargs["stopping_criteria"][idx].input_length = batch["input_len"].max().item()
-
+                gen_kwargs["stopping_criteria"][idx].input_length = (
+                    batch["input_len"].max().item()
+                )
+
             inputs = batch["ids"][:, : batch["input_len"]]
             if "ids_encoder" in batch:
                 if is_wrapped:
@@ -306,7 +317,55 @@ def complete_code(
             for sample, generated_tokens in zip(generated_tasks, generated_tokens):
                 gen_token_dict[sample].append(generated_tokens)
 
-    code_gens = [[] for _ in range(n_tasks)]
+            if save_every_k_tasks >= 1 and (step + 1) % save_every_k_tasks == 0:
+                if not intermediate_save_generations_path:
+                    raise ValueError(
+                        "intermediate_save_generations_path cannot be empty!"
+                    )
+
+                code_gens = update_code_gens(
+                    task,
+                    tokenizer,
+                    limit_start,
+                    prefix,
+                    instruction_tokens,
+                    postprocess,
+                    code_gens,
+                    gen_token_dict,
+                )
+                with open(intermediate_save_generations_path, "w") as fp:
+                    json.dump(generations + code_gens, fp)
+                    print(
+                        f"intermediate generations were saved at {intermediate_save_generations_path}"
+                    )
+                # reset gen_token_dict - prevent redundant decoding
+                gen_token_dict = defaultdict(list)
+
+    code_gens = update_code_gens(
+        task,
+        tokenizer,
+        limit_start,
+        prefix,
+        instruction_tokens,
+        postprocess,
+        code_gens,
+        gen_token_dict,
+    )
+
+    generations.extend(code_gens)
+    return generations
+
+
+def update_code_gens(
+    task,
+    tokenizer,
+    limit_start,
+    prefix,
+    instruction_tokens,
+    postprocess,
+    code_gens,
+    gen_token_dict,
+):
     for sample, generated_tokens in gen_token_dict.items():
         for s in generated_tokens:
             if INFILL_MODE or tokenizer.eos_token in task.stop_words:
@@ -315,7 +374,7 @@ def complete_code(
                 # Treat eos token as a regular stop word not removing it from the output
                 # If it's removed it may have the effect of removing it in the middle of a
                 # longer generation in case a batch size > 1 is used, which will result in
-                # a wrong generation as it won't be used for splitting lateron
+                # a wrong generation as it won't be used for splitting later on
                 gen_code = tokenizer.decode(
                     s, skip_special_tokens=False, clean_up_tokenization_spaces=False
                 )
@@ -338,13 +397,9 @@ def complete_code(
                         "model output is not postprocessed, this might lower evaluation scores"
                     )
                 code_gens[sample].append(gen_code)
 
     return code_gens
 
-
-import re
-
-
 def remove_after_return(code):
     """
     Takes as input a code, and removes everything that is after the return.
@@ -361,6 +416,6 @@ def remove_after_return(code):
             and start_match < len(code)
             and code[start_match].strip() != ""
         ):
-            return code[0:start_match]
+            return code[0: start_match]
         end_last_match = end_match
     return code
diff --git a/main.py b/main.py
index d11cc0889..0f4757b9f 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,4 @@
+import os
 import fnmatch
 import json
 import warnings
@@ -120,6 +121,12 @@ def parse_args():
         default=0,
         help="Optional offset to start from when limiting the number of samples",
     )
+    parser.add_argument(
+        "--save_every_k_tasks",
+        type=int,
+        default=-1,
+        help="Optionally save intermediate generations after every k tasks",
+    )
     parser.add_argument(
         "--postprocess",
         action="store_false",
@@ -158,6 +165,12 @@ def parse_args():
         action="store_true",
         help="Whether to save code generations",
     )
+    parser.add_argument(
+        "--load_generations_intermediate_paths",
+        type=str,
+        nargs="*",
+        help="List of paths for loading the intermediate code generations (one per task)",
+    )
     parser.add_argument(
         "--save_generations_path",
         type=str,
@@ -331,21 +344,42 @@ def main():
 
         evaluator = Evaluator(accelerator, model, tokenizer, args)
 
-        for task in task_names:
+        if (
+            args.load_generations_intermediate_paths
+            and len(args.load_generations_intermediate_paths) != len(task_names)
+        ):
+            raise ValueError(
+                "If passing --load_generations_intermediate_paths, \
+                must pass equal number of files as number of tasks"
+            )
+
+        for idx, task in enumerate(task_names):
+            intermediate_generations = None
+            if args.load_generations_intermediate_paths:
+                with open(args.load_generations_intermediate_paths[idx], "r") as f_in:
+                    # intermediate_generations: list[list[str | None]] of len n_tasks
+                    # where list[i] = generated codes or empty
+                    intermediate_generations = json.load(f_in)
+
             if args.generation_only:
                 if accelerator.is_main_process:
                     print("generation mode only")
-                generations, references = evaluator.generate_text(task)
+                generations, references = evaluator.generate_text(
+                    task, intermediate_generations=intermediate_generations
+                )
                 if accelerator.is_main_process:
-                    with open(args.save_generations_path, "w") as fp:
-                        json.dump(generations, fp)
-                        print(f"generations were saved at {args.save_generations_path}")
-                    if args.save_references:
-                        with open(args.save_references_path, "w") as fp:
-                            json.dump(references, fp)
-                            print(f"references were saved at {args.save_references_path}")
+                    save_generations_path = f"{os.path.splitext(args.save_generations_path)[0]}_{task}.json"
+                    save_references_path = f"references_{task}.json"
+                    evaluator.save_json_files(
+                        generations,
+                        references,
+                        save_generations_path,
+                        save_references_path,
+                    )
             else:
-                results[task] = evaluator.evaluate(task)
+                results[task] = evaluator.evaluate(
+                    task, intermediate_generations=intermediate_generations
+                )
 
     # Save all args to config
     results["config"] = vars(args)
diff --git a/tests/test_generation_evaluation.py b/tests/test_generation_evaluation.py
index a6809f017..367f3c0b2 100644
--- a/tests/test_generation_evaluation.py
+++ b/tests/test_generation_evaluation.py
@@ -82,6 +82,7 @@ def load_generation_examples(task):
 
 def test_generation():
     args.generation_only = True
+    args.save_every_k_tasks = -1
     evaluator = Evaluator(accelerator, model, tokenizer, args)
     for task in GEN_TASKS:
         print(f"testing task {task}")
@@ -95,6 +96,7 @@ def test_generation():
 def test_evaluation():
     # TODO add scores for each task
     args.n_samples = 2
+    args.save_every_k_tasks = -1
     for task in EVAL_TASKS:
         print(f"testing task {task}")
         # path to generation examples to evaluate
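
Note (reviewer sketch, not part of the patch): the resume bookkeeping introduced above can be summarized in a few lines. The helper below is hypothetical (its name and signature do not exist in the codebase); it only mirrors what Evaluator.generate_text now does with intermediate_generations: non-empty entries count as already-finished tasks, the remaining task count shrinks accordingly, and generation resumes at limit_start + curr_sample_idx. With --save_every_k_tasks set, complete_code additionally dumps generations + code_gens to the *_intermediate.json path every k dataloader steps, and such a file can be fed back per task via --load_generations_intermediate_paths.

# Hypothetical helper (illustration only) mirroring the resume logic in Evaluator.generate_text.
import os
from typing import List, Optional


def plan_resume(
    save_generations_path: str,
    task_name: str,
    n_tasks: int,
    intermediate_generations: Optional[List[Optional[List[str]]]] = None,
):
    # e.g. "generations.json" + "humaneval" -> "generations_humaneval_intermediate.json"
    intermediate_path = (
        f"{os.path.splitext(save_generations_path)[0]}_{task_name}_intermediate.json"
    )
    # Non-empty entries are tasks already completed in a previous (interrupted) run.
    curr_generations = [gen for gen in (intermediate_generations or []) if gen]
    remaining_tasks = n_tasks - len(curr_generations)
    # parallel_generations receives this offset and adds it to args.limit_start.
    curr_sample_idx = len(curr_generations)
    return intermediate_path, remaining_tasks, curr_sample_idx


# Example: 3 of 10 tasks already generated -> 7 tasks left, resume offset 3.
print(plan_resume("generations.json", "humaneval", 10, [["code"], ["code"], ["code"]]))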