diff --git a/bigcode_eval/utils.py b/bigcode_eval/utils.py
index f787b10ed..ff79c0e5f 100644
--- a/bigcode_eval/utils.py
+++ b/bigcode_eval/utils.py
@@ -268,7 +268,7 @@ def complete_code(
                     batch["input_len"].max().item()
                 )
 
-            inputs = batch["ids"][:, : batch["input_len"]]
+            inputs = batch["ids"][:, : batch["input_len"]] if tokenizer.padding_side == "right" else batch["ids"]
             if "ids_encoder" in batch:
                 if is_wrapped:
                     generated_tokens = accelerator.unwrap_model(model).generate(
@@ -365,7 +365,7 @@ def update_code_gens(
     postprocess,
     code_gens,
     gen_token_dict,
-): 
+):
     for sample, generated_tokens in gen_token_dict.items():
         for s in generated_tokens:
             if INFILL_MODE or tokenizer.eos_token in task.stop_words:
@@ -378,6 +378,13 @@ def update_code_gens(
                 gen_code = tokenizer.decode(
                     s, skip_special_tokens=False, clean_up_tokenization_spaces=False
                 )
+            try:
+                # some tokenizers add a multi-token prefix to the generation (e.g ChatGLM)
+                tokenizer_prefix = tokenizer.decode(tokenizer.get_prefix_tokens())
+                if gen_code.startswith(f"{tokenizer_prefix}"):
+                    gen_code = gen_code[len(tokenizer_prefix):].lstrip()
+            except:
+                pass
             if INFILL_MODE:
                 gen_code = _parse_infill(gen_code, tokenizer)
             if INSTRUCTION_MODE:
diff --git a/main.py b/main.py
index 0f4757b9f..bc10736bf 100644
--- a/main.py
+++ b/main.py
@@ -109,6 +109,11 @@ def parse_args():
         action="store_true",
         help="Load model in 4bit",
     )
+    parser.add_argument(
+        "--left_padding",
+        action="store_true",
+        help="Force left padding, needed for models like chatglm3-6b",
+    )
     parser.add_argument(
         "--limit",
         type=int,
@@ -311,14 +316,25 @@ def main():
             model.merge_and_unload()
             print("Merge complete.")
 
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.model,
-        revision=args.revision,
-        trust_remote_code=args.trust_remote_code,
-        use_auth_token=args.use_auth_token,
-        truncation_side="left",
-        padding_side="right",  # padding on the right is needed to cut off padding in `complete_code`
-    )
+    if args.left_padding:
+        # left padding is required for some models like chatglm3-6b
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.model,
+            revision=args.revision,
+            trust_remote_code=args.trust_remote_code,
+            use_auth_token=args.use_auth_token,
+            padding_side="left",
+        )
+    else:
+        # used by default for most models
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.model,
+            revision=args.revision,
+            trust_remote_code=args.trust_remote_code,
+            use_auth_token=args.use_auth_token,
+            truncation_side="left",
+            padding_side="right",
+        )
     if not tokenizer.eos_token:
         if tokenizer.bos_token:
             tokenizer.eos_token = tokenizer.bos_token
diff --git a/tests/test_generation_evaluation.py b/tests/test_generation_evaluation.py
index 367f3c0b2..2f5062bff 100644
--- a/tests/test_generation_evaluation.py
+++ b/tests/test_generation_evaluation.py
@@ -45,6 +45,7 @@ def update_args(args):
    args.limit_start = 0
    args.batch_size = 1
    args.max_length_generation = 300
+   args.left_padding = False
    args.do_sample = False
    args.top_p = 0
    args.n_samples = 1
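
Note (not part of the diff above): a minimal sketch of why the padding side matters for the `inputs` slice in `complete_code`. The pad id, token ids, and lengths below are made up for illustration; only the slicing behaviour mirrors the patched code.

import torch

PAD = 0                  # hypothetical pad token id
input_len = 3            # real prompt length
max_length = 6           # length the batch is padded to
prompt = [11, 12, 13]    # made-up token ids

# Right padding (the default): pads trail the prompt, so the slice
# ids[:, :input_len] used by complete_code() drops them cleanly.
right_padded = torch.tensor([prompt + [PAD] * (max_length - input_len)])
assert right_padded[:, :input_len].tolist() == [prompt]

# Left padding (needed for models like chatglm3-6b): pads lead the
# prompt, so the same slice would return pads and drop real tokens;
# the patch therefore passes the full tensor to generate() whenever
# tokenizer.padding_side == "left".
left_padded = torch.tensor([[PAD] * (max_length - input_len) + prompt])
assert left_padded[:, :input_len].tolist() == [[PAD, PAD, PAD]]  # wrong tokens

In short, with `--left_padding` the tokenizer is loaded with padding_side="left" and `complete_code` skips the slice, which is the layout models such as chatglm3-6b expect.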