Merge pull request #184 from bigcode-project/support-left-pad
Support left padding and prefix post-processing for models like ChatGLM.
loubnabnl authored Jan 19, 2024
2 parents e4fd563 + b8685b0 commit 3910745
Showing 3 changed files with 34 additions and 10 deletions.
bigcode_eval/utils.py (11 changes: 9 additions & 2 deletions)
@@ -268,7 +268,7 @@ def complete_code(
                 batch["input_len"].max().item()
             )
 
-        inputs = batch["ids"][:, : batch["input_len"]]
+        inputs = batch["ids"][:, : batch["input_len"]] if tokenizer.padding_side == "right" else batch["ids"]
         if "ids_encoder" in batch:
             if is_wrapped:
                 generated_tokens = accelerator.unwrap_model(model).generate(
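
Why the new conditional in complete_code: with right padding the prompt fills the first input_len positions, so slicing off the tail removes only pad tokens; with left padding the pads come first and the prompt ends at the last position, so the same slice would cut off the prompt itself, and the full tensor must be handed to generate(). A minimal sketch with toy tensors (not code from this repo):

    import torch

    pad = 0
    prompt = [5, 6, 7]
    input_len = len(prompt)

    right_padded = torch.tensor([prompt + [pad, pad]])  # [[5, 6, 7, 0, 0]]
    left_padded = torch.tensor([[pad, pad] + prompt])   # [[0, 0, 5, 6, 7]]

    # The slice keeps the whole prompt only when padding is on the right.
    assert right_padded[:, :input_len].tolist() == [prompt]
    # With left padding the same slice keeps the pads and drops the prompt tail.
    assert left_padded[:, :input_len].tolist() == [[0, 0, 5]]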
@@ -365,7 +365,7 @@ def update_code_gens(
     postprocess,
     code_gens,
     gen_token_dict,
-):
+):
     for sample, generated_tokens in gen_token_dict.items():
         for s in generated_tokens:
             if INFILL_MODE or tokenizer.eos_token in task.stop_words:
@@ -378,6 +378,13 @@ def update_code_gens(
                 gen_code = tokenizer.decode(
                     s, skip_special_tokens=False, clean_up_tokenization_spaces=False
                 )
+                try:
+                    # some tokenizers add a multi-token prefix to the generation (e.g. ChatGLM)
+                    tokenizer_prefix = tokenizer.decode(tokenizer.get_prefix_tokens())
+                    if gen_code.startswith(f"{tokenizer_prefix}"):
+                        gen_code = gen_code[len(tokenizer_prefix):].lstrip()
+                except:
+                    pass
                 if INFILL_MODE:
                     gen_code = _parse_infill(gen_code, tokenizer)
                 if INSTRUCTION_MODE:
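
The try/except above makes prefix stripping a no-op for tokenizers that lack get_prefix_tokens(); the method exists on ChatGLM-style tokenizers, which prepend special tokens (such as [gMASK] and sop) that would otherwise leak into every decoded generation. A sketch of the stripping logic against a hypothetical stand-in tokenizer (the ids and strings below are illustrative, not ChatGLM's actual vocabulary):

    class ToyPrefixTokenizer:
        # Stand-in for a tokenizer that prepends prefix tokens to its output.
        def get_prefix_tokens(self):
            return [64790, 64792]  # illustrative ids for [gMASK] and sop

        def decode(self, ids):
            vocab = {64790: "[gMASK]", 64792: "sop"}
            return " ".join(vocab[i] for i in ids)

    tokenizer = ToyPrefixTokenizer()
    gen_code = "[gMASK] sop def add(a, b):\n    return a + b"

    # Same steps as the diff above: decode the prefix, then strip it.
    tokenizer_prefix = tokenizer.decode(tokenizer.get_prefix_tokens())
    if gen_code.startswith(tokenizer_prefix):
        gen_code = gen_code[len(tokenizer_prefix):].lstrip()

    assert gen_code == "def add(a, b):\n    return a + b"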
main.py (32 changes: 24 additions & 8 deletions)
@@ -109,6 +109,11 @@ def parse_args():
         action="store_true",
         help="Load model in 4bit",
     )
+    parser.add_argument(
+        "--left_padding",
+        action="store_true",
+        help="Force left padding, needed for models like chatglm3-6b",
+    )
     parser.add_argument(
         "--limit",
         type=int,
@@ -311,14 +316,25 @@ def main():
         model.merge_and_unload()
         print("Merge complete.")
 
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.model,
-        revision=args.revision,
-        trust_remote_code=args.trust_remote_code,
-        use_auth_token=args.use_auth_token,
-        truncation_side="left",
-        padding_side="right",  # padding on the right is needed to cut off padding in `complete_code`
-    )
+    if args.left_padding:
+        # left padding is required for some models like chatglm3-6b
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.model,
+            revision=args.revision,
+            trust_remote_code=args.trust_remote_code,
+            use_auth_token=args.use_auth_token,
+            padding_side="left",
+        )
+    else:
+        # used by default for most models
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.model,
+            revision=args.revision,
+            trust_remote_code=args.trust_remote_code,
+            use_auth_token=args.use_auth_token,
+            truncation_side="left",
+            padding_side="right",
+        )
     if not tokenizer.eos_token:
         if tokenizer.bos_token:
             tokenizer.eos_token = tokenizer.bos_token
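
With the flag in place, an evaluation run for such a model might look like the following (the task name is illustrative; --model, --tasks, and --trust_remote_code already exist in main.py):

    accelerate launch main.py \
        --model THUDM/chatglm3-6b \
        --tasks humaneval \
        --trust_remote_code \
        --left_padding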
tests/test_generation_evaluation.py (1 change: 1 addition & 0 deletions)
@@ -45,6 +45,7 @@ def update_args(args):
     args.limit_start = 0
     args.batch_size = 1
     args.max_length_generation = 300
+    args.left_padding = False
     args.do_sample = False
     args.top_p = 0
     args.n_samples = 1
