diff --git a/tools/convert_module_to_hf.py b/tools/convert_module_to_hf.py
index 905bdfa16..2cbf390b9 100644
--- a/tools/convert_module_to_hf.py
+++ b/tools/convert_module_to_hf.py
@@ -227,10 +227,12 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path):
         state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][
             "attention.rotary_emb.inv_freq"
         ]
-        state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"]
-        state_dict["attention.masked_bias"] = hf_layer.state_dict()[
-            "attention.masked_bias"
-        ]
+        if "attention.bias" in hf_layer.state_dict():
+            state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"]
+        if "attention.masked_bias" in hf_layer.state_dict():
+            state_dict["attention.masked_bias"] = hf_layer.state_dict()[
+                "attention.masked_bias"
+            ]
 
         # load state_dict into layer
         hf_layer.load_state_dict(state_dict)
diff --git a/tools/convert_sequential_to_hf.py b/tools/convert_sequential_to_hf.py
index 5a66219bf..13b92437d 100644
--- a/tools/convert_sequential_to_hf.py
+++ b/tools/convert_sequential_to_hf.py
@@ -247,10 +247,12 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path):
         state_dict["attention.rotary_emb.inv_freq"] = get_state(
             loaded_tp_ranks, "attention.rotary_emb.inv_freq", layer_i + 2
         )[0]
-        state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"]
-        state_dict["attention.masked_bias"] = hf_layer.state_dict()[
-            "attention.masked_bias"
-        ]
+        if "attention.bias" in hf_layer.state_dict():
+            state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"]
+        if "attention.masked_bias" in hf_layer.state_dict():
+            state_dict["attention.masked_bias"] = hf_layer.state_dict()[
+                "attention.masked_bias"
+            ]
 
         # load state_dict into layer
         hf_layer.load_state_dict(state_dict)
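
For context, both hunks apply the same guard: copy the causal-mask buffers into state_dict only when the target Hugging Face layer actually exposes them, since some newer transformers releases may not include attention.bias and attention.masked_bias in the layer's state dict. Below is a minimal sketch of that pattern, assuming only that hf_layer is a torch.nn.Module; copy_optional_buffers is a hypothetical helper name, not part of either converter:

    def copy_optional_buffers(state_dict, hf_layer,
                              keys=("attention.bias", "attention.masked_bias")):
        # Read the layer's state dict once instead of once per key.
        layer_sd = hf_layer.state_dict()
        for key in keys:
            # Only copy buffers the HF layer actually registers; skip keys
            # that are absent in newer transformers versions.
            if key in layer_sd:
                state_dict[key] = layer_sd[key]
        return state_dict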