From 04dc2ba24a99a0fce71df4c9a42a368d4b788dc3 Mon Sep 17 00:00:00 2001
From: kyuhee
Date: Tue, 7 Nov 2023 10:15:09 +0900
Subject: [PATCH] When processing mlp.dense_4h_to_h.bias and
 attention.dense.bias, the number of tensor-parallel ranks is not taken into
 account, so incorrect results are produced whenever tp_ranks is greater
 than 1.

---
 tools/ckpts/convert_module_to_hf.py     | 4 +++-
 tools/ckpts/convert_sequential_to_hf.py | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/ckpts/convert_module_to_hf.py b/tools/ckpts/convert_module_to_hf.py
index f3f43c308..9a5823cb9 100644
--- a/tools/ckpts/convert_module_to_hf.py
+++ b/tools/ckpts/convert_module_to_hf.py
@@ -225,7 +225,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path):
             "mlp.dense_4h_to_h.bias",
             "attention.dense.bias",
         ]:
-            state_dict[key] = sum([t[key] for t in loaded_tp_ranks])
+            state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) / len(
+                loaded_tp_ranks
+            )
 
         # Just take one
         state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][
diff --git a/tools/ckpts/convert_sequential_to_hf.py b/tools/ckpts/convert_sequential_to_hf.py
index f0a505ac3..69ad58786 100644
--- a/tools/ckpts/convert_sequential_to_hf.py
+++ b/tools/ckpts/convert_sequential_to_hf.py
@@ -238,7 +238,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path):
             "mlp.dense_4h_to_h.bias",
             "attention.dense.bias",
         ]:
-            state_dict[key] = sum(get_state(loaded_tp_ranks, key, layer_i + 2))
+            state_dict[key] = sum(get_state(loaded_tp_ranks, key, layer_i + 2)) / len(
+                loaded_tp_ranks
+            )
 
         # Just take one
         state_dict["attention.rotary_emb.inv_freq"] = get_state(
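
Note: the sketch below is a minimal standalone illustration of why the merge
must divide by the number of tensor-parallel ranks; it is not code from the
patched scripts, and the helper name merge_row_parallel_bias and the two-rank
setup are hypothetical. For row-parallel layers (mlp.dense_4h_to_h,
attention.dense) the bias is added after the all-reduce, so every TP rank
stores an identical full copy of it; a bare sum() therefore scales the bias by
tp_ranks, while averaging the identical copies recovers the original value.

    import torch

    def merge_row_parallel_bias(tp_ranks, key):
        # Each rank's state dict holds an identical full copy of a
        # row-parallel bias, so averaging the copies recovers the original
        # bias; sum() alone would scale it by the number of ranks.
        return sum(t[key] for t in tp_ranks) / len(tp_ranks)

    # With two tensor-parallel ranks, both carry the same bias vector:
    bias = torch.tensor([0.1, -0.2, 0.3])
    tp_ranks = [{"attention.dense.bias": bias.clone()} for _ in range(2)]

    merged = merge_row_parallel_bias(tp_ranks, "attention.dense.bias")
    assert torch.allclose(merged, bias)  # averaging gives the true bias

    summed = sum(t["attention.dense.bias"] for t in tp_ranks)
    assert torch.allclose(summed, 2 * bias)  # pre-fix behavior: bias doubled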