diff --git a/examples/xglm/README.md b/examples/xglm/README.md
index 8f62fc57..368c85f9 100644
--- a/examples/xglm/README.md
+++ b/examples/xglm/README.md
@@ -25,3 +25,15 @@ cd examples/xglm
 torchrun --nproc-per-node=1 convert_dense2moe.py --checkpoint-path=checkpoints/xglm-564M --save-path=$SCRATCH/checkpoints/xglm-8x564M --num-experts=8
 ```
 Note that this upcycling _drops_ the bias parameters of the MLP because the MegaBlocks implementation does not support bias parameters. While this is a limitation of the current implementation, the performance is quickly recovered after a few training steps.
+
+To save the model back to the huggingface format, use
+```bash
+torchrun examples/xglm/convert_ntmoe2hf.py --checkpoint-path=$SCRATCH/checkpoints/xglm-8x564M --save-path=$SCRATCH/checkpoints/huggingface/xglm-8x564M
+```
+
+Make sure to have the [XGLM MoE implementation](https://github.com/negar-foroutan/Multilingual_MoE) installed (e.g. using `PYTHONPATH=/path/to/Multilingual_MoE/models`).
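+
+For example, assuming the implementation is cloned at `/path/to/Multilingual_MoE` (adjust to your clone), the full invocation could look like:
+```bash
+PYTHONPATH=/path/to/Multilingual_MoE/models torchrun --nproc-per-node=1 examples/xglm/convert_ntmoe2hf.py --checkpoint-path=$SCRATCH/checkpoints/xglm-8x564M --save-path=$SCRATCH/checkpoints/huggingface/xglm-8x564M
+```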
diff --git a/examples/xglm/convert_ntmoe2hf.py b/examples/xglm/convert_ntmoe2hf.py
new file mode 100644
index 00000000..dfa6d510
--- /dev/null
+++ b/examples/xglm/convert_ntmoe2hf.py
@@ -0,0 +1,126 @@
+"""
+Converts a nanotron MoE model to HF format.
+Command:
+    torchrun --nproc-per-node=1 convert_ntmoe2hf.py --checkpoint-path=nanotron_weights --save-path=hf_weights
+"""
+
+import warnings
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import Optional
+
+from transformers import AutoTokenizer
+
+from nanotron.config.models_config import GPT3MoEConfig
+from nanotron.models.gpt3_moe import GPT3MoEForTraining, GPT3MoEBlock
+from nanotron.models.moe import dMoE, SparseMLP
+
+from examples.xglm.convert_dense2moe import create_nt_moe_model, convert_attention
+from examples.xglm.convert_utils import convert_generic
+
+from models.xglm_model import XGLMForCausalLM, XGLMDecoderLayer, XGLMmoeConfig, XGLMSparseMoeBlock, XGLMMLP
+
+# TODO: nanotron moe scales down the moe weights but hf doesn't
+# TODO: nanotron does not use pdrop in moe.
+
+
+def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig:
+    assert config.moe_num_experts > 1, "Expected a MoE model with more than one expert"
+    if config.embd_pdrop != config.resid_pdrop:
+        warnings.warn(
+            f"nanotron.embd_pdrop = {config.embd_pdrop} does not match with "
+            f"nanotron.resid_pdrop = {config.resid_pdrop}. "
+            "XGLM implementation needs these two values to be equal "
+            "for correct conversion."
+        )
+    if config.layer_norm_epsilon != 1e-5:
+        warnings.warn(f"nanotron.layer_norm_epsilon must be 1e-5, not {config.layer_norm_epsilon}")
+    if config.moe_z_loss_weight != 0:
+        warnings.warn("transformer implementation does not support z loss")
+    assert not config.moe_glu, "Transformer implementation does not support GLU MLP layers"
+
+    return XGLMmoeConfig(
+        # Regular xglm config.
+        activation_function=config.activation_function,
+        attention_dropout=config.attn_pdrop,
+        dropout=config.embd_pdrop,
+        eos_token_id=config.eos_token_id,
+        d_model=config.hidden_size,
+        ffn_dim=config.intermediate_size,
+        max_position_embeddings=config.max_position_embeddings,
+        attention_heads=config.num_attention_heads,
+        num_layers=config.num_hidden_layers,
+        vocab_size=config.vocab_size,
+        decoder_start_token_id=config.position_embedding_offset,
+        activation_dropout=config.act_pdrop,
+        scale_embedding=config.scale_embedding,
+        # MoE specifics.
+        num_local_experts=config.moe_num_experts,
+        num_experts_per_tok=config.num_experts_per_tok,
+        gate_type="linear",
+        gate_depth=1,
+        router_aux_loss_coef=config.moe_loss_weight,
+    )
+
+
+def convert_mlp(mlp_hf: XGLMMLP, mlp_nt: SparseMLP):
+    # TODO: mlp_hf has non-zero bias.
+    convert_generic(mlp_hf.fc1, mlp_nt.w1.module)
+    convert_generic(mlp_hf.fc2, mlp_nt.w2.module)
+
+
+def convert_ff(ff_hf: XGLMSparseMoeBlock, ff_nt: dMoE):
+    convert_generic(ff_hf.gate.gate, ff_nt.router.layer)
+    for expert_hf, expert_nt in zip(ff_hf.experts, ff_nt.experts):
+        convert_mlp(expert_hf, expert_nt.mlp)
+
+
+def convert_decoder(block_hf: XGLMDecoderLayer, block_nt: GPT3MoEBlock):
+    convert_generic(block_hf.self_attn_layer_norm, block_nt.ln_1)
+    convert_attention(block_hf.self_attn, block_nt.attn)
+    convert_generic(block_hf.final_layer_norm, block_nt.ln_2)
+    # TODO: hf has fc1, fc2 attributes but they are not used, probably should be removed.
+    convert_ff(block_hf.block_sparse_moe, block_nt.ff)
+
+
+def convert(model_hf: XGLMForCausalLM, model_nt: GPT3MoEForTraining):
+    convert_generic(model_hf.model.embed_tokens, model_nt.model.token_embeddings.pp_block.token_embedding)
+    for layer_hf, layer_nt in zip(model_hf.model.layers, model_nt.model.decoder):
+        convert_decoder(layer_hf, layer_nt.pp_block)
+    convert_generic(model_hf.model.layer_norm, model_nt.model.final_layer_norm.pp_block)
+    convert_generic(model_hf.lm_head, model_nt.model.lm_head.pp_block)
+
+
+def main(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str]):
+    # Load nanotron model.
+    model_nt = create_nt_moe_model(checkpoint_path=checkpoint_path)
+
+    # Init huggingface model.
+    model_config_hf = convert_config(model_nt.config)
+    model_hf = XGLMForCausalLM._from_config(model_config_hf)
+
+    # Copy weights, initialize tokenizer and save model.
+    if tokenizer_name is not None:
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        tokenizer.save_pretrained(save_path)
+    convert(model_hf, model_nt)
+    model_hf.save_pretrained(save_path)
+    print(f"Model saved to {save_path}")
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Convert nanotron MoE weights to huggingface format")
+    parser.add_argument(
+        "--checkpoint-path", type=Path, default="checkpoints/xglm-7.5B", help="Path to the nanotron checkpoint"
+    )
+    parser.add_argument(
+        "--save-path", type=Path, default="facebook/xglm-7.5B", help="Path to save the huggingface model"
+    )
+    parser.add_argument("--tokenizer-name", type=str, default="facebook/xglm-7.5B")
+    args = parser.parse_args()
+    main(args.checkpoint_path, args.save_path, args.tokenizer_name)
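+
+# Sanity-check sketch (hypothetical paths; assumes the Multilingual_MoE repo is on
+# PYTHONPATH so that `models.xglm_model` resolves):
+#   from models.xglm_model import XGLMForCausalLM
+#   model = XGLMForCausalLM.from_pretrained("checkpoints/huggingface/xglm-8x564M")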