Commit

very close
AleHD committed Sep 3, 2024
1 parent 38364d5 commit 930fe81
Showing 4 changed files with 234 additions and 22 deletions.
2 changes: 1 addition & 1 deletion examples/xglm/README.md
@@ -31,4 +31,4 @@ To save back to huggingface format use
torchrun examples/xglm/convert_ntmoe2hf.py --checkpoint-path=$SCRATCH/checkpoints/xglm-8x564M --save-path=$SCRATCH/checkpoints/huggingface/xglm-8x564M
```

Make sure to have the [XGLM MOE implementation](https://github.com/negar-foroutan/Multilingual_MoE) installed (e.g. using `PYTHONPATH=/path/to/Multilingual_MoE/models`).
Make sure to have the [XGLM MOE implementation](https://github.com/negar-foroutan/Multilingual_MoE) installed (e.g. using `PYTHONPATH=/path/to/Multilingual_MoE`).
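
A quick way to confirm the path is picked up is to try the imports that the conversion script relies on — a suggested check, not part of this commit:

```python
# Should succeed once PYTHONPATH includes the Multilingual_MoE checkout;
# these are the same modules convert_ntmoe2hf.py imports below.
from models.xglm_model import XGLMForCausalLM, XGLMSparseMoeBlock  # noqa: F401
from models.gating import BasicGate  # noqa: F401
```
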
48 changes: 34 additions & 14 deletions examples/xglm/convert_ntmoe2hf.py
@@ -9,22 +9,26 @@
from pathlib import Path
from typing import Optional

import torch
from transformers import AutoTokenizer
from tqdm import tqdm

from nanotron.config.models_config import GPT3MoEConfig
from nanotron.models.gpt3_moe import GPT3MoEForTraining, GPT3MoEBlock
from nanotron.models.moe import dMoE, SparseMLP
from nanotron.models.moe import dMoE, SparseMLP, LearnedRouter

from examples.xglm.convert_dense2moe import create_nt_moe_model, convert_attention
from examples.xglm.convert_dense2moe import create_nt_moe_model
from examples.xglm.convert_nt2hf import convert_attention
from examples.xglm.convert_utils import convert_generic

from models.xglm_model import XGLMForCausalLM, XGLMDecoderLayer, XGLMmoeConfig, XGLMSparseMoeBlock, XGLMMLP
from models.gating import BasicGate

# TODO: nanotron moe scales down the moe weights but hf doesn't
# TODO: nanotron does not use pdrop in moe.


def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig
def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig:
    assert config.moe_num_experts > 1, "Why are you using a 1-expert moe? lol"
if config.embd_pdrop != config.resid_pdrop:
warnings.warn(
@@ -59,7 +63,7 @@ def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig
num_experts_per_tok=config.num_experts_per_tok,
gate_type="linear",
gate_depth=1,
router_aux_loss_coef=config.moe_looss_weight,
router_aux_loss_coef=config.moe_loss_weight,
)


@@ -69,25 +73,38 @@ def convert_mlp(mlp_hf: XGLMMLP, mlp_nt: SparseMLP):
convert_generic(mlp_hf.fc2, mlp_nt.w2.module)


def convert_ff(ff_hf: XGLMSparseMoeBlock, ff_nt: dMoE):
convert_generic(ff_hf.gate.gate, ff_nt.router.layer)
for expert_hf, expert_nt in zip(ff_hf.experts, ff_nt.experts):
convert_mlp(expert_hf, expert_nt.mlp)
def convert_gate(gate_hf: BasicGate, gate_nt: LearnedRouter):
convert_generic(gate_hf.gate, gate_nt.layer)


def convert_ff(ff_hf: XGLMSparseMoeBlock, ff_nt: dMoE):
convert_gate(ff_hf.gate, ff_nt.gate)
int_size = ff_nt.config.intermediate_size
for i, expert_hf in enumerate(ff_hf.experts):
        # TODO: fc1 and fc2 have biases in the HF model; they are zeroed here rather than converted
i0 = i*int_size
i1 = (i + 1)*int_size
with torch.no_grad():
expert_hf.fc1.weight.copy_(ff_nt.experts.mlp.w1.module.weight.T[i0:i1, :].clone())
expert_hf.fc1.bias.data.zero_()
expert_hf.fc2.weight.copy_(ff_nt.experts.mlp.w2.module.weight[i0:i1, :].T.clone())
expert_hf.fc2.bias.data.zero_()
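
The slicing above assumes nanotron fuses all experts into single weight matrices: `w1.module.weight` with shape `(hidden_size, num_experts * intermediate_size)` and `w2.module.weight` with shape `(num_experts * intermediate_size, hidden_size)`. A toy shape check of that assumed layout, illustrative only:

```python
import torch

hidden_size, intermediate_size, num_experts = 8, 16, 4
# assumed fused layouts (mirrors the slicing in convert_ff above)
w1 = torch.randn(hidden_size, num_experts * intermediate_size)   # nanotron w1.module.weight
w2 = torch.randn(num_experts * intermediate_size, hidden_size)   # nanotron w2.module.weight

for i in range(num_experts):
    i0, i1 = i * intermediate_size, (i + 1) * intermediate_size
    fc1_weight = w1.T[i0:i1, :]   # (intermediate_size, hidden_size): HF fc1.weight layout
    fc2_weight = w2[i0:i1, :].T   # (hidden_size, intermediate_size): HF fc2.weight layout
    assert fc1_weight.shape == (intermediate_size, hidden_size)
    assert fc2_weight.shape == (hidden_size, intermediate_size)
```
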

def convert_decoder(block_hf: XGLMDecoderLayer, block_nt: GPT3MoEBlock):
convert_generic(block_hf.self_attn_layer_norm, block_nt.ln_1)
convert_attention(block_hf.self_attn, block_nt.attn)
convert_generic(block_hf.final_layer_norm, block_nt.ln_2)
# TODO: hf has fc1, fc2 attributes but they are not used, probably should be removed.
convert_generic(block_hf.fc1, block_nt.ff.c_fc)
convert_generic(block_hf.fc2, block_nt.ff.c_proj)
    convert_ff(block_hf.block_sparse_moe, block_nt.ff)


def convert(model_hf: XGLMForCausalLM, model_nt: GPT3MoEForTraining):
convert_generic(model_hf.model.embed_tokens, model_nt.model.token_embeddings.pp_block.token_embedding)
for layer_hf, layer_nt in zip(model_hf.model.layers, model_nt.model.decoder):
convert_decoder(layer_hf, layer_nt.pp_block)
for layer_hf, layer_nt in tqdm(zip(model_hf.model.layers, model_nt.model.decoder), desc="Converting layers",
total=model_nt.config.num_hidden_layers):
        convert_decoder(layer_hf, layer_nt.pp_block)
convert_generic(model_hf.model.layer_norm, model_nt.model.final_layer_norm.pp_block)
convert_generic(model_hf.lm_head, model_nt.model.lm_head.pp_block)

@@ -104,7 +121,10 @@ def main(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str]):
if tokenizer_name is not None:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.save_pretrained(save_path)
    convert(model_hf, model_nt)
print("Saving...")
model_hf.save_pretrained(save_path)
print(f"Model saved to {save_path}")

@@ -119,4 +139,4 @@ def main(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str]):
)
parser.add_argument("--tokenizer-name", type=str, default="facebook/xglm-7.5B")
args = parser.parse_args()
main(args.checkpoint_path, args.save_path, args.tokenizer_name)
ret = main(args.checkpoint_path, args.save_path, args.tokenizer_name)
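
After conversion, a minimal smoke test of the saved checkpoint might look like the sketch below. It assumes `XGLMForCausalLM` from Multilingual_MoE is a regular `PreTrainedModel` subclass and reuses the save path and tokenizer name from above; it is a suggestion, not part of this commit.

```python
import os

import torch
from transformers import AutoTokenizer

from models.xglm_model import XGLMForCausalLM

save_path = os.path.expandvars("$SCRATCH/checkpoints/huggingface/xglm-8x564M")
model = XGLMForCausalLM.from_pretrained(save_path).eval()
tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-7.5B")

inputs = tokenizer("Hello, this is a test.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)  # (batch, seq_len, vocab_size)
```
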
179 changes: 179 additions & 0 deletions examples/xglm/tests/test_moe.py
@@ -0,0 +1,179 @@
import torch
import pytest

import nanotron
from nanotron.config.parallelism_config import ParallelismArgs
from nanotron.config.models_config import GPT3MoEConfig
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.trainer import mark_tied_parameters
from nanotron.models.gpt3_moe import GPT3MoEBlock, GPT3MoEForTraining
from nanotron.models.moe import LearnedRouter, dMoE

from tests.helpers.utils import init_distributed

from examples.xglm.convert_ntmoe2hf import convert_config, convert_gate, convert_ff, convert
from examples.xglm.tests.test_implementation import almost_close

from models.xglm_model import XGLMSparseMoeBlock, XGLMForCausalLM
from models.gating import BasicGate


MAX_SEQUENCE_LENGTH = 2048
TEST_SEQUENCE_LENGTH = 128  # With a very large sequence length, precision errors become more significant even when the implementation is correct.
#TEST_SEQUENCE_LENGTH = MAX_SEQUENCE_LENGTH
BATCH_SIZE = 4
HIDDEN_SIZE = 1024
DTYPE = torch.bfloat16
#DTYPE = torch.float32
TEXT = "Hello. This is a relatively long text. I will use this text to test the conversion scripts. Let's finish this text soon because I don't have much more to say. Final note:"

CONFIG = GPT3MoEConfig(
attn_pdrop=0.0,
embd_pdrop=0.0,
resid_pdrop=0.0,
act_pdrop=0.0,
eos_token_id=2,
hidden_size=HIDDEN_SIZE,
intermediate_size=4096,
layer_norm_epsilon=1e-05,
max_position_embeddings=MAX_SEQUENCE_LENGTH,
num_attention_heads=16,
num_hidden_layers=24,
scale_attn_weights=True,
vocab_size=256008,
sinusoidal_position_embedding=True,
position_embedding_offset=2,
use_spda=DTYPE is not torch.bfloat16,
# vvv moe vvv
is_moe=True,
moe_num_experts=4,
num_experts_per_tok=4,
moe_loss_weight=0.01,
moe_z_loss_weight=0.0,
moe_glu=False,
)
#PARALLEL_CONFIG = ParallelismArgs(dp=1, pp=1, tp=1, expert_parallel_size=1) #CONFIG.moe_num_experts)


@pytest.fixture
def hidden_states() -> torch.Tensor:
return torch.randn(TEST_SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_SIZE, dtype=DTYPE)


@pytest.fixture
def input_mask() -> torch.Tensor:
return torch.ones(BATCH_SIZE, TEST_SEQUENCE_LENGTH, dtype=torch.bool)


@pytest.fixture
def input_ids() -> torch.Tensor:
return torch.randint(0, CONFIG.vocab_size, (BATCH_SIZE, TEST_SEQUENCE_LENGTH))


def _test_nt2hf_gate(parallel_context: ParallelContext, hidden_states: torch.Tensor):
hidden_states = hidden_states.cuda()

config_hf = convert_config(CONFIG)
gate_nt = LearnedRouter(CONFIG).cuda().to(DTYPE)
gate_hf = BasicGate(config_hf).cuda().to(DTYPE)
convert_gate(gate_hf, gate_nt)

router_logits_nt, _, _ = gate_nt(hidden_states.view(-1, HIDDEN_SIZE))
router_logits_hf = gate_hf(hidden_states.permute(1, 0, 2).reshape(-1, HIDDEN_SIZE), "")

router_logits_nt = router_logits_nt.view(TEST_SEQUENCE_LENGTH, BATCH_SIZE, -1)
router_logits_hf = router_logits_hf.view(BATCH_SIZE, TEST_SEQUENCE_LENGTH, -1).permute(1, 0, 2)

assert router_logits_nt.size() == router_logits_hf.size()
torch.testing.assert_close(router_logits_nt, router_logits_hf)


def test_nt2hf_gate(hidden_states: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_gate)(hidden_states=hidden_states)


def _test_nt2hf_ff(parallel_context: ParallelContext, hidden_states: torch.Tensor):
hidden_states = hidden_states.cuda()

config_hf = convert_config(CONFIG)
ff_nt = dMoE(CONFIG, parallel_context, None).cuda().to(DTYPE)
ff_hf = XGLMSparseMoeBlock(config_hf).cuda().to(DTYPE)
convert_ff(ff_hf, ff_nt)

out_nt = ff_nt(hidden_states)["hidden_states"]
out_hf, _ = ff_hf(hidden_states.permute(1, 0, 2).contiguous(), "")
out_hf = out_hf.permute(1, 0, 2)

assert out_nt.size() == out_hf.size()
almost_close(out_nt, out_hf, max_far=0.1, far_atol=0.02)
#torch.testing.assert_close(out_nt, out_hf)



def _test_nt2hf_model(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
random_states = nanotron.random.RandomStates({"tp_synced": nanotron.random.get_current_random_state()})
input_ids = input_ids.cuda()
input_mask = input_mask.cuda()

# unfortunately, we can't use float64 with huggingface xglm.
new_dtype = torch.float32 if DTYPE == torch.float64 else DTYPE

# Get nanotron model.
config_nt = GPT3MoEConfig(**vars(CONFIG))
if new_dtype not in {torch.bfloat16, torch.float16}:
config_nt.use_spda = True
model_nt = nanotron.models.build_model(
model_builder=lambda: GPT3MoEForTraining(
config=config_nt,
parallel_context=parallel_context,
parallel_config=None,
random_states=random_states,
),
parallel_context=parallel_context,
dtype=new_dtype,
device="cuda",
).eval()
mark_tied_parameters(model=model_nt, parallel_context=parallel_context)

# Create empty model_hf and make conversion.
model_hf = XGLMForCausalLM(convert_config(config_nt)).cuda().to(new_dtype).eval()
convert(model_hf, model_nt)

    # The nanotron MoE forward expects these aux-loss accumulators even for a plain forward pass.
    aux_losses = {
        "load_balancing_loss": (
            torch.zeros(1, device=input_ids.device)
            if not isinstance(input_ids, TensorPointer)
            else TensorPointer(0)  # unreachable here: input_ids is always a real tensor in this test
        ),
        "z_loss": (
            torch.zeros(1, device=input_ids.device)
            if not isinstance(input_ids, TensorPointer)
            else TensorPointer(0)  # unreachable here
        ),
    }

# Get outputs and assert.
with torch.no_grad():
out_nt = model_nt.model(input_ids, input_mask, aux_losses)["sharded_logits"].to(new_dtype)
del model_nt
torch.cuda.empty_cache()
out_hf = model_hf(input_ids=input_ids, attention_mask=input_mask, output_router_logits=False).logits.permute(1, 0, 2)
del model_hf
torch.cuda.empty_cache()
assert out_nt.size() == out_hf.size(), f"{out_nt.size()}, {out_hf.size()}"
return out_nt.cpu(), out_hf.cpu()


def test_nt2hf_ff(hidden_states: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_ff)(hidden_states=hidden_states)


def _test_nt2hf_dummy_xglm(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
out_nt, out_hf = _test_nt2hf_model(parallel_context, input_ids, input_mask)
almost_close(out_nt, out_hf, max_far=0.1, far_atol=0.02)


def test_nt2hf_dummy_xglm(input_ids: torch.Tensor, input_mask: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_dummy_xglm)(input_ids=input_ids, input_mask=input_mask)
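
The tolerance checks above use `almost_close` from `examples/xglm/tests/test_implementation`, which is not part of this diff. A helper with that signature could plausibly be implemented along these lines (an assumption, not the actual code):

```python
import torch

def almost_close(t1: torch.Tensor, t2: torch.Tensor,
                 rtol: float = 1e-5, atol: float = 1e-5,
                 max_far: float = 0.0, far_atol: float = 0.0) -> None:
    # Elements allowed to deviate by more than far_atol...
    far = (t1 - t2).abs() > far_atol
    frac_far = far.float().mean().item()
    # ...but only up to a max_far fraction of all elements.
    assert frac_far <= max_far, f"{frac_far:.2%} of elements differ by more than {far_atol}"
    # The remaining elements must match within the usual tolerances.
    torch.testing.assert_close(t1[~far], t2[~far], rtol=rtol, atol=atol)
```
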
27 changes: 20 additions & 7 deletions src/nanotron/models/moe.py
@@ -162,7 +162,8 @@ def forward(self, hidden_states: torch.Tensor):
router_logits, expert_weights, top_experts = self.gate(x)

# Compute the experts.
x, lbl_loss, z_loss = self.experts(x, router_logits, expert_weights, top_experts)
#return self.experts(x, router_logits, expert_weights, top_experts)
x, lbl_loss, z_loss = self.experts(x, router_logits, expert_weights, top_experts) #REMOVE
return {
"hidden_states": x.reshape(batch_size, sequence_length, -1),
"load_balancing_loss": lbl_loss,
@@ -300,12 +301,15 @@ def forward_once(self, x, expert_weights, top_experts): # TODO: sparse
) = self.indices_and_padded_bins(top_experts)

# Route the tokens for MoE computation.
#x_pre = x.clone()
x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, self.num_experts_per_tok)
#print("forward_once a", x.shape)

with torch.no_grad():
topo = self.topology(x, padded_bins)

x = self.mlp(x, topo)
x = self.mlp(x, topo) #REMOVE
#return x_pre, self.mlp(x, topo)

# Un-route the data for the MoE output.
x = ops.padded_scatter(
@@ -422,7 +426,11 @@ def forward(self, x, router_logits, expert_weights, top_experts):
top_experts: tensor of shape [sequence_length * batch_size, num_experts_per_tok]
"""
# Compute the experts.
x, tokens_per_expert = self.forward_fn(x, expert_weights.flatten(), top_experts.flatten())
x, tokens_per_expert = self.forward_fn(x, expert_weights.flatten(), top_experts.flatten()) #REMOVE
#return router_logits
#print("nano b", expert_weights)
#return expert_weights.bfloat16()
#return self.forward_fn(x, expert_weights.flatten(), top_experts.flatten())
if self.training:
lbl_loss = load_balancing_loss(router_logits, tokens_per_expert, self.config)
z_loss = router_z_loss(router_logits, self.config)
@@ -595,9 +603,14 @@ def __init__(

def forward(self, x, topo):
self.w1.scale_gradients(), self.w2.scale_gradients()
x = self.sdd(x.contiguous(), self.w1.module.weight, topo)
activation_fn_out = act_fn(x, self.act)
return self.dsd(activation_fn_out, self.w2.module.weight)
x = self.sdd(x.contiguous(), self.w1.module.weight, topo) # REMOVE
#x1 = self.sdd(x.contiguous(), self.w1.module.weight, topo)
activation_fn_out = act_fn(x, self.act) # REMOVE
#print(x.shape, activation_fn_out.shape, self.w2.module.weight.shape)
#activation_fn_out = act_fn(x1, self.act)
return self.dsd(activation_fn_out, self.w2.module.weight) #REMOVE
#x2 = self.dsd(activation_fn_out, self.w2.module.weight)
#return x, x1, x2, topo, self.w1.module.weight, self.w2.module.weight


class MLP(nn.Module):
@@ -718,4 +731,4 @@ def forward(self, x, topo):
x1 = self.sdd(x, self.w1.module.weight, topo)
x2 = self.sdd(x, self.w3.module.weight, topo)
x = stk.ops.mul(act_fn(x1, self.act), x2)
return self.dsd(x, self.w2.module.weight)
return self.dsd(x, self.w2.module.weight)
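
For context, the `load_balancing_loss` used in the dMoE forward above is typically the Switch-Transformer-style auxiliary loss. Whether nanotron's helper matches this exact (simplified) formulation is an assumption; a reference implementation looks like:

```python
import torch
import torch.nn.functional as F

def switch_load_balancing_loss(router_logits: torch.Tensor,
                               tokens_per_expert: torch.Tensor) -> torch.Tensor:
    """router_logits: (num_tokens, num_experts); tokens_per_expert: (num_experts,)."""
    num_experts = router_logits.shape[-1]
    probs = F.softmax(router_logits, dim=-1)
    prob_per_expert = probs.mean(dim=0)                                    # mean router probability per expert
    frac_per_expert = tokens_per_expert.float() / tokens_per_expert.sum()  # share of tokens routed to each expert
    return num_experts * torch.dot(frac_per_expert, prob_per_expert)
```
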
