From ac5667471e24434c378781c5400b19d595d05fd8 Mon Sep 17 00:00:00 2001
From: Shivam Sahni
Date: Mon, 16 Dec 2024 22:01:19 -0800
Subject: [PATCH] fix: correct typos in docstrings (#482)

- Fix 'transfomers' to 'transformers' in mixtral.py
- Fix 'Emebedding' to 'Embedding' in orpo_trainer.py

## Summary

## Testing Done

- Hardware Type:
- [ ] run `make test` to ensure correctness
- [ ] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: byhsu@linkedin.com
---
 src/liger_kernel/transformers/model/mixtral.py        | 2 +-
 src/liger_kernel/transformers/trainer/orpo_trainer.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/liger_kernel/transformers/model/mixtral.py b/src/liger_kernel/transformers/model/mixtral.py
index 22fea53da..145bc78cd 100644
--- a/src/liger_kernel/transformers/model/mixtral.py
+++ b/src/liger_kernel/transformers/model/mixtral.py
@@ -38,7 +38,7 @@ def lce_forward_deprecated(
     cache_position: Optional[torch.LongTensor] = None,
 ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
     r"""
-    Copy paste Mixtral's forward from transfomers v4.44.2 but replace torch cross entropy with liger fused linear cross entropy
+    Copy paste Mixtral's forward from transformers v4.44.2 but replace torch cross entropy with liger fused linear cross entropy

     Args:
diff --git a/src/liger_kernel/transformers/trainer/orpo_trainer.py b/src/liger_kernel/transformers/trainer/orpo_trainer.py
index 184430ac1..04391fa5f 100644
--- a/src/liger_kernel/transformers/trainer/orpo_trainer.py
+++ b/src/liger_kernel/transformers/trainer/orpo_trainer.py
@@ -17,7 +17,7 @@ class _FSDPForwardRedirection:
     This is needed in cases where we call a submodule of a FSDP module. For instance, when we want to call only
     the `LlamaModel` part out of a FSDP-wrapped `LlamaForCausalLM` to get the hidden states without involving
     GPU-memory-heavy `lm_head` and cross entropy computation, doing this directly (i.e. `model.model.forward()`)
-    will not work because the first `nn.Emebedding` layer is not independently wrapped as a FSDP module (because of
+    will not work because the first `nn.Embedding` layer is not independently wrapped as a FSDP module (because of
     the transformer-based wrapping policy), and not calling it through FSDP root module forward will not all-gather
     its parameter, thus resulting in "RuntimeError: 'weight' must be 2-D" error. Similarly, if we want to call just
     the `lm_head` part of a model, we need this trick too to properly get its params all-gathered.
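
For readers unfamiliar with the docstring touched in the second hunk: it describes a forward-redirection trick for calling a submodule of an FSDP-wrapped model. Below is a minimal sketch of that idea, assuming an FSDP root `wrapper_module` that wraps `original_module`; the class name and simplified structure are illustrative assumptions, not the exact code in `orpo_trainer.py`.

```python
# Illustrative sketch only, not part of this patch. It approximates the
# forward-redirection idea the orpo_trainer.py docstring describes; the
# names below are assumptions, not the library's exact API.
import torch.nn as nn


class ForwardRedirection:
    """Call a submodule's method through the FSDP root module's forward()
    so that FSDP's pre-/post-forward hooks run and sharded parameters are
    all-gathered before the submodule touches them."""

    def __call__(self, wrapper_module: nn.Module, original_module: nn.Module, method, *args, **kwargs):
        # `wrapper_module` is the FSDP root; `original_module` is the module it wraps.
        original_forward = original_module.forward

        def wrapped_forward(*_args, **_kwargs):
            # Restore the real forward before delegating, so nested calls are unaffected.
            original_module.forward = original_forward
            return method(*_args, **_kwargs)

        # Temporarily redirect the wrapped module's forward, then go through the
        # FSDP root so its hooks fire; the root's forward ends up calling `method`.
        original_module.forward = wrapped_forward
        return wrapper_module(*args, **kwargs)
```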