diff --git a/src/liger_kernel/transformers/model/mixtral.py b/src/liger_kernel/transformers/model/mixtral.py
index 22fea53da..145bc78cd 100644
--- a/src/liger_kernel/transformers/model/mixtral.py
+++ b/src/liger_kernel/transformers/model/mixtral.py
@@ -38,7 +38,7 @@ def lce_forward_deprecated(
     cache_position: Optional[torch.LongTensor] = None,
 ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
     r"""
-    Copy paste Mixtral's forward from transfomers v4.44.2 but replace torch cross entropy with liger fused linear cross entropy
+    Copy paste Mixtral's forward from transformers v4.44.2 but replace torch cross entropy with liger fused linear cross entropy


     Args:
diff --git a/src/liger_kernel/transformers/trainer/orpo_trainer.py b/src/liger_kernel/transformers/trainer/orpo_trainer.py
index 184430ac1..04391fa5f 100644
--- a/src/liger_kernel/transformers/trainer/orpo_trainer.py
+++ b/src/liger_kernel/transformers/trainer/orpo_trainer.py
@@ -17,7 +17,7 @@ class _FSDPForwardRedirection:
     This is needed in cases where we call a submodule of a FSDP module. For instance, when we want to call only
     the `LlamaModel` part out of a FSDP-wrapped `LlamaForCausalLM` to get the hidden states without involving
     GPU-memory-heavy `lm_head` and cross entropy computation, doing this directly (i.e. `model.model.forward()`)
-    will not work because the first `nn.Emebedding` layer is not independently wrapped as a FSDP module (because of
+    will not work because the first `nn.Embedding` layer is not independently wrapped as a FSDP module (because of
     the transformer-based wrapping policy), and not calling it through FSDP root module forward will not all-gather
     its parameter, thus resulting in "RuntimeError: 'weight' must be 2-D" error. Similarly, if we want to call just
     the `lm_head` part of a model, we need this trick too to properly get its params all-gathered.
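
Aside from the typo fix, the `_FSDPForwardRedirection` docstring above describes the redirection trick itself: temporarily patch the FSDP root's `forward` to run only the desired submodule, then invoke the root so FSDP's pre/post-forward hooks all-gather the sharded parameters first. A minimal sketch of that idea (hypothetical class and names, not the code touched by this diff) might look like:

    import torch.nn as nn

    class _ForwardRedirectionSketch:
        # Sketch only: call `method` (e.g. the inner `model.model` forward) through
        # the FSDP root's __call__ so its hooks gather the flattened weights first.
        def __call__(self, wrapper_module: nn.Module, method, *args, **kwargs):
            original_forward = wrapper_module.forward

            def wrapped_forward(*_args, **_kwargs):
                # Restore the original forward right away so later calls on the
                # wrapper behave normally.
                wrapper_module.forward = original_forward
                # Run only the submodule method we actually care about, skipping
                # the memory-heavy lm_head / cross entropy path.
                return method(*_args, **_kwargs)

            # Swap in the redirecting forward, then go through wrapper_module(...)
            # (i.e. __call__) so FSDP's pre-forward hook all-gathers parameters.
            wrapper_module.forward = wrapped_forward
            return wrapper_module(*args, **kwargs)

Going through `wrapper_module(...)` rather than `method(...)` directly is what avoids the "RuntimeError: 'weight' must be 2-D" failure mentioned in the docstring, since the embedding weights are only materialized by the root module's FSDP hooks.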