linkedin · ryankert01 · Dec 8, 2024 · Dec 8, 2024 · Dec 11, 2024 · Dec 11, 2024
diff --git a/src/liger_kernel/chunked_loss/cpo_loss.py b/src/liger_kernel/chunked_loss/cpo_loss.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import torch
 import torch.nn.functional as F
 
@@ -55,6 +57,7 @@ def forward(
         label_smoothing=0.0,
         compute_nll_loss=True,
         compiled=True,
+        softcap=None,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx,
@@ -69,6 +72,7 @@ def forward(
             label_smoothing=label_smoothing,
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
+            softcap=softcap,
         )
 
     @staticmethod
@@ -90,11 +94,13 @@ def __init__(
         label_smoothing: float = 0.0,
         compute_nll_loss: bool = True,
         compiled: bool = True,
+        softcap: Optional[float] = None,
     ):
         """
         Args:
             ignore_index (int): Index to ignore in the loss.
             beta (float): Weight for the odds ratio loss.
+            softcap (Optional[float]): Soft-cap value for the logits. If not None, logits are bounded to (-softcap, +softcap) via softcap * tanh(logits / softcap); None disables soft-capping.
         """
         super().__init__()
         self.ignore_index = ignore_index
@@ -103,6 +109,7 @@ def __init__(
         self.label_smoothing = label_smoothing
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
+        self.softcap = softcap
 
     def forward(self, lin_weight, _input, target, bias=None):
         return LigerFusedLinearCPOFunction.apply(
@@ -116,4 +123,5 @@ def forward(self, lin_weight, _input, target, bias=None):
             self.label_smoothing,
             self.compute_nll_loss,
             self.compiled,
+            self.softcap,
         )
diff --git a/src/liger_kernel/chunked_loss/dpo_loss.py b/src/liger_kernel/chunked_loss/dpo_loss.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import torch
 import torch.nn.functional as F
 
@@ -67,6 +69,7 @@ def forward(
         compute_nll_loss=False,
         compiled=True,
         use_ref_model=True,
+        softcap=None,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx=ctx,
@@ -83,12 +86,13 @@ def forward(
             ref_input=ref_input,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
+            softcap=softcap,
         )
 
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None, None, None, None, None
+        return *grads, None, None, None, None, None, None, None, None, None
 
 
 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -103,6 +107,7 @@ def __init__(
         compute_nll_loss: bool = False,
         compiled: bool = True,
         use_ref_model: bool = False,
+        softcap: Optional[float] = None,
     ):
         """
         Args:
@@ -111,13 +116,15 @@ def __init__(
             compute_nll_loss (bool): Whether to compute the NLL loss.
             compiled (bool): Whether to use the torch compiled kernel.
             use_ref_model (bool): Whether to use a reference model for the DPO loss.
+            softcap (Optional[float]): Soft-cap value for the logits. If not None, logits are bounded to (-softcap, +softcap) via softcap * tanh(logits / softcap); None disables soft-capping.
         """
         super().__init__()
         self.ignore_index = ignore_index
         self.beta = beta
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
         self.use_ref_model = use_ref_model
+        self.softcap = softcap
 
     def forward(
         self,
@@ -142,4 +149,5 @@ def forward(
             self.compute_nll_loss,
             self.compiled,
             self.use_ref_model,
+            self.softcap,
         )
diff --git a/src/liger_kernel/chunked_loss/fused_linear_preference.py b/src/liger_kernel/chunked_loss/fused_linear_preference.py
@@ -32,6 +32,7 @@ def forward(
         ref_input=None,
         ref_weight=None,
         ref_bias=None,
+        softcap=None,
         **loss_kwargs,
     ):
         """
@@ -61,6 +62,7 @@ def forward(
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            softcap (Optional[float]): Soft-cap value for the logits. If not None, logits are bounded to (-softcap, +softcap) via softcap * tanh(logits / softcap); None disables soft-capping.
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
@@ -282,11 +284,16 @@ def chunk_forward(
         bias=None,
         ignore_index=-100,
         compute_nll_loss=True,
+        softcap=None,
     ):
         len_chosen_chunk = target_chunk.shape[0] // 2
         logits_chunk = input_chunk @ weight.t()
         if bias is not None:
             logits_chunk = logits_chunk + bias
+        if softcap is not None:
+            logits_chunk = logits_chunk / softcap
+            logits_chunk = torch.tanh(logits_chunk)
+            logits_chunk = logits_chunk * softcap
         log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
 
         chosen_nll_loss = 0.0
@@ -336,6 +343,7 @@ def _compute_loss(
         ref_input_chunk=None,
         ref_weight=None,
         ref_bias=None,
+        softcap=None,
         **loss_kwargs,
     ):
         """
@@ -354,6 +362,7 @@ def _compute_loss(
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
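+            softcap (Optional[float]): Soft-cap value for the logits. If not None, logits are bounded to (-softcap, +softcap) via softcap * tanh(logits / softcap); the same value is passed to the policy and reference chunk_forward calls. None disables soft-capping.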
             loss_kwargs (dict): Additional arguments for the loss function.
         """
         (
@@ -369,6 +378,7 @@ def _compute_loss(
             bias=bias,
             ignore_index=ignore_index,
             compute_nll_loss=compute_nll_loss,
+            softcap=softcap,
         )
         chosen_nll_loss = (
             chosen_nll_loss
@@ -396,6 +406,7 @@ def _compute_loss(
                     ref_bias,
                     ignore_index=ignore_index,
                     compute_nll_loss=False,  # We don't need NLL loss for the reference model
+                    softcap=softcap,
                 )
             loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
             loss_kwargs["ref_rejected_logps"] = ref_rejected_logps

diff --git a/src/liger_kernel/chunked_loss/orpo_loss.py b/src/liger_kernel/chunked_loss/orpo_loss.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import torch
 import torch.nn.functional as F
 
@@ -57,6 +59,7 @@ def forward(
         beta=0.1,
         compute_nll_loss=True,
         compiled=True,
+        softcap=None,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx=ctx,
@@ -69,12 +72,13 @@ def forward(
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
+            softcap=softcap,
         )
 
     @staticmethod
     def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        return *grads, None, None, None, None
+        return *grads, None, None, None, None, None
 
 
 class LigerFusedLinearORPOLoss(torch.nn.Module):
@@ -88,17 +92,20 @@ def __init__(
         beta: float = 0.1,
         compute_nll_loss: bool = True,
         compiled: bool = True,
+        softcap: Optional[float] = None,
     ):
         """
         Args:
             ignore_index (int): Index to ignore in the loss.
             beta (float): Weight for the odds ratio loss.
+            softcap (Optional[float]): Soft-cap value for the logits. If not None, logits are bounded to (-softcap, +softcap) via softcap * tanh(logits / softcap); None disables soft-capping.
         """
         super().__init__()
         self.ignore_index = ignore_index
         self.beta = beta
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
+        self.softcap = softcap
 
     def forward(self, lin_weight, _input, target, bias=None):
         return LigerFusedLinearORPOFunction.apply(
@@ -110,4 +117,5 @@ def forward(self, lin_weight, _input, target, bias=None):
             self.beta,
             self.compute_nll_loss,
             self.compiled,
+            self.softcap,
         )
diff --git a/src/liger_kernel/chunked_loss/simpo_loss.py b/src/liger_kernel/chunked_loss/simpo_loss.py
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import torch
 import torch.nn.functional as F
 
@@ -62,6 +64,7 @@ def forward(
         compute_nll_loss=False,
         compiled=True,
         gamma=0.5,
+        softcap=None,
     ):
         return LigerFusedLinearPreferenceBase.forward(
             ctx,
@@ -77,6 +80,7 @@ def forward(
             label_smoothing=label_smoothing,
             compiled=compiled,
             gamma=gamma,
+            softcap=softcap,
         )
 
     @staticmethod
@@ -99,11 +103,13 @@ def __init__(
         compute_nll_loss: bool = True,
         compiled: bool = True,
         gamma: float = 0.5,
+        softcap: Optional[float] = None,
     ):
         """
         Args:
             ignore_index (int): Index to ignore in the loss.
             beta (float): Weight for the odds ratio loss.
+            softcap (Optional[float]): Soft-cap value for the logits. If not None, logits are bounded to (-softcap, +softcap) via softcap * tanh(logits / softcap); None disables soft-capping.
         """
         super().__init__()
         self.ignore_index = ignore_index
@@ -113,6 +119,7 @@ def __init__(
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
         self.gamma = gamma
+        self.softcap = softcap
 
     def forward(self, lin_weight, _input, target, bias=None):
         return LigerFusedLinearSimPOFunction.apply(
@@ -127,4 +134,5 @@ def forward(self, lin_weight, _input, target, bias=None):
             self.compute_nll_loss,
             self.compiled,
             self.gamma,
+            self.softcap,
         )