Commit 15fc6a2

Merge branch 'huggingface:main' into main
ischlag authored Jul 1, 2024
2 parents adde82a + ee785d6
Showing 4 changed files with 25 additions and 3 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/trufflehog.yml
@@ -0,0 +1,15 @@
on:
  push:

name: Secret Leaks

jobs:
  trufflehog:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@main
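
Because fetch-depth: 0 checks out the full git history, TruffleHog scans every commit reachable from the push for leaked credentials, not just the files in the latest tree.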
1 change: 1 addition & 0 deletions examples/llama/convert_weights.py
@@ -71,6 +71,7 @@ def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]:
"pretraining_tp": "pretraining_tp",
"rms_norm_eps": "rms_norm_eps",
"rope_scaling": "rope_scaling",
"rope_theta": "rope_theta",
"tie_word_embeddings": "tie_word_embeddings",
"use_cache": "use_cache",
"vocab_size": "vocab_size",
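
The new rope_theta entry keeps the RoPE base frequency from being dropped in conversion; models such as Llama 3 ship rope_theta = 500000 rather than the common default of 10000, so silently losing the key would break long-context behavior in the converted checkpoint. A minimal sketch of how a key mapping like this is typically applied (apply_mapping and the sample values are hypothetical; only get_config_mapping appears in this diff):

def apply_mapping(config: dict, mapping: dict[str, str]) -> dict:
    # Rename keys per the mapping; keys absent from the mapping are dropped.
    return {dst: config[src] for src, dst in mapping.items() if src in config}

hf_config = {"rope_theta": 500000.0, "rms_norm_eps": 1e-5, "vocab_size": 128256}
hf_to_nt = {"rope_theta": "rope_theta", "rms_norm_eps": "rms_norm_eps", "vocab_size": "vocab_size"}
print(apply_mapping(hf_config, hf_to_nt))
# -> {'rope_theta': 500000.0, 'rms_norm_eps': 1e-05, 'vocab_size': 128256}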
6 changes: 3 additions & 3 deletions src/nanotron/config/config.py
@@ -249,7 +249,7 @@ class LRSchedulerArgs:
     lr_warmup_steps: number of steps to warmup the learning rate
     lr_warmup_style: linear or constant
-    lr_decay_style: linear or cosine
+    lr_decay_style: linear, cosine or 1-sqrt
     min_decay_lr: minimum learning rate after decay
     lr_decay_steps: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps
     lr_decay_starting_step: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps
@@ -272,9 +272,9 @@ def __post_init__(self):
             self.lr_warmup_style = "linear"
         if self.lr_decay_style is None:
             self.lr_decay_style = "linear"
-        if self.lr_decay_style not in ["linear", "cosine"]:
+        if self.lr_decay_style not in ["linear", "cosine", "1-sqrt"]:
             raise ValueError(
-                f"lr_decay_style should be a string selected in ['linear', 'cosine'] and not {self.lr_decay_style}"
+                f"lr_decay_style should be a string selected in ['linear', 'cosine', '1-sqrt'] and not {self.lr_decay_style}"
             )
         if self.min_decay_lr is None:
             self.min_decay_lr = self.learning_rate
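
A sketch of selecting the new style when building a config; the field names come from the docstring above, but treating LRSchedulerArgs as a plain dataclass constructor is an assumption of this sketch:

from nanotron.config import LRSchedulerArgs

# "1-sqrt" would have raised ValueError in __post_init__ before this change.
scheduler_args = LRSchedulerArgs(
    learning_rate=3e-4,
    lr_warmup_steps=1_000,
    lr_warmup_style="linear",
    lr_decay_style="1-sqrt",
    min_decay_lr=0.0,
)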
6 changes: 6 additions & 0 deletions src/nanotron/helpers.py
@@ -146,6 +146,12 @@ def lr_lambda(current_step: int, initial_lr: float):
                 * (lr_decay_steps - (current_step - lr_decay_starting_step))
                 / lr_decay_steps
             )
+        elif lr_scheduler_args.lr_decay_style == "1-sqrt":
+            lmbda = (
+                lr_scheduler_args.min_decay_lr
+                + (initial_lr - lr_scheduler_args.min_decay_lr)
+                * (1 - math.sqrt((current_step - lr_decay_starting_step) / lr_decay_steps))
+            )
         else:
             raise ValueError(f"Unknown decay style {lr_scheduler_args.lr_decay_style}")

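
The 1-sqrt shape, used as a cooldown in recent warmup-stable-decay recipes, drops steeply at the start of the decay window and flattens near the end, since sqrt(t) grows fastest near t = 0; halfway through the window the learning rate has already covered about 71% of the gap down to min_decay_lr. A self-contained restatement of the formula above (one_sqrt_lr and the sample numbers are illustrative; warmup and edge handling are omitted):

import math

def one_sqrt_lr(current_step: int, initial_lr: float, min_decay_lr: float,
                lr_decay_starting_step: int, lr_decay_steps: int) -> float:
    # Fraction of the decay window elapsed, in [0, 1].
    progress = (current_step - lr_decay_starting_step) / lr_decay_steps
    return min_decay_lr + (initial_lr - min_decay_lr) * (1 - math.sqrt(progress))

for step in (0, 2_500, 5_000, 10_000):
    print(step, f"{one_sqrt_lr(step, 3e-4, 0.0, 0, 10_000):.2e}")
# 0 -> 3.00e-04, 2500 -> 1.50e-04, 5000 -> 8.79e-05, 10000 -> 0.00e+00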
