From e5a7ea71e96eeada636c9612036dc85e886d973d Mon Sep 17 00:00:00 2001
From: Stella Biderman
Date: Tue, 26 Dec 2023 17:48:11 -0500
Subject: [PATCH] Update neox_args.py (#1107)

* Update neox_args.py

Changed some default values to correspond to values that we generally
recommend people use.

* Update NeoXArgs docs automatically

---------

Co-authored-by: github-actions
---
 configs/neox_arguments.md            | 12 ++++++------
 megatron/neox_arguments/neox_args.py | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 0c0f88e5b..14cb3a605 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-  Default = 79befef
+  Default = 31cb364
 
   current git hash of repository
 
@@ -1143,7 +1143,7 @@ Training Arguments
 
 - **weighted_sampler_alpha**: float
 
-  Default = 0.3
+  Default = 1.0
 
   Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True.
 
@@ -1373,7 +1373,7 @@ Training Arguments
 
 - **attention_dropout**: float
 
-  Default = 0.1
+  Default = 0.0
 
   Post attention dropout probability.
 
@@ -1381,7 +1381,7 @@ Training Arguments
 
 - **hidden_dropout**: float
 
-  Default = 0.1
+  Default = 0.0
 
   Dropout probability for hidden state transformer.
 
@@ -1389,7 +1389,7 @@ Training Arguments
 
 - **weight_decay**: float
 
-  Default = 0.01
+  Default = 0.1
 
   Weight decay coefficient for L2 regularization.
 
@@ -1470,7 +1470,7 @@ Training Arguments
 
 - **clip_grad**: float
 
-  Default = None
+  Default = 1.0
 
   Gradient clipping based on global L2 norm.
 
diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py
index 2cfed465d..e025f6f8a 100644
--- a/megatron/neox_arguments/neox_args.py
+++ b/megatron/neox_arguments/neox_args.py
@@ -774,7 +774,7 @@ class NeoXArgsTraining(NeoXArgsTemplate):
     See https://arxiv.org/abs/1911.02116 for more details
     """
 
-    weighted_sampler_alpha: float = 0.3
+    weighted_sampler_alpha: float = 1.0
     """
     Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True.
@@ -923,17 +923,17 @@ class NeoXArgsTraining(NeoXArgsTemplate):
     Exit the program after the iteration is divisible by this value.
     """
 
-    attention_dropout: float = 0.1
+    attention_dropout: float = 0.0
     """
     Post attention dropout probability.
     """
 
-    hidden_dropout: float = 0.1
+    hidden_dropout: float = 0.0
     """
     Dropout probability for hidden state transformer.
     """
 
-    weight_decay: float = 0.01
+    weight_decay: float = 0.1
     """
     Weight decay coefficient for L2 regularization.
     """
@@ -982,7 +982,7 @@ class NeoXArgsTraining(NeoXArgsTemplate):
     gas: int = None
     """gradient_accumulation_steps"""  # TODO this is a duplicate, remove?
 
-    clip_grad: float = None
+    clip_grad: float = 1.0
     """
     Gradient clipping based on global L2 norm.
     """
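
A minimal sketch of how the new recommended defaults could be spot-checked after applying this patch. It assumes NeoXArgsTraining is a plain dataclass whose fields all carry defaults, so it can be instantiated standalone; the import path and field names come from the diff above.

    # Hypothetical check of the updated defaults; assumes NeoXArgsTraining
    # can be constructed with no arguments (every field has a default).
    from megatron.neox_arguments.neox_args import NeoXArgsTraining

    args = NeoXArgsTraining()
    assert args.weighted_sampler_alpha == 1.0
    assert args.attention_dropout == 0.0
    assert args.hidden_dropout == 0.0
    assert args.weight_decay == 0.1
    assert args.clip_grad == 1.0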