From e5a7ea71e96eeada636c9612036dc85e886d973d Mon Sep 17 00:00:00 2001
From: Stella Biderman
Date: Tue, 26 Dec 2023 17:48:11 -0500
Subject: [PATCH] Update neox_args.py (#1107)

* Update neox_args.py

Changed some default values to correspond to values that we generally
recommend people use.

* Update NeoXArgs docs automatically

---------

Co-authored-by: github-actions
---
 configs/neox_arguments.md            | 12 ++++++------
 megatron/neox_arguments/neox_args.py | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 0c0f88e5b..14cb3a605 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-  Default = 79befef
+  Default = 31cb364
 
   current git hash of repository
 
@@ -1143,7 +1143,7 @@ Training Arguments
 
 - **weighted_sampler_alpha**: float
 
-  Default = 0.3
+  Default = 1.0
 
   Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True.
 
@@ -1373,7 +1373,7 @@ Training Arguments
 
 - **attention_dropout**: float
 
-  Default = 0.1
+  Default = 0.0
 
   Post attention dropout probability.
 
@@ -1381,7 +1381,7 @@ Training Arguments
 
 - **hidden_dropout**: float
 
-  Default = 0.1
+  Default = 0.0
 
   Dropout probability for hidden state transformer.
 
@@ -1389,7 +1389,7 @@ Training Arguments
 
 - **weight_decay**: float
 
-  Default = 0.01
+  Default = 0.1
 
   Weight decay coefficient for L2 regularization.
 
@@ -1470,7 +1470,7 @@ Training Arguments
 
 - **clip_grad**: float
 
-  Default = None
+  Default = 1.0
 
   Gradient clipping based on global L2 norm.
 
diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py
index 2cfed465d..e025f6f8a 100644
--- a/megatron/neox_arguments/neox_args.py
+++ b/megatron/neox_arguments/neox_args.py
@@ -774,7 +774,7 @@ class NeoXArgsTraining(NeoXArgsTemplate):
     See https://arxiv.org/abs/1911.02116 for more details
     """
 
-    weighted_sampler_alpha: float = 0.3
+    weighted_sampler_alpha: float = 1.0
     """
     Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True.
@@ -923,17 +923,17 @@ class NeoXArgsTraining(NeoXArgsTemplate):
     Exit the program after the iteration is divisible by this value.
     """
 
-    attention_dropout: float = 0.1
+    attention_dropout: float = 0.0
     """
     Post attention dropout probability.
     """
 
-    hidden_dropout: float = 0.1
+    hidden_dropout: float = 0.0
     """
     Dropout probability for hidden state transformer.
     """
 
-    weight_decay: float = 0.01
+    weight_decay: float = 0.1
     """
     Weight decay coefficient for L2 regularization.
     """
@@ -982,7 +982,7 @@ class NeoXArgsTraining(NeoXArgsTemplate):
     gas: int = None
     """gradient_accumulation_steps"""  # TODO this is a duplicate, remove?
 
-    clip_grad: float = None
+    clip_grad: float = 1.0
     """
     Gradient clipping based on global L2 norm.
     """
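
A minimal sketch of how the new recommended defaults could be spot-checked after applying this patch. It assumes NeoXArgsTraining is a plain dataclass whose fields all carry defaults, so it can be instantiated standalone; the import path and field names come from the diff above.

    # Hypothetical check of the updated defaults; assumes NeoXArgsTraining
    # can be constructed with no arguments (every field has a default).
    from megatron.neox_arguments.neox_args import NeoXArgsTraining

    args = NeoXArgsTraining()
    assert args.weighted_sampler_alpha == 1.0
    assert args.attention_dropout == 0.0
    assert args.hidden_dropout == 0.0
    assert args.weight_decay == 0.1
    assert args.clip_grad == 1.0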