From a3d4241a3a7be8f7b8be2c313688549629c46484 Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Mon, 23 Sep 2024 22:30:22 +0000 Subject: [PATCH 1/2] update args docs --- configs/neox_arguments.md | 339 +++++++++++++++++++++++++++-- megatron/fused_kernels/type_shim.h | 4 +- 2 files changed, 328 insertions(+), 15 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index d24b2b60a..698e28697 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 217b4c5 + Default = 62c9738a current git hash of repository @@ -133,6 +133,54 @@ Logging Arguments +- **use_comet**: bool + + Default = None + + Flag indicating if comet is to be used. + + + +- **comet_workspace**: Optional + + Default = None + + Comet workspace name, if not configured Comet Experiments will be created in the user configured default workspace. + + + +- **comet_project**: Optional + + Default = None + + Comet project name, if not configured Comet Experiments will be created in the Uncategorized Experiments project. + + + +- **comet_experiment_name**: Optional + + Default = None + + Custom name for the Comet experiment. If not provided, a random name is used. + + + +- **comet_tags**: Optional + + Default = None + + List of tags to attach to the created Comet Experiment. + + + +- **comet_others**: Optional + + Default = None + + Custom metadata to attach to the created Comet Experiment. + + + - **log_interval**: int Default = 100 @@ -281,9 +329,23 @@ Model Arguments Default = None - Transformer intermediate size. Currently only used for "mlp_type": "llama". + Transformer intermediate size. Default = 4h + + + +- **mlp_multiple_of**: int + + Default = 1 + + force mlp size to be a multiple of this value + + + +- **expansion_factor**: float - If not passed, will be set to a reasonable default. + Default = None + + Transformer intermediate size. 
Default = 4 @@ -351,6 +413,14 @@ Model Arguments +- **rmsnorm_fusion**: bool + + Default = False + + Use fused RMS norm kernel (if `norm` is `rmsnorm`). + + + - **use_qk_layernorm**: bool Default = False @@ -497,11 +567,19 @@ Model Arguments -- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] +- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu', 'reglu', 'swiglu', 'bilinear', 'glu'] Default = gelu - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "reglu", "swiglu", "bilinear", "glu"] + + + +- **use_flashattn_swiglu**: bool + + Default = False + + Use flash attention's version of swiglu @@ -681,13 +759,11 @@ Model Arguments -- **mlp_type**: str +- **use_bias_in_mlp**: bool - Default = regular + Default = True - Types: - regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) + If false, mlps will not have bias terms @@ -1091,7 +1167,15 @@ Text Generation arguments Default = None How to generate text/sample the model. - Options: `unconditional`, `input-file`, `interactive` + Options: `unconditional`, `input-file`, `interactive`, `precompute` + + + +- **precompute_model_name**: str + + Default = None + + Model name to use for saving precomputed logprobs @@ -1378,11 +1462,19 @@ Training Arguments -- **label_data_paths**: list +- **train_label_data_paths**: list Default = None - List of paths to label datasets (not shifted by 1 yet!). + List of paths to train label datasets (not shifted by 1 yet!). + + + +- **train_reward_data_paths**: list + + Default = None + + List of paths to train reward datasets @@ -1394,6 +1486,22 @@ Training Arguments +- **test_label_data_paths**: list + + Default = None + + List of paths to test label datasets (not shifted by 1 yet!). 
+ + + +- **test_reward_data_paths**: list + + Default = None + + List of paths to test reward datasets + + + - **valid_data_paths**: list Default = None @@ -1402,6 +1510,118 @@ Training Arguments +- **valid_label_data_paths**: list + + Default = None + + List of paths to validation label datasets (not shifted by 1 yet!). + + + +- **valid_reward_data_paths**: list + + Default = None + + List of paths to validation reward datasets + + + +- **pos_train_data_paths**: list + + Default = None + + + + + +- **neg_train_data_paths**: list + + Default = None + + List of paths to positive and negative training datasets. + + + +- **pos_train_label_data_paths**: list + + Default = None + + + + + +- **neg_train_label_data_paths**: list + + Default = None + + List of paths to positive and negative training label datasets (not shifted by 1 yet!). + + + +- **pos_valid_data_paths**: list + + Default = None + + + + + +- **neg_valid_data_paths**: list + + Default = None + + List of paths to positive and negative validation datasets. + + + +- **pos_valid_label_data_paths**: list + + Default = None + + + + + +- **neg_valid_label_data_paths**: list + + Default = None + + List of paths to positive and negative validation label datasets (not shifted by 1 yet!). + + + +- **pos_test_data_paths**: list + + Default = None + + + + + +- **neg_test_data_paths**: list + + Default = None + + List of paths to positive and negative test datasets. + + + +- **pos_test_label_data_paths**: list + + Default = None + + + + + +- **neg_test_label_data_paths**: list + + Default = None + + List of paths to positive and negative test label datasets (not shifted by 1 yet!). + + + - **train_data_weights**: list Default = None @@ -1469,6 +1689,99 @@ Training Arguments +- **pack_impl**: typing.Literal['packed', 'pack_until_overflow', 'unpacked'] + + Default = packed + + Packing implementation, can be one of "packed", "pack_until_overflow", or "unpacked". 
+ + warning: pack_until_overflow is very naive and will likely have issues with pretraining scale datasets + + + +- **dataset_impl**: typing.Literal['gpt2', 'pairwise'] + + Default = gpt2 + + Dataset implementation, can be one of "gpt2" or "pairwise" + + + +- **train_impl**: typing.Literal['normal', 'dpo', 'rm', 'kto'] + + Default = normal + + Training implementation, can be one of "normal", "dpo", "kto", or "rm" + + + +- **dpo_fp32**: bool + + Default = True + + Whether to cast logits to fp32 for DPO loss calculation. + + + +- **dpo_reference_free**: bool + + Default = False + + Whether to use reference-free DPO. + + + +- **dpo_beta**: float + + Default = 0.1 + + Beta value for DPO + + + +- **kto_fp32**: bool + + Default = True + + Whether to cast logits to fp32 for KTO loss calculation. + + + +- **kto_desirable_weight**: float + + Default = 1.0 + + Weight for desirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + + + +- **kto_undesirable_weight**: float + + Default = 1.0 + + Weight for undesirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + + + +- **kto_beta**: float + + Default = 0.1 + + Beta value for KTO + + + +- **allow_chopped**: bool + + Default = True + + WARNING: if your packing impl is packed, this is ignored. + + Allow chopped samples in the dataset. 
+ (e.g., if your sequence length is 1024 and you have a sample of length 1026, it will be chopped to 1024) + + + - **mmap_warmup**: bool Default = False diff --git a/megatron/fused_kernels/type_shim.h b/megatron/fused_kernels/type_shim.h index 6b1c8927d..c8dbd1f10 100644 --- a/megatron/fused_kernels/type_shim.h +++ b/megatron/fused_kernels/type_shim.h @@ -277,7 +277,7 @@ reduce_block_into_lanes(T* x, final = x[tid] + x[tid + 32]; else final = val; - // __SYNCWARP(); + // __SYNCWARP(); #pragma unroll for (int i = 16; i >= lanes; i >>= 1) @@ -321,7 +321,7 @@ reduce_block_into_lanes_max_op(T* x, final = fmaxf(fabsf(x[tid]), fabsf(x[tid + 32])); else final = val; - // __SYNCWARP(); + // __SYNCWARP(); #pragma unroll for (int i = 16; i >= lanes; i >>= 1) From 039df7d691cad519b106ca254c5a9ec83eb8ee61 Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Mon, 23 Sep 2024 22:32:40 +0000 Subject: [PATCH 2/2] undo pre-commit change --- megatron/fused_kernels/type_shim.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/fused_kernels/type_shim.h b/megatron/fused_kernels/type_shim.h index c8dbd1f10..6b1c8927d 100644 --- a/megatron/fused_kernels/type_shim.h +++ b/megatron/fused_kernels/type_shim.h @@ -277,7 +277,7 @@ reduce_block_into_lanes(T* x, final = x[tid] + x[tid + 32]; else final = val; - // __SYNCWARP(); + // __SYNCWARP(); #pragma unroll for (int i = 16; i >= lanes; i >>= 1) @@ -321,7 +321,7 @@ reduce_block_into_lanes_max_op(T* x, final = fmaxf(fabsf(x[tid]), fabsf(x[tid + 32])); else final = val; - // __SYNCWARP(); + // __SYNCWARP(); #pragma unroll for (int i = 16; i >= lanes; i >>= 1)