hparam tuning and scaling 0.8
zhangir-azerbayev committed Nov 24, 2023
1 parent ede55f4 commit a5b9a9f
Showing 36 changed files with 2,180 additions and 11 deletions.
47 changes: 47 additions & 0 deletions configs/hparam-0.1/1-4B_no-lr.yml
@@ -0,0 +1,47 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 24,
"hidden_size": 2048,
"num_attention_heads": 16,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 24]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 64,
"gas": 2,
"data_impl": "mmap",
"num_workers": 1
}
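A quick sanity check on the "1-4B" label, as a minimal Python sketch. The vocabulary size is not set in this file, so the ~50k GPT-NeoX tokenizer size below is an assumption:

# Rough parameter count for the 1-4B config above.
num_layers, hidden = 24, 2048
vocab = 50304  # assumed; the tokenizer/vocab is configured elsewhere

transformer = 12 * num_layers * hidden**2  # attention + MLP weight matrices
embeddings = 2 * vocab * hidden            # separate input/output embeddings ("no_weight_tying": true)

print(f"~{(transformer + embeddings) / 1e9:.2f}B parameters")  # ~1.41B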
49 changes: 49 additions & 0 deletions configs/hparam-0.1/160M.yml
@@ -0,0 +1,49 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 12,
"hidden_size": 768,
"num_attention_heads": 12,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 12]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0006,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.00006,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 64,
"gas": 2,
"data_impl": "mmap",
"num_workers": 1
}
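The batch settings are shared across every config in this sweep: 64 sequences per GPU per micro step, 2 gradient accumulation steps ("gas"), 2048 tokens per sequence. A small sketch of the implied tokens per optimizer step; the data-parallel world size is set by the launcher, not by this file, so dp=8 below is illustrative only:

micro_batch, gas, seq_len = 64, 2, 2048  # from the config above

def tokens_per_step(dp: int) -> int:
    # dp = number of data-parallel replicas (assumed, not in the file)
    return micro_batch * gas * dp * seq_len

print(tokens_per_step(dp=8))  # 2,097,152 tokens per optimizer step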
49 changes: 49 additions & 0 deletions configs/hparam-0.1/160M_100x-lr.yml
@@ -0,0 +1,49 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 12,
"hidden_size": 768,
"num_attention_heads": 12,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 12]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.06,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.006,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 64,
"gas": 2,
"data_impl": "mmap",
"num_workers": 1
}
49 changes: 49 additions & 0 deletions configs/hparam-0.1/160M_20x-lr.yml
@@ -0,0 +1,49 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 12,
"hidden_size": 768,
"num_attention_heads": 12,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 12]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.012,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.0012,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 64,
"gas": 2,
"data_impl": "mmap",
"num_workers": 1
}
49 changes: 49 additions & 0 deletions configs/hparam-0.1/160M_50x-lr.yml
@@ -0,0 +1,49 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 12,
"hidden_size": 768,
"num_attention_heads": 12,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 12]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.03,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.003,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 64,
"gas": 2,
"data_impl": "mmap",
"num_workers": 1
}
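The 20x, 50x, and 100x variants are identical to the 160M baseline except for lr and min_lr, and each keeps min_lr at one tenth of lr. A short sketch reproducing the swept values from the 6e-4 baseline (the 4x point corresponds to 160M_high-lr.yml further down):

base_lr = 6e-4  # the 160M.yml baseline
for mult in (4, 20, 50, 100):
    lr = base_lr * mult
    print(f"{mult}x: lr={lr:g}, min_lr={lr / 10:g}")
# 4x: lr=0.0024, min_lr=0.00024
# 20x: lr=0.012, min_lr=0.0012
# 50x: lr=0.03, min_lr=0.003
# 100x: lr=0.06, min_lr=0.006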
49 changes: 49 additions & 0 deletions configs/hparam-0.1/160M_alt-decay.yml
@@ -0,0 +1,49 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 12,
"hidden_size": 768,
"num_attention_heads": 12,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 12]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0006,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.00018,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 64,
"gas": 2,
"data_impl": "mmap",
"num_workers": 1
}
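160M_alt-decay.yml changes only the decay floor: min_lr is 1.8e-4, i.e. 0.3x the peak lr, where every other config here uses 0.1x. Assuming the usual cosine decay to min_lr (the schedule type is not set in these files, and warmup is ignored), a sketch of the shape this knob controls:

import math

def cosine_lr(step, total_steps, lr=6e-4, min_lr=1.8e-4):
    frac = 0.5 * (1 + math.cos(math.pi * step / total_steps))  # 1 -> 0
    return min_lr + (lr - min_lr) * frac

print(cosine_lr(0, 1000))     # 0.0006 at the start
print(cosine_lr(1000, 1000))  # 0.00018 at the end, vs 0.00006 in the baseline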
49 changes: 49 additions & 0 deletions configs/hparam-0.1/160M_high-lr.yml
@@ -0,0 +1,49 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 12,
"hidden_size": 768,
"num_attention_heads": 12,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 12]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0024,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.00024,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 64,
"gas": 2,
"data_impl": "mmap",
"num_workers": 1
}
47 changes: 47 additions & 0 deletions configs/hparam-0.1/160M_no-lr.yml
@@ -0,0 +1,47 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 12,
"hidden_size": 768,
"num_attention_heads": 12,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"attention_config": [[["flash"], 12]],

"scaled_upper_triang_masked_softmax_fusion": true,
"bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 64,
"gas": 2,
"data_impl": "mmap",
"num_workers": 1
}
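Both *_no-lr configs drop lr and min_lr entirely, presumably so the sweep driver can inject a per-run learning rate at launch (GPT-NeoX merges all of the YAML files passed to deepy.py, so an lr-only config can be layered on top). A hypothetical Python equivalent of that override, not part of this commit:

import yaml

with open("configs/hparam-0.1/160M_no-lr.yml") as f:
    cfg = yaml.safe_load(f)  # the JSON-style files above are valid YAML

lr = 6e-4 * 20  # e.g. the 20x point of the sweep
cfg["optimizer"]["params"]["lr"] = lr
cfg["min_lr"] = lr / 10

with open("configs/hparam-0.1/160M_20x-generated.yml", "w") as f:
    yaml.safe_dump(cfg, f)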