diff --git a/configs/scaling-0.3/1-4B.yml b/configs/scaling-0.3/1-4B.yml new file mode 100644 index 000000000..d5b7f1863 --- /dev/null +++ b/configs/scaling-0.3/1-4B.yml @@ -0,0 +1,49 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + "num_layers": 24, + "hidden_size": 2048, + "num_attention_heads": 16, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention_config": [[["flash"], 24]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.00002, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 64, + "gas": 2, + "data_impl": "mmap", + "num_workers": 1 +} diff --git a/configs/scaling-0.3/410M.yml b/configs/scaling-0.3/410M.yml new file mode 100644 index 000000000..0026085ac --- /dev/null +++ b/configs/scaling-0.3/410M.yml @@ -0,0 +1,49 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + + "num_layers": 24, + "hidden_size": 1024, + "num_attention_heads": 16, + "seq_length": 2048, + "max_position_embeddings": 2048, + "pos_emb": "rotary", + "rotary_pct": 0.25, + "no_weight_tying": true, + "gpt_j_residual": true, + "output_layer_parallelism": "column", + + "attention_config": [[["flash"], 24]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": true, + + "init_method": "small_init", + "output_layer_init_method": "wang_init", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0003, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.00003, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 500000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 500000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 64, + "gas": 2, + "data_impl": "mmap", + "num_workers": 1 +} diff --git a/slurm/scaling-0.3/scaling.sh b/slurm/scaling-0.3/scaling.sh index 88bf2b7d0..28410daa6 100755 --- a/slurm/scaling-0.3/scaling.sh +++ b/slurm/scaling-0.3/scaling.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --job-name="eleutherscaling" -#SBATCH --array=1 +#SBATCH --array=0-15 # #SBATCH --account=dw87 #SBATCH --comment="eleutherai" #SBATCH --qos=dw87 @@ -20,8 +20,10 @@ # parameters, steps, warmup steps, eval interval declare -a args=( - "70M,1024,100,256" - "160M,2048,200,512" + "1-4B,4096,400,1024" "1-4B,8192,800,1024" "1-4B,12288,1000,1024" "1-4B,16384,1000,1024" + "410M,2048,200,512" "410M,3072,300,512" "410M,4096,400,512" "410M,6144,600,512" + "160M,1536,150,512" "160M,2048,200,512" "160M,3072,300,512" "160M,4096,400,512" + "70M,1024,100,512" "70M,1536,150,512" "70M,2048,200,512" "70M,3072,300,512" ) export SAVE_BASE_DIR="/home/za2514/compute/scaling/saved-weights/scaling-0.3"