diff --git a/configs/scaling-0.3/1-4B.yml b/configs/scaling-0.3/1-4B.yml
new file mode 100644
index 000000000..d5b7f1863
--- /dev/null
+++ b/configs/scaling-0.3/1-4B.yml
@@ -0,0 +1,49 @@
+{  # "1-4B" model size for the scaling-0.3 sweep (slurm/scaling-0.3/scaling.sh lists "1-4B" entries).
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 1,
+
+  "num_layers": 24,
+  "hidden_size": 2048,  # 2x the 1024 in 410M.yml; layer and head counts are the same in both files
+  "num_attention_heads": 16,  # 2048 / 16 = 128 hidden units per head
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,  # same value as seq_length
+  "pos_emb": "rotary",
+  "rotary_pct": 0.25,  # presumably the fraction of each head's dims that get rotary embeddings - confirm
+  "no_weight_tying": true,
+  "gpt_j_residual": true,
+  "output_layer_parallelism": "column",
+
+  "attention_config": [[["flash"], 24]],  # 24 matches num_layers; presumably "flash" for every layer - confirm
+
+  "scaled_upper_triang_masked_softmax_fusion": true,
+  "bias_gelu_fusion": true,
+
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.0002,  # 10x the min_lr set below
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8
+    }
+  },
+  "min_lr": 0.00002,  # lr / 10; presumably the floor of the LR schedule - confirm
+
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 500000000,  # 5e8, same as reduce_bucket_size
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": true,
+    "cpu_offload": false
+  },
+
+  "train_micro_batch_size_per_gpu": 64,
+  "gas": 2,  # presumably gradient accumulation steps (64 x 2 = 128 sequences per GPU per step) - confirm
+  "data_impl": "mmap",
+  "num_workers": 1
+}
diff --git a/configs/scaling-0.3/410M.yml b/configs/scaling-0.3/410M.yml
new file mode 100644
index 000000000..0026085ac
--- /dev/null
+++ b/configs/scaling-0.3/410M.yml
@@ -0,0 +1,49 @@
+{  # "410M" model size for the scaling-0.3 sweep (slurm/scaling-0.3/scaling.sh lists "410M" entries).
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 1,
+
+  "num_layers": 24,
+  "hidden_size": 1024,  # half the 2048 in 1-4B.yml; layer and head counts are the same in both files
+  "num_attention_heads": 16,  # 1024 / 16 = 64 hidden units per head
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,  # same value as seq_length
+  "pos_emb": "rotary",
+  "rotary_pct": 0.25,  # presumably the fraction of each head's dims that get rotary embeddings - confirm
+  "no_weight_tying": true,
+  "gpt_j_residual": true,
+  "output_layer_parallelism": "column",
+
+  "attention_config": [[["flash"], 24]],  # 24 matches num_layers; presumably "flash" for every layer - confirm
+
+  "scaled_upper_triang_masked_softmax_fusion": true,
+  "bias_gelu_fusion": true,
+
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.0003,  # 10x the min_lr set below; 1-4B.yml uses 0.0002
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8
+    }
+  },
+  "min_lr": 0.00003,  # lr / 10; presumably the floor of the LR schedule - confirm
+
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 500000000,  # 5e8, same as reduce_bucket_size
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": true,
+    "cpu_offload": false
+  },
+
+  "train_micro_batch_size_per_gpu": 64,
+  "gas": 2,  # presumably gradient accumulation steps (64 x 2 = 128 sequences per GPU per step) - confirm
+  "data_impl": "mmap",
+  "num_workers": 1
+}
diff --git a/slurm/scaling-0.3/scaling.sh b/slurm/scaling-0.3/scaling.sh
index 88bf2b7d0..28410daa6 100755
--- a/slurm/scaling-0.3/scaling.sh
+++ b/slurm/scaling-0.3/scaling.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #SBATCH --job-name="eleutherscaling"
-#SBATCH --array=1
+#SBATCH --array=0-15
 # #SBATCH --account=dw87
 #SBATCH --comment="eleutherai"
 #SBATCH --qos=dw87
@@ -20,8 +20,10 @@
 
 # parameters, steps, warmup steps, eval interval
 declare -a args=(
-    "70M,1024,100,256"
-    "160M,2048,200,512"
+    "1-4B,4096,400,1024" "1-4B,8192,800,1024" "1-4B,12288,1000,1024" "1-4B,16384,1000,1024"  # warmup stays at 1000 for the two longest runs
+    "410M,2048,200,512" "410M,3072,300,512" "410M,4096,400,512" "410M,6144,600,512"
+    "160M,1536,150,512" "160M,2048,200,512" "160M,3072,300,512" "160M,4096,400,512"
+    "70M,1024,100,512" "70M,1536,150,512" "70M,2048,200,512" "70M,3072,300,512"  # 16 entries in total, one per index in --array=0-15
 )
 export SAVE_BASE_DIR="/home/za2514/compute/scaling/saved-weights/scaling-0.3"