topK training + kernels (tested)
haileyschoelkopf committed Jun 17, 2024
1 parent 43c1a53 commit f1e4dba
Showing 5 changed files with 1,063 additions and 22 deletions.
107 changes: 107 additions & 0 deletions configs/sparse-training/410M_topk.yml
@@ -0,0 +1,107 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 24,
"hidden_size": 1024,
"num_attention_heads": 16,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"activation": "sqrelu",

"attention_config": [[["flash"], 24]],

# "scaled_upper_triang_masked_softmax_fusion": true,
# "bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0003,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.00003,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 16,
"gradient_accumulation_steps": 2,
"data_impl": "mmap",
"num_workers": 1,

"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,

"gradient_clipping": 1.0,
"weight_decay": 0.1,
"hidden_dropout": 0,
"attention_dropout": 0,

"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 12,
"hysteresis": 2,
"min_loss_scale": 1
},

"train_iters": 143000,
"lr_decay_iters": 143000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 1000,
# "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
"eval_interval": 100,
"eval_iters": 50,

"log_interval": 10,
"steps_per_print": 10,
"wall_clock_breakdown": true,

# "save": "/mnt/hdd-0/tiny-pythia/ckpts/pythia-14m",
#"load": "/mnt/hdd-0/tiny-pythia/ckpts/pythia-14m",

"use_wandb": true,
"wandb_group": "16sparsity-topk-sqrelu-410m",
"wandb_team": "schoelkopf",
"wandb_project": "sparse-is-enough",

"log-interval": 10,
"steps_per_print": 10,
"wall_clock_breakdown": true,

"data_path": "/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document",
# "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
# "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],

"tokenizer-type": "HFTokenizer",
"vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json",

"use_topk_ffn": true,
"topk_ffn_k": 256
}
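
The last two keys are what this config enables: use_topk_ffn switches the MLP to a top-k-sparse forward pass, and topk_ffn_k sets how many hidden activations survive per token. With hidden_size 1024 and the usual 4x FFN expansion, k = 256 keeps 256 of 4096 units, i.e. 1/16 of them, consistent with the "16sparsity" wandb group name. As a rough illustration only (the actual module and kernels added by this commit live in the files not shown here, and the names below are hypothetical), a hard top-k mask over a squared-ReLU MLP can be written as:

    import torch
    import torch.nn.functional as F

    def topk_ffn_forward(x, w_in, w_out, k=256):
        # Illustrative sketch, not the commit's actual module.
        # x: [batch, seq, hidden]; w_in: [hidden, ffn_dim]; w_out: [ffn_dim, hidden]
        h = F.relu(x @ w_in) ** 2                        # "sqrelu" activation, as in the config
        vals, idx = h.topk(k, dim=-1)                    # k largest activations per token
        mask = torch.zeros_like(h).scatter_(-1, idx, 1.0)
        return (h * mask) @ w_out                        # only k rows of w_out contribute per token

The dense mask-and-matmul above only states the reference semantics; a dedicated kernel can instead gather the k surviving rows of w_out per token, which is presumably where the "kernels" in the commit title come in.
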
107 changes: 107 additions & 0 deletions configs/sparse-training/410M_topk_baseline.yml
@@ -0,0 +1,107 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 24,
"hidden_size": 1024,
"num_attention_heads": 16,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"activation": "sqrelu",

"attention_config": [[["flash"], 24]],

# "scaled_upper_triang_masked_softmax_fusion": true,
# "bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0003,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.00003,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 16,
"gradient_accumulation_steps": 2,
"data_impl": "mmap",
"num_workers": 1,

"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,

"gradient_clipping": 1.0,
"weight_decay": 0.1,
"hidden_dropout": 0,
"attention_dropout": 0,

"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 12,
"hysteresis": 2,
"min_loss_scale": 1
},

"train_iters": 143000,
"lr_decay_iters": 143000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 1000,
# "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
"eval_interval": 100,
"eval_iters": 50,

"log_interval": 10,
"steps_per_print": 10,
"wall_clock_breakdown": true,

# "save": "/mnt/hdd-0/tiny-pythia/ckpts/pythia-14m",
#"load": "/mnt/hdd-0/tiny-pythia/ckpts/pythia-14m",

"use_wandb": true,
"wandb_group": "baseline-sqrelu-410m-bs16-accum2",
"wandb_team": "schoelkopf",
"wandb_project": "sparse-is-enough",

"log-interval": 10,
"steps_per_print": 10,
"wall_clock_breakdown": true,

"data_path": "/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document",
# "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
# "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],

"tokenizer-type": "HFTokenizer",
"vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json",

# "use_topk_ffn": true,
# "topk_ffn_k": 32
}
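
The baseline config is identical apart from the wandb group name and the two top-k keys being left commented out, so the pair gives a like-for-like comparison at the same batch size, schedule, and data. Assuming the repository's usual launcher, either run would be started with something like:

    python ./deepy.py train.py configs/sparse-training/410M_topk.yml
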
[3 additional changed files not loaded in this view]
