topK training + kernels (tested)
haileyschoelkopf committed Jun 17, 2024
1 parent 43c1a53 commit f1e4dba
Showing 5 changed files with 1,063 additions and 22 deletions.
107 changes: 107 additions & 0 deletions configs/sparse-training/410M_topk.yml
@@ -0,0 +1,107 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 24,
"hidden_size": 1024,
"num_attention_heads": 16,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"activation": "sqrelu",

"attention_config": [[["flash"], 24]],

# "scaled_upper_triang_masked_softmax_fusion": true,
# "bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0003,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.00003,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 16,
"gradient_accumulation_steps": 2,
"data_impl": "mmap",
"num_workers": 1,

"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,

"gradient_clipping": 1.0,
"weight_decay": 0.1,
"hidden_dropout": 0,
"attention_dropout": 0,

"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 12,
"hysteresis": 2,
"min_loss_scale": 1
},

"train_iters": 143000,
"lr_decay_iters": 143000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 1000,
# "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
"eval_interval": 100,
"eval_iters": 50,

"log_interval": 10,
"steps_per_print": 10,
"wall_clock_breakdown": true,

# "save": "/mnt/hdd-0/tiny-pythia/ckpts/pythia-14m",
#"load": "/mnt/hdd-0/tiny-pythia/ckpts/pythia-14m",

"use_wandb": true,
"wandb_group": "16sparsity-topk-sqrelu-410m",
"wandb_team": "schoelkopf",
"wandb_project": "sparse-is-enough",

"log-interval": 10,
"steps_per_print": 10,
"wall_clock_breakdown": true,

"data_path": "/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document",
# "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
# "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],

"tokenizer-type": "HFTokenizer",
"vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json",

"use_topk_ffn": true,
"topk_ffn_k": 256
}
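
The last two keys are what this config enables: use_topk_ffn switches the MLP to a top-k-sparse forward pass, and topk_ffn_k sets how many hidden activations survive per token. With hidden_size 1024 and the usual 4x FFN expansion, k = 256 keeps 256 of 4096 units, i.e. 1/16 of them, consistent with the "16sparsity" wandb group name. As a rough illustration only (the actual module and kernels added by this commit live in the files not shown here, and the names below are hypothetical), a hard top-k mask over a squared-ReLU MLP can be written as:

    import torch
    import torch.nn.functional as F

    def topk_ffn_forward(x, w_in, w_out, k=256):
        # Illustrative sketch, not the commit's actual module.
        # x: [batch, seq, hidden]; w_in: [hidden, ffn_dim]; w_out: [ffn_dim, hidden]
        h = F.relu(x @ w_in) ** 2                        # "sqrelu" activation, as in the config
        vals, idx = h.topk(k, dim=-1)                    # k largest activations per token
        mask = torch.zeros_like(h).scatter_(-1, idx, 1.0)
        return (h * mask) @ w_out                        # only k rows of w_out contribute per token

The dense mask-and-matmul above only states the reference semantics; a dedicated kernel can instead gather the k surviving rows of w_out per token, which is presumably where the "kernels" in the commit title come in.
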
107 changes: 107 additions & 0 deletions configs/sparse-training/410M_topk_baseline.yml
@@ -0,0 +1,107 @@
{
"pipe_parallel_size": 1,
"model_parallel_size": 1,

"num_layers": 24,
"hidden_size": 1024,
"num_attention_heads": 16,
"seq_length": 2048,
"max_position_embeddings": 2048,
"pos_emb": "rotary",
"rotary_pct": 0.25,
"no_weight_tying": true,
"gpt_j_residual": true,
"output_layer_parallelism": "column",

"activation": "sqrelu",

"attention_config": [[["flash"], 24]],

# "scaled_upper_triang_masked_softmax_fusion": true,
# "bias_gelu_fusion": true,

"init_method": "small_init",
"output_layer_init_method": "wang_init",

"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0003,
"betas": [0.9, 0.95],
"eps": 1.0e-8
}
},
"min_lr": 0.00003,

"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 500000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 500000000,
"contiguous_gradients": true,
"cpu_offload": false
},

"train_micro_batch_size_per_gpu": 16,
"gradient_accumulation_steps": 2,
"data_impl": "mmap",
"num_workers": 1,

"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,

"gradient_clipping": 1.0,
"weight_decay": 0.1,
"hidden_dropout": 0,
"attention_dropout": 0,

"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 12,
"hysteresis": 2,
"min_loss_scale": 1
},

"train_iters": 143000,
"lr_decay_iters": 143000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 1000,
# "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
"eval_interval": 100,
"eval_iters": 50,

"log_interval": 10,
"steps_per_print": 10,
"wall_clock_breakdown": true,

# "save": "/mnt/hdd-0/tiny-pythia/ckpts/pythia-14m",
#"load": "/mnt/hdd-0/tiny-pythia/ckpts/pythia-14m",

"use_wandb": true,
"wandb_group": "baseline-sqrelu-410m-bs16-accum2",
"wandb_team": "schoelkopf",
"wandb_project": "sparse-is-enough",

"log-interval": 10,
"steps_per_print": 10,
"wall_clock_breakdown": true,

"data_path": "/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document",
# "valid-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],
# "test-data-paths": ["/mnt/ssd-2/pile_deduped/pile_20B_tokenizer_text_document"],

"tokenizer-type": "HFTokenizer",
"vocab-file": "/mnt/ssd-2/pile/20B_tokenizer.json",

# "use_topk_ffn": true,
# "topk_ffn_k": 32
}
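
The baseline config is identical apart from the wandb group name and the two top-k keys being left commented out, so the pair gives a like-for-like comparison at the same batch size, schedule, and data. Assuming the repository's usual launcher, either run would be started with something like:

    python ./deepy.py train.py configs/sparse-training/410M_topk.yml
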
[3 additional changed files not loaded in this view]
