Merge branch 'main' into infinite_lr
Quentin-Anthony authored Sep 7, 2024
2 parents e217ed7 + 7548a8b commit 3815242
Showing 42 changed files with 1,781 additions and 158 deletions.
@@ -1,3 +1,5 @@
# This file is hidden (.cpu_cpi_on_pr.yml) to minimize the number of runner minutes consumed.

name: "Pull Request CPU Tests"

on:
@@ -7,7 +9,7 @@ on:

jobs:
run-tests:
runs-on: [ 'test', 'self-hosted' ]
runs-on: ubuntu-22.04 # ubuntu-latest currently points to ubuntu-22.04 but 24.04 is in beta - recommend testing on 24.04 and then changing instead of using ubuntu-latest
steps:
- name: Checkout Repository
uses: actions/checkout@v4
5 changes: 3 additions & 2 deletions .github/workflows/coverity_scan.yml
@@ -17,9 +17,10 @@ jobs:
runs-on: ubuntu-latest

env:
COV_USER: ${{ secrets.COV_USER }}
COV_USER: ${{ secrets.COV_USER }} # needs to be an email with access to the Coverity stream - add to secrets/actions
COVERITY_PROJECT: ${{ secrets.COVERITY_PROJECT }}
COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }}
COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} # you can get this token from Coverity stream dashboard:
# https://scan.coverity.com/projects/<project>?tab=project_settings

steps:
- uses: actions/checkout@v2
2 changes: 1 addition & 1 deletion .github/workflows/cpu_ci.yml
@@ -5,7 +5,7 @@ on: "push"
jobs:
run-tests:
#runs-on: ubuntu-latest
runs-on: [ 'test', 'self-hosted' ]
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3

2 changes: 1 addition & 1 deletion .github/workflows/cpu_ci_dispatch.yml
@@ -10,7 +10,7 @@ on:

jobs:
run-tests:
runs-on: [ 'test', 'self-hosted' ]
runs-on: ubuntu-22.04
steps:
- name: Checkout Repository
uses: actions/checkout@v4
19 changes: 15 additions & 4 deletions .github/workflows/pull_request.yml
@@ -1,6 +1,7 @@
name: Pull Request

on: [pull_request]
#on: [pull_request, workflow_dispatch]
on: workflow_dispatch

jobs:
pre-commit:
Expand All @@ -9,7 +10,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v4
with:
python-version: 3.10
python-version: "3.10.14"
cache: "pip"
cache-dependency-path: "**/requirements*.txt"
# Need the right version of clang-format
@@ -40,10 +41,20 @@ jobs:
git commit -m "Update NeoXArgs docs automatically"
git push
run-tests:
runs-on: self-hosted
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v4
with:
python-version: "3.10.13"
cache-dependency-path: "**/requirements*.txt"
- name: prepare data
run: python prepare_data.py
run: python3 prepare_data.py
- name: install pytest
run: python3 -m pip install pytest pytest-forked pyyaml requests wandb
- name: install torch
run: python3 -m pip install torch
- name: install requirements
run: pip install -r requirements/requirements.txt
- name: Run Tests
run: pytest --forked tests
32 changes: 27 additions & 5 deletions README.md
@@ -96,7 +96,6 @@ To install the remaining basic dependencies, run:
pip install -r requirements/requirements.txt
pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB
pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard
python ./megatron/fused_kernels/setup.py install # optional, if using fused kernels
```

from the repository root.
Expand All @@ -106,6 +105,16 @@ from the repository root.
</aside>

### Fused Kernels
We now support AMD GPUs (MI100, MI250X) via JIT compilation of fused kernels. Fused kernels are built and loaded as needed. To avoid waiting at job launch, you can also pre-build them manually:

```python
# start a Python interpreter (e.g. run `python` from the repository root), then:
from megatron.fused_kernels import load
load()
```
This automatically adapts the build process to the GPU vendor (AMD, NVIDIA) without platform-specific code changes. To further test the fused kernels with `pytest`, run `pytest tests/model/test_fused_kernels.py`

### Flash Attention

To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details.
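
As a hedged illustration (the exact key name and layer-group syntax should be verified against `configs/` and `neox_arguments.md`), selecting Flash Attention for every layer might look like:

```yaml
# Illustrative sketch only -- verify key name and syntax against configs/ and neox_arguments.md.
{
  # use the Flash Attention kernel for all transformer layers
  "attention_config": [[["flash"], "all"]],
}
```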
@@ -640,7 +649,7 @@ If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher,
# Profiling
We support profiling with Nsight Systems and PyTorch Memory Profiling.
We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling.
## Nsight Systems Profiling
@@ -656,6 +665,15 @@ The generated output file can then be viewed with the Nsight Systems GUI:
![Alt text](images/nsight_profiling.png)
## PyTorch Profiling
To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop`.
The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within
TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
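A minimal sketch of how these options might appear in a training config (the step values are arbitrary examples, not recommendations):

```yaml
# Illustrative profiler settings -- option names are from the text above,
# the step values are placeholders.
{
  "profile": true,
  "profile_step_start": 10,
  "profile_step_stop": 12,
}
```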
![Alt text](images/pytorch_profiling.png)
## PyTorch Memory Profiling
To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`.
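A similarly hedged sketch of the memory-profiling options (the output path is a hypothetical placeholder):

```yaml
# Illustrative memory-profiling settings -- option names from the text above,
# the path is a placeholder, not a required location.
{
  "memory_profiling": true,
  "memory_profiling_path": "./memory_profiles",
}
```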
@@ -718,7 +736,7 @@ The following publications by other research groups use this library:
The following models were trained using this library:
### English LLMs
- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b), [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia), and [LLeMMA (34B)](https://arxiv.org/abs/2310.10631)
- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) and [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia)
- CarperAI's [FIM-NeoX-1.3B](https://huggingface.co/CarperAI/FIM-NeoX-1.3B)
- StabilityAI's [StableLM (3B and 7B)](https://github.com/Stability-AI/StableLM)
- Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1)
@@ -729,25 +747,29 @@ The following models were trained using this library:
### Non-English LLMs
- EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean)
- Korea University's [KULLM-Polyglot (5.8B and 12.8B)](https://github.com/nlpai-lab/KULLM) (Korean)
- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b)
- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) (Japanese)
- LearnItAnyway's [LLaVA-Polyglot-Ko (1.3B)](https://huggingface.co/LearnItAnyway/llava-polyglot-ko-1.3b-hf) (Korean)
- Rinna Co.'s [japanese-gpt-neox-3.6b](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) (Japanese) and [bilingual-gpt-neox-4b](https://huggingface.co/rinna/bilingual-gpt-neox-4b) (English / Japanese)
- CyberAgent's [Open-CLM (125M through 7B)](https://huggingface.co/cyberagent/open-calm-7b) (Japanese)
- The Hungarian Research Centre for Linguistics's [PULI GPTrio (6.7B)](https://huggingface.co/NYTK/PULI-GPTrio) (Hungarian / English / Chinese)
- The University of Tokyo's [weblab-10b](https://huggingface.co/Kojima777/weblab-10b) and [weblab-10b-instruct](https://huggingface.co/Kojima777/weblab-10b-instruction-sft) (Japanese)
- nolando.ai's [Hi-NOLIN (9B)](https://blog.nolano.ai/Hi-NOLIN/) (English, Hindi)
- Renmin University of China's [YuLan (12B)](https://huggingface.co/yulan-team/YuLan-Base-12b) (English, Chinese)
- The Basque Center for Language Technology's [Latxa (70B)](https://huggingface.co/HiTZ/latxa-70b-v1.2) (Basque)
### Code Models
- Carnegie Mellon University's [PolyCoder (160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM (2.7B)](https://huggingface.co/nikitharao/catlm)
- StabilityAI's [StableCode (1.3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding) and [StableCode-Completion-Alpha (3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding)
- CodeFuse AI's [CodeFuse (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B)
### AI for Science
- EleutherAI's [LLeMMA (34B)](https://arxiv.org/abs/2310.10631)
- Oak Ridge National Lab's [FORGE (26B)](https://github.com/at-aaims/forge)
- Oak Ridge National Lab and EleutherAI's [Unnamed Material Science Domain Models (7B)](https://github.com/at-aaims/forge)
- Oak Ridge National Lab's [Unnamed Material Science Domain Models (7B)](https://arxiv.org/abs/2402.00691)
- Pacific Northwest National Lab's [MolJet (undisclosed size)](https://openreview.net/pdf?id=7UudBVsIrr)
### Other Modalities
- Rinna Co.'s [PSLM (7B)](https://arxiv.org/abs/2406.12428) (speech / text)
- University College London's [ChessGPT-3B](https://huggingface.co/Waterhorse/chessgpt-base-v1)
- Gretel's [Text-to-Table (3B)](https://huggingface.co/gretelai/text2table)
2 changes: 1 addition & 1 deletion configs/README.md
@@ -9,7 +9,7 @@ Below is an example configuration `.yaml` to train a ~160M parameter GPT model.

For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md)

Note: yaml arguments may be formatted with either '-' or '_'. The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future.
Note: yaml arguments may be formatted with either '-' or '\_'. The standard separator used is a '\_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future.
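
For example, the two spellings below refer to the same argument (using a key that appears in this repository's configs; the underscore form is preferred):

```yaml
{
  # preferred underscore form
  "train_micro_batch_size_per_gpu": 4,
  # equivalent dash form; may be deprecated in the future
  # "train-micro-batch-size-per-gpu": 4,
}
```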
```yaml
# GPT-3 pretraining setup
{
68 changes: 67 additions & 1 deletion configs/mamba/mamba-1.4B.yml
@@ -19,5 +19,71 @@
"mamba_inner_func_fusion": true, # supersedes scan or conv fusion
"activation": "silu",

"output_layer_init_method": "single_residual_scaled_normal",
# init methods
"init_method": "small_init",
"output_layer_init_method": "single_residual_scaled_normal",

# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0002,
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 0.00002,

# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
"allgather_bucket_size": 500000000,
"overlap_comm": True,
"reduce_scatter": True,
"reduce_bucket_size": 500000000,
"contiguous_gradients": True,
},

# batch / data settings
"train_micro_batch_size_per_gpu": 4,
"data_impl": "mmap",

# activation checkpointing
"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,

# regularization
"gradient_clipping": 1.0,
"weight_decay": 0.1,
"hidden_dropout": 0,
"attention_dropout": 0,

# precision settings
"fp16": {
"fp16": true,
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},

# misc. training settings
"train_iters": 320000,
"lr_decay_iters": 320000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 10000,
"eval_interval": 1000,
"eval_iters": 10,

# logging
"log_interval": 1,
"steps_per_print": 10,
"keep_last_n_checkpoints": 4,
"wall_clock_breakdown": true,
}
69 changes: 67 additions & 2 deletions configs/mamba/mamba-130M.yml
@@ -19,5 +19,70 @@
"mamba_inner_func_fusion": true, # supersedes scan or conv fusion
"activation": "silu",

"output_layer_init_method": "single_residual_scaled_normal",
}
# init methods
"init_method": "small_init",
"output_layer_init_method": "single_residual_scaled_normal",


# optimizer settings
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.0006,
"betas": [0.9, 0.95],
"eps": 1.0e-8,
}
},
"min_lr": 0.00006,

# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
"zero_optimization": {
"stage": 1,
"allgather_partitions": True,
"allgather_bucket_size": 500000000,
"overlap_comm": True,
"reduce_scatter": True,
"reduce_bucket_size": 500000000,
"contiguous_gradients": True,
},

# batch / data settings
"train_micro_batch_size_per_gpu": 4,
"data_impl": "mmap",

# activation checkpointing
"checkpoint_activations": true,
"checkpoint_num_layers": 1,
"partition_activations": true,
"synchronize_each_layer": true,

# regularization
"gradient_clipping": 1.0,
"weight_decay": 0.1,
"hidden_dropout": 0.0,
"attention_dropout": 0.0,

# precision settings
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},

# misc. training settings
"train_iters": 320000,
"lr_decay_iters": 320000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"checkpoint_factor": 10000,
"eval_interval": 1000,
"eval_iters": 10,

# logging
"log_interval": 100,
"steps_per_print": 10,
"keep_last_n_checkpoints": 4,
"wall_clock_breakdown": true,