diff --git a/.github/workflows/cpu_ci_on_pr.yml b/.github/workflows/.cpu_ci_on_pr.yml similarity index 58% rename from .github/workflows/cpu_ci_on_pr.yml rename to .github/workflows/.cpu_ci_on_pr.yml index 971640c18..43ce025c0 100644 --- a/.github/workflows/cpu_ci_on_pr.yml +++ b/.github/workflows/.cpu_ci_on_pr.yml @@ -1,3 +1,5 @@ +# This file is hidden (.cpu_ci_on_pr.yml) to minimize the number of runner minutes consumed. + name: "Pull Request CPU Tests" on: @@ -7,7 +9,7 @@ on: jobs: run-tests: - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 # ubuntu-latest currently points to ubuntu-22.04, but 24.04 is in beta - recommend testing on 24.04 and then switching, rather than using ubuntu-latest steps: - name: Checkout Repository uses: actions/checkout@v4 diff --git a/.github/workflows/coverity_scan.yml b/.github/workflows/coverity_scan.yml index a79d0d8fb..128d279cc 100644 --- a/.github/workflows/coverity_scan.yml +++ b/.github/workflows/coverity_scan.yml @@ -17,9 +17,10 @@ jobs: runs-on: ubuntu-latest env: - COV_USER: ${{ secrets.COV_USER }} + COV_USER: ${{ secrets.COV_USER }} # needs to be an email with access to the Coverity stream - add to secrets/actions COVERITY_PROJECT: ${{ secrets.COVERITY_PROJECT }} - COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} + COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} # you can get this token from Coverity stream dashboard: + # https://scan.coverity.com/projects/?tab=project_settings steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/cpu_ci.yml b/.github/workflows/cpu_ci.yml index 9160fccab..6910b8a1c 100644 --- a/.github/workflows/cpu_ci.yml +++ b/.github/workflows/cpu_ci.yml @@ -5,7 +5,7 @@ on: "push" jobs: run-tests: #runs-on: ubuntu-latest - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/cpu_ci_dispatch.yml b/.github/workflows/cpu_ci_dispatch.yml index b1d108b3b..38485d6a6 100644 --- a/.github/workflows/cpu_ci_dispatch.yml +++ b/.github/workflows/cpu_ci_dispatch.yml @@ -10,7 +10,7 @@ on: jobs: run-tests: - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 steps: - name: Checkout Repository uses: actions/checkout@v4 diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 3213718df..7b06256bf 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1,6 +1,7 @@ name: Pull Request -on: [pull_request] +#on: [pull_request, workflow_dispatch] +on: workflow_dispatch jobs: pre-commit: @@ -9,7 +10,7 @@ jobs: - uses: actions/checkout@v2 - uses: actions/setup-python@v4 with: - python-version: 3.10 + python-version: "3.10.14" cache: "pip" cache-dependency-path: "**/requirements*.txt" # Need the right version of clang-format @@ -40,10 +41,20 @@ jobs: git commit -m "Update NeoXArgs docs automatically" git push run-tests: - runs-on: self-hosted + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: "3.10.13" + cache-dependency-path: "**/requirements*.txt" - name: prepare data - run: python prepare_data.py + run: python3 prepare_data.py + - name: install pytest + run: python3 -m pip install pytest pytest-forked pyyaml requests wandb + - name: install torch + run: python3 -m pip install torch + - name: install requirements + run: pip install -r requirements/requirements.txt - name: Run Tests run: pytest --forked tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7de35027a..249255306 100644 ---
a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: hooks: - id: codespell args: [ - '--ignore-words-list=reord,dout', # Word used in error messages that need rewording + '--ignore-words-list=reord,dout,te', # Words used in error messages that need rewording. te --> transformerengine --check-filenames, --check-hidden, ] diff --git a/README.md b/README.md index ef97cdc17..c4f2fc23a 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,21 @@ GPT-NeoX leverages many of the same features and technologies as the popular Meg * Cutting edge architectural innovations including rotary and alibi positional embeddings, parallel feedforward attention layers, and flash attention. * Predefined configurations for popular architectures including Pythia, PaLM, Falcon, and LLaMA 1 \& 2 * Curriculum Learning -* Easy connections with the open source ecosystem, including Hugging Face's [tokenizers](https://github.com/huggingface/tokenizers) and [transformers](https://github.com/huggingface/transformers/) libraries, logging via [WandB](https://wandb.ai/site), and evaluation via our [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). +* Easy connections with the open source ecosystem, including Hugging Face's [tokenizers](https://github.com/huggingface/tokenizers) and [transformers](https://github.com/huggingface/transformers/) libraries, experiment monitoring via [WandB](https://wandb.ai/site)/[Comet](https://www.comet.com/site/)/TensorBoard, and evaluation via our [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). ## News +**[9/9/2024]** We now support preference learning via [DPO](https://arxiv.org/abs/2305.18290), [KTO](https://arxiv.org/abs/2402.01306), and reward modeling + +**[9/9/2024]** We now support integration with [Comet ML](https://www.comet.com/site/), a machine learning monitoring platform + +**[5/21/2024]** We now support [RWKV](https://www.rwkv.com/) with pipeline parallelism! See the PRs for [RWKV](https://github.com/EleutherAI/gpt-neox/pull/1198) and [RWKV+pipeline](https://github.com/EleutherAI/gpt-neox/pull/1221) + +**[3/21/2024]** We now support Mixture-of-Experts (MoE) + +**[3/17/2024]** We now support AMD MI250X GPUs + +**[3/15/2024]** We now support [Mamba](https://github.com/state-spaces/mamba) with tensor parallelism! See [the PR](https://github.com/EleutherAI/gpt-neox/pull/1184) + **[8/10/2023]** We now support checkpointing with AWS S3! Activate with the `s3_path` config option (for more detail, see [the PR](https://github.com/EleutherAI/gpt-neox/pull/1010)) **[9/20/2023]** As of https://github.com/EleutherAI/gpt-neox/pull/1035, we have deprecated Flash Attention 0.x and 1.x, and migrated support to Flash Attention 2.x. We don't believe this will cause problems, but if you have a specific use-case that requires old flash support using the latest GPT-NeoX, please raise an issue. @@ -88,7 +100,7 @@ Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherA ### Host Setup -First make sure you are in an environment with Python 3.8 with an appropriate version of PyTorch 1.8 or later installed. **Note:** Some of the libraries that GPT-NeoX depends on have not been updated to be compatible with Python 3.10+. Python 3.9 appears to work, but this codebase has been developed and tested for Python 3.8. +This codebase has been primarily developed and tested for Python 3.8-3.10 and PyTorch 1.8-2.0.
This is not a strict requirement, and other versions and combinations of libraries may work. To install the remaining basic dependencies, run: @@ -96,6 +108,7 @@ To install the remaining basic dependencies, run: pip install -r requirements/requirements.txt pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard +pip install -r requirements/requirements-comet.txt # optional, if logging via Comet ``` from the repository root. @@ -294,7 +307,7 @@ You can then run any job you want from inside the container. Concerns when running for a long time or in detached mode include - You will have to terminate the container manually when you are no longer using it - If you want processes to continue running when your shell session ends, you will need to background them. - - If you then want logging, you will have to make sure to pipe logs to disk or set up wandb. + - If you then want logging, you will have to make sure to pipe logs to disk, and set up wandb and/or Comet logging. If you prefer to run the prebuilt container image from dockerhub, you can run the docker compose commands with ```-f docker-compose-dockerhub.yml``` instead, e.g., @@ -457,7 +470,7 @@ You can pass in an arbitrary number of configs which will all be merged at runti You can also optionally pass in a config prefix, which will assume all your configs are in the same folder and append that prefix to their path. -E.G: +For example: ```bash python ./deepy.py train.py -d configs 125M.yml local_setup.yml @@ -574,15 +587,28 @@ To convert from a Hugging Face model into a NeoX-loadable, run `tools/ckpts/conv # Monitoring -In addition to storing logs locally, we provide built-in support for two popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site) and [TensorBoard](https://www.tensorflow.org/tensorboard/) +In addition to storing logs locally, we provide built-in support for three popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site), [TensorBoard](https://www.tensorflow.org/tensorboard/), and [Comet](https://www.comet.com/site) ## Weights and Biases -EleutherAI is currently using [Weights & Biases to record our experiments](https://wandb.ai/eleutherai/neox). If you are logged into Weights & Biases on your machine—you can do this by executing `wandb login`—your runs will automatically be recorded. There are two optional fields associated with Weights & Biases: wandb_group allows you to name the run group and wandb_team allows you to assign your runs to an organization or team account. +[Weights & Biases](https://wandb.ai/site) is a machine learning monitoring platform that EleutherAI uses to [record our experiments](https://wandb.ai/eleutherai/neox). To use wandb to monitor your gpt-neox experiments: +1. Create an account at https://wandb.ai/site to generate your API key. +2. Log into Weights & Biases on your machine by executing `wandb login`; your runs will then be recorded automatically. +3. Dependencies required for wandb monitoring can be found in and installed from `./requirements/requirements-wandb.txt`. +4. There are two optional fields associated with Weights & Biases: `wandb_group` allows you to name the run group and `wandb_team` allows you to assign your runs to an organization or team account. An example config is provided in `./configs/local_setup_wandb.yml`.
## TensorBoard -We also support using TensorBoard via the tensorboard-dir field. Dependencies required for TensorBoard monitoring can be found in and installed from `./requirements/requirements-tensorboard.txt`. +We support using TensorBoard via the tensorboard-dir field. Dependencies required for TensorBoard monitoring can be found in and installed from `./requirements/requirements-tensorboard.txt`. + +## Comet + +[Comet](https://www.comet.com/site) is a machine learning monitoring platform. To use comet to monitor your gpt-neox experiments: +1. Create an account at https://www.comet.com/login to generate your API key. +2. Once generated, link your API key at runtime by running `comet login` or passing `export COMET_API_KEY=` +3. Install `comet_ml` and any dependency libraries via `pip install -r requirements/requirements-comet.txt` +4. Enable Comet with `use_comet: True`. You can also customize where data is being logged with `comet_workspace` and `comet_project`. A full example config with comet enabled is provided in `configs/local_setup_comet.yml`. +5. Run your experiment, and monitor metrics in the Comet workspace that you passed! # Running on multi-node @@ -594,7 +620,9 @@ We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memo ## Nsight Systems Profiling -To use the Nsight Systems profiling, set config options `profile`, `profile_step_start`, and `profile_step_stop`. Launch training with: +To use the Nsight Systems profiling, set config options `profile`, `profile_step_start`, and `profile_step_stop` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config). + +To populate nsys metrics, launch training with: ``` nsys profile -s none -t nvtx,cuda -o --force-overwrite true \ @@ -604,22 +632,22 @@ $TRAIN_PATH/train.py --conf_dir configs The generated output file can then by viewed with the Nsight Systems GUI: -![Alt text](images/nsight_profiling.png) +![nsight-prof](images/nsight_profiling.png) ## PyTorch Profiling -To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop`. +To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config). The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). -![Alt text](images/pytorch_profiling.png) +![torch-prof](images/pytorch_profiling.png) ## PyTorch Memory Profiling -To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`. +To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config). -![Alt text](images/memory_profiling.png) +![mem-prof](images/memory_profiling.png) View the generated profile with the [memory_viz.py](https://github.com/pytorch/pytorch/blob/main/torch/cuda/_memory_viz.py) script. 
Run with: @@ -677,7 +705,7 @@ The following publications by other research groups use this library: The following models were trained using this library: ### English LLMs -- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b), [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia), and [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) +- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) and [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia) - CarperAI's [FIM-NeoX-1.3B](https://huggingface.co/CarperAI/FIM-NeoX-1.3B) - StabilityAI's [StableLM (3B and 7B)](https://github.com/Stability-AI/StableLM) - Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1) @@ -688,13 +716,15 @@ The following models were trained using this library: ### Non-English LLMs - EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean) - Korea University's [KULLM-Polyglot (5.8B and 12.8B)](https://github.com/nlpai-lab/KULLM) (Korean) -- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) +- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) (Japanese) - LearnItAnyway's [LLaVA-Polyglot-Ko (1.3B)](https://huggingface.co/LearnItAnyway/llava-polyglot-ko-1.3b-hf) (Korean) - Rinna Co.'s [japanese-gpt-neox-3.6b](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) (Japanese) and [bilingual-gpt-neox-4b](https://huggingface.co/rinna/bilingual-gpt-neox-4b) (English / Japanese) - CyberAgent's [Open-CLM (125M through 7B)](https://huggingface.co/cyberagent/open-calm-7b) (Japanese) - The Hungarian Research Centre for Linguistics's [PULI GPTrio (6.7B)](https://huggingface.co/NYTK/PULI-GPTrio) (Hungarian / English / Chinese) - The University of Tokyo's [weblab-10b](https://huggingface.co/Kojima777/weblab-10b) and [weblab-10b-instruct](https://huggingface.co/Kojima777/weblab-10b-instruction-sft) (Japanese) - nolando.ai's [Hi-NOLIN (9B)](https://blog.nolano.ai/Hi-NOLIN/) (English, Hindi) +- Renmin University of China's [YuLan (12B)](https://huggingface.co/yulan-team/YuLan-Base-12b) (English, Chinese) +- The Basque Center for Language Technology's [Latxa (70B)](https://huggingface.co/HiTZ/latxa-70b-v1.2) (Basque) ### Code Models - Carnegie Mellon University's [PolyCoder (160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM (2.7B)](https://huggingface.co/nikitharao/catlm) - CodeFuse AI's [CodeFuse (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B) ### AI for Science +- EleutherAI's [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) - Oak Ridge National Lab's [FORGE (26B)](https://github.com/at-aaims/forge) -- Oak Ridge National Lab and EleutherAI's [Unnamed Material Science Domain Models (7B)](https://github.com/at-aaims/forge) +- Oak Ridge National Lab's [Unnamed Material Science Domain Models (7B)](https://arxiv.org/abs/2402.00691) - Pacific Northwest National Lab's [MolJet (undisclosed size)](https://openreview.net/pdf?id=7UudBVsIrr) ### Other Modalities +- Rinna Co.'s [PSLM (7B)](https://arxiv.org/abs/2406.12428) (speech / text) - University College London's [ChessGPT-3B](https://huggingface.co/Waterhorse/chessgpt-base-v1) - Gretel's [Text-to-Table (3B)](https://huggingface.co/gretelai/text2table) diff --git a/configs/README.md b/configs/README.md index d8ae81739..ac20ed89b
100644 --- a/configs/README.md +++ b/configs/README.md @@ -9,7 +9,7 @@ Below is an example configuration `.yaml` to train a ~160M parameter GPT model. For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md) -Note: yaml arguments may be formatted with either '-' or '_'. The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. +Note: yaml arguments may be formatted with either '-' or '\_'. The standard separator used is a '\_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. ```yaml # GPT-3 pretraining setup { @@ -124,6 +124,8 @@ These can be set to any integer between `0` and `num_gpus`, and `num_gpus` must # this should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": false, "train_iters": 320000, + # alternatively, use train_epochs to automatically determine the number of training iterations + #"train_epochs": 1, ``` An example of some basic settings used to configure your model's architecture and number of training steps. @@ -235,6 +237,58 @@ Additional DeepSpeed settings besides those mentioned above should be wrapped in "eval_iters": 10, ``` +For KTO style training, you'll need to add the reward & label data path, e.g.: + +```yaml + "data_impl": "mmap", + # Suggested data paths when using GPT-NeoX locally + "train_data_path": "data/enwik8/enwik8_text_document", + "train_label_data_path": "data/enwik8/enwik8_text_label_document", + "train_reward_data_path": "data/enwik8/enwik8_text_reward_document", + "test_data_path": "data/enwik8/enwik8_text_document", + "test_label_data_path": "data/enwik8/enwik8_text_label_document", + "test_reward_data_path": "data/enwik8/enwik8_text_reward_document", + "valid_data_path": "data/enwik8/enwik8_text_document", + "valid_label_data_path": "data/enwik8/enwik8_text_label_document", + "valid_reward_data_path": "data/enwik8/enwik8_text_reward_document", + "vocab_file": "data/gpt2-vocab.json", + "merge_file": "data/gpt2-merges.txt", + "save": "checkpoints", + "load": "checkpoints", + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, +``` + +For DPO style training, you'll need to set pos/neg data paths instead of a single one, e.g. + +```yaml + "dataset_impl": "pairwise", + "train_impl": "dpo", + "pack_impl": "unpacked", + "dpo_beta": 0.1, + "dpo_fp32": true, + "pos_train_data_path": "data/enwik8/enwik8_text_pos_document", + "pos_valid_data_path": "data/enwik8/enwik8_text_pos_document", + "pos_test_data_path": "data/enwik8/enwik8_text_pos_document", + "neg_train_data_path": "data/enwik8/enwik8_text_neg_document", + "neg_valid_data_path": "data/enwik8/enwik8_text_neg_document", + "neg_test_data_path": "data/enwik8/enwik8_text_neg_document", + ## If you have labels... 
(likely to mask out user turns) + "pos_train_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "pos_valid_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "pos_test_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "neg_train_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + "neg_valid_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + "neg_test_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + ## If you want to precompute the logits over your dataset... + "precompute_model_name": "gpt2", + ## Needed for the generation.py step, if precomputing + "text_gen_type": "precompute" +``` + ### LR Scheduler settings ```yaml diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml index 305567be1..a7470cae8 100644 --- a/configs/llama/13B.yml +++ b/configs/llama/13B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 40, "hidden_size": 5120, + "intermediate_size": 40960, "num_attention_heads": 40, "seq_length": 2048, "max_position_embeddings": 2048, @@ -16,11 +17,12 @@ "output_layer_parallelism": "column", "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-6, + "use_bias_in_mlp": False, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml index 450f8da38..234445c77 100644 --- a/configs/llama/30B.yml +++ b/configs/llama/30B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 60, "hidden_size": 6656, + "intermediate_size": 53248, "num_attention_heads": 52, "seq_length": 2048, "max_position_embeddings": 2048, @@ -16,11 +17,12 @@ "output_layer_parallelism": "column", "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-6, + "use_bias_in_mlp": False, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml index 85f199ce2..8ffffe241 100644 --- a/configs/llama/65B.yml +++ b/configs/llama/65B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 80, "hidden_size": 8192, + "intermediate_size": 65536, "num_attention_heads": 64, "seq_length": 2048, "max_position_embeddings": 2048, @@ -16,11 +17,12 @@ "output_layer_parallelism": "column", "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-6, + "use_bias_in_mlp": False, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml index ecbf187a8..0d7c40b24 100644 --- a/configs/llama/7B.yml +++ b/configs/llama/7B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 32, "hidden_size": 4096, + "intermediate_size": 32768, "num_attention_heads": 32, "seq_length": 2048, "max_position_embeddings": 2048, @@ -16,11 +17,12 @@ "output_layer_parallelism": "column", "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-6, + "use_bias_in_mlp": False, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git 
a/configs/llama/train_config.yml b/configs/llama/train_config.yml index 64d8ff422..459332609 100644 --- a/configs/llama/train_config.yml +++ b/configs/llama/train_config.yml @@ -70,4 +70,5 @@ "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, + } diff --git a/configs/llama2/13B.yml b/configs/llama2/13B.yml index 973b8bea4..7df5ad3ea 100644 --- a/configs/llama2/13B.yml +++ b/configs/llama2/13B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 40, "hidden_size": 5120, + "intermediate_size": 41472, "num_attention_heads": 40, "seq_length": 4096, "max_position_embeddings": 4096, @@ -21,6 +22,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama2/70B.yml b/configs/llama2/70B.yml index 615ae5d68..d175e146e 100644 --- a/configs/llama2/70B.yml +++ b/configs/llama2/70B.yml @@ -6,7 +6,7 @@ # model settings "num_layers": 80, "hidden_size": 8192, - "intermediate_size": 28672, + "intermediate_size": 86016, "num_attention_heads": 64, "num_kv_heads": 8, "seq_length": 4096, @@ -26,6 +26,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama2/7B.yml b/configs/llama2/7B.yml index 6a5c97e64..cdb63f02e 100644 --- a/configs/llama2/7B.yml +++ b/configs/llama2/7B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 32, "hidden_size": 4096, + "intermediate_size": 32768, "num_attention_heads": 32, "seq_length": 4096, "max_position_embeddings": 4096, @@ -21,6 +22,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama2/codellama_34B.yml b/configs/llama2/codellama_34B.yml index 88e9afaf6..e4cb2fc78 100644 --- a/configs/llama2/codellama_34B.yml +++ b/configs/llama2/codellama_34B.yml @@ -27,6 +27,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama2/codellama_7B.yml b/configs/llama2/codellama_7B.yml index be123ebee..e8775f3eb 100644 --- a/configs/llama2/codellama_7B.yml +++ b/configs/llama2/codellama_7B.yml @@ -26,6 +26,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llemma/34B.yml b/configs/llemma/34B.yml index bd72d7e23..1a693c7f4 100644 --- a/configs/llemma/34B.yml +++ b/configs/llemma/34B.yml @@ -30,8 +30,8 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, "optimizer": { "type": "Adam", diff --git a/configs/llemma/7B.yml b/configs/llemma/7B.yml index fb72c8c18..363cf4315 100644 --- a/configs/llemma/7B.yml +++ b/configs/llemma/7B.yml @@ -28,8 +28,8 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, "optimizer": { "type": "Adam", diff --git a/configs/local_setup.yml 
b/configs/local_setup.yml index d031a2ad8..b8ec4b06a 100644 --- a/configs/local_setup.yml +++ b/configs/local_setup.yml @@ -24,7 +24,4 @@ "tensorboard_dir": "tensorboard", "log_dir": "logs", - "use_wandb": True, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" } diff --git a/configs/local_setup_comet.yml b/configs/local_setup_comet.yml new file mode 100644 index 000000000..12ff7b388 --- /dev/null +++ b/configs/local_setup_comet.yml @@ -0,0 +1,33 @@ +# Suggested data paths when using GPT-NeoX locally +{ + "data_path": "/workspace/gpt-neox-main/data/enwik8/enwik8_text_document", + + # or for weighted datasets: + # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "train-data-weights": [1., 2.], + # "test-data-weights": [2., 1.], + # "valid-data-weights": [0.5, 0.4], + + # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. + # WARNING: setting this to True will override any user provided weights + # "weight_by_num_documents": false, + # "weighted_sampler_alpha": 0.3, + + "vocab_file": "/workspace/gpt-neox-main/data/gpt2-vocab.json", + "merge_file": "/workspace/gpt-neox-main/data/gpt2-merges.txt", + + "save": "checkpoints", + "load": "checkpoints", + "checkpoint_validation_with_forward_pass": False, + + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "use_comet": True, + # "comet_workspace": "test_workspace", # CHANGE ME + "comet_project": "test_project", + "comet_experiment_name": "test_experiment", + "comet_tags": ["test_tag1", "test_tag2"], + "comet_others": {"test_others"}, +} diff --git a/configs/local_setup_wandb.yml b/configs/local_setup_wandb.yml new file mode 100644 index 000000000..d031a2ad8 --- /dev/null +++ b/configs/local_setup_wandb.yml @@ -0,0 +1,30 @@ +# Suggested data paths when using GPT-NeoX locally +{ + "data_path": "data/enwik8/enwik8_text_document", + + # or for weighted datasets: + # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "train-data-weights": [1., 2.], + # "test-data-weights": [2., 1.], + # "valid-data-weights": [0.5, 0.4], + + # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
+ # WARNING: setting this to True will override any user provided weights + # "weight_by_num_documents": false, + # "weighted_sampler_alpha": 0.3, + + "vocab_file": "data/gpt2-vocab.json", + "merge_file": "data/gpt2-merges.txt", + + "save": "checkpoints", + "load": "checkpoints", + "checkpoint_validation_with_forward_pass": False, + + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "use_wandb": True, + "wandb_host": "https://api.wandb.ai", + "wandb_project": "neox" +} diff --git a/configs/mamba/mamba-1.4B.yml b/configs/mamba/mamba-1.4B.yml index 2898a72fd..eae467d0e 100644 --- a/configs/mamba/mamba-1.4B.yml +++ b/configs/mamba/mamba-1.4B.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00002, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 1, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-130M.yml b/configs/mamba/mamba-130M.yml index d9a6ab92e..bd05723b2 100644 --- a/configs/mamba/mamba-130M.yml +++ b/configs/mamba/mamba-130M.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-2.8B.yml b/configs/mamba/mamba-2.8B.yml index 1aacb264b..d5afef368 100644 --- a/configs/mamba/mamba-2.8B.yml +++ b/configs/mamba/mamba-2.8B.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00016, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000016, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-370M.yml b/configs/mamba/mamba-370M.yml index 5e5a78cca..0058f1c0e 100644 --- a/configs/mamba/mamba-370M.yml +++ b/configs/mamba/mamba-370M.yml @@ -12,12 +12,77 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, - "attention_config": [[["mamba"], 64]], + "attention_config": [[["mamba"], 48]], "mamba_selective_scan_fusion": true, "mamba_causal_conv_fusion": true, "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0003, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00003, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-790M.yml b/configs/mamba/mamba-790M.yml index fcd324d9d..4aef7e813 100644 --- a/configs/mamba/mamba-790M.yml +++ b/configs/mamba/mamba-790M.yml @@ -12,12 +12,78 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, - "attention_config": [[["mamba"], 64]], + "attention_config": [[["mamba"], 48]], "mamba_selective_scan_fusion": true, "mamba_causal_conv_fusion": true, "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00025, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000025, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mistral/7B.yml b/configs/mistral/7B.yml index 587fe5d36..ba4e543d6 100644 --- a/configs/mistral/7B.yml +++ b/configs/mistral/7B.yml @@ -33,8 +33,7 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", "tokenizer_type": "SPMTokenizer", #"vocab-file": ".../mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Mistral-7B-v0.1 direct download diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index a39b8a058..45e1ab196 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -14,14 +14,19 @@ LR Scheduler Arguments Learning rate decay function. Choose from 'constant', 'linear', 'cosine', 'exponential'. - - **lr_decay_iters**: int Default = None - Number of iterations to decay learning rate over, If None defaults to --train-iters + Number of iterations to decay learning rate over. If None, defaults to + --train-iters or the equivalent inferred value from train_epochs. 
+ +- **lr_decay_fraction**: float + Default = None + Effective fraction of training over which to decay lr. Overrides lr_decay_iters. + Useful when specifying train_epochs. - **min_lr**: float @@ -133,6 +138,54 @@ Logging Arguments +- **use_comet**: bool + + Default = None + + Flag indicating if comet is to be used. + + + +- **comet_workspace**: Optional + + Default = None + + Comet workspace name, if not configured Comet Experiments will be created in the user configured default workspace. + + + +- **comet_project**: Optional + + Default = None + + Comet project name, if not configured Comet Experiments will be created in the Uncategorized Experiments project. + + + +- **comet_experiment_name**: Optional + + Default = None + + Custom name for the Comet experiment. If not provided, a random name is used. + + + +- **comet_tags**: Optional + + Default = None + + List of tags to attach to the created Comet Experiment. + + + +- **comet_others**: Optional + + Default = None + + Custom metadata to attach to the created Comet Experiment. + + + - **log_interval**: int Default = 100 @@ -337,9 +390,23 @@ Model Arguments Default = None - Transformer intermediate size. Currently only used for "mlp_type": "llama". + Transformer intermediate size. Default = 4h - If not passed, will be set to a reasonable default. + + +- **mlp_multiple_of**: int + + Default = 1 + + force mlp size to be a multiple of this value + + + +- **expansion_factor**: float + + Default = None + + Transformer intermediate size. Default = 4 @@ -391,11 +458,11 @@ Model Arguments -- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] +- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm', 'te_rmsnorm', 'te_layernorm'] Default = layernorm - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm". @@ -407,6 +474,14 @@ Model Arguments +- **rmsnorm_fusion**: bool + + Default = False + + Use fused RMS norm kernel (if `norm` is `rmsnorm`). + + + - **use_qk_layernorm**: bool Default = False @@ -553,11 +628,19 @@ Model Arguments -- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] +- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu', 'reglu', 'swiglu', 'bilinear', 'glu'] Default = gelu - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "reglu", "swiglu", "bilinear", "glu"] + + + +- **use_flashattn_swiglu**: bool + + Default = False + + Use flash attention's version of swiglu @@ -737,13 +820,11 @@ Model Arguments -- **mlp_type**: str +- **use_bias_in_mlp**: bool - Default = regular + Default = True - Types: - regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) + If false, mlps will not have bias terms @@ -818,6 +899,29 @@ Model Arguments +- **dim_att**: int + + Default = None + + Total dimension of the attention mechanism for RWKV. If not set, defaults to hidden_size. + + + +- **head_size**: int + + Default = None + + Size of each attention head for RWKV. Calculated as dim_att // num_attention_heads. + + + +- **ffn_dim**: int + + Default = None + + Dimension of the feed-forward network for RWKV. If not set, calculated based on hidden_size and expansion_factor. 
+ + ## NeoXArgsOptimizer Optimizer Arguments @@ -1112,6 +1216,16 @@ Parallelism Arguments +- **sequence_parallel**: bool + + Default = False + + flag to determine whether Megatron-style Sequence Parallelism (https://arxiv.org/abs/2205.05198) + (Layernorm inputs and activations are sharded across model parallel group) will be used. Has no effect when model_parallel_size is 1. + **Set by user, in contrast to neox_args.is_pipe_parallel.** + + + ## NeoXArgsTemplate NeoXArgsTemplate() @@ -1129,7 +1243,15 @@ Text Generation arguments Default = None How to generate text/sample the model. - Options: `unconditional`, `input-file`, `interactive` + Options: `unconditional`, `input-file`, `interactive`, `precompute` + + + +- **precompute_model_name**: str + + Default = None + + Model name to use for saving precomputed logprobs @@ -1287,11 +1409,19 @@ Training Arguments -- **label_data_paths**: list +- **train_label_data_paths**: list Default = None - List of paths to label datasets (not shifted by 1 yet!). + List of paths to train label datasets (not shifted by 1 yet!). + + + +- **train_reward_data_paths**: list + + Default = None + + List of paths to train reward datasets @@ -1303,6 +1433,22 @@ Training Arguments +- **test_label_data_paths**: list + + Default = None + + List of paths to test label datasets (not shifted by 1 yet!). + + + +- **test_reward_data_paths**: list + + Default = None + + List of paths to test reward datasets + + + - **valid_data_paths**: list Default = None @@ -1311,6 +1457,118 @@ Training Arguments +- **valid_label_data_paths**: list + + Default = None + + List of paths to validation label datasets (not shifted by 1 yet!). + + + +- **valid_reward_data_paths**: list + + Default = None + + List of paths to validation reward datasets + + + +- **pos_train_data_paths**: list + + Default = None + + + + + +- **neg_train_data_paths**: list + + Default = None + + List of paths to positive and negative training datasets. + + + +- **pos_train_label_data_paths**: list + + Default = None + + + + + +- **neg_train_label_data_paths**: list + + Default = None + + List of paths to positive and negative training label datasets (not shifted by 1 yet!). + + + +- **pos_valid_data_paths**: list + + Default = None + + + + + +- **neg_valid_data_paths**: list + + Default = None + + List of paths to positive and negative validation datasets. + + + +- **pos_valid_label_data_paths**: list + + Default = None + + + + + +- **neg_valid_label_data_paths**: list + + Default = None + + List of paths to positive and negative validation label datasets (not shifted by 1 yet!). + + + +- **pos_test_data_paths**: list + + Default = None + + + + + +- **neg_test_data_paths**: list + + Default = None + + List of paths to positive and negative test datasets. + + + +- **pos_test_label_data_paths**: list + + Default = None + + + + + +- **neg_test_label_data_paths**: list + + Default = None + + List of paths to positive and negative test label datasets (not shifted by 1 yet!). + + + - **train_data_weights**: list Default = None @@ -1378,6 +1636,99 @@ Training Arguments +- **pack_impl**: typing.Literal['packed', 'pack_until_overflow', 'unpacked'] + + Default = packed + + Packing implementation, can be one of "packed", "pack_until_overflow", or "unpacked". 
+ + warning: pack_until_overflow is very naive and will likely have issues with pretraining scale datasets + + + +- **dataset_impl**: typing.Literal['gpt2', 'pairwise'] + + Default = gpt2 + + Dataset implementation, can be one of "gpt2" or "pairwise" + + + +- **train_impl**: typing.Literal['normal', 'dpo', 'rm', 'kto'] + + Default = normal + + Training implementation, can be one of "normal", "dpo", "kto", or "rm" + + + +- **dpo_fp32**: bool + + Default = True + + Whether to cast logits to fp32 for DPO loss calculation. + + + +- **dpo_reference_free**: bool + + Default = False + + Whether to use reference-free DPO. + + + +- **dpo_beta**: float + + Default = 0.1 + + Beta value for DPO + + + +- **kto_fp32**: bool + + Default = True + + Whether to cast logits to fp32 for KTO loss calculation. + + + +- **kto_desirable_weight**: float + + Default = 1.0 + + Weight for desirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + + + +- **kto_undesirable_weight**: float + + Default = 1.0 + + Weight for undesirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + + + +- **kto_beta**: float + + Default = 0.1 + + Beta value for KTO + + + +- **allow_chopped**: bool + + Default = True + + WARNING: if your packing impl is packed, this is ignored. + + Allow chopped samples in the dataset. + (e.g if your sequence length is 1024 and you have a sample of length 1026, it will be chopped to 1024) + + + - **mmap_warmup**: bool Default = False @@ -1524,6 +1875,15 @@ Training Arguments +- **train_epochs**: int + + Default = None + + Number of epochs to run for training. Do not specify both train_epochs and train_iters. + Not currently compatible with data reweighing, pairwise datasets, and packing other than 'packed' + + + - **eval_iters**: int Default = 100 diff --git a/configs/prof.yml b/configs/prof.yml new file mode 100644 index 000000000..c2f2ee118 --- /dev/null +++ b/configs/prof.yml @@ -0,0 +1,17 @@ +# Sample profiling config +{ + # Turns on nsys and pytorch profiling + "profile": true, + + # pytorch profiler options + "profile_step_start": 10, + "profile_step_stop": 12, + + # pytorch memory profiler options + "memory_profiling": true, + "memory_profiling_path": tensorboard, + + + # All trace files (pytorch, nsys, tensorboard, etc) will be written here + "tensorboard_dir": "tensorboard", +} diff --git a/configs/slurm_local.json b/configs/slurm_local.json index 36e16089b..4b9ce5c56 100644 --- a/configs/slurm_local.json +++ b/configs/slurm_local.json @@ -4,8 +4,5 @@ "save": "checkpoints", "checkpoint_validation_with_forward_pass": false, "tensorboard-dir": "tensorboard", - "log-dir": "logs", - "use_wandb": true, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" + "log-dir": "logs" } diff --git a/configs/slurm_local.yml b/configs/slurm_local.yml index 1a2b73aba..3aa3f3742 100644 --- a/configs/slurm_local.yml +++ b/configs/slurm_local.yml @@ -6,7 +6,4 @@ "checkpoint_validation_with_forward_pass": false, "tensorboard_dir": "tensorboard", "log_dir": "logs", - "use_wandb": true, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" } diff --git a/eval.py b/eval.py index 93093f21d..53bd21e0c 100644 --- a/eval.py +++ b/eval.py @@ -54,6 +54,7 @@ def main(input_args=None, overwrite_values=None): v2, neox_args.iteration, use_wandb=neox_args.use_wandb, + comet_experiment=neox_args.comet_experiment, ) else: tb_wandb_log( @@ -61,6 +62,7 @@ def main(input_args=None, overwrite_values=None): v, neox_args.iteration, 
use_wandb=neox_args.use_wandb, + comet_experiment=neox_args.comet_experiment, ) pprint(results) diff --git a/generate.py b/generate.py index 743e350d0..e19ef2e0e 100755 --- a/generate.py +++ b/generate.py @@ -23,6 +23,7 @@ generate_samples_from_prompt, generate_samples_unconditional, generate_samples_interactive, + precompute_logits, ) @@ -83,6 +84,8 @@ def main(input_args=None, overwrite_values=None): top_p=neox_args.top_p, ) + elif neox_args.text_gen_type == "precompute": + precompute_logits(neox_args=neox_args, model=model) else: raise ValueError( f"`text_gen_type` either not specified or not recognised: {neox_args.text_gen_type}" diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 12b81e202..1b6909c9f 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -393,6 +393,7 @@ def load_checkpoint( load_lr_scheduler_states=load_optim_and_scheduler, load_module_only=not load_optim_and_scheduler, tag=tag, + load_module_strict=neox_args.train_impl != "rm", ) if checkpoint_name is None: diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py index bc5754cdb..c08b60151 100644 --- a/megatron/data/data_utils.py +++ b/megatron/data/data_utils.py @@ -16,13 +16,14 @@ import torch import numpy as np from typing import List, Tuple -from itertools import zip_longest +from itertools import zip_longest, cycle from functools import partial from megatron import mpu, print_rank_0 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset from megatron.data.blendable_dataset import BlendableDataset from megatron.data.gpt2_dataset import GPT2Dataset +from megatron.data.pairwise_dataset import PairwiseDataset from megatron.data.samplers import DistributedBatchSampler @@ -53,39 +54,122 @@ def make_data_loader(dataset, neox_args): def build_the_dataset( data_prefix, + pos_data_prefix, + neg_data_prefix, name, data_impl, + pack_impl, + dataset_impl, + allow_chopped, num_samples, + num_epochs, seq_length, seed, skip_warmup, build_index_mappings=True, label_prefix=None, + pos_label_prefix=None, + neg_label_prefix=None, + precompute_model_name=None, + reward_prefix=None, ): """Build train/valid/test datasets.""" - - indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) - if label_prefix is None: - label_dataset = None + if dataset_impl == "gpt2": + indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) + if label_prefix is None: + label_dataset = None + else: + label_dataset = make_indexed_dataset(label_prefix, data_impl, skip_warmup) + if precompute_model_name is not None: + # If we have the name, assume it exists. If it doesn't, it will just be None which is fine. 
+ precompute_indexed_dataset = make_indexed_dataset( + data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) + precompute_indexed_dataset = precompute_indexed_dataset + else: + precompute_indexed_dataset = None + if reward_prefix is not None: + reward_dataset = make_indexed_dataset(reward_prefix, data_impl, skip_warmup) + else: + reward_dataset = None + elif dataset_impl == "pairwise": + pos_indexed_dataset = make_indexed_dataset( + pos_data_prefix, data_impl, skip_warmup + ) + neg_indexed_dataset = make_indexed_dataset( + neg_data_prefix, data_impl, skip_warmup + ) + if pos_label_prefix is None: + pos_label_dataset = None + # Also do neg here since they both must be the same + assert neg_label_prefix is None + neg_label_dataset = None + else: + pos_label_dataset = make_indexed_dataset( + pos_label_prefix, data_impl, skip_warmup + ) + # Also do neg here since they both must be the same + assert neg_label_prefix is not None + neg_label_dataset = make_indexed_dataset( + neg_label_prefix, data_impl, skip_warmup + ) + if precompute_model_name is None: + pos_ref_dataset = None + neg_ref_dataset = None + else: + pos_ref_dataset = make_indexed_dataset( + pos_data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) + neg_ref_dataset = make_indexed_dataset( + neg_data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) else: - label_dataset = make_indexed_dataset(label_prefix, data_impl, skip_warmup) + raise NotImplementedError(f"dataset_impl={dataset_impl} not implemented") - total_num_of_documents = indexed_dataset.sizes.shape[0] + total_num_of_documents = ( + indexed_dataset.sizes.shape[0] + if dataset_impl == "gpt2" + else pos_indexed_dataset.sizes.shape[0] + ) print_rank_0(" {}:".format(name)) print_rank_0(" no. 
of documents:{}".format(total_num_of_documents)) dataset = None documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32) - dataset = GPT2Dataset( - name, - data_prefix, - documents, - indexed_dataset, - num_samples, - seq_length, - seed, - build_index_mappings=build_index_mappings, - label_dataset=label_dataset, - ) + if dataset_impl == "gpt2": + dataset = GPT2Dataset( + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + num_epochs, + seq_length, + seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, + build_index_mappings=build_index_mappings, + label_dataset=label_dataset, + reward_dataset=reward_dataset, + ref_dataset=precompute_indexed_dataset, + ) + elif dataset_impl == "pairwise": + dataset = PairwiseDataset( + name, + pos_data_prefix, + documents, + pos_indexed_dataset, + neg_indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, + build_index_mappings=build_index_mappings, + pos_label_dataset=pos_label_dataset, + neg_label_dataset=neg_label_dataset, + pos_ref_dataset=pos_ref_dataset, + neg_ref_dataset=neg_ref_dataset, + ) return dataset @@ -93,8 +177,11 @@ def build_train_valid_test_datasets( data_prefix, use_shared_fs, data_impl, + pack_impl, + allow_chopped, splits_string, train_valid_test_num_samples, + train_valid_test_epochs, seq_length, seed, skip_warmup, @@ -129,15 +216,17 @@ def build_dataset(index, name): documents = np.arange( start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32 ) - dataset = GPT2Dataset( name, data_prefix, documents, indexed_dataset, train_valid_test_num_samples[index], + train_valid_test_epochs[index], seq_length, seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, use_shared_fs=use_shared_fs, ) return dataset @@ -183,12 +272,15 @@ def get_normalized_weights_and_num_samples( weight_sum = sum(weights) assert weight_sum > 0.0 weights = [weight / weight_sum for weight in weights] - # Add 0.5% (the 1.005 factor) so in case the blending dataset does - # not uniformly distribute the number of samples, we still have - # samples left to feed to the network. - weighted_num_samples = [] - for weight in weights: - weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005))) + if num_samples is not None: + # Add 0.5% (the 1.005 factor) so in case the blending dataset does + # not uniformly distribute the number of samples, we still have + # samples left to feed to the network. 
+ weighted_num_samples = [] + for weight in weights: + weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005))) + else: + weighted_num_samples = [None for _ in weights] return weights, weighted_num_samples @@ -197,61 +289,154 @@ def build_weighted_datasets( train_num_samples, valid_num_samples, test_num_samples, - train_weights, - valid_weights, - test_weights, + train_epochs, + valid_epochs, + test_epochs, build_index_mappings=True, ): # build individual datasets train_datasets, valid_datasets, test_datasets = [], [], [] - for i, (train_path, label_path, valid_path, test_path) in enumerate( + for i, ( + train_path, + train_label_path, + train_reward_path, + valid_path, + valid_label_path, + valid_reward_path, + test_path, + test_label_path, + test_reward_path, + pos_train_path, + neg_train_path, + pos_train_label_path, + neg_train_label_path, + pos_valid_path, + neg_valid_path, + pos_valid_label_path, + neg_valid_label_path, + pos_test_path, + neg_test_path, + pos_test_label_path, + neg_test_label_path, + ) in enumerate( zip_longest( - neox_args.train_data_paths, - neox_args.label_data_paths if neox_args.label_data_paths else [], - neox_args.valid_data_paths, - neox_args.test_data_paths, + neox_args.train_data_paths if neox_args.train_data_paths else [], + neox_args.train_label_data_paths + if neox_args.train_label_data_paths + else [], + neox_args.train_reward_data_paths + if neox_args.train_reward_data_paths + else [], + neox_args.valid_data_paths if neox_args.valid_data_paths else [], + neox_args.valid_label_data_paths + if neox_args.valid_label_data_paths + else [], + neox_args.valid_reward_data_paths + if neox_args.valid_reward_data_paths + else [], + neox_args.test_data_paths if neox_args.test_data_paths else [], + neox_args.test_label_data_paths if neox_args.test_label_data_paths else [], + neox_args.test_reward_data_paths + if neox_args.test_reward_data_paths + else [], + neox_args.pos_train_data_paths if neox_args.pos_train_data_paths else [], + neox_args.neg_train_data_paths if neox_args.neg_train_data_paths else [], + neox_args.pos_train_label_data_paths + if neox_args.pos_train_label_data_paths + else [], + neox_args.neg_train_label_data_paths + if neox_args.neg_train_label_data_paths + else [], + neox_args.pos_valid_data_paths if neox_args.pos_valid_data_paths else [], + neox_args.neg_valid_data_paths if neox_args.neg_valid_data_paths else [], + neox_args.pos_valid_label_data_paths + if neox_args.pos_valid_label_data_paths + else [], + neox_args.neg_valid_label_data_paths + if neox_args.neg_valid_label_data_paths + else [], + neox_args.pos_test_data_paths if neox_args.pos_test_data_paths else [], + neox_args.neg_test_data_paths if neox_args.neg_test_data_paths else [], + neox_args.pos_test_label_data_paths + if neox_args.pos_test_label_data_paths + else [], + neox_args.neg_test_label_data_paths + if neox_args.neg_test_label_data_paths + else [], ) ): - if train_path: + if train_path or pos_train_path: train_datasets.append( build_the_dataset( data_prefix=train_path, name=f"train_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=train_num_samples[i], + num_epochs=train_epochs, seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, - label_prefix=label_path, + label_prefix=train_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_train_path, + neg_data_prefix=neg_train_path, + 
pos_label_prefix=pos_train_label_path, + neg_label_prefix=neg_train_label_path, + precompute_model_name=neox_args.precompute_model_name, + reward_prefix=train_reward_path, ) ) - if valid_path: + if valid_path or pos_valid_path: valid_datasets.append( build_the_dataset( data_prefix=valid_path, name=f"valid_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=valid_num_samples[i], + num_epochs=valid_epochs, seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, + label_prefix=valid_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_valid_path, + neg_data_prefix=neg_valid_path, + pos_label_prefix=pos_valid_label_path, + neg_label_prefix=neg_valid_label_path, + precompute_model_name=neox_args.precompute_model_name, + reward_prefix=valid_reward_path, ) ) - if test_path: + if test_path or pos_test_path: test_datasets.append( build_the_dataset( data_prefix=test_path, name=f"test_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=test_num_samples[i], + num_epochs=test_epochs, seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, + label_prefix=test_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_test_path, + neg_data_prefix=neg_test_path, + pos_label_prefix=pos_test_label_path, + neg_label_prefix=neg_test_label_path, + precompute_model_name=neox_args.precompute_model_name, + reward_prefix=test_reward_path, ) ) return train_datasets, valid_datasets, test_datasets @@ -294,9 +479,44 @@ def weights_by_num_docs(l: list, alpha=0.3): return weights -def build_train_valid_test_data_iterators(neox_args): +def validate_train_epochs(neox_args): + """Check for unsupported neox_args when using train_epochs instead of train_iters""" + if neox_args.train_epochs is None: + return + + if neox_args.train_epochs and neox_args.train_iters: + raise ValueError( + "Cannot specify both train epochs and train iters simultaneously" + ) + + if neox_args.pack_impl != "packed": + raise ValueError( + "Packing implementations other than 'packed' are currently unsupported with train_epochs" + ) + + if neox_args.weight_by_num_documents: + raise ValueError( + "Weighting by number of documents is currently unsupported with train_epochs" + ) + + if neox_args.train_data_weights and ( + not all(weight == 1.0 for weight in neox_args.train_data_weights) + ): + raise ValueError( + "train_data_weights != None is currently unsupported with train_epochs" + ) + + if neox_args.dataset_impl != "gpt2": + raise ValueError( + "non gpt2 datasets are not currently unsupported with train_epochs" + ) + + +def build_train_valid_test_data_loaders(neox_args): """XXX""" + validate_train_epochs(neox_args) + (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) print_rank_0("> building train, validation, and test datasets ...") @@ -314,16 +534,23 @@ def build_train_valid_test_data_iterators(neox_args): # Data loader only on rank 0 of each model parallel group. if mpu.get_model_parallel_rank() == 0 and pipe_load: # Number of train/valid/test samples. 
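+        # With train_iters set, the iteration budget is converted into per-split
+        # sample counts below; with train_epochs set, the counts are left as None
+        # and each split is run for a single epoch of its data.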
- train_iters = neox_args.train_iters - eval_iters = (train_iters // neox_args.eval_interval + 1) * neox_args.eval_iters - test_iters = neox_args.eval_iters - train_val_test_num_samples = [ - train_iters * neox_args.train_batch_size, - eval_iters * neox_args.train_batch_size, - test_iters * neox_args.train_batch_size, - ] - - if neox_args.train_data_paths: + if neox_args.train_iters is not None: + train_iters = neox_args.train_iters + eval_iters = ( + train_iters // neox_args.eval_interval + 1 + ) * neox_args.eval_iters + test_iters = neox_args.eval_iters + train_val_test_num_samples = [ + train_iters * neox_args.train_batch_size, + eval_iters * neox_args.train_batch_size, + test_iters * neox_args.train_batch_size, + ] + train_val_test_epochs = [None, None, None] + elif neox_args.train_epochs is not None: + train_val_test_num_samples = [None, None, None] + train_val_test_epochs = [1, 1, 1] + + if (neox_args.train_data_paths) or (neox_args.pos_train_data_paths): # when individual train / valid / test data paths are provided # normalize weight values and get num samples for each dataset train_weights, train_num_samples = get_normalized_weights_and_num_samples( @@ -342,14 +569,13 @@ def build_train_valid_test_data_iterators(neox_args): train_num_samples, valid_num_samples, test_num_samples, - train_weights, - valid_weights, - test_weights, + train_val_test_epochs[0], + train_val_test_epochs[1], + train_val_test_epochs[2], build_index_mappings=not neox_args.weight_by_num_documents, ) if neox_args.weight_by_num_documents: - # gets the number of documents in each datapath get_num_docs_list = lambda datasets: [ dataset.indexed_dataset.sizes.shape[0] for dataset in datasets @@ -391,9 +617,9 @@ def build_train_valid_test_data_iterators(neox_args): train_num_samples, valid_num_samples, test_num_samples, - train_weights, - valid_weights, - test_weights, + train_val_test_epochs[0], + train_val_test_epochs[1], + train_val_test_epochs[2], ) if train_datasets: @@ -411,9 +637,12 @@ def build_train_valid_test_data_iterators(neox_args): data_impl=neox_args.data_impl, splits_string=neox_args.split, train_valid_test_num_samples=train_val_test_num_samples, + train_valid_test_epochs=train_val_test_epochs, seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, ) # Build dataloders. @@ -422,9 +651,15 @@ def build_train_valid_test_data_iterators(neox_args): test_dataloader = make_data_loader(test_ds, neox_args=neox_args) # Flags to know if we need to do training/validation/testing. - do_train = train_dataloader is not None and neox_args.train_iters > 0 - do_valid = valid_dataloader is not None and neox_args.eval_iters > 0 - do_test = test_dataloader is not None and neox_args.eval_iters > 0 + if neox_args.train_epochs: + do_train = train_dataloader is not None + do_valid = valid_dataloader is not None + do_test = test_dataloader is not None + else: + do_train = train_dataloader is not None and neox_args.train_iters > 0 + do_valid = valid_dataloader is not None and neox_args.eval_iters > 0 + do_test = test_dataloader is not None and neox_args.eval_iters > 0 + # Need to broadcast num_tokens and num_type_tokens. 
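+        # (The tensor below actually carries the do_train/do_valid/do_test flags;
+        # they are computed on rank 0 of each model parallel group and then shared
+        # with the other ranks so all agree on which splits to run.)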
flags = torch.cuda.LongTensor([int(do_train), int(do_valid), int(do_test)]) else: @@ -444,6 +679,19 @@ def build_train_valid_test_data_iterators(neox_args): neox_args.do_train = flags[0].item() neox_args.do_valid = flags[1].item() neox_args.do_test = flags[2].item() + data_loaders = { + "train": train_dataloader, + "valid": valid_dataloader, + "test": test_dataloader, + } + return data_loaders + + +def shift_and_wrap_data_loaders(neox_args, data_loaders, loop=True): + """Shift start iteration and wrap data_loaders in iterators""" + train_dataloader = data_loaders["train"] + valid_dataloader = data_loaders["valid"] + test_dataloader = data_loaders["test"] # Shift the start iterations. if train_dataloader is not None: @@ -469,19 +717,34 @@ def build_train_valid_test_data_iterators(neox_args): ) ) + def loop_iterator(data_loader): + while True: + for x in data_loader: + yield x + data_loader.start_iter = 0 + # Build iterators. if train_dataloader is not None: - train_data_iterator = iter(train_dataloader) + if loop: + train_data_iterator = cycle(train_dataloader) + else: + train_data_iterator = iter(train_dataloader) else: train_data_iterator = None if valid_dataloader is not None: - valid_data_iterator = iter(valid_dataloader) + if loop: + valid_data_iterator = cycle(valid_dataloader) + else: + valid_data_iterator = iter(valid_dataloader) else: valid_data_iterator = None if test_dataloader is not None: - test_data_iterator = iter(test_dataloader) + if loop: + test_data_iterator = cycle(test_dataloader) + else: + test_data_iterator = iter(test_dataloader) else: test_data_iterator = None diff --git a/megatron/data/gpt2_dataset.py b/megatron/data/gpt2_dataset.py index 75e601fda..73c21bebd 100644 --- a/megatron/data/gpt2_dataset.py +++ b/megatron/data/gpt2_dataset.py @@ -34,18 +34,31 @@ def __init__( documents, indexed_dataset, num_samples, + num_epochs, seq_length, seed, + pack_impl="packed", + allow_chopped=True, build_index_mappings=True, use_shared_fs=True, label_dataset=None, + reward_dataset=None, + ref_dataset=None, ): self.name = name + self.pack_impl = pack_impl + self.allow_chopped = allow_chopped self.indexed_dataset = indexed_dataset self.label_dataset = label_dataset + self.reward_dataset = reward_dataset + self.ref_dataset = ref_dataset + self.seq_length = seq_length # Checks + assert self.reward_dataset is None or ( + pack_impl == "unpacked" + ), "Reward dataset only supported with unpacked data." assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] @@ -56,10 +69,14 @@ def __init__( data_prefix, documents, self.indexed_dataset.sizes, + self.label_dataset, num_samples, + num_epochs, seq_length, seed, + self.pack_impl, use_shared_fs=use_shared_fs, + allow_chopped=self.allow_chopped, ) self.shuffle_idx_len = self.shuffle_idx.shape[0] - 1 self.sample_idx_len = self.sample_idx.shape[0] - 1 @@ -82,47 +99,101 @@ def __getitem__(self, idx): offset_f = self.sample_idx[idx][1] offset_l = self.sample_idx[idx + 1][1] # Labels and texts are supposed to be fully in sync. 
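+            # The list below is always ordered text, then (optionally) label,
+            # reward, and ref; rw_indx records the position of the reward dataset
+            # (-1 when absent) so its per-document reward value can be handled specially.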
- datasets = ( - [self.indexed_dataset] - if self.label_dataset is None - else [self.indexed_dataset, self.label_dataset] - ) + datasets = [self.indexed_dataset] + rw_indx = 1 + if self.label_dataset is not None: + rw_indx += 1 + datasets.append(self.label_dataset) + if self.reward_dataset is not None: + datasets.append(self.reward_dataset) + else: + rw_indx = -1 + if self.ref_dataset is not None: + datasets.append(self.ref_dataset) samples = [] + sample_lengths = [] # If we are within the same document, just extract the chunk. for n, dataset in enumerate(datasets): if doc_index_f == doc_index_l: - samples.append( - dataset.get( - self.doc_idx[doc_index_f], - offset=offset_f, - length=offset_l - offset_f + 1, + if rw_indx == n: + # If we are in the reward dataset, we only need the last token. + rw = dataset.get(self.doc_idx[doc_index_f]) + samples.append( + np.array([rw[0] for _ in range(len(samples[-1]))]) + ) + else: + samples.append( + dataset.get( + self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1, + ) ) - ) else: + if n != rw_indx: + # reset + sample_lengths = [] # Otherwise, get the rest of the initial document. - sample_list = [ - dataset.get(self.doc_idx[doc_index_f], offset=offset_f) - ] + if n == rw_indx: + rw = dataset.get(self.doc_idx[doc_index_f]) + sample_list = [ + np.array([rw[0] for _ in range(sample_lengths.pop(0))]) + ] + else: + sample_list = [ + dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + ] + sample_lengths.append(len(sample_list[-1])) # Loop over all in between documents and add the entire document. for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(dataset.get(self.doc_idx[i])) + if n == rw_indx: + rw = dataset.get(self.doc_idx[i]) + sample_list.append( + np.array([rw[0] for _ in range(sample_lengths.pop(0))]) + ) + else: + sample_list.append(dataset.get(self.doc_idx[i])) + sample_lengths.append(len(sample_list[-1])) # And finally add the relevant portion of last document. 
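+                    # For the reward dataset the first value of each document is
+                    # tiled to the chunk length recorded in sample_lengths, keeping
+                    # rewards aligned token-for-token with the text samples.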
- sample_list.append( - dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) - ) + if n == rw_indx: + rw = dataset.get(self.doc_idx[doc_index_l]) + sample_list.append( + np.array([rw[0] for _ in range(sample_lengths.pop(0))]) + ) + else: + sample_list.append( + dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) + ) + sample_lengths.append(len(sample_list[-1])) samples.append(np.concatenate(sample_list)) - - if len(datasets) == 1: - return {"text": np.array(samples[0], dtype=np.int64)} - else: - return { - "text": np.array(samples[0], dtype=np.int64), - "label": np.array(samples[1], dtype=np.int64), - } - except IndexError: + for i in range(len(samples)): + mask = (self.label_dataset is not None) and (i == 1) + if len(samples[i]) < (self.seq_length + 1): + # Pad + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=-100 if mask else 0, + ) + elif len(samples[i]) > (self.seq_length + 1): + # Truncate + samples[i] = samples[i][: (self.seq_length + 1)] + ret = {"text": np.array(samples[0], dtype=np.int64)} + next_idx = 1 + if self.label_dataset is not None: + ret["label"] = np.array(samples[next_idx], dtype=np.int64) + next_idx += 1 + if self.reward_dataset is not None: + ret["reward"] = np.array(samples[next_idx], dtype=np.float32) + next_idx += 1 + if self.ref_dataset is not None: + ret["ref"] = np.array(samples[next_idx], dtype=np.float32) + return ret + except IndexError as err: new_idx = idx % len(self) print( - f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" + f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx}), error: {err}" ) return self[new_idx] @@ -132,10 +203,14 @@ def _build_index_mappings( data_prefix, documents, sizes, + label_dataset, num_samples, + num_epochs, seq_length, seed, + packing_impl, use_shared_fs=True, + allow_chopped=True, ): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. @@ -145,7 +220,8 @@ def _build_index_mappings( """ # Number of tokens in each epoch and number of required epochs. tokens_per_epoch = _num_tokens(documents, sizes) - num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + if not num_epochs: + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) # rng state np_rng = np.random.RandomState(seed=seed) @@ -155,6 +231,9 @@ def _build_index_mappings( _filename += "_{}ns".format(num_samples) _filename += "_{}sl".format(seq_length) _filename += "_{}s".format(seed) + _filename += "_{}pi".format(packing_impl) + if allow_chopped: + _filename += "_ac" doc_idx_filename = _filename + "_doc_idx.npy" sample_idx_filename = _filename + "_sample_idx.npy" shuffle_idx_filename = _filename + "_shuffle_idx.npy" @@ -177,44 +256,116 @@ def _build_index_mappings( ) # doc-idx. start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng) - np.save(doc_idx_filename, doc_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save doc-idx mapping " - "(seconds): {:4f}".format(time.time() - start_time) - ) - # sample-idx. - start_time = time.time() - # Use C++ implementation for speed. 
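+            # sample_idx maps each sample to a (document index, token offset) start
+            # position; the int32 helper is used while the index still fits, falling
+            # back to int64 for very large runs.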
- from megatron.data import helpers - - assert doc_idx.dtype == np.int32 - assert sizes.dtype == np.int32 - - num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length - if 2 * (num_samples + 1) < np.iinfo(np.int32).max: - sample_idx = helpers.build_sample_idx_int32( - sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + if packing_impl == "packed": + doc_idx = _build_doc_idx(documents, num_epochs, np_rng) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save doc-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) ) - else: - sample_idx = helpers.build_sample_idx_int64( - sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + from megatron.data import helpers + + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + + num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length + if 2 * (num_samples + 1) < np.iinfo(np.int32).max: + sample_idx = helpers.build_sample_idx_int32( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + else: + sample_idx = helpers.build_sample_idx_int64( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save sample-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) ) - np.save(sample_idx_filename, sample_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save sample-idx mapping " - "(seconds): {:4f}".format(time.time() - start_time) - ) - # shuffle-idx. - start_time = time.time() - # -1 is due to data structure used to retrieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng) - np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save shuffle-idx mapping" - " (seconds): {:4f}".format(time.time() - start_time) - ) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retrieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save shuffle-idx mapping" + " (seconds): {:4f}".format(time.time() - start_time) + ) + elif packing_impl == "pack_until_overflow": + # Naively pack data until it overflows, then roll it over to a new one instead. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = [] + doc_idx = [] + # Iterate over files until we have enough samples. + temp_shuffle_idx = np.arange(len(documents)) + np_rng.shuffle(temp_shuffle_idx) + running_length = 0 + curr_shuffle_idx = 0 + while len(sample_idx) < num_samples: + if not allow_chopped: + # +1 since we shift left/right by 1 + if sizes[temp_shuffle_idx[curr_shuffle_idx]] > seq_length + 1: + curr_shuffle_idx += 1 + continue + # First, check if we need to skip this item... 
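+                # Documents whose labels are all -100 over the first seq_length + 1
+                # tokens would be completely masked out, so skip them rather than
+                # packing them.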
+ if label_dataset is not None: + if np.all( + label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + doc_length = sizes[temp_shuffle_idx[curr_shuffle_idx]] + if running_length == 0: + sample_idx.append(np.array([len(doc_idx), 0])) + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + running_length += doc_length + else: + if running_length + doc_length > (seq_length + 1): + running_length = doc_length + sample_idx.append(np.array([len(doc_idx), 0])) + else: + running_length += doc_length + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + curr_shuffle_idx += 1 + if curr_shuffle_idx == len(documents): + curr_shuffle_idx = 0 + np_rng.shuffle(temp_shuffle_idx) + sample_idx.append(np.array([len(doc_idx), 0])) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + elif packing_impl == "unpacked": + # Unpacked data, one sample per document. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int64) + sample_idx[:, 0] = np.array([i for i in range(num_samples + 1)]) + sample_idx[:, 1] = 0 + doc_idx = list() + doc_i = 0 + while len(doc_idx) <= num_samples: + if not allow_chopped: + # +1 since we shift left/right by 1 + if sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + # Just in case we have bad data in the loop... + if np.all(label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + doc_idx.append(doc_i) + doc_i = (doc_i + 1) % len(documents) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 9b062b050..aca290854 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/data/pairwise_dataset.py b/megatron/data/pairwise_dataset.py new file mode 100644 index 000000000..e39b4d626 --- /dev/null +++ b/megatron/data/pairwise_dataset.py @@ -0,0 +1,457 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pairwise style dataset.""" + +import os +import time + +import numpy as np +import torch + +from megatron import mpu, print_rank_0 + + +class PairwiseDataset(torch.utils.data.Dataset): + def __init__( + self, + name, + pos_data_prefix, # Don't need neg since it's assumed you have paired the data already. + documents, + pos_indexed_dataset, + neg_indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl="unpacked", + build_index_mappings=True, + use_shared_fs=True, + pos_label_dataset=None, + pos_ref_dataset=None, + neg_label_dataset=None, + neg_ref_dataset=None, + allow_chopped=True, + ): + + self.name = name + self.pos_indexed_dataset = pos_indexed_dataset + self.pos_label_dataset = pos_label_dataset + self.pos_ref_dataset = pos_ref_dataset + self.neg_indexed_dataset = neg_indexed_dataset + self.neg_label_dataset = neg_label_dataset + self.neg_ref_dataset = neg_ref_dataset + self.pack_impl = pack_impl + self.seq_length = seq_length + # Checks + assert np.min(documents) >= 0 + assert (neg_label_dataset is not None and pos_label_dataset is not None) or ( + neg_label_dataset is None and pos_label_dataset is None + ), "Label datasets must be both None or both not None" + assert np.max(documents) < pos_indexed_dataset.sizes.shape[0] + assert pos_indexed_dataset.sizes.shape[0] == neg_indexed_dataset.sizes.shape[0] + assert ( + pack_impl != "packed" + ), "Packed implementation not supported for pairwise dataset" + + if build_index_mappings: + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, + pos_data_prefix, + documents, + self.pos_indexed_dataset.sizes, + self.neg_indexed_dataset.sizes, + self.pos_label_dataset, + self.neg_label_dataset, + num_samples, + seq_length, + seed, + pack_impl, + use_shared_fs=use_shared_fs, + allow_chopped=allow_chopped, + ) + self.shuffle_idx_len = self.shuffle_idx.shape[0] - 1 + self.sample_idx_len = self.sample_idx.shape[0] - 1 + + if self.shuffle_idx_len != self.sample_idx_len - 1: + print( + f"WARNING: shuffle index length ({self.shuffle_idx_len}) is not equal to sample index length ({self.sample_idx_len})" + ) + + def __len__(self): + return min(self.shuffle_idx_len, self.sample_idx_len) + + def __getitem__(self, idx): + try: + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # Labels and texts are supposed to be fully in sync. + datasets = [self.pos_indexed_dataset, self.neg_indexed_dataset] + + if self.pos_label_dataset is not None: + datasets += [ + self.pos_label_dataset, + self.neg_label_dataset, + ] + if self.pos_ref_dataset is not None: + datasets += [ + self.pos_ref_dataset, + self.neg_ref_dataset, + ] + samples = [] + pos_ref_samples = [] + neg_ref_samples = [] + # If we are within the same document, just extract the chunk. 
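+            # Every dataset in the list (pos/neg text, optional labels, optional
+            # refs) is sliced with the same document/offset bounds so the chosen
+            # and rejected sequences stay aligned sample-for-sample.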
+ for n, dataset in enumerate(datasets): + if doc_index_f == doc_index_l: + samples.append( + dataset.get( + self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1, + ) + ) + else: + # Otherwise, get the rest of the initial document. + sample_list = [ + dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + ] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. + sample_list.append( + dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) + ) + samples.append(np.concatenate(sample_list)) + for i in range(len(samples)): + if len(samples[i]) < (self.seq_length + 1): + if ((i == 2) or (i == 3)) and self.pos_label_dataset is not None: + # Labels... So pad with -100 + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=-100, + ) + else: + # Pad with 0s, can use any number since it's masked. + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=0, + ) + elif len(samples[i]) > (self.seq_length + 1): + # Check for overflow and truncate. + samples[i] = samples[i][: (self.seq_length + 1)] + ret = {} + ret["pos"] = np.array(samples[0], dtype=np.int64) + ret["neg"] = np.array(samples[1], dtype=np.int64) + if self.pos_label_dataset is not None: + ret["pos_label"] = np.array(samples[2], dtype=np.int64) + ret["neg_label"] = np.array(samples[3], dtype=np.int64) + if self.pos_ref_dataset is not None: + ret["pos_ref"] = np.array(samples[4], dtype=np.float32) + ret["neg_ref"] = np.array(samples[5], dtype=np.float32) + elif self.pos_ref_dataset is not None: + # Don't have labels... + ret["pos_ref"] = np.array(samples[2], dtype=np.float32) + ret["neg_ref"] = np.array(samples[3], dtype=np.float32) + return ret + except IndexError: + new_idx = idx % len(self) + print( + f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" + ) + return self[new_idx] + + +def _build_index_mappings( + name, + pos_data_prefix, + documents, + pos_sizes, + neg_sizes, + pos_label_dataset, + neg_label_dataset, + num_samples, + seq_length, + seed, + packing_impl, + use_shared_fs=True, + allow_chopped=True, +): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, pos_sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. 
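+    # The doc/sample/shuffle indices are cached as .npy files alongside
+    # pos_data_prefix, keyed on the dataset name, number of samples, sequence
+    # length, seed, and packing implementation, and are built on a single rank.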
+ _filename = pos_data_prefix + _filename += "_{}_indexmap".format(name) + _filename += "_{}ns".format(num_samples) + _filename += "_{}sl".format(seq_length) + _filename += "_{}s".format(seed) + _filename += "_{}pi".format(packing_impl) + doc_idx_filename = _filename + "_doc_idx.npy" + sample_idx_filename = _filename + "_sample_idx.npy" + shuffle_idx_filename = _filename + "_shuffle_idx.npy" + + if not use_shared_fs: + should_process_dataset = int(os.environ["LOCAL_RANK"]) == 0 + else: + should_process_dataset = torch.distributed.get_rank() == 0 + + # Build the indexed mapping if not exist. + if should_process_dataset: + if ( + (not os.path.isfile(doc_idx_filename)) + or (not os.path.isfile(sample_idx_filename)) + or (not os.path.isfile(shuffle_idx_filename)) + ): + print_rank_0( + " > WARNING: could not find index map files, building " + "the indices on rank 0 ..." + ) + # doc-idx. + start_time = time.time() + if packing_impl == "pack_until_overflow": + # Naively pack data until it overflows, then roll it over to a new one instead. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = [] + doc_idx = [] + # Iterate over files until we have enough samples. + temp_shuffle_idx = np.arange(len(documents)) + np_rng.shuffle(temp_shuffle_idx) + running_length = 0 + curr_shuffle_idx = 0 + while len(sample_idx) < num_samples: + # If not allow_chopped, skip this item if it's chopped. + if not allow_chopped: + if ( + pos_sizes[temp_shuffle_idx[curr_shuffle_idx]] + < seq_length + 1 + ): + curr_shuffle_idx += 1 + continue + if ( + neg_sizes[temp_shuffle_idx[curr_shuffle_idx]] + < seq_length + 1 + ): + curr_shuffle_idx += 1 + continue + # Then, check if we need to skip this item... + if pos_label_dataset is not None: + if np.all( + pos_label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + if np.all( + neg_label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + doc_length = max( + pos_sizes[temp_shuffle_idx[curr_shuffle_idx]], + neg_sizes[temp_shuffle_idx[curr_shuffle_idx]], + ) + if running_length == 0: + sample_idx.append(np.array([len(doc_idx), 0])) + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + running_length += doc_length + else: + if running_length + doc_length > (seq_length + 1): + running_length = doc_length + sample_idx.append(np.array([len(doc_idx), 0])) + else: + running_length += doc_length + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + curr_shuffle_idx += 1 + if curr_shuffle_idx == len(documents): + curr_shuffle_idx = 0 + np_rng.shuffle(temp_shuffle_idx) + sample_idx.append(np.array([len(doc_idx), 0])) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + elif packing_impl == "unpacked": + # Unpacked data, one sample per document. + shuffle_idx = np.array([i % len(documents) for i in range(num_samples)]) + np_rng.shuffle(shuffle_idx) + sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int64) + sample_idx[:, 0] = np.array([i for i in range(num_samples + 1)]) + sample_idx[:, 1] = 0 + doc_idx = list() + doc_i = 0 + while len(doc_idx) <= num_samples: + # Check if we need to skip this item... 
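+                # Pairs where either side's labels are entirely -100, or (when
+                # allow_chopped is False) where either side exceeds seq_length + 1
+                # tokens, are skipped.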
+ if not allow_chopped: + # +1 since we shift left/right by 1 + if pos_sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + if neg_sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + # In theory if we don't allow chopped we should be able to skip it, but the warm fuzzies I get + # from this are worth the extra bool check + if np.all(pos_label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + if np.all(neg_label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + doc_idx.append(doc_i) + doc_i = (doc_i + 1) % len(documents) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_io_parallel_group()) + assert counts[0].item() == torch.distributed.get_world_size( + group=mpu.get_io_parallel_group() + ) + + # Load mappings. + start_time = time.time() + print_rank_0(" > loading doc-idx mapping from {}".format(doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading sample-idx mapping from {}".format(sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading shuffle-idx mapping from {}".format(shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0( + " loaded indexed file in {:3.3f} seconds".format(time.time() - start_time) + ) + print_rank_0(" total number of samples: {}".format(sample_idx.shape[0])) + print_rank_0(" total number of epochs: {}".format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence length, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng): + """Build an array with length = number-of-epochs * number-of-documents. + Each index is mapped to a corresponding document.""" + doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + +def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int64) + + # Index into sample_idx. 
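+    # (Hypothetical walk-through, assuming one epoch over doc_idx = [0, 1] with
+    # sizes = [3, 6] and seq_length = 4: the loop below records
+    # sample_idx = [[0, 0], [1, 1], [1, 5]], i.e. sample 0 spans all of doc 0 plus
+    # doc 1 tokens 0-1 and sample 1 spans doc 1 tokens 1-5; adjacent samples
+    # overlap by one token because each sample fetches seq_length + 1 tokens.)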
+ sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Beginning offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += remaining_seq_length + doc_length - 1 + remaining_seq_length = 0 + else: + # Otherwise, start from the beginning of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(size, np_rng): + """Build the range [0, size) and shuffle.""" + dtype_ = np.uint32 + if size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx) + return shuffle_idx diff --git a/megatron/data/samplers.py b/megatron/data/samplers.py index a9428e41c..5206636d8 100644 --- a/megatron/data/samplers.py +++ b/megatron/data/samplers.py @@ -100,7 +100,11 @@ class DistributedBatchSampler(data.sampler.BatchSampler): specifying True will result in the following samples for each gpu: GPU0: [0,2,4,6] GPU1: [1,3,5,7] specifying False will result in the following samples: - GPU0: [0,1,2,3] GPU1: [4,5,6,7]""" + GPU0: [0,1,2,3] GPU1: [4,5,6,7] + + The `infinite_loop` parameter allows the sampler to yield batches indefinitely, + restarting from the beginning of the dataset when all samples have been iterated over. + """ def __init__( self, diff --git a/megatron/logging.py b/megatron/logging.py index 247aeb1b5..af8a41fe5 100644 --- a/megatron/logging.py +++ b/megatron/logging.py @@ -23,6 +23,7 @@ from megatron import mpu, print_rank_0 from megatron.utils import report_memory +import math class Tee: @@ -106,6 +107,38 @@ def get_flops(neox_args, iter_time_s) -> float: + 18 * hidden_size * hidden_size * num_layers / num_heads ) ) + elif "mamba" in neox_args.attention_config: + # from https://github.com/Zyphra/zcookbook/blob/main/calc/calc_mamba_flops.py + if neox_args.expansion_factor: + d_inner = neox_args.hidden_size * neox_args.expansion_factor + elif neox_args.intermediate_size: + d_inner = neox_args.intermediate_size + else: + d_inner = neox_args.hidden_size * 2 # default expansion factor + d_state = 16 # TODO make d_state an arg. Currently hardcoded in neox mamba definition and here + conv_dimension = 4 # TODO make conv_dimension an arg. 
Currently hardcoded in neox mamba definition and here + dt_rank = math.ceil(neox_args.hidden_size / 16) + ssm_flops = ( + ckpt_activations_factor + * d_inner + * seq_len + * batch_size + * (11 * d_state + 4 * dt_rank + 1) + ) + mamba_projectors_flops = ( + ckpt_activations_factor * seq_len * batch_size * 6 * d_inner * hidden_size + ) + mamba_conv_flops = ( + ckpt_activations_factor + * seq_len + * batch_size + * 2 + * d_inner + * conv_dimension + ) + mamba_flops = ssm_flops + mamba_projectors_flops + mamba_conv_flops + embedding_flops = 6 * seq_len * batch_size * hidden_size * vocab_size + flops_per_iteration = mamba_flops * num_layers + embedding_flops else: flops_per_iteration = ( 24 @@ -201,6 +234,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # write losses, lr, etc. every step @@ -210,6 +244,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) for key in loss_dict: tb_wandb_log( @@ -218,6 +253,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) if neox_args.fp16: tb_wandb_log( @@ -226,6 +262,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # log gradient noise scale @@ -237,6 +274,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # (optional) Log optimizer states to wandb / tb every step @@ -251,6 +289,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # (optional) Log grad/param norms to wandb / tb every step @@ -276,6 +315,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, all_ranks=True, ) if neox_args.log_grad_norm: @@ -291,6 +331,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, all_ranks=True, ) if neox_args.log_param_norm: @@ -300,6 +341,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, all_ranks=True, ) @@ -315,6 +357,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) tb_wandb_log( "runtime/iteration_time", @@ -322,6 +365,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) log_string += " iteration {:8d}/{:8d} |".format( iteration, neox_args.train_iters @@ -342,6 +386,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # log tflop / gpu @@ -356,6 +401,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, 
tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) for key in total_loss_dict: @@ -394,6 +440,7 @@ def tb_wandb_log( iteration_no: int, use_wandb: bool, tensorboard_writer=None, + comet_experiment=None, all_ranks: bool = False, ): # logs to both tb and wandb (if present) from the zeroth rank @@ -403,3 +450,7 @@ def tb_wandb_log( tensorboard_writer.add_scalar(key, value, iteration_no) if use_wandb: wandb.log({key: value}, step=iteration_no) + if comet_experiment: + comet_experiment.__internal_api__log_metric__( + key, value, framework="gpt-neox", step=iteration_no + ) diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 619b4c33d..23be28936 100755 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -16,5 +16,8 @@ # limitations under the License. from .gpt2_model import GPT2ModelPipe -from .utils import get_params_for_weight_decay_optimization +from .utils import ( + get_params_for_weight_decay_optimization, + mark_norms_for_sequence_parallel_grad_sync, +) from .word_embeddings import SoftEmbedding diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 7a29b0716..c0b825261 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -25,9 +25,23 @@ def get_activation(neox_args): - """retrieves the activation function specified in neox_args""" + """retrieves the activation function specified in neox_args and whether or not the activation is gated""" + is_gated = False if neox_args.activation == "geglu": - activation_func = GEGLU(neox_args=neox_args) + is_gated = True + activation_func = F.gelu + elif neox_args.activation == "reglu": + is_gated = True + activation_func = F.relu + elif neox_args.activation == "bilinear": + is_gated = True + activation_func = lambda x: x + elif neox_args.activation == "swiglu": + is_gated = True + activation_func = swish + elif neox_args.activation == "glu": + is_gated = True + activation_func = F.sigmoid elif neox_args.activation == "gelu": if neox_args.onnx_safe and neox_args.bias_gelu_fusion: raise ValueError("onnx_safe + bias_gelu_fusion not compatible") @@ -49,7 +63,7 @@ def get_activation(neox_args): activation_func = F.silu else: raise ValueError(f"Activation function {neox_args.activation} not recognized") - return activation_func + return activation_func, is_gated ###### BIAS GELU FUSION/ NO AUTOGRAD ################ @@ -119,21 +133,3 @@ def swish(x, beta: float = 1.0): @torch.jit.script def mish(x): return x * torch.tanh(F.softplus(x)) - - -class GEGLU(torch.nn.Module): - def __init__(self, neox_args): - super(GEGLU, self).__init__() - if neox_args.onnx_safe: - self.activation_func = erf_gelu - else: - self.activation_func = F.gelu - - def forward(self, x, bias=None): - x, gate = x.chunk(2, dim=-1) - if bias is not None: - bias_1, bias_2 = bias.chunk(2, dim=-1) - x = x + bias_1 - gate = gate + bias_2 - intermediate_parallel = self.activation_func(gate) - return intermediate_parallel * x diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index d33ded506..3fd251147 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -21,7 +21,10 @@ except: HAVE_PERSIST_LAYER_NORM = False -from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction +from apex.normalization.fused_layer_norm import ( + FusedLayerNormAffineFunction, + FusedRMSNormAffineFunction, +) global fused_layer_norm_cuda @@ -148,3 +151,112 @@ def forward(self, input): ) return 
output + + +class MixedFusedRMSNorm(torch.nn.Module): + def __init__( + self, + normalized_shape, + eps=1e-5, + no_persist_layer_norm=True, + sequence_parallel=False, + apply_rmsnorm_1p=False, + mem_efficient_rms=True, + ): + super(MixedFusedRMSNorm, self).__init__() + + self.apply_rmsnorm_1p = apply_rmsnorm_1p + self.mem_efficient_rms = mem_efficient_rms + self.norm_fn = FusedRMSNormAffineFunction + + global fused_layer_norm_cuda + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. + persist_ln_hidden_sizes = [ + 1024, + 1536, + 2048, + 2304, + 3072, + 3840, + 4096, + 5120, + 6144, + 8192, + 10240, + 12288, + 12800, + 15360, + 16384, + 18432, + 20480, + 24576, + 25600, + 30720, + 32768, + 40960, + 49152, + 65536, + ] + if ( + normalized_shape not in persist_ln_hidden_sizes + or not HAVE_PERSIST_LAYER_NORM + ): + no_persist_layer_norm = True + + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + self.normalized_shape = torch.Size(normalized_shape) + self.eps = eps + self.scale = Parameter(torch.Tensor(*normalized_shape)) + self.reset_parameters() + self.no_persist_layer_norm = no_persist_layer_norm + self.sequence_parallel = sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.scale, "sequence_parallel", self.sequence_parallel) + + def reset_parameters(self): + + if self.apply_rmsnorm_1p: + init.zeros_(self.scale) + else: + init.ones_(self.scale) + + def forward(self, input): + + weight = self.scale + 1 if self.apply_rmsnorm_1p else self.scale + # CPU path is here for unittest sake. + if not input.is_cuda: + print( + "WARNING! The input of FusedLayerNorm should be on the GPU." + "This warning should only be triggered in the FusedRMSNorm unit tests." + ) + # Latest pytorch actually supports F.rms_norm but I don't want to break builds so... + return F.layer_norm(input, self.normalized_shape, weight, None, self.eps) + + # Apex does not have versions yet (https://github.com/NVIDIA/apex/pull/1648), so we need to inspect + # the function manually on whether the extra arg introduced in https://github.com/NVIDIA/apex/pull/1715 exists yet + if "memory_efficient" in inspect.getfullargspec(self.norm_fn.forward).args: + return self.norm_fn.apply( + input, + weight, + self.normalized_shape, + self.eps, + self.mem_efficient_rms, + ) + else: + return self.norm_fn.apply(input, weight, self.normalized_shape, self.eps) + + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. 
+ output = make_viewless_tensor( + inp=output, requires_grad=input.requires_grad, keep_graph=True + ) + + return output diff --git a/megatron/model/gmlp.py b/megatron/model/gmlp.py index c3462c651..6400640bd 100644 --- a/megatron/model/gmlp.py +++ b/megatron/model/gmlp.py @@ -112,7 +112,7 @@ def __init__( init_method=init_method, skip_bias_add=True, ) - self.activation_func = get_activation(neox_args) + self.activation_func, _ = get_activation(neox_args) ff_dim_parallel = mpu.divide(ff_dim, mpu.get_model_parallel_world_size()) if neox_args.attention_config[layer_number] == "amlp": d_attn = neox_args.gmlp_attn_dim diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 9e643874a..7899048db 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -308,7 +308,10 @@ def _logits_helper(embedding, lm_output): ) logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output + lm_output, + embedding.word_embeddings_weight, + self.parallel_output, + seq_parallel=self.neox_args.sequence_parallel, ) return logits diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 86a003dbd..8a0b8e251 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -145,7 +145,7 @@ def init_(tensor, use_mup=use_mup_outer): def small_init_init_method(dim, use_mup_outer=False, mup_init_scale=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving - the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution.""" + the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2019), using a normal distribution.""" std = math.sqrt(2 / (5 * dim)) def init_(tensor, use_mup=use_mup_outer): diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index d5d6b336f..950e36fed 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -14,7 +14,8 @@ import einops except ModuleNotFoundError: print( - "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba" + "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \ + or directly from https://github.com/state-spaces/mamba" ) pass @@ -45,12 +46,21 @@ def __init__( neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion ), "Mamba fused inner fn and bias in x_proj not compatible!" 
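+        # At most one of intermediate_size / expansion_factor may be set; when
+        # neither is given, the default 2x expansion below is used.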
+ assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + # set variables, mostly following mamba defaults self.d_model = neox_args.hidden_size self.d_state = 16 # state dimensions per channel self.d_conv = 4 # convolution width - self.expand = 2 # linear projection expansion factors - self.d_inner = int(self.expand * self.d_model) + if neox_args.intermediate_size: + self.d_inner = neox_args.intermediate_size + else: + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 2 + ) + self.d_inner = int(self.expand * self.d_model) self.dt_rank = math.ceil(self.d_model / 16) # rank of dt / Delta parameter self.dt_scale = 1.0 diff --git a/megatron/model/norms.py b/megatron/model/norms.py index dda44659f..ba175d3eb 100644 --- a/megatron/model/norms.py +++ b/megatron/model/norms.py @@ -18,18 +18,34 @@ def get_norm(neox_args): if neox_args.norm == "rmsnorm": - norm = RMSNorm eps = neox_args.rms_norm_epsilon + if neox_args.rmsnorm_fusion: + from .fused_layer_norm import MixedFusedRMSNorm + + norm = MixedFusedRMSNorm + else: + norm = RMSNorm elif neox_args.norm == "layernorm": eps = neox_args.layernorm_epsilon if neox_args.layernorm_fusion: from .fused_layer_norm import MixedFusedLayerNorm + norm = MixedFusedLayerNorm else: norm = LayerNorm elif neox_args.norm == "scalenorm": eps = neox_args.scalenorm_epsilon norm = ScaleNorm + elif neox_args.norm == "te_rmsnorm": + from .transformer_engine import TERMSNorm + + norm = TERMSNorm + eps = neox_args.rms_norm_epsilon + elif neox_args.norm == "te_layernorm": + from .transformer_engine import TELayerNorm + + norm = TELayerNorm + eps = neox_args.layernorm_epsilon else: raise ValueError(f"norm {neox_args.norm} not recognized") return norm, eps diff --git a/megatron/model/rwkv/v6/rwkv.py b/megatron/model/rwkv/v6/rwkv.py index 5d4e0d144..b3741a3fc 100644 --- a/megatron/model/rwkv/v6/rwkv.py +++ b/megatron/model/rwkv/v6/rwkv.py @@ -247,11 +247,11 @@ def __init__(self, neox_args, layer_number): self.time_maa_k = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) self.time_maa_r = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) - self.key = nn.Linear(neox_args.hidden_size, neox_args.dim_ffn, bias=False) + self.key = nn.Linear(neox_args.hidden_size, neox_args.ffn_dim, bias=False) self.receptance = nn.Linear( neox_args.hidden_size, neox_args.hidden_size, bias=False ) - self.value = nn.Linear(neox_args.dim_ffn, neox_args.hidden_size, bias=False) + self.value = nn.Linear(neox_args.ffn_dim, neox_args.hidden_size, bias=False) def forward(self, x): xx = self.time_shift(x) - x @@ -275,14 +275,23 @@ def __init__(self, neox_args, layer_number): self.layer_number = layer_number self.fp16 = neox_args.precision == "fp16" self.bf16 = neox_args.precision == "bfloat16" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" if not hasattr(neox_args, "dim_att"): neox_args.dim_att = neox_args.hidden_size - if not hasattr(neox_args, "dim_ffn"): - # Make hidden size 3.5x. 
Round to nearest multiple of 32 until we add hdim rounding logic - neox_args.dim_ffn = int((neox_args.hidden_size * 3.5) // 32 * 32) + if neox_args.intermediate_size: + neox_args.ffn_dim = neox_args.intermediate_size + else: + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + ) + neox_args.ffn_dim = int(self.expand * neox_args.hidden_size) + # Make hidden size 3.5x by default. Round to nearest multiple of 32 until we add hdim rounding logic + neox_args.ffn_dim = int(neox_args.ffn_dim // 32 * 32) assert neox_args.hidden_size % 32 == 0 assert neox_args.dim_att % 32 == 0 - assert neox_args.dim_ffn % 32 == 0 + assert neox_args.ffn_dim % 32 == 0 self.neox_args.head_size = neox_args.dim_att // neox_args.num_attention_heads self.head_size = self.neox_args.head_size self.num_attention_heads = neox_args.num_attention_heads diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 694d58166..08436d54c 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -18,6 +18,8 @@ """Transformer.""" import math +from contextlib import nullcontext + import torch import torch.nn.functional as F import torch.nn as nn @@ -48,6 +50,11 @@ ) from megatron.model.utils import configure_sparse_attention +try: + from flash_attn.ops.activations import swiglu +except ImportError: + swiglu = None + # flags required to enable jit fusion kernels torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) @@ -93,48 +100,71 @@ def __init__( init_method, output_layer_init_method, parallel_output=False, + multiple_of=256 ): super().__init__() + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" - self.activation_func = get_activation(neox_args) + self.activation_func, self.is_gated = get_activation(neox_args) self.activation_type = neox_args.activation self.bias_gelu_fusion = neox_args.bias_gelu_fusion + self.multiple_of = multiple_of - # auto scale so geglu has equal parameters - ff_mult = int(4 * 2 / 3) if self.activation_type == "geglu" else 4 - ff_dim = ( - int(ff_mult * neox_args.hidden_size) * 2 - if self.activation_type == "geglu" - else ff_mult * neox_args.hidden_size + if neox_args.intermediate_size: + ffn_dim = neox_args.intermediate_size + elif neox_args.expansion_factor: + ffn_dim = int(neox_args.expansion_factor * neox_args.hidden_size) + else: + # 4h is default for ffn_dim + ffn_dim = 4 * neox_args.hidden_size + ffn_dim_in = ffn_dim + if self.is_gated: + # set activation function to be gated implementation + self.activation_func = Gated_Activation( + self.activation_func, + (swiglu is not None) + and (neox_args.activation == "swiglu") + and neox_args.use_flashattn_swiglu, + ) + # auto scale so gated activations has equal parameters + ffn_dim = int(ffn_dim * 2 / 3) + ffn_dim_in = ffn_dim // 2 + # set multiple + ffn_dim = int( + (2 * self.multiple_of) + * ((ffn_dim + (2 * multiple_of) - 1) // (2 * multiple_of)) + ) + ffn_dim_in = int( + self.multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of) ) - self.dense_h_to_4h = mpu.ColumnParallelLinear( + + self.linear1 = mpu.ColumnParallelLinear( neox_args=neox_args, input_size=neox_args.hidden_size, - output_size=ff_dim, + output_size=ffn_dim, gather_output=False, init_method=init_method, skip_bias_add=True, + bias=neox_args.use_bias_in_mlp ) - ff_dim_in = ff_dim // 2 if self.activation_type == "geglu" else ff_dim 
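The width arithmetic introduced above is easier to check with concrete numbers. The helper below simply repeats the same calculation outside the class; the function name, the sample `hidden_size=4096`, and `multiple_of=256` (the `ParallelMLP` keyword default; the `mlp_multiple_of` config default later in this diff is 1) are illustrative choices of mine.

```python
def mlp_widths(hidden_size, multiple_of=256, intermediate_size=None,
               expansion_factor=None, is_gated=True):
    """Re-run the ParallelMLP sizing arithmetic from this diff (illustration only)."""
    if intermediate_size:
        ffn_dim = intermediate_size
    elif expansion_factor:
        ffn_dim = int(expansion_factor * hidden_size)
    else:
        ffn_dim = 4 * hidden_size              # 4h default
    ffn_dim_in = ffn_dim
    if is_gated:
        ffn_dim = int(ffn_dim * 2 / 3)         # keep parameter count close to the non-gated MLP
        ffn_dim_in = ffn_dim // 2              # down-projection only sees the value half
        # round the fused (value + gate) width up to 2*multiple_of, and the half to multiple_of
        ffn_dim = (2 * multiple_of) * ((ffn_dim + 2 * multiple_of - 1) // (2 * multiple_of))
        ffn_dim_in = multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of)
    return ffn_dim, ffn_dim_in

# hidden_size 4096 with a gated activation: linear1 emits 11264 (= 2 * 5632), linear2 consumes 5632
print(mlp_widths(4096))  # (11264, 5632)
```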
# Project back to h. - self.dense_4h_to_h = mpu.RowParallelLinear( + self.linear2 = mpu.RowParallelLinear( neox_args=neox_args, - input_size=ff_dim_in, + input_size=ffn_dim_in, output_size=neox_args.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, parallel_output=parallel_output, skip_bias_add=True, + bias=neox_args.use_bias_in_mlp, ) - def forward(self, hidden_states): + # [s, b, intermediate_size] + intermediate_parallel, bias_parallel = self.linear1(hidden_states) - # [s, b, 4hp] - intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) - - if ( - self.activation_type == "gelu" and self.bias_gelu_fusion - ) or self.activation_type == "geglu": + if self.is_gated or (self.activation_type == "gelu" and self.bias_gelu_fusion): intermediate_parallel = self.activation_func( intermediate_parallel, bias_parallel ) @@ -144,76 +174,27 @@ def forward(self, hidden_states): ) # [s, b, h] - output, output_bias = self.dense_4h_to_h(intermediate_parallel) + output, output_bias = self.linear2(intermediate_parallel) return output, output_bias -class LLaMAParallelMLP(nn.Module): - """LLaMA's MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. At the end, dropout is also - applied. - - Note: multiple_of is used to compute the hidden dimension of the MLP - """ - - def __init__( - self, - neox_args, - init_method, - output_layer_init_method, - parallel_output=False, - multiple_of=256, - ): +class Gated_Activation(torch.nn.Module): + def __init__(self, activation_func, use_swiglu=False): super().__init__() - - self.activation_func = get_activation(neox_args) - self.activation_type = neox_args.activation - - self.multiple_of = multiple_of - - # Allow custom intermediate size, e.g. 
for Mistral - if neox_args.intermediate_size is not None: - ff_dim = neox_args.intermediate_size + self.activation_func = activation_func + self.use_swiglu = use_swiglu + + def forward(self, x, bias=None): + x, gate = x.chunk(2, dim=-1) + if bias is not None: + bias_1, bias_2 = bias.chunk(2, dim=-1) + x = x + bias_1 + gate = gate + bias_2 + if not self.use_swiglu: + intermediate_parallel = self.activation_func(gate) + return intermediate_parallel * x else: - ff_dim = int(2 * neox_args.hidden_size * 4 / 3) - ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) - - self.w1 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - ) - self.w3 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - ) - self.w2 = mpu.RowParallelLinear( - neox_args=neox_args, - input_size=ff_dim, - output_size=neox_args.hidden_size, - input_is_parallel=True, - init_method=output_layer_init_method, - skip_bias_add=True, - parallel_output=parallel_output, - bias=False, - ) - - def forward(self, hidden_states): - w1_out, _ = self.w1(hidden_states) - w3_out, _ = self.w3(hidden_states) - return self.w2(self.activation_func(w1_out) * w3_out) + return swiglu(gate, x) class ParallelLinear(nn.Module): @@ -229,7 +210,8 @@ def __init__( is_last_layer=False, ): super().__init__() - parallelism = neox_args.output_layer_parallelism + self.is_rm = neox_args.train_impl == "rm" + parallelism = neox_args.output_layer_parallelism if not self.is_rm else "row" if parallelism == "column": self.final_linear = mpu.ColumnParallelLinear( neox_args=neox_args, @@ -240,27 +222,43 @@ def __init__( gather_output=not parallel_output, skip_bias_add=False, mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here + seq_dim=1, # important: must mark that this layer receives shape [b, s, h] not [s, b, h] and so Seq. Parallel comms must gather along dim=1 rather than dim=0 ) - - # else: - # print( - # 'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.' - # ) - # exit() - # self.final_linear = mpu.RowParallelLinear( - # neox_args=neox_args, - # input_size=neox_args.hidden_size, - # output_size=neox_args.padded_vocab_size, - # bias=False, - # input_is_parallel=False, - # init_method=init_method, - # parallel_output=parallel_output, - # skip_bias_add=False, - # mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here - # ) + else: + if not self.is_rm: + print( + 'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.' 
+ ) + exit() + # self.final_linear = mpu.RowParallelLinear( + # neox_args=neox_args, + # input_size=neox_args.hidden_size, + # output_size=neox_args.padded_vocab_size, + # bias=False, + # input_is_parallel=False, + # init_method=init_method, + # parallel_output=parallel_output, + # skip_bias_add=False, + # mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here + # ) + else: # Not using cross entropy loss for RMs + self.rm_linear = mpu.RowParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=1, + bias=False, + input_is_parallel=False, + init_method=init_method, + parallel_output=False, + skip_bias_add=False, + mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here + ) def forward(self, hidden_states): - return self.final_linear(hidden_states) + if not self.is_rm: + return self.final_linear(hidden_states) + else: + return self.rm_linear(hidden_states) class ParallelSelfAttention(nn.Module): @@ -699,9 +697,13 @@ def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask): rpe = self.rpe(query_layer.size(0), key_layer.size(0)) else: rpe = None - return self.sparse_attn( + attn_scores = self.sparse_attn( query_layer, key_layer, value_layer, attn_mask=attn_mask, rpe=rpe ) + # apply dropout + if self.training: + attn_scores = self.attention_dropout(attn_scores) + return attn_scores def gqa_project(self, hidden_states, attention_mask, layer_past=None): # QKV projection and separation into separate Q/K/V layers for GQA, @@ -712,51 +714,16 @@ def gqa_project(self, hidden_states, attention_mask, layer_past=None): # pass through projection: [sq, b, h] --> [sq, b, ((np + 2 * kvp) * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) - # First: reshape so we have seqlen, batch, and num. query heads each as separate dims - # Final dim is not exactly head dim: the first (head dim) dims are query heads, - # The last (head dim * ratio of kv to q heads) each are the "k/v heads" - # (right now we treat like we have same num. heads, but smaller head dim) - - # [sq, b, ((np + 2 * kvp) * hn)] --> [sq, b, np, (hn * (1 + 2 * (kvp / np)))] - new_qkv_shape = ( - mixed_x_layer.shape[0], - mixed_x_layer.shape[1], - self.num_attention_heads_per_partition, - int( - self.hidden_size_per_attention_head - * ( - 1 - + 2 - * ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - ) - ), - ) - mixed_x_layer = mixed_x_layer.reshape(*new_qkv_shape) - - # Next: split our fake head dim. 
(last dim) so that the first (head dim) dimensions go to Q, - # the last smaller 2 * (head dim * kv to q head ratio) each divided between K and V separately + # split the last dim, so that the first (q head * head dim) dimensions go to Q, + # the last smaller 2 * (kv head * head dim) each divided between K and V separately split_sizes = ( - self.hidden_size_per_attention_head, - int( - ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - * self.hidden_size_per_attention_head - ), - int( - ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - * self.hidden_size_per_attention_head - ), + self.num_attention_heads_per_partition + * self.hidden_size_per_attention_head, + self.num_kv_heads_per_partition * self.hidden_size_per_attention_head, + self.num_kv_heads_per_partition * self.hidden_size_per_attention_head, ) - # [sq, b, np, (hn * (1 + 2 * (kvp / np)))] --> 1 x [sq, b, np, hn] , 2 x [sq, b, np, (hn * (kvp / np))] + # [sq, b, ((np + 2 * kvp) * hn)] --> 1 x [sq, b, np * hn] , 2 x [sq, b, kvp * hn] (query_layer, key_layer, value_layer) = [ x.contiguous() for x in torch.split( @@ -766,6 +733,17 @@ def gqa_project(self, hidden_states, attention_mask, layer_past=None): ) ] + # reshape Q to proper output shape (last dim = correct full "real" head size again) + # [sq, b, np * hn] --> [sq, b, np, hn] + new_query_shape = ( + query_layer.size(0), + query_layer.size(1), + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + + query_layer = query_layer.view(*new_query_shape) + # reshape K/V to proper output shape (last dim = correct full "real" head size again) # 2 x [sq, b, np, (hn * (kvp / np))] --> 2 x [sq, b, kvp, hn] new_kv_shape = ( @@ -956,7 +934,7 @@ def __init__( self.bias_dropout_fusion = neox_args.bias_dropout_fusion self.gpt_j_residual = neox_args.gpt_j_residual self.gpt_j_tied = neox_args.gpt_j_tied - self.mlp_type = neox_args.mlp_type + self.activation = neox_args.activation self.num_experts = ( neox_args.moe_num_experts if layer_number % neox_args.moe_expert_interval == 0 @@ -964,7 +942,14 @@ def __init__( ) if self.gpt_j_residual: - self.reduce = mpu.mappings.reduce_from_model_parallel_region + # GPT-J style layers allow us to defer the reduction of results across TP ranks until the end of the two sublayers. + # the reduction we use is a simple allreduce for pure Tensor Parallel, + # but needs to be a reduce-scatter when using Megatron-style Sequence Parallel (LN sharding.) + self.reduce = ( + mpu.mappings.reduce_from_model_parallel_region + if not neox_args.sequence_parallel + else mpu.mappings.reduce_scatter_to_sequence_parallel_region + ) # Self attention. 
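The rewritten `gqa_project` above swaps the old per-head "fake head dim" reshape for a flat split of the fused QKV projection followed by per-tensor reshapes. The toy walkthrough below shows only that shape bookkeeping; the sizes (8 tokens, 2 sequences, 16 query heads, 4 KV heads, head dim 64) are made up, and the real code operates on per-partition head counts with the rest of the attention machinery around it.

```python
import torch

sq, b, np_, kvp, hn = 8, 2, 16, 4, 64                      # illustrative sizes only
mixed = torch.randn(sq, b, (np_ + 2 * kvp) * hn)           # fused QKV: [sq, b, (np + 2*kvp) * hn]

# flat split over the last dim: [np*hn | kvp*hn | kvp*hn]
query, key, value = (
    t.contiguous()
    for t in torch.split(mixed, [np_ * hn, kvp * hn, kvp * hn], mixed.dim() - 1)
)

query = query.view(sq, b, np_, hn)                         # [sq, b, np, hn]
key = key.view(sq, b, kvp, hn)                             # [sq, b, kvp, hn]
value = value.view(sq, b, kvp, hn)

print(query.shape, key.shape, value.shape)
# torch.Size([8, 2, 16, 64]) torch.Size([8, 2, 4, 64]) torch.Size([8, 2, 4, 64])
```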
self.attention = ParallelSelfAttention( @@ -984,31 +969,20 @@ def __init__( # leads to cleaner code self.post_attention_layernorm = norm(neox_args.hidden_size, eps=eps) - # Dense MLP selector - def get_mlp(mlp_type, **kw): - if mlp_type == "regular": - return ParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - elif mlp_type == "llama": - return LLaMAParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - else: - raise KeyError(mlp_type) + # MLP + def get_mlp(**kw): + return ParallelMLP( + neox_args=neox_args, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + parallel_output=self.gpt_j_residual, + multiple_of=neox_args.mlp_multiple_of, + **kw, + ) # Dense MLP if self.num_experts <= 1: - self.mlp = get_mlp(neox_args.mlp_type) - # Dropless MoE MLP + self.mlp = get_mlp() else: self.mlp = ParallelDroplessMoE( neox_args=neox_args, @@ -1058,23 +1032,27 @@ def forward(self, x, attention_mask, layer_past=None): attention_output, presents = attention_output self.layer_past = presents - with torch.enable_grad(): - attention_output = bias_dropout_fn( - attention_output, - bias=attention_bias.expand_as(attention_output), - residual=None, - prob=self.hidden_dropout, - ) + if attention_bias is not None: + with torch.enable_grad() if not self.eval else nullcontext(): + attention_output = bias_dropout_fn( + attention_output, + bias=attention_bias.expand_as(attention_output), + residual=None, + prob=self.hidden_dropout, + ) # mlp operator mlp_output, mlp_bias = self.mlp(x2) - with torch.enable_grad(): - output = bias_dropout_fn( - mlp_output, - bias=mlp_bias.expand_as(mlp_output), - residual=attention_output, - prob=self.hidden_dropout, - ) + if mlp_bias is not None: + with torch.enable_grad() if not self.eval else nullcontext(): + output = bias_dropout_fn( + mlp_output, + bias=mlp_bias.expand_as(mlp_output), + residual=attention_output, + prob=self.hidden_dropout, + ) + else: + output = mlp_output # output = (x + attn(ln(x)) + mlp(ln(x)) output = residual + self.reduce(output) @@ -1092,7 +1070,7 @@ def forward(self, x, attention_mask, layer_past=None): if self.use_cache: attention_output, presents = attention_output self.layer_past = presents - with torch.enable_grad(): + with torch.enable_grad() if not self.eval else nullcontext(): if attention_bias is not None: # Use special bias_dropout_fn if we have a bias term from the above attention layer attention_output = bias_dropout_fn( @@ -1118,9 +1096,9 @@ def forward(self, x, attention_mask, layer_past=None): # call signatures of both dense and MoE are the same mlp_output, mlp_bias = self.mlp(layernorm_output) - with torch.enable_grad(): - # dense llama MLP and MoE don't support bias - if self.mlp_type == "llama" or self.num_experts > 1: + with torch.enable_grad() if not self.eval else nullcontext(): + # MoE don't support bias + if mlp_bias == None or self.num_experts > 1: # No dropout either assert mlp_bias is None output = mlp_output + attention_output @@ -1173,10 +1151,25 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): +def parallel_lm_logits( + input_, + word_embeddings_weight, + parallel_output, + seq_parallel=False, + seq_dim=1, + bias=None, +): """LM logits using word embedding weights.""" # Parallel logits. 
- input_parallel = mpu.copy_to_model_parallel_region(input_) + if seq_parallel: + # if using Sequence Parallelism, our logits are sharded along the sequence dimension. + # gather them here. (backward pass: reduce-scatter) + input_parallel = mpu.gather_from_sequence_parallel_region( + input_, seq_dim=seq_dim + ) + else: + # Set up backprop all-reduce. + input_parallel = mpu.copy_to_model_parallel_region(input_) # Matrix multiply. if bias is None: diff --git a/megatron/model/transformer_engine.py b/megatron/model/transformer_engine.py new file mode 100644 index 000000000..338513a97 --- /dev/null +++ b/megatron/model/transformer_engine.py @@ -0,0 +1,137 @@ +import torch + +try: + import transformer_engine as te +except ImportError: + raise ImportError( + "Unable to import transformer-engine. Please refer to " + "https://github.com/NVIDIA/TransformerEngine for installation instructions." + ) + + +class TERMSNorm(torch.nn.Module): + def __init__(self, dim, eps=1e-8, **kwargs): + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `RMSNorm` based on input + :param dim: model size + :param eps: epsilon value, default 1e-8 + """ + super(TERMSNorm, self).__init__() + + self.d = dim + self.eps = eps + self.norm = te.pytorch.RMSNorm( + hidden_size=self.d, + eps=self.eps, + **kwargs, + ) + + def forward(self, x): + return self.norm(x) + + +class TELayerNorm(torch.nn.Module): + def __init__(self, dim, eps=1.0e-5, **kwargs): + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `LayerNorm` based on input + :param dim: model size + :param eps: epsilon value, default 1.0e-5 + """ + super(TELayerNorm, self).__init__() + + self.d = dim + self.eps = eps + self.norm = te.pytorch.LayerNorm( + hidden_size=self.d, + eps=self.eps, + **kwargs, + ) + + def forward(self, x): + return self.norm(x) + + +class TELinear(te.pytorch.Linear): + """ + Wrapper for the Transformer-Engine's `Linear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear): + """ + Wrapper for the Transformer-Engine's `LayerNormLinear` layer that combines + layernorm and linear layers + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEColumnParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `ColumnParallelLinear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TERowParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `RowParallelLinear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEDotProductAttention(te.pytorch.DotProductAttention): + """ + Wrapper for the Transformer-Engine's `DotProductAttention` layer that also + has "flash attention" enabled. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEDelayedScaling(te.common.recipe.DelayedScaling): + """ + Wrapper for the Transformer-Engine's `DelayedScaling` layer. 
+ """ + + def __init__(self): + # TODO + return diff --git a/megatron/model/utils.py b/megatron/model/utils.py index c3da2ce8b..8176f1f7a 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,14 +18,18 @@ """Utilities for models.""" import torch -from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm from megatron.model.fused_softmax import SoftmaxFusionTypes +from megatron import mpu from types import GeneratorType import torch.distributed as dist +import importlib +from typing import List, Dict, Any -def get_params_for_weight_decay_optimization(module, neox_args): - """Divide params into with-weight-decay and without-weight-decay groups. + +def get_params_for_weight_decay_optimization(module: Any, neox_args: Any): + """ + Divide params into with-weight-decay and without-weight-decay groups. Layernorms and biases will have no weight decay but the rest will. """ weight_decay_params = {"params": [], "name": "weight_decay_params"} @@ -34,41 +38,38 @@ def get_params_for_weight_decay_optimization(module, neox_args): "weight_decay": 0.0, "name": "no_weight_decay_params", } - for module_ in module.modules(): - if any( - [ - isinstance(module_, LayerNorm), - isinstance(module_, RMSNorm), - isinstance(module_, ScaleNorm), + + def is_no_weight_decay_module(module_: Any) -> bool: + return ( + type(module_).__name__ + in [ + "LayerNorm", + "RMSNorm", + "ScaleNorm", + "TELayerNorm", + "TERMSNorm", + "MixedFusedLayerNorm", + "MixedFusedRMSNorm", ] - ) or ( - neox_args.weight_decay == 0.0 - ): # also include all parameters here if no weight decay is being done + or neox_args.weight_decay == 0.0 + ) + + for module_ in module.modules(): + if is_no_weight_decay_module(module_): no_weight_decay_params["params"].extend( - [p for p in list(module_._parameters.values()) if p is not None] + [p for p in module_._parameters.values() if p is not None] ) else: - weight_decay_params["params"].extend( - [ - p - for n, p in list(module_._parameters.items()) - if p is not None - and n != "bias" - and not getattr(p, "_no_weight_decay", False) - ] - ) - no_weight_decay_params["params"].extend( - [ - p - for n, p in list(module_._parameters.items()) - if p is not None - and (n == "bias" or getattr(p, "_no_weight_decay", False)) - ] - ) + for name, param in module_._parameters.items(): + if param is None: + continue + if name == "bias" or getattr(param, "_no_weight_decay", False): + no_weight_decay_params["params"].append(param) + else: + weight_decay_params["params"].append(param) + if neox_args.weight_decay == 0.0: - # only return a single param group - # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. - # to avoid this, only use a single param group when weight decay is off. + # Only return a single param group to minimize calls to compressed_allreduce with onebitadam return [no_weight_decay_params] return weight_decay_params, no_weight_decay_params @@ -359,3 +360,45 @@ def get_fusion_type(neox_args): elif neox_args.scaled_masked_softmax_fusion: fusion_type = SoftmaxFusionTypes.general return fusion_type + + +def reduce_weight_grads_from_model_parallel_region(input_): + """A hook that can be applied to any weight tensor via .register_hook(). + Allreduces grads for e.g. LN weights across the model parallel group. + Needed to keep LNs in sync, despite them getting diff data -> diff gradients when using sequence parallel. + """ + # Bypass the function if no TP -> no comm needed. 
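The rewritten grouping helper above filters norm layers by class name instead of importing each norm class, and still returns plain optimizer param-group dicts. If it helps to see where those dicts end up, here is a rough standalone analogue for an ordinary `nn.Module`; the toy model, the `toy_param_groups` name, and the direct `AdamW` call are mine for illustration and are not how the trainer actually constructs its DeepSpeed-wrapped optimizer.

```python
import torch
import torch.nn as nn

NO_WEIGHT_DECAY_CLASSES = {"LayerNorm", "RMSNorm", "ScaleNorm", "TELayerNorm",
                           "TERMSNorm", "MixedFusedLayerNorm", "MixedFusedRMSNorm"}

def toy_param_groups(model: nn.Module, weight_decay: float = 0.1):
    """Rough analogue of get_params_for_weight_decay_optimization for a plain module."""
    decay = {"params": [], "weight_decay": weight_decay, "name": "weight_decay_params"}
    no_decay = {"params": [], "weight_decay": 0.0, "name": "no_weight_decay_params"}
    for module in model.modules():
        no_wd_module = type(module).__name__ in NO_WEIGHT_DECAY_CLASSES
        for name, param in module._parameters.items():
            if param is None:
                continue
            if no_wd_module or name == "bias" or getattr(param, "_no_weight_decay", False):
                no_decay["params"].append(param)
            else:
                decay["params"].append(param)
    return [decay, no_decay]

model = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16), nn.Linear(16, 4))
optimizer = torch.optim.AdamW(toy_param_groups(model), lr=1e-3)
print([len(g["params"]) for g in optimizer.param_groups])  # [2, 4]: weights vs. biases + LayerNorm params
```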
+ if mpu.get_model_parallel_world_size() == 1: + return input_ + + # Bf16 convert + dt = input_.dtype + if dt == torch.bfloat16 and mpu.get_fp32_allreduce(): + input_ = input_.float() + + # All-reduce. + dist.all_reduce(input_, group=mpu.get_model_parallel_group()) + + # Bf16 convert + if dt == torch.bfloat16 and mpu.get_fp32_allreduce(): + input_ = input_.bfloat16() + + return input_ + + +def mark_norms_for_sequence_parallel_grad_sync(module, neox_args): + """Iterate through the modules in our model, and for any "...Norm" classnames, + register a hook on each of that module's parameters which will allreduce norms' weights' grads across + the model (sequence) parallel region. + """ + + if not neox_args.sequence_parallel: + # if we aren't using sequence parallelism, this is a no-op + return + + for module_ in module.modules(): + if "norm" in type(module_).__name__.lower(): + # this is a norm, we want to allreduce its weight grads across sequence parallel region + for name, param in module_.named_parameters(): + if param.requires_grad: + param.register_hook(reduce_weight_grads_from_model_parallel_region) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index f7372bc55..ce3c1117e 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -50,6 +50,11 @@ def __init__( self.hidden_size = hidden_size self.init_method = init_method self.num_tokentypes = num_tokentypes + + self.sequence_parallel = ( + neox_args.sequence_parallel + ) # if we are using sequence parallelism, then we'll want to scatter our inputs across the seqlen dim across TP ranks + self.use_mup = neox_args.use_mup self.mup_embedding_mult = neox_args.mup_embedding_mult self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult @@ -159,6 +164,11 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): with torch.no_grad(): embeddings.mul_(self.mup_embedding_mult) + if self.sequence_parallel: + # TODO: megatron-lm does dropout using the scattered embs. This would save a tiny bit of time, perhaps? + # Not a priority since we don't often use dropout + embeddings = mpu.scatter_to_sequence_parallel_region(embeddings) + return embeddings diff --git a/megatron/mpu/__init__.py b/megatron/mpu/__init__.py index f12ba7da7..318326e5f 100644 --- a/megatron/mpu/__init__.py +++ b/megatron/mpu/__init__.py @@ -51,6 +51,9 @@ from .mappings import gather_from_expert_model_parallel_region from .mappings import reduce_from_model_parallel_region from .mappings import scatter_to_model_parallel_region +from .mappings import reduce_scatter_to_sequence_parallel_region +from .mappings import gather_from_sequence_parallel_region +from .mappings import scatter_to_sequence_parallel_region from .random import checkpoint from .random import get_cuda_rng_tracker diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 19dff0b5f..e67015ecb 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -33,6 +33,8 @@ from .mappings import gather_from_model_parallel_region from .mappings import reduce_from_model_parallel_region from .mappings import scatter_to_model_parallel_region +from .mappings import reduce_scatter_to_sequence_parallel_region +from .mappings import gather_from_sequence_parallel_region from .random import get_cuda_rng_tracker from .utils import divide from .utils import VocabUtility @@ -414,6 +416,7 @@ def __init__( keep_master_weight_for_test=False, skip_bias_add=False, mup_rescale_parameters=False, + seq_dim=0, # Dimension which is the seq_len dimension. 
final ParallelLinear overrides this to be 1 ; otherwise, the default is used throughout. ): super(ColumnParallelLinear, self).__init__() @@ -425,6 +428,10 @@ def __init__( world_size = get_model_parallel_world_size() self.output_size_per_partition = divide(output_size, world_size) self.skip_bias_add = skip_bias_add + + self.sequence_parallel = neox_args.sequence_parallel + self.seq_dim = seq_dim + self.init_method = init_method self.stride = stride self.mup_rescale_parameters = mup_rescale_parameters @@ -549,14 +556,29 @@ def set_parallel_output(self, value: bool): def forward(self, input_): if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() - # Set up backprop all-reduce. - input_parallel = copy_to_model_parallel_region(input_) + + if self.sequence_parallel: + input_parallel = input_ + else: + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) # Matrix multiply. + if self.sequence_parallel: + # do an AG in the fwd pass, RS in bwd pass. + # gather / scatter portion happens across the sequence dim (self.seq_dim)-- + # almost always is [s, b, h] and so dim 0, but for lm_head ParallelLinear it is seq_dim=1 and [b, s, h] + input_parallel = gather_from_sequence_parallel_region( + input_parallel, seq_dim=self.seq_dim + ) + bias = self.bias if not self.skip_bias_add else None output_parallel = F.linear(input_parallel, self.weight, bias) if self.gather_output: # All-gather across the partitions. + assert ( + not self.sequence_parallel + ), "sequence_parallel=True and gather_output=True are incompatible!" output = gather_from_model_parallel_region(output_parallel) else: output = output_parallel @@ -619,6 +641,12 @@ def __init__( self.input_size_per_partition = divide(input_size, world_size) self.skip_bias_add = skip_bias_add self.parallel_output = parallel_output + + self.sequence_parallel = neox_args.sequence_parallel + assert not ( + self.sequence_parallel and not self.input_is_parallel + ), "Cannot have self.input_is_parallel=False and self.sequence_parallel=True." + self.init_method = init_method self.stride = stride self.keep_master_weight_for_test = keep_master_weight_for_test @@ -744,7 +772,12 @@ def forward(self, input_): # Matrix multiply. output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. - if not self.parallel_output: + if self.sequence_parallel and not self.parallel_output: + # do an RS in the fwd pass, AG in bwd pass. + # skip in the gpt-j parallel sublayer case (self.parallel_output=True) + # (user responsible for calling reduce-scatter) + output_ = reduce_scatter_to_sequence_parallel_region(output_parallel) + elif not self.parallel_output: output_ = reduce_from_model_parallel_region(output_parallel) else: output_ = output_parallel diff --git a/megatron/mpu/mappings.py b/megatron/mpu/mappings.py index 5a2880b46..7fdef841b 100644 --- a/megatron/mpu/mappings.py +++ b/megatron/mpu/mappings.py @@ -25,7 +25,7 @@ get_fp32_allreduce, get_expert_token_counts_for_rank, ) -from .utils import split_tensor_along_last_dim +from .utils import split_tensor_along_last_dim, split_tensor_along_any_dim def _reduce(input_): @@ -35,17 +35,17 @@ def _reduce(input_): if get_model_parallel_world_size() == 1: return input_ - # Bf16 convert + # upcast to fp32 if using fp32 allreduce dt = input_.dtype - if dt == torch.bfloat16 and get_fp32_allreduce(): + if get_fp32_allreduce(): input_ = input_.float() # All-reduce. 
torch.distributed.all_reduce(input_, group=get_model_parallel_group()) - # Bf16 convert - if dt == torch.bfloat16 and get_fp32_allreduce(): - input_ = input_.bfloat16() + # reconvert to original Bf16/Fp16 dtype + if get_fp32_allreduce(): + input_ = input_.to(dt) return input_ @@ -77,11 +77,6 @@ def _gather(input_): if world_size == 1: return input_ - # Bf16 convert - dt = input_.dtype - if dt == torch.bfloat16 and get_fp32_allreduce(): - input_ = input_.float() - # Size and dimension. last_dim = input_.dim() - 1 rank = get_model_parallel_rank() @@ -185,9 +180,102 @@ def _dmoe_gather(input_: torch.Tensor, tokens_per_expert: torch.Tensor): # Note: torch.cat already creates a contiguous tensor. output = torch.cat(tensor_list, dim=gather_dim) - # Bf16 convert - if dt == torch.bfloat16 and get_fp32_allreduce(): - output = output.bfloat16() + return output + + +def _reduce_scatter_along_seq_dim(input_, seq_dim): + """Reduce-scatter the input tensor across model parallel group, scattering across sequence dim.""" + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # upcast to fp32 if using fp32 allreduce + dt = input_.dtype + if get_fp32_allreduce(): + input_ = input_.float() + + dim_size = list(input_.size()) + assert ( + isinstance(seq_dim, int) and seq_dim < len(dim_size) and seq_dim >= 0 + ), "seq_dim must be a valid tensor dim" + assert dim_size[seq_dim] % world_size == 0 + + if seq_dim == 0: + # reduce_scatter_tensor is faster but only works correctly on dimension 0 + dim_size[seq_dim] = dim_size[seq_dim] // world_size + output = torch.empty( + dim_size, dtype=input_.dtype, device=torch.cuda.current_device() + ) + torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=get_model_parallel_group() + ) + else: + tensor_list = list( + torch.split(input_, input_.shape[seq_dim] // world_size, seq_dim) + ) + output = torch.empty_like(tensor_list[0]) + torch.distributed.reduce_scatter( + output, tensor_list, group=get_model_parallel_group() + ) + + # reconvert to original Bf16/Fp16 dtype + if get_fp32_allreduce(): + output = output.to(dt) + + return output + + +def _gather_along_seq_dim(input_, seq_dim): + """Gather tensors and concatenate along the (manually-specified) sequence dimension.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU.
+ if world_size == 1: + return input_ + + dim_size = list(input_.size()) + assert ( + isinstance(seq_dim, int) and seq_dim < len(dim_size) and seq_dim >= 0 + ), "seq_dim must be a valid tensor dim" + dim_size[seq_dim] = dim_size[seq_dim] * world_size + + if seq_dim == 0: + # all_gather_into_tensor is faster but only works correctly on dimension 0 + output = torch.empty( + dim_size, dtype=input_.dtype, device=torch.cuda.current_device() + ) + torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=get_model_parallel_group() + ) + else: + input_ = input_.contiguous() + rank = get_model_parallel_rank() + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather( + tensor_list, input_, group=get_model_parallel_group() + ) + output = torch.cat(tensor_list, dim=seq_dim) + + return output + + +def _split_along_seq_dim(input_, seq_dim): + """Split the tensor along the sequence dimension (as manually selected) and keep the + corresponding slice.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # Split along the (manually-selected) sequence dimension. + input_list = split_tensor_along_any_dim(input_, world_size, seq_dim) + + # Note: torch.split does not create contiguous tensors by default. + rank = get_model_parallel_rank() + output = input_list[rank].contiguous() return output @@ -309,6 +397,65 @@ def backward(ctx, grad_output): return _dmoe_split(grad_output, tokens_per_expert), None +class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function): + """Reduce-Scatter across sequence parallel region (same as model parallel region.) + Note: same region as model parallel region + """ + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _reduce_scatter_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _reduce_scatter_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return _gather_along_seq_dim(grad_output, seq_dim=seq_dim), None + + +class _GatherFromSequenceParallelRegion(torch.autograd.Function): + """All-Gather across sequence parallel region (same region as model parallel region.)""" + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _gather_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _gather_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return _reduce_scatter_along_seq_dim(grad_output, seq_dim=seq_dim), None + + +class _ScatterToSequenceParallelRegion(torch.autograd.Function): + """Scatter (split) sequence length across sequence parallel region (=> same region as model parallel.)""" + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _split_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _split_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return ( + _gather_along_seq_dim(grad_output, seq_dim=seq_dim), + None, + ) + + # ----------------- # Helper functions.
# ----------------- @@ -336,3 +483,17 @@ def gather_from_model_parallel_region(input_): def gather_from_expert_model_parallel_region(input_, tokens_per_expert): return _GatherFromExpertModelParallelRegion.apply(input_, tokens_per_expert) + + +def reduce_scatter_to_sequence_parallel_region(input_, seq_dim=0): + return _ReduceScatterToSequenceParallelRegion.apply(input_, seq_dim) + + +def gather_from_sequence_parallel_region(input_, seq_dim=0): + return _GatherFromSequenceParallelRegion.apply(input_, seq_dim) + + +def scatter_to_sequence_parallel_region( + input_, seq_dim=1 +): # use this fn in scattering input embeds across TP ranks. There, shape of inps is [b, s, h] instead of the usual [s, b, h] + return _ScatterToSequenceParallelRegion.apply(input_, seq_dim) diff --git a/megatron/mpu/utils.py b/megatron/mpu/utils.py index 13941dc29..1f97e0e76 100644 --- a/megatron/mpu/utils.py +++ b/megatron/mpu/utils.py @@ -53,6 +53,28 @@ def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks= return tensor_list +def split_tensor_along_any_dim( + tensor, num_partitions, seq_dim, contiguous_split_chunks=False +): + """Split a tensor along a user-specified dimension. + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + seq_dim: dimension along which to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + """ + # Get the size and dimension. + seq_dim_size = divide(tensor.size()[seq_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, seq_dim_size, dim=seq_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + class VocabUtility: """Split the vocabulary into `world_size` chunks amd return the first and last index of the vocabulary belonging to the `rank` diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index a41874971..5948e6892 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -51,6 +51,20 @@ ATTENTION_TYPE_CHOICES, ) +### ANSI escape codes ### +END = "\033[0m" +GREEN = "\033[92m" +RED = "\033[91m" +YELLOW = "\033[93m" + +### Formatted logging prefixes ### +ERROR = f"{RED}[ERROR]{END} " +FAIL = f"{RED}[FAIL]{END}" +INFO = "[INFO]" +OKAY = f"{GREEN}[OKAY]{END}" +SUCCESS = f"{GREEN} [SUCCESS] {END}" +WARNING = f"{YELLOW}[WARNING]{END}" + # ZERO defaults by deespeed # These values should not be changed unless defaults in deepspeed are changed # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training @@ -155,7 +169,7 @@ def initialize_tensorboard_writer(self): try: from torch.utils.tensorboard import SummaryWriter - print("> setting tensorboard ...") + print("> setting up tensorboard ...") self.tensorboard_writer = SummaryWriter(log_dir=self.tensorboard_dir) except (ModuleNotFoundError, ImportError): print( @@ -165,6 +179,47 @@ def initialize_tensorboard_writer(self): flush=True, ) + def initialize_comet(self): + if self.use_comet and self.rank == 0: + try: + import comet_ml + + # Deactivate output logging to avoid any potential interference with Tee + self.comet_experiment = comet_ml.start( + workspace=self.comet_workspace, + project=self.comet_project, + experiment_config=comet_ml.ExperimentConfig( + auto_output_logging=False + ), + ) + self.comet_experiment.__internal_api__log_parameters__( + 
self.all_config, + framework="gpt-neox", + source="manual", + flatten_nested=True, + ) + + if self.comet_experiment_name: + self.comet_experiment.set_name(self.comet_experiment_name) + + if self.comet_tags: + self.comet_experiment.add_tags(self.comet_tags) + + if self.comet_others: + self.comet_experiment.log_others(self.comet_others) + + logging.info("> setting up comet ...") + except ImportError as e: + logging.error( + f'{FAIL} importing comet. Comet can be installed with "pip install comet_llm". See https://github.com/comet-ml/comet-llm for more info. Full error is:' + ) + raise e + except Exception as e: + logging.error( + f'{FAIL} Error setting up Comet. Either set "use_comet: False" in your configuration file, or resolve the issue with Comet. Full error is:', + ) + raise e + @classmethod def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None): """ @@ -182,7 +237,6 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None) config_files = dict() # iterate of all to be loaded yaml files for conf_file_name in paths_to_yml_files: - # load file with open(conf_file_name) as conf_file: conf = yaml.load(conf_file, Loader=yaml.FullLoader) @@ -479,7 +533,6 @@ def get_extra_deepspeed_args(self): return extra_ds_args def get_deepspeed_main_args(self): - args_list = list() if self.autotuning_run is not None: @@ -805,7 +858,6 @@ def calculate_batch_parameters( @staticmethod def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc): - assert ( train_batch > 0 ), f"Train batch size: {train_batch} has to be greater than 0" @@ -868,7 +920,8 @@ def calculate_derived(self): dp_world_size = (global_num_gpus / pp_size) / mp_size if not (dp_world_size % 1 == 0): error_message = ( - self.__class__.__name__ + f"{ERROR}" + + self.__class__.__name__ + ".calculate_derived() " + f"(global_num_gpus / pp_size) / mp_size [({global_num_gpus} / {pp_size}) / {mp_size}] must be a whole number" ) @@ -904,38 +957,21 @@ def calculate_derived(self): } ) - # derive steps where checkpoint should be saved - if self.checkpoint_factor or self.extra_save_iters: - if self.extra_save_iters: - save_iters = set(self.extra_save_iters) - else: - save_iters = set() - - step = self.checkpoint_factor # don't save step 0 or 1 - while step < self.train_iters: - save_iters.add(step) - if self.checkpoint_scale == "log": - step *= self.checkpoint_factor - elif self.checkpoint_scale == "linear": - step += self.checkpoint_factor - - save_iters = list(save_iters) - save_iters.sort() - - self.update_values( - { - "save_iters": save_iters, - } - ) - # derive precision - fp16_conflict = "DeepSpeed fp16 field was set but precision conflicts" if self.fp16 and self.fp16.get("enabled", False): if self.precision is None: self.update_value("precision", "fp16") else: + fp16_conflict = "DeepSpeed fp16 field was set but precision conflicts" assert self.precision == "fp16", fp16_conflict + if self.bf16 and self.bf16.get("enabled", False): + if self.precision is None: + self.update_value("precision", "bfloat16") + else: + bf16_conflict = "DeepSpeed bf16 field was set but precision conflicts" + assert self.precision == "bfloat16", bf16_conflict + if self.precision == "fp16": if isinstance(self.fp16, dict) and len(self.fp16) > 0: fp16_args = copy.deepcopy(self.fp16) @@ -944,14 +980,15 @@ def calculate_derived(self): fp16_args = {"type": "fp16", "enabled": True} self.update_value("fp16", fp16_args) elif self.precision == "bfloat16": - bf_config = {"bf16": {"enabled": True}} - # dt_config = 
{"grad_accum_dtype": "fp32"} - if self.deepspeed_extra_args is None: - self.update_value("deepspeed_extra_args", bf_config) - else: - extra_args = copy.deepcopy(self.deepspeed_extra_args) - extra_args.update(bf_config) - self.update_value("deepspeed_extra_args", extra_args) + if not self.bf16: + bf_config = {"bf16": {"enabled": True}} + # dt_config = {"grad_accum_dtype": "fp32"} + if self.deepspeed_extra_args is None: + self.update_value("deepspeed_extra_args", bf_config) + else: + extra_args = copy.deepcopy(self.deepspeed_extra_args) + extra_args.update(bf_config) + self.update_value("deepspeed_extra_args", extra_args) zero_stage = self.zero_optimization["stage"] if self.data_types is None: @@ -1017,6 +1054,10 @@ def calculate_derived(self): ) if self.optimizer_type.lower() == "onebitadam": + assert ( + self.train_iters is not None + ), "OneBitAdam requires train_iters to be specified" + # onebitadam needs to instantiated by deepspeed, and so we need to pass deepspeed scheduler args # for all other optimizers, the scheduling is handled by megatron self.scheduler = { @@ -1037,6 +1078,17 @@ def calculate_derived(self): # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1) + # Do MoE checks + if self.moe_num_experts > 1: + assert not ( + self.is_pipe_parallel or self.pipe_parallel_size > 1 + ), "MoE not supported with pipeline parallelism" + assert self.zero_optimization["stage"] != 3, "MoE not compatible with zero3" + + assert ( + self.sequence_parallel is False + ), "MoE not compatible with Sequence Parallel" + # Attention config if self.attention_config is None: self.update_value("attention_config", [[["global"], self.num_layers]]) @@ -1111,15 +1163,19 @@ def calculate_derived(self): # Adding equal dataset weights if none are provided if self.train_data_paths and (self.train_data_weights is None): self.train_data_weights = [1.0] * len(self.train_data_paths) + elif self.pos_train_data_paths and (self.train_data_weights is None): + self.train_data_weights = [1.0] * len(self.pos_train_data_paths) if self.valid_data_paths and (self.valid_data_weights is None): self.valid_data_weights = [1.0] * len(self.valid_data_paths) + elif self.pos_valid_data_paths and (self.valid_data_weights is None): + self.valid_data_weights = [1.0] * len(self.pos_valid_data_paths) if self.test_data_paths and (self.test_data_weights is None): self.test_data_weights = [1.0] * len(self.test_data_paths) + elif self.pos_test_data_paths and (self.test_data_weights is None): + self.test_data_weights = [1.0] * len(self.pos_test_data_paths) - if self.label_data_paths: - err_str = ( - "Must use `label_data_paths` with `train_data_paths`, not `data_path`" - ) + if self.train_label_data_paths: + err_str = "Must use `train_label_data_paths` with `train_data_paths`, not `data_path`" assert self.train_data_paths and not self.data_path, err_str # if a sample input file is provided, default text_gen_type type to input-file @@ -1159,7 +1215,9 @@ def validate_values(self): # learning rate if self.lr is None: - error_message = self.__class__.__name__ + ".validate_values() lr is None" + error_message = ( + f"{FAIL} " + self.__class__.__name__ + ".validate_values() lr is None" + ) logging.error(error_message) raise ValueError(error_message) return False @@ -1174,7 +1232,8 @@ def validate_values(self): for req_arg in required_args: if getattr(self, req_arg) is None: error_message = ( - self.__class__.__name__ + f"{FAIL}" + + 
self.__class__.__name__ + ".validate_values() " + req_arg + " is None." @@ -1184,9 +1243,12 @@ def validate_values(self): return False # Checks. - if self.hidden_size % self.num_attention_heads != 0: + if self.hidden_size % self.num_attention_heads != 0 and not ( + "mamba" in self.attention_config + ): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() hidden_size must be divisible by num_attention_heads" ) logging.error(error_message) @@ -1196,7 +1258,8 @@ def validate_values(self): if self.seq_length is not None: if not (self.max_position_embeddings >= self.seq_length): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() max_position_embeddings must be bigger or equal seq_length" ) logging.error(error_message) @@ -1205,7 +1268,8 @@ def validate_values(self): if not (self.min_lr <= self.lr): error_message = ( - self.__class__.__name__ + "{FAIL}" + + self.__class__.__name__ + ".validate_values() min_lr must be smaller or equal lr" ) logging.error(error_message) @@ -1218,7 +1282,8 @@ def validate_values(self): and self.extra_save_iters is None ): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() checkpoint_factor or extra_save_iters must be defined if save is defined" ) logging.error(error_message) @@ -1227,10 +1292,10 @@ def validate_values(self): # Parameters sharing does not work with torch DDP. if (self.num_unique_layers is not None) and (self.num_layers is not None): - if not (self.num_unique_layers <= self.num_layers): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() num-unique-layers must be smaller or equal num_layers" ) logging.error(error_message) @@ -1239,7 +1304,8 @@ def validate_values(self): if not (self.num_layers % self.num_unique_layers == 0): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() num-layers should be divisible by num-unique-layers" ) logging.error(error_message) @@ -1248,7 +1314,8 @@ def validate_values(self): if self.fp16_lm_cross_entropy and self.precision != "fp16": error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() lm cross entropy in fp16 only support in fp16 mode." 
) logging.error(error_message) @@ -1266,13 +1333,13 @@ def validate_values(self): ] if all(has_separate_path): assert self.data_path is None, ( - "Please provide *either* `data_path` or `train/valid/test_data_path` " + f"{FAIL} Please provide *either* `data_path` or `train/valid/test_data_path` " "in args " ) # assert that if one of train/test/valid_data_path are provided, all should be assert_error_mess = ( - "One or more of train/valid/test data_path are not provided:\n\t" + f"{FAIL} One or more of train/valid/test data_path are not provided:\n\t" ) assert_error_mess += "\n\t".join( [ @@ -1328,7 +1395,8 @@ def validate_types(self): if actual_value.lower() in lowercase_accepted_values: continue logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_value}' Not in accepted values: '{accepted_values}'" ) @@ -1339,14 +1407,16 @@ def validate_types(self): continue else: logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_type}' not in {accepted_types}" ) return False logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_type}' instead of '{field_def.type}'" ) @@ -1368,7 +1438,8 @@ def validate_types(self): return False else: logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'type'" ) @@ -1376,14 +1447,16 @@ def validate_types(self): if "params" in value: if not isinstance(value["params"], dict): logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: key 'params' must be a dict" ) return False else: logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'params'" ) @@ -1394,7 +1467,8 @@ def validate_types(self): if isinstance(value, dict): if not "enabled" in value: error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'enabled'" ) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 3083b7282..a464b133c 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -21,9 +21,9 @@ from template import NeoXArgsTemplate try: - from typing import List, Literal, Union + from typing import List, Literal, Union, Optional, Any except ImportError: - from typing_extensions import List, Literal, Union + from typing_extensions import List, Literal, Union, Optional ATTENTION_TYPE_CHOICES = [ @@ -46,7 +46,7 @@ def get_git_commit_hash(): try: git_hash = subprocess.check_output(["git", "describe", "--always"]).strip() git_hash = git_hash.decode() - except subprocess.CalledProcessError: + except (subprocess.CalledProcessError, FileNotFoundError): git_hash = None return git_hash @@ -85,6 +85,13 @@ class NeoXArgsParallelism(NeoXArgsTemplate): according to pipeline parallel size. """ + sequence_parallel: bool = False + """ + flag to determine whether Megatron-style Sequence Parallelism (https://arxiv.org/abs/2205.05198) + (Layernorm inputs and activations are sharded across model parallel group) will be used. Has no effect when model_parallel_size is 1. 
+ **Set by user, in contrast to neox_args.is_pipe_parallel.** + """ + @dataclass class NeoXArgsModel(NeoXArgsTemplate): @@ -109,9 +116,17 @@ class NeoXArgsModel(NeoXArgsTemplate): intermediate_size: int = None """ - Transformer intermediate size. Currently only used for "mlp_type": "llama". + Transformer intermediate size. Default = 4h + """ - If not passed, will be set to a reasonable default. + mlp_multiple_of: int = 1 + """ + force mlp size to be a multiple of this value + """ + + expansion_factor: float = None + """ + Transformer intermediate size. Default = 4 """ num_attention_heads: int = None @@ -147,9 +162,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Maximum number of position embeddings to use. This is the size of position embedding. """ - norm: Literal["layernorm", "rmsnorm", "scalenorm"] = "layernorm" + norm: Literal[ + "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm" + ] = "layernorm" """ - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm". """ layernorm_fusion: bool = False @@ -157,6 +174,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Use fused layer norm kernel (if `norm` is `layernorm`). """ + rmsnorm_fusion: bool = False + """ + Use fused RMS norm kernel (if `norm` is `rmsnorm`). + """ + use_qk_layernorm: bool = False """ Use QK Normalization @@ -266,10 +288,25 @@ class NeoXArgsModel(NeoXArgsTemplate): """ activation: Literal[ - "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu" + "gelu", + "geglu", + "relu", + "softsign", + "swish", + "mish", + "silu", + "reglu", + "swiglu", + "bilinear", + "glu", ] = "gelu" """ - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "reglu", "swiglu", "bilinear", "glu"] + """ + + use_flashattn_swiglu: bool = False + """ + Use flash attention's version of swiglu """ scaled_upper_triang_masked_softmax_fusion: bool = False @@ -406,12 +443,9 @@ class NeoXArgsModel(NeoXArgsTemplate): """ If false, attn_linear (e.g. QKVO) will not have bias terms """ - - mlp_type: str = "regular" + use_bias_in_mlp: bool = True """ - Types: - regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) + If false, mlps will not have bias terms """ soft_prompt_tuning: dict = None @@ -463,6 +497,21 @@ class NeoXArgsModel(NeoXArgsTemplate): Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column) """ + dim_att: int = None + """ + Total dimension of the attention mechanism for RWKV. If not set, defaults to hidden_size. + """ + + head_size: int = None + """ + Size of each attention head for RWKV. Calculated as dim_att // num_attention_heads. + """ + + ffn_dim: int = None + """ + Dimension of the feed-forward network for RWKV. If not set, calculated based on hidden_size and expansion_factor. + """ + @dataclass class NeoXArgsOptimizer(NeoXArgsTemplate): @@ -534,7 +583,13 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate): lr_decay_iters: int = None """ - Number of iterations to decay learning rate over, If None defaults to --train-iters + Number of iterations to decay learning rate over, If None defaults to + --train-iters or the equivalent inferred valued from train_epochs. 
+ """ + + lr_decay_fraction: float = None + """ + Effective fraction of training over which to decay lr, overrides lr_decay_iters, useful when specifying train_epochs """ min_lr: float = 0.0 @@ -600,6 +655,39 @@ class NeoXArgsLogging(NeoXArgsTemplate): Write TensorBoard logs to this directory. """ + use_comet: bool = None + """Flag indicating if comet is to be used.""" + + comet_workspace: Optional[str] = None + """ + Comet workspace name, if not configured Comet Experiments will be created in the user configured default workspace. + """ + + comet_project: Optional[str] = None + """ + Comet project name, if not configured Comet Experiments will be created in the Uncategorized Experiments project. + """ + + comet_experiment_name: Optional[str] = None + """ + Custom name for the Comet experiment. If not provided, a random name is used. + """ + + comet_tags: Optional[list] = None + """ + List of tags to attach to the created Comet Experiment. + """ + + comet_others: Optional[dict] = None + """ + Custom metadata to attach to the created Comet Experiment. + """ + + comet_experiment: Any = None + """ + Initialized comet experiment object used to log data + """ + log_interval: int = 100 """ Interval between logging. @@ -654,8 +742,8 @@ class NeoXArgsLogging(NeoXArgsTemplate): profile: bool = False """ - Enable nsys profiling. When using this option, - nsys options should be specified in commandline. + Enable nsys and pytorch profiling. When using this option with nsys, + nsys options should be directly specified in commandline. An example nsys commandline is ``` nsys profile -s none -t nvtx,cuda -o @@ -780,11 +868,6 @@ class NeoXArgsOther(NeoXArgsTemplate): Set during training """ - save_iters: list = None - """ - Set during training - """ - global_num_gpus: int = None """ Set during launching @@ -843,9 +926,14 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to train datasets. """ - label_data_paths: list = None + train_label_data_paths: list = None + """ + List of paths to train label datasets (not shifted by 1 yet!). + """ + + train_reward_data_paths: list = None """ - List of paths to label datasets (not shifted by 1 yet!). + List of paths to train reward datasets """ test_data_paths: list = None @@ -853,11 +941,67 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to test datasets. """ + test_label_data_paths: list = None + """ + List of paths to test label datasets (not shifted by 1 yet!). + """ + + test_reward_data_paths: list = None + """ + List of paths to test reward datasets + """ + valid_data_paths: list = None """ List of paths to validation datasets. """ + valid_label_data_paths: list = None + """ + List of paths to validation label datasets (not shifted by 1 yet!). + """ + + valid_reward_data_paths: list = None + """ + List of paths to validation reward datasets + """ + + pos_train_data_paths: list = None + neg_train_data_paths: list = None + """ + List of paths to positive and negative training datasets. + """ + + pos_train_label_data_paths: list = None + neg_train_label_data_paths: list = None + """ + List of paths to positive and negative training label datasets (not shifted by 1 yet!). + """ + + pos_valid_data_paths: list = None + neg_valid_data_paths: list = None + """ + List of paths to positive and negative validation datasets. + """ + + pos_valid_label_data_paths: list = None + neg_valid_label_data_paths: list = None + """ + List of paths to positive and negative validation label datasets (not shifted by 1 yet!). 
+ """ + + pos_test_data_paths: list = None + neg_test_data_paths: list = None + """ + List of paths to positive and negative test datasets. + """ + + pos_test_label_data_paths: list = None + neg_test_label_data_paths: list = None + """ + List of paths to positive and negative test label datasets (not shifted by 1 yet!). + """ + train_data_weights: list = None """ List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. @@ -907,6 +1051,73 @@ class NeoXArgsTraining(NeoXArgsTemplate): Implementation of indexed datasets, can be one of "infer", "cached", or "mmap" """ + pack_impl: Literal["packed", "pack_until_overflow", "unpacked"] = "packed" + """ + Packing implementation, can be one of "packed", "pack_until_overflow", or "unpacked". + + warning: pack_until_overflow is very naive and will likely have issues with pretraining scale datasets + """ + + dataset_impl: Literal["gpt2", "pairwise"] = "gpt2" + """ + Dataset implementation, can be one of "gpt2" or "pairwise" + """ + + train_impl: Literal["normal", "dpo", "rm", "kto"] = "normal" + """ + Training implementation, can be one of "normal", "dpo", "kto", or "rm" + """ + + dpo_fp32: bool = True + """ + Whether to cast logits to fp32 for DPO loss calculation. + """ + + dpo_reference_free: bool = False + """ + Whether to use reference-free DPO. + """ + + dpo_beta: float = 0.1 + """ + Beta value for DPO + """ + + kto_fp32: bool = True + """ + Whether to cast logits to fp32 for KTO loss calculation. + """ + + kto_desirable_weight: float = 1.0 + """ + Weight for desirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + """ + + kto_undesirable_weight: float = 1.0 + """ + Weight for undesirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + """ + + z_loss: float = 0.0 + """ + Z-loss parameter, only implemented for RM training currently. + https://arxiv.org/pdf/2204.02311 + https://arxiv.org/pdf/2309.10305 + """ + + kto_beta: float = 0.1 + """ + Beta value for KTO + """ + + allow_chopped: bool = True + """ + WARNING: if your packing impl is packed, this is ignored. + + Allow chopped samples in the dataset. + (e.g if your sequence length is 1024 and you have a sample of length 1026, it will be chopped to 1024) + """ + mmap_warmup: bool = False """ Warm up mmap files. @@ -948,7 +1159,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): while "log" implies that the number of steps between each checkpoint will be multiplied by `checkpoint-factor` at each step, starting from step 1. """ - checkpoint_factor: int = None + checkpoint_factor: Union[int, float] = None """ Acts as a multiplier on either the "log" or "linear" checkpoint spacing. @@ -1002,6 +1213,12 @@ class NeoXArgsTraining(NeoXArgsTemplate): Number of iterations to run for training. """ + train_epochs: int = None + """ + Number of epochs to run for training. Do not specify both train_epochs and train_iters. + Not currently compatible with data reweighing, pairwise datasets, and packing other than 'packed' + """ + eval_iters: int = 100 """ Number of iterations to run for evaluation validation/test for. @@ -1195,7 +1412,12 @@ class NeoXArgsTextgen(NeoXArgsTemplate): text_gen_type: str = None """ How to generate text/sample the model. 
- Options: `unconditional`, `input-file`, `interactive` + Options: `unconditional`, `input-file`, `interactive`, `precompute` + """ + + precompute_model_name: str = None + """ + Model name to use for saving precomputed logprobs """ temperature: float = 0.0 diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 7b7a390ab..293cbaabc 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -19,16 +19,20 @@ import copy import json +import math import os import time from typing import List, Union +import numpy as np import torch import torch.nn.functional as F from megatron import print_rank_0 from megatron import mpu from megatron.utils import get_ltor_masks_and_position_ids, is_mp_rank_0 +from megatron.data.indexed_dataset import make_builder, make_dataset +from megatron.mpu.mappings import gather_from_model_parallel_region def get_batch(neox_args, context_tokens: torch.Tensor): @@ -52,7 +56,9 @@ def get_batch(neox_args, context_tokens: torch.Tensor): return tokens, attention_mask, position_ids -def pad_batch(context_tokens: List[List[int]], pad_id: int, pad_len: int): +def pad_batch( + context_tokens: List[List[int]], pad_id: int, pad_len: int, truncate: bool = False +): """ pads context lengths in context_tokens with pad_id to equal neox_args.seq_length, and returns the padded batch and the new lengths. @@ -60,17 +66,21 @@ def pad_batch(context_tokens: List[List[int]], pad_id: int, pad_len: int): context_tokens: list of lists of tokens pad_id: int, integer to use as padding token pad_len: int, context length to be padded; all batch items will be padded to the same length + truncate: bool, if True, truncate context tokens to pad_len if they are longer than pad_len returns: tuple of padded context tokens and a list of unpadded token count """ context_lengths = [] - for tokens in context_tokens: + for i, tokens in enumerate(context_tokens): context_length = len(tokens) if context_length < pad_len: tokens.extend([pad_id] * (pad_len - context_length)) elif context_length > pad_len: - raise ValueError("context_length is bigger than to be padded length") + if not truncate: + raise ValueError("context_length is bigger than to be padded length") + context_tokens[i] = tokens[:pad_len] + context_length = pad_len context_lengths.append(context_length) return context_tokens, context_lengths @@ -82,6 +92,8 @@ def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")): This function has been mostly taken from huggingface conversational ai code at https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313 + When both top_k and top_p are specified, tokens are first filtered according to top_k, renormalized, and then filtered according to top_p. + logits: torch.Tensor -> logits of megatron model. top_k: integer -> integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. top_p: float -> Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. 
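For readers skimming the diff, the top-k/top-p ordering documented in the `filter_logits` hunk above can be summarized with a small standalone sketch. This is an illustration only, not the repository's `filter_logits` implementation; the function name and the `[batch, vocab]` logits shape are assumptions.

```python
import torch
import torch.nn.functional as F

def filter_topk_then_topp(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
    """Illustrative sketch: apply top-k first, then nucleus (top-p) filtering."""
    if top_k > 0:
        # Mask everything below the k-th largest logit.
        kth_value = torch.topk(logits, top_k, dim=-1).values[..., -1, None]
        logits = logits.masked_fill(logits < kth_value, filter_value)
    if top_p > 0.0:
        # Softmax over the already-filtered logits implicitly renormalizes the
        # surviving tokens before the cumulative-probability cutoff is applied.
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        to_remove = cum_probs > top_p
        # Shift right so the first token that crosses the threshold is kept.
        to_remove[..., 1:] = to_remove[..., :-1].clone()
        to_remove[..., 0] = False
        mask = torch.zeros_like(to_remove).scatter(-1, sorted_idx, to_remove)
        logits = logits.masked_fill(mask, filter_value)
    return logits
```

With `top_k=0` only the nucleus cut applies, and with `top_p=0.0` only the top-k cut applies, matching the behavior described in the docstring.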
@@ -807,3 +819,182 @@ def generate_samples_interactive( print_rank_0("Generated Text: " + generated_text) if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: _ = input("\n") + + +def get_logp(logits, labels, force_fp32=False): + if force_fp32: + logits = logits.float() + logp = logits.log_softmax(dim=-1) + return torch.gather(logp, dim=2, index=labels.unsqueeze(2)).squeeze(2) + + +def precompute_logits(neox_args, model): + """ + Precomputes logprobs from training/testing/validation datasets + + Saves it to the same directory as the dataset with the model name appended to it + + neox_args: NeoXArgs. + model: a Megatron model + + """ + if neox_args.precompute_model_name is None: + mdl_name = str(hash(neox_args.load)) + else: + mdl_name = neox_args.precompute_model_name + print_rank_0("Precomputing logprobs...") + model.eval() + data_paths = list() + if neox_args.train_data_paths is not None: + for path in neox_args.train_data_paths: + data_paths.append(path) + for path in neox_args.test_data_paths: + data_paths.append(path) + for path in neox_args.valid_data_paths: + data_paths.append(path) + elif neox_args.pos_train_data_paths is not None: + # Pairwise data... + for path in neox_args.pos_train_data_paths: + data_paths.append(path) + for path in neox_args.neg_train_data_paths: + data_paths.append(path) + for path in neox_args.pos_valid_data_paths: + data_paths.append(path) + for path in neox_args.neg_valid_data_paths: + data_paths.append(path) + for path in neox_args.pos_test_data_paths: + data_paths.append(path) + for path in neox_args.neg_test_data_paths: + data_paths.append(path) + for path in data_paths: + print_rank_0(f"Precomputing logits for {path}") + # Add hash to path... + out_path = path + f"_{mdl_name}" + if os.path.exists(out_path + ".idx"): + continue + dataset = make_dataset(path, neox_args.data_impl, not neox_args.mmap_warmup) + if is_mp_rank_0(): + out_dataset = make_builder(out_path + ".bin", neox_args.data_impl) + out_dataset._dtype = np.float32 + i = 0 + + # TODO: Not sure why this requires a multiple of 8? Investigate later. 
+ while i < int(math.ceil(len(dataset) / 8.0) * 8): + start = time.time() + model.module.clear_cache() # clear kv cache between batches + if is_mp_rank_0(): + offset = ( + mpu.get_data_parallel_rank() + * neox_args.train_micro_batch_size_per_gpu + ) + context_tokens = [ + [int(x) for x in dataset.get(j % len(dataset)).tolist()] + for j in range( + i + offset, + i + (neox_args.train_micro_batch_size_per_gpu + offset), + ) + ] + # grab microbatch + # pad batch in order to allow conversion to tensor + context_tokens, context_lengths = pad_batch( + copy.deepcopy(context_tokens), + pad_id=0, + pad_len=neox_args.seq_length + 1, + truncate=True, + ) + # print(context_tokens) + label_tokens = [tokens[1:] for tokens in context_tokens] + context_tokens = [tokens[:-1] for tokens in context_tokens] + else: + context_tokens = [ + [0 for _ in range(neox_args.seq_length)] + for _ in range(neox_args.batch_size) + ] + label_tokens = [ + [0 for _ in range(neox_args.seq_length)] + for _ in range(neox_args.batch_size) + ] + context_lengths = [0 for _ in range(neox_args.batch_size)] + i += ( + neox_args.train_micro_batch_size_per_gpu + * mpu.get_data_parallel_world_size() + ) + # print(context_tokens) + # convert to tensor and broadcast + context_tokens = torch.cuda.LongTensor(context_tokens) + label_tokens = torch.cuda.LongTensor(label_tokens) + # Make sure context tokens + start tokens are the same across all ranks + token_generation_start_index = torch.cuda.LongTensor(context_lengths) + torch.distributed.broadcast( + context_tokens, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + torch.distributed.broadcast( + token_generation_start_index, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + torch.distributed.broadcast( + label_tokens, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + # context_tokens = context_tokens[:, :chop_len].contiguous() + # label_tokens = label_tokens[:, :chop_len].contiguous() + with torch.no_grad(): + # get attention mask / position ids + context_tokens, attention_mask, position_ids = get_batch( + neox_args, context_tokens + ) + model_inputs = ( + context_tokens, + position_ids, + attention_mask, + ) + maybe_tuple = forward_model( + model, model_inputs, neox_args.is_pipe_parallel + ) + if isinstance(maybe_tuple, tuple): + logits, _ = maybe_tuple + else: + logits = maybe_tuple + if logits is not None: # if pipe parallel, not all ranks return logits + logits = gather_from_model_parallel_region(logits) + logp = get_logp(logits, label_tokens, True).squeeze() + if neox_args.is_pipe_parallel: + # broadcast generated tokens to pipe parallel group + src_rank = model.grid.stage_to_global(model.num_stages - 1) + logp = ( + logp + if logits is not None + else torch.zeros( + neox_args.batch_size, dtype=torch.float32 + ).cuda() + ) + torch.distributed.broadcast( + tensor=logp, + src=src_rank, + group=mpu.get_pipe_parallel_group(), + ) + logp = logp.squeeze() + logp_list = [ + torch.zeros_like(logp) + for _ in range(mpu.get_data_parallel_world_size()) + ] + torch.distributed.all_gather( + logp_list, logp, group=mpu.get_data_parallel_group() + ) + logp = torch.cat(logp_list, dim=0).cpu().numpy() + if (mpu.get_model_parallel_rank() == 0) and ( + mpu.get_data_parallel_rank() == 0 + ): + for j in range(logp.shape[0]): + out_dataset.add_item(logp[j]) + out_dataset.end_document() + print_rank_0(f"Processed {i} / {len(dataset)} in {time.time() - start}") + if is_mp_rank_0(): + out_dataset.finalize( + 
out_path + ".idx", + ) + torch.distributed.barrier() diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 348c7cefe..d39e18243 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -32,6 +32,10 @@ def build_tokenizer(args): if args.rank == 0: print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True) + assert ( + args.tokenizer_type is not None + ), "tokenizer_type must be specified in the .yml config" + # Select and instantiate the tokenizer. if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower(): assert args.vocab_file is not None diff --git a/megatron/training.py b/megatron/training.py index 6a67d36f8..17dcce2c7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -21,12 +21,14 @@ """Pretrain utilities.""" from datetime import datetime from functools import partial +from collections import defaultdict import math import sys from contextlib import nullcontext import torch +import torch.nn.functional as F import deepspeed from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler import numpy as np @@ -43,9 +45,14 @@ GPT2ModelPipe, SoftEmbedding, get_params_for_weight_decay_optimization, + mark_norms_for_sequence_parallel_grad_sync, ) +from megatron.mpu.mappings import gather_from_model_parallel_region from megatron.checkpointing import load_checkpoint, save_checkpoint -from megatron.data.data_utils import build_train_valid_test_data_iterators +from megatron.data.data_utils import ( + build_train_valid_test_data_loaders, + shift_and_wrap_data_loaders, +) from megatron.initialize import initialize_megatron from megatron.learning_rates import AnnealingLR from megatron.logging import tb_wandb_log, training_log @@ -56,6 +63,7 @@ CharCounter, ) from megatron.model.gpt2_model import cross_entropy +from megatron.mpu import vocab_parallel_cross_entropy from pickle import dump import os @@ -82,7 +90,7 @@ def save_base_shapes(neox_args, base_shapes, use_cache): base_model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, - parallel_output=True, + parallel_output=True if neox_args.train_impl != "rm" else False, topology=mpu.get_topology(), use_cache=use_cache, ) @@ -106,7 +114,7 @@ def save_base_shapes(neox_args, base_shapes, use_cache): delta_model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, - parallel_output=True, + parallel_output=True if neox_args.train_impl != "rm" else False, topology=mpu.get_topology(), use_cache=use_cache, ) @@ -136,7 +144,7 @@ def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = hidden_size - model, optimizer, _ = setup_model_and_optimizer( + model, optimizer, _, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) @@ -168,14 +176,54 @@ def gen(): sys.exit(1) +def update_iterations(neox_args, data_loaders): + """ + Compute the number of train iterations if not specified and num_epochs, updates the neox_args object. + Note that if len(train_dataloader) % gradient_accumulation_steps != 0, this will configure neox + to do as many iterations as possible while ensuring that each example is seen *at most* train_epochs + times. 
+ """ + if (not neox_args.do_train) or (neox_args.train_iters is not None): + pass + elif neox_args.train_iters is None and neox_args.train_epochs is None: + print_rank_0( + "ERROR:Failed to specify either train_epochs or train_iters in config file" + ) + else: + global_rank = torch.distributed.get_rank() + + if global_rank == 0: + train_dataloader = data_loaders["train"] + train_epochs = neox_args.train_epochs + gradient_accumulation_steps = neox_args.gradient_accumulation_steps + + train_dataloader_len = len(train_dataloader) + train_iterations = ( + train_dataloader_len * train_epochs + ) // gradient_accumulation_steps + + train_iters_tensor = torch.cuda.LongTensor([train_iterations]) + else: + train_iters_tensor = torch.cuda.LongTensor([0]) + + torch.distributed.broadcast(train_iters_tensor, src=0) + + neox_args.train_iters = train_iters_tensor[0].item() + + print_rank_0( + f"Training for a total of {neox_args.train_iters} iterations, corresponding to {neox_args.train_epochs} epochs." + ) + + def pretrain(neox_args): """Main training program. This function will run the following in the order provided: 1) initialize Megatron. - 2) setup model, optimizer and lr schedule - 3) call train_val_test_data_provider to get train/val/test datasets. - 4) train the model. + 2) get train/val/test datasets. + 3) setup model, optimizer and lr schedule. + 4) configure data loading + 5) train the model. Arguments: neox_args: an instance of NeoXArgs containing the configuration for pretrain @@ -184,26 +232,34 @@ def pretrain(neox_args): # setup logging and timers init_wandb(neox_args=neox_args) timers = Timers( - use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) + # Create data loaders + timers("train/valid/test data loaders").start() + data_loaders = build_train_valid_test_data_loaders(neox_args=neox_args) + update_iterations(neox_args=neox_args, data_loaders=data_loaders) + timers("train/valid/test data loaders").stop() + # Model, optimizer, and learning rate. timers("model and optimizer").start() - model, optimizer, lr_scheduler = setup_model_and_optimizer( + model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer( neox_args=neox_args, use_cache=False, iteration=neox_args.iteration ) timers("model and optimizer").stop() - # Data stuff. + # Make and configure iterators timers("train/valid/test data iterators").start() ( train_data_iterator, valid_data_iterator, test_data_iterator, - ) = build_train_valid_test_data_iterators(neox_args=neox_args) + ) = shift_and_wrap_data_loaders(neox_args=neox_args, data_loaders=data_loaders) timers("train/valid/test data iterators").stop() if neox_args.use_mup and neox_args.coord_check: @@ -211,12 +267,23 @@ def pretrain(neox_args): # Print setup timing. 
print_rank_0("done with setups ...") - timers.log(["model and optimizer", "train/valid/test data iterators"]) + timers.log( + [ + "train/valid/test data loaders", + "model and optimizer", + "train/valid/test data iterators", + ] + ) print_rank_0("training ...") iteration = neox_args.iteration # edge case: save step 0 checkpoint if requested and we're starting from step 0 - if neox_args.save and 0 in neox_args.save_iters and iteration == 0: + if ( + neox_args.save + and neox_args.extra_save_iters + and 0 in neox_args.extra_save_iters + and iteration == 0 + ): save_checkpoint( neox_args=neox_args, iteration=iteration, @@ -230,6 +297,7 @@ def pretrain(neox_args): neox_args=neox_args, timers=timers, model=model, + reference_model=reference_model, optimizer=optimizer, lr_scheduler=lr_scheduler, train_data_iterator=train_data_iterator, @@ -247,6 +315,7 @@ def pretrain(neox_args): iteration=iteration, verbose=False, timers=timers, + reference_model=reference_model, ) if neox_args.save and iteration != 0: @@ -271,23 +340,29 @@ def pretrain(neox_args): verbose=True, timers=timers, chart_name="test", + reference_model=reference_model, ) -def _get_batch(neox_args, tokenizer, keys, data, datatype): +def _get_batch(neox_args, tokenizer, keys, data, datatype, label_mask_zero=False): """Support function for get_batch / get_batch pipe (to avoid code repetition)""" data_b = mpu.broadcast_data(keys, data, datatype) - + token_key = keys[0] + label_key = keys[1] if len(keys) > 1 else None # Unpack. - tokens_ = data_b["text"].long() - if "label" in data_b: + tokens_ = data_b[token_key].long() + if label_key in data_b: + label_mask = (data_b[label_key].long() >= 0)[:, 1:].contiguous() labels = torch.where( - data_b["label"].long() >= 0, - data_b["label"].long(), - torch.zeros_like(data_b["label"].long()), + data_b[label_key].long() >= 0, + data_b[label_key].long(), + torch.zeros_like(data_b[label_key].long()), )[:, 1:].contiguous() else: + label_mask = (tokens_.long() >= 0)[:, 1:].contiguous() labels = tokens_[:, 1:].contiguous() + if label_mask_zero: + labels = labels * label_mask tokens = tokens_[:, :-1].contiguous() # Get the masks and position ids. @@ -297,9 +372,9 @@ def _get_batch(neox_args, tokenizer, keys, data, datatype): eod_mask_loss=neox_args.eod_mask_loss, sliding_window_width=neox_args.sliding_window_width, ) - # If `label` is present, any token < 0 (e.g., -100, the default for torch) skips the loss computation - if "label" in data_b: - loss_mask = (data_b["label"][:, 1:] >= 0).to(loss_mask.dtype) + + # combine loss masks from get_ltor_masks_and_position_ids with loss masks from data + loss_mask = label_mask.to(loss_mask.dtype) * loss_mask return tokens, labels, loss_mask, attention_mask, position_ids @@ -307,7 +382,14 @@ def get_batch(neox_args, data_iterator): """Generate a batch""" # Items and their type. - keys = ["text", "label"] if neox_args.label_data_paths else ["text"] + if neox_args.train_impl in ["normal", "kto"]: + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] + elif neox_args.train_impl in ["dpo", "rm"]: + keys = ( + [["pos", "pos_label"], ["neg", "neg_label"]] + if neox_args.pos_train_label_data_paths + else [["pos"], ["neg"]] + ) datatype = torch.int64 # Broadcast data. 
@@ -315,19 +397,80 @@ def get_batch(neox_args, data_iterator): data = next(data_iterator) else: data = None - return _get_batch( - neox_args=neox_args, - tokenizer=neox_args.tokenizer, - keys=keys, - data=data, - datatype=datatype, - ) + if neox_args.train_impl == "normal": + return _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys, + data=data, + datatype=datatype, + ) + elif neox_args.train_impl == "kto": + assert ( + neox_args.train_micro_batch_size_per_gpu > 1 + ), "For KTO training, the train_micro_batch_size_per_gpu must be greater than 1." + tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys, + data=data, + datatype=datatype, + ) + # Remove the last token from the reward since we predict the next token, so + # the reward of token t will be based on the label of token t+1 + rw_data = mpu.broadcast_data(["reward"], data, torch.float)["reward"][ + :, :-1 + ].contiguous() + ref_data = ( + mpu.broadcast_data(["ref"], data, torch.float)["ref"][:, :-1].contiguous() + if neox_args.precompute_model_name + else None + ) + return tup + (rw_data, ref_data) + elif neox_args.train_impl in ["dpo", "rm"]: + pos_tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys[0], + data=data, + datatype=datatype, + label_mask_zero=True, + ) + neg_tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys[1], + data=data, + datatype=datatype, + label_mask_zero=True, + ) + if neox_args.precompute_model_name: + ref_data = mpu.broadcast_data(["pos_ref", "neg_ref"], data, torch.float) + else: + ref_data = {"pos_ref": None} + return [ + torch.cat((pos_item, neg_item), dim=0) + for pos_item, neg_item in zip(pos_tup, neg_tup) + ] + [ + torch.cat((ref_data["pos_ref"], ref_data["neg_ref"]), dim=0)[ + :, :-1 + ].contiguous() + if ref_data["pos_ref"] is not None + else None + ] def get_batch_pipe(data, neox_args, curr_scheduler=None): """A modification of get_batch() to work with the latest batch instead of an iterator.""" + + assert neox_args.train_impl not in [ + "kto", + "dpo", + "rm", + ], "Pipeline parallel is currently unsupported when using any of kto, dpo, rm. Set pipe_parallel_size to 0" + # Items and their type. - keys = ["text", "label"] if neox_args.label_data_paths else ["text"] + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] datatype = torch.int64 tokens, labels, loss_mask, attention_mask, position_ids = _get_batch( @@ -366,20 +509,41 @@ def get_batch_sequential(forward_input, neox_args): def forward_step( - data_iterator, model, neox_args, timers, return_logits=False, is_train=False + data_iterator, + model, + neox_args, + timers, + return_logits=False, + is_train=False, + reference_model=None, ): """Forward step.""" if neox_args.is_pipe_parallel: return model.eval_batch(data_iterator, return_logits=return_logits) # Get the batch.
- if neox_args.memory_profiling and neox_args.it: + if neox_args.memory_profiling and neox_args.iteration: torch.cuda.nvtx.range_push(f"Get batch") if timers is not None: timers("batch generator").start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - neox_args=neox_args, data_iterator=data_iterator - ) + if neox_args.train_impl == "normal": + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + neox_args=neox_args, data_iterator=data_iterator + ) + elif neox_args.train_impl == "kto": + ( + tokens, + labels, + loss_mask, + attention_mask, + position_ids, + rewards, + ref_logp, + ) = get_batch(neox_args=neox_args, data_iterator=data_iterator) + if neox_args.train_impl in ["dpo", "rm"]: + tokens, labels, loss_mask, attention_mask, position_ids, ref_logp = get_batch( + neox_args=neox_args, data_iterator=data_iterator + ) if timers is not None: timers("batch generator").stop() @@ -388,22 +552,220 @@ def forward_step( if neox_args.memory_profiling: torch.cuda.nvtx.range_push(f"Forward pass") - outputs = model((tokens, position_ids, attention_mask), neox_args=neox_args) - if ( - is_train - and neox_args.curriculum_learning - and neox_args.curriculum_seqlen < neox_args.seq_length - ): - loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous() - labels = labels[:, : neox_args.curriculum_seqlen].contiguous() - loss = cross_entropy( - outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy - ) + metrics = {} + if neox_args.train_impl == "normal": + outputs = model((tokens, position_ids, attention_mask), neox_args=neox_args) + if ( + is_train + and neox_args.curriculum_learning + and neox_args.curriculum_seqlen < neox_args.seq_length + ): + loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous() + labels = labels[:, : neox_args.curriculum_seqlen].contiguous() + loss = cross_entropy( + outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy + ) + elif neox_args.train_impl == "rm": + maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args) + if type(maybe_tuple) is tuple: + outputs, _ = maybe_tuple + else: + outputs = maybe_tuple + pos, neg = torch.chunk(outputs, 2, 0) + pos_loss_mask, neg_loss_mask = torch.chunk(loss_mask, 2, 0) + # We assume that each pos, neg pair occur in the same order + # e.g. second nonzero pos is the corresponding second nonzero neg + # and that there are also an equal number of pos and neg in each sequence. + pos_indx = pos_loss_mask.nonzero() + neg_indx = neg_loss_mask.nonzero() + # indx[:, 0] is the batch index, indx[:, 1] is the token index, we only care about the token index. + pos_indx = pos_indx[:, 1].unsqueeze(1) + neg_indx = neg_indx[:, 1].unsqueeze(1) + pos = torch.gather(pos.squeeze(), dim=1, index=pos_indx) + neg = torch.gather(neg.squeeze(), dim=1, index=neg_indx) + with torch.no_grad(): + metrics["pos_values"] = pos.clone().detach().mean() + metrics["neg_values"] = neg.clone().detach().mean() + metrics["margin"] = (pos - neg).clone().detach().mean() + metrics["accuracy"] = ((pos - neg) > 0).clone().detach().float().mean() + loss = (-F.logsigmoid(pos - neg).mean()) + ( + (neox_args.z_loss * (pos**2 + neg**2)).mean() + ) + elif neox_args.train_impl == "dpo": + # Based on https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90 + with torch.inference_mode(): + # So we can gather token logps... 
+ token_logp_labels = labels.clone() + pos_loss_mask, neg_loss_mask = torch.chunk(loss_mask, 2, 0) + if neox_args.dpo_reference_free: + ref_pos = 0 + ref_neg = 0 + elif ref_logp is None: + ref_maybe_tuple = reference_model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(ref_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + ref_outputs, _ = ref_maybe_tuple + else: + ref_outputs = ref_maybe_tuple + ref_pos, ref_neg = get_pos_neg_logp( + ref_outputs, token_logp_labels, neox_args.dpo_fp32 + ) + else: + ref_pos, ref_neg = torch.chunk(ref_logp, 2, 0) + ref_pos = (ref_pos * pos_loss_mask).sum(-1) + ref_neg = (ref_neg * neg_loss_mask).sum(-1) + chosen_maybe_tuple = model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(chosen_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + chosen_outputs, _ = chosen_maybe_tuple + else: + chosen_outputs = chosen_maybe_tuple + chosen_pos, chosen_neg = get_pos_neg_logp( + chosen_outputs, token_logp_labels, neox_args.dpo_fp32 + ) + chosen_pos = (chosen_pos * pos_loss_mask).sum(-1) + chosen_neg = (chosen_neg * neg_loss_mask).sum(-1) + with torch.no_grad(): + # Collect metrics... + if not neox_args.dpo_reference_free: + metrics["ref_neg"] = ref_neg.clone().detach().mean() + metrics["ref_pos"] = ref_pos.clone().detach().mean() + metrics["chosen_neg"] = chosen_neg.clone().detach().mean() + metrics["chosen_pos"] = chosen_pos.clone().detach().mean() + if not neox_args.dpo_reference_free: + chosen_rewards = neox_args.dpo_beta * ( + chosen_pos.clone().detach() - ref_pos.clone().detach() + ) + rejected_rewards = neox_args.dpo_beta * ( + chosen_neg.clone().detach() - ref_neg.clone().detach() + ) + metrics["chosen_rewards"] = chosen_rewards.mean() + metrics["rejected_rewards"] = rejected_rewards.mean() + reward_acc = (chosen_rewards > rejected_rewards).float() + metrics["reward_acc"] = reward_acc.mean() + metrics["margins"] = (chosen_rewards - rejected_rewards).mean() + pi_logrations = chosen_pos - chosen_neg + ref_logrations = ref_pos - ref_neg + logits = pi_logrations - ref_logrations + loss = -F.logsigmoid(neox_args.dpo_beta * logits).mean() + elif neox_args.train_impl == "kto": + # Based on https://github.com/huggingface/trl/blob/main/trl/trainer/kto_trainer.py + # Except we don't have an extra input for KL logp, we just split the batch in half + with torch.no_grad(): + # So we can gather token logps... + token_logp_labels = labels.clone() + token_logp_labels[token_logp_labels == -100] = 0 + if ref_logp is None: + # Did not precompute logits.... + ref_maybe_tuple = reference_model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(ref_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + ref_outputs, _ = ref_maybe_tuple + else: + ref_outputs = ref_maybe_tuple + # gather across tensor parallel group + ref_outputs = gather_from_model_parallel_region(ref_outputs) + + ref_logp = get_logp(ref_outputs, token_logp_labels, neox_args.kto_fp32) + else: + print(f"REF LOGP: {ref_logp.clone().detach().mean()}") + ref_logp = ref_logp * loss_mask + scaling = (rewards.sum(-1) > 0.001).float() * neox_args.kto_desirable_weight + scaling += ( + rewards.sum(-1) < -0.001 + ).float() * neox_args.kto_undesirable_weight + pos_mask = (rewards > 0.001).float() + neg_mask = (rewards < -0.001).float() + chosen_maybe_tuple = model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(chosen_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? 
+ chosen_outputs, _ = chosen_maybe_tuple + else: + chosen_outputs = chosen_maybe_tuple + chosen_outputs = gather_from_model_parallel_region(chosen_outputs) + chosen_logp = get_logp(chosen_outputs, token_logp_labels, neox_args.kto_fp32) + chosen_logp = chosen_logp * loss_mask + with torch.no_grad(): + # Collect metrics... + metrics["ref_logp"] = ref_logp.clone().detach().sum(-1).mean() + metrics["policy_logp"] = chosen_logp.clone().detach().sum(-1).mean() + metrics["pos_ref_logp"] = ( + (ref_logp * pos_mask).clone().detach().sum(-1).mean() + ) + metrics["neg_ref_logp"] = ( + (ref_logp * neg_mask).clone().detach().sum(-1).mean() + ) + metrics["pos_policy_logp"] = ( + (chosen_logp * pos_mask).clone().detach().sum(-1).mean() + ) + metrics["neg_policy_logp"] = ( + (chosen_logp * neg_mask).clone().detach().sum(-1).mean() + ) + metrics["kl"] = ( + chosen_logp.clone().detach() - ref_logp.clone().detach() + ).sum() / loss_mask.sum() + policy_rewards = ( + neox_args.kto_beta + * rewards + * (chosen_logp.clone().detach() - ref_logp.clone().detach()) + ) + reward_acc = (policy_rewards.sum(-1) > 0.0).float() + metrics["reward_acc"] = reward_acc.mean() + metrics["policy_rewards"] = policy_rewards.sum() + print(metrics) + pol_logp1, pol_logp2 = torch.chunk(chosen_logp, 2, 0) + ref_logp1, ref_logp2 = torch.chunk(ref_logp, 2, 0) + reward1, reward2 = torch.chunk(rewards, 2, 0) + scaling1, scaling2 = torch.chunk(scaling, 2, 0) + kl1 = torch.clamp((pol_logp1 - ref_logp1).sum(-1), min=0).mean() + kl2 = torch.clamp((pol_logp2 - ref_logp2).sum(-1), min=0).mean() + log_ratio1 = pol_logp1 - ref_logp1 + log_ratio2 = pol_logp2 - ref_logp2 + + # TODO: Add pack_until_overflow sequence support + loss = ( + 0.5 + * scaling1.mean(-1) + * ( + 1 + - F.sigmoid( + ( + neox_args.kto_beta + * reward1.mean(-1) + * (log_ratio1.sum(-1) - kl2.clone().detach()) + ) + ) + ) + ) + ( + 0.5 + * scaling2.mean(-1) + * ( + 1 + - F.sigmoid( + ( + neox_args.kto_beta + * reward2.mean(-1) + * (log_ratio2.sum(-1) - kl1.clone().detach()) + ) + ) + ) + ) + # print(loss.shape) + loss = loss.mean() + # print(loss.shape) if neox_args.memory_profiling: torch.cuda.nvtx.range_pop() if return_logits: - return loss, outputs - return loss + return loss, outputs, metrics + return loss, metrics def get_model(neox_args, use_cache=False): @@ -417,13 +779,30 @@ def get_model(neox_args, use_cache=False): old_use_mup = neox_args.use_mup neox_args.use_mup = False + if neox_args.zero_stage in [2, 3]: + if neox_args.pipe_parallel_size == 1: + print_rank_0( + "ZeRO stage 2/3 and the PipelineModule are incompatible, please set 'pipe_parallel_size' to 0 instead" + ) + exit() + if neox_args.pipe_parallel_size > 1: + print_rank_0( + "ZeRO stage 2/3 and pipeline paralleism are not supported simultaneously" + ) + exit() + if neox_args.model_parallel_size > 1: + print_rank_0( + "ZeRO stage 2/3 and model paralleism are not currently supported simultaneously" + ) + exit() + with deepspeed.zero.Init( config_dict_or_path=neox_args.deepspeed_config ) if neox_args.zero_stage == 3 else nullcontext() as gs: model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, - parallel_output=True, + parallel_output=True if neox_args.train_impl != "rm" else False, topology=mpu.get_topology(), use_cache=use_cache, ) @@ -478,9 +857,14 @@ def get_model(neox_args, use_cache=False): raise ValueError("Must be using deepspeed to run neox") -def get_optimizer(model, neox_args): +def get_optimizer(model, neox_args, dummy=False): """Set up the optimizer.""" - if neox_args.no_load_optim: + if 
neox_args.no_load_optim and neox_args.deepspeed: + # Required to have something so... + dummy = True + neox_args.optimizer = {"params": {"lr": 0.0}} + neox_args.optimizer_type = "adam" + elif neox_args.no_load_optim: return None, None if neox_args.optimizer is None: @@ -504,8 +888,13 @@ def get_optimizer(model, neox_args): _param_groups = [] for param_group in param_groups: trainable_params = [p for p in param_group["params"] if p.requires_grad] + if dummy: + trainable_params = [trainable_params[0]] # just take the first one param_group["params"] = trainable_params _param_groups.append(param_group) + if dummy: + # Only need one. + break param_groups = _param_groups # If we're using mup, then the optimizer must be adam or sgd @@ -619,7 +1008,7 @@ def get_optimizer(model, neox_args): def get_learning_rate_scheduler(optimizer, neox_args): """Build the learning rate scheduler.""" - if neox_args.no_load_optim: + if (neox_args.no_load_optim) and not neox_args.deepspeed: # TODO: this should be configured as a separate arg return None if neox_args.deepspeed and neox_args.optimizer_type.lower() == "onebitadam": @@ -632,6 +1021,8 @@ def get_learning_rate_scheduler(optimizer, neox_args): # Add linear learning rate scheduler. if neox_args.lr_decay_iters is not None: num_iters = neox_args.lr_decay_iters + elif neox_args.lr_decay_fraction is not None: + num_iters = math.floor(neox_args.train_iters * neox_args.lr_decay_fraction) else: num_iters = neox_args.train_iters num_iters = max(1, num_iters) @@ -664,19 +1055,32 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): ) """Setup model and optimizer.""" + needs_reference_model = ( + (neox_args.train_impl == "dpo") + and (neox_args.precompute_model_name is None) + and (not neox_args.dpo_reference_free) + ) or ((neox_args.train_impl == "kto") and (neox_args.precompute_model_name is None)) model = get_model(neox_args=neox_args, use_cache=use_cache) + if needs_reference_model: + reference_model = get_model(neox_args=neox_args, use_cache=use_cache) + else: + reference_model = None optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) - + if neox_args.deepspeed and needs_reference_model: + # Need an optimizer & lr_scheduler so make a very small one to keep deepspeed happy... 
+ ref_optimizer, ref_param_groups = get_optimizer( + model=reference_model, neox_args=neox_args, dummy=True + ) + ref_lr_scheduler = get_learning_rate_scheduler( + optimizer=ref_optimizer, neox_args=neox_args + ) + else: + ref_optimizer, ref_param_groups, ref_lr_scheduler = None, None, None if neox_args.deepspeed: print_rank_0("DeepSpeed is enabled.") - if neox_args.no_load_optim: - assert optimizer is None - _model_params = None - _lr_scheduler = None - else: - _model_params = param_groups if optimizer is None else None - _lr_scheduler = lr_scheduler + _model_params = param_groups if optimizer is None else None + _lr_scheduler = lr_scheduler model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, @@ -689,6 +1093,17 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): # config_params=neox_args.deepspeed_config, mpu=mpu if not neox_args.is_pipe_parallel else None, ) + if needs_reference_model: + reference_model, _, _, _ = deepspeed.initialize( + model=reference_model, + optimizer=ref_optimizer, + args=neox_args, + lr_scheduler=ref_lr_scheduler, + dist_init_required=False, + model_parameters=ref_param_groups, + mpu=mpu if not neox_args.is_pipe_parallel else None, + ) + mark_norms_for_sequence_parallel_grad_sync(model, neox_args) model.total_params = get_total_params(model.module) print_rank_0(f' > total params: {"{:,}".format(model.total_params)}') @@ -721,6 +1136,15 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): lr_scheduler=lr_scheduler, iteration=iteration, ) + if needs_reference_model: + _ = load_checkpoint( + neox_args=neox_args, + model=reference_model, + optimizer=ref_optimizer, + lr_scheduler=ref_lr_scheduler, + iteration=iteration, + ) + reference_model.eval() print_rank_0( f"Loading checkpoint and starting from iteration {neox_args.iteration}" ) @@ -732,7 +1156,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): if lr_scheduler is not None: lr_scheduler.optimizer = model.optimizer - return model, optimizer, lr_scheduler + return model, optimizer, lr_scheduler, reference_model def backward_step(neox_args, timers, optimizer, model, loss): @@ -754,7 +1178,15 @@ def backward_step(neox_args, timers, optimizer, model, loss): raise ValueError("Must be using deepspeed to run neox") -def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler): +def train_step( + neox_args, + timers, + data_iterator, + model, + optimizer, + lr_scheduler, + reference_model=None, +): """Single training step.""" # Pipeline parallelism schedules forward/backward/step @@ -762,6 +1194,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) reduced_loss = train_step_pipe( neox_args=neox_args, timers=timers, model=model, data_iterator=data_iterator ) + reduce_metrics = reduced_loss if ( neox_args.memory_profiling and neox_args.iteration >= neox_args.profile_step_start @@ -771,18 +1204,22 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) save_snapshot(neox_args) else: losses = [] + metric_dicts = defaultdict(list) for _ in range(neox_args.gradient_accumulation_steps): # Forward model for one step. 
timers("forward").start() - loss = forward_step( + loss, metric_dict = forward_step( neox_args=neox_args, timers=timers, data_iterator=data_iterator, model=model, is_train=True, + reference_model=reference_model, ) timers("forward").stop() losses.append(loss) + for key in metric_dict.keys(): + metric_dicts[key].append(metric_dict[key]) # Calculate gradients, reduce across processes, and clip. if ( neox_args.profile @@ -812,6 +1249,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) and neox_args.iteration <= neox_args.profile_step_stop ): torch.cuda.nvtx.range_push(f"Optimizer step") + timers("optimizer").start() if neox_args.deepspeed: model.step() @@ -831,17 +1269,19 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) and torch.distributed.get_rank() == 0 ): save_snapshot(neox_args) - reduced_loss = { - "lm_loss": reduce_losses(losses).mean() - } # reduces losses across machines for logging + # reduces metrics across machines for logging + reduce_metrics = { + key: reduce_losses(metric_dicts[key]).mean() for key in metric_dicts.keys() + } + reduce_metrics["lm_loss"] = reduce_losses(losses).mean() if neox_args.precision == "fp16" and model.optimizer.overflow: skipped_iter = 1 else: skipped_iter = 0 - collect_loss_for_unit_test(reduced_loss["lm_loss"]) - return reduced_loss, skipped_iter + collect_loss_for_unit_test(reduce_metrics["lm_loss"]) + return reduce_metrics, skipped_iter def train_step_pipe(neox_args, timers, model, data_iterator): @@ -863,10 +1303,34 @@ def train_step_pipe(neox_args, timers, model, data_iterator): return loss_dict +def is_save_iter(neox_args, iteration): + if neox_args.extra_save_iters and iteration in neox_args.extra_save_iters: + return True + + if neox_args.checkpoint_factor: + if neox_args.checkpoint_scale == "linear": + assert float( + neox_args.checkpoint_factor + ).is_integer(), "checkpoint_factor must be a whole number when using linear checkpoint_scale" + return iteration % neox_args.checkpoint_factor == 0 + elif neox_args.checkpoint_scale == "log": + # Check if iteration is a power of checkpoint_factor + assert neox_args.checkpoint_factor > 1 + power = 1 + while power < iteration + 1: + if int(power) == iteration: + return True + power *= neox_args.checkpoint_factor + return False + + return False + + def train( neox_args, timers, model, + reference_model, optimizer, lr_scheduler, train_data_iterator, @@ -922,6 +1386,7 @@ def train( model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, + reference_model=reference_model, ) if neox_args.profile and iteration == neox_args.profile_step_stop: torch.cuda.cudart().cudaProfilerStop() @@ -957,7 +1422,7 @@ def train( ) # Checkpointing - if neox_args.save and iteration in neox_args.save_iters: + if neox_args.save and is_save_iter(neox_args, iteration): save_checkpoint( neox_args=neox_args, iteration=iteration, @@ -981,6 +1446,7 @@ def train( iteration=iteration, verbose=False, timers=timers, + reference_model=reference_model, ) if neox_args.exit_interval and iteration % neox_args.exit_interval == 0: @@ -998,7 +1464,13 @@ def train( def evaluate( - neox_args, forward_step_fn, data_iterator, model, verbose=False, timers=None + neox_args, + forward_step_fn, + data_iterator, + model, + verbose=False, + timers=None, + reference_model=None, ): """Evaluation. neox_args: NeoX Arguments @@ -1012,6 +1484,7 @@ def evaluate( # Turn on evaluation mode which disables dropout. 
model.eval() losses = [] + metric_dicts = defaultdict(list) if neox_args.char_level_ppl: data_iterator = CharCounter(data_iterator, neox_args.tokenizer) @@ -1033,14 +1506,16 @@ def evaluate( else neox_args.gradient_accumulation_steps ): # Forward evaluation - loss = forward_step_fn( + loss, metric_dict = forward_step_fn( model=model, data_iterator=data_iterator, neox_args=neox_args, timers=timers, + reference_model=reference_model, ) losses.append(loss) - + for key in metric_dict.keys(): + metric_dicts[key].append(metric_dict[key]) # When contiguous memory optimizations are enabled, the buffers # allocated by the optimizations are deallocated during backward pass # in the absence of backward pass the buffers should be reset after each @@ -1050,6 +1525,8 @@ def evaluate( # reduces losses across processes for logging & run eval harness tasks eval_results = {"lm_loss": reduce_losses(losses).mean().item()} + for key in metric_dicts.keys(): + eval_results[key] = reduce_losses(metric_dicts[key]).mean().item() eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"]) if neox_args.char_level_ppl: @@ -1092,6 +1569,7 @@ def evaluate_and_print_results( verbose=False, timers=None, chart_name="validation", + reference_model=None, ): """Helper function to evaluate and dump results on screen.""" total_loss_dict = evaluate( @@ -1101,6 +1579,7 @@ def evaluate_and_print_results( model=model, verbose=verbose, timers=timers, + reference_model=reference_model, ) string = f" {chart_name} results at {prefix} | " for k, v in total_loss_dict.items(): @@ -1117,6 +1596,7 @@ def evaluate_and_print_results( iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) else: string += f"{k} value: {v:.6E} | " @@ -1126,6 +1606,7 @@ def evaluate_and_print_results( iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) length = len(string) + 1 diff --git a/megatron/utils.py b/megatron/utils.py index 26b4439bd..507c44179 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -275,10 +275,11 @@ def elapsed(self, reset=True): class Timers: """Group of timers.""" - def __init__(self, use_wandb, tensorboard_writer): + def __init__(self, use_wandb, tensorboard_writer, comet_experiment): self.timers = {} self.use_wandb = use_wandb self.tensorboard_writer = tensorboard_writer + self.comet_experiment = comet_experiment def __call__(self, name): if name not in self.timers: @@ -300,6 +301,14 @@ def write(self, names, iteration, normalizer=1.0, reset=False): if self.use_wandb: wandb.log({f"timers/{name}": value}, step=iteration) + if self.comet_experiment: + self.comet_experiment.__internal_api__log_metric__( + f"timers/{name}", + value, + framework="gpt-neox", + step=iteration, + ) + def log(self, names, normalizer=1.0, reset=True): """Log a group of timers.""" assert normalizer > 0.0 @@ -449,7 +458,7 @@ def setup_for_inference_or_eval(use_cache=True, overwrite_values=None, input_arg initialize_megatron(neox_args) # set up model and load checkpoint. 
- model, _, _ = setup_model_and_optimizer( + model, _, _, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=use_cache, iteration=neox_args.iteration, diff --git a/post-training/README.md b/post-training/README.md new file mode 100644 index 000000000..fb7ac8eb4 --- /dev/null +++ b/post-training/README.md @@ -0,0 +1,57 @@ +# Post-Training + +Examples for running post-training with ultrafeedback data for SFT/DPO/RM training. + +```bash +python tools/ckpts/convert_hf_llama_to_neox.py --tp 4 --model meta-llama/Meta-Llama-3-8B-Instruct --model_path checkpoints/neox_converted/llama3-8b-instruct +``` + +## Data generation +First, grab the jsonl file... + +```bash +python post-training/llama_data.py +``` +## DPO data +```bash +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_test_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_test_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --only-last +``` + +## RM data +```bash +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_rm_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_test_filtered.jsonl --output-prefix data/pairwise/llama3_rm_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_rm_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_rm_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_test_filtered.jsonl --output-prefix 
data/pairwise/llama3_rm_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_rm_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --for-rm +``` + +## SFT data +```bash +python tools/datasets/preprocess_data_with_chat_template.py --input data/sft/llama3_sft_train_filtered.jsonl --output-prefix data/sft/llama3_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages +python tools/datasets/preprocess_data_with_chat_template.py --input data/sft/llama3_sft_test_filtered.jsonl --output-prefix data/sft/llama3_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages +python tools/datasets/preprocess_data_with_chat_template.py --input data/sft/llama3_sft_train_filtered.jsonl --output-prefix data/sft/llama3_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages +``` + +## KTO data +```bash +python tools/datasets/preprocess_data_with_chat_template.py --input data/kto/llama3_sft_train_filtered.jsonl --output-prefix data/kto/llama3_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages --reward-key reward +python tools/datasets/preprocess_data_with_chat_template.py --input data/kto/llama3_sft_test_filtered.jsonl --output-prefix data/kto/llama3_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages --reward-key reward +python tools/datasets/preprocess_data_with_chat_template.py --input data/kto/llama3_sft_train_filtered.jsonl --output-prefix data/kto/llama3_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages --reward-key reward +``` + + +## Converting back to hf +```bash +# RM +python tools/ckpts/convert_neox_to_hf.py --input_dir eleuther-neox/checkpoints/rm/llama3/llama3-8b-instruct/global_step100 --output_dir checkpoints/rm/llama3_hf --config_file checkpoints/rm/llama3/llama3-8b-instruct/global_step100/configs/llama3-8b-rm.yml --precision bf16 --vocab-is-hf-tokenizer --architecture llama --pad-token-id 128002 + +# SFT/DPO +python tools/ckpts/convert_neox_to_hf.py --input_dir eleuther-neox/checkpoints//llama3/llama3-8b-instruct/global_step100 --output_dir checkpoints//llama3_hf --config_file checkpoints//llama3/llama3-8b-instruct/global_step100/configs/llama3-8b-rm.yml --precision bf16 --vocab-is-hf-tokenizer --architecture llama +``` diff --git a/post-training/configs/benchmarking/llama-13b-dpo.yml b/post-training/configs/benchmarking/llama-13b-dpo.yml new file mode 100644 index 000000000..1b97f51b4 --- /dev/null +++ b/post-training/configs/benchmarking/llama-13b-dpo.yml @@ -0,0 +1,127 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 64, + + # model settings + "num_layers": 40, + "hidden_size": 5120, + "num_attention_heads": 40, + "num_kv_heads": 40, + # following along with zephyr's max length... 
+ "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + "rmsnorm_fusion": true, + + "attention_config": [[["flash"], 40]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 13824, + "mlp_multiple_of": 13824, + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 5.0e-7, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1000000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1000000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_impl": "dpo", + "dataset_impl": "pairwise", + "dpo_reference_free": false, + "dpo_fp32": false, + "dpo_beta": 0.01, + "allow_chopped": false, + "pos_train_data_paths": [ "data/pairwise/dpo_train_chosen_document" ], + "pos_train_label_data_paths": [ "data/pairwise/dpo_train_chosen_label_document" ], + "neg_train_data_paths": [ "data/pairwise/dpo_train_rejected_document" ], + "neg_train_label_data_paths": [ "data/pairwise/dpo_train_rejected_label_document" ], + "pos_valid_data_paths": [ "data/pairwise/dpo_val_chosen_document" ], + "pos_valid_label_data_paths": [ "data/pairwise/dpo_val_chosen_label_document" ], + "neg_valid_data_paths": [ "data/pairwise/dpo_val_rejected_document" ], + "neg_valid_label_data_paths": [ "data/pairwise/dpo_val_rejected_label_document" ], + "pos_test_data_paths": [ "data/pairwise/dpo_val_chosen_document" ], + "pos_test_label_data_paths": [ "data/pairwise/dpo_val_chosen_label_document" ], + "neg_test_data_paths": [ "data/pairwise/dpo_val_rejected_document" ], + "neg_test_label_data_paths": [ "data/pairwise/dpo_val_rejected_label_document" ], + + + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 16, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 4, + + "checkpoint_activations": false, + "checkpoint_num_layers": 1, + "partition_activations": false, + "synchronize_each_layer": false, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": false, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "bf16" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 143000, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/pairwise/llama-13b-dpo", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + # use the same mistral tokenizer just for performance testing + "vocab-file": "checkpoints/neox_converted/zephyr-sft/tokenizer/tokenizer.json", + "use_wandb": true, + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", + "wandb_group": "llama-13b", + "wandb_project": "llama-13b-perf-test", +} diff --git a/post-training/configs/benchmarking/mistral-dpo.yml 
b/post-training/configs/benchmarking/mistral-dpo.yml new file mode 100644 index 000000000..3e2f1a5ac --- /dev/null +++ b/post-training/configs/benchmarking/mistral-dpo.yml @@ -0,0 +1,126 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # following along with zephyr's max length... + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + "rmsnorm_fusion": true, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 5.0e-7, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_impl": "dpo", + "dataset_impl": "pairwise", + "dpo_fp32": false, + "dpo_beta": 0.01, + "allow_chopped": false, + "pos_train_data_paths": [ "data/pairwise/dpo_train_chosen_document" ], + "pos_train_label_data_paths": [ "data/pairwise/dpo_train_chosen_label_document" ], + "neg_train_data_paths": [ "data/pairwise/dpo_train_rejected_document" ], + "neg_train_label_data_paths": [ "data/pairwise/dpo_train_rejected_label_document" ], + "pos_valid_data_paths": [ "data/pairwise/dpo_val_chosen_document" ], + "pos_valid_label_data_paths": [ "data/pairwise/dpo_val_chosen_label_document" ], + "neg_valid_data_paths": [ "data/pairwise/dpo_val_rejected_document" ], + "neg_valid_label_data_paths": [ "data/pairwise/dpo_val_rejected_label_document" ], + "pos_test_data_paths": [ "data/pairwise/dpo_val_chosen_document" ], + "pos_test_label_data_paths": [ "data/pairwise/dpo_val_chosen_label_document" ], + "neg_test_data_paths": [ "data/pairwise/dpo_val_rejected_document" ], + "neg_test_label_data_paths": [ "data/pairwise/dpo_val_rejected_label_document" ], + + + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 8, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": false, + "checkpoint_num_layers": 32, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": false, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "bf16" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 143000, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/pairwise/zephyr-beta-recreation", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": 
"checkpoints/neox_converted/zephyr-sft", + "vocab-file": "checkpoints/neox_converted/zephyr-sft/tokenizer/tokenizer.json", + "use_wandb": true, + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", + "wandb_group": "zephyr-beta-dpo", + "wandb_project": "zephyr-beta-dpo", +} diff --git a/post-training/configs/llama3-8b-dpo.yml b/post-training/configs/llama3-8b-dpo.yml new file mode 100644 index 000000000..8a75caef0 --- /dev/null +++ b/post-training/configs/llama3-8b-dpo.yml @@ -0,0 +1,125 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # llama3 supports more than this but this is just for testing. + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 5.0e-7, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_impl": "dpo", + "dataset_impl": "pairwise", + "dpo_fp32": true, + "dpo_beta": 0.01, + "allow_chopped": false, + "pos_train_data_paths": [ "data/pairwise/llama3_dpo_train_chosen_document" ], + "pos_train_label_data_paths": [ "data/pairwise/llama3_dpo_train_chosen_label_document" ], + "neg_train_data_paths": [ "data/pairwise/llama3_dpo_train_rejected_document" ], + "neg_train_label_data_paths": [ "data/pairwise/llama3_dpo_train_rejected_label_document" ], + "pos_valid_data_paths": [ "data/pairwise/llama3_dpo_val_chosen_document" ], + "pos_valid_label_data_paths": [ "data/pairwise/llama3_dpo_val_chosen_label_document" ], + "neg_valid_data_paths": [ "data/pairwise/llama3_dpo_val_rejected_document" ], + "neg_valid_label_data_paths": [ "data/pairwise/llama3_dpo_val_rejected_label_document" ], + "pos_test_data_paths": [ "data/pairwise/llama3_dpo_val_chosen_document" ], + "pos_test_label_data_paths": [ "data/pairwise/llama3_dpo_val_chosen_label_document" ], + "neg_test_data_paths": [ "data/pairwise/llama3_dpo_val_rejected_document" ], + "neg_test_label_data_paths": [ "data/pairwise/llama3_dpo_val_rejected_label_document" ], + + "train_micro_batch_size_per_gpu": 32, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 
477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 100, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/dpo/llama3/llama3-8b-instruct", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "checkpoints/neox_converted/llama3-8b-instruct", + "vocab-file": "checkpoints/neox_converted/llama3-8b-instruct/tokenizer/tokenizer.json", + "use_wandb": true, + "wandb_group": "llama3-8b-instruct", + "wandb_project": "ultrafeedback-dpo", + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", +} diff --git a/post-training/configs/llama3-8b-kto.yml b/post-training/configs/llama3-8b-kto.yml new file mode 100644 index 000000000..e819d37cb --- /dev/null +++ b/post-training/configs/llama3-8b-kto.yml @@ -0,0 +1,120 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # llama3 supports more than this but this is just for testing. + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.000001, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + + "train_impl": "kto", + "kto_fp32": true, + "kto_beta": 0.1, + "allow_chopped": false, + "train_label_data_paths": [ "data/kto/llama3_train_messages_label_document" ], + "test_label_data_paths": [ "data/kto/llama3_test_messages_label_document" ], + "valid_label_data_paths": [ "data/kto/llama3_train_messages_label_document" ], + "train_data_paths": [ "data/kto/llama3_train_messages_document" ], + "test_data_paths": [ "data/kto/llama3_test_messages_document" ], + "valid_data_paths": [ "data/kto/llama3_train_messages_document" ], + "train_reward_data_paths": [ "data/kto/llama3_train_messages_reward_document" ], + "test_reward_data_paths": [ "data/kto/llama3_test_messages_reward_document" ], + "valid_reward_data_paths": [ "data/kto/llama3_train_messages_reward_document" ], + + "train_micro_batch_size_per_gpu": 32, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + 
"enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 100, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/kto/llama3/llama3-8b-instruct", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "checkpoints/neox_converted/llama3-8b-instruct", + "vocab-file": "checkpoints/neox_converted/llama3-8b-instruct/tokenizer/tokenizer.json", + "use_wandb": true, + "wandb_group": "llama3-8b-instruct", + "wandb_project": "ultrafeedback-kto", + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", +} diff --git a/post-training/configs/llama3-8b-rm.yml b/post-training/configs/llama3-8b-rm.yml new file mode 100644 index 000000000..43117bf95 --- /dev/null +++ b/post-training/configs/llama3-8b-rm.yml @@ -0,0 +1,121 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # llama3 supports more than this but this is just for testing. + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 5.0e-7, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_impl": "rm", + "dataset_impl": "pairwise", + "allow_chopped": false, + "pos_train_data_paths": [ "data/pairwise/llama3_rm_train_chosen_document" ], + "pos_train_label_data_paths": [ "data/pairwise/llama3_rm_train_chosen_label_document" ], + "neg_train_data_paths": [ "data/pairwise/llama3_rm_train_rejected_document" ], + "neg_train_label_data_paths": [ "data/pairwise/llama3_rm_train_rejected_label_document" ], + "pos_valid_data_paths": [ "data/pairwise/llama3_rm_val_chosen_document" ], + "pos_valid_label_data_paths": [ "data/pairwise/llama3_rm_val_chosen_label_document" ], + "neg_valid_data_paths": [ "data/pairwise/llama3_rm_val_rejected_document" ], + "neg_valid_label_data_paths": [ "data/pairwise/llama3_rm_val_rejected_label_document" ], + "pos_test_data_paths": [ "data/pairwise/llama3_rm_val_chosen_document" ], + "pos_test_label_data_paths": [ "data/pairwise/llama3_rm_val_chosen_label_document" ], + "neg_test_data_paths": [ "data/pairwise/llama3_rm_val_rejected_document" ], + "neg_test_label_data_paths": [ "data/pairwise/llama3_rm_val_rejected_label_document" ], + + "train_micro_batch_size_per_gpu": 32, + 
"gradient_accumulation_steps": 2, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 100, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/rm/llama3/llama3-8b-instruct", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "checkpoints/neox_converted/llama3-8b-instruct", + "vocab-file": "checkpoints/neox_converted/llama3-8b-instruct/tokenizer/tokenizer.json", + "use_wandb": true, + "wandb_group": "llama3-8b-instruct", + "wandb_project": "ultrafeedback-rm", + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", +} diff --git a/post-training/configs/llama3-8b-sft.yml b/post-training/configs/llama3-8b-sft.yml new file mode 100644 index 000000000..bfcea1142 --- /dev/null +++ b/post-training/configs/llama3-8b-sft.yml @@ -0,0 +1,112 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # llama3 supports more than this but this is just for testing. + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.000001, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_label_data_paths": [ "data/sft/llama3_train_messages_label_document" ], + "test_label_data_paths": [ "data/sft/llama3_test_messages_label_document" ], + "valid_label_data_paths": [ "data/sft/llama3_train_messages_label_document" ], + "train_data_paths": [ "data/sft/llama3_train_messages_document" ], + "test_data_paths": [ "data/sft/llama3_test_messages_document" ], + "valid_data_paths": [ "data/sft/llama3_train_messages_document" ], + + "train_micro_batch_size_per_gpu": 32, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + 
"gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 100, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/sft/llama3/llama3-8b-instruct", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "checkpoints/neox_converted/llama3-8b-instruct", + "vocab-file": "checkpoints/neox_converted/llama3-8b-instruct/tokenizer/tokenizer.json", + "use_wandb": true, + "wandb_group": "llama3-8b-instruct", + "wandb_project": "ultrafeedback-sft", + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", +} diff --git a/post-training/dpo_data.py b/post-training/dpo_data.py new file mode 100644 index 000000000..d24eb43e5 --- /dev/null +++ b/post-training/dpo_data.py @@ -0,0 +1,103 @@ +""" +https://github.com/huggingface/alignment-handbook/blob/main/scripts/run_dpo.py +adapted to just grab the dataset +""" +import os +from alignment import ( + DataArguments, + DPOConfig, + H4ArgumentParser, + ModelArguments, + apply_chat_template, + decontaminate_humaneval, + get_checkpoint, + get_datasets, + get_kbit_device_map, + get_peft_config, + get_quantization_config, + get_tokenizer, + is_adapter_model, +) +from datasets import load_dataset, DatasetDict +from transformers import AutoTokenizer + +import jsonlines + +############### +# Load datasets +############### +raw_datasets = load_dataset("HuggingFaceH4/ultrafeedback_binarized") +raw_datasets = DatasetDict( + { + "train": raw_datasets["train_prefs"], + "test": raw_datasets["test_prefs"], + } +) +column_names = list(raw_datasets["train"].features) + +##################################### +# Load tokenizer and process datasets +##################################### +truncation_side = ( + "left" # Truncate from left to ensure we don't lose labels in final turn +) +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") + +##################### +# Apply chat template +##################### +raw_datasets = raw_datasets.map( + apply_chat_template, + fn_kwargs={ + "tokenizer": tokenizer, + "task": "dpo", + "auto_insert_empty_system_msg": True, + }, + desc="Formatting comparisons with prompt template", +) + +########################## +# Decontaminate benchmarks +########################## +num_raw_train_samples = len(raw_datasets["train"]) +raw_datasets = raw_datasets.filter( + decontaminate_humaneval, + fn_kwargs={"text_column": "text_chosen"}, + batched=True, + batch_size=10_000, + num_proc=1, + desc="Decontaminating HumanEval samples", +) +num_filtered_train_samples = num_raw_train_samples - len(raw_datasets["train"]) +print( + f"Decontaminated {num_filtered_train_samples} ({num_filtered_train_samples / num_raw_train_samples * 100:.2f}%) samples from the training set." +) +############### +# Length filter +############### +# Since the alignment handbook recipes call for a max token limit of 1024... 
+num_filtered_train_samples = len(raw_datasets["train"]) + + +def length_filter(example): + return (len(tokenizer.apply_chat_template(example["chosen"])) < 1024) and ( + len(tokenizer.apply_chat_template(example["rejected"])) < 1024 + ) + + +num_length_filtered_train_samples = num_filtered_train_samples - len( + raw_datasets["train"] +) +print( + f"Length Filtered {num_length_filtered_train_samples} ({num_length_filtered_train_samples / num_filtered_train_samples * 100:.2f}%) samples from the training set." +) +# get directory of the python script +dir_path = os.path.dirname(os.path.realpath(__file__)) +for split in ["train", "test"]: + with open(os.path.join(dir_path, f"dpo_{split}_filtered.jsonl"), "w") as f: + writer = jsonlines.Writer(f) + for item in raw_datasets[split]: + # add empty system messages + item["chosen"] = [{"role": "system", "content": ""}] + item["chosen"] + item["rejected"] = [{"role": "system", "content": ""}] + item["rejected"] + writer.write(item) diff --git a/post-training/llama_data.py b/post-training/llama_data.py new file mode 100644 index 000000000..eab6ac9f1 --- /dev/null +++ b/post-training/llama_data.py @@ -0,0 +1,49 @@ +import os + +from datasets import load_dataset, DatasetDict + +import jsonlines + +############### +# Load datasets +############### +raw_datasets = load_dataset("HuggingFaceH4/ultrafeedback_binarized") +# convert to just train and test, not necessary but it looks better +raw_datasets = DatasetDict( + { + "train": raw_datasets["train_prefs"], + "test": raw_datasets["test_prefs"], + } +) +os.makedirs(os.path.join("data", "pairwise"), exist_ok=True) +for split in ["train", "test"]: + with open( + os.path.join("data", "pairwise", f"llama3_dpo_{split}_filtered.jsonl"), "w" + ) as f: + writer = jsonlines.Writer(f) + for item in raw_datasets[split]: + item["chosen"] = item["chosen"] + item["rejected"] = item["rejected"] + writer.write(item) +os.makedirs(os.path.join("data", "sft"), exist_ok=True) +for split in ["train", "test"]: + with open( + os.path.join("data", "sft", f"llama3_sft_{split}_filtered.jsonl"), "w" + ) as f: + writer = jsonlines.Writer(f) + for item in raw_datasets[split]: + item["messages"] = item["chosen"] + writer.write(item) +os.makedirs(os.path.join("data", "kto"), exist_ok=True) +for split in ["train", "test"]: + with open( + os.path.join("data", "kto", f"llama3_kto_{split}_filtered.jsonl"), "w" + ) as f: + writer = jsonlines.Writer(f) + for item in raw_datasets[split]: + item["messages"] = item["chosen"] + item["reward"] = 1 + writer.write(item) + item["messages"] = item["rejected"] + item["reward"] = -1 + writer.write(item) diff --git a/post-training/recreating_zephyr_dpo.md b/post-training/recreating_zephyr_dpo.md new file mode 100644 index 000000000..d97eb3791 --- /dev/null +++ b/post-training/recreating_zephyr_dpo.md @@ -0,0 +1,39 @@ +# Initial setup + +```bash +python tools/ckpts/convert_hf_llama_to_neox.py --tp 2 --model HuggingFaceH4/mistral-7b-sft-beta --model_path checkpoints/neox_converted/zephyr-sft_tp2 +``` + + +# To generate data +First make a new environment... We want to keep the same data between runs so the easiest way is to create a new conda +environment and follow the steps below. +``` +conda create -n handbook python=3.10 && conda activate handbook +git clone https://github.com/huggingface/alignment-handbook.git +cd ./alignment-handbook/ +python -m pip install . 
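+# jsonlines is used by post-training/dpo_data.py to write the filtered splits to .jsonl files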
+python -m pip install jsonlines +``` + +## DPO data +```bash +# from the gpt-neox repo +conda activate handbook +python post-training/dpo_data.py +conda deactivate +# activate your neox conda environment, or whatever you need to switch to the neox environment +mkdir data +mkdir data/pairwise +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last +``` + +## Running +```bash +python deepy.py train.py post-training/configs/benchmarking/mistral-dpo.yml +``` diff --git a/requirements/pyproject-apex-pip.toml b/requirements/pyproject-apex-pip.toml new file mode 100644 index 000000000..df41dc925 --- /dev/null +++ b/requirements/pyproject-apex-pip.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-apex-pip" +version = "0.1.0" +description = "Apex pip requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +pip = "23.3.2" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-comet.toml b/requirements/pyproject-comet.toml new file mode 100644 index 000000000..04422a213 --- /dev/null +++ b/requirements/pyproject-comet.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-comet" +version = "0.1.0" +description = "Comet ML requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +comet_ml = ">=3.45.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-flashattention.toml b/requirements/pyproject-flashattention.toml new file mode 100644 index 000000000..14c7ad112 --- /dev/null +++ b/requirements/pyproject-flashattention.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-flashattention" +version = "0.1.0" +description = "Flash Attention requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +flash-attn = "2.5.6" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git 
a/requirements/pyproject-mamba.toml b/requirements/pyproject-mamba.toml new file mode 100644 index 000000000..0f6191662 --- /dev/null +++ b/requirements/pyproject-mamba.toml @@ -0,0 +1,16 @@ +[tool.poetry] +name = "gpt-neox-mamba" +version = "0.1.0" +description = "Mamba requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +causal_conv1d = ">=1.1.0" +einops = "*" +mamba_ssm = ">=1.2.0.post1" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-neox-dev.toml b/requirements/pyproject-neox-dev.toml new file mode 100644 index 000000000..55b00f6ba --- /dev/null +++ b/requirements/pyproject-neox-dev.toml @@ -0,0 +1,23 @@ +[tool.poetry] +name = "gpt-neox-dev" +version = "0.1.0" +description = "Development requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +autopep8 = ">=1.5.6" +clang-format = ">=13.0.1" +pre-commit = ">=2.17.0" +pytest = ">=6.2.3" +pytest-cov = ">=2.11.1" +pytest-forked = ">=1.3.0" +pytest-html = "4.1.1" +pytest-xdist = "*" +toml = ">=0.10.2" +packaging = ">=23.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-onebitadam.toml b/requirements/pyproject-onebitadam.toml new file mode 100644 index 000000000..aeaf33aa6 --- /dev/null +++ b/requirements/pyproject-onebitadam.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-onebitadam" +version = "0.1.0" +description = "OneBitAdam requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +cupy-cuda111 = ">=8.6.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-s3.toml b/requirements/pyproject-s3.toml new file mode 100644 index 000000000..a0cb99aef --- /dev/null +++ b/requirements/pyproject-s3.toml @@ -0,0 +1,15 @@ +[tool.poetry] +name = "gpt-neox-s3" +version = "0.1.0" +description = "S3 requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +boto3 = "*" +hf-transfer = ">=0.1.3" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-sparseattention.toml b/requirements/pyproject-sparseattention.toml new file mode 100644 index 000000000..2864c799b --- /dev/null +++ b/requirements/pyproject-sparseattention.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-sparseattention" +version = "0.1.0" +description = "Sparse Attention requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +triton = "2.1.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-tensorboard.toml b/requirements/pyproject-tensorboard.toml new file mode 100644 index 000000000..79bbfa900 --- /dev/null +++ b/requirements/pyproject-tensorboard.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-tensorboard" +version = "0.1.0" +description = "TensorBoard requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +tensorboard = "2.13.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git 
a/requirements/pyproject-transformerengine.toml b/requirements/pyproject-transformerengine.toml new file mode 100644 index 000000000..7c313e0d9 --- /dev/null +++ b/requirements/pyproject-transformerengine.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-transformerengine" +version = "0.1.0" +description = "Transformer Engine requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +transformer-engine = {git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "stable"} + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-wandb.toml b/requirements/pyproject-wandb.toml new file mode 100644 index 000000000..c5806b341 --- /dev/null +++ b/requirements/pyproject-wandb.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-wandb" +version = "0.1.0" +description = "Weights & Biases requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +wandb = ">=0.10.28" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject.toml b/requirements/pyproject.toml new file mode 100644 index 000000000..91d6fc1dd --- /dev/null +++ b/requirements/pyproject.toml @@ -0,0 +1,33 @@ +[tool.poetry] +name = "gpt-neox" +version = "2.0.0" +description = "An open-source library for training large-scale language models on GPUs" +authors = ["EleutherAI "] +license = "Apache-2.0" +readme = "README.md" +homepage = "https://www.github.com/eleutherai/gpt-neox" +repository = "https://www.github.com/eleutherai/gpt-neox" +documentation = "https://www.github.com/eleutherai/gpt-neox" + +[tool.poetry.dependencies] +python = "^3.8" +deepspeed = {git = "https://github.com/EleutherAI/DeeperSpeed.git", rev = "02e2ebf7dee6aaab3d89094ed470a4609763c742"} +ftfy = "^6.0.1" +huggingface_hub = "^0.11.0" +jinja2 = "3.1.4" +lm_dataformat = {git = "https://github.com/EleutherAI/lm_dataformat.git", rev = "4eec05349977071bf67fc072290b95e31c8dd836"} +lm_eval = ">=0.4.0,<=0.4.1" +mpi4py = "^3.0.3" +numpy = "<2.0" +pybind11 = "^2.6.2" +regex = "*" +sentencepiece = "*" +six = "*" +tiktoken = "^0.1.2" +tokenizers = "^0.12.1" +transformers = "4.38.0" +toml = "*" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/requirements-comet.txt b/requirements/requirements-comet.txt new file mode 100644 index 000000000..904301eaa --- /dev/null +++ b/requirements/requirements-comet.txt @@ -0,0 +1 @@ +comet_ml>=3.45.0 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 60ff3224f..8dfd5595c 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,8 +1,10 @@ autopep8>=1.5.6 clang-format>=13.0.1 +packaging>=23.0 pre-commit>=2.17.0 pytest>=6.2.3 pytest-cov>=2.11.1 pytest-forked>=1.3.0 pytest-html==4.1.1 pytest-xdist +toml>=0.10.2 diff --git a/requirements/requirements-transformerengine.txt b/requirements/requirements-transformerengine.txt new file mode 100644 index 000000000..2050d7566 --- /dev/null +++ b/requirements/requirements-transformerengine.txt @@ -0,0 +1 @@ +pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 3ac92598a..b5a84674b 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,11 +1,11 @@ 
deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed ftfy>=6.0.1 -lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 jinja2==3.1.4 +lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 lm_eval>=0.4.0,<=0.4.1 mpi4py>=3.0.3 -numpy>=1.22.0 +numpy<2.0 pybind11>=2.6.2 regex sentencepiece diff --git a/tests/README.md b/tests/README.md index 316096cc5..32618d757 100644 --- a/tests/README.md +++ b/tests/README.md @@ -3,6 +3,7 @@ Tests use pytests with coverage and forked plugins. Install with: ```bash +pip install -r requirements/requirements.txt pip install -r requirements/requirements-dev.txt ``` @@ -32,7 +33,7 @@ pytest --forked tests/model/test_model_generation.py Some tests can run on cpu only. These are marked with the decorator @pytest.mark.cpu. The test cases for cpu can be run with: -```` +``` pytest tests -m cpu ``` @@ -49,3 +50,80 @@ if You see this kind of error: RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method ``` It usually means that you used some pytorch.cuda function before the test creates the processes. However just importing `from torch.utils import cpp_extension` can also trigger this. + + +## CPU Test Integration + +Tests can be run against physical CPUs through GitHub Actions. To have tests run on the physical CPU test, here is generally how the CI should be written: + +### runs-on + +#### NOTE: These BKMs were written to work with CI infrastructure that is no longer in place. To use the Github runners (ubuntu-22.04 / ubuntu-latest), skip the 'runs-on' section. + +The CI needs to be written to target the CPU Github Action runner. The jobs that need to run on CPU should use the hardware runner's labels: +```yaml +jobs: + cpu-test-job: + runs-on: [ 'self-hosted', 'aws', 'test'] # these labels tell GitHub to execute on the runner with the 'aws' and 'test' labels +``` + +### Software dependencies + +Hardware tests that need python and docker should install them as part of the test execution to make sure the tests run as expected: +```yaml +steps: + # sample syntax to setup python with pip + - uses: actions/setup-python@v4 + with: + python-version: "3.8" + cache: "pip" + + # sample setup of docker (there's no official Docker setup action) + - name: Docker setup + run: | # taken from Docker's installation page: https://docs.docker.com/engine/install/ubuntu/ + # Add Docker's official GPG key: + sudo apt-get update + sudo apt-get install ca-certificates curl + sudo install -m 0755 -d /etc/apt/keyrings + sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc + sudo chmod a+r /etc/apt/keyrings/docker.asc + # Add the repository to Apt sources: + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update + sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y +``` + +Any other software dependencies should be assumed to be missing and installed as part of the CI. + +### Using Docker image + +Using the Docker image and running tests in a container is recommended to resolve environment issues. 
There is a modified docker-compose.yml in tests/cpu_tests directory that is recommended to be used for CPU tests: + +```bash +cp tests/cpu_tests/docker-compose.yml . +# export any env variables here that should be used: +export NEOX_DATA_PATH='./data/enwik8' +docker compose run -d --build --name $CONTAINER gpt-neox tail -f /dev/null +# then can set up and run tests in the container using docker exec +docker exec $CONTAINER pip install -r /workspace/requirements-dev.txt +# etc. +# please clean up the container as part of the CI: +docker rm $CONTAINER +``` + +At the time of writing there is no built-in method to provide an offline-built Docker image to `jobs..container`. + +### Using existing CPU test CI + +There is an existing CPU test workflow that can be included in existing CI: + +```yaml +steps: + - name: Run CPU Tests + uses: + target_test_ref: $GITHUB_REF # replace with the ref/SHA that the tests should be run on + # have a look at the reusable workflow here: https://github.com/EleutherAI/gpt-neox/blob/main/tests/cpu_tests/action.yml +``` diff --git a/tests/model/test_model_train.py b/tests/model/test_model_train.py index 31798f342..65adfcdee 100644 --- a/tests/model/test_model_train.py +++ b/tests/model/test_model_train.py @@ -28,7 +28,6 @@ PARAMS_TO_TEST = { "gpt_j_residual": [True, False], - "mlp_type": ["llama", "regular"], "pos_emb": ["learned", "rotary", "sinusoidal", "rpe", "alibi", "none"], "attention_config": [ "global", diff --git a/tests/neox_args/test_neoxargs_usage.py b/tests/neox_args/test_neoxargs_usage.py index 176151c2a..5f8ba7bd2 100644 --- a/tests/neox_args/test_neoxargs_usage.py +++ b/tests/neox_args/test_neoxargs_usage.py @@ -66,7 +66,9 @@ def test_neoxargs_usage(): # find args matches matches = list( - re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents) + re.findall( + r"(?<=neox_args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents + ) ) if len(matches) == 0: continue diff --git a/tests/requirements/test_requirements.py b/tests/requirements/test_requirements.py new file mode 100644 index 000000000..20e8ad0dd --- /dev/null +++ b/tests/requirements/test_requirements.py @@ -0,0 +1,131 @@ +import pytest +import toml +from pathlib import Path +from typing import Dict, List, Optional +from packaging.version import parse as parse_version, Version +from dataclasses import dataclass + + +@dataclass +class Dependency: + name: str + version: Optional[str] = None + + @classmethod + def from_requirement(cls, requirement: str) -> "Dependency": + """Parse a requirement string into a Dependency object.""" + # Common version specifiers + specifiers = ["==", ">=", ">", "<=", "<"] + name = requirement + version = None + + for spec in specifiers: + if spec in requirement: + name, version = requirement.split(spec, 1) + version = version.strip() + break + + return cls(name.lower().strip(), version) + + def matches_version(self, other_version: str) -> bool: + """Check if this dependency's version matches another version string.""" + if not self.version or not other_version: + return True + + try: + # Convert versions to comparable objects + our_version = parse_version(self.version) + their_version = parse_version(other_version.replace("*", "0")) + return our_version == their_version + except ValueError: + # If versions can't be parsed, fall back to string comparison + return self.version.replace("*", "0") == other_version.replace("*", "0") + + +class DependencyValidator: + def __init__(self, requirements_dir: Path): + self.requirements_dir = requirements_dir 
+ + def parse_requirements(self, file_path: Path) -> List[Dependency]: + """Parse requirements.txt file into a list of Dependencies.""" + try: + with open(file_path, "r") as f: + lines = [ + line.strip() + for line in f + if line.strip() and not line.startswith("#") + ] + return [Dependency.from_requirement(line) for line in lines] + except FileNotFoundError: + raise FileNotFoundError(f"Requirements file not found: {file_path}") + except Exception as e: + raise ValueError(f"Error parsing requirements file {file_path}: {str(e)}") + + def parse_pyproject(self, file_path: Path) -> Dict[str, str]: + """Parse pyproject.toml file and extract dependencies.""" + try: + with open(file_path, "r") as f: + pyproject_data = toml.load(f) + return { + name.lower(): str(version) + for name, version in pyproject_data["tool"]["poetry"][ + "dependencies" + ].items() + if name.lower() != "python" # Exclude Python version + } + except FileNotFoundError: + raise FileNotFoundError(f"pyproject.toml file not found: {file_path}") + except Exception as e: + raise ValueError(f"Error parsing pyproject.toml {file_path}: {str(e)}") + + def compare_dependencies( + self, req_deps: List[Dependency], pyproject_deps: Dict[str, str] + ) -> tuple[bool, List[str]]: + """Compare dependencies between requirements.txt and pyproject.toml.""" + mismatches = [] + + for req in req_deps: + if req.name not in pyproject_deps: + mismatches.append( + f"Dependency '{req.name}' not found in pyproject.toml" + ) + continue + + if not req.matches_version(pyproject_deps[req.name]): + mismatches.append( + f"Version mismatch for '{req.name}': " + f"requirements.txt={req.version}, " + f"pyproject.toml={pyproject_deps[req.name]}" + ) + + return len(mismatches) == 0, mismatches + + +def get_corresponding_pyproject(req_file: Path) -> Path: + """Get the corresponding pyproject.toml file for a requirements file.""" + env_name = req_file.stem.split("-")[1] + return req_file.parent / f"pyproject-{env_name}.toml" + + +@pytest.mark.parametrize("req_file", Path("requirements").glob("requirements-*.txt")) +def test_pyproject_matches_requirements(req_file: Path): + """Test that requirements.txt dependencies match pyproject.toml dependencies.""" + validator = DependencyValidator(req_file.parent) + pyproject_file = get_corresponding_pyproject(req_file) + + # Parse both dependency files + req_deps = validator.parse_requirements(req_file) + pyproject_deps = validator.parse_pyproject(pyproject_file) + + # Compare dependencies and get detailed mismatches + is_match, mismatches = validator.compare_dependencies(req_deps, pyproject_deps) + + # Create detailed error message if there are mismatches + if not is_match: + error_msg = "\n".join( + [ + f"\nDependency mismatches found between {req_file} and {pyproject_file}:", + *[f"- {msg}" for msg in mismatches], + ] + ) + pytest.fail(error_msg) diff --git a/tests/unit/test_format_conversion_scripts.py b/tests/unit/test_format_conversion_scripts.py index e0801434c..6935e480a 100644 --- a/tests/unit/test_format_conversion_scripts.py +++ b/tests/unit/test_format_conversion_scripts.py @@ -4,8 +4,12 @@ from megatron.neox_arguments.neox_args import NeoXArgsTokenizer +@pytest.mark.skip( + reason="Conversion test is skipped until we fix the CUDA + torch multiprocessing issue." 
+) def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path): # Generate random GPT-NEOX model, check we can convert to hf format + model_dir = str(tmpdir) input_args = ["train.py", "tests/config/test_setup.yml"] deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args) diff --git a/tools/ckpts/README.md b/tools/ckpts/README.md index 24d5cf31c..770cfb9c6 100644 --- a/tools/ckpts/README.md +++ b/tools/ckpts/README.md @@ -131,3 +131,20 @@ options: --num_output_shards NUM_OUTPUT_SHARDS --pipeline_parallel Only use if PP>1 ``` + +### `convert_hf_llama_to_neox.py` +Takes an HF Llama checkpoint and puts it into a NeoX-compatible format. + +Note that this does not support pipeline parallelism! + +``` +usage: convert_hf_llama_to_neox.py [-h] [--tp TP] [--pp PP] [--model MODEL] [--model_path MODEL_PATH] + +options: + -h, --help show this help message and exit + --tp TP Number of tensor parallelism ranks + --pp PP Number of pipeline parallelism stages + --model MODEL HF model name + --model_path MODEL_PATH + Path to save model +``` diff --git a/tools/ckpts/convert_hf_llama_to_neox.py b/tools/ckpts/convert_hf_llama_to_neox.py new file mode 100644 index 000000000..21249995b --- /dev/null +++ b/tools/ckpts/convert_hf_llama_to_neox.py @@ -0,0 +1,211 @@ +import torch +import argparse +from transformers import AutoTokenizer, AutoModelForCausalLM +import os +import tqdm + + +def convert_model(hf_state_dict, hf_config, tp_ranks): + conv_state_dicts = [{} for _ in range(tp_ranks)] + # get embeddings... + for i, chunk in enumerate( + torch.chunk(hf_state_dict["model.embed_tokens.weight"], tp_ranks, dim=0) + ): + conv_state_dicts[i][ + "sequential.0.word_embeddings.weight" + ] = chunk.clone().detach() + print( + "model.embed_tokens.weight", + hf_state_dict["model.embed_tokens.weight"].shape, + "sequential.0.word_embeddings.weight", + conv_state_dicts[0]["sequential.0.word_embeddings.weight"].shape, + ) + # Get config data... + num_kv_heads = hf_config.num_key_value_heads + num_q_heads = hf_config.num_attention_heads + head_dim = hf_config.hidden_size // num_q_heads + # do layers... + for layer_num in tqdm.tqdm(range(model.model.config.num_hidden_layers)): + # --- attention --- + # Output first since it's a simple row parallel... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.self_attn.o_proj.weight"], + tp_ranks, + dim=1, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.attention.dense.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.self_attn.o_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.self_attn.o_proj.weight"].shape, + f"sequential.{layer_num+2}.attention.dense.weight", + conv_state_dicts[0][ + f"sequential.{layer_num+2}.attention.dense.weight" + ].shape, + ) + # Now for attention... + # Split into heads... + q = hf_state_dict[f"model.layers.{layer_num}.self_attn.q_proj.weight"] + k = hf_state_dict[f"model.layers.{layer_num}.self_attn.k_proj.weight"] + v = hf_state_dict[f"model.layers.{layer_num}.self_attn.v_proj.weight"] + # The GQA code splits the heads by the num_q_heads so we also do that + # here to ensure it matches... + q = q.view(num_q_heads, -1, q.shape[-1]) + k = k.view(num_q_heads, -1, q.shape[-1]) + v = v.view(num_q_heads, -1, q.shape[-1]) + # Chunk for tensor parallelism... + for i, q_chunk, k_chunk, v_chunk in zip( + range(tp_ranks), + torch.chunk(q, tp_ranks, dim=0), + torch.chunk(k, tp_ranks, dim=0), + torch.chunk(v, tp_ranks, dim=0), + ): + # Need to join the heads across q, k, v... 
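+        # concatenating along dim=1 places each query-head slice next to its
+        # matching k and v slices for that head group; the view() then flattens
+        # the result back to 2D so this rank's shard of the fused
+        # query_key_value weight is laid out head group by head group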
+ conv_state_dicts[i][ + f"sequential.{layer_num+2}.attention.query_key_value.weight" + ] = ( + torch.cat([q_chunk, k_chunk, v_chunk], dim=1) + .view(-1, q.shape[-1]) + .clone() + .detach() + ) + print( + f"model.layers.{layer_num}.self_attn.(q/k/v)_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.self_attn.q_proj.weight"].shape, + hf_state_dict[f"model.layers.{layer_num}.self_attn.k_proj.weight"].shape, + hf_state_dict[f"model.layers.{layer_num}.self_attn.v_proj.weight"].shape, + f"sequential.{layer_num+2}.attention.query_key_value.weight", + conv_state_dicts[0][ + f"sequential.{layer_num+2}.attention.query_key_value.weight" + ].shape, + ) + # --- mlp --- + # Do SwiGLU weights... + # w1... + for i, (w1, w3) in enumerate( + zip( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.gate_proj.weight"], + tp_ranks, + dim=0, + ), + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.up_proj.weight"], + tp_ranks, + dim=0, + ), + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.linear1.weight" + ] = torch.cat([w3.clone().detach(), w1.clone().detach()], dim=0) + print( + f"model.layers.{layer_num}.mlp.gate_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.gate_proj.weight"].shape, + f"model.layers.{layer_num}.mlp.up_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.up_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.w3.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.linear1.weight"].shape, + ) + # w2 (output)... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.down_proj.weight"], + tp_ranks, + dim=1, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.linear2.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.mlp.down_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.down_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.linear2.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.linear2.weight"].shape, + ) + # --- norm --- + for i in range(tp_ranks): + conv_state_dicts[i][f"sequential.{layer_num+2}.input_layernorm.scale"] = ( + hf_state_dict[f"model.layers.{layer_num}.input_layernorm.weight"] + .clone() + .detach() + ) + conv_state_dicts[i][ + f"sequential.{layer_num+2}.post_attention_layernorm.scale" + ] = ( + hf_state_dict[ + f"model.layers.{layer_num}.post_attention_layernorm.weight" + ] + .clone() + .detach() + ) + + # Get final ln/linear.... + index = model.model.config.num_hidden_layers + 3 + for i in range(tp_ranks): + conv_state_dicts[i][f"sequential.{index}.norm.scale"] = ( + hf_state_dict["model.norm.weight"].clone().detach() + ) + index += 1 + # do output... 
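+    # the output head is split along the vocab dimension (dim 0) across TP ranks,
+    # mirroring how the word embedding was chunked at the top of this function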
+ for i, chunk in enumerate( + torch.chunk(hf_state_dict["lm_head.weight"], tp_ranks, dim=0) + ): + conv_state_dicts[i][ + f"sequential.{index}.final_linear.weight" + ] = chunk.clone().detach() + print( + "lm_head.weight", + hf_state_dict["lm_head.weight"].shape, + f"sequential.{index}.final_linear.weight", + conv_state_dicts[0][f"sequential.{index}.final_linear.weight"].shape, + ) + return conv_state_dicts + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tp", type=int, default=1, help="Number of tensor parallelism ranks" + ) + parser.add_argument( + "--pp", type=int, default=0, help="Number of pipeline parallelism stages" + ) + parser.add_argument("--model", type=str, default="gpt2", help="HF model name") + parser.add_argument( + "--model_path", type=str, default=None, help="Path to save model" + ) + args = parser.parse_args() + assert args.pp == 0, "Pipeline parallelism not supported yet" + tokenizer = AutoTokenizer.from_pretrained(args.model).save_pretrained( + args.model_path + "/tokenizer" + ) + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype="auto") + state_dict = model.state_dict() + for key in state_dict.keys(): + print(key, state_dict[key].shape) + os.makedirs(args.model_path, exist_ok=True) + # Setup model directory... + os.makedirs(f"{args.model_path}/0", exist_ok=True) + # Save the latest file so neox can figure out where to grab the weights... + with open(f"{args.model_path}/latest", "w") as f: + f.write("0") + # Convert the model... + tp_state_dicts = convert_model(state_dict, model.model.config, args.tp) + for i in range(args.tp): + torch.save( + { + "dp_world_size": 1, + "mp_world_size": args.tp, + "optimizer": {}, + "global_steps": 1, + "skipped_steps": 1, + "iteration": 1, + "module": tp_state_dicts[i], + }, + f"{args.model_path}/0/mp_rank_{i:02d}_model_states.pt", + ) diff --git a/tools/ckpts/convert_hf_to_sequential.py b/tools/ckpts/convert_hf_to_sequential.py index c53f28391..5e0ada334 100644 --- a/tools/ckpts/convert_hf_to_sequential.py +++ b/tools/ckpts/convert_hf_to_sequential.py @@ -119,16 +119,27 @@ def shard_sequential_mp(num_mp_ranks, sequential): ranks = {x: dict() for x in range(num_mp_ranks)} for k, v in sequential.items(): if reduce( + np.logical_or, + [ + x in k + for x in [ + "dense_4h_to_h.bias", + "attention.dense.bias", + ] + ], + ): + # Divide by tp_size since they get added together + for x in range(num_mp_ranks): + ranks[x][k] = v / num_mp_ranks + elif reduce( np.logical_or, [ x in k for x in [ "layernorm", "rotary_emb", - "dense_4h_to_h.bias", "norm.weight", "norm.bias", - "attention.dense.bias", ] ], ): @@ -504,6 +515,7 @@ def get_non_existing_dir(tmp_dir): neox_args.configure_distributed_args() neox_args.build_tokenizer() neox_args.initialize_tensorboard_writer() + neox_args.comet() # setup logging and timers # init_wandb(neox_args=neox_args) diff --git a/tools/ckpts/convert_neox_to_hf.py b/tools/ckpts/convert_neox_to_hf.py index 35812383e..8dfe02d54 100644 --- a/tools/ckpts/convert_neox_to_hf.py +++ b/tools/ckpts/convert_neox_to_hf.py @@ -26,6 +26,7 @@ GPTNeoXConfig, AutoModelForCausalLM, AutoConfig, + AutoModelForSequenceClassification, ) from typing import List, Literal @@ -50,57 +51,110 @@ # Model definitions: a list of keys, and where they fall in terms of handling them in the presence of TP. 
# in format : {model arch: {param type: {param in neox: param in HF}}} - MODEL_KEYS = { "neox": { - "COLUMN_PARALLEL_LINEAR_KEYS": { - "mlp.dense_h_to_4h.weight": "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias": "mlp.dense_h_to_4h.bias", - "attention.query_key_value.weight": "attention.query_key_value.weight", - "attention.query_key_value.bias": "attention.query_key_value.bias", # TODO: handle GQA separately? - }, - "ROW_PARALLEL_LINEAR_KEYS": { - "attention.dense.weight": "attention.dense.weight", - "mlp.dense_4h_to_h.weight": "mlp.dense_4h_to_h.weight", - }, - "ROW_PARALLEL_BIAS_KEYS": { - "mlp.dense_4h_to_h.bias": "mlp.dense_4h_to_h.bias", - "attention.dense.bias": "attention.dense.bias", + "new": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.linear1.weight": "mlp.dense_h_to_4h.weight", + "mlp.linear1.bias": "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight": "attention.query_key_value.weight", + "attention.query_key_value.bias": "attention.query_key_value.bias", # TODO: handle GQA separately? + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "attention.dense.weight", + "mlp.linear2.weight": "mlp.dense_4h_to_h.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": { + "mlp.linear2.bias": "mlp.dense_4h_to_h.bias", + "attention.dense.bias": "attention.dense.bias", + }, + "NORM_KEYS": { + "input_layernorm.weight": "input_layernorm.weight", + "input_layernorm.bias": "input_layernorm.bias", + "post_attention_layernorm.weight": "post_attention_layernorm.weight", + "post_attention_layernorm.bias": "post_attention_layernorm.bias", + }, + "FINAL_NORM_KEYS": { + "norm.weight": "weight", + "norm.bias": "bias", + }, }, - "NORM_KEYS": { - "input_layernorm.weight": "input_layernorm.weight", - "input_layernorm.bias": "input_layernorm.bias", - "post_attention_layernorm.weight": "post_attention_layernorm.weight", - "post_attention_layernorm.bias": "post_attention_layernorm.bias", - }, - "FINAL_NORM_KEYS": { - "norm.weight": "weight", - "norm.bias": "bias", + "legacy": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.dense_h_to_4h.weight": "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias": "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight": "attention.query_key_value.weight", + "attention.query_key_value.bias": "attention.query_key_value.bias", # TODO: handle GQA separately? 
+ }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "attention.dense.weight", + "mlp.dense_4h_to_h.weight": "mlp.dense_4h_to_h.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": { + "mlp.dense_4h_to_h.bias": "mlp.dense_4h_to_h.bias", + "attention.dense.bias": "attention.dense.bias", + }, + "NORM_KEYS": { + "input_layernorm.weight": "input_layernorm.weight", + "input_layernorm.bias": "input_layernorm.bias", + "post_attention_layernorm.weight": "post_attention_layernorm.weight", + "post_attention_layernorm.bias": "post_attention_layernorm.bias", + }, + "FINAL_NORM_KEYS": { + "norm.weight": "weight", + "norm.bias": "bias", + }, }, }, "llama": { - "COLUMN_PARALLEL_LINEAR_KEYS": { - "mlp.w1.weight": "mlp.gate_proj.weight", - "mlp.w3.weight": "mlp.up_proj.weight", - }, - "ROW_PARALLEL_LINEAR_KEYS": { - "attention.dense.weight": "self_attn.o_proj.weight", - "mlp.w2.weight": "mlp.down_proj.weight", + "new": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.linear1.weight": ["mlp.up_proj.weight", "mlp.gate_proj.weight"] + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "self_attn.o_proj.weight", + "mlp.linear2.weight": "mlp.down_proj.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": {}, # No biases in RowParallelLinear layers + "NORM_KEYS": { + "input_layernorm.scale": "input_layernorm.weight", + "post_attention_layernorm.scale": "post_attention_layernorm.weight", + }, + "FINAL_NORM_KEYS": { + "norm.scale": "weight", + }, + "GQA_QKV_KEYS": { # because Llama can have Grouped Query Attention and has separate Q, K, and V linear proj params, handle them separately. + "attention.query_key_value.weight": [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + ], + }, }, - "ROW_PARALLEL_BIAS_KEYS": {}, # No biases in RowParallelLinear layers - "NORM_KEYS": { - "input_layernorm.scale": "input_layernorm.weight", - "post_attention_layernorm.scale": "post_attention_layernorm.weight", - }, - "FINAL_NORM_KEYS": { - "norm.scale": "weight", - }, - "GQA_QKV_KEYS": { # because Llama can have Grouped Query Attention and has separate Q, K, and V linear proj params, handle them separately. - "attention.query_key_value.weight": [ - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - ], + "legacy": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.w1.weight": "mlp.gate_proj.weight", + "mlp.w3.weight": "mlp.up_proj.weight", + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "self_attn.o_proj.weight", + "mlp.w2.weight": "mlp.down_proj.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": {}, # No biases in RowParallelLinear layers + "NORM_KEYS": { + "input_layernorm.scale": "input_layernorm.weight", + "post_attention_layernorm.scale": "post_attention_layernorm.weight", + }, + "FINAL_NORM_KEYS": { + "norm.scale": "weight", + }, + "GQA_QKV_KEYS": { # because Llama can have Grouped Query Attention and has separate Q, K, and V linear proj params, handle them separately. + "attention.query_key_value.weight": [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + ], + }, }, }, } @@ -165,7 +219,7 @@ def get_key(loaded_config, key, default=None): return default -def create_config(neox_config, architecture="neox"): +def create_config(neox_config, architecture="neox", is_rm=False, pad_token_id=-1): """take in a loaded yaml from NeoX and assign relevant values to HF config. 
Returns: GPTNeoXConfig() object """ @@ -238,7 +292,9 @@ def __init__(self, neox_config): "num-kv-heads", get_key(neox_config, "num-attention-heads"), ), - "hidden_act": get_key(neox_config, "activation", default="silu"), + "hidden_act": get_key( + neox_config, "activation", default="silu" + ).replace("swiglu", "silu"), "rms_norm_eps": get_key(neox_config, "rms-norm-epsilon", 1.0e-6), "bos_token_id": tokenizer.eod, "eos_token_id": tokenizer.eod, @@ -285,6 +341,9 @@ def __init__(self, neox_config): } ) hf_config = GPTNeoXConfig(**args) + if is_rm: + hf_config.num_labels = 1 + hf_config.pad_token_id = pad_token_id return hf_config @@ -383,6 +442,30 @@ def reshard_and_split_qkv( return state_dict +def get_mlp_naming_convention(loaded_tp_ranks, layer_idx, sequential): + """Determine whether the checkpoint uses the legacy or new MLP naming convention.""" + print(list(loaded_tp_ranks[0]["module"].keys())) + if any( + [ + ["mlp.linear1.weight" in key for key in list(state_dict["module"].keys())] + for state_dict in loaded_tp_ranks + ] + ): + return "new" + elif any( + [ + [ + "mlp.dense_h_to_4h.weight" in key + for key in list(state_dict["module"].keys()) + ] + for state_dict in loaded_tp_ranks + ] + ): + return "legacy" + else: + raise ValueError("Unable to determine MLP naming convention in checkpoint") + + def convert( input_checkpoint_path, loaded_config, @@ -390,6 +473,8 @@ def convert( sequential: bool = True, precision: Literal["auto", "fp16", "bf16", "fp32"] = "auto", architecture: Literal["neox", "llama", "mistral"] = "neox", + is_rm: bool = False, + pad_token_id: int = -1, ): """convert a NeoX checkpoint to a HF model format. should perform model-parallel merging correctly @@ -398,9 +483,14 @@ def convert( ARCH = MODEL_KEYS[architecture] - hf_config = create_config(loaded_config, architecture=architecture) + hf_config = create_config( + loaded_config, architecture=architecture, is_rm=is_rm, pad_token_id=pad_token_id + ) - hf_model = AutoModelForCausalLM.from_config(hf_config) + if not is_rm: + hf_model = AutoModelForCausalLM.from_config(hf_config) + else: + hf_model = AutoModelForSequenceClassification.from_config(hf_config) if architecture == "neox": hf_transformer = hf_model.gpt_neox @@ -474,6 +564,20 @@ def convert( ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {embed_in.shape[0]}" ### End Embedding Layer ### + # grab from 3rd layer to pass embeddings + mlp_naming = get_mlp_naming_convention( + load_partitions( + input_checkpoint_path, + mp_partitions, + layer_idx=3, + sequential=sequential, + ), + 0, + sequential, + ) + print(f"Detected MLP naming convention: {mlp_naming}") + ARCH = ARCH[mlp_naming] + for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): # get layer from hf model @@ -509,12 +613,31 @@ def convert( # LinearWithTPMerge for key, hf_key in ARCH["COLUMN_PARALLEL_LINEAR_KEYS"].items(): - state_dict[hf_key] = torch.cat( - get_state( - loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential - ), - dim=0, - ) + if type(hf_key) == list: + # Llama magic - split the weight into two parts for the gate and up proj + states = [ + torch.chunk(state, chunks=2, dim=0) + for state in get_state( + loaded_tp_ranks, + key, + layer_idx=layer_i + 2, + sequential=sequential, + ) + ] + # Set up proj... + state_dict[hf_key[0]] = torch.cat([state[0] for state in states], dim=0) + # Set gate proj... 
+ state_dict[hf_key[1]] = torch.cat([state[1] for state in states], dim=0) + else: + state_dict[hf_key] = torch.cat( + get_state( + loaded_tp_ranks, + key, + layer_idx=layer_i + 2, + sequential=sequential, + ), + dim=0, + ) # LinearWithTPSplitBias for key, hf_key in ARCH["ROW_PARALLEL_BIAS_KEYS"].items(): @@ -556,10 +679,6 @@ def convert( sequential=sequential, ) # Load final layer norm - if architecture == "neox": - lm_head = hf_model.embed_out - else: - lm_head = hf_model.lm_head norm_state_dict = {} for key, hf_key in ARCH["FINAL_NORM_KEYS"].items(): norm_state_dict[hf_key] = sum( @@ -580,30 +699,64 @@ def convert( # Load output embedding if not sequential: - loaded_tp_ranks = load_partitions( - input_checkpoint_path, - mp_partitions, - get_key(loaded_config, "num-layers") + 4, - sequential=sequential, - ) + if get_key(loaded_config, "no-weight-tying", False): + # if we have trained input + output embedding layers without tied weights + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ) + else: + # in this case, output embedding layer and input embedding layer are tied. + # load + save the input embed weights into the output embedding layer's place. + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + layer_idx=0, + sequential=sequential, + ) # output embedding / LM head - if architecture == "neox": # name of lm head / final linear proj varies - lm_head = hf_model.embed_out + if not is_rm: + if architecture == "neox": # name of lm head / final linear proj varies + lm_head = hf_model.embed_out + else: + lm_head = hf_model.lm_head else: - lm_head = hf_model.lm_head - lm_head.load_state_dict( - { - "weight": torch.cat( - get_state( - loaded_tp_ranks, - "final_linear.weight", - layer_idx=get_key(loaded_config, "num-layers") + 4, - sequential=sequential, + lm_head = hf_model.score + + if get_key(loaded_config, "no-weight-tying", False): + # save the (untied) final linear into LM head for HF + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "final_linear.weight" if not is_rm else "rm_linear.weight", + layer_idx=get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ), + dim=0 if not is_rm else 1, ), - dim=0, - ), - } - ) + } + ) + else: + # don't need to worry about rm here since you can't really tie them... + + # embedding layers are tied. transpose input layer and save + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "word_embeddings.weight", + layer_idx=0, + sequential=sequential, + ), + dim=0, + ), + } + ) del loaded_tp_ranks @@ -642,6 +795,17 @@ def main(input_args=None, overwrite_values=None): action="store_true", help="Whether to skip saving the tokenizer alongside a model.", ) + parser.add_argument( + "--vocab-is-hf-tokenizer", + action="store_true", + help="Whether the vocab file is in a Huggingface tokenizer path.", + ) + parser.add_argument( + "--pad-token-id", + type=int, + default=-1, + help="Pad token id to set in tokenizer. Required for RM style models.", + ) parser.add_argument( "--architecture", type=str, @@ -674,6 +838,9 @@ def main(input_args=None, overwrite_values=None): # while Sequential model state dicts are saved all together in one mp_rank_xx_model_states.pt # file per tensor/model parallel shard. 
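The list-valued mapping handled in the `COLUMN_PARALLEL_LINEAR_KEYS` branch above (`mlp.linear1.weight` -> `up_proj` / `gate_proj`) assumes each tensor-parallel shard stores the fused MLP input projection as two stacked halves: every shard is chunked in two on dim 0, then matching halves are concatenated across shards. A rough standalone sketch of that reshaping, not the converter's own code:

```python
import torch

def split_fused_linear1(shards):
    # Chunk each TP shard into its two halves, then rebuild the full
    # up_proj / gate_proj matrices by concatenating matching halves across
    # shards (first half -> up_proj, second -> gate_proj, per the mapping above).
    halves = [torch.chunk(s, chunks=2, dim=0) for s in shards]
    up_proj = torch.cat([h[0] for h in halves], dim=0)
    gate_proj = torch.cat([h[1] for h in halves], dim=0)
    return up_proj, gate_proj

# two hypothetical TP shards of an 8x4 fused weight
up, gate = split_fused_linear1([torch.randn(4, 4), torch.randn(4, 4)])
assert up.shape == gate.shape == (4, 4)
```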
pipeline_world_size = get_key(loaded_config, "pipe-parallel-size", 1) + is_rm = get_key(loaded_config, "train_impl", "normal") == "rm" + if is_rm and args.pad_token_id == -1: + raise ValueError("RM models require a pad token id to be set.") if pipeline_world_size == 0: sequential = True print( @@ -692,6 +859,8 @@ def main(input_args=None, overwrite_values=None): args.output_dir, sequential=sequential, architecture=args.architecture, + is_rm=is_rm, + pad_token_id=args.pad_token_id, ) # Save to disk. @@ -700,8 +869,18 @@ def main(input_args=None, overwrite_values=None): if not args.no_save_tokenizer: # save tokenizer to directory as well, for easy loading of model as a HF model. tokenizer_type = get_key(loaded_config, "tokenizer-type") + if args.vocab_is_hf_tokenizer: + from transformers import AutoTokenizer - if tokenizer_type == "HFTokenizer": # TODO: handle sentencepiece tokenizers? + tokenizer = AutoTokenizer.from_pretrained( + os.path.dirname(get_key(loaded_config, "vocab-file")) + ) + if args.pad_token_id != -1: + tokenizer.pad_token_id = args.pad_token_id + print("loaded tokenizer: ", tokenizer) + tokenizer.save_pretrained(args.output_dir) + print("tokenizer saved!") + elif tokenizer_type == "HFTokenizer": # TODO: handle sentencepiece tokenizers? print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") print( "Warning: please check that your model config and tokenizer end with the correct special tokens (EOS, BOS)." @@ -711,6 +890,8 @@ def main(input_args=None, overwrite_values=None): tokenizer = PreTrainedTokenizerFast( tokenizer_file=get_key(loaded_config, "vocab-file") ) + if args.pad_token_id != -1: + tokenizer.pad_token_id = args.pad_token_id print("loaded tokenizer: ", tokenizer) tokenizer.save_pretrained(args.output_dir) print("tokenizer saved!") diff --git a/tools/datasets/README.md b/tools/datasets/README.md index f8215959c..af3009a23 100644 --- a/tools/datasets/README.md +++ b/tools/datasets/README.md @@ -93,6 +93,57 @@ output data: --dataset-impl {lazy,cached,mmap} Dataset implementation to use. Default: mmap +runtime: + --workers WORKERS Number of worker processes to launch + --log-interval LOG_INTERVAL + Interval between progress updates +``` +## `preprocess_data_with_chat_template.py` +Similar, but uses huggingface's [chat templates](https://huggingface.co/docs/transformers/main/en/chat_templating) to +tokenize the data to support multiturn and more complicated use cases. + +N.B. If using this, you **must** specify your data when training/finetuning with the following configs +```json +"train_data_paths": ["train_documents"], +"test_data_paths": ["test_documents"], +"valid_data_paths": ["test_documents"], +"label_data_paths": ["label_documents"] +``` + +the `"data_path"` option will not work with `"label_data_paths"`. + + +``` +usage: preprocess_data_with_chat_template.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--no-mask] + [--generation-role GENERATION_ROLE] [--only-last] [--num-docs NUM_DOCS] + --tokenizer-path TOKENIZER_PATH [--ftfy] --output-prefix OUTPUT_PREFIX + [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] + [--log-interval LOG_INTERVAL] + +options: + -h, --help show this help message and exit + +input data: + --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated list + --jsonl-keys JSONL_KEYS [JSONL_KEYS ...] + space separate listed of keys to extract from jsonl. Default: text + --no-mask If set, this will not mask any tokens in the input data. 
+ --generation-role GENERATION_ROLE + The role of the model generating the chat, usually 'assistant'. Default: assistant + --only-last If set, this will mask everything except the last turn in the chat. + --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar. + +tokenizer: + --tokenizer-path TOKENIZER_PATH + Path to HF Tokenizer. + --ftfy Use ftfy to clean text + +output data: + --output-prefix OUTPUT_PREFIX + Path to binary output file without suffix + --dataset-impl {lazy,cached,mmap} + Dataset implementation to use. Default: mmap + runtime: --workers WORKERS Number of worker processes to launch --log-interval LOG_INTERVAL diff --git a/tools/datasets/preprocess_data_with_chat_template.py b/tools/datasets/preprocess_data_with_chat_template.py new file mode 100644 index 000000000..ee2b983b6 --- /dev/null +++ b/tools/datasets/preprocess_data_with_chat_template.py @@ -0,0 +1,416 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +A script for processing a dataset such that chat templates are utilized in the creation of the data. +These are then used to perform instruction/chat model finetunes (for example, finetuning a model on only the assistant +portions of a chatml dataset). + +This follows the same output format as 'preprocess_data_with_mask.py' but using chat templates to generate the data. +This way we can support multiturn chat data in the finetuning process. instead of relying on a single turn of data. + +To run this script, first edit `tools/datasets/corpora.py` such that the command to call + `tools/datasets/preprocess_data_with_chat_template.py` is as follows: + +``` +cmd = f"python tools/datasets/preprocess_data_with_with_chat_template.py \ + --input {jsonl_filepath} \ + --output-prefix {parent_folder}/{self.name} \ + --tokenizer-path {hf-tokenizer} \ + --jsonl-keys {jsonl_keys} \ + --dataset-impl mmap \ + --workers {self.num_workers} " + +if self.only_last: + cmd += f"--only-last " + +if self.no_mask: + cmd += f"--no-mask " +``` + +Then, specify +``` +"train_data_paths": ["/path/to/dataset/name_text_document"], +"label_data_paths": ["/path/to/dataset/name_label_document"] +``` +in your YML config. This will then allow for finetuning on the data with loss masks set appropriately. 
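As a concrete illustration of the expected input, each JSONL line holds a list of role/content turns under the key passed via `--jsonl-keys` (the script's code defaults to `conversation`). The record below is hypothetical:

```python
import json

# One made-up training example in the chat-template format the script expects.
record = {
    "conversation": [
        {"role": "user", "content": "Explain pipeline parallelism in one sentence."},
        {"role": "assistant", "content": "It splits a model's layers across devices and streams microbatches through them."},
    ]
}
with open("chat_data.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")
```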
+ +""" + +import argparse +import multiprocessing +import os +import sys + +import lm_dataformat as lmd +import numpy as np + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) + +import time +import tqdm +import jsonlines + +from megatron.data import indexed_dataset +from threading import Semaphore +from typing import List, Dict, Tuple +from transformers import AutoTokenizer, PreTrainedTokenizer + + +def build_chat( + chat: List[Dict[str, str]], + generation_role: str, + apply_mask: bool, + tokenizer: PreTrainedTokenizer, + only_last_turn: bool = False, + for_rm: bool = False, +) -> Tuple[List[int], List[int]]: + """ + Build a chat from a list of dictionaries. Each dictionary should have a "role" and "content" key, this follows the + Chat Template from https://huggingface.co/docs/transformers/main/en/chat_templating + + :param chat: A list of dictionaries with "role" and "content" keys + :param generation_role: The role of the model generating the chat, usually "assistant" + :param apply_mask: Whether to apply a loss mask to the chat, if False, all tokens will be included in the loss + :param tokenizer: A HF tokenizer + :param only_last_turn: Whether to only include the last turn in the chat, needed for some fine-tuning tasks + """ + tokens = [] + mask = [] + if apply_mask is False: + tokens = tokenizer.apply_chat_template(chat) + mask = tokens + return tokens, mask + elif for_rm: + tokens = tokenizer.apply_chat_template(chat) + mask = [-100] * len(tokens) + if tokenizer.eos_token_id is not None: + # since this is processed in a causal format (input[:-1], mask[1:], we need to put two here... + mask.append(-100) + tokens.append(tokenizer.eos_token_id) + mask.append(tokenizer.eos_token_id) + tokens.append(tokenizer.eos_token_id) + else: + raise ValueError( + "Tokenizer does not have an EOS token, unable to determine good mask, please edit and make your own." + ) + return tokens, mask + for i, turn in enumerate(chat): + add_gen = ( + False if i == len(chat) - 1 else chat[i + 1]["role"] == generation_role + ) + chat_tokens = tokenizer.apply_chat_template( + chat[: i + 1], add_generation_prompt=add_gen + )[len(tokens) :] + # remove previous stuff... 
+ tokens.extend(chat_tokens) + if only_last_turn and (i != len(chat) - 1): + mask.extend([-100] * len(chat_tokens)) + elif apply_mask and (turn["role"] != generation_role): + mask.extend([-100] * len(chat_tokens)) + else: + mask.extend(chat_tokens) + if tokenizer.eos_token_id is not None: + mask.append(tokenizer.eos_token_id if mask[-1] != -100 else -100) + tokens.append(tokenizer.eos_token_id) + return tokens, mask + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_path) + + def encode(self, text): + ids = {} + for key in self.args.jsonl_keys: + text_ids, label_ids = build_chat( + text[key], + self.args.generation_role, + not self.args.no_mask, + Encoder.tokenizer, + self.args.only_last, + self.args.for_rm, + ) + if self.args.reward_key is not None: + reward = text[self.args.reward_key] + if self.args.binary_reward: + reward = [1] if reward else [-1] + elif type(reward) == float: + reward = [reward] + ids[key] = (text_ids, label_ids, reward) + else: + ids[key] = (text_ids, label_ids, None) + return ids, len(text) + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated " + "list", + ) + group.add_argument( + "--jsonl-keys", + nargs="+", + default=["conversation"], + help="space separate listed of keys to extract from jsonl. Default: text", + ) + group.add_argument( + "--no-mask", + help="If set, this will not mask any tokens in the input data.", + action="store_true", + ) + group.add_argument( + "--for-rm", + help="If set, this will mask everything except the last token in the chat.", + action="store_true", + ) + + group.add_argument( + "--generation-role", + type=str, + default="assistant", + help="The role of the model generating the chat, usually 'assistant'. Default: assistant", + ) + group.add_argument( + "--only-last", + help="If set, this will mask everything except the last turn in the chat.", + action="store_true", + ) + group.add_argument( + "--reward-key", + type=str, + default=None, + help="Optional: key to use for reward data in the input data.", + ) + group.add_argument( + "--binary-reward", + help="If set, this will treat the reward data as a boolean.", + action="store_true", + ) + group.add_argument( + "--num-docs", + default=None, + help="Optional: Number of documents in the input data (if known) for an accurate progress bar.", + type=int, + ) + group = parser.add_argument_group(title="tokenizer") + group.add_argument( + "--tokenizer-path", + type=str, + required=True, + help="Path to HF Tokenizer.", + ) + group.add_argument("--ftfy", action="store_true", help="Use ftfy to clean text") + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + group.add_argument( + "--dataset-impl", + type=str, + default="mmap", + choices=["lazy", "cached", "mmap"], + help="Dataset implementation to use. 
Default: mmap", + ) + + group = parser.add_argument_group(title="runtime") + group.add_argument( + "--workers", type=int, default=1, help="Number of worker processes to launch" + ) + group.add_argument( + "--log-interval", + type=int, + default=100, + help="Interval between progress updates", + ) + args = parser.parse_args() + args.keep_empty = False + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.model_parallel_size = 1 + + return args + + +def yield_from_files(fnames: list, semaphore): + """ + Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / + other compressed formats. Also filters out empty documents. + + :param fnames: list of filenames + """ + + def yielder(fname, semaphore): + with open(fname, encoding="utf-8") as f: + reader = jsonlines.Reader(f) + for f in reader: + semaphore.acquire() + yield f + + for fname in fnames: + semaphore.acquire() + + yield from yielder(fname, semaphore) + + +def main(): + args = get_args() + encoder = Encoder(args) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + + # build a semaphore object to stop `yield_from_files` from getting ahead of encoder.encode and + # hence building up memory + semaphore = Semaphore(10000 + args.workers) + + # use multiprocessing to iterate over input documents + fin = yield_from_files(args.input.split(","), semaphore) + + if args.workers > 1: + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + else: + encoder.initializer() + encoded_docs = (encoder.encode(doc) for doc in fin) + + # make a dataset builder for each key in args.jsonl_keys + # each key will output to a different file beginning with args.output_prefix + output_bin_files = {} + output_idx_files = {} + builders = {} + for key in args.jsonl_keys: + output_bin_files[key] = "{}_{}_{}.bin".format( + args.output_prefix, key, "document" + ) + output_idx_files[key] = "{}_{}_{}.idx".format( + args.output_prefix, key, "document" + ) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[key]._dtype = np.int32 + if not args.no_mask: + assert ( + key + "_label" not in args.jsonl_keys + ), "label should not be included as it will be generated according to the mask." + label_key = key + "_label" + output_bin_files[label_key] = "{}_{}_{}.bin".format( + args.output_prefix, label_key, "document" + ) + output_idx_files[label_key] = "{}_{}_{}.idx".format( + args.output_prefix, label_key, "document" + ) + builders[label_key] = indexed_dataset.make_builder( + output_bin_files[label_key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[label_key]._dtype = np.int32 + if args.reward_key is not None: + assert ( + key + "_reward" not in args.jsonl_keys + ), "reward should not be included as it will be generated from the data." 
+ reward_key = key + "_reward" + output_bin_files[reward_key] = "{}_{}_{}.bin".format( + args.output_prefix, reward_key, "document" + ) + output_idx_files[reward_key] = "{}_{}_{}.idx".format( + args.output_prefix, reward_key, "document" + ) + builders[reward_key] = indexed_dataset.make_builder( + output_bin_files[reward_key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[reward_key]._dtype = np.int32 + + # actually do tokenization + proc_start = time.time() + total_bytes_processed = 0 + pbar = tqdm.tqdm() + for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + + # release semaphore so `yield_from_files` can add another file to the buffer + semaphore.release() + + # add each tokenized document / sentence + for key, conv in doc.items(): + tokens = conv[0] + token_mask = conv[1] + reward = conv[2] + builders[key].add_item(np.array(tokens, dtype=builders[key].dtype)) + builders[key + "_label"].add_item( + np.array(token_mask, dtype=builders[key + "_label"].dtype) + ) + if args.reward_key is not None: + builders[key + "_reward"].add_item( + np.array(reward, dtype=builders[key + "_reward"].dtype) + ) + # add indx... + builders[key].end_document() + builders[key + "_label"].end_document() + if args.reward_key is not None: + builders[key + "_reward"].end_document() + if i == 1: + print("key: ", key) + print("tokens: ", tokens) + print("token_mask: ", token_mask) + print("Reward: ", reward) + # log progress + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed / elapsed / 1024 / 1024 + pbar.set_description( + f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed} docs/s, {mbs} MB/s)." + ) + if i != 0: + pbar.update(args.log_interval) + + # save output file + update_keys = args.jsonl_keys + for key in update_keys: + builders[key].finalize(output_idx_files[key]) + builders[key + "_label"].finalize(output_idx_files[key + "_label"]) + if args.reward_key is not None: + builders[key + "_reward"].finalize(output_idx_files[key + "_reward"]) + + +if __name__ == "__main__": + main() diff --git a/train.py b/train.py index 2e4b09954..3e01a6306 100644 --- a/train.py +++ b/train.py @@ -27,6 +27,7 @@ def main(input_args=None, overwrite_values=None): neox_args.configure_distributed_args() neox_args.build_tokenizer() # tokenizer needs to be build in training in order to set the padding vocab neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined + neox_args.initialize_comet() # is initialized if comet directory is defined pretrain(neox_args=neox_args)
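For reference, with hypothetical arguments `--output-prefix data/chats`, `--jsonl-keys conversation`, and `--reward-key` set, the preprocessing builders above should emit roughly the following indexed-dataset files:

```python
# Illustrative paths, derived from the "{prefix}_{key}_document" naming
# used by the dataset builders above.
expected_outputs = [
    "data/chats_conversation_document.bin",
    "data/chats_conversation_document.idx",
    "data/chats_conversation_label_document.bin",   # loss-mask stream
    "data/chats_conversation_label_document.idx",
    "data/chats_conversation_reward_document.bin",  # only with --reward-key
    "data/chats_conversation_reward_document.idx",
]
```

The `train_data_paths` / `label_data_paths` entries in the YML config then point at these datasets, referenced without the `.bin`/`.idx` suffix as in the docstring example above.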