diff --git a/.github/workflows/cpu_ci_on_pr.yml b/.github/workflows/.cpu_ci_on_pr.yml similarity index 58% rename from .github/workflows/cpu_ci_on_pr.yml rename to .github/workflows/.cpu_ci_on_pr.yml index 971640c18..43ce025c0 100644 --- a/.github/workflows/cpu_ci_on_pr.yml +++ b/.github/workflows/.cpu_ci_on_pr.yml @@ -1,3 +1,5 @@ +# This file is hidden (.cpu_ci_on_pr.yml) to minimize the number of runner minutes consumed. + name: "Pull Request CPU Tests" on: @@ -7,7 +9,7 @@ on: jobs: run-tests: - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 # ubuntu-latest currently points to ubuntu-22.04, but 24.04 is in beta - recommend testing on 24.04 and then switching, rather than using ubuntu-latest steps: - name: Checkout Repository uses: actions/checkout@v4 diff --git a/.github/workflows/coverity_scan.yml b/.github/workflows/coverity_scan.yml index a79d0d8fb..128d279cc 100644 --- a/.github/workflows/coverity_scan.yml +++ b/.github/workflows/coverity_scan.yml @@ -17,9 +17,10 @@ jobs: runs-on: ubuntu-latest env: - COV_USER: ${{ secrets.COV_USER }} + COV_USER: ${{ secrets.COV_USER }} # needs to be an email with access to the Coverity stream - add to secrets/actions COVERITY_PROJECT: ${{ secrets.COVERITY_PROJECT }} - COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} + COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} # you can get this token from Coverity stream dashboard: + # https://scan.coverity.com/projects/?tab=project_settings steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/cpu_ci.yml b/.github/workflows/cpu_ci.yml index 9160fccab..6910b8a1c 100644 --- a/.github/workflows/cpu_ci.yml +++ b/.github/workflows/cpu_ci.yml @@ -5,7 +5,7 @@ on: "push" jobs: run-tests: #runs-on: ubuntu-latest - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/cpu_ci_dispatch.yml b/.github/workflows/cpu_ci_dispatch.yml index b1d108b3b..38485d6a6 100644 --- a/.github/workflows/cpu_ci_dispatch.yml +++ b/.github/workflows/cpu_ci_dispatch.yml @@ -10,7 +10,7 @@ on: jobs: run-tests: - runs-on: [ 'test', 'self-hosted' ] + runs-on: ubuntu-22.04 steps: - name: Checkout Repository uses: actions/checkout@v4 diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 3213718df..7b06256bf 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1,6 +1,7 @@ name: Pull Request -on: [pull_request] +#on: [pull_request, workflow_dispatch] +on: workflow_dispatch jobs: pre-commit: @@ -9,7 +10,7 @@ jobs: - uses: actions/checkout@v2 - uses: actions/setup-python@v4 with: - python-version: 3.10 + python-version: "3.10.14" cache: "pip" cache-dependency-path: "**/requirements*.txt" # Need the right version of clang-format @@ -40,10 +41,20 @@ jobs: git commit -m "Update NeoXArgs docs automatically" git push run-tests: - runs-on: self-hosted + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: "3.10.13" + cache-dependency-path: "**/requirements*.txt" - name: prepare data - run: python prepare_data.py + run: python3 prepare_data.py + - name: install pytest + run: python3 -m pip install pytest pytest-forked pyyaml requests wandb + - name: install torch + run: python3 -m pip install torch + - name: install requirements + run: pip install -r requirements/requirements.txt - name: Run Tests run: pytest --forked tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7de35027a..249255306 100644 ---
a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: hooks: - id: codespell args: [ - '--ignore-words-list=reord,dout', # Word used in error messages that need rewording + '--ignore-words-list=reord,dout,te', # Words used in error messages that need rewording. te --> transformerengine --check-filenames, --check-hidden, ] diff --git a/README.md b/README.md index ef97cdc17..c4f2fc23a 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,21 @@ GPT-NeoX leverages many of the same features and technologies as the popular Meg * Cutting edge architectural innovations including rotary and alibi positional embeddings, parallel feedforward attention layers, and flash attention. * Predefined configurations for popular architectures including Pythia, PaLM, Falcon, and LLaMA 1 \& 2 * Curriculum Learning -* Easy connections with the open source ecosystem, including Hugging Face's [tokenizers](https://github.com/huggingface/tokenizers) and [transformers](https://github.com/huggingface/transformers/) libraries, logging via [WandB](https://wandb.ai/site), and evaluation via our [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). +* Easy connections with the open source ecosystem, including Hugging Face's [tokenizers](https://github.com/huggingface/tokenizers) and [transformers](https://github.com/huggingface/transformers/) libraries, experiment monitoring via [WandB](https://wandb.ai/site)/[Comet](https://www.comet.com/site/)/TensorBoard, and evaluation via our [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). ## News +**[9/9/2024]** We now support preference learning via [DPO](https://arxiv.org/abs/2305.18290), [KTO](https://arxiv.org/abs/2402.01306), and reward modeling + +**[9/9/2024]** We now support integration with [Comet ML](https://www.comet.com/site/), a machine learning monitoring platform + +**[5/21/2024]** We now support [RWKV](https://www.rwkv.com/) with pipeline parallelism! See the PRs for [RWKV](https://github.com/EleutherAI/gpt-neox/pull/1198) and [RWKV+pipeline](https://github.com/EleutherAI/gpt-neox/pull/1221) + +**[3/21/2024]** We now support Mixture-of-Experts (MoE) + +**[3/17/2024]** We now support AMD MI250X GPUs + +**[3/15/2024]** We now support [Mamba](https://github.com/state-spaces/mamba) with tensor parallelism! See [the PR](https://github.com/EleutherAI/gpt-neox/pull/1184) + **[8/10/2023]** We now support checkpointing with AWS S3! Activate with the `s3_path` config option (for more detail, see [the PR](https://github.com/EleutherAI/gpt-neox/pull/1010)) **[9/20/2023]** As of https://github.com/EleutherAI/gpt-neox/pull/1035, we have deprecated Flash Attention 0.x and 1.x, and migrated support to Flash Attention 2.x. We don't believe this will cause problems, but if you have a specific use-case that requires old flash support using the latest GPT-NeoX, please raise an issue. @@ -88,7 +100,7 @@ Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherA ### Host Setup -First make sure you are in an environment with Python 3.8 with an appropriate version of PyTorch 1.8 or later installed. **Note:** Some of the libraries that GPT-NeoX depends on have not been updated to be compatible with Python 3.10+. Python 3.9 appears to work, but this codebase has been developed and tested for Python 3.8. +This codebase has been primarily developed and tested for Python 3.8-3.10 and PyTorch 1.8-2.0.
This is not a strict requirement, and other versions and combinations of libraries may work. To install the remaining basic dependencies, run: @@ -96,6 +108,7 @@ To install the remaining basic dependencies, run: pip install -r requirements/requirements.txt pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard +pip install -r requirements/requirements-comet.txt # optional, if logging via Comet ``` from the repository root. @@ -294,7 +307,7 @@ You can then run any job you want from inside the container. Concerns when running for a long time or in detached mode include - You will have to terminate the container manually when you are no longer using it - If you want processes to continue running when your shell session ends, you will need to background them. - - If you then want logging, you will have to make sure to pipe logs to disk or set up wandb. + - If you then want logging, you will have to make sure to pipe logs to disk, and set up wandb and/or Comet logging. If you prefer to run the prebuilt container image from dockerhub, you can run the docker compose commands with ```-f docker-compose-dockerhub.yml``` instead, e.g., @@ -457,7 +470,7 @@ You can pass in an arbitrary number of configs which will all be merged at runti You can also optionally pass in a config prefix, which will assume all your configs are in the same folder and append that prefix to their path. -E.G: +For example: ```bash python ./deepy.py train.py -d configs 125M.yml local_setup.yml @@ -574,15 +587,28 @@ To convert from a Hugging Face model into a NeoX-loadable, run `tools/ckpts/conv # Monitoring -In addition to storing logs locally, we provide built-in support for two popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site) and [TensorBoard](https://www.tensorflow.org/tensorboard/) +In addition to storing logs locally, we provide built-in support for three popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site), [TensorBoard](https://www.tensorflow.org/tensorboard/), and [Comet](https://www.comet.com/site) ## Weights and Biases -EleutherAI is currently using [Weights & Biases to record our experiments](https://wandb.ai/eleutherai/neox). If you are logged into Weights & Biases on your machine—you can do this by executing `wandb login`—your runs will automatically be recorded. There are two optional fields associated with Weights & Biases: wandb_group allows you to name the run group and wandb_team allows you to assign your runs to an organization or team account. +[Weights & Biases](https://wandb.ai/site) is a machine learning monitoring platform that EleutherAI uses to [record our experiments](https://wandb.ai/eleutherai/neox). To use wandb to monitor your gpt-neox experiments: +1. Create an account at https://wandb.ai/site to generate your API key. +2. Log into Weights & Biases on your machine by executing `wandb login`; your runs will then be recorded automatically. +3. Dependencies required for wandb monitoring can be found in and installed from `./requirements/requirements-wandb.txt`. +4. There are two optional fields associated with Weights & Biases: `wandb_group` allows you to name the run group and `wandb_team` allows you to assign your runs to an organization or team account. An example config is provided in `./configs/local_setup_wandb.yml`.
## TensorBoard -We also support using TensorBoard via the tensorboard-dir field. Dependencies required for TensorBoard monitoring can be found in and installed from `./requirements/requirements-tensorboard.txt`. +We support using TensorBoard via the tensorboard-dir field. Dependencies required for TensorBoard monitoring can be found in and installed from `./requirements/requirements-tensorboard.txt`. + +## Comet + +[Comet](https://www.comet.com/site) is a machine learning monitoring platform. To use comet to monitor your gpt-neox experiments: +1. Create an account at https://www.comet.com/login to generate your API key. +2. Once generated, link your API key at runtime by running `comet login` or passing `export COMET_API_KEY=` +3. Install `comet_ml` and any dependency libraries via `pip install -r requirements/requirements-comet.txt` +4. Enable Comet with `use_comet: True`. You can also customize where data is being logged with `comet_workspace` and `comet_project`. A full example config with comet enabled is provided in `configs/local_setup_comet.yml`. +5. Run your experiment, and monitor metrics in the Comet workspace that you passed! # Running on multi-node @@ -594,7 +620,9 @@ We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memo ## Nsight Systems Profiling -To use the Nsight Systems profiling, set config options `profile`, `profile_step_start`, and `profile_step_stop`. Launch training with: +To use the Nsight Systems profiling, set config options `profile`, `profile_step_start`, and `profile_step_stop` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config). + +To populate nsys metrics, launch training with: ``` nsys profile -s none -t nvtx,cuda -o --force-overwrite true \ @@ -604,22 +632,22 @@ $TRAIN_PATH/train.py --conf_dir configs The generated output file can then by viewed with the Nsight Systems GUI: -![Alt text](images/nsight_profiling.png) +![nsight-prof](images/nsight_profiling.png) ## PyTorch Profiling -To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop`. +To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config). The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html). -![Alt text](images/pytorch_profiling.png) +![torch-prof](images/pytorch_profiling.png) ## PyTorch Memory Profiling -To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`. +To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config). -![Alt text](images/memory_profiling.png) +![mem-prof](images/memory_profiling.png) View the generated profile with the [memory_viz.py](https://github.com/pytorch/pytorch/blob/main/torch/cuda/_memory_viz.py) script. 
Run with: @@ -677,7 +705,7 @@ The following publications by other research groups use this library: The following models were trained using this library: ### English LLMs -- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b), [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia), and [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) +- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) and [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia) - CarperAI's [FIM-NeoX-1.3B](https://huggingface.co/CarperAI/FIM-NeoX-1.3B) - StabilityAI's [StableLM (3B and 7B)](https://github.com/Stability-AI/StableLM) - Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1) @@ -688,13 +716,15 @@ The following models were trained using this library: ### Non-English LLMs - EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean) - Korea University's [KULLM-Polyglot (5.8B and 12.8B)](https://github.com/nlpai-lab/KULLM) (Korean) -- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) +- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) (Japanese) - LearnItAnyway's [LLaVA-Polyglot-Ko (1.3B)](https://huggingface.co/LearnItAnyway/llava-polyglot-ko-1.3b-hf) (Korean) - Rinna Co.'s [japanese-gpt-neox-3.6b](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) (Japanese) and [bilingual-gpt-neox-4b](https://huggingface.co/rinna/bilingual-gpt-neox-4b) (English / Japanese) - CyberAgent's [Open-CLM (125M through 7B)](https://huggingface.co/cyberagent/open-calm-7b) (Japanese) - The Hungarian Research Centre for Linguistics's [PULI GPTrio (6.7B)](https://huggingface.co/NYTK/PULI-GPTrio) (Hungarian / English / Chinese) - The University of Tokyo's [weblab-10b](https://huggingface.co/Kojima777/weblab-10b) and [weblab-10b-instruct](https://huggingface.co/Kojima777/weblab-10b-instruction-sft) (Japanese) - nolando.ai's [Hi-NOLIN (9B)](https://blog.nolano.ai/Hi-NOLIN/) (English, Hindi) +- Renmin University of China's [YuLan (12B)](https://huggingface.co/yulan-team/YuLan-Base-12b) (English, Chinese) +- The Basque Center for Language Technology's [Latxa (70B)](https://huggingface.co/HiTZ/latxa-70b-v1.2) (Basque) ### Code Models - Carnegie Mellon University's [PolyCoder (160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM (2.7B)](https://huggingface.co/nikitharao/catlm) - CodeFuse AI's [CodeFuse (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B) ### AI for Science +- EleutherAI's [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) - Oak Ridge National Lab's [FORGE (26B)](https://github.com/at-aaims/forge) -- Oak Ridge National Lab and EleutherAI's [Unnamed Material Science Domain Models (7B)](https://github.com/at-aaims/forge) +- Oak Ridge National Lab's [Unnamed Material Science Domain Models (7B)](https://arxiv.org/abs/2402.00691) - Pacific Northwest National Lab's [MolJet (undisclosed size)](https://openreview.net/pdf?id=7UudBVsIrr) ### Other Modalities +- Rinna Co.'s [PSLM (7B)](https://arxiv.org/abs/2406.12428) (speech / text) - University College London's [ChessGPT-3B](https://huggingface.co/Waterhorse/chessgpt-base-v1) - Gretel's [Text-to-Table (3B)](https://huggingface.co/gretelai/text2table) diff --git a/configs/README.md b/configs/README.md index d8ae81739..ac20ed89b
100644 --- a/configs/README.md +++ b/configs/README.md @@ -9,7 +9,7 @@ Below is an example configuration `.yaml` to train a ~160M parameter GPT model. For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md) -Note: yaml arguments may be formatted with either '-' or '_'. The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. +Note: yaml arguments may be formatted with either '-' or '\_'. The standard separator used is a '\_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. ```yaml # GPT-3 pretraining setup { @@ -124,6 +124,8 @@ These can be set to any integer between `0` and `num_gpus`, and `num_gpus` must # this should provide some speedup but takes a while to build, set to true if desired "scaled_upper_triang_masked_softmax_fusion": false, "train_iters": 320000, + # alternatively, use train_epochs to automatically determine the number of training iterations + #"train_epochs": 1, ``` An example of some basic settings used to configure your model's architecture and number of training steps. @@ -235,6 +237,58 @@ Additional DeepSpeed settings besides those mentioned above should be wrapped in "eval_iters": 10, ``` +For KTO style training, you'll need to add the reward & label data path, e.g.: + +```yaml + "data_impl": "mmap", + # Suggested data paths when using GPT-NeoX locally + "train_data_path": "data/enwik8/enwik8_text_document", + "train_label_data_path": "data/enwik8/enwik8_text_label_document", + "train_reward_data_path": "data/enwik8/enwik8_text_reward_document", + "test_data_path": "data/enwik8/enwik8_text_document", + "test_label_data_path": "data/enwik8/enwik8_text_label_document", + "test_reward_data_path": "data/enwik8/enwik8_text_reward_document", + "valid_data_path": "data/enwik8/enwik8_text_document", + "valid_label_data_path": "data/enwik8/enwik8_text_label_document", + "valid_reward_data_path": "data/enwik8/enwik8_text_reward_document", + "vocab_file": "data/gpt2-vocab.json", + "merge_file": "data/gpt2-merges.txt", + "save": "checkpoints", + "load": "checkpoints", + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, +``` + +For DPO style training, you'll need to set pos/neg data paths instead of a single one, e.g. + +```yaml + "dataset_impl": "pairwise", + "train_impl": "dpo", + "pack_impl": "unpacked", + "dpo_beta": 0.1, + "dpo_fp32": true, + "pos_train_data_path": "data/enwik8/enwik8_text_pos_document", + "pos_valid_data_path": "data/enwik8/enwik8_text_pos_document", + "pos_test_data_path": "data/enwik8/enwik8_text_pos_document", + "neg_train_data_path": "data/enwik8/enwik8_text_neg_document", + "neg_valid_data_path": "data/enwik8/enwik8_text_neg_document", + "neg_test_data_path": "data/enwik8/enwik8_text_neg_document", + ## If you have labels... 
(likely to mask out user turns) + "pos_train_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "pos_valid_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "pos_test_label_data_path": "data/enwik8/enwik8_text_pos_label_document", + "neg_train_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + "neg_valid_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + "neg_test_label_data_path": "data/enwik8/enwik8_text_neg_label_document", + ## If you want to precompute the logits over your dataset... + "precompute_model_name": "gpt2", + ## Needed for the generation.py step, if precomputing + "text_gen_type": "precompute" +``` + ### LR Scheduler settings ```yaml diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml index 305567be1..a7470cae8 100644 --- a/configs/llama/13B.yml +++ b/configs/llama/13B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 40, "hidden_size": 5120, + "intermediate_size": 40960, "num_attention_heads": 40, "seq_length": 2048, "max_position_embeddings": 2048, @@ -16,11 +17,12 @@ "output_layer_parallelism": "column", "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-6, + "use_bias_in_mlp": False, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml index 450f8da38..234445c77 100644 --- a/configs/llama/30B.yml +++ b/configs/llama/30B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 60, "hidden_size": 6656, + "intermediate_size": 53248, "num_attention_heads": 52, "seq_length": 2048, "max_position_embeddings": 2048, @@ -16,11 +17,12 @@ "output_layer_parallelism": "column", "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-6, + "use_bias_in_mlp": False, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml index 85f199ce2..8ffffe241 100644 --- a/configs/llama/65B.yml +++ b/configs/llama/65B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 80, "hidden_size": 8192, + "intermediate_size": 65536, "num_attention_heads": 64, "seq_length": 2048, "max_position_embeddings": 2048, @@ -16,11 +17,12 @@ "output_layer_parallelism": "column", "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-6, + "use_bias_in_mlp": False, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml index ecbf187a8..0d7c40b24 100644 --- a/configs/llama/7B.yml +++ b/configs/llama/7B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 32, "hidden_size": 4096, + "intermediate_size": 32768, "num_attention_heads": 32, "seq_length": 2048, "max_position_embeddings": 2048, @@ -16,11 +17,12 @@ "output_layer_parallelism": "column", "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-6, + "use_bias_in_mlp": False, "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git 
a/configs/llama/train_config.yml b/configs/llama/train_config.yml index 64d8ff422..459332609 100644 --- a/configs/llama/train_config.yml +++ b/configs/llama/train_config.yml @@ -70,4 +70,5 @@ "steps_per_print": 10, "keep_last_n_checkpoints": 4, "wall_clock_breakdown": true, + } diff --git a/configs/llama2/13B.yml b/configs/llama2/13B.yml index 973b8bea4..7df5ad3ea 100644 --- a/configs/llama2/13B.yml +++ b/configs/llama2/13B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 40, "hidden_size": 5120, + "intermediate_size": 41472, "num_attention_heads": 40, "seq_length": 4096, "max_position_embeddings": 4096, @@ -21,6 +22,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama2/70B.yml b/configs/llama2/70B.yml index 615ae5d68..d175e146e 100644 --- a/configs/llama2/70B.yml +++ b/configs/llama2/70B.yml @@ -6,7 +6,7 @@ # model settings "num_layers": 80, "hidden_size": 8192, - "intermediate_size": 28672, + "intermediate_size": 86016, "num_attention_heads": 64, "num_kv_heads": 8, "seq_length": 4096, @@ -26,6 +26,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama2/7B.yml b/configs/llama2/7B.yml index 6a5c97e64..cdb63f02e 100644 --- a/configs/llama2/7B.yml +++ b/configs/llama2/7B.yml @@ -6,6 +6,7 @@ # model settings "num_layers": 32, "hidden_size": 4096, + "intermediate_size": 32768, "num_attention_heads": 32, "seq_length": 4096, "max_position_embeddings": 4096, @@ -21,6 +22,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama2/codellama_34B.yml b/configs/llama2/codellama_34B.yml index 88e9afaf6..e4cb2fc78 100644 --- a/configs/llama2/codellama_34B.yml +++ b/configs/llama2/codellama_34B.yml @@ -27,6 +27,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llama2/codellama_7B.yml b/configs/llama2/codellama_7B.yml index be123ebee..e8775f3eb 100644 --- a/configs/llama2/codellama_7B.yml +++ b/configs/llama2/codellama_7B.yml @@ -26,6 +26,6 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, } diff --git a/configs/llemma/34B.yml b/configs/llemma/34B.yml index bd72d7e23..1a693c7f4 100644 --- a/configs/llemma/34B.yml +++ b/configs/llemma/34B.yml @@ -30,8 +30,8 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, "optimizer": { "type": "Adam", diff --git a/configs/llemma/7B.yml b/configs/llemma/7B.yml index fb72c8c18..363cf4315 100644 --- a/configs/llemma/7B.yml +++ b/configs/llemma/7B.yml @@ -28,8 +28,8 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", + "mlp_multiple_of": 256, "optimizer": { "type": "Adam", diff --git a/configs/local_setup.yml 
b/configs/local_setup.yml index d031a2ad8..b8ec4b06a 100644 --- a/configs/local_setup.yml +++ b/configs/local_setup.yml @@ -24,7 +24,4 @@ "tensorboard_dir": "tensorboard", "log_dir": "logs", - "use_wandb": True, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" } diff --git a/configs/local_setup_comet.yml b/configs/local_setup_comet.yml new file mode 100644 index 000000000..12ff7b388 --- /dev/null +++ b/configs/local_setup_comet.yml @@ -0,0 +1,33 @@ +# Suggested data paths when using GPT-NeoX locally +{ + "data_path": "/workspace/gpt-neox-main/data/enwik8/enwik8_text_document", + + # or for weighted datasets: + # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "train-data-weights": [1., 2.], + # "test-data-weights": [2., 1.], + # "valid-data-weights": [0.5, 0.4], + + # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. + # WARNING: setting this to True will override any user provided weights + # "weight_by_num_documents": false, + # "weighted_sampler_alpha": 0.3, + + "vocab_file": "/workspace/gpt-neox-main/data/gpt2-vocab.json", + "merge_file": "/workspace/gpt-neox-main/data/gpt2-merges.txt", + + "save": "checkpoints", + "load": "checkpoints", + "checkpoint_validation_with_forward_pass": False, + + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "use_comet": True, + # "comet_workspace": "test_workspace", # CHANGE ME + "comet_project": "test_project", + "comet_experiment_name": "test_experiment", + "comet_tags": ["test_tag1", "test_tag2"], + "comet_others": {"test_others"}, +} diff --git a/configs/local_setup_wandb.yml b/configs/local_setup_wandb.yml new file mode 100644 index 000000000..d031a2ad8 --- /dev/null +++ b/configs/local_setup_wandb.yml @@ -0,0 +1,30 @@ +# Suggested data paths when using GPT-NeoX locally +{ + "data_path": "data/enwik8/enwik8_text_document", + + # or for weighted datasets: + # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], + # "train-data-weights": [1., 2.], + # "test-data-weights": [2., 1.], + # "valid-data-weights": [0.5, 0.4], + + # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. 
+ # WARNING: setting this to True will override any user provided weights + # "weight_by_num_documents": false, + # "weighted_sampler_alpha": 0.3, + + "vocab_file": "data/gpt2-vocab.json", + "merge_file": "data/gpt2-merges.txt", + + "save": "checkpoints", + "load": "checkpoints", + "checkpoint_validation_with_forward_pass": False, + + "tensorboard_dir": "tensorboard", + "log_dir": "logs", + "use_wandb": True, + "wandb_host": "https://api.wandb.ai", + "wandb_project": "neox" +} diff --git a/configs/mamba/mamba-1.4B.yml b/configs/mamba/mamba-1.4B.yml index 2898a72fd..eae467d0e 100644 --- a/configs/mamba/mamba-1.4B.yml +++ b/configs/mamba/mamba-1.4B.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0002, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00002, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 1, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-130M.yml b/configs/mamba/mamba-130M.yml index d9a6ab92e..bd05723b2 100644 --- a/configs/mamba/mamba-130M.yml +++ b/configs/mamba/mamba-130M.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0006, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00006, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + + # precision settings + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-2.8B.yml b/configs/mamba/mamba-2.8B.yml index 1aacb264b..d5afef368 100644 --- a/configs/mamba/mamba-2.8B.yml +++ b/configs/mamba/mamba-2.8B.yml @@ -19,5 +19,71 @@ "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00016, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000016, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-370M.yml b/configs/mamba/mamba-370M.yml index 5e5a78cca..0058f1c0e 100644 --- a/configs/mamba/mamba-370M.yml +++ b/configs/mamba/mamba-370M.yml @@ -12,12 +12,77 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, - "attention_config": [[["mamba"], 64]], + "attention_config": [[["mamba"], 48]], "mamba_selective_scan_fusion": true, "mamba_causal_conv_fusion": true, "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0003, + "betas": [0.9, 0.95], + "eps": 1.0e-8, + } + }, + "min_lr": 0.00003, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. 
training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mamba/mamba-790M.yml b/configs/mamba/mamba-790M.yml index fcd324d9d..4aef7e813 100644 --- a/configs/mamba/mamba-790M.yml +++ b/configs/mamba/mamba-790M.yml @@ -12,12 +12,78 @@ "norm": "rmsnorm", "rms_norm_epsilon": 1.0e-5, - "attention_config": [[["mamba"], 64]], + "attention_config": [[["mamba"], 48]], "mamba_selective_scan_fusion": true, "mamba_causal_conv_fusion": true, "mamba_inner_func_fusion": true, # supersedes scan or conv fusion "activation": "silu", - "output_layer_init_method": "single_residual_scaled_normal", + # init methods + "init_method": "small_init", + "output_layer_init_method": "single_residual_scaled_normal", + + # optimizer settings + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00025, + "betas": [0.9, 0.999], + "eps": 1.0e-8, + } + }, + "min_lr": 0.000025, + + # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training + "zero_optimization": { + "stage": 1, + "allgather_partitions": True, + "allgather_bucket_size": 500000000, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 500000000, + "contiguous_gradients": True, + }, + + # batch / data settings + "train_micro_batch_size_per_gpu": 4, + "data_impl": "mmap", + + # activation checkpointing + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + # regularization + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + # precision settings + "fp16": { + "fp16": true, + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + # misc. training settings + "train_iters": 320000, + "lr_decay_iters": 320000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.01, + "checkpoint_factor": 10000, + "eval_interval": 1000, + "eval_iters": 10, + + # logging + "log_interval": 100, + "steps_per_print": 10, + "keep_last_n_checkpoints": 4, + "wall_clock_breakdown": true, } diff --git a/configs/mistral/7B.yml b/configs/mistral/7B.yml index 587fe5d36..ba4e543d6 100644 --- a/configs/mistral/7B.yml +++ b/configs/mistral/7B.yml @@ -33,8 +33,7 @@ "bias_gelu_fusion": false, "use_bias_in_norms": false, "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", + "activation": "swiglu", "tokenizer_type": "SPMTokenizer", #"vocab-file": ".../mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Mistral-7B-v0.1 direct download diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index a39b8a058..45e1ab196 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -14,14 +14,19 @@ LR Scheduler Arguments Learning rate decay function. Choose from 'constant', 'linear', 'cosine', 'exponential'. - - **lr_decay_iters**: int Default = None - Number of iterations to decay learning rate over, If None defaults to --train-iters + Number of iterations to decay learning rate over. If None, defaults to + --train-iters or the equivalent inferred value from train_epochs. 
+ +- **lr_decay_fraction**: float + Default = None + Effective fraction of training over which to decay lr. Overrides lr_decay_iters. + Useful when specifying train_epochs. - **min_lr**: float @@ -133,6 +138,54 @@ Logging Arguments +- **use_comet**: bool + + Default = None + + Flag indicating if comet is to be used. + + + +- **comet_workspace**: Optional + + Default = None + + Comet workspace name, if not configured Comet Experiments will be created in the user configured default workspace. + + + +- **comet_project**: Optional + + Default = None + + Comet project name, if not configured Comet Experiments will be created in the Uncategorized Experiments project. + + + +- **comet_experiment_name**: Optional + + Default = None + + Custom name for the Comet experiment. If not provided, a random name is used. + + + +- **comet_tags**: Optional + + Default = None + + List of tags to attach to the created Comet Experiment. + + + +- **comet_others**: Optional + + Default = None + + Custom metadata to attach to the created Comet Experiment. + + + - **log_interval**: int Default = 100 @@ -337,9 +390,23 @@ Model Arguments Default = None - Transformer intermediate size. Currently only used for "mlp_type": "llama". + Transformer intermediate size. Default = 4h - If not passed, will be set to a reasonable default. + + +- **mlp_multiple_of**: int + + Default = 1 + + force mlp size to be a multiple of this value + + + +- **expansion_factor**: float + + Default = None + + Transformer intermediate size. Default = 4 @@ -391,11 +458,11 @@ Model Arguments -- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] +- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm', 'te_rmsnorm', 'te_layernorm'] Default = layernorm - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm". @@ -407,6 +474,14 @@ Model Arguments +- **rmsnorm_fusion**: bool + + Default = False + + Use fused RMS norm kernel (if `norm` is `rmsnorm`). + + + - **use_qk_layernorm**: bool Default = False @@ -553,11 +628,19 @@ Model Arguments -- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] +- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu', 'reglu', 'swiglu', 'bilinear', 'glu'] Default = gelu - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "reglu", "swiglu", "bilinear", "glu"] + + + +- **use_flashattn_swiglu**: bool + + Default = False + + Use flash attention's version of swiglu @@ -737,13 +820,11 @@ Model Arguments -- **mlp_type**: str +- **use_bias_in_mlp**: bool - Default = regular + Default = True - Types: - regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) + If false, mlps will not have bias terms @@ -818,6 +899,29 @@ Model Arguments +- **dim_att**: int + + Default = None + + Total dimension of the attention mechanism for RWKV. If not set, defaults to hidden_size. + + + +- **head_size**: int + + Default = None + + Size of each attention head for RWKV. Calculated as dim_att // num_attention_heads. + + + +- **ffn_dim**: int + + Default = None + + Dimension of the feed-forward network for RWKV. If not set, calculated based on hidden_size and expansion_factor. 
+ + ## NeoXArgsOptimizer Optimizer Arguments @@ -1112,6 +1216,16 @@ Parallelism Arguments +- **sequence_parallel**: bool + + Default = False + + flag to determine whether Megatron-style Sequence Parallelism (https://arxiv.org/abs/2205.05198) + (Layernorm inputs and activations are sharded across model parallel group) will be used. Has no effect when model_parallel_size is 1. + **Set by user, in contrast to neox_args.is_pipe_parallel.** + + + ## NeoXArgsTemplate NeoXArgsTemplate() @@ -1129,7 +1243,15 @@ Text Generation arguments Default = None How to generate text/sample the model. - Options: `unconditional`, `input-file`, `interactive` + Options: `unconditional`, `input-file`, `interactive`, `precompute` + + + +- **precompute_model_name**: str + + Default = None + + Model name to use for saving precomputed logprobs @@ -1287,11 +1409,19 @@ Training Arguments -- **label_data_paths**: list +- **train_label_data_paths**: list Default = None - List of paths to label datasets (not shifted by 1 yet!). + List of paths to train label datasets (not shifted by 1 yet!). + + + +- **train_reward_data_paths**: list + + Default = None + + List of paths to train reward datasets @@ -1303,6 +1433,22 @@ Training Arguments +- **test_label_data_paths**: list + + Default = None + + List of paths to test label datasets (not shifted by 1 yet!). + + + +- **test_reward_data_paths**: list + + Default = None + + List of paths to test reward datasets + + + - **valid_data_paths**: list Default = None @@ -1311,6 +1457,118 @@ Training Arguments +- **valid_label_data_paths**: list + + Default = None + + List of paths to validation label datasets (not shifted by 1 yet!). + + + +- **valid_reward_data_paths**: list + + Default = None + + List of paths to validation reward datasets + + + +- **pos_train_data_paths**: list + + Default = None + + + + + +- **neg_train_data_paths**: list + + Default = None + + List of paths to positive and negative training datasets. + + + +- **pos_train_label_data_paths**: list + + Default = None + + + + + +- **neg_train_label_data_paths**: list + + Default = None + + List of paths to positive and negative training label datasets (not shifted by 1 yet!). + + + +- **pos_valid_data_paths**: list + + Default = None + + + + + +- **neg_valid_data_paths**: list + + Default = None + + List of paths to positive and negative validation datasets. + + + +- **pos_valid_label_data_paths**: list + + Default = None + + + + + +- **neg_valid_label_data_paths**: list + + Default = None + + List of paths to positive and negative validation label datasets (not shifted by 1 yet!). + + + +- **pos_test_data_paths**: list + + Default = None + + + + + +- **neg_test_data_paths**: list + + Default = None + + List of paths to positive and negative test datasets. + + + +- **pos_test_label_data_paths**: list + + Default = None + + + + + +- **neg_test_label_data_paths**: list + + Default = None + + List of paths to positive and negative test label datasets (not shifted by 1 yet!). + + + - **train_data_weights**: list Default = None @@ -1378,6 +1636,99 @@ Training Arguments +- **pack_impl**: typing.Literal['packed', 'pack_until_overflow', 'unpacked'] + + Default = packed + + Packing implementation, can be one of "packed", "pack_until_overflow", or "unpacked". 
+ + warning: pack_until_overflow is very naive and will likely have issues with pretraining scale datasets + + + +- **dataset_impl**: typing.Literal['gpt2', 'pairwise'] + + Default = gpt2 + + Dataset implementation, can be one of "gpt2" or "pairwise" + + + +- **train_impl**: typing.Literal['normal', 'dpo', 'rm', 'kto'] + + Default = normal + + Training implementation, can be one of "normal", "dpo", "kto", or "rm" + + + +- **dpo_fp32**: bool + + Default = True + + Whether to cast logits to fp32 for DPO loss calculation. + + + +- **dpo_reference_free**: bool + + Default = False + + Whether to use reference-free DPO. + + + +- **dpo_beta**: float + + Default = 0.1 + + Beta value for DPO + + + +- **kto_fp32**: bool + + Default = True + + Whether to cast logits to fp32 for KTO loss calculation. + + + +- **kto_desirable_weight**: float + + Default = 1.0 + + Weight for desirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + + + +- **kto_undesirable_weight**: float + + Default = 1.0 + + Weight for undesirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + + + +- **kto_beta**: float + + Default = 0.1 + + Beta value for KTO + + + +- **allow_chopped**: bool + + Default = True + + WARNING: if your packing impl is packed, this is ignored. + + Allow chopped samples in the dataset. + (e.g if your sequence length is 1024 and you have a sample of length 1026, it will be chopped to 1024) + + + - **mmap_warmup**: bool Default = False @@ -1524,6 +1875,15 @@ Training Arguments +- **train_epochs**: int + + Default = None + + Number of epochs to run for training. Do not specify both train_epochs and train_iters. + Not currently compatible with data reweighing, pairwise datasets, and packing other than 'packed' + + + - **eval_iters**: int Default = 100 diff --git a/configs/prof.yml b/configs/prof.yml new file mode 100644 index 000000000..c2f2ee118 --- /dev/null +++ b/configs/prof.yml @@ -0,0 +1,17 @@ +# Sample profiling config +{ + # Turns on nsys and pytorch profiling + "profile": true, + + # pytorch profiler options + "profile_step_start": 10, + "profile_step_stop": 12, + + # pytorch memory profiler options + "memory_profiling": true, + "memory_profiling_path": tensorboard, + + + # All trace files (pytorch, nsys, tensorboard, etc) will be written here + "tensorboard_dir": "tensorboard", +} diff --git a/configs/slurm_local.json b/configs/slurm_local.json index 36e16089b..4b9ce5c56 100644 --- a/configs/slurm_local.json +++ b/configs/slurm_local.json @@ -4,8 +4,5 @@ "save": "checkpoints", "checkpoint_validation_with_forward_pass": false, "tensorboard-dir": "tensorboard", - "log-dir": "logs", - "use_wandb": true, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" + "log-dir": "logs" } diff --git a/configs/slurm_local.yml b/configs/slurm_local.yml index 1a2b73aba..3aa3f3742 100644 --- a/configs/slurm_local.yml +++ b/configs/slurm_local.yml @@ -6,7 +6,4 @@ "checkpoint_validation_with_forward_pass": false, "tensorboard_dir": "tensorboard", "log_dir": "logs", - "use_wandb": true, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" } diff --git a/eval.py b/eval.py index 93093f21d..53bd21e0c 100644 --- a/eval.py +++ b/eval.py @@ -54,6 +54,7 @@ def main(input_args=None, overwrite_values=None): v2, neox_args.iteration, use_wandb=neox_args.use_wandb, + comet_experiment=neox_args.comet_experiment, ) else: tb_wandb_log( @@ -61,6 +62,7 @@ def main(input_args=None, overwrite_values=None): v, neox_args.iteration, 
use_wandb=neox_args.use_wandb, + comet_experiment=neox_args.comet_experiment, ) pprint(results) diff --git a/generate.py b/generate.py index 743e350d0..e19ef2e0e 100755 --- a/generate.py +++ b/generate.py @@ -23,6 +23,7 @@ generate_samples_from_prompt, generate_samples_unconditional, generate_samples_interactive, + precompute_logits, ) @@ -83,6 +84,8 @@ def main(input_args=None, overwrite_values=None): top_p=neox_args.top_p, ) + elif neox_args.text_gen_type == "precompute": + precompute_logits(neox_args=neox_args, model=model) else: raise ValueError( f"`text_gen_type` either not specified or not recognised: {neox_args.text_gen_type}" diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 12b81e202..1b6909c9f 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -393,6 +393,7 @@ def load_checkpoint( load_lr_scheduler_states=load_optim_and_scheduler, load_module_only=not load_optim_and_scheduler, tag=tag, + load_module_strict=neox_args.train_impl != "rm", ) if checkpoint_name is None: diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py index bc5754cdb..c08b60151 100644 --- a/megatron/data/data_utils.py +++ b/megatron/data/data_utils.py @@ -16,13 +16,14 @@ import torch import numpy as np from typing import List, Tuple -from itertools import zip_longest +from itertools import zip_longest, cycle from functools import partial from megatron import mpu, print_rank_0 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset from megatron.data.blendable_dataset import BlendableDataset from megatron.data.gpt2_dataset import GPT2Dataset +from megatron.data.pairwise_dataset import PairwiseDataset from megatron.data.samplers import DistributedBatchSampler @@ -53,39 +54,122 @@ def make_data_loader(dataset, neox_args): def build_the_dataset( data_prefix, + pos_data_prefix, + neg_data_prefix, name, data_impl, + pack_impl, + dataset_impl, + allow_chopped, num_samples, + num_epochs, seq_length, seed, skip_warmup, build_index_mappings=True, label_prefix=None, + pos_label_prefix=None, + neg_label_prefix=None, + precompute_model_name=None, + reward_prefix=None, ): """Build train/valid/test datasets.""" - - indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) - if label_prefix is None: - label_dataset = None + if dataset_impl == "gpt2": + indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) + if label_prefix is None: + label_dataset = None + else: + label_dataset = make_indexed_dataset(label_prefix, data_impl, skip_warmup) + if precompute_model_name is not None: + # If we have the name, assume it exists. If it doesn't, it will just be None which is fine. 
+ precompute_indexed_dataset = make_indexed_dataset( + data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) + precompute_indexed_dataset = precompute_indexed_dataset + else: + precompute_indexed_dataset = None + if reward_prefix is not None: + reward_dataset = make_indexed_dataset(reward_prefix, data_impl, skip_warmup) + else: + reward_dataset = None + elif dataset_impl == "pairwise": + pos_indexed_dataset = make_indexed_dataset( + pos_data_prefix, data_impl, skip_warmup + ) + neg_indexed_dataset = make_indexed_dataset( + neg_data_prefix, data_impl, skip_warmup + ) + if pos_label_prefix is None: + pos_label_dataset = None + # Also do neg here since they both must be the same + assert neg_label_prefix is None + neg_label_dataset = None + else: + pos_label_dataset = make_indexed_dataset( + pos_label_prefix, data_impl, skip_warmup + ) + # Also do neg here since they both must be the same + assert neg_label_prefix is not None + neg_label_dataset = make_indexed_dataset( + neg_label_prefix, data_impl, skip_warmup + ) + if precompute_model_name is None: + pos_ref_dataset = None + neg_ref_dataset = None + else: + pos_ref_dataset = make_indexed_dataset( + pos_data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) + neg_ref_dataset = make_indexed_dataset( + neg_data_prefix + "_" + precompute_model_name, data_impl, skip_warmup + ) else: - label_dataset = make_indexed_dataset(label_prefix, data_impl, skip_warmup) + raise NotImplementedError(f"dataset_impl={dataset_impl} not implemented") - total_num_of_documents = indexed_dataset.sizes.shape[0] + total_num_of_documents = ( + indexed_dataset.sizes.shape[0] + if dataset_impl == "gpt2" + else pos_indexed_dataset.sizes.shape[0] + ) print_rank_0(" {}:".format(name)) print_rank_0(" no. 
of documents:{}".format(total_num_of_documents)) dataset = None documents = np.arange(start=0, stop=total_num_of_documents, step=1, dtype=np.int32) - dataset = GPT2Dataset( - name, - data_prefix, - documents, - indexed_dataset, - num_samples, - seq_length, - seed, - build_index_mappings=build_index_mappings, - label_dataset=label_dataset, - ) + if dataset_impl == "gpt2": + dataset = GPT2Dataset( + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + num_epochs, + seq_length, + seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, + build_index_mappings=build_index_mappings, + label_dataset=label_dataset, + reward_dataset=reward_dataset, + ref_dataset=precompute_indexed_dataset, + ) + elif dataset_impl == "pairwise": + dataset = PairwiseDataset( + name, + pos_data_prefix, + documents, + pos_indexed_dataset, + neg_indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, + build_index_mappings=build_index_mappings, + pos_label_dataset=pos_label_dataset, + neg_label_dataset=neg_label_dataset, + pos_ref_dataset=pos_ref_dataset, + neg_ref_dataset=neg_ref_dataset, + ) return dataset @@ -93,8 +177,11 @@ def build_train_valid_test_datasets( data_prefix, use_shared_fs, data_impl, + pack_impl, + allow_chopped, splits_string, train_valid_test_num_samples, + train_valid_test_epochs, seq_length, seed, skip_warmup, @@ -129,15 +216,17 @@ def build_dataset(index, name): documents = np.arange( start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32 ) - dataset = GPT2Dataset( name, data_prefix, documents, indexed_dataset, train_valid_test_num_samples[index], + train_valid_test_epochs[index], seq_length, seed, + pack_impl=pack_impl, + allow_chopped=allow_chopped, use_shared_fs=use_shared_fs, ) return dataset @@ -183,12 +272,15 @@ def get_normalized_weights_and_num_samples( weight_sum = sum(weights) assert weight_sum > 0.0 weights = [weight / weight_sum for weight in weights] - # Add 0.5% (the 1.005 factor) so in case the blending dataset does - # not uniformly distribute the number of samples, we still have - # samples left to feed to the network. - weighted_num_samples = [] - for weight in weights: - weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005))) + if num_samples is not None: + # Add 0.5% (the 1.005 factor) so in case the blending dataset does + # not uniformly distribute the number of samples, we still have + # samples left to feed to the network. 
+ weighted_num_samples = [] + for weight in weights: + weighted_num_samples.append(int(math.ceil(num_samples * weight * 1.005))) + else: + weighted_num_samples = [None for _ in weights] return weights, weighted_num_samples @@ -197,61 +289,154 @@ def build_weighted_datasets( train_num_samples, valid_num_samples, test_num_samples, - train_weights, - valid_weights, - test_weights, + train_epochs, + valid_epochs, + test_epochs, build_index_mappings=True, ): # build individual datasets train_datasets, valid_datasets, test_datasets = [], [], [] - for i, (train_path, label_path, valid_path, test_path) in enumerate( + for i, ( + train_path, + train_label_path, + train_reward_path, + valid_path, + valid_label_path, + valid_reward_path, + test_path, + test_label_path, + test_reward_path, + pos_train_path, + neg_train_path, + pos_train_label_path, + neg_train_label_path, + pos_valid_path, + neg_valid_path, + pos_valid_label_path, + neg_valid_label_path, + pos_test_path, + neg_test_path, + pos_test_label_path, + neg_test_label_path, + ) in enumerate( zip_longest( - neox_args.train_data_paths, - neox_args.label_data_paths if neox_args.label_data_paths else [], - neox_args.valid_data_paths, - neox_args.test_data_paths, + neox_args.train_data_paths if neox_args.train_data_paths else [], + neox_args.train_label_data_paths + if neox_args.train_label_data_paths + else [], + neox_args.train_reward_data_paths + if neox_args.train_reward_data_paths + else [], + neox_args.valid_data_paths if neox_args.valid_data_paths else [], + neox_args.valid_label_data_paths + if neox_args.valid_label_data_paths + else [], + neox_args.valid_reward_data_paths + if neox_args.valid_reward_data_paths + else [], + neox_args.test_data_paths if neox_args.test_data_paths else [], + neox_args.test_label_data_paths if neox_args.test_label_data_paths else [], + neox_args.test_reward_data_paths + if neox_args.test_reward_data_paths + else [], + neox_args.pos_train_data_paths if neox_args.pos_train_data_paths else [], + neox_args.neg_train_data_paths if neox_args.neg_train_data_paths else [], + neox_args.pos_train_label_data_paths + if neox_args.pos_train_label_data_paths + else [], + neox_args.neg_train_label_data_paths + if neox_args.neg_train_label_data_paths + else [], + neox_args.pos_valid_data_paths if neox_args.pos_valid_data_paths else [], + neox_args.neg_valid_data_paths if neox_args.neg_valid_data_paths else [], + neox_args.pos_valid_label_data_paths + if neox_args.pos_valid_label_data_paths + else [], + neox_args.neg_valid_label_data_paths + if neox_args.neg_valid_label_data_paths + else [], + neox_args.pos_test_data_paths if neox_args.pos_test_data_paths else [], + neox_args.neg_test_data_paths if neox_args.neg_test_data_paths else [], + neox_args.pos_test_label_data_paths + if neox_args.pos_test_label_data_paths + else [], + neox_args.neg_test_label_data_paths + if neox_args.neg_test_label_data_paths + else [], ) ): - if train_path: + if train_path or pos_train_path: train_datasets.append( build_the_dataset( data_prefix=train_path, name=f"train_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=train_num_samples[i], + num_epochs=train_epochs, seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, - label_prefix=label_path, + label_prefix=train_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_train_path, + neg_data_prefix=neg_train_path, + 
pos_label_prefix=pos_train_label_path, + neg_label_prefix=neg_train_label_path, + precompute_model_name=neox_args.precompute_model_name, + reward_prefix=train_reward_path, ) ) - if valid_path: + if valid_path or pos_valid_path: valid_datasets.append( build_the_dataset( data_prefix=valid_path, name=f"valid_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=valid_num_samples[i], + num_epochs=valid_epochs, seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, + label_prefix=valid_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_valid_path, + neg_data_prefix=neg_valid_path, + pos_label_prefix=pos_valid_label_path, + neg_label_prefix=neg_valid_label_path, + precompute_model_name=neox_args.precompute_model_name, + reward_prefix=valid_reward_path, ) ) - if test_path: + if test_path or pos_test_path: test_datasets.append( build_the_dataset( data_prefix=test_path, name=f"test_{i}", data_impl=neox_args.data_impl, + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, num_samples=test_num_samples[i], + num_epochs=test_epochs, seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), build_index_mappings=build_index_mappings, + label_prefix=test_label_path, + dataset_impl=neox_args.dataset_impl, + pos_data_prefix=pos_test_path, + neg_data_prefix=neg_test_path, + pos_label_prefix=pos_test_label_path, + neg_label_prefix=neg_test_label_path, + precompute_model_name=neox_args.precompute_model_name, + reward_prefix=test_reward_path, ) ) return train_datasets, valid_datasets, test_datasets @@ -294,9 +479,44 @@ def weights_by_num_docs(l: list, alpha=0.3): return weights -def build_train_valid_test_data_iterators(neox_args): +def validate_train_epochs(neox_args): + """Check for unsupported neox_args when using train_epochs instead of train_iters""" + if neox_args.train_epochs is None: + return + + if neox_args.train_epochs and neox_args.train_iters: + raise ValueError( + "Cannot specify both train epochs and train iters simultaneously" + ) + + if neox_args.pack_impl != "packed": + raise ValueError( + "Packing implementations other than 'packed' are currently unsupported with train_epochs" + ) + + if neox_args.weight_by_num_documents: + raise ValueError( + "Weighting by number of documents is currently unsupported with train_epochs" + ) + + if neox_args.train_data_weights and ( + not all(weight == 1.0 for weight in neox_args.train_data_weights) + ): + raise ValueError( + "train_data_weights != None is currently unsupported with train_epochs" + ) + + if neox_args.dataset_impl != "gpt2": + raise ValueError( + "non gpt2 datasets are not currently unsupported with train_epochs" + ) + + +def build_train_valid_test_data_loaders(neox_args): """XXX""" + validate_train_epochs(neox_args) + (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None) print_rank_0("> building train, validation, and test datasets ...") @@ -314,16 +534,23 @@ def build_train_valid_test_data_iterators(neox_args): # Data loader only on rank 0 of each model parallel group. if mpu.get_model_parallel_rank() == 0 and pipe_load: # Number of train/valid/test samples. 
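+        # With train_iters set, the iteration budget is converted into per-split
+        # sample counts below; with train_epochs set, the counts are left as None
+        # and each split is run for a single epoch of its data.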
- train_iters = neox_args.train_iters - eval_iters = (train_iters // neox_args.eval_interval + 1) * neox_args.eval_iters - test_iters = neox_args.eval_iters - train_val_test_num_samples = [ - train_iters * neox_args.train_batch_size, - eval_iters * neox_args.train_batch_size, - test_iters * neox_args.train_batch_size, - ] - - if neox_args.train_data_paths: + if neox_args.train_iters is not None: + train_iters = neox_args.train_iters + eval_iters = ( + train_iters // neox_args.eval_interval + 1 + ) * neox_args.eval_iters + test_iters = neox_args.eval_iters + train_val_test_num_samples = [ + train_iters * neox_args.train_batch_size, + eval_iters * neox_args.train_batch_size, + test_iters * neox_args.train_batch_size, + ] + train_val_test_epochs = [None, None, None] + elif neox_args.train_epochs is not None: + train_val_test_num_samples = [None, None, None] + train_val_test_epochs = [1, 1, 1] + + if (neox_args.train_data_paths) or (neox_args.pos_train_data_paths): # when individual train / valid / test data paths are provided # normalize weight values and get num samples for each dataset train_weights, train_num_samples = get_normalized_weights_and_num_samples( @@ -342,14 +569,13 @@ def build_train_valid_test_data_iterators(neox_args): train_num_samples, valid_num_samples, test_num_samples, - train_weights, - valid_weights, - test_weights, + train_val_test_epochs[0], + train_val_test_epochs[1], + train_val_test_epochs[2], build_index_mappings=not neox_args.weight_by_num_documents, ) if neox_args.weight_by_num_documents: - # gets the number of documents in each datapath get_num_docs_list = lambda datasets: [ dataset.indexed_dataset.sizes.shape[0] for dataset in datasets @@ -391,9 +617,9 @@ def build_train_valid_test_data_iterators(neox_args): train_num_samples, valid_num_samples, test_num_samples, - train_weights, - valid_weights, - test_weights, + train_val_test_epochs[0], + train_val_test_epochs[1], + train_val_test_epochs[2], ) if train_datasets: @@ -411,9 +637,12 @@ def build_train_valid_test_data_iterators(neox_args): data_impl=neox_args.data_impl, splits_string=neox_args.split, train_valid_test_num_samples=train_val_test_num_samples, + train_valid_test_epochs=train_val_test_epochs, seq_length=neox_args.seq_length, seed=neox_args.seed, skip_warmup=(not neox_args.mmap_warmup), + pack_impl=neox_args.pack_impl, + allow_chopped=neox_args.allow_chopped, ) # Build dataloders. @@ -422,9 +651,15 @@ def build_train_valid_test_data_iterators(neox_args): test_dataloader = make_data_loader(test_ds, neox_args=neox_args) # Flags to know if we need to do training/validation/testing. - do_train = train_dataloader is not None and neox_args.train_iters > 0 - do_valid = valid_dataloader is not None and neox_args.eval_iters > 0 - do_test = test_dataloader is not None and neox_args.eval_iters > 0 + if neox_args.train_epochs: + do_train = train_dataloader is not None + do_valid = valid_dataloader is not None + do_test = test_dataloader is not None + else: + do_train = train_dataloader is not None and neox_args.train_iters > 0 + do_valid = valid_dataloader is not None and neox_args.eval_iters > 0 + do_test = test_dataloader is not None and neox_args.eval_iters > 0 + # Need to broadcast num_tokens and num_type_tokens. 
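+        # (The tensor below actually carries the do_train/do_valid/do_test flags;
+        # they are computed on rank 0 of each model parallel group and then shared
+        # with the other ranks so all agree on which splits to run.)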
flags = torch.cuda.LongTensor([int(do_train), int(do_valid), int(do_test)]) else: @@ -444,6 +679,19 @@ def build_train_valid_test_data_iterators(neox_args): neox_args.do_train = flags[0].item() neox_args.do_valid = flags[1].item() neox_args.do_test = flags[2].item() + data_loaders = { + "train": train_dataloader, + "valid": valid_dataloader, + "test": test_dataloader, + } + return data_loaders + + +def shift_and_wrap_data_loaders(neox_args, data_loaders, loop=True): + """Shift start iteration and wrap data_loaders in iterators""" + train_dataloader = data_loaders["train"] + valid_dataloader = data_loaders["valid"] + test_dataloader = data_loaders["test"] # Shift the start iterations. if train_dataloader is not None: @@ -469,19 +717,34 @@ def build_train_valid_test_data_iterators(neox_args): ) ) + def loop_iterator(data_loader): + while True: + for x in data_loader: + yield x + data_loader.start_iter = 0 + # Build iterators. if train_dataloader is not None: - train_data_iterator = iter(train_dataloader) + if loop: + train_data_iterator = cycle(train_dataloader) + else: + train_data_iterator = iter(train_dataloader) else: train_data_iterator = None if valid_dataloader is not None: - valid_data_iterator = iter(valid_dataloader) + if loop: + valid_data_iterator = cycle(valid_dataloader) + else: + valid_data_iterator = iter(valid_dataloader) else: valid_data_iterator = None if test_dataloader is not None: - test_data_iterator = iter(test_dataloader) + if loop: + test_data_iterator = cycle(test_dataloader) + else: + test_data_iterator = iter(test_dataloader) else: test_data_iterator = None diff --git a/megatron/data/gpt2_dataset.py b/megatron/data/gpt2_dataset.py index 75e601fda..73c21bebd 100644 --- a/megatron/data/gpt2_dataset.py +++ b/megatron/data/gpt2_dataset.py @@ -34,18 +34,31 @@ def __init__( documents, indexed_dataset, num_samples, + num_epochs, seq_length, seed, + pack_impl="packed", + allow_chopped=True, build_index_mappings=True, use_shared_fs=True, label_dataset=None, + reward_dataset=None, + ref_dataset=None, ): self.name = name + self.pack_impl = pack_impl + self.allow_chopped = allow_chopped self.indexed_dataset = indexed_dataset self.label_dataset = label_dataset + self.reward_dataset = reward_dataset + self.ref_dataset = ref_dataset + self.seq_length = seq_length # Checks + assert self.reward_dataset is None or ( + pack_impl == "unpacked" + ), "Reward dataset only supported with unpacked data." assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] @@ -56,10 +69,14 @@ def __init__( data_prefix, documents, self.indexed_dataset.sizes, + self.label_dataset, num_samples, + num_epochs, seq_length, seed, + self.pack_impl, use_shared_fs=use_shared_fs, + allow_chopped=self.allow_chopped, ) self.shuffle_idx_len = self.shuffle_idx.shape[0] - 1 self.sample_idx_len = self.sample_idx.shape[0] - 1 @@ -82,47 +99,101 @@ def __getitem__(self, idx): offset_f = self.sample_idx[idx][1] offset_l = self.sample_idx[idx + 1][1] # Labels and texts are supposed to be fully in sync. 
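+            # The list below is always ordered text, then (optionally) label,
+            # reward, and ref; rw_indx records the position of the reward dataset
+            # (-1 when absent) so its per-document reward value can be handled specially.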
- datasets = ( - [self.indexed_dataset] - if self.label_dataset is None - else [self.indexed_dataset, self.label_dataset] - ) + datasets = [self.indexed_dataset] + rw_indx = 1 + if self.label_dataset is not None: + rw_indx += 1 + datasets.append(self.label_dataset) + if self.reward_dataset is not None: + datasets.append(self.reward_dataset) + else: + rw_indx = -1 + if self.ref_dataset is not None: + datasets.append(self.ref_dataset) samples = [] + sample_lengths = [] # If we are within the same document, just extract the chunk. for n, dataset in enumerate(datasets): if doc_index_f == doc_index_l: - samples.append( - dataset.get( - self.doc_idx[doc_index_f], - offset=offset_f, - length=offset_l - offset_f + 1, + if rw_indx == n: + # If we are in the reward dataset, we only need the last token. + rw = dataset.get(self.doc_idx[doc_index_f]) + samples.append( + np.array([rw[0] for _ in range(len(samples[-1]))]) + ) + else: + samples.append( + dataset.get( + self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1, + ) ) - ) else: + if n != rw_indx: + # reset + sample_lengths = [] # Otherwise, get the rest of the initial document. - sample_list = [ - dataset.get(self.doc_idx[doc_index_f], offset=offset_f) - ] + if n == rw_indx: + rw = dataset.get(self.doc_idx[doc_index_f]) + sample_list = [ + np.array([rw[0] for _ in range(sample_lengths.pop(0))]) + ] + else: + sample_list = [ + dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + ] + sample_lengths.append(len(sample_list[-1])) # Loop over all in between documents and add the entire document. for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(dataset.get(self.doc_idx[i])) + if n == rw_indx: + rw = dataset.get(self.doc_idx[i]) + sample_list.append( + np.array([rw[0] for _ in range(sample_lengths.pop(0))]) + ) + else: + sample_list.append(dataset.get(self.doc_idx[i])) + sample_lengths.append(len(sample_list[-1])) # And finally add the relevant portion of last document. 
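+                    # For the reward dataset the first value of each document is
+                    # tiled to the chunk length recorded in sample_lengths, keeping
+                    # rewards aligned token-for-token with the text samples.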
- sample_list.append( - dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) - ) + if n == rw_indx: + rw = dataset.get(self.doc_idx[doc_index_l]) + sample_list.append( + np.array([rw[0] for _ in range(sample_lengths.pop(0))]) + ) + else: + sample_list.append( + dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) + ) + sample_lengths.append(len(sample_list[-1])) samples.append(np.concatenate(sample_list)) - - if len(datasets) == 1: - return {"text": np.array(samples[0], dtype=np.int64)} - else: - return { - "text": np.array(samples[0], dtype=np.int64), - "label": np.array(samples[1], dtype=np.int64), - } - except IndexError: + for i in range(len(samples)): + mask = (self.label_dataset is not None) and (i == 1) + if len(samples[i]) < (self.seq_length + 1): + # Pad + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=-100 if mask else 0, + ) + elif len(samples[i]) > (self.seq_length + 1): + # Truncate + samples[i] = samples[i][: (self.seq_length + 1)] + ret = {"text": np.array(samples[0], dtype=np.int64)} + next_idx = 1 + if self.label_dataset is not None: + ret["label"] = np.array(samples[next_idx], dtype=np.int64) + next_idx += 1 + if self.reward_dataset is not None: + ret["reward"] = np.array(samples[next_idx], dtype=np.float32) + next_idx += 1 + if self.ref_dataset is not None: + ret["ref"] = np.array(samples[next_idx], dtype=np.float32) + return ret + except IndexError as err: new_idx = idx % len(self) print( - f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" + f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx}), error: {err}" ) return self[new_idx] @@ -132,10 +203,14 @@ def _build_index_mappings( data_prefix, documents, sizes, + label_dataset, num_samples, + num_epochs, seq_length, seed, + packing_impl, use_shared_fs=True, + allow_chopped=True, ): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. @@ -145,7 +220,8 @@ def _build_index_mappings( """ # Number of tokens in each epoch and number of required epochs. tokens_per_epoch = _num_tokens(documents, sizes) - num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + if not num_epochs: + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) # rng state np_rng = np.random.RandomState(seed=seed) @@ -155,6 +231,9 @@ def _build_index_mappings( _filename += "_{}ns".format(num_samples) _filename += "_{}sl".format(seq_length) _filename += "_{}s".format(seed) + _filename += "_{}pi".format(packing_impl) + if allow_chopped: + _filename += "_ac" doc_idx_filename = _filename + "_doc_idx.npy" sample_idx_filename = _filename + "_sample_idx.npy" shuffle_idx_filename = _filename + "_shuffle_idx.npy" @@ -177,44 +256,116 @@ def _build_index_mappings( ) # doc-idx. start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng) - np.save(doc_idx_filename, doc_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save doc-idx mapping " - "(seconds): {:4f}".format(time.time() - start_time) - ) - # sample-idx. - start_time = time.time() - # Use C++ implementation for speed. 
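+            # sample_idx maps each sample to a (document index, token offset) start
+            # position; the int32 helper is used while the index still fits, falling
+            # back to int64 for very large runs.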
- from megatron.data import helpers - - assert doc_idx.dtype == np.int32 - assert sizes.dtype == np.int32 - - num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length - if 2 * (num_samples + 1) < np.iinfo(np.int32).max: - sample_idx = helpers.build_sample_idx_int32( - sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + if packing_impl == "packed": + doc_idx = _build_doc_idx(documents, num_epochs, np_rng) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save doc-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) ) - else: - sample_idx = helpers.build_sample_idx_int64( - sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + from megatron.data import helpers + + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + + num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length + if 2 * (num_samples + 1) < np.iinfo(np.int32).max: + sample_idx = helpers.build_sample_idx_int32( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + else: + sample_idx = helpers.build_sample_idx_int64( + sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch + ) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save sample-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) ) - np.save(sample_idx_filename, sample_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save sample-idx mapping " - "(seconds): {:4f}".format(time.time() - start_time) - ) - # shuffle-idx. - start_time = time.time() - # -1 is due to data structure used to retrieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng) - np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) - print_rank_0( - " > elapsed time to build and save shuffle-idx mapping" - " (seconds): {:4f}".format(time.time() - start_time) - ) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retrieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + print_rank_0( + " > elapsed time to build and save shuffle-idx mapping" + " (seconds): {:4f}".format(time.time() - start_time) + ) + elif packing_impl == "pack_until_overflow": + # Naively pack data until it overflows, then roll it over to a new one instead. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = [] + doc_idx = [] + # Iterate over files until we have enough samples. + temp_shuffle_idx = np.arange(len(documents)) + np_rng.shuffle(temp_shuffle_idx) + running_length = 0 + curr_shuffle_idx = 0 + while len(sample_idx) < num_samples: + if not allow_chopped: + # +1 since we shift left/right by 1 + if sizes[temp_shuffle_idx[curr_shuffle_idx]] > seq_length + 1: + curr_shuffle_idx += 1 + continue + # First, check if we need to skip this item... 
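+                # Documents whose labels are all -100 over the first seq_length + 1
+                # tokens would be completely masked out, so skip them rather than
+                # packing them.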
+ if label_dataset is not None: + if np.all( + label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + doc_length = sizes[temp_shuffle_idx[curr_shuffle_idx]] + if running_length == 0: + sample_idx.append(np.array([len(doc_idx), 0])) + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + running_length += doc_length + else: + if running_length + doc_length > (seq_length + 1): + running_length = doc_length + sample_idx.append(np.array([len(doc_idx), 0])) + else: + running_length += doc_length + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + curr_shuffle_idx += 1 + if curr_shuffle_idx == len(documents): + curr_shuffle_idx = 0 + np_rng.shuffle(temp_shuffle_idx) + sample_idx.append(np.array([len(doc_idx), 0])) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + elif packing_impl == "unpacked": + # Unpacked data, one sample per document. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int64) + sample_idx[:, 0] = np.array([i for i in range(num_samples + 1)]) + sample_idx[:, 1] = 0 + doc_idx = list() + doc_i = 0 + while len(doc_idx) <= num_samples: + if not allow_chopped: + # +1 since we shift left/right by 1 + if sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + # Just in case we have bad data in the loop... + if np.all(label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + doc_idx.append(doc_i) + doc_i = (doc_i + 1) % len(documents) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) # This should be a barrier but nccl barrier assumes # device_index=rank which is not the case for model diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 9b062b050..aca290854 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t& docs_, } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { @@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t& docs_, num_sent = 0; } } // for (auto sent_index=sent_index_first; ... - } // if (num_remain_sent > 1) { - } // for (int doc=0; doc < num_docs; ++doc) { - } // for (int epoch=0; epoch < num_epochs; ++epoch) { + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { if (!second) { if (verbose) { diff --git a/megatron/data/pairwise_dataset.py b/megatron/data/pairwise_dataset.py new file mode 100644 index 000000000..e39b4d626 --- /dev/null +++ b/megatron/data/pairwise_dataset.py @@ -0,0 +1,457 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pairwise style dataset.""" + +import os +import time + +import numpy as np +import torch + +from megatron import mpu, print_rank_0 + + +class PairwiseDataset(torch.utils.data.Dataset): + def __init__( + self, + name, + pos_data_prefix, # Don't need neg since it's assumed you have paired the data already. + documents, + pos_indexed_dataset, + neg_indexed_dataset, + num_samples, + seq_length, + seed, + pack_impl="unpacked", + build_index_mappings=True, + use_shared_fs=True, + pos_label_dataset=None, + pos_ref_dataset=None, + neg_label_dataset=None, + neg_ref_dataset=None, + allow_chopped=True, + ): + + self.name = name + self.pos_indexed_dataset = pos_indexed_dataset + self.pos_label_dataset = pos_label_dataset + self.pos_ref_dataset = pos_ref_dataset + self.neg_indexed_dataset = neg_indexed_dataset + self.neg_label_dataset = neg_label_dataset + self.neg_ref_dataset = neg_ref_dataset + self.pack_impl = pack_impl + self.seq_length = seq_length + # Checks + assert np.min(documents) >= 0 + assert (neg_label_dataset is not None and pos_label_dataset is not None) or ( + neg_label_dataset is None and pos_label_dataset is None + ), "Label datasets must be both None or both not None" + assert np.max(documents) < pos_indexed_dataset.sizes.shape[0] + assert pos_indexed_dataset.sizes.shape[0] == neg_indexed_dataset.sizes.shape[0] + assert ( + pack_impl != "packed" + ), "Packed implementation not supported for pairwise dataset" + + if build_index_mappings: + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, + pos_data_prefix, + documents, + self.pos_indexed_dataset.sizes, + self.neg_indexed_dataset.sizes, + self.pos_label_dataset, + self.neg_label_dataset, + num_samples, + seq_length, + seed, + pack_impl, + use_shared_fs=use_shared_fs, + allow_chopped=allow_chopped, + ) + self.shuffle_idx_len = self.shuffle_idx.shape[0] - 1 + self.sample_idx_len = self.sample_idx.shape[0] - 1 + + if self.shuffle_idx_len != self.sample_idx_len - 1: + print( + f"WARNING: shuffle index length ({self.shuffle_idx_len}) is not equal to sample index length ({self.sample_idx_len})" + ) + + def __len__(self): + return min(self.shuffle_idx_len, self.sample_idx_len) + + def __getitem__(self, idx): + try: + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # Labels and texts are supposed to be fully in sync. + datasets = [self.pos_indexed_dataset, self.neg_indexed_dataset] + + if self.pos_label_dataset is not None: + datasets += [ + self.pos_label_dataset, + self.neg_label_dataset, + ] + if self.pos_ref_dataset is not None: + datasets += [ + self.pos_ref_dataset, + self.neg_ref_dataset, + ] + samples = [] + pos_ref_samples = [] + neg_ref_samples = [] + # If we are within the same document, just extract the chunk. 
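+            # Every dataset in the list (pos/neg text, optional labels, optional
+            # refs) is sliced with the same document/offset bounds so the chosen
+            # and rejected sequences stay aligned sample-for-sample.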
+ for n, dataset in enumerate(datasets): + if doc_index_f == doc_index_l: + samples.append( + dataset.get( + self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1, + ) + ) + else: + # Otherwise, get the rest of the initial document. + sample_list = [ + dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + ] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. + sample_list.append( + dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) + ) + samples.append(np.concatenate(sample_list)) + for i in range(len(samples)): + if len(samples[i]) < (self.seq_length + 1): + if ((i == 2) or (i == 3)) and self.pos_label_dataset is not None: + # Labels... So pad with -100 + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=-100, + ) + else: + # Pad with 0s, can use any number since it's masked. + samples[i] = np.pad( + samples[i], + (0, (self.seq_length + 1) - len(samples[i])), + mode="constant", + constant_values=0, + ) + elif len(samples[i]) > (self.seq_length + 1): + # Check for overflow and truncate. + samples[i] = samples[i][: (self.seq_length + 1)] + ret = {} + ret["pos"] = np.array(samples[0], dtype=np.int64) + ret["neg"] = np.array(samples[1], dtype=np.int64) + if self.pos_label_dataset is not None: + ret["pos_label"] = np.array(samples[2], dtype=np.int64) + ret["neg_label"] = np.array(samples[3], dtype=np.int64) + if self.pos_ref_dataset is not None: + ret["pos_ref"] = np.array(samples[4], dtype=np.float32) + ret["neg_ref"] = np.array(samples[5], dtype=np.float32) + elif self.pos_ref_dataset is not None: + # Don't have labels... + ret["pos_ref"] = np.array(samples[2], dtype=np.float32) + ret["neg_ref"] = np.array(samples[3], dtype=np.float32) + return ret + except IndexError: + new_idx = idx % len(self) + print( + f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" + ) + return self[new_idx] + + +def _build_index_mappings( + name, + pos_data_prefix, + documents, + pos_sizes, + neg_sizes, + pos_label_dataset, + neg_label_dataset, + num_samples, + seq_length, + seed, + packing_impl, + use_shared_fs=True, + allow_chopped=True, +): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, pos_sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. 
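+    # The doc/sample/shuffle indices are cached as .npy files alongside
+    # pos_data_prefix, keyed on the dataset name, number of samples, sequence
+    # length, seed, and packing implementation, and are built on a single rank.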
+ _filename = pos_data_prefix + _filename += "_{}_indexmap".format(name) + _filename += "_{}ns".format(num_samples) + _filename += "_{}sl".format(seq_length) + _filename += "_{}s".format(seed) + _filename += "_{}pi".format(packing_impl) + doc_idx_filename = _filename + "_doc_idx.npy" + sample_idx_filename = _filename + "_sample_idx.npy" + shuffle_idx_filename = _filename + "_shuffle_idx.npy" + + if not use_shared_fs: + should_process_dataset = int(os.environ["LOCAL_RANK"]) == 0 + else: + should_process_dataset = torch.distributed.get_rank() == 0 + + # Build the indexed mapping if not exist. + if should_process_dataset: + if ( + (not os.path.isfile(doc_idx_filename)) + or (not os.path.isfile(sample_idx_filename)) + or (not os.path.isfile(shuffle_idx_filename)) + ): + print_rank_0( + " > WARNING: could not find index map files, building " + "the indices on rank 0 ..." + ) + # doc-idx. + start_time = time.time() + if packing_impl == "pack_until_overflow": + # Naively pack data until it overflows, then roll it over to a new one instead. + shuffle_idx = np.arange(num_samples) # Shuffle index around epochs + np_rng.shuffle(shuffle_idx) + sample_idx = [] + doc_idx = [] + # Iterate over files until we have enough samples. + temp_shuffle_idx = np.arange(len(documents)) + np_rng.shuffle(temp_shuffle_idx) + running_length = 0 + curr_shuffle_idx = 0 + while len(sample_idx) < num_samples: + # If not allow_chopped, skip this item if it's chopped. + if not allow_chopped: + if ( + pos_sizes[temp_shuffle_idx[curr_shuffle_idx]] + < seq_length + 1 + ): + curr_shuffle_idx += 1 + continue + if ( + neg_sizes[temp_shuffle_idx[curr_shuffle_idx]] + < seq_length + 1 + ): + curr_shuffle_idx += 1 + continue + # Then, check if we need to skip this item... + if pos_label_dataset is not None: + if np.all( + pos_label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + if np.all( + neg_label_dataset.get(temp_shuffle_idx[curr_shuffle_idx])[ + : seq_length + 1 + ] + == -100 + ): + curr_shuffle_idx += 1 + continue + doc_length = max( + pos_sizes[temp_shuffle_idx[curr_shuffle_idx]], + neg_sizes[temp_shuffle_idx[curr_shuffle_idx]], + ) + if running_length == 0: + sample_idx.append(np.array([len(doc_idx), 0])) + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + running_length += doc_length + else: + if running_length + doc_length > (seq_length + 1): + running_length = doc_length + sample_idx.append(np.array([len(doc_idx), 0])) + else: + running_length += doc_length + doc_idx.append(temp_shuffle_idx[curr_shuffle_idx]) + curr_shuffle_idx += 1 + if curr_shuffle_idx == len(documents): + curr_shuffle_idx = 0 + np_rng.shuffle(temp_shuffle_idx) + sample_idx.append(np.array([len(doc_idx), 0])) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + elif packing_impl == "unpacked": + # Unpacked data, one sample per document. + shuffle_idx = np.array([i % len(documents) for i in range(num_samples)]) + np_rng.shuffle(shuffle_idx) + sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int64) + sample_idx[:, 0] = np.array([i for i in range(num_samples + 1)]) + sample_idx[:, 1] = 0 + doc_idx = list() + doc_i = 0 + while len(doc_idx) <= num_samples: + # Check if we need to skip this item... 
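+                # Pairs where either side's labels are entirely -100, or (when
+                # allow_chopped is False) where either side exceeds seq_length + 1
+                # tokens, are skipped.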
+ if not allow_chopped: + # +1 since we shift left/right by 1 + if pos_sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + if neg_sizes[doc_i] > seq_length + 1: + doc_i = (doc_i + 1) % len(documents) + continue + # In theory if we don't allow chopped we should be able to skip it, but the warm fuzzies I get + # from this are worth the extra bool check + if np.all(pos_label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + if np.all(neg_label_dataset.get(doc_i)[:seq_length] == -100): + doc_i = (doc_i + 1) % len(documents) + continue + doc_idx.append(doc_i) + doc_i = (doc_i + 1) % len(documents) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_io_parallel_group()) + assert counts[0].item() == torch.distributed.get_world_size( + group=mpu.get_io_parallel_group() + ) + + # Load mappings. + start_time = time.time() + print_rank_0(" > loading doc-idx mapping from {}".format(doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading sample-idx mapping from {}".format(sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0(" > loading shuffle-idx mapping from {}".format(shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode="r") + print_rank_0( + " loaded indexed file in {:3.3f} seconds".format(time.time() - start_time) + ) + print_rank_0(" total number of samples: {}".format(sample_idx.shape[0])) + print_rank_0(" total number of epochs: {}".format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence length, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng): + """Build an array with length = number-of-epochs * number-of-documents. + Each index is mapped to a corresponding document.""" + doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + +def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int64) + + # Index into sample_idx. 
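+    # (Hypothetical walk-through, assuming one epoch over doc_idx = [0, 1] with
+    # sizes = [3, 6] and seq_length = 4: the loop below records
+    # sample_idx = [[0, 0], [1, 1], [1, 5]], i.e. sample 0 spans all of doc 0 plus
+    # doc 1 tokens 0-1 and sample 1 spans doc 1 tokens 1-5; adjacent samples
+    # overlap by one token because each sample fetches seq_length + 1 tokens.)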
+ sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Beginning offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += remaining_seq_length + doc_length - 1 + remaining_seq_length = 0 + else: + # Otherwise, start from the beginning of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(size, np_rng): + """Build the range [0, size) and shuffle.""" + dtype_ = np.uint32 + if size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx) + return shuffle_idx diff --git a/megatron/data/samplers.py b/megatron/data/samplers.py index a9428e41c..5206636d8 100644 --- a/megatron/data/samplers.py +++ b/megatron/data/samplers.py @@ -100,7 +100,11 @@ class DistributedBatchSampler(data.sampler.BatchSampler): specifying True will result in the following samples for each gpu: GPU0: [0,2,4,6] GPU1: [1,3,5,7] specifying False will result in the following samples: - GPU0: [0,1,2,3] GPU1: [4,5,6,7]""" + GPU0: [0,1,2,3] GPU1: [4,5,6,7] + + The `infinite_loop` parameter allows the sampler to yield batches indefinitely, + restarting from the beginning of the dataset when all samples have been iterated over. + """ def __init__( self, diff --git a/megatron/logging.py b/megatron/logging.py index 247aeb1b5..af8a41fe5 100644 --- a/megatron/logging.py +++ b/megatron/logging.py @@ -23,6 +23,7 @@ from megatron import mpu, print_rank_0 from megatron.utils import report_memory +import math class Tee: @@ -106,6 +107,38 @@ def get_flops(neox_args, iter_time_s) -> float: + 18 * hidden_size * hidden_size * num_layers / num_heads ) ) + elif "mamba" in neox_args.attention_config: + # from https://github.com/Zyphra/zcookbook/blob/main/calc/calc_mamba_flops.py + if neox_args.expansion_factor: + d_inner = neox_args.hidden_size * neox_args.expansion_factor + elif neox_args.intermediate_size: + d_inner = neox_args.intermediate_size + else: + d_inner = neox_args.hidden_size * 2 # default expansion factor + d_state = 16 # TODO make d_state an arg. Currently hardcoded in neox mamba definition and here + conv_dimension = 4 # TODO make conv_dimension an arg. 
Currently hardcoded in neox mamba definition and here + dt_rank = math.ceil(neox_args.hidden_size / 16) + ssm_flops = ( + ckpt_activations_factor + * d_inner + * seq_len + * batch_size + * (11 * d_state + 4 * dt_rank + 1) + ) + mamba_projectors_flops = ( + ckpt_activations_factor * seq_len * batch_size * 6 * d_inner * hidden_size + ) + mamba_conv_flops = ( + ckpt_activations_factor + * seq_len + * batch_size + * 2 + * d_inner + * conv_dimension + ) + mamba_flops = ssm_flops + mamba_projectors_flops + mamba_conv_flops + embedding_flops = 6 * seq_len * batch_size * hidden_size * vocab_size + flops_per_iteration = mamba_flops * num_layers + embedding_flops else: flops_per_iteration = ( 24 @@ -201,6 +234,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # write losses, lr, etc. every step @@ -210,6 +244,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) for key in loss_dict: tb_wandb_log( @@ -218,6 +253,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) if neox_args.fp16: tb_wandb_log( @@ -226,6 +262,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # log gradient noise scale @@ -237,6 +274,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # (optional) Log optimizer states to wandb / tb every step @@ -251,6 +289,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # (optional) Log grad/param norms to wandb / tb every step @@ -276,6 +315,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, all_ranks=True, ) if neox_args.log_grad_norm: @@ -291,6 +331,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, all_ranks=True, ) if neox_args.log_param_norm: @@ -300,6 +341,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, all_ranks=True, ) @@ -315,6 +357,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) tb_wandb_log( "runtime/iteration_time", @@ -322,6 +365,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) log_string += " iteration {:8d}/{:8d} |".format( iteration, neox_args.train_iters @@ -342,6 +386,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # log tflop / gpu @@ -356,6 +401,7 @@ def add_to_logging(name): iteration, use_wandb=neox_args.use_wandb, 
tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) for key in total_loss_dict: @@ -394,6 +440,7 @@ def tb_wandb_log( iteration_no: int, use_wandb: bool, tensorboard_writer=None, + comet_experiment=None, all_ranks: bool = False, ): # logs to both tb and wandb (if present) from the zeroth rank @@ -403,3 +450,7 @@ def tb_wandb_log( tensorboard_writer.add_scalar(key, value, iteration_no) if use_wandb: wandb.log({key: value}, step=iteration_no) + if comet_experiment: + comet_experiment.__internal_api__log_metric__( + key, value, framework="gpt-neox", step=iteration_no + ) diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 619b4c33d..23be28936 100755 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -16,5 +16,8 @@ # limitations under the License. from .gpt2_model import GPT2ModelPipe -from .utils import get_params_for_weight_decay_optimization +from .utils import ( + get_params_for_weight_decay_optimization, + mark_norms_for_sequence_parallel_grad_sync, +) from .word_embeddings import SoftEmbedding diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 7a29b0716..c0b825261 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -25,9 +25,23 @@ def get_activation(neox_args): - """retrieves the activation function specified in neox_args""" + """retrieves the activation function specified in neox_args and whether or not the activation is gated""" + is_gated = False if neox_args.activation == "geglu": - activation_func = GEGLU(neox_args=neox_args) + is_gated = True + activation_func = F.gelu + elif neox_args.activation == "reglu": + is_gated = True + activation_func = F.relu + elif neox_args.activation == "bilinear": + is_gated = True + activation_func = lambda x: x + elif neox_args.activation == "swiglu": + is_gated = True + activation_func = swish + elif neox_args.activation == "glu": + is_gated = True + activation_func = F.sigmoid elif neox_args.activation == "gelu": if neox_args.onnx_safe and neox_args.bias_gelu_fusion: raise ValueError("onnx_safe + bias_gelu_fusion not compatible") @@ -49,7 +63,7 @@ def get_activation(neox_args): activation_func = F.silu else: raise ValueError(f"Activation function {neox_args.activation} not recognized") - return activation_func + return activation_func, is_gated ###### BIAS GELU FUSION/ NO AUTOGRAD ################ @@ -119,21 +133,3 @@ def swish(x, beta: float = 1.0): @torch.jit.script def mish(x): return x * torch.tanh(F.softplus(x)) - - -class GEGLU(torch.nn.Module): - def __init__(self, neox_args): - super(GEGLU, self).__init__() - if neox_args.onnx_safe: - self.activation_func = erf_gelu - else: - self.activation_func = F.gelu - - def forward(self, x, bias=None): - x, gate = x.chunk(2, dim=-1) - if bias is not None: - bias_1, bias_2 = bias.chunk(2, dim=-1) - x = x + bias_1 - gate = gate + bias_2 - intermediate_parallel = self.activation_func(gate) - return intermediate_parallel * x diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index d33ded506..3fd251147 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -21,7 +21,10 @@ except: HAVE_PERSIST_LAYER_NORM = False -from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction +from apex.normalization.fused_layer_norm import ( + FusedLayerNormAffineFunction, + FusedRMSNormAffineFunction, +) global fused_layer_norm_cuda @@ -148,3 +151,112 @@ def forward(self, input): ) return 
output + + +class MixedFusedRMSNorm(torch.nn.Module): + def __init__( + self, + normalized_shape, + eps=1e-5, + no_persist_layer_norm=True, + sequence_parallel=False, + apply_rmsnorm_1p=False, + mem_efficient_rms=True, + ): + super(MixedFusedRMSNorm, self).__init__() + + self.apply_rmsnorm_1p = apply_rmsnorm_1p + self.mem_efficient_rms = mem_efficient_rms + self.norm_fn = FusedRMSNormAffineFunction + + global fused_layer_norm_cuda + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. + persist_ln_hidden_sizes = [ + 1024, + 1536, + 2048, + 2304, + 3072, + 3840, + 4096, + 5120, + 6144, + 8192, + 10240, + 12288, + 12800, + 15360, + 16384, + 18432, + 20480, + 24576, + 25600, + 30720, + 32768, + 40960, + 49152, + 65536, + ] + if ( + normalized_shape not in persist_ln_hidden_sizes + or not HAVE_PERSIST_LAYER_NORM + ): + no_persist_layer_norm = True + + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + self.normalized_shape = torch.Size(normalized_shape) + self.eps = eps + self.scale = Parameter(torch.Tensor(*normalized_shape)) + self.reset_parameters() + self.no_persist_layer_norm = no_persist_layer_norm + self.sequence_parallel = sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.scale, "sequence_parallel", self.sequence_parallel) + + def reset_parameters(self): + + if self.apply_rmsnorm_1p: + init.zeros_(self.scale) + else: + init.ones_(self.scale) + + def forward(self, input): + + weight = self.scale + 1 if self.apply_rmsnorm_1p else self.scale + # CPU path is here for unittest sake. + if not input.is_cuda: + print( + "WARNING! The input of FusedLayerNorm should be on the GPU." + "This warning should only be triggered in the FusedRMSNorm unit tests." + ) + # Latest pytorch actually supports F.rms_norm but I don't want to break builds so... + return F.layer_norm(input, self.normalized_shape, weight, None, self.eps) + + # Apex does not have versions yet (https://github.com/NVIDIA/apex/pull/1648), so we need to inspect + # the function manually on whether the extra arg introduced in https://github.com/NVIDIA/apex/pull/1715 exists yet + if "memory_efficient" in inspect.getfullargspec(self.norm_fn.forward).args: + return self.norm_fn.apply( + input, + weight, + self.normalized_shape, + self.eps, + self.mem_efficient_rms, + ) + else: + return self.norm_fn.apply(input, weight, self.normalized_shape, self.eps) + + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. 
+ output = make_viewless_tensor( + inp=output, requires_grad=input.requires_grad, keep_graph=True + ) + + return output diff --git a/megatron/model/gmlp.py b/megatron/model/gmlp.py index c3462c651..6400640bd 100644 --- a/megatron/model/gmlp.py +++ b/megatron/model/gmlp.py @@ -112,7 +112,7 @@ def __init__( init_method=init_method, skip_bias_add=True, ) - self.activation_func = get_activation(neox_args) + self.activation_func, _ = get_activation(neox_args) ff_dim_parallel = mpu.divide(ff_dim, mpu.get_model_parallel_world_size()) if neox_args.attention_config[layer_number] == "amlp": d_attn = neox_args.gmlp_attn_dim diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index 9e643874a..7899048db 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -308,7 +308,10 @@ def _logits_helper(embedding, lm_output): ) logits = parallel_lm_logits( - lm_output, embedding.word_embeddings_weight, self.parallel_output + lm_output, + embedding.word_embeddings_weight, + self.parallel_output, + seq_parallel=self.neox_args.sequence_parallel, ) return logits diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 86a003dbd..8a0b8e251 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -145,7 +145,7 @@ def init_(tensor, use_mup=use_mup_outer): def small_init_init_method(dim, use_mup_outer=False, mup_init_scale=1.0): """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving - the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2010), using a normal distribution.""" + the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2019), using a normal distribution.""" std = math.sqrt(2 / (5 * dim)) def init_(tensor, use_mup=use_mup_outer): diff --git a/megatron/model/mamba/mamba.py b/megatron/model/mamba/mamba.py index d5d6b336f..950e36fed 100644 --- a/megatron/model/mamba/mamba.py +++ b/megatron/model/mamba/mamba.py @@ -14,7 +14,8 @@ import einops except ModuleNotFoundError: print( - "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba" + "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, \ + or directly from https://github.com/state-spaces/mamba" ) pass @@ -45,12 +46,21 @@ def __init__( neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion ), "Mamba fused inner fn and bias in x_proj not compatible!" 
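+        # At most one of intermediate_size / expansion_factor may be set; when
+        # neither is given, the default 2x expansion below is used.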
+ assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" + # set variables, mostly following mamba defaults self.d_model = neox_args.hidden_size self.d_state = 16 # state dimensions per channel self.d_conv = 4 # convolution width - self.expand = 2 # linear projection expansion factors - self.d_inner = int(self.expand * self.d_model) + if neox_args.intermediate_size: + self.d_inner = neox_args.intermediate_size + else: + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 2 + ) + self.d_inner = int(self.expand * self.d_model) self.dt_rank = math.ceil(self.d_model / 16) # rank of dt / Delta parameter self.dt_scale = 1.0 diff --git a/megatron/model/norms.py b/megatron/model/norms.py index dda44659f..ba175d3eb 100644 --- a/megatron/model/norms.py +++ b/megatron/model/norms.py @@ -18,18 +18,34 @@ def get_norm(neox_args): if neox_args.norm == "rmsnorm": - norm = RMSNorm eps = neox_args.rms_norm_epsilon + if neox_args.rmsnorm_fusion: + from .fused_layer_norm import MixedFusedRMSNorm + + norm = MixedFusedRMSNorm + else: + norm = RMSNorm elif neox_args.norm == "layernorm": eps = neox_args.layernorm_epsilon if neox_args.layernorm_fusion: from .fused_layer_norm import MixedFusedLayerNorm + norm = MixedFusedLayerNorm else: norm = LayerNorm elif neox_args.norm == "scalenorm": eps = neox_args.scalenorm_epsilon norm = ScaleNorm + elif neox_args.norm == "te_rmsnorm": + from .transformer_engine import TERMSNorm + + norm = TERMSNorm + eps = neox_args.rms_norm_epsilon + elif neox_args.norm == "te_layernorm": + from .transformer_engine import TELayerNorm + + norm = TELayerNorm + eps = neox_args.layernorm_epsilon else: raise ValueError(f"norm {neox_args.norm} not recognized") return norm, eps diff --git a/megatron/model/rwkv/v6/rwkv.py b/megatron/model/rwkv/v6/rwkv.py index 5d4e0d144..b3741a3fc 100644 --- a/megatron/model/rwkv/v6/rwkv.py +++ b/megatron/model/rwkv/v6/rwkv.py @@ -247,11 +247,11 @@ def __init__(self, neox_args, layer_number): self.time_maa_k = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) self.time_maa_r = nn.Parameter(1.0 - torch.pow(ddd, ratio_1_to_almost0)) - self.key = nn.Linear(neox_args.hidden_size, neox_args.dim_ffn, bias=False) + self.key = nn.Linear(neox_args.hidden_size, neox_args.ffn_dim, bias=False) self.receptance = nn.Linear( neox_args.hidden_size, neox_args.hidden_size, bias=False ) - self.value = nn.Linear(neox_args.dim_ffn, neox_args.hidden_size, bias=False) + self.value = nn.Linear(neox_args.ffn_dim, neox_args.hidden_size, bias=False) def forward(self, x): xx = self.time_shift(x) - x @@ -275,14 +275,23 @@ def __init__(self, neox_args, layer_number): self.layer_number = layer_number self.fp16 = neox_args.precision == "fp16" self.bf16 = neox_args.precision == "bfloat16" + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" if not hasattr(neox_args, "dim_att"): neox_args.dim_att = neox_args.hidden_size - if not hasattr(neox_args, "dim_ffn"): - # Make hidden size 3.5x. 
Round to nearest multiple of 32 until we add hdim rounding logic - neox_args.dim_ffn = int((neox_args.hidden_size * 3.5) // 32 * 32) + if neox_args.intermediate_size: + neox_args.ffn_dim = neox_args.intermediate_size + else: + self.expand = ( + neox_args.expansion_factor if neox_args.expansion_factor else 3.5 + ) + neox_args.ffn_dim = int(self.expand * neox_args.hidden_size) + # Make hidden size 3.5x by default. Round to nearest multiple of 32 until we add hdim rounding logic + neox_args.ffn_dim = int(neox_args.ffn_dim // 32 * 32) assert neox_args.hidden_size % 32 == 0 assert neox_args.dim_att % 32 == 0 - assert neox_args.dim_ffn % 32 == 0 + assert neox_args.ffn_dim % 32 == 0 self.neox_args.head_size = neox_args.dim_att // neox_args.num_attention_heads self.head_size = self.neox_args.head_size self.num_attention_heads = neox_args.num_attention_heads diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 694d58166..08436d54c 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -18,6 +18,8 @@ """Transformer.""" import math +from contextlib import nullcontext + import torch import torch.nn.functional as F import torch.nn as nn @@ -48,6 +50,11 @@ ) from megatron.model.utils import configure_sparse_attention +try: + from flash_attn.ops.activations import swiglu +except ImportError: + swiglu = None + # flags required to enable jit fusion kernels torch._C._jit_set_profiling_mode(False) torch._C._jit_set_profiling_executor(False) @@ -93,48 +100,71 @@ def __init__( init_method, output_layer_init_method, parallel_output=False, + multiple_of=256 ): super().__init__() + assert ( + neox_args.intermediate_size == None or neox_args.expansion_factor == None + ), "Must pass either the absolute intermediate size or the relative expansion factor for the mamba projections" - self.activation_func = get_activation(neox_args) + self.activation_func, self.is_gated = get_activation(neox_args) self.activation_type = neox_args.activation self.bias_gelu_fusion = neox_args.bias_gelu_fusion + self.multiple_of = multiple_of - # auto scale so geglu has equal parameters - ff_mult = int(4 * 2 / 3) if self.activation_type == "geglu" else 4 - ff_dim = ( - int(ff_mult * neox_args.hidden_size) * 2 - if self.activation_type == "geglu" - else ff_mult * neox_args.hidden_size + if neox_args.intermediate_size: + ffn_dim = neox_args.intermediate_size + elif neox_args.expansion_factor: + ffn_dim = int(neox_args.expansion_factor * neox_args.hidden_size) + else: + # 4h is default for ffn_dim + ffn_dim = 4 * neox_args.hidden_size + ffn_dim_in = ffn_dim + if self.is_gated: + # set activation function to be gated implementation + self.activation_func = Gated_Activation( + self.activation_func, + (swiglu is not None) + and (neox_args.activation == "swiglu") + and neox_args.use_flashattn_swiglu, + ) + # auto scale so gated activations has equal parameters + ffn_dim = int(ffn_dim * 2 / 3) + ffn_dim_in = ffn_dim // 2 + # set multiple + ffn_dim = int( + (2 * self.multiple_of) + * ((ffn_dim + (2 * multiple_of) - 1) // (2 * multiple_of)) + ) + ffn_dim_in = int( + self.multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of) ) - self.dense_h_to_4h = mpu.ColumnParallelLinear( + + self.linear1 = mpu.ColumnParallelLinear( neox_args=neox_args, input_size=neox_args.hidden_size, - output_size=ff_dim, + output_size=ffn_dim, gather_output=False, init_method=init_method, skip_bias_add=True, + bias=neox_args.use_bias_in_mlp ) - ff_dim_in = ff_dim // 2 if self.activation_type == "geglu" else ff_dim 
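The width arithmetic introduced above is easier to check with concrete numbers. The helper below simply repeats the same calculation outside the class; the function name, the sample `hidden_size=4096`, and `multiple_of=256` (the `ParallelMLP` keyword default; the `mlp_multiple_of` config default later in this diff is 1) are illustrative choices of mine.

```python
def mlp_widths(hidden_size, multiple_of=256, intermediate_size=None,
               expansion_factor=None, is_gated=True):
    """Re-run the ParallelMLP sizing arithmetic from this diff (illustration only)."""
    if intermediate_size:
        ffn_dim = intermediate_size
    elif expansion_factor:
        ffn_dim = int(expansion_factor * hidden_size)
    else:
        ffn_dim = 4 * hidden_size              # 4h default
    ffn_dim_in = ffn_dim
    if is_gated:
        ffn_dim = int(ffn_dim * 2 / 3)         # keep parameter count close to the non-gated MLP
        ffn_dim_in = ffn_dim // 2              # down-projection only sees the value half
        # round the fused (value + gate) width up to 2*multiple_of, and the half to multiple_of
        ffn_dim = (2 * multiple_of) * ((ffn_dim + 2 * multiple_of - 1) // (2 * multiple_of))
        ffn_dim_in = multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of)
    return ffn_dim, ffn_dim_in

# hidden_size 4096 with a gated activation: linear1 emits 11264 (= 2 * 5632), linear2 consumes 5632
print(mlp_widths(4096))  # (11264, 5632)
```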
# Project back to h. - self.dense_4h_to_h = mpu.RowParallelLinear( + self.linear2 = mpu.RowParallelLinear( neox_args=neox_args, - input_size=ff_dim_in, + input_size=ffn_dim_in, output_size=neox_args.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, parallel_output=parallel_output, skip_bias_add=True, + bias=neox_args.use_bias_in_mlp, ) - def forward(self, hidden_states): + # [s, b, intermediate_size] + intermediate_parallel, bias_parallel = self.linear1(hidden_states) - # [s, b, 4hp] - intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states) - - if ( - self.activation_type == "gelu" and self.bias_gelu_fusion - ) or self.activation_type == "geglu": + if self.is_gated or (self.activation_type == "gelu" and self.bias_gelu_fusion): intermediate_parallel = self.activation_func( intermediate_parallel, bias_parallel ) @@ -144,76 +174,27 @@ def forward(self, hidden_states): ) # [s, b, h] - output, output_bias = self.dense_4h_to_h(intermediate_parallel) + output, output_bias = self.linear2(intermediate_parallel) return output, output_bias -class LLaMAParallelMLP(nn.Module): - """LLaMA's MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. At the end, dropout is also - applied. - - Note: multiple_of is used to compute the hidden dimension of the MLP - """ - - def __init__( - self, - neox_args, - init_method, - output_layer_init_method, - parallel_output=False, - multiple_of=256, - ): +class Gated_Activation(torch.nn.Module): + def __init__(self, activation_func, use_swiglu=False): super().__init__() - - self.activation_func = get_activation(neox_args) - self.activation_type = neox_args.activation - - self.multiple_of = multiple_of - - # Allow custom intermediate size, e.g. 
for Mistral - if neox_args.intermediate_size is not None: - ff_dim = neox_args.intermediate_size + self.activation_func = activation_func + self.use_swiglu = use_swiglu + + def forward(self, x, bias=None): + x, gate = x.chunk(2, dim=-1) + if bias is not None: + bias_1, bias_2 = bias.chunk(2, dim=-1) + x = x + bias_1 + gate = gate + bias_2 + if not self.use_swiglu: + intermediate_parallel = self.activation_func(gate) + return intermediate_parallel * x else: - ff_dim = int(2 * neox_args.hidden_size * 4 / 3) - ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) - - self.w1 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - ) - self.w3 = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=ff_dim, - gather_output=False, - init_method=init_method, - skip_bias_add=True, - bias=False, - ) - self.w2 = mpu.RowParallelLinear( - neox_args=neox_args, - input_size=ff_dim, - output_size=neox_args.hidden_size, - input_is_parallel=True, - init_method=output_layer_init_method, - skip_bias_add=True, - parallel_output=parallel_output, - bias=False, - ) - - def forward(self, hidden_states): - w1_out, _ = self.w1(hidden_states) - w3_out, _ = self.w3(hidden_states) - return self.w2(self.activation_func(w1_out) * w3_out) + return swiglu(gate, x) class ParallelLinear(nn.Module): @@ -229,7 +210,8 @@ def __init__( is_last_layer=False, ): super().__init__() - parallelism = neox_args.output_layer_parallelism + self.is_rm = neox_args.train_impl == "rm" + parallelism = neox_args.output_layer_parallelism if not self.is_rm else "row" if parallelism == "column": self.final_linear = mpu.ColumnParallelLinear( neox_args=neox_args, @@ -240,27 +222,43 @@ def __init__( gather_output=not parallel_output, skip_bias_add=False, mup_rescale_parameters=is_last_layer, # rescale params only called if neox_args.use_mup = True, despite it not being included here + seq_dim=1, # important: must mark that this layer receives shape [b, s, h] not [s, b, h] and so Seq. Parallel comms must gather along dim=1 rather than dim=0 ) - - # else: - # print( - # 'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.' - # ) - # exit() - # self.final_linear = mpu.RowParallelLinear( - # neox_args=neox_args, - # input_size=neox_args.hidden_size, - # output_size=neox_args.padded_vocab_size, - # bias=False, - # input_is_parallel=False, - # init_method=init_method, - # parallel_output=parallel_output, - # skip_bias_add=False, - # mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here - # ) + else: + if not self.is_rm: + print( + 'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.' 
+ ) + exit() + # self.final_linear = mpu.RowParallelLinear( + # neox_args=neox_args, + # input_size=neox_args.hidden_size, + # output_size=neox_args.padded_vocab_size, + # bias=False, + # input_is_parallel=False, + # init_method=init_method, + # parallel_output=parallel_output, + # skip_bias_add=False, + # mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here + # ) + else: # Not using cross entropy loss for RMs + self.rm_linear = mpu.RowParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=1, + bias=False, + input_is_parallel=False, + init_method=init_method, + parallel_output=False, + skip_bias_add=False, + mup_rescale_parameters=is_last_layer, # only called if neox_args.use_mup = True, despite it not being included here + ) def forward(self, hidden_states): - return self.final_linear(hidden_states) + if not self.is_rm: + return self.final_linear(hidden_states) + else: + return self.rm_linear(hidden_states) class ParallelSelfAttention(nn.Module): @@ -699,9 +697,13 @@ def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask): rpe = self.rpe(query_layer.size(0), key_layer.size(0)) else: rpe = None - return self.sparse_attn( + attn_scores = self.sparse_attn( query_layer, key_layer, value_layer, attn_mask=attn_mask, rpe=rpe ) + # apply dropout + if self.training: + attn_scores = self.attention_dropout(attn_scores) + return attn_scores def gqa_project(self, hidden_states, attention_mask, layer_past=None): # QKV projection and separation into separate Q/K/V layers for GQA, @@ -712,51 +714,16 @@ def gqa_project(self, hidden_states, attention_mask, layer_past=None): # pass through projection: [sq, b, h] --> [sq, b, ((np + 2 * kvp) * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) - # First: reshape so we have seqlen, batch, and num. query heads each as separate dims - # Final dim is not exactly head dim: the first (head dim) dims are query heads, - # The last (head dim * ratio of kv to q heads) each are the "k/v heads" - # (right now we treat like we have same num. heads, but smaller head dim) - - # [sq, b, ((np + 2 * kvp) * hn)] --> [sq, b, np, (hn * (1 + 2 * (kvp / np)))] - new_qkv_shape = ( - mixed_x_layer.shape[0], - mixed_x_layer.shape[1], - self.num_attention_heads_per_partition, - int( - self.hidden_size_per_attention_head - * ( - 1 - + 2 - * ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - ) - ), - ) - mixed_x_layer = mixed_x_layer.reshape(*new_qkv_shape) - - # Next: split our fake head dim. 
(last dim) so that the first (head dim) dimensions go to Q, - # the last smaller 2 * (head dim * kv to q head ratio) each divided between K and V separately + # split the last dim, so that the first (q head * head dim) dimensions go to Q, + # the last smaller 2 * (kv head * head dim) each divided between K and V separately split_sizes = ( - self.hidden_size_per_attention_head, - int( - ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - * self.hidden_size_per_attention_head - ), - int( - ( - self.num_kv_heads_per_partition - / self.num_attention_heads_per_partition - ) - * self.hidden_size_per_attention_head - ), + self.num_attention_heads_per_partition + * self.hidden_size_per_attention_head, + self.num_kv_heads_per_partition * self.hidden_size_per_attention_head, + self.num_kv_heads_per_partition * self.hidden_size_per_attention_head, ) - # [sq, b, np, (hn * (1 + 2 * (kvp / np)))] --> 1 x [sq, b, np, hn] , 2 x [sq, b, np, (hn * (kvp / np))] + # [sq, b, ((np + 2 * kvp) * hn)] --> 1 x [sq, b, np * hn] , 2 x [sq, b, kvp * hn] (query_layer, key_layer, value_layer) = [ x.contiguous() for x in torch.split( @@ -766,6 +733,17 @@ def gqa_project(self, hidden_states, attention_mask, layer_past=None): ) ] + # reshape Q to proper output shape (last dim = correct full "real" head size again) + # [sq, b, np * hn] --> [sq, b, np, hn] + new_query_shape = ( + query_layer.size(0), + query_layer.size(1), + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + + query_layer = query_layer.view(*new_query_shape) + # reshape K/V to proper output shape (last dim = correct full "real" head size again) # 2 x [sq, b, np, (hn * (kvp / np))] --> 2 x [sq, b, kvp, hn] new_kv_shape = ( @@ -956,7 +934,7 @@ def __init__( self.bias_dropout_fusion = neox_args.bias_dropout_fusion self.gpt_j_residual = neox_args.gpt_j_residual self.gpt_j_tied = neox_args.gpt_j_tied - self.mlp_type = neox_args.mlp_type + self.activation = neox_args.activation self.num_experts = ( neox_args.moe_num_experts if layer_number % neox_args.moe_expert_interval == 0 @@ -964,7 +942,14 @@ def __init__( ) if self.gpt_j_residual: - self.reduce = mpu.mappings.reduce_from_model_parallel_region + # GPT-J style layers allow us to defer the reduction of results across TP ranks until the end of the two sublayers. + # the reduction we use is a simple allreduce for pure Tensor Parallel, + # but needs to be a reduce-scatter when using Megatron-style Sequence Parallel (LN sharding.) + self.reduce = ( + mpu.mappings.reduce_from_model_parallel_region + if not neox_args.sequence_parallel + else mpu.mappings.reduce_scatter_to_sequence_parallel_region + ) # Self attention. 
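The rewritten `gqa_project` above swaps the old per-head "fake head dim" reshape for a flat split of the fused QKV projection followed by per-tensor reshapes. The toy walkthrough below shows only that shape bookkeeping; the sizes (8 tokens, 2 sequences, 16 query heads, 4 KV heads, head dim 64) are made up, and the real code operates on per-partition head counts with the rest of the attention machinery around it.

```python
import torch

sq, b, np_, kvp, hn = 8, 2, 16, 4, 64                      # illustrative sizes only
mixed = torch.randn(sq, b, (np_ + 2 * kvp) * hn)           # fused QKV: [sq, b, (np + 2*kvp) * hn]

# flat split over the last dim: [np*hn | kvp*hn | kvp*hn]
query, key, value = (
    t.contiguous()
    for t in torch.split(mixed, [np_ * hn, kvp * hn, kvp * hn], mixed.dim() - 1)
)

query = query.view(sq, b, np_, hn)                         # [sq, b, np, hn]
key = key.view(sq, b, kvp, hn)                             # [sq, b, kvp, hn]
value = value.view(sq, b, kvp, hn)

print(query.shape, key.shape, value.shape)
# torch.Size([8, 2, 16, 64]) torch.Size([8, 2, 4, 64]) torch.Size([8, 2, 4, 64])
```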
self.attention = ParallelSelfAttention( @@ -984,31 +969,20 @@ def __init__( # leads to cleaner code self.post_attention_layernorm = norm(neox_args.hidden_size, eps=eps) - # Dense MLP selector - def get_mlp(mlp_type, **kw): - if mlp_type == "regular": - return ParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - elif mlp_type == "llama": - return LLaMAParallelMLP( - neox_args=neox_args, - init_method=init_method, - output_layer_init_method=output_layer_init_method, - parallel_output=self.gpt_j_residual, - **kw, - ) - else: - raise KeyError(mlp_type) + # MLP + def get_mlp(**kw): + return ParallelMLP( + neox_args=neox_args, + init_method=init_method, + output_layer_init_method=output_layer_init_method, + parallel_output=self.gpt_j_residual, + multiple_of=neox_args.mlp_multiple_of, + **kw, + ) # Dense MLP if self.num_experts <= 1: - self.mlp = get_mlp(neox_args.mlp_type) - # Dropless MoE MLP + self.mlp = get_mlp() else: self.mlp = ParallelDroplessMoE( neox_args=neox_args, @@ -1058,23 +1032,27 @@ def forward(self, x, attention_mask, layer_past=None): attention_output, presents = attention_output self.layer_past = presents - with torch.enable_grad(): - attention_output = bias_dropout_fn( - attention_output, - bias=attention_bias.expand_as(attention_output), - residual=None, - prob=self.hidden_dropout, - ) + if attention_bias is not None: + with torch.enable_grad() if not self.eval else nullcontext(): + attention_output = bias_dropout_fn( + attention_output, + bias=attention_bias.expand_as(attention_output), + residual=None, + prob=self.hidden_dropout, + ) # mlp operator mlp_output, mlp_bias = self.mlp(x2) - with torch.enable_grad(): - output = bias_dropout_fn( - mlp_output, - bias=mlp_bias.expand_as(mlp_output), - residual=attention_output, - prob=self.hidden_dropout, - ) + if mlp_bias is not None: + with torch.enable_grad() if not self.eval else nullcontext(): + output = bias_dropout_fn( + mlp_output, + bias=mlp_bias.expand_as(mlp_output), + residual=attention_output, + prob=self.hidden_dropout, + ) + else: + output = mlp_output # output = (x + attn(ln(x)) + mlp(ln(x)) output = residual + self.reduce(output) @@ -1092,7 +1070,7 @@ def forward(self, x, attention_mask, layer_past=None): if self.use_cache: attention_output, presents = attention_output self.layer_past = presents - with torch.enable_grad(): + with torch.enable_grad() if not self.eval else nullcontext(): if attention_bias is not None: # Use special bias_dropout_fn if we have a bias term from the above attention layer attention_output = bias_dropout_fn( @@ -1118,9 +1096,9 @@ def forward(self, x, attention_mask, layer_past=None): # call signatures of both dense and MoE are the same mlp_output, mlp_bias = self.mlp(layernorm_output) - with torch.enable_grad(): - # dense llama MLP and MoE don't support bias - if self.mlp_type == "llama" or self.num_experts > 1: + with torch.enable_grad() if not self.eval else nullcontext(): + # MoE don't support bias + if mlp_bias == None or self.num_experts > 1: # No dropout either assert mlp_bias is None output = mlp_output + attention_output @@ -1173,10 +1151,25 @@ def forward(self, args): return self.norm(args) -def parallel_lm_logits(input_, word_embeddings_weight, parallel_output, bias=None): +def parallel_lm_logits( + input_, + word_embeddings_weight, + parallel_output, + seq_parallel=False, + seq_dim=1, + bias=None, +): """LM logits using word embedding weights.""" # Parallel logits. 
- input_parallel = mpu.copy_to_model_parallel_region(input_) + if seq_parallel: + # if using Sequence Parallelism, our logits are sharded along the sequence dimension. + # gather them here. (backward pass: reduce-scatter) + input_parallel = mpu.gather_from_sequence_parallel_region( + input_, seq_dim=seq_dim + ) + else: + # Set up backprop all-reduce. + input_parallel = mpu.copy_to_model_parallel_region(input_) # Matrix multiply. if bias is None: diff --git a/megatron/model/transformer_engine.py b/megatron/model/transformer_engine.py new file mode 100644 index 000000000..338513a97 --- /dev/null +++ b/megatron/model/transformer_engine.py @@ -0,0 +1,137 @@ +import torch + +try: + import transformer_engine as te +except ImportError: + raise ImportError( + "Unable to import transformer-engine. Please refer to " + "https://github.com/NVIDIA/TransformerEngine for installation instructions." + ) + + +class TERMSNorm(torch.nn.Module): + def __init__(self, dim, eps=1e-8, **kwargs): + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `RMSNorm` based on input + :param dim: model size + :param eps: epsilon value, default 1e-8 + """ + super(TERMSNorm, self).__init__() + + self.d = dim + self.eps = eps + self.norm = te.pytorch.RMSNorm( + hidden_size=self.d, + eps=self.eps, + **kwargs, + ) + + def forward(self, x): + return self.norm(x) + + +class TELayerNorm(torch.nn.Module): + def __init__(self, dim, eps=1.0e-5, **kwargs): + """ + A conditional wrapper to initialize an instance of Transformer-Engine's + `LayerNorm` based on input + :param dim: model size + :param eps: epsilon value, default 1.0e-5 + """ + super(TELayerNorm, self).__init__() + + self.d = dim + self.eps = eps + self.norm = te.pytorch.LayerNorm( + hidden_size=self.d, + eps=self.eps, + **kwargs, + ) + + def forward(self, x): + return self.norm(x) + + +class TELinear(te.pytorch.Linear): + """ + Wrapper for the Transformer-Engine's `Linear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear): + """ + Wrapper for the Transformer-Engine's `LayerNormLinear` layer that combines + layernorm and linear layers + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEColumnParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `ColumnParallelLinear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TERowParallelLinear(TELinear): + """ + Wrapper for the Transformer-Engine's `Linear` layer but specialized similar + to megatron's `RowParallelLinear` layer. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEDotProductAttention(te.pytorch.DotProductAttention): + """ + Wrapper for the Transformer-Engine's `DotProductAttention` layer that also + has "flash attention" enabled. + """ + + def __init__(self): + # TODO + return + + def forward(self, x): + # TODO + return + + +class TEDelayedScaling(te.common.recipe.DelayedScaling): + """ + Wrapper for the Transformer-Engine's `DelayedScaling` layer. 
+ """ + + def __init__(self): + # TODO + return diff --git a/megatron/model/utils.py b/megatron/model/utils.py index c3da2ce8b..8176f1f7a 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -18,14 +18,18 @@ """Utilities for models.""" import torch -from megatron.model.norms import LayerNorm, RMSNorm, ScaleNorm from megatron.model.fused_softmax import SoftmaxFusionTypes +from megatron import mpu from types import GeneratorType import torch.distributed as dist +import importlib +from typing import List, Dict, Any -def get_params_for_weight_decay_optimization(module, neox_args): - """Divide params into with-weight-decay and without-weight-decay groups. + +def get_params_for_weight_decay_optimization(module: Any, neox_args: Any): + """ + Divide params into with-weight-decay and without-weight-decay groups. Layernorms and biases will have no weight decay but the rest will. """ weight_decay_params = {"params": [], "name": "weight_decay_params"} @@ -34,41 +38,38 @@ def get_params_for_weight_decay_optimization(module, neox_args): "weight_decay": 0.0, "name": "no_weight_decay_params", } - for module_ in module.modules(): - if any( - [ - isinstance(module_, LayerNorm), - isinstance(module_, RMSNorm), - isinstance(module_, ScaleNorm), + + def is_no_weight_decay_module(module_: Any) -> bool: + return ( + type(module_).__name__ + in [ + "LayerNorm", + "RMSNorm", + "ScaleNorm", + "TELayerNorm", + "TERMSNorm", + "MixedFusedLayerNorm", + "MixedFusedRMSNorm", ] - ) or ( - neox_args.weight_decay == 0.0 - ): # also include all parameters here if no weight decay is being done + or neox_args.weight_decay == 0.0 + ) + + for module_ in module.modules(): + if is_no_weight_decay_module(module_): no_weight_decay_params["params"].extend( - [p for p in list(module_._parameters.values()) if p is not None] + [p for p in module_._parameters.values() if p is not None] ) else: - weight_decay_params["params"].extend( - [ - p - for n, p in list(module_._parameters.items()) - if p is not None - and n != "bias" - and not getattr(p, "_no_weight_decay", False) - ] - ) - no_weight_decay_params["params"].extend( - [ - p - for n, p in list(module_._parameters.items()) - if p is not None - and (n == "bias" or getattr(p, "_no_weight_decay", False)) - ] - ) + for name, param in module_._parameters.items(): + if param is None: + continue + if name == "bias" or getattr(param, "_no_weight_decay", False): + no_weight_decay_params["params"].append(param) + else: + weight_decay_params["params"].append(param) + if neox_args.weight_decay == 0.0: - # only return a single param group - # with onebitadam, we want to minimize the calls to compressed_allreduce. Every param group calls it once. - # to avoid this, only use a single param group when weight decay is off. + # Only return a single param group to minimize calls to compressed_allreduce with onebitadam return [no_weight_decay_params] return weight_decay_params, no_weight_decay_params @@ -359,3 +360,45 @@ def get_fusion_type(neox_args): elif neox_args.scaled_masked_softmax_fusion: fusion_type = SoftmaxFusionTypes.general return fusion_type + + +def reduce_weight_grads_from_model_parallel_region(input_): + """A hook that can be applied to any weight tensor via .register_hook(). + Allreduces grads for e.g. LN weights across the model parallel group. + Needed to keep LNs in sync, despite them getting diff data -> diff gradients when using sequence parallel. + """ + # Bypass the function if no TP -> no comm needed. 
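The rewritten grouping helper above filters norm layers by class name instead of importing each norm class, and still returns plain optimizer param-group dicts. If it helps to see where those dicts end up, here is a rough standalone analogue for an ordinary `nn.Module`; the toy model, the `toy_param_groups` name, and the direct `AdamW` call are mine for illustration and are not how the trainer actually constructs its DeepSpeed-wrapped optimizer.

```python
import torch
import torch.nn as nn

NO_WEIGHT_DECAY_CLASSES = {"LayerNorm", "RMSNorm", "ScaleNorm", "TELayerNorm",
                           "TERMSNorm", "MixedFusedLayerNorm", "MixedFusedRMSNorm"}

def toy_param_groups(model: nn.Module, weight_decay: float = 0.1):
    """Rough analogue of get_params_for_weight_decay_optimization for a plain module."""
    decay = {"params": [], "weight_decay": weight_decay, "name": "weight_decay_params"}
    no_decay = {"params": [], "weight_decay": 0.0, "name": "no_weight_decay_params"}
    for module in model.modules():
        no_wd_module = type(module).__name__ in NO_WEIGHT_DECAY_CLASSES
        for name, param in module._parameters.items():
            if param is None:
                continue
            if no_wd_module or name == "bias" or getattr(param, "_no_weight_decay", False):
                no_decay["params"].append(param)
            else:
                decay["params"].append(param)
    return [decay, no_decay]

model = nn.Sequential(nn.Linear(16, 16), nn.LayerNorm(16), nn.Linear(16, 4))
optimizer = torch.optim.AdamW(toy_param_groups(model), lr=1e-3)
print([len(g["params"]) for g in optimizer.param_groups])  # [2, 4]: weights vs. biases + LayerNorm params
```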
+ if mpu.get_model_parallel_world_size() == 1: + return input_ + + # Bf16 convert + dt = input_.dtype + if dt == torch.bfloat16 and mpu.get_fp32_allreduce(): + input_ = input_.float() + + # All-reduce. + dist.all_reduce(input_, group=mpu.get_model_parallel_group()) + + # Bf16 convert + if dt == torch.bfloat16 and mpu.get_fp32_allreduce(): + input_ = input_.bfloat16() + + return input_ + + +def mark_norms_for_sequence_parallel_grad_sync(module, neox_args): + """Iterate through the modules in our model, and for any "...Norm" classnames, + register a hook on each of that module's parameters which will allreduce norms' weights' grads across + the model (sequence) parallel region. + """ + + if not neox_args.sequence_parallel: + # if we aren't using sequence parallelism, this is a no-op + return + + for module_ in module.modules(): + if "norm" in type(module_).__name__.lower(): + # this is a norm, we want to allreduce its weight grads across sequence parallel region + for name, param in module_.named_parameters(): + if param.requires_grad: + param.register_hook(reduce_weight_grads_from_model_parallel_region) diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index f7372bc55..ce3c1117e 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -50,6 +50,11 @@ def __init__( self.hidden_size = hidden_size self.init_method = init_method self.num_tokentypes = num_tokentypes + + self.sequence_parallel = ( + neox_args.sequence_parallel + ) # if we are using sequence parallelism, then we'll want to scatter our inputs across the seqlen dim across TP ranks + self.use_mup = neox_args.use_mup self.mup_embedding_mult = neox_args.mup_embedding_mult self.mup_rp_embedding_mult = neox_args.mup_rp_embedding_mult @@ -159,6 +164,11 @@ def forward(self, input_ids, position_ids, tokentype_ids=None): with torch.no_grad(): embeddings.mul_(self.mup_embedding_mult) + if self.sequence_parallel: + # TODO: megatron-lm does dropout using the scattered embs. This would save a tiny bit of time, perhaps? + # Not a priority since we don't often use dropout + embeddings = mpu.scatter_to_sequence_parallel_region(embeddings) + return embeddings diff --git a/megatron/mpu/__init__.py b/megatron/mpu/__init__.py index f12ba7da7..318326e5f 100644 --- a/megatron/mpu/__init__.py +++ b/megatron/mpu/__init__.py @@ -51,6 +51,9 @@ from .mappings import gather_from_expert_model_parallel_region from .mappings import reduce_from_model_parallel_region from .mappings import scatter_to_model_parallel_region +from .mappings import reduce_scatter_to_sequence_parallel_region +from .mappings import gather_from_sequence_parallel_region +from .mappings import scatter_to_sequence_parallel_region from .random import checkpoint from .random import get_cuda_rng_tracker diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 19dff0b5f..e67015ecb 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -33,6 +33,8 @@ from .mappings import gather_from_model_parallel_region from .mappings import reduce_from_model_parallel_region from .mappings import scatter_to_model_parallel_region +from .mappings import reduce_scatter_to_sequence_parallel_region +from .mappings import gather_from_sequence_parallel_region from .random import get_cuda_rng_tracker from .utils import divide from .utils import VocabUtility @@ -414,6 +416,7 @@ def __init__( keep_master_weight_for_test=False, skip_bias_add=False, mup_rescale_parameters=False, + seq_dim=0, # Dimension which is the seq_len dimension. 
final ParallelLinear overrides this to be 1 ; otherwise, the default is used throughout. ): super(ColumnParallelLinear, self).__init__() @@ -425,6 +428,10 @@ def __init__( world_size = get_model_parallel_world_size() self.output_size_per_partition = divide(output_size, world_size) self.skip_bias_add = skip_bias_add + + self.sequence_parallel = neox_args.sequence_parallel + self.seq_dim = seq_dim + self.init_method = init_method self.stride = stride self.mup_rescale_parameters = mup_rescale_parameters @@ -549,14 +556,29 @@ def set_parallel_output(self, value: bool): def forward(self, input_): if self.use_mup and self.mup_rescale_parameters: input_ /= self.width_mult() - # Set up backprop all-reduce. - input_parallel = copy_to_model_parallel_region(input_) + + if self.sequence_parallel: + input_parallel = input_ + else: + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) # Matrix multiply. + if self.sequence_parallel: + # do an AG in the fwd pass, RS in bwd pass. + # gather / scatter portion happens across the sequence dim (self.seq_dim)-- + # almost always is [s, b, h] and so dim 0, but for lm_head ParallelLinear it is seq_dim=1 and [b, s, h] + input_parallel = gather_from_sequence_parallel_region( + input_parallel, seq_dim=self.seq_dim + ) + bias = self.bias if not self.skip_bias_add else None output_parallel = F.linear(input_parallel, self.weight, bias) if self.gather_output: # All-gather across the partitions. + assert ( + not self.sequence_parallel + ), "sequence_parallel=True and gather_output=True are incompatible!" output = gather_from_model_parallel_region(output_parallel) else: output = output_parallel @@ -619,6 +641,12 @@ def __init__( self.input_size_per_partition = divide(input_size, world_size) self.skip_bias_add = skip_bias_add self.parallel_output = parallel_output + + self.sequence_parallel = neox_args.sequence_parallel + assert not ( + self.sequence_parallel and not self.input_is_parallel + ), "Cannot have self.input_is_parallel=False and self.sequence_parallel=True." + self.init_method = init_method self.stride = stride self.keep_master_weight_for_test = keep_master_weight_for_test @@ -744,7 +772,12 @@ def forward(self, input_): # Matrix multiply. output_parallel = F.linear(input_parallel, self.weight) # All-reduce across all the partitions. - if not self.parallel_output: + if self.sequence_parallel and not self.parallel_output: + # do an RS in the fwd pass, AG in bwd pass. + # skip in the gpt-j parallel sublayer case (self.parallel_output=True) + # (user responsible for calling reduce-scatter) + output_ = reduce_scatter_to_sequence_parallel_region(output_parallel) + elif not self.parallel_output: output_ = reduce_from_model_parallel_region(output_parallel) else: output_ = output_parallel diff --git a/megatron/mpu/mappings.py b/megatron/mpu/mappings.py index 5a2880b46..7fdef841b 100644 --- a/megatron/mpu/mappings.py +++ b/megatron/mpu/mappings.py @@ -25,7 +25,7 @@ get_fp32_allreduce, get_expert_token_counts_for_rank, ) -from .utils import split_tensor_along_last_dim +from .utils import split_tensor_along_last_dim, split_tensor_along_any_dim def _reduce(input_): @@ -35,17 +35,17 @@ def _reduce(input_): if get_model_parallel_world_size() == 1: return input_ - # Bf16 convert + # upcast to fp32 if using fp32 allreduce dt = input_.dtype - if dt == torch.bfloat16 and get_fp32_allreduce(): + if get_fp32_allreduce(): input_ = input_.float() # All-reduce. 
torch.distributed.all_reduce(input_, group=get_model_parallel_group()) - # Bf16 convert - if dt == torch.bfloat16 and get_fp32_allreduce(): - input_ = input_.bfloat16() + # reconvert to original Bf16/Fp16 dtype + if get_fp32_allreduce(): + input_ = input_.to(dt) return input_ @@ -77,11 +77,6 @@ def _gather(input_): if world_size == 1: return input_ - # Bf16 convert - dt = input_.dtype - if dt == torch.bfloat16 and get_fp32_allreduce(): - input_ = input_.float() - # Size and dimension. last_dim = input_.dim() - 1 rank = get_model_parallel_rank() @@ -185,9 +180,102 @@ def _dmoe_gather(input_: torch.Tensor, tokens_per_expert: torch.Tensor): # Note: torch.cat already creates a contiguous tensor. output = torch.cat(tensor_list, dim=gather_dim) - # Bf16 convert - if dt == torch.bfloat16 and get_fp32_allreduce(): - output = output.bfloat16() + return output + + +def _reduce_scatter_along_seq_dim(input_, seq_dim): + """Reduce-scatter the input tensor across model parallel group, scattering across sequence dim.""" + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # upcast to fp32 if using fp32 allreduce + dt = input_.dtype + if get_fp32_allreduce(): + input_ = input_.float() + + dim_size = list(input_.size()) + assert ( + isinstance(seq_dim, int) and seq_dim < len(dim_size) and seq_dim >= 0 + ), "seq_dim must be a valid tensor dim" + assert dim_size[seq_dim] % world_size == 0 + + if seq_dim == 0: + # reduce_scatter_tensor is faster but only works correctly on dimension 0 + dim_size[seq_dim] = dim_size[seq_dim] // world_size + output = torch.empty( + dim_size, dtype=input_.dtype, device=torch.cuda.current_device() + ) + torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=get_model_parallel_group() + ) + else: + tensor_list = list( + torch.split(input_, input_.shape[seq_dim] // world_size, seq_dim) + ) + output = torch.empty_like(tensor_list[0]) + torch.distributed.reduce_scatter( + output, tensor_list, group=get_model_parallel_group() + ) + + # reconvert to original Bf16/Fp16 dtype + if get_fp32_allreduce(): + output = output.to(dt) + + return output + + +def _gather_along_seq_dim(input_, seq_dim): + """Gather tensors and concatenate along the (manually-specified) sequence dimension.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU.
+ if world_size == 1: + return input_ + + dim_size = list(input_.size()) + assert ( + isinstance(seq_dim, int) and seq_dim < len(dim_size) and seq_dim >= 0 + ), "seq_dim must be a valid tensor dim" + dim_size[seq_dim] = dim_size[seq_dim] * world_size + + if seq_dim == 0: + # all_gather_into_tensor is faster but only works correctly on dimension 0 + output = torch.empty( + dim_size, dtype=input_.dtype, device=torch.cuda.current_device() + ) + torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=get_model_parallel_group() + ) + else: + input_ = input_.contiguous() + rank = get_model_parallel_rank() + tensor_list = [torch.empty_like(input_) for _ in range(world_size)] + tensor_list[rank] = input_ + torch.distributed.all_gather( + tensor_list, input_, group=get_model_parallel_group() + ) + output = torch.cat(tensor_list, dim=seq_dim) + + return output + + +def _split_along_seq_dim(input_, seq_dim): + """Split the tensor along the sequence dimension (as manually selected) and keep the + corresponding slice.""" + + world_size = get_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + # Split along the (manually-selected) sequence dimension. + input_list = split_tensor_along_any_dim(input_, world_size, seq_dim) + + # Note: torch.split does not create contiguous tensors by default. + rank = get_model_parallel_rank() + output = input_list[rank].contiguous() return output @@ -309,6 +397,65 @@ def backward(ctx, grad_output): return _dmoe_split(grad_output, tokens_per_expert), None +class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function): + """Reduce-Scatter across sequence parallel region (same as model parallel region.) + Note: same region as model parallel region + """ + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _reduce_scatter_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _reduce_scatter_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return _gather_along_seq_dim(grad_output, seq_dim=seq_dim), None + + +class _GatherFromSequenceParallelRegion(torch.autograd.Function): + """All-Gather across sequence parallel region (same region as model parallel region.)""" + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _gather_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _gather_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return _reduce_scatter_along_seq_dim(grad_output, seq_dim=seq_dim), None + + +class _ScatterToSequenceParallelRegion(torch.autograd.Function): + """Scatter (split) sequence length across sequence parallel region (=> same region as model parallel.)""" + + @staticmethod + def symbolic(graph, input_, seq_dim): + return _split_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def forward(ctx, input_, seq_dim): + ctx.seq_dim = seq_dim + return _split_along_seq_dim(input_, seq_dim=seq_dim) + + @staticmethod + def backward(ctx, grad_output): + seq_dim = ctx.seq_dim + return ( + _gather_along_seq_dim(grad_output, seq_dim=seq_dim), + None, + ) + + # ----------------- # Helper functions.
# ----------------- @@ -336,3 +483,17 @@ def gather_from_model_parallel_region(input_): def gather_from_expert_model_parallel_region(input_, tokens_per_expert): return _GatherFromExpertModelParallelRegion.apply(input_, tokens_per_expert) + + +def reduce_scatter_to_sequence_parallel_region(input_, seq_dim=0): + return _ReduceScatterToSequenceParallelRegion.apply(input_, seq_dim) + + +def gather_from_sequence_parallel_region(input_, seq_dim=0): + return _GatherFromSequenceParallelRegion.apply(input_, seq_dim) + + +def scatter_to_sequence_parallel_region( + input_, seq_dim=1 +): # use this fn in scattering input embeds across TP ranks. There, shape of inps is [b, s, h] instead of the usual [s, b, h] + return _ScatterToSequenceParallelRegion.apply(input_, seq_dim) diff --git a/megatron/mpu/utils.py b/megatron/mpu/utils.py index 13941dc29..1f97e0e76 100644 --- a/megatron/mpu/utils.py +++ b/megatron/mpu/utils.py @@ -53,6 +53,28 @@ def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks= return tensor_list +def split_tensor_along_any_dim( + tensor, num_partitions, seq_dim, contiguous_split_chunks=False +): + """Split a tensor along a user-specified dimension. + Arguments: + tensor: input tensor. + num_partitions: number of partitions to split the tensor + seq_dim: dimension along which to split the tensor + contiguous_split_chunks: If True, make each chunk contiguous + in memory. + """ + # Get the size and dimension. + seq_dim_size = divide(tensor.size()[seq_dim], num_partitions) + # Split. + tensor_list = torch.split(tensor, seq_dim_size, dim=seq_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + class VocabUtility: """Split the vocabulary into `world_size` chunks amd return the first and last index of the vocabulary belonging to the `rank` diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index a41874971..5948e6892 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -51,6 +51,20 @@ ATTENTION_TYPE_CHOICES, ) +### ANSI escape codes ### +END = "\033[0m" +GREEN = "\033[92m" +RED = "\033[91m" +YELLOW = "\033[93m" + +### Formatted logging prefixes ### +ERROR = f"{RED}[ERROR]{END} " +FAIL = f"{RED}[FAIL]{END}" +INFO = "[INFO]" +OKAY = f"{GREEN}[OKAY]{END}" +SUCCESS = f"{GREEN} [SUCCESS] {END}" +WARNING = f"{YELLOW}[WARNING]{END}" + # ZERO defaults by deespeed # These values should not be changed unless defaults in deepspeed are changed # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training @@ -155,7 +169,7 @@ def initialize_tensorboard_writer(self): try: from torch.utils.tensorboard import SummaryWriter - print("> setting tensorboard ...") + print("> setting up tensorboard ...") self.tensorboard_writer = SummaryWriter(log_dir=self.tensorboard_dir) except (ModuleNotFoundError, ImportError): print( @@ -165,6 +179,47 @@ def initialize_tensorboard_writer(self): flush=True, ) + def initialize_comet(self): + if self.use_comet and self.rank == 0: + try: + import comet_ml + + # Deactivate output logging to avoid any potential interference with Tee + self.comet_experiment = comet_ml.start( + workspace=self.comet_workspace, + project=self.comet_project, + experiment_config=comet_ml.ExperimentConfig( + auto_output_logging=False + ), + ) + self.comet_experiment.__internal_api__log_parameters__( + 
self.all_config, + framework="gpt-neox", + source="manual", + flatten_nested=True, + ) + + if self.comet_experiment_name: + self.comet_experiment.set_name(self.comet_experiment_name) + + if self.comet_tags: + self.comet_experiment.add_tags(self.comet_tags) + + if self.comet_others: + self.comet_experiment.log_others(self.comet_others) + + logging.info("> setting up comet ...") + except ImportError as e: + logging.error( + f'{FAIL} importing comet. Comet can be installed with "pip install comet_llm". See https://github.com/comet-ml/comet-llm for more info. Full error is:' + ) + raise e + except Exception as e: + logging.error( + f'{FAIL} Error setting up Comet. Either set "use_comet: False" in your configuration file, or resolve the issue with Comet. Full error is:', + ) + raise e + @classmethod def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None): """ @@ -182,7 +237,6 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None) config_files = dict() # iterate of all to be loaded yaml files for conf_file_name in paths_to_yml_files: - # load file with open(conf_file_name) as conf_file: conf = yaml.load(conf_file, Loader=yaml.FullLoader) @@ -479,7 +533,6 @@ def get_extra_deepspeed_args(self): return extra_ds_args def get_deepspeed_main_args(self): - args_list = list() if self.autotuning_run is not None: @@ -805,7 +858,6 @@ def calculate_batch_parameters( @staticmethod def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc): - assert ( train_batch > 0 ), f"Train batch size: {train_batch} has to be greater than 0" @@ -868,7 +920,8 @@ def calculate_derived(self): dp_world_size = (global_num_gpus / pp_size) / mp_size if not (dp_world_size % 1 == 0): error_message = ( - self.__class__.__name__ + f"{ERROR}" + + self.__class__.__name__ + ".calculate_derived() " + f"(global_num_gpus / pp_size) / mp_size [({global_num_gpus} / {pp_size}) / {mp_size}] must be a whole number" ) @@ -904,38 +957,21 @@ def calculate_derived(self): } ) - # derive steps where checkpoint should be saved - if self.checkpoint_factor or self.extra_save_iters: - if self.extra_save_iters: - save_iters = set(self.extra_save_iters) - else: - save_iters = set() - - step = self.checkpoint_factor # don't save step 0 or 1 - while step < self.train_iters: - save_iters.add(step) - if self.checkpoint_scale == "log": - step *= self.checkpoint_factor - elif self.checkpoint_scale == "linear": - step += self.checkpoint_factor - - save_iters = list(save_iters) - save_iters.sort() - - self.update_values( - { - "save_iters": save_iters, - } - ) - # derive precision - fp16_conflict = "DeepSpeed fp16 field was set but precision conflicts" if self.fp16 and self.fp16.get("enabled", False): if self.precision is None: self.update_value("precision", "fp16") else: + fp16_conflict = "DeepSpeed fp16 field was set but precision conflicts" assert self.precision == "fp16", fp16_conflict + if self.bf16 and self.bf16.get("enabled", False): + if self.precision is None: + self.update_value("precision", "bfloat16") + else: + bf16_conflict = "DeepSpeed bf16 field was set but precision conflicts" + assert self.precision == "bfloat16", bf16_conflict + if self.precision == "fp16": if isinstance(self.fp16, dict) and len(self.fp16) > 0: fp16_args = copy.deepcopy(self.fp16) @@ -944,14 +980,15 @@ def calculate_derived(self): fp16_args = {"type": "fp16", "enabled": True} self.update_value("fp16", fp16_args) elif self.precision == "bfloat16": - bf_config = {"bf16": {"enabled": True}} - # dt_config = 
{"grad_accum_dtype": "fp32"} - if self.deepspeed_extra_args is None: - self.update_value("deepspeed_extra_args", bf_config) - else: - extra_args = copy.deepcopy(self.deepspeed_extra_args) - extra_args.update(bf_config) - self.update_value("deepspeed_extra_args", extra_args) + if not self.bf16: + bf_config = {"bf16": {"enabled": True}} + # dt_config = {"grad_accum_dtype": "fp32"} + if self.deepspeed_extra_args is None: + self.update_value("deepspeed_extra_args", bf_config) + else: + extra_args = copy.deepcopy(self.deepspeed_extra_args) + extra_args.update(bf_config) + self.update_value("deepspeed_extra_args", extra_args) zero_stage = self.zero_optimization["stage"] if self.data_types is None: @@ -1017,6 +1054,10 @@ def calculate_derived(self): ) if self.optimizer_type.lower() == "onebitadam": + assert ( + self.train_iters is not None + ), "OneBitAdam requires train_iters to be specified" + # onebitadam needs to instantiated by deepspeed, and so we need to pass deepspeed scheduler args # for all other optimizers, the scheduling is handled by megatron self.scheduler = { @@ -1037,6 +1078,17 @@ def calculate_derived(self): # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1) + # Do MoE checks + if self.moe_num_experts > 1: + assert not ( + self.is_pipe_parallel or self.pipe_parallel_size > 1 + ), "MoE not supported with pipeline parallelism" + assert self.zero_optimization["stage"] != 3, "MoE not compatible with zero3" + + assert ( + self.sequence_parallel is False + ), "MoE not compatible with Sequence Parallel" + # Attention config if self.attention_config is None: self.update_value("attention_config", [[["global"], self.num_layers]]) @@ -1111,15 +1163,19 @@ def calculate_derived(self): # Adding equal dataset weights if none are provided if self.train_data_paths and (self.train_data_weights is None): self.train_data_weights = [1.0] * len(self.train_data_paths) + elif self.pos_train_data_paths and (self.train_data_weights is None): + self.train_data_weights = [1.0] * len(self.pos_train_data_paths) if self.valid_data_paths and (self.valid_data_weights is None): self.valid_data_weights = [1.0] * len(self.valid_data_paths) + elif self.pos_valid_data_paths and (self.valid_data_weights is None): + self.valid_data_weights = [1.0] * len(self.pos_valid_data_paths) if self.test_data_paths and (self.test_data_weights is None): self.test_data_weights = [1.0] * len(self.test_data_paths) + elif self.pos_test_data_paths and (self.test_data_weights is None): + self.test_data_weights = [1.0] * len(self.pos_test_data_paths) - if self.label_data_paths: - err_str = ( - "Must use `label_data_paths` with `train_data_paths`, not `data_path`" - ) + if self.train_label_data_paths: + err_str = "Must use `train_label_data_paths` with `train_data_paths`, not `data_path`" assert self.train_data_paths and not self.data_path, err_str # if a sample input file is provided, default text_gen_type type to input-file @@ -1159,7 +1215,9 @@ def validate_values(self): # learning rate if self.lr is None: - error_message = self.__class__.__name__ + ".validate_values() lr is None" + error_message = ( + f"{FAIL} " + self.__class__.__name__ + ".validate_values() lr is None" + ) logging.error(error_message) raise ValueError(error_message) return False @@ -1174,7 +1232,8 @@ def validate_values(self): for req_arg in required_args: if getattr(self, req_arg) is None: error_message = ( - self.__class__.__name__ + f"{FAIL}" + + 
self.__class__.__name__ + ".validate_values() " + req_arg + " is None." @@ -1184,9 +1243,12 @@ def validate_values(self): return False # Checks. - if self.hidden_size % self.num_attention_heads != 0: + if self.hidden_size % self.num_attention_heads != 0 and not ( + "mamba" in self.attention_config + ): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() hidden_size must be divisible by num_attention_heads" ) logging.error(error_message) @@ -1196,7 +1258,8 @@ def validate_values(self): if self.seq_length is not None: if not (self.max_position_embeddings >= self.seq_length): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() max_position_embeddings must be bigger or equal seq_length" ) logging.error(error_message) @@ -1205,7 +1268,8 @@ def validate_values(self): if not (self.min_lr <= self.lr): error_message = ( - self.__class__.__name__ + "{FAIL}" + + self.__class__.__name__ + ".validate_values() min_lr must be smaller or equal lr" ) logging.error(error_message) @@ -1218,7 +1282,8 @@ def validate_values(self): and self.extra_save_iters is None ): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() checkpoint_factor or extra_save_iters must be defined if save is defined" ) logging.error(error_message) @@ -1227,10 +1292,10 @@ def validate_values(self): # Parameters sharing does not work with torch DDP. if (self.num_unique_layers is not None) and (self.num_layers is not None): - if not (self.num_unique_layers <= self.num_layers): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() num-unique-layers must be smaller or equal num_layers" ) logging.error(error_message) @@ -1239,7 +1304,8 @@ def validate_values(self): if not (self.num_layers % self.num_unique_layers == 0): error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() num-layers should be divisible by num-unique-layers" ) logging.error(error_message) @@ -1248,7 +1314,8 @@ def validate_values(self): if self.fp16_lm_cross_entropy and self.precision != "fp16": error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_values() lm cross entropy in fp16 only support in fp16 mode." 
) logging.error(error_message) @@ -1266,13 +1333,13 @@ def validate_values(self): ] if all(has_separate_path): assert self.data_path is None, ( - "Please provide *either* `data_path` or `train/valid/test_data_path` " + f"{FAIL} Please provide *either* `data_path` or `train/valid/test_data_path` " "in args " ) # assert that if one of train/test/valid_data_path are provided, all should be assert_error_mess = ( - "One or more of train/valid/test data_path are not provided:\n\t" + f"{FAIL} One or more of train/valid/test data_path are not provided:\n\t" ) assert_error_mess += "\n\t".join( [ @@ -1328,7 +1395,8 @@ def validate_types(self): if actual_value.lower() in lowercase_accepted_values: continue logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_value}' Not in accepted values: '{accepted_values}'" ) @@ -1339,14 +1407,16 @@ def validate_types(self): continue else: logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_type}' not in {accepted_types}" ) return False logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: '{actual_type}' instead of '{field_def.type}'" ) @@ -1368,7 +1438,8 @@ def validate_types(self): return False else: logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'type'" ) @@ -1376,14 +1447,16 @@ def validate_types(self): if "params" in value: if not isinstance(value["params"], dict): logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: key 'params' must be a dict" ) return False else: logging.error( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'params'" ) @@ -1394,7 +1467,8 @@ def validate_types(self): if isinstance(value, dict): if not "enabled" in value: error_message = ( - self.__class__.__name__ + f"{FAIL}" + + self.__class__.__name__ + ".validate_types() " + f"{field_name}: must contain key 'enabled'" ) diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 3083b7282..a464b133c 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -21,9 +21,9 @@ from template import NeoXArgsTemplate try: - from typing import List, Literal, Union + from typing import List, Literal, Union, Optional, Any except ImportError: - from typing_extensions import List, Literal, Union + from typing_extensions import List, Literal, Union, Optional ATTENTION_TYPE_CHOICES = [ @@ -46,7 +46,7 @@ def get_git_commit_hash(): try: git_hash = subprocess.check_output(["git", "describe", "--always"]).strip() git_hash = git_hash.decode() - except subprocess.CalledProcessError: + except (subprocess.CalledProcessError, FileNotFoundError): git_hash = None return git_hash @@ -85,6 +85,13 @@ class NeoXArgsParallelism(NeoXArgsTemplate): according to pipeline parallel size. """ + sequence_parallel: bool = False + """ + flag to determine whether Megatron-style Sequence Parallelism (https://arxiv.org/abs/2205.05198) + (Layernorm inputs and activations are sharded across model parallel group) will be used. Has no effect when model_parallel_size is 1. 
+ **Set by user, in contrast to neox_args.is_pipe_parallel.** + """ + @dataclass class NeoXArgsModel(NeoXArgsTemplate): @@ -109,9 +116,17 @@ class NeoXArgsModel(NeoXArgsTemplate): intermediate_size: int = None """ - Transformer intermediate size. Currently only used for "mlp_type": "llama". + Transformer intermediate size. Default = 4h + """ - If not passed, will be set to a reasonable default. + mlp_multiple_of: int = 1 + """ + force mlp size to be a multiple of this value + """ + + expansion_factor: float = None + """ + Transformer intermediate size. Default = 4 """ num_attention_heads: int = None @@ -147,9 +162,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Maximum number of position embeddings to use. This is the size of position embedding. """ - norm: Literal["layernorm", "rmsnorm", "scalenorm"] = "layernorm" + norm: Literal[ + "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm" + ] = "layernorm" """ - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". + Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm", "te_rmsnorm", "te_layernorm". """ layernorm_fusion: bool = False @@ -157,6 +174,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Use fused layer norm kernel (if `norm` is `layernorm`). """ + rmsnorm_fusion: bool = False + """ + Use fused RMS norm kernel (if `norm` is `rmsnorm`). + """ + use_qk_layernorm: bool = False """ Use QK Normalization @@ -266,10 +288,25 @@ class NeoXArgsModel(NeoXArgsTemplate): """ activation: Literal[ - "gelu", "geglu", "relu", "softsign", "swish", "mish", "silu" + "gelu", + "geglu", + "relu", + "softsign", + "swish", + "mish", + "silu", + "reglu", + "swiglu", + "bilinear", + "glu", ] = "gelu" """ - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] + Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu", "reglu", "swiglu", "bilinear", "glu"] + """ + + use_flashattn_swiglu: bool = False + """ + Use flash attention's version of swiglu """ scaled_upper_triang_masked_softmax_fusion: bool = False @@ -406,12 +443,9 @@ class NeoXArgsModel(NeoXArgsTemplate): """ If false, attn_linear (e.g. QKVO) will not have bias terms """ - - mlp_type: str = "regular" + use_bias_in_mlp: bool = True """ - Types: - regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) + If false, mlps will not have bias terms """ soft_prompt_tuning: dict = None @@ -463,6 +497,21 @@ class NeoXArgsModel(NeoXArgsTemplate): Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column) """ + dim_att: int = None + """ + Total dimension of the attention mechanism for RWKV. If not set, defaults to hidden_size. + """ + + head_size: int = None + """ + Size of each attention head for RWKV. Calculated as dim_att // num_attention_heads. + """ + + ffn_dim: int = None + """ + Dimension of the feed-forward network for RWKV. If not set, calculated based on hidden_size and expansion_factor. + """ + @dataclass class NeoXArgsOptimizer(NeoXArgsTemplate): @@ -534,7 +583,13 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate): lr_decay_iters: int = None """ - Number of iterations to decay learning rate over, If None defaults to --train-iters + Number of iterations to decay learning rate over, If None defaults to + --train-iters or the equivalent inferred valued from train_epochs. 
+ """ + + lr_decay_fraction: float = None + """ + Effective fraction of training over which to decay lr, overrides lr_decay_iters, useful when specifying train_epochs """ min_lr: float = 0.0 @@ -600,6 +655,39 @@ class NeoXArgsLogging(NeoXArgsTemplate): Write TensorBoard logs to this directory. """ + use_comet: bool = None + """Flag indicating if comet is to be used.""" + + comet_workspace: Optional[str] = None + """ + Comet workspace name, if not configured Comet Experiments will be created in the user configured default workspace. + """ + + comet_project: Optional[str] = None + """ + Comet project name, if not configured Comet Experiments will be created in the Uncategorized Experiments project. + """ + + comet_experiment_name: Optional[str] = None + """ + Custom name for the Comet experiment. If not provided, a random name is used. + """ + + comet_tags: Optional[list] = None + """ + List of tags to attach to the created Comet Experiment. + """ + + comet_others: Optional[dict] = None + """ + Custom metadata to attach to the created Comet Experiment. + """ + + comet_experiment: Any = None + """ + Initialized comet experiment object used to log data + """ + log_interval: int = 100 """ Interval between logging. @@ -654,8 +742,8 @@ class NeoXArgsLogging(NeoXArgsTemplate): profile: bool = False """ - Enable nsys profiling. When using this option, - nsys options should be specified in commandline. + Enable nsys and pytorch profiling. When using this option with nsys, + nsys options should be directly specified in commandline. An example nsys commandline is ``` nsys profile -s none -t nvtx,cuda -o @@ -780,11 +868,6 @@ class NeoXArgsOther(NeoXArgsTemplate): Set during training """ - save_iters: list = None - """ - Set during training - """ - global_num_gpus: int = None """ Set during launching @@ -843,9 +926,14 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to train datasets. """ - label_data_paths: list = None + train_label_data_paths: list = None + """ + List of paths to train label datasets (not shifted by 1 yet!). + """ + + train_reward_data_paths: list = None """ - List of paths to label datasets (not shifted by 1 yet!). + List of paths to train reward datasets """ test_data_paths: list = None @@ -853,11 +941,67 @@ class NeoXArgsTraining(NeoXArgsTemplate): List of paths to test datasets. """ + test_label_data_paths: list = None + """ + List of paths to test label datasets (not shifted by 1 yet!). + """ + + test_reward_data_paths: list = None + """ + List of paths to test reward datasets + """ + valid_data_paths: list = None """ List of paths to validation datasets. """ + valid_label_data_paths: list = None + """ + List of paths to validation label datasets (not shifted by 1 yet!). + """ + + valid_reward_data_paths: list = None + """ + List of paths to validation reward datasets + """ + + pos_train_data_paths: list = None + neg_train_data_paths: list = None + """ + List of paths to positive and negative training datasets. + """ + + pos_train_label_data_paths: list = None + neg_train_label_data_paths: list = None + """ + List of paths to positive and negative training label datasets (not shifted by 1 yet!). + """ + + pos_valid_data_paths: list = None + neg_valid_data_paths: list = None + """ + List of paths to positive and negative validation datasets. + """ + + pos_valid_label_data_paths: list = None + neg_valid_label_data_paths: list = None + """ + List of paths to positive and negative validation label datasets (not shifted by 1 yet!). 
+ """ + + pos_test_data_paths: list = None + neg_test_data_paths: list = None + """ + List of paths to positive and negative test datasets. + """ + + pos_test_label_data_paths: list = None + neg_test_label_data_paths: list = None + """ + List of paths to positive and negative test label datasets (not shifted by 1 yet!). + """ + train_data_weights: list = None """ List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. @@ -907,6 +1051,73 @@ class NeoXArgsTraining(NeoXArgsTemplate): Implementation of indexed datasets, can be one of "infer", "cached", or "mmap" """ + pack_impl: Literal["packed", "pack_until_overflow", "unpacked"] = "packed" + """ + Packing implementation, can be one of "packed", "pack_until_overflow", or "unpacked". + + warning: pack_until_overflow is very naive and will likely have issues with pretraining scale datasets + """ + + dataset_impl: Literal["gpt2", "pairwise"] = "gpt2" + """ + Dataset implementation, can be one of "gpt2" or "pairwise" + """ + + train_impl: Literal["normal", "dpo", "rm", "kto"] = "normal" + """ + Training implementation, can be one of "normal", "dpo", "kto", or "rm" + """ + + dpo_fp32: bool = True + """ + Whether to cast logits to fp32 for DPO loss calculation. + """ + + dpo_reference_free: bool = False + """ + Whether to use reference-free DPO. + """ + + dpo_beta: float = 0.1 + """ + Beta value for DPO + """ + + kto_fp32: bool = True + """ + Whether to cast logits to fp32 for KTO loss calculation. + """ + + kto_desirable_weight: float = 1.0 + """ + Weight for desirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + """ + + kto_undesirable_weight: float = 1.0 + """ + Weight for undesirable loss in KTO. Might help if you have unbalanced desirable and undesirable classes. + """ + + z_loss: float = 0.0 + """ + Z-loss parameter, only implemented for RM training currently. + https://arxiv.org/pdf/2204.02311 + https://arxiv.org/pdf/2309.10305 + """ + + kto_beta: float = 0.1 + """ + Beta value for KTO + """ + + allow_chopped: bool = True + """ + WARNING: if your packing impl is packed, this is ignored. + + Allow chopped samples in the dataset. + (e.g if your sequence length is 1024 and you have a sample of length 1026, it will be chopped to 1024) + """ + mmap_warmup: bool = False """ Warm up mmap files. @@ -948,7 +1159,7 @@ class NeoXArgsTraining(NeoXArgsTemplate): while "log" implies that the number of steps between each checkpoint will be multiplied by `checkpoint-factor` at each step, starting from step 1. """ - checkpoint_factor: int = None + checkpoint_factor: Union[int, float] = None """ Acts as a multiplier on either the "log" or "linear" checkpoint spacing. @@ -1002,6 +1213,12 @@ class NeoXArgsTraining(NeoXArgsTemplate): Number of iterations to run for training. """ + train_epochs: int = None + """ + Number of epochs to run for training. Do not specify both train_epochs and train_iters. + Not currently compatible with data reweighing, pairwise datasets, and packing other than 'packed' + """ + eval_iters: int = 100 """ Number of iterations to run for evaluation validation/test for. @@ -1195,7 +1412,12 @@ class NeoXArgsTextgen(NeoXArgsTemplate): text_gen_type: str = None """ How to generate text/sample the model. 
- Options: `unconditional`, `input-file`, `interactive` + Options: `unconditional`, `input-file`, `interactive`, `precompute` + """ + + precompute_model_name: str = None + """ + Model name to use for saving precomputed logprobs """ temperature: float = 0.0 diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 7b7a390ab..293cbaabc 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -19,16 +19,20 @@ import copy import json +import math import os import time from typing import List, Union +import numpy as np import torch import torch.nn.functional as F from megatron import print_rank_0 from megatron import mpu from megatron.utils import get_ltor_masks_and_position_ids, is_mp_rank_0 +from megatron.data.indexed_dataset import make_builder, make_dataset +from megatron.mpu.mappings import gather_from_model_parallel_region def get_batch(neox_args, context_tokens: torch.Tensor): @@ -52,7 +56,9 @@ def get_batch(neox_args, context_tokens: torch.Tensor): return tokens, attention_mask, position_ids -def pad_batch(context_tokens: List[List[int]], pad_id: int, pad_len: int): +def pad_batch( + context_tokens: List[List[int]], pad_id: int, pad_len: int, truncate: bool = False +): """ pads context lengths in context_tokens with pad_id to equal neox_args.seq_length, and returns the padded batch and the new lengths. @@ -60,17 +66,21 @@ def pad_batch(context_tokens: List[List[int]], pad_id: int, pad_len: int): context_tokens: list of lists of tokens pad_id: int, integer to use as padding token pad_len: int, context length to be padded; all batch items will be padded to the same length + truncate: bool, if True, truncate context tokens to pad_len if they are longer than pad_len returns: tuple of padded context tokens and a list of unpadded token count """ context_lengths = [] - for tokens in context_tokens: + for i, tokens in enumerate(context_tokens): context_length = len(tokens) if context_length < pad_len: tokens.extend([pad_id] * (pad_len - context_length)) elif context_length > pad_len: - raise ValueError("context_length is bigger than to be padded length") + if not truncate: + raise ValueError("context_length is bigger than to be padded length") + context_tokens[i] = tokens[:pad_len] + context_length = pad_len context_lengths.append(context_length) return context_tokens, context_lengths @@ -82,6 +92,8 @@ def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")): This function has been mostly taken from huggingface conversational ai code at https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313 + When both top_k and top_p are specified, tokens are first filtered according to top_k, renormalized, and then filtered according to top_p. + logits: torch.Tensor -> logits of megatron model. top_k: integer -> integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. top_p: float -> Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. 
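For readers skimming the diff, the top-k/top-p ordering documented in the `filter_logits` hunk above can be summarized with a small standalone sketch. This is an illustration only, not the repository's `filter_logits` implementation; the function name and the `[batch, vocab]` logits shape are assumptions.

```python
import torch
import torch.nn.functional as F

def filter_topk_then_topp(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
    """Illustrative sketch: apply top-k first, then nucleus (top-p) filtering."""
    if top_k > 0:
        # Mask everything below the k-th largest logit.
        kth_value = torch.topk(logits, top_k, dim=-1).values[..., -1, None]
        logits = logits.masked_fill(logits < kth_value, filter_value)
    if top_p > 0.0:
        # Softmax over the already-filtered logits implicitly renormalizes the
        # surviving tokens before the cumulative-probability cutoff is applied.
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        to_remove = cum_probs > top_p
        # Shift right so the first token that crosses the threshold is kept.
        to_remove[..., 1:] = to_remove[..., :-1].clone()
        to_remove[..., 0] = False
        mask = torch.zeros_like(to_remove).scatter(-1, sorted_idx, to_remove)
        logits = logits.masked_fill(mask, filter_value)
    return logits
```

With `top_k=0` only the nucleus cut applies, and with `top_p=0.0` only the top-k cut applies, matching the behavior described in the docstring.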
@@ -807,3 +819,182 @@ def generate_samples_interactive( print_rank_0("Generated Text: " + generated_text) if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: _ = input("\n") + + +def get_logp(logits, labels, force_fp32=False): + if force_fp32: + logits = logits.float() + logp = logits.log_softmax(dim=-1) + return torch.gather(logp, dim=2, index=labels.unsqueeze(2)).squeeze(2) + + +def precompute_logits(neox_args, model): + """ + Precomputes logprobs from training/testing/validation datasets + + Saves it to the same directory as the dataset with the model name appended to it + + neox_args: NeoXArgs. + model: a Megatron model + + """ + if neox_args.precompute_model_name is None: + mdl_name = str(hash(neox_args.load)) + else: + mdl_name = neox_args.precompute_model_name + print_rank_0("Precomputing logprobs...") + model.eval() + data_paths = list() + if neox_args.train_data_paths is not None: + for path in neox_args.train_data_paths: + data_paths.append(path) + for path in neox_args.test_data_paths: + data_paths.append(path) + for path in neox_args.valid_data_paths: + data_paths.append(path) + elif neox_args.pos_train_data_paths is not None: + # Pairwise data... + for path in neox_args.pos_train_data_paths: + data_paths.append(path) + for path in neox_args.neg_train_data_paths: + data_paths.append(path) + for path in neox_args.pos_valid_data_paths: + data_paths.append(path) + for path in neox_args.neg_valid_data_paths: + data_paths.append(path) + for path in neox_args.pos_test_data_paths: + data_paths.append(path) + for path in neox_args.neg_test_data_paths: + data_paths.append(path) + for path in data_paths: + print_rank_0(f"Precomputing logits for {path}") + # Add hash to path... + out_path = path + f"_{mdl_name}" + if os.path.exists(out_path + ".idx"): + continue + dataset = make_dataset(path, neox_args.data_impl, not neox_args.mmap_warmup) + if is_mp_rank_0(): + out_dataset = make_builder(out_path + ".bin", neox_args.data_impl) + out_dataset._dtype = np.float32 + i = 0 + + # TODO: Not sure why this requires a multiple of 8? Investigate later. 
+ while i < int(math.ceil(len(dataset) / 8.0) * 8): + start = time.time() + model.module.clear_cache() # clear kv cache between batches + if is_mp_rank_0(): + offset = ( + mpu.get_data_parallel_rank() + * neox_args.train_micro_batch_size_per_gpu + ) + context_tokens = [ + [int(x) for x in dataset.get(j % len(dataset)).tolist()] + for j in range( + i + offset, + i + (neox_args.train_micro_batch_size_per_gpu + offset), + ) + ] + # grab microbatch + # pad batch in order to allow conversion to tensor + context_tokens, context_lengths = pad_batch( + copy.deepcopy(context_tokens), + pad_id=0, + pad_len=neox_args.seq_length + 1, + truncate=True, + ) + # print(context_tokens) + label_tokens = [tokens[1:] for tokens in context_tokens] + context_tokens = [tokens[:-1] for tokens in context_tokens] + else: + context_tokens = [ + [0 for _ in range(neox_args.seq_length)] + for _ in range(neox_args.batch_size) + ] + label_tokens = [ + [0 for _ in range(neox_args.seq_length)] + for _ in range(neox_args.batch_size) + ] + context_lengths = [0 for _ in range(neox_args.batch_size)] + i += ( + neox_args.train_micro_batch_size_per_gpu + * mpu.get_data_parallel_world_size() + ) + # print(context_tokens) + # convert to tensor and broadcast + context_tokens = torch.cuda.LongTensor(context_tokens) + label_tokens = torch.cuda.LongTensor(label_tokens) + # Make sure context tokens + start tokens are the same across all ranks + token_generation_start_index = torch.cuda.LongTensor(context_lengths) + torch.distributed.broadcast( + context_tokens, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + torch.distributed.broadcast( + token_generation_start_index, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + torch.distributed.broadcast( + label_tokens, + mpu.get_model_parallel_src_rank(), + group=mpu.get_model_parallel_group(), + ) + # context_tokens = context_tokens[:, :chop_len].contiguous() + # label_tokens = label_tokens[:, :chop_len].contiguous() + with torch.no_grad(): + # get attention mask / position ids + context_tokens, attention_mask, position_ids = get_batch( + neox_args, context_tokens + ) + model_inputs = ( + context_tokens, + position_ids, + attention_mask, + ) + maybe_tuple = forward_model( + model, model_inputs, neox_args.is_pipe_parallel + ) + if isinstance(maybe_tuple, tuple): + logits, _ = maybe_tuple + else: + logits = maybe_tuple + if logits is not None: # if pipe parallel, not all ranks return logits + logits = gather_from_model_parallel_region(logits) + logp = get_logp(logits, label_tokens, True).squeeze() + if neox_args.is_pipe_parallel: + # broadcast generated tokens to pipe parallel group + src_rank = model.grid.stage_to_global(model.num_stages - 1) + logp = ( + logp + if logits is not None + else torch.zeros( + neox_args.batch_size, dtype=torch.float32 + ).cuda() + ) + torch.distributed.broadcast( + tensor=logp, + src=src_rank, + group=mpu.get_pipe_parallel_group(), + ) + logp = logp.squeeze() + logp_list = [ + torch.zeros_like(logp) + for _ in range(mpu.get_data_parallel_world_size()) + ] + torch.distributed.all_gather( + logp_list, logp, group=mpu.get_data_parallel_group() + ) + logp = torch.cat(logp_list, dim=0).cpu().numpy() + if (mpu.get_model_parallel_rank() == 0) and ( + mpu.get_data_parallel_rank() == 0 + ): + for j in range(logp.shape[0]): + out_dataset.add_item(logp[j]) + out_dataset.end_document() + print_rank_0(f"Processed {i} / {len(dataset)} in {time.time() - start}") + if is_mp_rank_0(): + out_dataset.finalize( + 
out_path + ".idx", + ) + torch.distributed.barrier() diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 348c7cefe..d39e18243 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -32,6 +32,10 @@ def build_tokenizer(args): if args.rank == 0: print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True) + assert ( + args.tokenizer_type is not None + ), "tokenizer_type must be specified in the .yml config" + # Select and instantiate the tokenizer. if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower(): assert args.vocab_file is not None diff --git a/megatron/training.py b/megatron/training.py index 6a67d36f8..17dcce2c7 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -21,12 +21,14 @@ """Pretrain utilities.""" from datetime import datetime from functools import partial +from collections import defaultdict import math import sys from contextlib import nullcontext import torch +import torch.nn.functional as F import deepspeed from deepspeed.runtime.data_pipeline.curriculum_scheduler import CurriculumScheduler import numpy as np @@ -43,9 +45,14 @@ GPT2ModelPipe, SoftEmbedding, get_params_for_weight_decay_optimization, + mark_norms_for_sequence_parallel_grad_sync, ) +from megatron.mpu.mappings import gather_from_model_parallel_region from megatron.checkpointing import load_checkpoint, save_checkpoint -from megatron.data.data_utils import build_train_valid_test_data_iterators +from megatron.data.data_utils import ( + build_train_valid_test_data_loaders, + shift_and_wrap_data_loaders, +) from megatron.initialize import initialize_megatron from megatron.learning_rates import AnnealingLR from megatron.logging import tb_wandb_log, training_log @@ -56,6 +63,7 @@ CharCounter, ) from megatron.model.gpt2_model import cross_entropy +from megatron.mpu import vocab_parallel_cross_entropy from pickle import dump import os @@ -82,7 +90,7 @@ def save_base_shapes(neox_args, base_shapes, use_cache): base_model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, - parallel_output=True, + parallel_output=True if neox_args.train_impl != "rm" else False, topology=mpu.get_topology(), use_cache=use_cache, ) @@ -106,7 +114,7 @@ def save_base_shapes(neox_args, base_shapes, use_cache): delta_model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, - parallel_output=True, + parallel_output=True if neox_args.train_impl != "rm" else False, topology=mpu.get_topology(), use_cache=use_cache, ) @@ -136,7 +144,7 @@ def gen(): old_hidden_size = neox_args.hidden_size neox_args.hidden_size = hidden_size - model, optimizer, _ = setup_model_and_optimizer( + model, optimizer, _, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=False ) @@ -168,14 +176,54 @@ def gen(): sys.exit(1) +def update_iterations(neox_args, data_loaders): + """ + Compute the number of train iterations if not specified and num_epochs, updates the neox_args object. + Note that if len(train_dataloader) % gradient_accumulation_steps != 0, this will configure neox + to do as many iterations as possible while ensuring that each example is seen *at most* train_epochs + times. 
+ """ + if (not neox_args.do_train) or (neox_args.train_iters is not None): + pass + elif neox_args.train_iters is None and neox_args.train_epochs is None: + print_rank_0( + "ERROR:Failed to specify either train_epochs or train_iters in config file" + ) + else: + global_rank = torch.distributed.get_rank() + + if global_rank == 0: + train_dataloader = data_loaders["train"] + train_epochs = neox_args.train_epochs + gradient_accumulation_steps = neox_args.gradient_accumulation_steps + + train_dataloader_len = len(train_dataloader) + train_iterations = ( + train_dataloader_len * train_epochs + ) // gradient_accumulation_steps + + train_iters_tensor = torch.cuda.LongTensor([train_iterations]) + else: + train_iters_tensor = torch.cuda.LongTensor([0]) + + torch.distributed.broadcast(train_iters_tensor, src=0) + + neox_args.train_iters = train_iters_tensor[0].item() + + print_rank_0( + f"Training for a total of {neox_args.train_iters} iterations, corresponding to {neox_args.train_epochs} epochs." + ) + + def pretrain(neox_args): """Main training program. This function will run the following in the order provided: 1) initialize Megatron. - 2) setup model, optimizer and lr schedule - 3) call train_val_test_data_provider to get train/val/test datasets. - 4) train the model. + 2) get train/val/test datasets. + 3) setup model, optimizer and lr schedule. + 4) configure data loading + 5) train the model. Arguments: neox_args: an instance of NeoXArgs containing the configuration for pretrain @@ -184,26 +232,34 @@ def pretrain(neox_args): # setup logging and timers init_wandb(neox_args=neox_args) timers = Timers( - use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer + use_wandb=neox_args.use_wandb, + tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) # Initialize and get arguments, timers, and Tensorboard writer. initialize_megatron(neox_args=neox_args) + # Create data loaders + timers("train/valid/test data loaders").start() + data_loaders = build_train_valid_test_data_loaders(neox_args=neox_args) + update_iterations(neox_args=neox_args, data_loaders=data_loaders) + timers("train/valid/test data loaders").stop() + # Model, optimizer, and learning rate. timers("model and optimizer").start() - model, optimizer, lr_scheduler = setup_model_and_optimizer( + model, optimizer, lr_scheduler, reference_model = setup_model_and_optimizer( neox_args=neox_args, use_cache=False, iteration=neox_args.iteration ) timers("model and optimizer").stop() - # Data stuff. + # Make and configure iterators timers("train/valid/test data iterators").start() ( train_data_iterator, valid_data_iterator, test_data_iterator, - ) = build_train_valid_test_data_iterators(neox_args=neox_args) + ) = shift_and_wrap_data_loaders(neox_args=neox_args, data_loaders=data_loaders) timers("train/valid/test data iterators").stop() if neox_args.use_mup and neox_args.coord_check: @@ -211,12 +267,23 @@ def pretrain(neox_args): # Print setup timing. 
print_rank_0("done with setups ...") - timers.log(["model and optimizer", "train/valid/test data iterators"]) + timers.log( + [ + "train/valid/test data loaders", + "model and optimizer", + "train/valid/test data iterators", + ] + ) print_rank_0("training ...") iteration = neox_args.iteration # edge case: save step 0 checkpoint if requested and we're starting from step 0 - if neox_args.save and 0 in neox_args.save_iters and iteration == 0: + if ( + neox_args.save + and neox_args.extra_save_iters + and 0 in neox_args.extra_save_iters + and iteration == 0 + ): save_checkpoint( neox_args=neox_args, iteration=iteration, @@ -230,6 +297,7 @@ def pretrain(neox_args): neox_args=neox_args, timers=timers, model=model, + reference_model=reference_model, optimizer=optimizer, lr_scheduler=lr_scheduler, train_data_iterator=train_data_iterator, @@ -247,6 +315,7 @@ def pretrain(neox_args): iteration=iteration, verbose=False, timers=timers, + reference_model=reference_model, ) if neox_args.save and iteration != 0: @@ -271,23 +340,29 @@ def pretrain(neox_args): verbose=True, timers=timers, chart_name="test", + reference_model=reference_model, ) -def _get_batch(neox_args, tokenizer, keys, data, datatype): +def _get_batch(neox_args, tokenizer, keys, data, datatype, label_mask_zero=False): """Support function for get_batch / get_batch pipe (to avoid code repetition)""" data_b = mpu.broadcast_data(keys, data, datatype) - + token_key = keys[0] + label_key = keys[1] if len(keys) > 1 else None # Unpack. - tokens_ = data_b["text"].long() - if "label" in data_b: + tokens_ = data_b[token_key].long() + if label_key in data_b: + label_mask = (data_b[label_key].long() >= 0)[:, 1:].contiguous() labels = torch.where( - data_b["label"].long() >= 0, - data_b["label"].long(), - torch.zeros_like(data_b["label"].long()), + data_b[label_key].long() >= 0, + data_b[label_key].long(), + torch.zeros_like(data_b[label_key].long()), )[:, 1:].contiguous() else: + label_mask = (tokens_.long() >= 0)[:, 1:].contiguous() labels = tokens_[:, 1:].contiguous() + if label_mask_zero: + labels = labels * label_mask tokens = tokens_[:, :-1].contiguous() # Get the masks and position ids. @@ -297,9 +372,9 @@ def _get_batch(neox_args, tokenizer, keys, data, datatype): eod_mask_loss=neox_args.eod_mask_loss, sliding_window_width=neox_args.sliding_window_width, ) - # If `label` is present, any token < 0 (e.g., -100, the default for torch) skips the loss computation - if "label" in data_b: - loss_mask = (data_b["label"][:, 1:] >= 0).to(loss_mask.dtype) + + # combine loss masks from get_ltor_masks_and_position_ids with loss masks from data + loss_mask = label_mask.to(loss_mask.dtype) * loss_mask return tokens, labels, loss_mask, attention_mask, position_ids @@ -307,7 +382,14 @@ def get_batch(neox_args, data_iterator): """Generate a batch""" # Items and their type. - keys = ["text", "label"] if neox_args.label_data_paths else ["text"] + if neox_args.train_impl in ["normal", "kto"]: + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] + elif neox_args.train_impl in ["dpo", "rm"]: + keys = ( + [["pos", "pos_label"], ["neg", "neg_label"]] + if neox_args.pos_train_label_data_paths + else [["pos"], ["neg"]] + ) datatype = torch.int64 # Broadcast data. 
@@ -315,19 +397,80 @@ def get_batch(neox_args, data_iterator): data = next(data_iterator) else: data = None - return _get_batch( - neox_args=neox_args, - tokenizer=neox_args.tokenizer, - keys=keys, - data=data, - datatype=datatype, - ) + if neox_args.train_impl == "normal": + return _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys, + data=data, + datatype=datatype, + ) + elif neox_args.train_impl == "kto": + assert ( + neox_args.train_micro_batch_size_per_gpu > 1 + ), "For KTO training, the train_micro_batch_size_per_gpu must be greater than 1." + tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys, + data=data, + datatype=datatype, + ) + # Remove the last token from the reward since we predict the next token, so + # the reward of token t will be based on the label of token t+1 + rw_data = mpu.broadcast_data(["reward"], data, torch.float)["reward"][ + :, :-1 + ].contiguous() + ref_data = ( + mpu.broadcast_data(["ref"], data, torch.float)["ref"][:, :-1].contiguous() + if neox_args.precompute_model_name + else None + ) + return tup + (rw_data, ref_data) + elif neox_args.train_impl in ["dpo", "rm"]: + pos_tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys[0], + data=data, + datatype=datatype, + label_mask_zero=True, + ) + neg_tup = _get_batch( + neox_args=neox_args, + tokenizer=neox_args.tokenizer, + keys=keys[1], + data=data, + datatype=datatype, + label_mask_zero=True, + ) + if neox_args.precompute_model_name: + ref_data = mpu.broadcast_data(["pos_ref", "neg_ref"], data, torch.float) + else: + ref_data = {"pos_ref": None} + return [ + torch.cat((pos_item, neg_item), dim=0) + for pos_item, neg_item in zip(pos_tup, neg_tup) + ] + [ + torch.cat((ref_data["pos_ref"], ref_data["neg_ref"]), dim=0)[ + :, :-1 + ].contiguous() + if ref_data["pos_ref"] is not None + else None + ] def get_batch_pipe(data, neox_args, curr_scheduler=None): """A modification of get_batch() to work with the latest batch instead of an iterator.""" + + assert neox_args.train_impl not in [ + "kto", + "dpo", + "rm", + ], "Pipeline parallel is currently unsupported when using any of kto, dpo, rm. Set pipe_parallel_size to 0" + # Items and their type. - keys = ["text", "label"] if neox_args.label_data_paths else ["text"] + keys = ["text", "label"] if neox_args.train_label_data_paths else ["text"] datatype = torch.int64 tokens, labels, loss_mask, attention_mask, position_ids = _get_batch( @@ -366,20 +509,41 @@ def get_batch_sequential(forward_input, neox_args): def forward_step( - data_iterator, model, neox_args, timers, return_logits=False, is_train=False + data_iterator, + model, + neox_args, + timers, + return_logits=False, + is_train=False, + reference_model=None, ): """Forward step.""" if neox_args.is_pipe_parallel: return model.eval_batch(data_iterator, return_logits=return_logits) # Get the batch.
- if neox_args.memory_profiling and neox_args.it: + if neox_args.memory_profiling and neox_args.iteration: torch.cuda.nvtx.range_push(f"Get batch") if timers is not None: timers("batch generator").start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - neox_args=neox_args, data_iterator=data_iterator - ) + if neox_args.train_impl == "normal": + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + neox_args=neox_args, data_iterator=data_iterator + ) + elif neox_args.train_impl == "kto": + ( + tokens, + labels, + loss_mask, + attention_mask, + position_ids, + rewards, + ref_logp, + ) = get_batch(neox_args=neox_args, data_iterator=data_iterator) + if neox_args.train_impl in ["dpo", "rm"]: + tokens, labels, loss_mask, attention_mask, position_ids, ref_logp = get_batch( + neox_args=neox_args, data_iterator=data_iterator + ) if timers is not None: timers("batch generator").stop() @@ -388,22 +552,220 @@ def forward_step( if neox_args.memory_profiling: torch.cuda.nvtx.range_push(f"Forward pass") - outputs = model((tokens, position_ids, attention_mask), neox_args=neox_args) - if ( - is_train - and neox_args.curriculum_learning - and neox_args.curriculum_seqlen < neox_args.seq_length - ): - loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous() - labels = labels[:, : neox_args.curriculum_seqlen].contiguous() - loss = cross_entropy( - outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy - ) + metrics = {} + if neox_args.train_impl == "normal": + outputs = model((tokens, position_ids, attention_mask), neox_args=neox_args) + if ( + is_train + and neox_args.curriculum_learning + and neox_args.curriculum_seqlen < neox_args.seq_length + ): + loss_mask = loss_mask[:, : neox_args.curriculum_seqlen].contiguous() + labels = labels[:, : neox_args.curriculum_seqlen].contiguous() + loss = cross_entropy( + outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy + ) + elif neox_args.train_impl == "rm": + maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args) + if type(maybe_tuple) is tuple: + outputs, _ = maybe_tuple + else: + outputs = maybe_tuple + pos, neg = torch.chunk(outputs, 2, 0) + pos_loss_mask, neg_loss_mask = torch.chunk(loss_mask, 2, 0) + # We assume that each pos, neg pair occur in the same order + # e.g. second nonzero pos is the corresponding second nonzero neg + # and that there are also an equal number of pos and neg in each sequence. + pos_indx = pos_loss_mask.nonzero() + neg_indx = neg_loss_mask.nonzero() + # indx[:, 0] is the batch index, indx[:, 1] is the token index, we only care about the token index. + pos_indx = pos_indx[:, 1].unsqueeze(1) + neg_indx = neg_indx[:, 1].unsqueeze(1) + pos = torch.gather(pos.squeeze(), dim=1, index=pos_indx) + neg = torch.gather(neg.squeeze(), dim=1, index=neg_indx) + with torch.no_grad(): + metrics["pos_values"] = pos.clone().detach().mean() + metrics["neg_values"] = neg.clone().detach().mean() + metrics["margin"] = (pos - neg).clone().detach().mean() + metrics["accuracy"] = ((pos - neg) > 0).clone().detach().float().mean() + loss = (-F.logsigmoid(pos - neg).mean()) + ( + (neox_args.z_loss * (pos**2 + neg**2)).mean() + ) + elif neox_args.train_impl == "dpo": + # Based on https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py#L90 + with torch.inference_mode(): + # So we can gather token logps... 
+ token_logp_labels = labels.clone() + pos_loss_mask, neg_loss_mask = torch.chunk(loss_mask, 2, 0) + if neox_args.dpo_reference_free: + ref_pos = 0 + ref_neg = 0 + elif ref_logp is None: + ref_maybe_tuple = reference_model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(ref_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + ref_outputs, _ = ref_maybe_tuple + else: + ref_outputs = ref_maybe_tuple + ref_pos, ref_neg = get_pos_neg_logp( + ref_outputs, token_logp_labels, neox_args.dpo_fp32 + ) + else: + ref_pos, ref_neg = torch.chunk(ref_logp, 2, 0) + ref_pos = (ref_pos * pos_loss_mask).sum(-1) + ref_neg = (ref_neg * neg_loss_mask).sum(-1) + chosen_maybe_tuple = model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(chosen_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + chosen_outputs, _ = chosen_maybe_tuple + else: + chosen_outputs = chosen_maybe_tuple + chosen_pos, chosen_neg = get_pos_neg_logp( + chosen_outputs, token_logp_labels, neox_args.dpo_fp32 + ) + chosen_pos = (chosen_pos * pos_loss_mask).sum(-1) + chosen_neg = (chosen_neg * neg_loss_mask).sum(-1) + with torch.no_grad(): + # Collect metrics... + if not neox_args.dpo_reference_free: + metrics["ref_neg"] = ref_neg.clone().detach().mean() + metrics["ref_pos"] = ref_pos.clone().detach().mean() + metrics["chosen_neg"] = chosen_neg.clone().detach().mean() + metrics["chosen_pos"] = chosen_pos.clone().detach().mean() + if not neox_args.dpo_reference_free: + chosen_rewards = neox_args.dpo_beta * ( + chosen_pos.clone().detach() - ref_pos.clone().detach() + ) + rejected_rewards = neox_args.dpo_beta * ( + chosen_neg.clone().detach() - ref_neg.clone().detach() + ) + metrics["chosen_rewards"] = chosen_rewards.mean() + metrics["rejected_rewards"] = rejected_rewards.mean() + reward_acc = (chosen_rewards > rejected_rewards).float() + metrics["reward_acc"] = reward_acc.mean() + metrics["margins"] = (chosen_rewards - rejected_rewards).mean() + pi_logrations = chosen_pos - chosen_neg + ref_logrations = ref_pos - ref_neg + logits = pi_logrations - ref_logrations + loss = -F.logsigmoid(neox_args.dpo_beta * logits).mean() + elif neox_args.train_impl == "kto": + # Based on https://github.com/huggingface/trl/blob/main/trl/trainer/kto_trainer.py + # Except we don't have an extra input for KL logp, we just split the batch in half + with torch.no_grad(): + # So we can gather token logps... + token_logp_labels = labels.clone() + token_logp_labels[token_logp_labels == -100] = 0 + if ref_logp is None: + # Did not precompute logits.... + ref_maybe_tuple = reference_model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(ref_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? + ref_outputs, _ = ref_maybe_tuple + else: + ref_outputs = ref_maybe_tuple + # gather across tensor parallel group + ref_outputs = gather_from_model_parallel_region(ref_outputs) + + ref_logp = get_logp(ref_outputs, token_logp_labels, neox_args.kto_fp32) + else: + print(f"REF LOGP: {ref_logp.clone().detach().mean()}") + ref_logp = ref_logp * loss_mask + scaling = (rewards.sum(-1) > 0.001).float() * neox_args.kto_desirable_weight + scaling += ( + rewards.sum(-1) < -0.001 + ).float() * neox_args.kto_undesirable_weight + pos_mask = (rewards > 0.001).float() + neg_mask = (rewards < -0.001).float() + chosen_maybe_tuple = model( + (tokens, position_ids, attention_mask), neox_args=neox_args + ) + if type(chosen_maybe_tuple) is tuple: + # We should ignore MoE losses yeah? 
+ chosen_outputs, _ = chosen_maybe_tuple + else: + chosen_outputs = chosen_maybe_tuple + chosen_outputs = gather_from_model_parallel_region(chosen_outputs) + chosen_logp = get_logp(chosen_outputs, token_logp_labels, neox_args.kto_fp32) + chosen_logp = chosen_logp * loss_mask + with torch.no_grad(): + # Collect metrics... + metrics["ref_logp"] = ref_logp.clone().detach().sum(-1).mean() + metrics["policy_logp"] = chosen_logp.clone().detach().sum(-1).mean() + metrics["pos_ref_logp"] = ( + (ref_logp * pos_mask).clone().detach().sum(-1).mean() + ) + metrics["neg_ref_logp"] = ( + (ref_logp * neg_mask).clone().detach().sum(-1).mean() + ) + metrics["pos_policy_logp"] = ( + (chosen_logp * pos_mask).clone().detach().sum(-1).mean() + ) + metrics["neg_policy_logp"] = ( + (chosen_logp * neg_mask).clone().detach().sum(-1).mean() + ) + metrics["kl"] = ( + chosen_logp.clone().detach() - ref_logp.clone().detach() + ).sum() / loss_mask.sum() + policy_rewards = ( + neox_args.kto_beta + * rewards + * (chosen_logp.clone().detach() - ref_logp.clone().detach()) + ) + reward_acc = (policy_rewards.sum(-1) > 0.0).float() + metrics["reward_acc"] = reward_acc.mean() + metrics["policy_rewards"] = policy_rewards.sum() + print(metrics) + pol_logp1, pol_logp2 = torch.chunk(chosen_logp, 2, 0) + ref_logp1, ref_logp2 = torch.chunk(ref_logp, 2, 0) + reward1, reward2 = torch.chunk(rewards, 2, 0) + scaling1, scaling2 = torch.chunk(scaling, 2, 0) + kl1 = torch.clamp((pol_logp1 - ref_logp1).sum(-1), min=0).mean() + kl2 = torch.clamp((pol_logp2 - ref_logp2).sum(-1), min=0).mean() + log_ratio1 = pol_logp1 - ref_logp1 + log_ratio2 = pol_logp2 - ref_logp2 + + # TODO: Add pack_until_overflow sequence support + loss = ( + 0.5 + * scaling1.mean(-1) + * ( + 1 + - F.sigmoid( + ( + neox_args.kto_beta + * reward1.mean(-1) + * (log_ratio1.sum(-1) - kl2.clone().detach()) + ) + ) + ) + ) + ( + 0.5 + * scaling2.mean(-1) + * ( + 1 + - F.sigmoid( + ( + neox_args.kto_beta + * reward2.mean(-1) + * (log_ratio2.sum(-1) - kl1.clone().detach()) + ) + ) + ) + ) + # print(loss.shape) + loss = loss.mean() + # print(loss.shape) if neox_args.memory_profiling: torch.cuda.nvtx.range_pop() if return_logits: - return loss, outputs - return loss + return loss, outputs, metrics + return loss, metrics def get_model(neox_args, use_cache=False): @@ -417,13 +779,30 @@ def get_model(neox_args, use_cache=False): old_use_mup = neox_args.use_mup neox_args.use_mup = False + if neox_args.zero_stage in [2, 3]: + if neox_args.pipe_parallel_size == 1: + print_rank_0( + "ZeRO stage 2/3 and the PipelineModule are incompatible, please set 'pipe_parallel_size' to 0 instead" + ) + exit() + if neox_args.pipe_parallel_size > 1: + print_rank_0( + "ZeRO stage 2/3 and pipeline paralleism are not supported simultaneously" + ) + exit() + if neox_args.model_parallel_size > 1: + print_rank_0( + "ZeRO stage 2/3 and model paralleism are not currently supported simultaneously" + ) + exit() + with deepspeed.zero.Init( config_dict_or_path=neox_args.deepspeed_config ) if neox_args.zero_stage == 3 else nullcontext() as gs: model = GPT2ModelPipe( neox_args=neox_args, num_tokentypes=0, - parallel_output=True, + parallel_output=True if neox_args.train_impl != "rm" else False, topology=mpu.get_topology(), use_cache=use_cache, ) @@ -478,9 +857,14 @@ def get_model(neox_args, use_cache=False): raise ValueError("Must be using deepspeed to run neox") -def get_optimizer(model, neox_args): +def get_optimizer(model, neox_args, dummy=False): """Set up the optimizer.""" - if neox_args.no_load_optim: + if 
neox_args.no_load_optim and neox_args.deepspeed: + # Required to have something so... + dummy = True + neox_args.optimizer = {"params": {"lr": 0.0}} + neox_args.optimizer_type = "adam" + elif neox_args.no_load_optim: return None, None if neox_args.optimizer is None: @@ -504,8 +888,13 @@ def get_optimizer(model, neox_args): _param_groups = [] for param_group in param_groups: trainable_params = [p for p in param_group["params"] if p.requires_grad] + if dummy: + trainable_params = [trainable_params[0]] # just take the first one param_group["params"] = trainable_params _param_groups.append(param_group) + if dummy: + # Only need one. + break param_groups = _param_groups # If we're using mup, then the optimizer must be adam or sgd @@ -619,7 +1008,7 @@ def get_optimizer(model, neox_args): def get_learning_rate_scheduler(optimizer, neox_args): """Build the learning rate scheduler.""" - if neox_args.no_load_optim: + if (neox_args.no_load_optim) and not neox_args.deepspeed: # TODO: this should be configured as a separate arg return None if neox_args.deepspeed and neox_args.optimizer_type.lower() == "onebitadam": @@ -632,6 +1021,8 @@ def get_learning_rate_scheduler(optimizer, neox_args): # Add linear learning rate scheduler. if neox_args.lr_decay_iters is not None: num_iters = neox_args.lr_decay_iters + elif neox_args.lr_decay_fraction is not None: + num_iters = math.floor(neox_args.train_iters * neox_args.lr_decay_fraction) else: num_iters = neox_args.train_iters num_iters = max(1, num_iters) @@ -664,19 +1055,32 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): ) """Setup model and optimizer.""" + needs_reference_model = ( + (neox_args.train_impl == "dpo") + and (neox_args.precompute_model_name is None) + and (not neox_args.dpo_reference_free) + ) or ((neox_args.train_impl == "kto") and (neox_args.precompute_model_name is None)) model = get_model(neox_args=neox_args, use_cache=use_cache) + if needs_reference_model: + reference_model = get_model(neox_args=neox_args, use_cache=use_cache) + else: + reference_model = None optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer, neox_args=neox_args) - + if neox_args.deepspeed and needs_reference_model: + # Need an optimizer & lr_scheduler so make a very small one to keep deepspeed happy... 
+ ref_optimizer, ref_param_groups = get_optimizer( + model=reference_model, neox_args=neox_args, dummy=True + ) + ref_lr_scheduler = get_learning_rate_scheduler( + optimizer=ref_optimizer, neox_args=neox_args + ) + else: + ref_optimizer, ref_param_groups, ref_lr_scheduler = None, None, None if neox_args.deepspeed: print_rank_0("DeepSpeed is enabled.") - if neox_args.no_load_optim: - assert optimizer is None - _model_params = None - _lr_scheduler = None - else: - _model_params = param_groups if optimizer is None else None - _lr_scheduler = lr_scheduler + _model_params = param_groups if optimizer is None else None + _lr_scheduler = lr_scheduler model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, @@ -689,6 +1093,17 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): # config_params=neox_args.deepspeed_config, mpu=mpu if not neox_args.is_pipe_parallel else None, ) + if needs_reference_model: + reference_model, _, _, _ = deepspeed.initialize( + model=reference_model, + optimizer=ref_optimizer, + args=neox_args, + lr_scheduler=ref_lr_scheduler, + dist_init_required=False, + model_parameters=ref_param_groups, + mpu=mpu if not neox_args.is_pipe_parallel else None, + ) + mark_norms_for_sequence_parallel_grad_sync(model, neox_args) model.total_params = get_total_params(model.module) print_rank_0(f' > total params: {"{:,}".format(model.total_params)}') @@ -721,6 +1136,15 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): lr_scheduler=lr_scheduler, iteration=iteration, ) + if needs_reference_model: + _ = load_checkpoint( + neox_args=neox_args, + model=reference_model, + optimizer=ref_optimizer, + lr_scheduler=ref_lr_scheduler, + iteration=iteration, + ) + reference_model.eval() print_rank_0( f"Loading checkpoint and starting from iteration {neox_args.iteration}" ) @@ -732,7 +1156,7 @@ def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): if lr_scheduler is not None: lr_scheduler.optimizer = model.optimizer - return model, optimizer, lr_scheduler + return model, optimizer, lr_scheduler, reference_model def backward_step(neox_args, timers, optimizer, model, loss): @@ -754,7 +1178,15 @@ def backward_step(neox_args, timers, optimizer, model, loss): raise ValueError("Must be using deepspeed to run neox") -def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler): +def train_step( + neox_args, + timers, + data_iterator, + model, + optimizer, + lr_scheduler, + reference_model=None, +): """Single training step.""" # Pipeline parallelism schedules forward/backward/step @@ -762,6 +1194,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) reduced_loss = train_step_pipe( neox_args=neox_args, timers=timers, model=model, data_iterator=data_iterator ) + reduce_metrics = reduced_loss if ( neox_args.memory_profiling and neox_args.iteration >= neox_args.profile_step_start @@ -771,18 +1204,22 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) save_snapshot(neox_args) else: losses = [] + metric_dicts = defaultdict(list) for _ in range(neox_args.gradient_accumulation_steps): # Forward model for one step. 
timers("forward").start() - loss = forward_step( + loss, metric_dict = forward_step( neox_args=neox_args, timers=timers, data_iterator=data_iterator, model=model, is_train=True, + reference_model=reference_model, ) timers("forward").stop() losses.append(loss) + for key in metric_dict.keys(): + metric_dicts[key].append(metric_dict[key]) # Calculate gradients, reduce across processes, and clip. if ( neox_args.profile @@ -812,6 +1249,7 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) and neox_args.iteration <= neox_args.profile_step_stop ): torch.cuda.nvtx.range_push(f"Optimizer step") + timers("optimizer").start() if neox_args.deepspeed: model.step() @@ -831,17 +1269,19 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) and torch.distributed.get_rank() == 0 ): save_snapshot(neox_args) - reduced_loss = { - "lm_loss": reduce_losses(losses).mean() - } # reduces losses across machines for logging + # reduces metrics across machines for logging + reduce_metrics = { + key: reduce_losses(metric_dicts[key]).mean() for key in metric_dicts.keys() + } + reduce_metrics["lm_loss"] = reduce_losses(losses).mean() if neox_args.precision == "fp16" and model.optimizer.overflow: skipped_iter = 1 else: skipped_iter = 0 - collect_loss_for_unit_test(reduced_loss["lm_loss"]) - return reduced_loss, skipped_iter + collect_loss_for_unit_test(reduce_metrics["lm_loss"]) + return reduce_metrics, skipped_iter def train_step_pipe(neox_args, timers, model, data_iterator): @@ -863,10 +1303,34 @@ def train_step_pipe(neox_args, timers, model, data_iterator): return loss_dict +def is_save_iter(neox_args, iteration): + if neox_args.extra_save_iters and iteration in neox_args.extra_save_iters: + return True + + if neox_args.checkpoint_factor: + if neox_args.checkpoint_scale == "linear": + assert float( + neox_args.checkpoint_factor + ).is_integer(), "checkpoint_factor must be a whole number when using linear checkpoint_scale" + return iteration % neox_args.checkpoint_factor == 0 + elif neox_args.checkpoint_scale == "log": + # Check if iteration is a power of checkpoint_factor + assert neox_args.checkpoint_factor > 1 + power = 1 + while power < iteration + 1: + if int(power) == iteration: + return True + power *= neox_args.checkpoint_factor + return False + + return False + + def train( neox_args, timers, model, + reference_model, optimizer, lr_scheduler, train_data_iterator, @@ -922,6 +1386,7 @@ def train( model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, + reference_model=reference_model, ) if neox_args.profile and iteration == neox_args.profile_step_stop: torch.cuda.cudart().cudaProfilerStop() @@ -957,7 +1422,7 @@ def train( ) # Checkpointing - if neox_args.save and iteration in neox_args.save_iters: + if neox_args.save and is_save_iter(neox_args, iteration): save_checkpoint( neox_args=neox_args, iteration=iteration, @@ -981,6 +1446,7 @@ def train( iteration=iteration, verbose=False, timers=timers, + reference_model=reference_model, ) if neox_args.exit_interval and iteration % neox_args.exit_interval == 0: @@ -998,7 +1464,13 @@ def train( def evaluate( - neox_args, forward_step_fn, data_iterator, model, verbose=False, timers=None + neox_args, + forward_step_fn, + data_iterator, + model, + verbose=False, + timers=None, + reference_model=None, ): """Evaluation. neox_args: NeoX Arguments @@ -1012,6 +1484,7 @@ def evaluate( # Turn on evaluation mode which disables dropout. 
model.eval() losses = [] + metric_dicts = defaultdict(list) if neox_args.char_level_ppl: data_iterator = CharCounter(data_iterator, neox_args.tokenizer) @@ -1033,14 +1506,16 @@ def evaluate( else neox_args.gradient_accumulation_steps ): # Forward evaluation - loss = forward_step_fn( + loss, metric_dict = forward_step_fn( model=model, data_iterator=data_iterator, neox_args=neox_args, timers=timers, + reference_model=reference_model, ) losses.append(loss) - + for key in metric_dict.keys(): + metric_dicts[key].append(metric_dict[key]) # When contiguous memory optimizations are enabled, the buffers # allocated by the optimizations are deallocated during backward pass # in the absence of backward pass the buffers should be reset after each @@ -1050,6 +1525,8 @@ def evaluate( # reduces losses across processes for logging & run eval harness tasks eval_results = {"lm_loss": reduce_losses(losses).mean().item()} + for key in metric_dicts.keys(): + eval_results[key] = reduce_losses(metric_dicts[key]).mean().item() eval_results["lm_loss_ppl"] = math.exp(eval_results["lm_loss"]) if neox_args.char_level_ppl: @@ -1092,6 +1569,7 @@ def evaluate_and_print_results( verbose=False, timers=None, chart_name="validation", + reference_model=None, ): """Helper function to evaluate and dump results on screen.""" total_loss_dict = evaluate( @@ -1101,6 +1579,7 @@ def evaluate_and_print_results( model=model, verbose=verbose, timers=timers, + reference_model=reference_model, ) string = f" {chart_name} results at {prefix} | " for k, v in total_loss_dict.items(): @@ -1117,6 +1596,7 @@ def evaluate_and_print_results( iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) else: string += f"{k} value: {v:.6E} | " @@ -1126,6 +1606,7 @@ def evaluate_and_print_results( iteration, use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer, + comet_experiment=neox_args.comet_experiment, ) length = len(string) + 1 diff --git a/megatron/utils.py b/megatron/utils.py index 26b4439bd..507c44179 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -275,10 +275,11 @@ def elapsed(self, reset=True): class Timers: """Group of timers.""" - def __init__(self, use_wandb, tensorboard_writer): + def __init__(self, use_wandb, tensorboard_writer, comet_experiment): self.timers = {} self.use_wandb = use_wandb self.tensorboard_writer = tensorboard_writer + self.comet_experiment = comet_experiment def __call__(self, name): if name not in self.timers: @@ -300,6 +301,14 @@ def write(self, names, iteration, normalizer=1.0, reset=False): if self.use_wandb: wandb.log({f"timers/{name}": value}, step=iteration) + if self.comet_experiment: + self.comet_experiment.__internal_api__log_metric__( + f"timers/{name}", + value, + framework="gpt-neox", + step=iteration, + ) + def log(self, names, normalizer=1.0, reset=True): """Log a group of timers.""" assert normalizer > 0.0 @@ -449,7 +458,7 @@ def setup_for_inference_or_eval(use_cache=True, overwrite_values=None, input_arg initialize_megatron(neox_args) # set up model and load checkpoint. 
- model, _, _ = setup_model_and_optimizer( + model, _, _, _ = setup_model_and_optimizer( neox_args=neox_args, use_cache=use_cache, iteration=neox_args.iteration, diff --git a/post-training/README.md b/post-training/README.md new file mode 100644 index 000000000..fb7ac8eb4 --- /dev/null +++ b/post-training/README.md @@ -0,0 +1,57 @@ +# Post-Training + +Examples for running post-training with ultrafeedback data for SFT/DPO/RM training. + +```bash +python tools/ckpts/convert_hf_llama_to_neox.py --tp 4 --model meta-llama/Meta-Llama-3-8B-Instruct --model_path checkpoints/neox_converted/llama3-8b-instruct +``` + +## Data generation +First, grab the jsonl file... + +```bash +python post-training/llama_data.py +``` +## DPO data +```bash +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_test_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_test_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_dpo_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --only-last +``` + +## RM data +```bash +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_rm_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_test_filtered.jsonl --output-prefix data/pairwise/llama3_rm_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_rm_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys rejected --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_rm_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_test_filtered.jsonl --output-prefix 
data/pairwise/llama3_rm_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --for-rm +python tools/datasets/preprocess_data_with_chat_template.py --input data/pairwise/llama3_dpo_train_filtered.jsonl --output-prefix data/pairwise/llama3_rm_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys chosen --for-rm +``` + +## SFT data +```bash +python tools/datasets/preprocess_data_with_chat_template.py --input data/sft/llama3_sft_train_filtered.jsonl --output-prefix data/sft/llama3_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages +python tools/datasets/preprocess_data_with_chat_template.py --input data/sft/llama3_sft_test_filtered.jsonl --output-prefix data/sft/llama3_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages +python tools/datasets/preprocess_data_with_chat_template.py --input data/sft/llama3_sft_train_filtered.jsonl --output-prefix data/sft/llama3_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages +``` + +## KTO data +```bash +python tools/datasets/preprocess_data_with_chat_template.py --input data/kto/llama3_sft_train_filtered.jsonl --output-prefix data/kto/llama3_train --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages --reward-key reward +python tools/datasets/preprocess_data_with_chat_template.py --input data/kto/llama3_sft_test_filtered.jsonl --output-prefix data/kto/llama3_test --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages --reward-key reward +python tools/datasets/preprocess_data_with_chat_template.py --input data/kto/llama3_sft_train_filtered.jsonl --output-prefix data/kto/llama3_val --tokenizer-path checkpoints/neox_converted/llama3-8b-instruct/tokenizer --jsonl-keys messages --reward-key reward +``` + + +## Converting back to hf +```bash +# RM +python tools/ckpts/convert_neox_to_hf.py --input_dir eleuther-neox/checkpoints/rm/llama3/llama3-8b-instruct/global_step100 --output_dir checkpoints/rm/llama3_hf --config_file checkpoints/rm/llama3/llama3-8b-instruct/global_step100/configs/llama3-8b-rm.yml --precision bf16 --vocab-is-hf-tokenizer --architecture llama --pad-token-id 128002 + +# SFT/DPO +python tools/ckpts/convert_neox_to_hf.py --input_dir eleuther-neox/checkpoints//llama3/llama3-8b-instruct/global_step100 --output_dir checkpoints//llama3_hf --config_file checkpoints//llama3/llama3-8b-instruct/global_step100/configs/llama3-8b-rm.yml --precision bf16 --vocab-is-hf-tokenizer --architecture llama +``` diff --git a/post-training/configs/benchmarking/llama-13b-dpo.yml b/post-training/configs/benchmarking/llama-13b-dpo.yml new file mode 100644 index 000000000..1b97f51b4 --- /dev/null +++ b/post-training/configs/benchmarking/llama-13b-dpo.yml @@ -0,0 +1,127 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 64, + + # model settings + "num_layers": 40, + "hidden_size": 5120, + "num_attention_heads": 40, + "num_kv_heads": 40, + # following along with zephyr's max length... 
+ "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + "rmsnorm_fusion": true, + + "attention_config": [[["flash"], 40]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 13824, + "mlp_multiple_of": 13824, + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 5.0e-7, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1000000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1000000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_impl": "dpo", + "dataset_impl": "pairwise", + "dpo_reference_free": false, + "dpo_fp32": false, + "dpo_beta": 0.01, + "allow_chopped": false, + "pos_train_data_paths": [ "data/pairwise/dpo_train_chosen_document" ], + "pos_train_label_data_paths": [ "data/pairwise/dpo_train_chosen_label_document" ], + "neg_train_data_paths": [ "data/pairwise/dpo_train_rejected_document" ], + "neg_train_label_data_paths": [ "data/pairwise/dpo_train_rejected_label_document" ], + "pos_valid_data_paths": [ "data/pairwise/dpo_val_chosen_document" ], + "pos_valid_label_data_paths": [ "data/pairwise/dpo_val_chosen_label_document" ], + "neg_valid_data_paths": [ "data/pairwise/dpo_val_rejected_document" ], + "neg_valid_label_data_paths": [ "data/pairwise/dpo_val_rejected_label_document" ], + "pos_test_data_paths": [ "data/pairwise/dpo_val_chosen_document" ], + "pos_test_label_data_paths": [ "data/pairwise/dpo_val_chosen_label_document" ], + "neg_test_data_paths": [ "data/pairwise/dpo_val_rejected_document" ], + "neg_test_label_data_paths": [ "data/pairwise/dpo_val_rejected_label_document" ], + + + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 16, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 4, + + "checkpoint_activations": false, + "checkpoint_num_layers": 1, + "partition_activations": false, + "synchronize_each_layer": false, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": false, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "bf16" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 143000, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/pairwise/llama-13b-dpo", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + # use the same mistral tokenizer just for performance testing + "vocab-file": "checkpoints/neox_converted/zephyr-sft/tokenizer/tokenizer.json", + "use_wandb": true, + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", + "wandb_group": "llama-13b", + "wandb_project": "llama-13b-perf-test", +} diff --git a/post-training/configs/benchmarking/mistral-dpo.yml 
b/post-training/configs/benchmarking/mistral-dpo.yml new file mode 100644 index 000000000..3e2f1a5ac --- /dev/null +++ b/post-training/configs/benchmarking/mistral-dpo.yml @@ -0,0 +1,126 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # following along with zephyr's max length... + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + "rmsnorm_fusion": true, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 5.0e-7, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_impl": "dpo", + "dataset_impl": "pairwise", + "dpo_fp32": false, + "dpo_beta": 0.01, + "allow_chopped": false, + "pos_train_data_paths": [ "data/pairwise/dpo_train_chosen_document" ], + "pos_train_label_data_paths": [ "data/pairwise/dpo_train_chosen_label_document" ], + "neg_train_data_paths": [ "data/pairwise/dpo_train_rejected_document" ], + "neg_train_label_data_paths": [ "data/pairwise/dpo_train_rejected_label_document" ], + "pos_valid_data_paths": [ "data/pairwise/dpo_val_chosen_document" ], + "pos_valid_label_data_paths": [ "data/pairwise/dpo_val_chosen_label_document" ], + "neg_valid_data_paths": [ "data/pairwise/dpo_val_rejected_document" ], + "neg_valid_label_data_paths": [ "data/pairwise/dpo_val_rejected_label_document" ], + "pos_test_data_paths": [ "data/pairwise/dpo_val_chosen_document" ], + "pos_test_label_data_paths": [ "data/pairwise/dpo_val_chosen_label_document" ], + "neg_test_data_paths": [ "data/pairwise/dpo_val_rejected_document" ], + "neg_test_label_data_paths": [ "data/pairwise/dpo_val_rejected_label_document" ], + + + "train_micro_batch_size_per_gpu": 8, + "gradient_accumulation_steps": 8, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": false, + "checkpoint_num_layers": 32, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": false, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "bf16" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 143000, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/pairwise/zephyr-beta-recreation", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": 
"checkpoints/neox_converted/zephyr-sft", + "vocab-file": "checkpoints/neox_converted/zephyr-sft/tokenizer/tokenizer.json", + "use_wandb": true, + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", + "wandb_group": "zephyr-beta-dpo", + "wandb_project": "zephyr-beta-dpo", +} diff --git a/post-training/configs/llama3-8b-dpo.yml b/post-training/configs/llama3-8b-dpo.yml new file mode 100644 index 000000000..8a75caef0 --- /dev/null +++ b/post-training/configs/llama3-8b-dpo.yml @@ -0,0 +1,125 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # llama3 supports more than this but this is just for testing. + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + + + "optimizer": { + "type": "Adam", + "params": { + "lr": 5.0e-7, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_impl": "dpo", + "dataset_impl": "pairwise", + "dpo_fp32": true, + "dpo_beta": 0.01, + "allow_chopped": false, + "pos_train_data_paths": [ "data/pairwise/llama3_dpo_train_chosen_document" ], + "pos_train_label_data_paths": [ "data/pairwise/llama3_dpo_train_chosen_label_document" ], + "neg_train_data_paths": [ "data/pairwise/llama3_dpo_train_rejected_document" ], + "neg_train_label_data_paths": [ "data/pairwise/llama3_dpo_train_rejected_label_document" ], + "pos_valid_data_paths": [ "data/pairwise/llama3_dpo_val_chosen_document" ], + "pos_valid_label_data_paths": [ "data/pairwise/llama3_dpo_val_chosen_label_document" ], + "neg_valid_data_paths": [ "data/pairwise/llama3_dpo_val_rejected_document" ], + "neg_valid_label_data_paths": [ "data/pairwise/llama3_dpo_val_rejected_label_document" ], + "pos_test_data_paths": [ "data/pairwise/llama3_dpo_val_chosen_document" ], + "pos_test_label_data_paths": [ "data/pairwise/llama3_dpo_val_chosen_label_document" ], + "neg_test_data_paths": [ "data/pairwise/llama3_dpo_val_rejected_document" ], + "neg_test_label_data_paths": [ "data/pairwise/llama3_dpo_val_rejected_label_document" ], + + "train_micro_batch_size_per_gpu": 32, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 
477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 100, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/dpo/llama3/llama3-8b-instruct", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "checkpoints/neox_converted/llama3-8b-instruct", + "vocab-file": "checkpoints/neox_converted/llama3-8b-instruct/tokenizer/tokenizer.json", + "use_wandb": true, + "wandb_group": "llama3-8b-instruct", + "wandb_project": "ultrafeedback-dpo", + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", +} diff --git a/post-training/configs/llama3-8b-kto.yml b/post-training/configs/llama3-8b-kto.yml new file mode 100644 index 000000000..e819d37cb --- /dev/null +++ b/post-training/configs/llama3-8b-kto.yml @@ -0,0 +1,120 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # llama3 supports more than this but this is just for testing. + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.000001, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + + "train_impl": "kto", + "kto_fp32": true, + "kto_beta": 0.1, + "allow_chopped": false, + "train_label_data_paths": [ "data/kto/llama3_train_messages_label_document" ], + "test_label_data_paths": [ "data/kto/llama3_test_messages_label_document" ], + "valid_label_data_paths": [ "data/kto/llama3_train_messages_label_document" ], + "train_data_paths": [ "data/kto/llama3_train_messages_document" ], + "test_data_paths": [ "data/kto/llama3_test_messages_document" ], + "valid_data_paths": [ "data/kto/llama3_train_messages_document" ], + "train_reward_data_paths": [ "data/kto/llama3_train_messages_reward_document" ], + "test_reward_data_paths": [ "data/kto/llama3_test_messages_reward_document" ], + "valid_reward_data_paths": [ "data/kto/llama3_train_messages_reward_document" ], + + "train_micro_batch_size_per_gpu": 32, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + 
"enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 100, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/kto/llama3/llama3-8b-instruct", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "checkpoints/neox_converted/llama3-8b-instruct", + "vocab-file": "checkpoints/neox_converted/llama3-8b-instruct/tokenizer/tokenizer.json", + "use_wandb": true, + "wandb_group": "llama3-8b-instruct", + "wandb_project": "ultrafeedback-kto", + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", +} diff --git a/post-training/configs/llama3-8b-rm.yml b/post-training/configs/llama3-8b-rm.yml new file mode 100644 index 000000000..43117bf95 --- /dev/null +++ b/post-training/configs/llama3-8b-rm.yml @@ -0,0 +1,121 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # llama3 supports more than this but this is just for testing. + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 5.0e-7, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.0, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_impl": "rm", + "dataset_impl": "pairwise", + "allow_chopped": false, + "pos_train_data_paths": [ "data/pairwise/llama3_rm_train_chosen_document" ], + "pos_train_label_data_paths": [ "data/pairwise/llama3_rm_train_chosen_label_document" ], + "neg_train_data_paths": [ "data/pairwise/llama3_rm_train_rejected_document" ], + "neg_train_label_data_paths": [ "data/pairwise/llama3_rm_train_rejected_label_document" ], + "pos_valid_data_paths": [ "data/pairwise/llama3_rm_val_chosen_document" ], + "pos_valid_label_data_paths": [ "data/pairwise/llama3_rm_val_chosen_label_document" ], + "neg_valid_data_paths": [ "data/pairwise/llama3_rm_val_rejected_document" ], + "neg_valid_label_data_paths": [ "data/pairwise/llama3_rm_val_rejected_label_document" ], + "pos_test_data_paths": [ "data/pairwise/llama3_rm_val_chosen_document" ], + "pos_test_label_data_paths": [ "data/pairwise/llama3_rm_val_chosen_label_document" ], + "neg_test_data_paths": [ "data/pairwise/llama3_rm_val_rejected_document" ], + "neg_test_label_data_paths": [ "data/pairwise/llama3_rm_val_rejected_label_document" ], + + "train_micro_batch_size_per_gpu": 32, + 
"gradient_accumulation_steps": 2, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 100, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/rm/llama3/llama3-8b-instruct", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "checkpoints/neox_converted/llama3-8b-instruct", + "vocab-file": "checkpoints/neox_converted/llama3-8b-instruct/tokenizer/tokenizer.json", + "use_wandb": true, + "wandb_group": "llama3-8b-instruct", + "wandb_project": "ultrafeedback-rm", + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", +} diff --git a/post-training/configs/llama3-8b-sft.yml b/post-training/configs/llama3-8b-sft.yml new file mode 100644 index 000000000..bfcea1142 --- /dev/null +++ b/post-training/configs/llama3-8b-sft.yml @@ -0,0 +1,112 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 4, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "num_kv_heads": 8, + # llama3 supports more than this but this is just for testing. + "seq_length": 1024, + "max_position_embeddings": 1024, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 500000, + "rope_fusion": true, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "use_bias_in_mlp": false, + "use_flashattn_swiglu": true, + "activation": "swiglu", + "intermediate_size": 14336, + "mlp_multiple_of": 14336, + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + "min_lr": 0.000001, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_label_data_paths": [ "data/sft/llama3_train_messages_label_document" ], + "test_label_data_paths": [ "data/sft/llama3_test_messages_label_document" ], + "valid_label_data_paths": [ "data/sft/llama3_train_messages_label_document" ], + "train_data_paths": [ "data/sft/llama3_train_messages_document" ], + "test_data_paths": [ "data/sft/llama3_test_messages_document" ], + "valid_data_paths": [ "data/sft/llama3_train_messages_document" ], + + "train_micro_batch_size_per_gpu": 32, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + "pack_impl": "unpacked", + "num_workers": 1, + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + 
"gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 477, + "lr_decay_iters": 477, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "warmup": 0.1, + "checkpoint_factor": 1000, + "eval_interval": 100, + "eval_iters": 10, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + + "save": "checkpoints/sft/llama3/llama3-8b-instruct", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "checkpoints/neox_converted/llama3-8b-instruct", + "vocab-file": "checkpoints/neox_converted/llama3-8b-instruct/tokenizer/tokenizer.json", + "use_wandb": true, + "wandb_group": "llama3-8b-instruct", + "wandb_project": "ultrafeedback-sft", + "finetune": true, # set to false once resuming from intermediate finetuning step + "tokenizer_type": "HFTokenizer", +} diff --git a/post-training/dpo_data.py b/post-training/dpo_data.py new file mode 100644 index 000000000..d24eb43e5 --- /dev/null +++ b/post-training/dpo_data.py @@ -0,0 +1,103 @@ +""" +https://github.com/huggingface/alignment-handbook/blob/main/scripts/run_dpo.py +adapted to just grab the dataset +""" +import os +from alignment import ( + DataArguments, + DPOConfig, + H4ArgumentParser, + ModelArguments, + apply_chat_template, + decontaminate_humaneval, + get_checkpoint, + get_datasets, + get_kbit_device_map, + get_peft_config, + get_quantization_config, + get_tokenizer, + is_adapter_model, +) +from datasets import load_dataset, DatasetDict +from transformers import AutoTokenizer + +import jsonlines + +############### +# Load datasets +############### +raw_datasets = load_dataset("HuggingFaceH4/ultrafeedback_binarized") +raw_datasets = DatasetDict( + { + "train": raw_datasets["train_prefs"], + "test": raw_datasets["test_prefs"], + } +) +column_names = list(raw_datasets["train"].features) + +##################################### +# Load tokenizer and process datasets +##################################### +truncation_side = ( + "left" # Truncate from left to ensure we don't lose labels in final turn +) +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") + +##################### +# Apply chat template +##################### +raw_datasets = raw_datasets.map( + apply_chat_template, + fn_kwargs={ + "tokenizer": tokenizer, + "task": "dpo", + "auto_insert_empty_system_msg": True, + }, + desc="Formatting comparisons with prompt template", +) + +########################## +# Decontaminate benchmarks +########################## +num_raw_train_samples = len(raw_datasets["train"]) +raw_datasets = raw_datasets.filter( + decontaminate_humaneval, + fn_kwargs={"text_column": "text_chosen"}, + batched=True, + batch_size=10_000, + num_proc=1, + desc="Decontaminating HumanEval samples", +) +num_filtered_train_samples = num_raw_train_samples - len(raw_datasets["train"]) +print( + f"Decontaminated {num_filtered_train_samples} ({num_filtered_train_samples / num_raw_train_samples * 100:.2f}%) samples from the training set." +) +############### +# Length filter +############### +# Since the alignment handbook recipes call for a max token limit of 1024... 
+num_filtered_train_samples = len(raw_datasets["train"]) + + +def length_filter(example): + return (len(tokenizer.apply_chat_template(example["chosen"])) < 1024) and ( + len(tokenizer.apply_chat_template(example["rejected"])) < 1024 + ) + + +num_length_filtered_train_samples = num_filtered_train_samples - len( + raw_datasets["train"] +) +print( + f"Length Filtered {num_length_filtered_train_samples} ({num_length_filtered_train_samples / num_filtered_train_samples * 100:.2f}%) samples from the training set." +) +# get directory of the python script +dir_path = os.path.dirname(os.path.realpath(__file__)) +for split in ["train", "test"]: + with open(os.path.join(dir_path, f"dpo_{split}_filtered.jsonl"), "w") as f: + writer = jsonlines.Writer(f) + for item in raw_datasets[split]: + # add empty system messages + item["chosen"] = [{"role": "system", "content": ""}] + item["chosen"] + item["rejected"] = [{"role": "system", "content": ""}] + item["rejected"] + writer.write(item) diff --git a/post-training/llama_data.py b/post-training/llama_data.py new file mode 100644 index 000000000..eab6ac9f1 --- /dev/null +++ b/post-training/llama_data.py @@ -0,0 +1,49 @@ +import os + +from datasets import load_dataset, DatasetDict + +import jsonlines + +############### +# Load datasets +############### +raw_datasets = load_dataset("HuggingFaceH4/ultrafeedback_binarized") +# convert to just train and test, not necessary but it looks better +raw_datasets = DatasetDict( + { + "train": raw_datasets["train_prefs"], + "test": raw_datasets["test_prefs"], + } +) +os.makedirs(os.path.join("data", "pairwise"), exist_ok=True) +for split in ["train", "test"]: + with open( + os.path.join("data", "pairwise", f"llama3_dpo_{split}_filtered.jsonl"), "w" + ) as f: + writer = jsonlines.Writer(f) + for item in raw_datasets[split]: + item["chosen"] = item["chosen"] + item["rejected"] = item["rejected"] + writer.write(item) +os.makedirs(os.path.join("data", "sft"), exist_ok=True) +for split in ["train", "test"]: + with open( + os.path.join("data", "sft", f"llama3_sft_{split}_filtered.jsonl"), "w" + ) as f: + writer = jsonlines.Writer(f) + for item in raw_datasets[split]: + item["messages"] = item["chosen"] + writer.write(item) +os.makedirs(os.path.join("data", "kto"), exist_ok=True) +for split in ["train", "test"]: + with open( + os.path.join("data", "kto", f"llama3_kto_{split}_filtered.jsonl"), "w" + ) as f: + writer = jsonlines.Writer(f) + for item in raw_datasets[split]: + item["messages"] = item["chosen"] + item["reward"] = 1 + writer.write(item) + item["messages"] = item["rejected"] + item["reward"] = -1 + writer.write(item) diff --git a/post-training/recreating_zephyr_dpo.md b/post-training/recreating_zephyr_dpo.md new file mode 100644 index 000000000..d97eb3791 --- /dev/null +++ b/post-training/recreating_zephyr_dpo.md @@ -0,0 +1,39 @@ +# Initial setup + +```bash +python tools/ckpts/convert_hf_llama_to_neox.py --tp 2 --model HuggingFaceH4/mistral-7b-sft-beta --model_path checkpoints/neox_converted/zephyr-sft_tp2 +``` + + +# To generate data +First make a new environment... We want to keep the same data between runs so the easiest way is to create a new conda +environment and follow the steps below. +``` +conda create -n handbook python=3.10 && conda activate handbook +git clone https://github.com/huggingface/alignment-handbook.git +cd ./alignment-handbook/ +python -m pip install . 
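+# jsonlines is used by post-training/dpo_data.py to write the filtered splits to .jsonl files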
+python -m pip install jsonlines +``` + +## DPO data +```bash +# from the gpt-neox repo +conda activate handbook +python post-training/dpo_data.py +conda deactivate +# activate your neox conda environment, or whatever you need to switch to the neox environment +mkdir data +mkdir data/pairwise +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys rejected --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_train --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_test_filtered.jsonl --output-prefix data/pairwise/dpo_test --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last +python tools/datasets/preprocess_data_with_chat_template.py --input post-training/dpo_train_filtered.jsonl --output-prefix data/pairwise/dpo_val --tokenizer-path checkpoints/neox_converted/zephyr-sft/tokenizer --jsonl-keys chosen --only-last +``` + +## Running +```bash +python deepy.py train.py post-training/configs/benchmarking/mistral-dpo.yml +``` diff --git a/requirements/pyproject-apex-pip.toml b/requirements/pyproject-apex-pip.toml new file mode 100644 index 000000000..df41dc925 --- /dev/null +++ b/requirements/pyproject-apex-pip.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-apex-pip" +version = "0.1.0" +description = "Apex pip requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +pip = "23.3.2" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-comet.toml b/requirements/pyproject-comet.toml new file mode 100644 index 000000000..04422a213 --- /dev/null +++ b/requirements/pyproject-comet.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-comet" +version = "0.1.0" +description = "Comet ML requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +comet_ml = ">=3.45.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-flashattention.toml b/requirements/pyproject-flashattention.toml new file mode 100644 index 000000000..14c7ad112 --- /dev/null +++ b/requirements/pyproject-flashattention.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-flashattention" +version = "0.1.0" +description = "Flash Attention requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +flash-attn = "2.5.6" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git 
a/requirements/pyproject-mamba.toml b/requirements/pyproject-mamba.toml new file mode 100644 index 000000000..0f6191662 --- /dev/null +++ b/requirements/pyproject-mamba.toml @@ -0,0 +1,16 @@ +[tool.poetry] +name = "gpt-neox-mamba" +version = "0.1.0" +description = "Mamba requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +causal_conv1d = ">=1.1.0" +einops = "*" +mamba_ssm = ">=1.2.0.post1" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-neox-dev.toml b/requirements/pyproject-neox-dev.toml new file mode 100644 index 000000000..55b00f6ba --- /dev/null +++ b/requirements/pyproject-neox-dev.toml @@ -0,0 +1,23 @@ +[tool.poetry] +name = "gpt-neox-dev" +version = "0.1.0" +description = "Development requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +autopep8 = ">=1.5.6" +clang-format = ">=13.0.1" +pre-commit = ">=2.17.0" +pytest = ">=6.2.3" +pytest-cov = ">=2.11.1" +pytest-forked = ">=1.3.0" +pytest-html = "4.1.1" +pytest-xdist = "*" +toml = ">=0.10.2" +packaging = ">=23.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-onebitadam.toml b/requirements/pyproject-onebitadam.toml new file mode 100644 index 000000000..aeaf33aa6 --- /dev/null +++ b/requirements/pyproject-onebitadam.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-onebitadam" +version = "0.1.0" +description = "OneBitAdam requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +cupy-cuda111 = ">=8.6.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-s3.toml b/requirements/pyproject-s3.toml new file mode 100644 index 000000000..a0cb99aef --- /dev/null +++ b/requirements/pyproject-s3.toml @@ -0,0 +1,15 @@ +[tool.poetry] +name = "gpt-neox-s3" +version = "0.1.0" +description = "S3 requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +boto3 = "*" +hf-transfer = ">=0.1.3" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-sparseattention.toml b/requirements/pyproject-sparseattention.toml new file mode 100644 index 000000000..2864c799b --- /dev/null +++ b/requirements/pyproject-sparseattention.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-sparseattention" +version = "0.1.0" +description = "Sparse Attention requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +triton = "2.1.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-tensorboard.toml b/requirements/pyproject-tensorboard.toml new file mode 100644 index 000000000..79bbfa900 --- /dev/null +++ b/requirements/pyproject-tensorboard.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-tensorboard" +version = "0.1.0" +description = "TensorBoard requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +tensorboard = "2.13.0" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git 
a/requirements/pyproject-transformerengine.toml b/requirements/pyproject-transformerengine.toml new file mode 100644 index 000000000..7c313e0d9 --- /dev/null +++ b/requirements/pyproject-transformerengine.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-transformerengine" +version = "0.1.0" +description = "Transformer Engine requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +transformer-engine = {git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "stable"} + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject-wandb.toml b/requirements/pyproject-wandb.toml new file mode 100644 index 000000000..c5806b341 --- /dev/null +++ b/requirements/pyproject-wandb.toml @@ -0,0 +1,14 @@ +[tool.poetry] +name = "gpt-neox-wandb" +version = "0.1.0" +description = "Weights & Biases requirements for GPT-NeoX" +authors = ["EleutherAI "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = "^3.8" +wandb = ">=0.10.28" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/pyproject.toml b/requirements/pyproject.toml new file mode 100644 index 000000000..91d6fc1dd --- /dev/null +++ b/requirements/pyproject.toml @@ -0,0 +1,33 @@ +[tool.poetry] +name = "gpt-neox" +version = "2.0.0" +description = "An open-source library for training large-scale language models on GPUs" +authors = ["EleutherAI "] +license = "Apache-2.0" +readme = "README.md" +homepage = "https://www.github.com/eleutherai/gpt-neox" +repository = "https://www.github.com/eleutherai/gpt-neox" +documentation = "https://www.github.com/eleutherai/gpt-neox" + +[tool.poetry.dependencies] +python = "^3.8" +deepspeed = {git = "https://github.com/EleutherAI/DeeperSpeed.git", rev = "02e2ebf7dee6aaab3d89094ed470a4609763c742"} +ftfy = "^6.0.1" +huggingface_hub = "^0.11.0" +jinja2 = "3.1.4" +lm_dataformat = {git = "https://github.com/EleutherAI/lm_dataformat.git", rev = "4eec05349977071bf67fc072290b95e31c8dd836"} +lm_eval = ">=0.4.0,<=0.4.1" +mpi4py = "^3.0.3" +numpy = "<2.0" +pybind11 = "^2.6.2" +regex = "*" +sentencepiece = "*" +six = "*" +tiktoken = "^0.1.2" +tokenizers = "^0.12.1" +transformers = "4.38.0" +toml = "*" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements/requirements-comet.txt b/requirements/requirements-comet.txt new file mode 100644 index 000000000..904301eaa --- /dev/null +++ b/requirements/requirements-comet.txt @@ -0,0 +1 @@ +comet_ml>=3.45.0 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 60ff3224f..8dfd5595c 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,8 +1,10 @@ autopep8>=1.5.6 clang-format>=13.0.1 +packaging>=23.0 pre-commit>=2.17.0 pytest>=6.2.3 pytest-cov>=2.11.1 pytest-forked>=1.3.0 pytest-html==4.1.1 pytest-xdist +toml>=0.10.2 diff --git a/requirements/requirements-transformerengine.txt b/requirements/requirements-transformerengine.txt new file mode 100644 index 000000000..2050d7566 --- /dev/null +++ b/requirements/requirements-transformerengine.txt @@ -0,0 +1 @@ +pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 3ac92598a..b5a84674b 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,11 +1,11 @@ 
deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed ftfy>=6.0.1 -lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 jinja2==3.1.4 +lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 lm_eval>=0.4.0,<=0.4.1 mpi4py>=3.0.3 -numpy>=1.22.0 +numpy<2.0 pybind11>=2.6.2 regex sentencepiece diff --git a/tests/README.md b/tests/README.md index 316096cc5..32618d757 100644 --- a/tests/README.md +++ b/tests/README.md @@ -3,6 +3,7 @@ Tests use pytests with coverage and forked plugins. Install with: ```bash +pip install -r requirements/requirements.txt pip install -r requirements/requirements-dev.txt ``` @@ -32,7 +33,7 @@ pytest --forked tests/model/test_model_generation.py Some tests can run on cpu only. These are marked with the decorator @pytest.mark.cpu. The test cases for cpu can be run with: -```` +``` pytest tests -m cpu ``` @@ -49,3 +50,80 @@ if You see this kind of error: RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method ``` It usually means that you used some pytorch.cuda function before the test creates the processes. However just importing `from torch.utils import cpp_extension` can also trigger this. + + +## CPU Test Integration + +Tests can be run against physical CPUs through GitHub Actions. To have tests run on the physical CPU test, here is generally how the CI should be written: + +### runs-on + +#### NOTE: These BKMs were written to work with CI infrastructure that is no longer in place. To use the Github runners (ubuntu-22.04 / ubuntu-latest), skip the 'runs-on' section. + +The CI needs to be written to target the CPU Github Action runner. The jobs that need to run on CPU should use the hardware runner's labels: +```yaml +jobs: + cpu-test-job: + runs-on: [ 'self-hosted', 'aws', 'test'] # these labels tell GitHub to execute on the runner with the 'aws' and 'test' labels +``` + +### Software dependencies + +Hardware tests that need python and docker should install them as part of the test execution to make sure the tests run as expected: +```yaml +steps: + # sample syntax to setup python with pip + - uses: actions/setup-python@v4 + with: + python-version: "3.8" + cache: "pip" + + # sample setup of docker (there's no official Docker setup action) + - name: Docker setup + run: | # taken from Docker's installation page: https://docs.docker.com/engine/install/ubuntu/ + # Add Docker's official GPG key: + sudo apt-get update + sudo apt-get install ca-certificates curl + sudo install -m 0755 -d /etc/apt/keyrings + sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc + sudo chmod a+r /etc/apt/keyrings/docker.asc + # Add the repository to Apt sources: + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update + sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y +``` + +Any other software dependencies should be assumed to be missing and installed as part of the CI. + +### Using Docker image + +Using the Docker image and running tests in a container is recommended to resolve environment issues. 
There is a modified docker-compose.yml in tests/cpu_tests directory that is recommended to be used for CPU tests: + +```bash +cp tests/cpu_tests/docker-compose.yml . +# export any env variables here that should be used: +export NEOX_DATA_PATH='./data/enwik8' +docker compose run -d --build --name $CONTAINER gpt-neox tail -f /dev/null +# then can set up and run tests in the container using docker exec +docker exec $CONTAINER pip install -r /workspace/requirements-dev.txt +# etc. +# please clean up the container as part of the CI: +docker rm $CONTAINER +``` + +At the time of writing there is no built-in method to provide an offline-built Docker image to `jobs..container`. + +### Using existing CPU test CI + +There is an existing CPU test workflow that can be included in existing CI: + +```yaml +steps: + - name: Run CPU Tests + uses: + target_test_ref: $GITHUB_REF # replace with the ref/SHA that the tests should be run on + # have a look at the reusable workflow here: https://github.com/EleutherAI/gpt-neox/blob/main/tests/cpu_tests/action.yml +``` diff --git a/tests/model/test_model_train.py b/tests/model/test_model_train.py index 31798f342..65adfcdee 100644 --- a/tests/model/test_model_train.py +++ b/tests/model/test_model_train.py @@ -28,7 +28,6 @@ PARAMS_TO_TEST = { "gpt_j_residual": [True, False], - "mlp_type": ["llama", "regular"], "pos_emb": ["learned", "rotary", "sinusoidal", "rpe", "alibi", "none"], "attention_config": [ "global", diff --git a/tests/neox_args/test_neoxargs_usage.py b/tests/neox_args/test_neoxargs_usage.py index 176151c2a..5f8ba7bd2 100644 --- a/tests/neox_args/test_neoxargs_usage.py +++ b/tests/neox_args/test_neoxargs_usage.py @@ -66,7 +66,9 @@ def test_neoxargs_usage(): # find args matches matches = list( - re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents) + re.findall( + r"(?<=neox_args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents + ) ) if len(matches) == 0: continue diff --git a/tests/requirements/test_requirements.py b/tests/requirements/test_requirements.py new file mode 100644 index 000000000..20e8ad0dd --- /dev/null +++ b/tests/requirements/test_requirements.py @@ -0,0 +1,131 @@ +import pytest +import toml +from pathlib import Path +from typing import Dict, List, Optional +from packaging.version import parse as parse_version, Version +from dataclasses import dataclass + + +@dataclass +class Dependency: + name: str + version: Optional[str] = None + + @classmethod + def from_requirement(cls, requirement: str) -> "Dependency": + """Parse a requirement string into a Dependency object.""" + # Common version specifiers + specifiers = ["==", ">=", ">", "<=", "<"] + name = requirement + version = None + + for spec in specifiers: + if spec in requirement: + name, version = requirement.split(spec, 1) + version = version.strip() + break + + return cls(name.lower().strip(), version) + + def matches_version(self, other_version: str) -> bool: + """Check if this dependency's version matches another version string.""" + if not self.version or not other_version: + return True + + try: + # Convert versions to comparable objects + our_version = parse_version(self.version) + their_version = parse_version(other_version.replace("*", "0")) + return our_version == their_version + except ValueError: + # If versions can't be parsed, fall back to string comparison + return self.version.replace("*", "0") == other_version.replace("*", "0") + + +class DependencyValidator: + def __init__(self, requirements_dir: Path): + self.requirements_dir = requirements_dir 
+ + def parse_requirements(self, file_path: Path) -> List[Dependency]: + """Parse requirements.txt file into a list of Dependencies.""" + try: + with open(file_path, "r") as f: + lines = [ + line.strip() + for line in f + if line.strip() and not line.startswith("#") + ] + return [Dependency.from_requirement(line) for line in lines] + except FileNotFoundError: + raise FileNotFoundError(f"Requirements file not found: {file_path}") + except Exception as e: + raise ValueError(f"Error parsing requirements file {file_path}: {str(e)}") + + def parse_pyproject(self, file_path: Path) -> Dict[str, str]: + """Parse pyproject.toml file and extract dependencies.""" + try: + with open(file_path, "r") as f: + pyproject_data = toml.load(f) + return { + name.lower(): str(version) + for name, version in pyproject_data["tool"]["poetry"][ + "dependencies" + ].items() + if name.lower() != "python" # Exclude Python version + } + except FileNotFoundError: + raise FileNotFoundError(f"pyproject.toml file not found: {file_path}") + except Exception as e: + raise ValueError(f"Error parsing pyproject.toml {file_path}: {str(e)}") + + def compare_dependencies( + self, req_deps: List[Dependency], pyproject_deps: Dict[str, str] + ) -> tuple[bool, List[str]]: + """Compare dependencies between requirements.txt and pyproject.toml.""" + mismatches = [] + + for req in req_deps: + if req.name not in pyproject_deps: + mismatches.append( + f"Dependency '{req.name}' not found in pyproject.toml" + ) + continue + + if not req.matches_version(pyproject_deps[req.name]): + mismatches.append( + f"Version mismatch for '{req.name}': " + f"requirements.txt={req.version}, " + f"pyproject.toml={pyproject_deps[req.name]}" + ) + + return len(mismatches) == 0, mismatches + + +def get_corresponding_pyproject(req_file: Path) -> Path: + """Get the corresponding pyproject.toml file for a requirements file.""" + env_name = req_file.stem.split("-")[1] + return req_file.parent / f"pyproject-{env_name}.toml" + + +@pytest.mark.parametrize("req_file", Path("requirements").glob("requirements-*.txt")) +def test_pyproject_matches_requirements(req_file: Path): + """Test that requirements.txt dependencies match pyproject.toml dependencies.""" + validator = DependencyValidator(req_file.parent) + pyproject_file = get_corresponding_pyproject(req_file) + + # Parse both dependency files + req_deps = validator.parse_requirements(req_file) + pyproject_deps = validator.parse_pyproject(pyproject_file) + + # Compare dependencies and get detailed mismatches + is_match, mismatches = validator.compare_dependencies(req_deps, pyproject_deps) + + # Create detailed error message if there are mismatches + if not is_match: + error_msg = "\n".join( + [ + f"\nDependency mismatches found between {req_file} and {pyproject_file}:", + *[f"- {msg}" for msg in mismatches], + ] + ) + pytest.fail(error_msg) diff --git a/tests/unit/test_format_conversion_scripts.py b/tests/unit/test_format_conversion_scripts.py index e0801434c..6935e480a 100644 --- a/tests/unit/test_format_conversion_scripts.py +++ b/tests/unit/test_format_conversion_scripts.py @@ -4,8 +4,12 @@ from megatron.neox_arguments.neox_args import NeoXArgsTokenizer +@pytest.mark.skip( + reason="Conversion test is skipped until we fix the CUDA + torch multiprocessing issue." 
+) def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path): # Generate random GPT-NEOX model, check we can convert to hf format + model_dir = str(tmpdir) input_args = ["train.py", "tests/config/test_setup.yml"] deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args) diff --git a/tools/ckpts/README.md b/tools/ckpts/README.md index 24d5cf31c..770cfb9c6 100644 --- a/tools/ckpts/README.md +++ b/tools/ckpts/README.md @@ -131,3 +131,20 @@ options: --num_output_shards NUM_OUTPUT_SHARDS --pipeline_parallel Only use if PP>1 ``` + +### `convert_hf_llama_to_neox.py` +Takes an HF Llama checkpoint and puts it into a NeoX-compatible format. + +Note that this does not support pipeline parallelism! + +``` +usage: convert_hf_llama_to_neox.py [-h] [--tp TP] [--pp PP] [--model MODEL] [--model_path MODEL_PATH] + +options: + -h, --help show this help message and exit + --tp TP Number of tensor parallelism ranks + --pp PP Number of pipeline parallelism stages + --model MODEL HF model name + --model_path MODEL_PATH + Path to save model +``` diff --git a/tools/ckpts/convert_hf_llama_to_neox.py b/tools/ckpts/convert_hf_llama_to_neox.py new file mode 100644 index 000000000..21249995b --- /dev/null +++ b/tools/ckpts/convert_hf_llama_to_neox.py @@ -0,0 +1,211 @@ +import torch +import argparse +from transformers import AutoTokenizer, AutoModelForCausalLM +import os +import tqdm + + +def convert_model(hf_state_dict, hf_config, tp_ranks): + conv_state_dicts = [{} for _ in range(tp_ranks)] + # get embeddings... + for i, chunk in enumerate( + torch.chunk(hf_state_dict["model.embed_tokens.weight"], tp_ranks, dim=0) + ): + conv_state_dicts[i][ + "sequential.0.word_embeddings.weight" + ] = chunk.clone().detach() + print( + "model.embed_tokens.weight", + hf_state_dict["model.embed_tokens.weight"].shape, + "sequential.0.word_embeddings.weight", + conv_state_dicts[0]["sequential.0.word_embeddings.weight"].shape, + ) + # Get config data... + num_kv_heads = hf_config.num_key_value_heads + num_q_heads = hf_config.num_attention_heads + head_dim = hf_config.hidden_size // num_q_heads + # do layers... + for layer_num in tqdm.tqdm(range(model.model.config.num_hidden_layers)): + # --- attention --- + # Output first since it's a simple row parallel... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.self_attn.o_proj.weight"], + tp_ranks, + dim=1, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.attention.dense.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.self_attn.o_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.self_attn.o_proj.weight"].shape, + f"sequential.{layer_num+2}.attention.dense.weight", + conv_state_dicts[0][ + f"sequential.{layer_num+2}.attention.dense.weight" + ].shape, + ) + # Now for attention... + # Split into heads... + q = hf_state_dict[f"model.layers.{layer_num}.self_attn.q_proj.weight"] + k = hf_state_dict[f"model.layers.{layer_num}.self_attn.k_proj.weight"] + v = hf_state_dict[f"model.layers.{layer_num}.self_attn.v_proj.weight"] + # The GQA code splits the heads by the num_q_heads so we also do that + # here to ensure it matches... + q = q.view(num_q_heads, -1, q.shape[-1]) + k = k.view(num_q_heads, -1, q.shape[-1]) + v = v.view(num_q_heads, -1, q.shape[-1]) + # Chunk for tensor parallelism... + for i, q_chunk, k_chunk, v_chunk in zip( + range(tp_ranks), + torch.chunk(q, tp_ranks, dim=0), + torch.chunk(k, tp_ranks, dim=0), + torch.chunk(v, tp_ranks, dim=0), + ): + # Need to join the heads across q, k, v... 
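+        # concatenating along dim=1 places each query-head slice next to its
+        # matching k and v slices for that head group; the view() then flattens
+        # the result back to 2D so this rank's shard of the fused
+        # query_key_value weight is laid out head group by head group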
+ conv_state_dicts[i][ + f"sequential.{layer_num+2}.attention.query_key_value.weight" + ] = ( + torch.cat([q_chunk, k_chunk, v_chunk], dim=1) + .view(-1, q.shape[-1]) + .clone() + .detach() + ) + print( + f"model.layers.{layer_num}.self_attn.(q/k/v)_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.self_attn.q_proj.weight"].shape, + hf_state_dict[f"model.layers.{layer_num}.self_attn.k_proj.weight"].shape, + hf_state_dict[f"model.layers.{layer_num}.self_attn.v_proj.weight"].shape, + f"sequential.{layer_num+2}.attention.query_key_value.weight", + conv_state_dicts[0][ + f"sequential.{layer_num+2}.attention.query_key_value.weight" + ].shape, + ) + # --- mlp --- + # Do SwiGLU weights... + # w1... + for i, (w1, w3) in enumerate( + zip( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.gate_proj.weight"], + tp_ranks, + dim=0, + ), + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.up_proj.weight"], + tp_ranks, + dim=0, + ), + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.linear1.weight" + ] = torch.cat([w3.clone().detach(), w1.clone().detach()], dim=0) + print( + f"model.layers.{layer_num}.mlp.gate_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.gate_proj.weight"].shape, + f"model.layers.{layer_num}.mlp.up_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.up_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.w3.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.linear1.weight"].shape, + ) + # w2 (output)... + for i, chunk in enumerate( + torch.chunk( + hf_state_dict[f"model.layers.{layer_num}.mlp.down_proj.weight"], + tp_ranks, + dim=1, + ) + ): + conv_state_dicts[i][ + f"sequential.{layer_num+2}.mlp.linear2.weight" + ] = chunk.clone().detach() + print( + f"model.layers.{layer_num}.mlp.down_proj.weight", + hf_state_dict[f"model.layers.{layer_num}.mlp.down_proj.weight"].shape, + f"sequential.{layer_num+2}.mlp.linear2.weight", + conv_state_dicts[0][f"sequential.{layer_num+2}.mlp.linear2.weight"].shape, + ) + # --- norm --- + for i in range(tp_ranks): + conv_state_dicts[i][f"sequential.{layer_num+2}.input_layernorm.scale"] = ( + hf_state_dict[f"model.layers.{layer_num}.input_layernorm.weight"] + .clone() + .detach() + ) + conv_state_dicts[i][ + f"sequential.{layer_num+2}.post_attention_layernorm.scale" + ] = ( + hf_state_dict[ + f"model.layers.{layer_num}.post_attention_layernorm.weight" + ] + .clone() + .detach() + ) + + # Get final ln/linear.... + index = model.model.config.num_hidden_layers + 3 + for i in range(tp_ranks): + conv_state_dicts[i][f"sequential.{index}.norm.scale"] = ( + hf_state_dict["model.norm.weight"].clone().detach() + ) + index += 1 + # do output... 
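+    # the output head is split along the vocab dimension (dim 0) across TP ranks,
+    # mirroring how the word embedding was chunked at the top of this function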
+ for i, chunk in enumerate( + torch.chunk(hf_state_dict["lm_head.weight"], tp_ranks, dim=0) + ): + conv_state_dicts[i][ + f"sequential.{index}.final_linear.weight" + ] = chunk.clone().detach() + print( + "lm_head.weight", + hf_state_dict["lm_head.weight"].shape, + f"sequential.{index}.final_linear.weight", + conv_state_dicts[0][f"sequential.{index}.final_linear.weight"].shape, + ) + return conv_state_dicts + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tp", type=int, default=1, help="Number of tensor parallelism ranks" + ) + parser.add_argument( + "--pp", type=int, default=0, help="Number of pipeline parallelism stages" + ) + parser.add_argument("--model", type=str, default="gpt2", help="HF model name") + parser.add_argument( + "--model_path", type=str, default=None, help="Path to save model" + ) + args = parser.parse_args() + assert args.pp == 0, "Pipeline parallelism not supported yet" + tokenizer = AutoTokenizer.from_pretrained(args.model).save_pretrained( + args.model_path + "/tokenizer" + ) + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype="auto") + state_dict = model.state_dict() + for key in state_dict.keys(): + print(key, state_dict[key].shape) + os.makedirs(args.model_path, exist_ok=True) + # Setup model directory... + os.makedirs(f"{args.model_path}/0", exist_ok=True) + # Save the latest file so neox can figure out where to grab the weights... + with open(f"{args.model_path}/latest", "w") as f: + f.write("0") + # Convert the model... + tp_state_dicts = convert_model(state_dict, model.model.config, args.tp) + for i in range(args.tp): + torch.save( + { + "dp_world_size": 1, + "mp_world_size": args.tp, + "optimizer": {}, + "global_steps": 1, + "skipped_steps": 1, + "iteration": 1, + "module": tp_state_dicts[i], + }, + f"{args.model_path}/0/mp_rank_{i:02d}_model_states.pt", + ) diff --git a/tools/ckpts/convert_hf_to_sequential.py b/tools/ckpts/convert_hf_to_sequential.py index c53f28391..5e0ada334 100644 --- a/tools/ckpts/convert_hf_to_sequential.py +++ b/tools/ckpts/convert_hf_to_sequential.py @@ -119,16 +119,27 @@ def shard_sequential_mp(num_mp_ranks, sequential): ranks = {x: dict() for x in range(num_mp_ranks)} for k, v in sequential.items(): if reduce( + np.logical_or, + [ + x in k + for x in [ + "dense_4h_to_h.bias", + "attention.dense.bias", + ] + ], + ): + # Divide by tp_size since they get added together + for x in range(num_mp_ranks): + ranks[x][k] = v / num_mp_ranks + elif reduce( np.logical_or, [ x in k for x in [ "layernorm", "rotary_emb", - "dense_4h_to_h.bias", "norm.weight", "norm.bias", - "attention.dense.bias", ] ], ): @@ -504,6 +515,7 @@ def get_non_existing_dir(tmp_dir): neox_args.configure_distributed_args() neox_args.build_tokenizer() neox_args.initialize_tensorboard_writer() + neox_args.comet() # setup logging and timers # init_wandb(neox_args=neox_args) diff --git a/tools/ckpts/convert_neox_to_hf.py b/tools/ckpts/convert_neox_to_hf.py index 35812383e..8dfe02d54 100644 --- a/tools/ckpts/convert_neox_to_hf.py +++ b/tools/ckpts/convert_neox_to_hf.py @@ -26,6 +26,7 @@ GPTNeoXConfig, AutoModelForCausalLM, AutoConfig, + AutoModelForSequenceClassification, ) from typing import List, Literal @@ -50,57 +51,110 @@ # Model definitions: a list of keys, and where they fall in terms of handling them in the presence of TP. 
# in format : {model arch: {param type: {param in neox: param in HF}}} - MODEL_KEYS = { "neox": { - "COLUMN_PARALLEL_LINEAR_KEYS": { - "mlp.dense_h_to_4h.weight": "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias": "mlp.dense_h_to_4h.bias", - "attention.query_key_value.weight": "attention.query_key_value.weight", - "attention.query_key_value.bias": "attention.query_key_value.bias", # TODO: handle GQA separately? - }, - "ROW_PARALLEL_LINEAR_KEYS": { - "attention.dense.weight": "attention.dense.weight", - "mlp.dense_4h_to_h.weight": "mlp.dense_4h_to_h.weight", - }, - "ROW_PARALLEL_BIAS_KEYS": { - "mlp.dense_4h_to_h.bias": "mlp.dense_4h_to_h.bias", - "attention.dense.bias": "attention.dense.bias", + "new": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.linear1.weight": "mlp.dense_h_to_4h.weight", + "mlp.linear1.bias": "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight": "attention.query_key_value.weight", + "attention.query_key_value.bias": "attention.query_key_value.bias", # TODO: handle GQA separately? + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "attention.dense.weight", + "mlp.linear2.weight": "mlp.dense_4h_to_h.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": { + "mlp.linear2.bias": "mlp.dense_4h_to_h.bias", + "attention.dense.bias": "attention.dense.bias", + }, + "NORM_KEYS": { + "input_layernorm.weight": "input_layernorm.weight", + "input_layernorm.bias": "input_layernorm.bias", + "post_attention_layernorm.weight": "post_attention_layernorm.weight", + "post_attention_layernorm.bias": "post_attention_layernorm.bias", + }, + "FINAL_NORM_KEYS": { + "norm.weight": "weight", + "norm.bias": "bias", + }, }, - "NORM_KEYS": { - "input_layernorm.weight": "input_layernorm.weight", - "input_layernorm.bias": "input_layernorm.bias", - "post_attention_layernorm.weight": "post_attention_layernorm.weight", - "post_attention_layernorm.bias": "post_attention_layernorm.bias", - }, - "FINAL_NORM_KEYS": { - "norm.weight": "weight", - "norm.bias": "bias", + "legacy": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.dense_h_to_4h.weight": "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias": "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight": "attention.query_key_value.weight", + "attention.query_key_value.bias": "attention.query_key_value.bias", # TODO: handle GQA separately? 
+ }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "attention.dense.weight", + "mlp.dense_4h_to_h.weight": "mlp.dense_4h_to_h.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": { + "mlp.dense_4h_to_h.bias": "mlp.dense_4h_to_h.bias", + "attention.dense.bias": "attention.dense.bias", + }, + "NORM_KEYS": { + "input_layernorm.weight": "input_layernorm.weight", + "input_layernorm.bias": "input_layernorm.bias", + "post_attention_layernorm.weight": "post_attention_layernorm.weight", + "post_attention_layernorm.bias": "post_attention_layernorm.bias", + }, + "FINAL_NORM_KEYS": { + "norm.weight": "weight", + "norm.bias": "bias", + }, }, }, "llama": { - "COLUMN_PARALLEL_LINEAR_KEYS": { - "mlp.w1.weight": "mlp.gate_proj.weight", - "mlp.w3.weight": "mlp.up_proj.weight", - }, - "ROW_PARALLEL_LINEAR_KEYS": { - "attention.dense.weight": "self_attn.o_proj.weight", - "mlp.w2.weight": "mlp.down_proj.weight", + "new": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.linear1.weight": ["mlp.up_proj.weight", "mlp.gate_proj.weight"] + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "self_attn.o_proj.weight", + "mlp.linear2.weight": "mlp.down_proj.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": {}, # No biases in RowParallelLinear layers + "NORM_KEYS": { + "input_layernorm.scale": "input_layernorm.weight", + "post_attention_layernorm.scale": "post_attention_layernorm.weight", + }, + "FINAL_NORM_KEYS": { + "norm.scale": "weight", + }, + "GQA_QKV_KEYS": { # because Llama can have Grouped Query Attention and has separate Q, K, and V linear proj params, handle them separately. + "attention.query_key_value.weight": [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + ], + }, }, - "ROW_PARALLEL_BIAS_KEYS": {}, # No biases in RowParallelLinear layers - "NORM_KEYS": { - "input_layernorm.scale": "input_layernorm.weight", - "post_attention_layernorm.scale": "post_attention_layernorm.weight", - }, - "FINAL_NORM_KEYS": { - "norm.scale": "weight", - }, - "GQA_QKV_KEYS": { # because Llama can have Grouped Query Attention and has separate Q, K, and V linear proj params, handle them separately. - "attention.query_key_value.weight": [ - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - ], + "legacy": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.w1.weight": "mlp.gate_proj.weight", + "mlp.w3.weight": "mlp.up_proj.weight", + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "self_attn.o_proj.weight", + "mlp.w2.weight": "mlp.down_proj.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": {}, # No biases in RowParallelLinear layers + "NORM_KEYS": { + "input_layernorm.scale": "input_layernorm.weight", + "post_attention_layernorm.scale": "post_attention_layernorm.weight", + }, + "FINAL_NORM_KEYS": { + "norm.scale": "weight", + }, + "GQA_QKV_KEYS": { # because Llama can have Grouped Query Attention and has separate Q, K, and V linear proj params, handle them separately. + "attention.query_key_value.weight": [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + ], + }, }, }, } @@ -165,7 +219,7 @@ def get_key(loaded_config, key, default=None): return default -def create_config(neox_config, architecture="neox"): +def create_config(neox_config, architecture="neox", is_rm=False, pad_token_id=-1): """take in a loaded yaml from NeoX and assign relevant values to HF config. 
Returns: GPTNeoXConfig() object """ @@ -238,7 +292,9 @@ def __init__(self, neox_config): "num-kv-heads", get_key(neox_config, "num-attention-heads"), ), - "hidden_act": get_key(neox_config, "activation", default="silu"), + "hidden_act": get_key( + neox_config, "activation", default="silu" + ).replace("swiglu", "silu"), "rms_norm_eps": get_key(neox_config, "rms-norm-epsilon", 1.0e-6), "bos_token_id": tokenizer.eod, "eos_token_id": tokenizer.eod, @@ -285,6 +341,9 @@ def __init__(self, neox_config): } ) hf_config = GPTNeoXConfig(**args) + if is_rm: + hf_config.num_labels = 1 + hf_config.pad_token_id = pad_token_id return hf_config @@ -383,6 +442,30 @@ def reshard_and_split_qkv( return state_dict +def get_mlp_naming_convention(loaded_tp_ranks, layer_idx, sequential): + """Determine whether the checkpoint uses the legacy or new MLP naming convention.""" + print(list(loaded_tp_ranks[0]["module"].keys())) + if any( + [ + ["mlp.linear1.weight" in key for key in list(state_dict["module"].keys())] + for state_dict in loaded_tp_ranks + ] + ): + return "new" + elif any( + [ + [ + "mlp.dense_h_to_4h.weight" in key + for key in list(state_dict["module"].keys()) + ] + for state_dict in loaded_tp_ranks + ] + ): + return "legacy" + else: + raise ValueError("Unable to determine MLP naming convention in checkpoint") + + def convert( input_checkpoint_path, loaded_config, @@ -390,6 +473,8 @@ def convert( sequential: bool = True, precision: Literal["auto", "fp16", "bf16", "fp32"] = "auto", architecture: Literal["neox", "llama", "mistral"] = "neox", + is_rm: bool = False, + pad_token_id: int = -1, ): """convert a NeoX checkpoint to a HF model format. should perform model-parallel merging correctly @@ -398,9 +483,14 @@ def convert( ARCH = MODEL_KEYS[architecture] - hf_config = create_config(loaded_config, architecture=architecture) + hf_config = create_config( + loaded_config, architecture=architecture, is_rm=is_rm, pad_token_id=pad_token_id + ) - hf_model = AutoModelForCausalLM.from_config(hf_config) + if not is_rm: + hf_model = AutoModelForCausalLM.from_config(hf_config) + else: + hf_model = AutoModelForSequenceClassification.from_config(hf_config) if architecture == "neox": hf_transformer = hf_model.gpt_neox @@ -474,6 +564,20 @@ def convert( ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {embed_in.shape[0]}" ### End Embedding Layer ### + # grab from 3rd layer to pass embeddings + mlp_naming = get_mlp_naming_convention( + load_partitions( + input_checkpoint_path, + mp_partitions, + layer_idx=3, + sequential=sequential, + ), + 0, + sequential, + ) + print(f"Detected MLP naming convention: {mlp_naming}") + ARCH = ARCH[mlp_naming] + for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): # get layer from hf model @@ -509,12 +613,31 @@ def convert( # LinearWithTPMerge for key, hf_key in ARCH["COLUMN_PARALLEL_LINEAR_KEYS"].items(): - state_dict[hf_key] = torch.cat( - get_state( - loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential - ), - dim=0, - ) + if type(hf_key) == list: + # Llama magic - split the weight into two parts for the gate and up proj + states = [ + torch.chunk(state, chunks=2, dim=0) + for state in get_state( + loaded_tp_ranks, + key, + layer_idx=layer_i + 2, + sequential=sequential, + ) + ] + # Set up proj... + state_dict[hf_key[0]] = torch.cat([state[0] for state in states], dim=0) + # Set gate proj... 
+ state_dict[hf_key[1]] = torch.cat([state[1] for state in states], dim=0) + else: + state_dict[hf_key] = torch.cat( + get_state( + loaded_tp_ranks, + key, + layer_idx=layer_i + 2, + sequential=sequential, + ), + dim=0, + ) # LinearWithTPSplitBias for key, hf_key in ARCH["ROW_PARALLEL_BIAS_KEYS"].items(): @@ -556,10 +679,6 @@ def convert( sequential=sequential, ) # Load final layer norm - if architecture == "neox": - lm_head = hf_model.embed_out - else: - lm_head = hf_model.lm_head norm_state_dict = {} for key, hf_key in ARCH["FINAL_NORM_KEYS"].items(): norm_state_dict[hf_key] = sum( @@ -580,30 +699,64 @@ def convert( # Load output embedding if not sequential: - loaded_tp_ranks = load_partitions( - input_checkpoint_path, - mp_partitions, - get_key(loaded_config, "num-layers") + 4, - sequential=sequential, - ) + if get_key(loaded_config, "no-weight-tying", False): + # if we have trained input + output embedding layers without tied weights + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ) + else: + # in this case, output embedding layer and input embedding layer are tied. + # load + save the input embed weights into the output embedding layer's place. + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + layer_idx=0, + sequential=sequential, + ) # output embedding / LM head - if architecture == "neox": # name of lm head / final linear proj varies - lm_head = hf_model.embed_out + if not is_rm: + if architecture == "neox": # name of lm head / final linear proj varies + lm_head = hf_model.embed_out + else: + lm_head = hf_model.lm_head else: - lm_head = hf_model.lm_head - lm_head.load_state_dict( - { - "weight": torch.cat( - get_state( - loaded_tp_ranks, - "final_linear.weight", - layer_idx=get_key(loaded_config, "num-layers") + 4, - sequential=sequential, + lm_head = hf_model.score + + if get_key(loaded_config, "no-weight-tying", False): + # save the (untied) final linear into LM head for HF + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "final_linear.weight" if not is_rm else "rm_linear.weight", + layer_idx=get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ), + dim=0 if not is_rm else 1, ), - dim=0, - ), - } - ) + } + ) + else: + # don't need to worry about rm here since you can't really tie them... + + # embedding layers are tied. transpose input layer and save + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "word_embeddings.weight", + layer_idx=0, + sequential=sequential, + ), + dim=0, + ), + } + ) del loaded_tp_ranks @@ -642,6 +795,17 @@ def main(input_args=None, overwrite_values=None): action="store_true", help="Whether to skip saving the tokenizer alongside a model.", ) + parser.add_argument( + "--vocab-is-hf-tokenizer", + action="store_true", + help="Whether the vocab file is in a Huggingface tokenizer path.", + ) + parser.add_argument( + "--pad-token-id", + type=int, + default=-1, + help="Pad token id to set in tokenizer. Required for RM style models.", + ) parser.add_argument( "--architecture", type=str, @@ -674,6 +838,9 @@ def main(input_args=None, overwrite_values=None): # while Sequential model state dicts are saved all together in one mp_rank_xx_model_states.pt # file per tensor/model parallel shard. 
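The list-valued mapping handled in the `COLUMN_PARALLEL_LINEAR_KEYS` branch above (`mlp.linear1.weight` -> `up_proj` / `gate_proj`) assumes each tensor-parallel shard stores the fused MLP input projection as two stacked halves: every shard is chunked in two on dim 0, then matching halves are concatenated across shards. A rough standalone sketch of that reshaping, not the converter's own code:

```python
import torch

def split_fused_linear1(shards):
    # Chunk each TP shard into its two halves, then rebuild the full
    # up_proj / gate_proj matrices by concatenating matching halves across
    # shards (first half -> up_proj, second -> gate_proj, per the mapping above).
    halves = [torch.chunk(s, chunks=2, dim=0) for s in shards]
    up_proj = torch.cat([h[0] for h in halves], dim=0)
    gate_proj = torch.cat([h[1] for h in halves], dim=0)
    return up_proj, gate_proj

# two hypothetical TP shards of an 8x4 fused weight
up, gate = split_fused_linear1([torch.randn(4, 4), torch.randn(4, 4)])
assert up.shape == gate.shape == (4, 4)
```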
pipeline_world_size = get_key(loaded_config, "pipe-parallel-size", 1) + is_rm = get_key(loaded_config, "train_impl", "normal") == "rm" + if is_rm and args.pad_token_id == -1: + raise ValueError("RM models require a pad token id to be set.") if pipeline_world_size == 0: sequential = True print( @@ -692,6 +859,8 @@ def main(input_args=None, overwrite_values=None): args.output_dir, sequential=sequential, architecture=args.architecture, + is_rm=is_rm, + pad_token_id=args.pad_token_id, ) # Save to disk. @@ -700,8 +869,18 @@ def main(input_args=None, overwrite_values=None): if not args.no_save_tokenizer: # save tokenizer to directory as well, for easy loading of model as a HF model. tokenizer_type = get_key(loaded_config, "tokenizer-type") + if args.vocab_is_hf_tokenizer: + from transformers import AutoTokenizer - if tokenizer_type == "HFTokenizer": # TODO: handle sentencepiece tokenizers? + tokenizer = AutoTokenizer.from_pretrained( + os.path.dirname(get_key(loaded_config, "vocab-file")) + ) + if args.pad_token_id != -1: + tokenizer.pad_token_id = args.pad_token_id + print("loaded tokenizer: ", tokenizer) + tokenizer.save_pretrained(args.output_dir) + print("tokenizer saved!") + elif tokenizer_type == "HFTokenizer": # TODO: handle sentencepiece tokenizers? print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") print( "Warning: please check that your model config and tokenizer end with the correct special tokens (EOS, BOS)." @@ -711,6 +890,8 @@ def main(input_args=None, overwrite_values=None): tokenizer = PreTrainedTokenizerFast( tokenizer_file=get_key(loaded_config, "vocab-file") ) + if args.pad_token_id != -1: + tokenizer.pad_token_id = args.pad_token_id print("loaded tokenizer: ", tokenizer) tokenizer.save_pretrained(args.output_dir) print("tokenizer saved!") diff --git a/tools/datasets/README.md b/tools/datasets/README.md index f8215959c..af3009a23 100644 --- a/tools/datasets/README.md +++ b/tools/datasets/README.md @@ -93,6 +93,57 @@ output data: --dataset-impl {lazy,cached,mmap} Dataset implementation to use. Default: mmap +runtime: + --workers WORKERS Number of worker processes to launch + --log-interval LOG_INTERVAL + Interval between progress updates +``` +## `preprocess_data_with_chat_template.py` +Similar, but uses huggingface's [chat templates](https://huggingface.co/docs/transformers/main/en/chat_templating) to +tokenize the data to support multiturn and more complicated use cases. + +N.B. If using this, you **must** specify your data when training/finetuning with the following configs +```json +"train_data_paths": ["train_documents"], +"test_data_paths": ["test_documents"], +"valid_data_paths": ["test_documents"], +"label_data_paths": ["label_documents"] +``` + +the `"data_path"` option will not work with `"label_data_paths"`. + + +``` +usage: preprocess_data_with_chat_template.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--no-mask] + [--generation-role GENERATION_ROLE] [--only-last] [--num-docs NUM_DOCS] + --tokenizer-path TOKENIZER_PATH [--ftfy] --output-prefix OUTPUT_PREFIX + [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] + [--log-interval LOG_INTERVAL] + +options: + -h, --help show this help message and exit + +input data: + --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated list + --jsonl-keys JSONL_KEYS [JSONL_KEYS ...] + space separate listed of keys to extract from jsonl. Default: text + --no-mask If set, this will not mask any tokens in the input data. 
+ --generation-role GENERATION_ROLE + The role of the model generating the chat, usually 'assistant'. Default: assistant + --only-last If set, this will mask everything except the last turn in the chat. + --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar. + +tokenizer: + --tokenizer-path TOKENIZER_PATH + Path to HF Tokenizer. + --ftfy Use ftfy to clean text + +output data: + --output-prefix OUTPUT_PREFIX + Path to binary output file without suffix + --dataset-impl {lazy,cached,mmap} + Dataset implementation to use. Default: mmap + runtime: --workers WORKERS Number of worker processes to launch --log-interval LOG_INTERVAL diff --git a/tools/datasets/preprocess_data_with_chat_template.py b/tools/datasets/preprocess_data_with_chat_template.py new file mode 100644 index 000000000..ee2b983b6 --- /dev/null +++ b/tools/datasets/preprocess_data_with_chat_template.py @@ -0,0 +1,416 @@ +# Copyright (c) 2024, EleutherAI +# This file is based on code by the authors denoted below and has been modified from its original version. +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +A script for processing a dataset such that chat templates are utilized in the creation of the data. +These are then used to perform instruction/chat model finetunes (for example, finetuning a model on only the assistant +portions of a chatml dataset). + +This follows the same output format as 'preprocess_data_with_mask.py' but using chat templates to generate the data. +This way we can support multiturn chat data in the finetuning process. instead of relying on a single turn of data. + +To run this script, first edit `tools/datasets/corpora.py` such that the command to call + `tools/datasets/preprocess_data_with_chat_template.py` is as follows: + +``` +cmd = f"python tools/datasets/preprocess_data_with_with_chat_template.py \ + --input {jsonl_filepath} \ + --output-prefix {parent_folder}/{self.name} \ + --tokenizer-path {hf-tokenizer} \ + --jsonl-keys {jsonl_keys} \ + --dataset-impl mmap \ + --workers {self.num_workers} " + +if self.only_last: + cmd += f"--only-last " + +if self.no_mask: + cmd += f"--no-mask " +``` + +Then, specify +``` +"train_data_paths": ["/path/to/dataset/name_text_document"], +"label_data_paths": ["/path/to/dataset/name_label_document"] +``` +in your YML config. This will then allow for finetuning on the data with loss masks set appropriately. 
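As a concrete illustration of the expected input, each JSONL line holds a list of role/content turns under the key passed via `--jsonl-keys` (the script's code defaults to `conversation`). The record below is hypothetical:

```python
import json

# One made-up training example in the chat-template format the script expects.
record = {
    "conversation": [
        {"role": "user", "content": "Explain pipeline parallelism in one sentence."},
        {"role": "assistant", "content": "It splits a model's layers across devices and streams microbatches through them."},
    ]
}
with open("chat_data.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")
```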
+ +""" + +import argparse +import multiprocessing +import os +import sys + +import lm_dataformat as lmd +import numpy as np + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) + +import time +import tqdm +import jsonlines + +from megatron.data import indexed_dataset +from threading import Semaphore +from typing import List, Dict, Tuple +from transformers import AutoTokenizer, PreTrainedTokenizer + + +def build_chat( + chat: List[Dict[str, str]], + generation_role: str, + apply_mask: bool, + tokenizer: PreTrainedTokenizer, + only_last_turn: bool = False, + for_rm: bool = False, +) -> Tuple[List[int], List[int]]: + """ + Build a chat from a list of dictionaries. Each dictionary should have a "role" and "content" key, this follows the + Chat Template from https://huggingface.co/docs/transformers/main/en/chat_templating + + :param chat: A list of dictionaries with "role" and "content" keys + :param generation_role: The role of the model generating the chat, usually "assistant" + :param apply_mask: Whether to apply a loss mask to the chat, if False, all tokens will be included in the loss + :param tokenizer: A HF tokenizer + :param only_last_turn: Whether to only include the last turn in the chat, needed for some fine-tuning tasks + """ + tokens = [] + mask = [] + if apply_mask is False: + tokens = tokenizer.apply_chat_template(chat) + mask = tokens + return tokens, mask + elif for_rm: + tokens = tokenizer.apply_chat_template(chat) + mask = [-100] * len(tokens) + if tokenizer.eos_token_id is not None: + # since this is processed in a causal format (input[:-1], mask[1:], we need to put two here... + mask.append(-100) + tokens.append(tokenizer.eos_token_id) + mask.append(tokenizer.eos_token_id) + tokens.append(tokenizer.eos_token_id) + else: + raise ValueError( + "Tokenizer does not have an EOS token, unable to determine good mask, please edit and make your own." + ) + return tokens, mask + for i, turn in enumerate(chat): + add_gen = ( + False if i == len(chat) - 1 else chat[i + 1]["role"] == generation_role + ) + chat_tokens = tokenizer.apply_chat_template( + chat[: i + 1], add_generation_prompt=add_gen + )[len(tokens) :] + # remove previous stuff... 
+ tokens.extend(chat_tokens) + if only_last_turn and (i != len(chat) - 1): + mask.extend([-100] * len(chat_tokens)) + elif apply_mask and (turn["role"] != generation_role): + mask.extend([-100] * len(chat_tokens)) + else: + mask.extend(chat_tokens) + if tokenizer.eos_token_id is not None: + mask.append(tokenizer.eos_token_id if mask[-1] != -100 else -100) + tokens.append(tokenizer.eos_token_id) + return tokens, mask + + +class Encoder(object): + def __init__(self, args): + self.args = args + + def initializer(self): + # Use Encoder class as a container for global data + Encoder.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_path) + + def encode(self, text): + ids = {} + for key in self.args.jsonl_keys: + text_ids, label_ids = build_chat( + text[key], + self.args.generation_role, + not self.args.no_mask, + Encoder.tokenizer, + self.args.only_last, + self.args.for_rm, + ) + if self.args.reward_key is not None: + reward = text[self.args.reward_key] + if self.args.binary_reward: + reward = [1] if reward else [-1] + elif type(reward) == float: + reward = [reward] + ids[key] = (text_ids, label_ids, reward) + else: + ids[key] = (text_ids, label_ids, None) + return ids, len(text) + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated " + "list", + ) + group.add_argument( + "--jsonl-keys", + nargs="+", + default=["conversation"], + help="space separate listed of keys to extract from jsonl. Default: text", + ) + group.add_argument( + "--no-mask", + help="If set, this will not mask any tokens in the input data.", + action="store_true", + ) + group.add_argument( + "--for-rm", + help="If set, this will mask everything except the last token in the chat.", + action="store_true", + ) + + group.add_argument( + "--generation-role", + type=str, + default="assistant", + help="The role of the model generating the chat, usually 'assistant'. Default: assistant", + ) + group.add_argument( + "--only-last", + help="If set, this will mask everything except the last turn in the chat.", + action="store_true", + ) + group.add_argument( + "--reward-key", + type=str, + default=None, + help="Optional: key to use for reward data in the input data.", + ) + group.add_argument( + "--binary-reward", + help="If set, this will treat the reward data as a boolean.", + action="store_true", + ) + group.add_argument( + "--num-docs", + default=None, + help="Optional: Number of documents in the input data (if known) for an accurate progress bar.", + type=int, + ) + group = parser.add_argument_group(title="tokenizer") + group.add_argument( + "--tokenizer-path", + type=str, + required=True, + help="Path to HF Tokenizer.", + ) + group.add_argument("--ftfy", action="store_true", help="Use ftfy to clean text") + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) + group.add_argument( + "--dataset-impl", + type=str, + default="mmap", + choices=["lazy", "cached", "mmap"], + help="Dataset implementation to use. 
Default: mmap", + ) + + group = parser.add_argument_group(title="runtime") + group.add_argument( + "--workers", type=int, default=1, help="Number of worker processes to launch" + ) + group.add_argument( + "--log-interval", + type=int, + default=100, + help="Interval between progress updates", + ) + args = parser.parse_args() + args.keep_empty = False + + # some default/dummy values for the tokenizer + args.rank = 0 + args.make_vocab_size_divisible_by = 128 + args.model_parallel_size = 1 + + return args + + +def yield_from_files(fnames: list, semaphore): + """ + Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / + other compressed formats. Also filters out empty documents. + + :param fnames: list of filenames + """ + + def yielder(fname, semaphore): + with open(fname, encoding="utf-8") as f: + reader = jsonlines.Reader(f) + for f in reader: + semaphore.acquire() + yield f + + for fname in fnames: + semaphore.acquire() + + yield from yielder(fname, semaphore) + + +def main(): + args = get_args() + encoder = Encoder(args) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) + print(f"Vocab size: {tokenizer.vocab_size}") + print(f"Output prefix: {args.output_prefix}") + + # build a semaphore object to stop `yield_from_files` from getting ahead of encoder.encode and + # hence building up memory + semaphore = Semaphore(10000 + args.workers) + + # use multiprocessing to iterate over input documents + fin = yield_from_files(args.input.split(","), semaphore) + + if args.workers > 1: + pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer) + encoded_docs = pool.imap(encoder.encode, fin, chunksize=25) + else: + encoder.initializer() + encoded_docs = (encoder.encode(doc) for doc in fin) + + # make a dataset builder for each key in args.jsonl_keys + # each key will output to a different file beginning with args.output_prefix + output_bin_files = {} + output_idx_files = {} + builders = {} + for key in args.jsonl_keys: + output_bin_files[key] = "{}_{}_{}.bin".format( + args.output_prefix, key, "document" + ) + output_idx_files[key] = "{}_{}_{}.idx".format( + args.output_prefix, key, "document" + ) + builders[key] = indexed_dataset.make_builder( + output_bin_files[key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[key]._dtype = np.int32 + if not args.no_mask: + assert ( + key + "_label" not in args.jsonl_keys + ), "label should not be included as it will be generated according to the mask." + label_key = key + "_label" + output_bin_files[label_key] = "{}_{}_{}.bin".format( + args.output_prefix, label_key, "document" + ) + output_idx_files[label_key] = "{}_{}_{}.idx".format( + args.output_prefix, label_key, "document" + ) + builders[label_key] = indexed_dataset.make_builder( + output_bin_files[label_key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[label_key]._dtype = np.int32 + if args.reward_key is not None: + assert ( + key + "_reward" not in args.jsonl_keys + ), "reward should not be included as it will be generated from the data." 
+ reward_key = key + "_reward" + output_bin_files[reward_key] = "{}_{}_{}.bin".format( + args.output_prefix, reward_key, "document" + ) + output_idx_files[reward_key] = "{}_{}_{}.idx".format( + args.output_prefix, reward_key, "document" + ) + builders[reward_key] = indexed_dataset.make_builder( + output_bin_files[reward_key], + impl=args.dataset_impl, + vocab_size=tokenizer.vocab_size, + ) + builders[reward_key]._dtype = np.int32 + + # actually do tokenization + proc_start = time.time() + total_bytes_processed = 0 + pbar = tqdm.tqdm() + for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): + total_bytes_processed += bytes_processed + + # release semaphore so `yield_from_files` can add another file to the buffer + semaphore.release() + + # add each tokenized document / sentence + for key, conv in doc.items(): + tokens = conv[0] + token_mask = conv[1] + reward = conv[2] + builders[key].add_item(np.array(tokens, dtype=builders[key].dtype)) + builders[key + "_label"].add_item( + np.array(token_mask, dtype=builders[key + "_label"].dtype) + ) + if args.reward_key is not None: + builders[key + "_reward"].add_item( + np.array(reward, dtype=builders[key + "_reward"].dtype) + ) + # add indx... + builders[key].end_document() + builders[key + "_label"].end_document() + if args.reward_key is not None: + builders[key + "_reward"].end_document() + if i == 1: + print("key: ", key) + print("tokens: ", tokens) + print("token_mask: ", token_mask) + print("Reward: ", reward) + # log progress + if i % args.log_interval == 0: + current = time.time() + elapsed = current - proc_start + mbs = total_bytes_processed / elapsed / 1024 / 1024 + pbar.set_description( + f"Processed {i}{'' if args.num_docs is None else '/' + str(args.num_docs)} documents ({i / elapsed} docs/s, {mbs} MB/s)." + ) + if i != 0: + pbar.update(args.log_interval) + + # save output file + update_keys = args.jsonl_keys + for key in update_keys: + builders[key].finalize(output_idx_files[key]) + builders[key + "_label"].finalize(output_idx_files[key + "_label"]) + if args.reward_key is not None: + builders[key + "_reward"].finalize(output_idx_files[key + "_reward"]) + + +if __name__ == "__main__": + main() diff --git a/train.py b/train.py index 2e4b09954..3e01a6306 100644 --- a/train.py +++ b/train.py @@ -27,6 +27,7 @@ def main(input_args=None, overwrite_values=None): neox_args.configure_distributed_args() neox_args.build_tokenizer() # tokenizer needs to be build in training in order to set the padding vocab neox_args.initialize_tensorboard_writer() # is initialized if tensorboard directory is defined + neox_args.initialize_comet() # is initialized if comet directory is defined pretrain(neox_args=neox_args)
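For reference, with hypothetical arguments `--output-prefix data/chats`, `--jsonl-keys conversation`, and `--reward-key` set, the preprocessing builders above should emit roughly the following indexed-dataset files:

```python
# Illustrative paths, derived from the "{prefix}_{key}_document" naming
# used by the dataset builders above.
expected_outputs = [
    "data/chats_conversation_document.bin",
    "data/chats_conversation_document.idx",
    "data/chats_conversation_label_document.bin",   # loss-mask stream
    "data/chats_conversation_label_document.idx",
    "data/chats_conversation_reward_document.bin",  # only with --reward-key
    "data/chats_conversation_reward_document.idx",
]
```

The `train_data_paths` / `label_data_paths` entries in the YML config then point at these datasets, referenced without the `.bin`/`.idx` suffix as in the docstring example above.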