From e52b7495abf2cb2ab75ccb0ed2044e3e5d08eecd Mon Sep 17 00:00:00 2001 From: zhangir-azerbayev Date: Thu, 5 Oct 2023 17:42:13 -0400 Subject: [PATCH] update with llemma --- 34b_launch_script.sh | 39 + README.md | 415 +--- configs/1-3B.yml | 91 - configs/125M-json.yml | 78 - configs/125M.yml | 94 - configs/13B.yml | 92 - configs/175B.yml | 90 - configs/19M.yml | 95 - configs/2-7B.yml | 91 - configs/20B.yml | 111 - configs/350M.yml | 90 - configs/49M.yml | 91 - configs/6-7B.yml | 91 - configs/760M.yml | 91 - configs/800M.yml | 84 - configs/README.md | 314 --- configs/autotuning_configs/small_tune.json | 78 - configs/autotuning_configs/tune.json | 72 - configs/autotuning_configs/tune_1-3B.json | 86 - configs/autotuning_configs/tune_6-7B.json | 77 - configs/bf16_125M.yml | 78 - configs/bnb_125M.yml | 85 - configs/cpu_mock_config.yml | 5 - configs/data_mixture.yml | 6 + configs/eleutherai_cluster.yml | 29 - configs/finetuning_configs/6-9B.yml | 89 - configs/gen_docs.py | 93 - configs/gmlp_small.yml | 72 - configs/llama/13B.yml | 26 - configs/llama/30B.yml | 26 - configs/llama/65B.yml | 26 - configs/llama/7B.yml | 26 - configs/llemma_34b.yml | 108 + configs/llemma_7b.yml | 105 + configs/local_setup.yml | 30 - configs/neox_arguments.md | 2023 ----------------- configs/pythia/1-4B.yml | 85 - configs/pythia/12B.yml | 84 - configs/pythia/160M.yml | 85 - configs/pythia/1B.yml | 86 - configs/pythia/2-8B.yml | 87 - configs/pythia/410M.yml | 85 - configs/pythia/6-9B.yml | 84 - configs/pythia/70M.yml | 85 - configs/slurm_125M.yml | 64 - configs/slurm_local.json | 11 - configs/slurm_local.yml | 12 - configs/sparse.yml | 15 - configs/text_generation.yml | 21 - requirements.txt | 116 + requirements/requirements-dev.txt | 7 - requirements/requirements-flashattention.txt | 1 - requirements/requirements-onebitadam.txt | 1 - requirements/requirements-sparseattention.txt | 1 - requirements/requirements-tensorboard.txt | 1 - requirements/requirements-wandb.txt | 1 - requirements/requirements.txt | 17 - 57 files changed, 410 insertions(+), 5636 deletions(-) create mode 100644 34b_launch_script.sh delete mode 100644 configs/1-3B.yml delete mode 100644 configs/125M-json.yml delete mode 100644 configs/125M.yml delete mode 100644 configs/13B.yml delete mode 100644 configs/175B.yml delete mode 100644 configs/19M.yml delete mode 100644 configs/2-7B.yml delete mode 100644 configs/20B.yml delete mode 100644 configs/350M.yml delete mode 100644 configs/49M.yml delete mode 100644 configs/6-7B.yml delete mode 100644 configs/760M.yml delete mode 100644 configs/800M.yml delete mode 100644 configs/README.md delete mode 100644 configs/autotuning_configs/small_tune.json delete mode 100644 configs/autotuning_configs/tune.json delete mode 100644 configs/autotuning_configs/tune_1-3B.json delete mode 100644 configs/autotuning_configs/tune_6-7B.json delete mode 100644 configs/bf16_125M.yml delete mode 100644 configs/bnb_125M.yml delete mode 100644 configs/cpu_mock_config.yml create mode 100644 configs/data_mixture.yml delete mode 100644 configs/eleutherai_cluster.yml delete mode 100755 configs/finetuning_configs/6-9B.yml delete mode 100644 configs/gen_docs.py delete mode 100644 configs/gmlp_small.yml delete mode 100644 configs/llama/13B.yml delete mode 100644 configs/llama/30B.yml delete mode 100644 configs/llama/65B.yml delete mode 100644 configs/llama/7B.yml create mode 100644 configs/llemma_34b.yml create mode 100644 configs/llemma_7b.yml delete mode 100644 configs/local_setup.yml delete mode 100644 configs/neox_arguments.md delete 
mode 100755 configs/pythia/1-4B.yml delete mode 100755 configs/pythia/12B.yml delete mode 100755 configs/pythia/160M.yml delete mode 100755 configs/pythia/1B.yml delete mode 100755 configs/pythia/2-8B.yml delete mode 100755 configs/pythia/410M.yml delete mode 100755 configs/pythia/6-9B.yml delete mode 100755 configs/pythia/70M.yml delete mode 100644 configs/slurm_125M.yml delete mode 100644 configs/slurm_local.json delete mode 100644 configs/slurm_local.yml delete mode 100644 configs/sparse.yml delete mode 100644 configs/text_generation.yml create mode 100644 requirements.txt delete mode 100644 requirements/requirements-dev.txt delete mode 100644 requirements/requirements-flashattention.txt delete mode 100644 requirements/requirements-onebitadam.txt delete mode 100644 requirements/requirements-sparseattention.txt delete mode 100644 requirements/requirements-tensorboard.txt delete mode 100644 requirements/requirements-wandb.txt delete mode 100644 requirements/requirements.txt diff --git a/34b_launch_script.sh b/34b_launch_script.sh new file mode 100644 index 000000000..3718c5862 --- /dev/null +++ b/34b_launch_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#... your SLURM arguments here +#SBATCH --nodes=32 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=12 +#SBATCH --gres=gpu:8 +#SBATCH --output=34b_replication_%j.out +#SBATCH --error=34b_replication_%j.out +#SBATCH --exclusive +#SBATCH --open-mode=append +#SBATCH --requeue + +# setup the environment using the script we created before +source /fsx/proj-mathlm/conda_setup_deeperspeed.sh +#source /fsx/quentin/setup.sh + +ds_report + +# set distributed env variable flags such as NCCL_DEBUG here + +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export MASTER_PORT=12802 +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` + +# Move to the gpt-neox install +TRAIN_PATH=/path/to/gpt-neox +cd $TRAIN_PATH + +# Write the hostfile for this job here +# Should write to a hostfile that contains lines of format ` slots=` +bash /helper/script/write_hostfile.sh +export DLTS_HOSTFILE=path/to/hostfile/hosts_$SLURM_JOBID + + +# launch distributed job. If using `"deepspeed_slurm": true` and `"launcher": "slurm"` on a SLURM cluster, +# then NeoX will handle the creation of a distributed run across 256 gpus. +python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \ + --conf_dir /path/to/math-lm/pretraining llemma_34b.yml data_mixture.yml \ No newline at end of file diff --git a/README.md b/README.md index c96692c60..56c21b08c 100644 --- a/README.md +++ b/README.md @@ -1,416 +1,73 @@ -[![GitHub issues](https://img.shields.io/github/issues/EleutherAI/gpt-neox)](https://github.com/EleutherAI/gpt-neox/issues) -[Weights & Biases monitoring](https://wandb.ai/eleutherai/neox) +# LLeMA Pretraining -# GPT-NeoX +This is a modified version of the `EleutherAI/GPT-NeoX` repository used for the Llemma project. This branch diverged from `main` at commit `009018e`. This branch implements the following features that are not present in `009018e` (some of these features may have subsequently been merged into `main`): +- [FlashAttention-2](https://arxiv.org/abs/2307.08691) +- Grouped Query Attention +- A numerical precision fix for RoPE +- Saving checkpoints to Amazon S3. -This repository records [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs. 
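The launch script above calls a hostfile-writing helper before starting the run. DeepSpeed expects one `<hostname> slots=<gpus_per_node>` line per node, so a minimal sketch of what such a helper might do is shown below (the output path mirrors the `DLTS_HOSTFILE` used in the script, and the 8 slots per node match the `--gres=gpu:8` directive; both are placeholders to adjust for your cluster):

```bash
#!/bin/bash
# Sketch of a hostfile-writing helper for the SLURM job above.
# Writes one "<hostname> slots=<gpus_per_node>" line per allocated node.
HOSTFILE=/path/to/hostfile/hosts_$SLURM_JOBID
rm -f "$HOSTFILE"
for host in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do
    echo "$host slots=8" >> "$HOSTFILE"   # 8 GPUs per node, per --gres=gpu:8
done
```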
Our current framework is based on NVIDIA's [Megatron Language Model](https://github.com/NVIDIA/Megatron-LM) and has been augmented with techniques from [DeepSpeed](https://www.deepspeed.ai) as well as some novel optimizations. We aim to make this repo a centralized and accessible place to gather techniques for training large-scale autoregressive language models, and accelerate research into large-scale training. +The remaining portion of this `README` contains instructions to replicate pretraining of the LLeMA models. -For those looking for a TPU-centric codebase, we recommend [Mesh Transformer JAX](https://github.com/kingoflolz/mesh-transformer-jax). +Training was performed across 256 A100 GPUs. We include configuration files and sample SLURM job script for the library to replicate training on a SLURM-managed cluster. -**If you are not looking to train models with billions of parameters from scratch, this is likely the wrong library to use. For generic inference needs, we recommend you use the Hugging Face `transformers` library instead which supports GPT-NeoX models.** -## GPT-NeoX 2.0 +## Replicating Training -Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed), which was based on an old version of DeepSpeed (0.3.15). In order to migrate to the latest upstream DeepSpeed version while allowing users to access the old versions of GPT-NeoX and DeeperSpeed, we have introduced two versioned releases for both libraries: -- Version 1.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v1.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v1.0) maintain snapshots of the old stable versions that [GPT-NeoX-20B](https://arxiv.org/abs/2204.06745) and the [Pythia Suite](https://github.com/EleutherAI/pythia) were trained on. -- Version 2.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v2.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v2.0) are the latest versions built on the latest DeepSpeed, and will be maintained going forward. +### Set up environment -# Contents - -* [Quick Start](#quick-start) - * [Environment and Dependencies](#environment-and-dependencies) - * [Usage](#usage) -* [Configuration](#configuration) -* [Datasets](#datasets) - * [Preconfigured Datasets](#preconfigured-datasets) - * [Using Custom Data](#using-custom-data) -* [Training and Finetuning](#training-and-finetuning) - * [Select Pretrained Models](#pretrained-models) - * [GPT-NeoX-20B](#gpt-neox-20b) - * [Pythia](#pythia) - * [Polyglot](#polyglot) -* [Inference](#inference) -* [Evaluation](#evaluation) -* [Exporting to Hugging Face](#exporting-to-hugging-face) -* [Monitoring](#monitoring) - * [Weights & Biases](#wandb) - * [TensorBoard](#tensorboard) -* [Administrative Notes](#administrative-notes) - * [Citing GPT-NeoX](#citing-gpt-neox) - * [Licensing](#licensing) - * [Publications](#publications) - * [Acknowledgements](#acknowledgements) - -# Quick Start - -## Environment and Dependencies - -### Host Setup - -First make sure you are in an environment with Python 3.8 with an appropriate version of PyTorch 1.8 or later installed. **Note:** Some of the libraries that GPT-NeoX depends on have not been updated to be compatible with Python 3.10+. Python 3.9 appears to work, but this codebase has been developed and tested for Python 3.8. - -To install the remaining basic dependencies, run: +We provide a file containing a dump of our training environment. 
+You can install all required packages via ```bash -pip install -r requirements/requirements.txt -pip install -r requirements/requirements-wandb.txt -pip install -r requirements/requirements-tensorboard.txt -python ./megatron/fused_kernels/setup.py install # optional if not using fused kernels -``` - -from the repository root. - - - -### TensorBoard -======= -### Flash Attention - -To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details. - - -### Containerized Setup - -We also provide a Dockerfile if you prefer to run NeoX in a container. To use this option, first build an image named `gpt-neox` from the repository root directory with `docker build -t gpt-neox -f Dockerfile .`. We also host pre-built images on [Docker Hub at `leogao2/gpt-neox`](https://hub.docker.com/r/leogao2/gpt-neox/tags). - -You can then run a container based on this image. For instance, the below snippet mounts the cloned repository (`gpt-neox`) directory to `/gpt-neox` in the container and uses [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to make four GPUs (numbers 0-3) accessible to the container. [As noted by the NCCL documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#sharing-data), both `--shm-size=1g` and `--ulimit memlock=-1` are important to prevent Docker from allocating too little shared memory. -``` -nvidia-docker run --rm -it -e NVIDIA_VISIBLE_DEVICES=0,1,2,3 --shm-size=1g --ulimit memlock=-1 --mount type=bind,src=$PWD,dst=/gpt-neox gpt-neox +pip install -r requirements.txt ``` +Make sure you are installing https://github.com/EleutherAI/DeeperSpeed/tree/new-fix for your DeepSpeed version and install fused kernels for GPT-NeoX via `python ./megatron/fused_kernels/setup.py install` from within your GPT-NeoX install. -## Usage -All functionality (inference included), should be launched using `deepy.py`, a wrapper around the `deepspeed` launcher. +### Converting Llama 2 checkpoints into NeoX format -We currently offer three main functions: -1. `train.py` is used for training and finetuning models. -2. `evaluate.py` is used to evaluate a trained model using the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness). -3. `generate.py` is used to sample text from a trained model. +First, download CodeLlama 7b or 34b from the Meta AI repo and rename the download folder to 7B or 34B within the CodeLlama repository. -which can be launched with: +Then, to convert either model into the format expected by GPT-NeoX for checkpoints: +Sample command for 7b Meta->NeoX format: ```bash -./deepy.py [script.py] [./path/to/config_1.yml] [./path/to/config_2.yml] ... [./path/to/config_n.yml] +python convert_raw_llama_weights_to_hf.py --input_dir /path/to/codellama/repo --config_file /path/to/this/repo/math-lm/pretraining/llemma_7b.yml --output_dir /path/to/save/into/ --num_output_shards {TP_DEGREE, we use 2} ``` -E.G To generate text unconditionally with the GPT-NeoX-20B model, you can use the following: +Sample command for 34b Meta->NeoX format: +(Requires large amounts of GPU VRAM or CPU RAM. 
Pass `CUDA_VISIBLE_DEVICES=""` to perform conversion on CPU. 34b conversion may take a while) ```bash -./deepy.py generate.py ./configs/20B.yml +CUDA_VISIBLE_DEVICES="" python convert_raw_llama_weights_to_hf.py --input_dir /path/to/codellama/repo --config_file /path/to/this/repo/math-lm/pretraining/llemma_34b.yml --output_dir /path/to/save/into/ --num_output_shards {TP_DEGREE, we use 8} ``` -Or optionally pass in a text file (e.g `prompt.txt`) to use as the prompt, which should be a plain `.txt` file with each prompt separated by newline characters, also passing in the path to an output file. - -```bash -./deepy.py generate.py ./configs/20B.yml -i prompt.txt -o sample_outputs.txt -``` - -To reproduce our evaluation numbers on, for example, TriviaQA and PIQA use: - -```bash -./deepy.py evaluate.py ./configs/20B.yml --eval_tasks triviaqa piqa -``` - -You can add an arbitrary list of evaluation tasks here, for details of all tasks available, see [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). - -For more details on each entry point, see the [Training and Finetuning](#training-and-finetuning), [Inference](#inference) and [Evaluation](#evaluation) -# Configuration - -GPT-NeoX parameters are defined in a YAML configuration file which is passed to the deepy.py launcher. We have provided some example .yaml files in [configs](./configs/), including one for GPT-NeoX-20B, and example configuration files for other model sizes. - -These files are generally complete, but non-optimal. For example, depending on your specific GPU configuration, you may need to change some settings such as `pipe-parallel-size`, `model-parallel-size` to increase or decrease the degree of parallelisation, `train_micro_batch_size_per_gpu` or `gradient-accumulation-steps` to modify batch size related settings, or the `zero_optimization` dict to modify how optimizer states are parallelised across workers. - -For a more detailed guide to all the features available and how to configure them, see [the configuration README](configs/README.md), and for documentation of every possible argument, see [configs/neox_arguments.md](configs/neox_arguments.md). - -# Datasets - -## Preconfigured Datasets - -Several preconfigured datasets are available, including most components from [the Pile](https://arxiv.org/abs/2101.00027), as well as the Pile train set itself, for straightforward tokenization using the `prepare_data.py` entry point. - -E.G, to download and tokenize the enwik8 dataset with the GPT2 Tokenizer, saving them to `./data` you can run: - -``` -python prepare_data.py -d ./data -``` - -or a single shard of the pile (`pile_subset`) with the GPT-NeoX-20B tokenizer (assuming you have it saved at `./20B_checkpoints/20B_tokenizer.json`): - -``` -python prepare_data.py -d ./data -t HFTokenizer --vocab-file ./20B_checkpoints/20B_tokenizer.json pile_subset -``` - -The tokenized data will be saved out to two files: `[data-dir]/[dataset-name]/[dataset-name]_text_document.bin`and `[data-dir]/[dataset-name]/[dataset-name]_text_document.idx`. You will need to add the prefix that both these files share to your training configuration file under the `data-path` field. E.G: - -```yaml - "data-path": "./data/enwik8/enwik8_text_document", -``` - -## Using Custom Data - -To prepare your own dataset for training with custom data, format it as one large [jsonl](https://jsonlines.org/)-formatted file with each item in the list of dictionaries being a separate document. 
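As a consolidated sketch of the environment set-up described earlier: installing the pinned DeeperSpeed branch directly from git via `pip` is an assumption (any install of the linked `new-fix` branch works), while the requirements install and the fused-kernel build are the commands given above.

```bash
# Sketch: recreate the training environment (assumed workflow).
pip install -r requirements.txt
# Pin DeepSpeed to the EleutherAI DeeperSpeed "new-fix" branch (install method assumed).
pip install git+https://github.com/EleutherAI/DeeperSpeed.git@new-fix
# Build the GPT-NeoX fused kernels from inside the gpt-neox checkout.
cd /path/to/gpt-neox
python ./megatron/fused_kernels/setup.py install
```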
The document text should be grouped under one JSON key, i.e `"text"`. Any auxiliary data stored in other fields will not be used. - -Next make sure to download the GPT2 tokenizer vocab, and merge files from the following links: - -- Vocab: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -- Merge: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt - -Or use the 20B tokenizer (for which only a single Vocab file is needed): - -- Vocab: https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json - -(alternatively, you can provide any tokenizer file that can be loaded by Hugging Face's tokenizers library with the `Tokenizer.from_pretrained()` command) - -You can now pretokenize your data using `tools/preprocess_data.py`, the arguments for which are detailed below: - -``` -usage: preprocess_data.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--num-docs NUM_DOCS] --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} [--vocab-file VOCAB_FILE] [--merge-file MERGE_FILE] [--append-eod] [--ftfy] --output-prefix OUTPUT_PREFIX - [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] [--log-interval LOG_INTERVAL] - -optional arguments: - -h, --help show this help message and exit - -input data: - --input INPUT Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated list - --jsonl-keys JSONL_KEYS [JSONL_KEYS ...] - space separate listed of keys to extract from jsonl. Defa - --num-docs NUM_DOCS Optional: Number of documents in the input data (if known) for an accurate progress bar. - -tokenizer: - --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} - What type of tokenizer to use. - --vocab-file VOCAB_FILE - Path to the vocab file - --merge-file MERGE_FILE - Path to the BPE merge file (if necessary). - --append-eod Append an token to the end of a document. - --ftfy Use ftfy to clean text - -output data: - --output-prefix OUTPUT_PREFIX - Path to binary output file without suffix - --dataset-impl {lazy,cached,mmap} - Dataset implementation to use. Default: mmap - -runtime: - --workers WORKERS Number of worker processes to launch - --log-interval LOG_INTERVAL - Interval between progress updates - -``` - -For example: - -```bash -python tools/preprocess_data.py \ - --input ./data/mydataset.jsonl.zst \ - --output-prefix ./data/mydataset \ - --vocab ./data/gpt2-vocab.json \ - --merge-file gpt2-merges.txt \ - --dataset-impl mmap \ - --tokenizer-type GPT2BPETokenizer \ - --append-eod -``` - -You would then run training with the following settings added to your configuration file: - -```yaml - "data-path": "data/mydataset/mydataset", -``` - -# Training and Finetuning - -Training is launched using `deepy.py`, a wrapper around DeepSpeed's launcher, which launches the same script in parallel across many GPUs / nodes. - -The general usage pattern is: - -```bash -python ./deepy.py train.py [path/to/config1.yml] [path/to/config2.yml] ... -``` - -You can pass in an arbitrary number of configs which will all be merged at runtime. - -You can also optionally pass in a config prefix, which will assume all your configs are in the same folder and append that prefix to their path. - -E.G: - -```bash -python ./deepy.py train.py -d configs 125M.yml local_setup.yml -``` - -This will deploy the `train.py` script on all nodes with one process per GPU. 
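For the Llemma replication itself, the 34B job script above ends with a `deepy.py` invocation; a launch for the 7b model would presumably differ only in the config file passed. A sketch, assuming the same `--conf_dir` layout:

```bash
# Sketch: launch the 7b run with the 7b config plus the shared data mixture.
python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \
    --conf_dir /path/to/math-lm/pretraining llemma_7b.yml data_mixture.yml
```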
The worker nodes and number of GPUs are specified in the `/job/hostfile` file (see [parameter documentation](configs/README.md)), or can simply be passed in as the `num_gpus` arg if running on a single node setup. - -Although this is not strictly necessary, we find it useful to define the model parameters in one config file (e.g `configs/125M.yml`) and the data path parameters in another (e.g `configs/local_setup.yml`). - - -## Pretrained Models - -### GPT-NeoX-20B - -GPT-NeoX-20B is a 20 billion parameter autoregressive language model trained on [the Pile](https://arxiv.org/abs/2101.00027). Technical details about GPT-NeoX-20B can be found in [the associated paper](https://arxiv.org/abs/2204.06745). The configuration file for this model is both available at [`./configs/20B.yml`](./configs/20B.yml) and included in the download links below. - -[Slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/) - (No optimizer states, for inference or finetuning, 39GB) - -To download from the command line to a folder named `20B_checkpoints`, use the following command: - -```bash -wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/ -P 20B_checkpoints -``` - -[Full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/) - (Including optimizer states, 268GB) - -To download from the command line to a folder named `20B_checkpoints`, use the following command: - -```bash -wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/ -P 20B_checkpoints -``` - -Weights can be alternatively be downloaded using a BitTorrent client. Torrent files can be downloaded here: [slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights.torrent), [full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights.torrent). - -We additionally have 150 checkpoints saved throughout training, one every 1,000 steps. We are working on figuring out how to best serve these at scale, but in the meanwhile people interested in working with the partially trained checkpoints can email us at contact@eleuther.ai to arrange access. - -### Pythia - -The Pythia Scaling Suite is a suite of models ranging from 70M parameters to 12B parameters trained on [the Pile](https://pile.eleuther.ai) intended to promote research on interpretability and training dynamics of large language models. Further details about the project and links to the models can be found in the [in the paper](https://arxiv.org/abs/2304.01373) and [on the project's GitHub](https://github.com/EleutherAI/pythia). - -### Polyglot - -The Polyglot Project is an effort to train powerful non-English pretrained language models to promote the accessibility of this technology to researchers outside the dominant powerhouses of machine learning. EleutherAI has trained and released 1.3B, 3.8B, and 5.8B parameter Korean language models, the largest of which outpreforms all other publicly available language models on Korean language tasks. Further details about the project and links to the models can be found [here](https://github.com/EleutherAI/polyglot). - -# Inference - -**For most uses we recommend deploying models trained using the GPT-NeoX library via the Hugging Face Transformers library which is better optimized for inference.** - -We support three types of generation from a pretrained model: -1. Unconditional generation -2. Conditional generation based on an input read from a file -3. 
Interactive generation, which allows for multiple rounds of back-and-forth between a user and the language model via a command line interface - -All three types of text generation can be launched via `python ./deepy.py generate.py -d configs 125M.yml local_setup.yml text_generation.yml` with the appropriate values set in `configs/text_generation.yml`. - -# Evaluation - -GPT-NeoX supports evaluation on downstream tasks through the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness). - -To evaluate a trained model on the evaluation harness, simply run: - -```bash -python ./deepy.py evaluate.py -d configs your_configs.yml --eval_tasks task1 task2 ... taskn -``` - -where `--eval_tasks` is a list of evaluation tasks followed by spaces, e.g `--eval_tasks lambada hellaswag piqa sciq`. For details of all tasks available, refer to the [lm-evaluation-harness repo](https://github.com/EleutherAI/lm-evaluation-harness). - -# Exporting to Hugging Face - -GPT-NeoX is optimized heavily for training only, and GPT-NeoX model checkpoints are not compatible out of the box with other deep learning libraries. To make models easily loadable and shareable with end users, and for further exporting to various other frameworks, GPT-NeoX supports checkpoint conversion to the [Hugging Face Transformers](https://arxiv.org/abs/1910.03771) GPTNeoXModel format. - -To convert a NeoX checkpoint (with pipeline-parallel-size>=1) to Hugging Face-loadable format, run: -```bash -python ./tools/convert_module_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location -``` - -To convert a sequential model to Hugging Face format, run: -```bash -python ./tools/convert_sequential_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location -``` -(Note: this script should be used for v2.0 checkpoints saved on a v2.0 commit prior to https://github.com/EleutherAI/gpt-neox/pull/866 and which used `pipe-parallel-size=1`. Using `pipe-parallel-size=0` will also save models in this format.) - -Then to upload a model to [the Hugging Face Hub](https://huggingface.co/), run: -```bash -huggingface-cli login -python ./tools/upload.py -``` -and input the requested information, including HF hub user token. - -Note, however, that this compatibility is not one-to-one, and only certain configurations from GPT-NeoX are supported in the Hugging Face GPTNeoXModel class. Advanced features such as alternative positional embeddings may require new Transformers modeling code and new conversion script tweaks. - -# Monitoring - -In addition to storing logs locally, we provide built-in support for two popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site) and [TensorBoard](https://www.tensorflow.org/tensorboard/) - -

-## Weights & Biases

- -EleutherAI is currently using [Weights & Biases to record our experiments](https://wandb.ai/eleutherai/neox). If you are logged into Weights & Biases on your machine—you can do this by executing `wandb login`—your runs will automatically be recorded. There are two optional fields associated with Weights & Biases: wandb_group allows you to name the run group and wandb_team allows you to assign your runs to an organization or team account. - -## TensorBoard - -We also support using TensorBoard via the tensorboard-dir field. Dependencies required for TensorBoard monitoring can be found in and installed from `./requirements/requirements-tensorboard.txt`. - -# Running on multi-node - -If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, you can set the environment variable `DLTS_HOSTFILE` to point to the hostfile. - -# Administrative Notes - -## Citing GPT-NeoX - -If you have found the GPT-NeoX library helpful in your work, you can cite this repository as - -```bibtex -@software{gpt-neox-library, - title = {{GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch}}, - author = {Andonian, Alex and Anthony, Quentin and Biderman, Stella and Black, Sid and Gali, Preetham and Gao, Leo and Hallahan, Eric and Levy-Kramer, Josh and Leahy, Connor and Nestler, Lucas and Parker, Kip and Pieler, Michael and Purohit, Shivanshu and Songz, Tri and Phil, Wang and Weinbach, Samuel}, - url = {https://www.github.com/eleutherai/gpt-neox}, - doi = {10.5281/zenodo.5879544}, - month = {8}, - year = {2021}, - version = {0.0.1}, -} -``` - -To cite our 20 billion parameter model, please use - -```bibtex -@inproceedings{gpt-neox-20b, - title={{GPT-NeoX-20B}: An Open-Source Autoregressive Language Model}, - author={Black, Sid and Biderman, Stella and Hallahan, Eric and Anthony, Quentin and Gao, Leo and Golding, Laurence and He, Horace and Leahy, Connor and McDonell, Kyle and Phang, Jason and Pieler, Michael and Prashanth, USVSN Sai and Purohit, Shivanshu and Reynolds, Laria and Tow, Jonathan and Wang, Ben and Weinbach, Samuel}, - booktitle={Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models}, - url={https://arxiv.org/abs/2204.06745}, - year={2022} -} -``` -Citation instructions for other pretrained models can be found [in the appropriate repository](#pretrained-models). +### Check Out Codebase -## Licensing +Next, check out the commit used to train the model you are replicating. -This repository hosts code that is part of EleutherAI's GPT-NeoX project. Copyright (c) 2021, EleutherAI. Licensed under the Apache License: +* 7b / 34b: https://github.com/EleutherAI/gpt-neox/commit/{this_commit_hash} - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +### Launching Training - http://www.apache.org/licenses/LICENSE-2.0 +Then, edit the provided YML files to set paths based on your own system's saved locations for checkpoints and data files, and edit the SLURM job script as specified (using ) or run the job across multiple nodes using your own system's orchestration. - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
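Once the YML paths and the job script have been edited, submission follows the usual SLURM pattern; a minimal sketch (the log-file name comes from the `#SBATCH --output` directive in the skeleton script):

```bash
# Sketch: submit the edited skeleton script; SLURM assigns the node list.
sbatch 34b_launch_script.sh
# Follow the run via the log named by the --output pattern 34b_replication_%j.out.
tail -f 34b_replication_*.out
```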
+**Tip**: Note that the global batch size will be scaled by your number of nodes. Therefore, if running on a number of nodes different from 32 you should scale gradient accumulation steps accordingly. -This repository is based off code written by NVIDIA that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by NVIDIA maintain a NVIDIA copyright header. All files that do not contain such a header are the exclusive copyright of EleutherAI. When the NVIDIA code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License. +We used a batch size of 4M tokens. To calculate global batch size, you should compute `seq_len * num_gpus * ( train_microbatch_size_per_gpu * gradient_accumulation_steps) / (model_parallel_size * max(pipeline_parallel_size, 1))` . -This repository also contains code written by a number of other authors. Such contributions are marked and the relevant licensing is included where appropriate. -For full terms, see the `LICENSE` file. If you have any questions, comments, or concerns about licensing please email us at contact@eleuther.ai. +## Contents -## Publications +The files in this folder are as follows: -The following publications have come out of this project: +* `34b_launch_script.sh` contains a skeleton SLURM job script to launch training with NeoX across 32 nodes. - - Black, Biderman, Hallahan, Anthony, Gao, Golding, He, Leahy, McDonell, Phang, Pieler, Prashanth, Purohit, Reynolds, Tow, Wang, and Weinbach. "[GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*. 2022. - - Biderman, Schoelkopf, Anthony, Bradley, O'Brien, Hallahan, Khan, Purohit, Prashanth, Raff, Skowron, Sutawika, and van der Wal. "[Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling](https://arxiv.org/abs/2304.01373)." *arXiv preprint arXiv:2304.01373*. 2023. +* `configs/data_mixture.yml` contains a list of the domain weights for the final training run. -The following publications by other research groups use this library: -- Chi, Fan, Ramadge, and Rudnicky. "[KERPLE: Kernelized Relative Positional Embedding for Length Extrapolation](https://arxiv.org/abs/2205.09921)". _arXiv preprint arXiv:2205.09921_. 2022. -- Horawalavithana, Ayton, Sharma, Howland, Subramanian, Vasquez, Cosbey, Glenski, and Volkova. "[Foundation Models of Scientific Knowledge for Chemistry: Opportunities, Challenges and Lessons Learned](https://openreview.net/pdf?id=SLX-I2MHUZ9)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*. 2022. -- Kolak, Martins, Le Goues, and Hellendoorn. "[Patch Generation with Language Models: Feasibility and Scaling Behavior](https://openreview.net/forum?id=rHlzJh_b1-5)"." In *Proceedings of the Deep Learning for Code Workshop at ICLR*. 2022. -- Xu, Alon, Neubig, and Hellendoorn. "[A Systematic Evaluation of Large Language Models of Code](https://arxiv.org/abs/2202.13169)." In *Proceedings of the ICLR Workshop on Deep Learning For Code*. 2022. +* `configs/llemma_7b.yml` is a cleaned-up version of the config file used to train Llemma-7b. -## Acknowledgements +* `configs/llemma_34b.yml` is a cleaned-up version of the config file used to train Llemma-34b. 
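To make the batch-size formula above concrete, here is a sketch of the arithmetic with illustrative values: the sequence length, micro-batch size, and gradient-accumulation steps below are placeholders chosen to land near the stated 4M-token batch, not the exact training settings, while the 256 GPUs and the tensor-parallel degree of 2 come from the 32-node setup and the 7b conversion command above.

```bash
# Illustrative only -- substitute the values from your own config files.
SEQ_LEN=4096        # assumed context length (placeholder)
NUM_GPUS=256        # 32 nodes x 8 GPUs
MICRO_BATCH=4       # train_micro_batch_size_per_gpu (placeholder)
GRAD_ACCUM=2        # gradient_accumulation_steps (placeholder)
MP=2                # model_parallel_size (the 7b conversion above uses 2)
PP=0                # pipeline_parallel_size
TOKENS=$(( SEQ_LEN * NUM_GPUS * MICRO_BATCH * GRAD_ACCUM / (MP * (PP > 1 ? PP : 1)) ))
echo "$TOKENS tokens per global batch"   # ~4.2M tokens with these values
```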
-We run our experiments on a Kubernetes cluster generously provided by [CoreWeave](https://coreweave.com/) and a SLURM cluster provided by [Stability AI](https://stability.ai). +* `requirements.txt` is a dump of the virtual environmment used in training, created via `pip freeze`. diff --git a/configs/1-3B.yml b/configs/1-3B.yml deleted file mode 100644 index 3e80ae7fc..000000000 --- a/configs/1-3B.yml +++ /dev/null @@ -1,91 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 24, - "hidden_size": 2048, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0002, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.00002, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/125M-json.yml b/configs/125M-json.yml deleted file mode 100644 index 95a76ebfc..000000000 --- a/configs/125M-json.yml +++ /dev/null @@ -1,78 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00006, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true - }, - - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, - - "hostfile": "/mock_path" -} diff --git a/configs/125M.yml b/configs/125M.yml deleted file mode 100644 index 15a4b3b01..000000000 --- a/configs/125M.yml +++ /dev/null @@ -1,94 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.00006, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - 
"reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - # precision settings - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, - - # networking - "hostfile": "/mock_path" -} diff --git a/configs/13B.yml b/configs/13B.yml deleted file mode 100644 index 7af3208ef..000000000 --- a/configs/13B.yml +++ /dev/null @@ -1,92 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 40, - "hidden_size": 5120, - "num_attention_heads": 40, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0001, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - "min_lr": 0.00001, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/175B.yml b/configs/175B.yml deleted file mode 100644 index cc5c5c23f..000000000 --- a/configs/175B.yml +++ /dev/null @@ -1,90 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 96, - "hidden_size": 12288, - "num_attention_heads": 96, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00006, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.000006, - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/19M.yml b/configs/19M.yml deleted file mode 100644 index 83e5c594a..000000000 --- a/configs/19M.yml +++ /dev/null @@ -1,95 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 6, - "hidden_size": 512, - "num_attention_heads": 8, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.0001, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - "train_micro_batch_size_per_gpu": 4, #32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1, - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "eval_interval": 100000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - # additional deepspeed args not specified above - "deepspeed_extra_args": { - "comms_logger": { - "enabled": true, - "verbose": true, - "prof_all": true, - "debug": false - }, - } - -} diff --git a/configs/2-7B.yml b/configs/2-7B.yml deleted file mode 100644 index 2bddda0ed..000000000 --- a/configs/2-7B.yml +++ /dev/null @@ -1,91 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 32, - "hidden_size": 2560, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { 
- "type": "Adam", - "params": { - "lr": 0.00016, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.000016, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/20B.yml b/configs/20B.yml deleted file mode 100644 index 243f794d0..000000000 --- a/configs/20B.yml +++ /dev/null @@ -1,111 +0,0 @@ -# DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100 -# GPUs. Depending on your system configuration, you may need to change some parameters in order to fit -# the model in memory. - -{ - # Tokenizer / checkpoint settings - you will need to change these to the location you have them saved in - "vocab_file": "./20B_checkpoints/20B_tokenizer.json", - "save": "./20B_checkpoints", - "load": "./20B_checkpoints", - - # If finetuning, edit the following to the location of your finetuning dataset: - "data_path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document", - - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 4, - "model_parallel_size": 2, - - # model settings - "num_layers": 44, - "hidden_size": 6144, - "num_attention_heads": 64, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.97e-4, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - - "min_lr": 0.97e-5, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 1260000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": True, - }, - - # batch / data settings (assuming 96 GPUs) - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 32, - "data_impl": "mmap", - "split": "995,4,1", - - 
# activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": false, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.01, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 150000, - "lr_decay_iters": 150000, - - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 500, # this variable previously called `save-interval` - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 2, - "steps_per_print": 2, - "wall_clock_breakdown": false, - - ### NEW DATA: #### - "tokenizer_type": "HFTokenizer", - "tensorboard-dir": "./tensorboard", - "log_dir": "./logs", - -} diff --git a/configs/350M.yml b/configs/350M.yml deleted file mode 100644 index 1247ea88d..000000000 --- a/configs/350M.yml +++ /dev/null @@ -1,90 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 24, - "hidden_size": 1024, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0003, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.00003, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/49M.yml b/configs/49M.yml deleted file mode 100644 index 9852320b0..000000000 --- a/configs/49M.yml +++ /dev/null @@ -1,91 +0,0 @@ -{ - # parallelism settings - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 10, - "hidden_size": 640, - "num_attention_heads": 10, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0008, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.00008, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1, - }, - - # misc. 
training settings - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "eval_interval": 100000, - "eval_iters": 10, - - # logging - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, -} diff --git a/configs/6-7B.yml b/configs/6-7B.yml deleted file mode 100644 index 8054c3ff2..000000000 --- a/configs/6-7B.yml +++ /dev/null @@ -1,91 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 32, - "hidden_size": 4096, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - "min_lr": 0.000012, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/760M.yml b/configs/760M.yml deleted file mode 100644 index 5cbb39559..000000000 --- a/configs/760M.yml +++ /dev/null @@ -1,91 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 24, - "hidden_size": 1536, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00025, - "betas": [0.9, 0.999], - "eps": 1.0e-8, - } - }, - "min_lr": 0.000025, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/800M.yml b/configs/800M.yml deleted file mode 100644 index f522b40c7..000000000 --- a/configs/800M.yml +++ /dev/null @@ -1,84 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 16, - "hidden_size": 2048, - "num_attention_heads": 8, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - # init methods - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00025, - "betas": [0.9, 0.95], - "eps": 1.0e-8, - } - }, - "min_lr": 0.000025, - - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 1, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - "train_micro_batch_size_per_gpu": 16, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1, - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "eval_interval": 40000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, -} diff --git a/configs/README.md b/configs/README.md deleted file mode 100644 index 5b065d2cd..000000000 --- a/configs/README.md +++ /dev/null @@ -1,314 +0,0 @@ -# Configuration and parameters - -GPT-NeoX parameters are defined in a YAML configuration file which is passed to the `deepy.py` launcher - for examples see the files contained in this folder. -Parameters originate from either the [DeepSpeed runner CLI (DSL)](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/launcher/runner.py#L33), [DeepSpeed configuration file (DSC)](https://www.deepspeed.ai/docs/config-json/), [Megatron-LM CLI (Meg)](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/arguments.py#L224) or are GPT-NeoX (NeoX) modifications. - -## Example Configuration (GPT3 Small): - -Below is an example configuration `.yaml` to train a ~160M parameter GPT model. This readme will go through each section in the configuration and the options available. - -For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md) - -Note: yaml arguments may be formatted with either '-' or '_'. 
The standard separator used is a '_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future. -```yaml -# GPT-3 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "rmsnorm", - "pos_emb": "none", - "no_weight_tying": true, - # this should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "train_iters": 320000, - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "max_grad_norm": 1.0, - "betas": [0.9, 0.95] - } - }, - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 0, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 1, - "data_impl": "mmap", - "split": "949,50,1", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0, - "hidden_dropout": 0, - "attention_dropout": 0, - - # precision settings - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # lr decay settings - "lr_decay_iters": 320000, - "lr_decay_style": "cosine", - "warmup": 0.01, - - # misc. training settings - "distributed_backend": "nccl", - "save_interval": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} -``` - -### Parallelism Settings: - -The parallelism settings are left at 1 in all configs, as the settings you want will be highly dependent on your compute setup and network topology. -We have found it best to do model parallelism within a node, and schedule pipeline stages across node boundaries. - -```yaml - "pipe_parallel_size": 1, - "model_parallel_size": 1, -``` - -These can be set to any integer between `0` and `num_gpus`, and `num_gpus` must be divisible by `pipe_parallel_size` * `model_parallel_size`. - - -### Model Settings: -```yaml - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "rmsnorm", - "pos_emb": "none", - "no_weight_tying": true, - # this should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "train_iters": 320000, -``` -An example of some basic settings used to configure your model's architecture and number of training steps. - -### Optimizer Settings: - -Our optimizer configuration has a similar syntax to deepspeed's. Different optimizers will have different arguments for "params". -Learning rate should be configured from here using the `"lr"` field of `optimizer["params"]`. 
- -```yaml - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "max_grad_norm": 1.0, - "betas": [0.9, 0.95] - } - } - ``` -Available optimizer types are: - -- `"Adam"`: regular Adam optimizer -- `"OneBitAdam"`: Deepspeed's [OneBitAdam optimizer](https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). To use 1-bit adam, you'll also need to add the `freeze_step`, `cuda_aware`, and `comm_backend_name` fields, like so: -```yaml - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.0001, - "freeze_step": 23000, - "betas": [0.9, 0.95], - "cuda_aware": false, - "comm_backend_name": "nccl" - } -``` - -- `"CPU_Adam"`/`"CPU_torch_adam"`: Adam optimizer on CPU. Either megatron's version ("CPU_Adam") or torch's ("CPU_torch_adam") -- `"SM3"`: SM3 or [Memory adaptive efficient optimization optimizer](https://arxiv.org/pdf/1901.11150.pdf). We have found this doesn't work well with fp16 training. -- `"madgrad_wd"`: MADGRAD or [A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic - Optimizer] weight decay has been implemented AdamW style instead of the original madgrad Adam style. https://arxiv.org/abs/2101.11075 - -### ZeRO Optimization: - -```yaml -# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 0, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - "zero_allow_untested_optimizer": false, - -``` - -ZeRO optimization in NeoX is currently configured identically to how deepspeed configures it, please see [the deepspeed docs](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training) for more information. - -If you want to combine an optimizer untested by DeepSpeed with ZeRO (i.e, not ADAM or LAMB), you must pass `"zero_allow_untested_optimizer": true` *outside* of the `"zero_optimization"` dictionary (see above). - -N.B - ZeRO stages 2+ are incompatible with pipeline parallelism. Please set `"pipe-parallel-size"` to 0 if you want to use ZeRO stage 2 or more. - -### Batch Size Settings: - -```yaml - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 1, -``` -Our global batch size configuration follows deepspeed's and can be configured in a number of ways. At least any one of `"train_batch_size"` and `"train_micro_batch_size_per_gpu"`. -- `"train_batch_size"`: The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs. -- `"train_micro_batch_size_per_gpu""`: Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, `gradient_accumulation_steps` is automatically calculated using train_batch_size and number of GPUs. -- `"gradient_accumulation_steps"`: Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. 
When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs. - -### Extra DeepSpeed Settings - -```yaml -# additional deepspeed args not specified above -"deepspeed_extra_args": { - "comms_logger": { - "enabled": true, - "verbose": true, - "prof_all": true, - "debug": false - }, -} -``` -Additional DeepSpeed settings besides those mentioned above should be wrapped in the `"deepspeed_extra_args` argument, as in the example above. This functionality is designed to allow arguments not specified by existing dataclasses to be passed to DeepSpeed (e.g. when new functionalities are implemented). If any settings are duplicated here from elsewhere in the YAML, the system will throw an exception and notify the user. - -### Dataset / Tokenizer / Checkpoint / Logging Settings: - -```yaml - "data_impl": "mmap", - "split": "949,50,1", - # Suggested data paths when using GPT-NeoX locally - "data_path": "data/enwik8/enwik8_text_document", - #"train_data_path": "data/enwik8/enwik8_text_document", - #"test_data_path": "data/enwik8/enwik8_text_document", - #"valid_data_path": "data/enwik8/enwik8_text_document", - "vocab_file": "data/gpt2-vocab.json", - "merge_file": "data/gpt2-merges.txt", - "save": "checkpoints", - "load": "checkpoints", - "tensorboard_dir": "tensorboard", - "log_dir": "logs", - "save_interval": 10000, - "eval_interval": 1000, - "eval_iters": 10, -``` - -### LR Scheduler settings - -```yaml - "lr_decay_iters": 320000, - "lr_decay_style": "cosine", - "warmup": 0.01, -``` - -Settings used to modify the learning rate over time. - -N.B - `OneBitAdam` requires you to use deepspeed's internal lr scheduler because reasons. Currently the lr decay style defaults to deepspeed's `WarmupDecay - -### Activation Checkpointing Settings: - -```yaml - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, -``` - -Checkpointing works by trading compute for memory. Rather than storing all intermediate activations of the entire computation graph for computing backward, the checkpointed part does not save intermediate activations, and instead recomputes them in backward pass. - -### Mixed Precision Training Settings: -gpt-neox's fp16 training is configured identically to DeepSpeed's, please see [their documentation](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) for more information. -An example config for fp16 training: - -```yaml - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, -``` - -Alternatively you can use the `precision` config which can be set to `fp16`, `bfloat16`, or `fp32`. If you set `"precision": "fp16"` without adding a `"fp16": {...}` dict, then it will simply use DeepSpeed's defaults for fp16 training. - - -### SLURM Settings - -If you are running GPT-NeoX on a SLURM cluster and wish to use SLURM to coordinate nodes, then you must set the following variables in your config: - -```yaml - "launcher": "slurm", - "deepspeed_slurm": true -``` - -Additionally, you need to modify _all_ of your configs to conform to the JSON. When launching a GPT-NeoX job you can specify multiple YAML config files. Internally, all of these files are merged into one config and then passed as a single long command line argument to Deep(er)Speed. When using SLURM and its internal command `srun`, python fails to parse this long command line argument unless it is in the more restrictive JSON format. 
In practice, the example NeoX configs are already very close to JSON. As an example, this is a snippet of a YAML-compatible config, N.B. the comment the capital-F `False`: - -```yaml - # optimizer settings - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.0001, - "freeze_step": 23000, - "betas": [0.9, 0.95], - "cuda_aware": False, - "comm_backend_name": "nccl" - } -``` - -To make this JSON just remove the comment and use all lowercase for the boolean: - -```yaml - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.0001, - "freeze_step": 23000, - "betas": [0.9, 0.95], - "cuda_aware": false, - "comm_backend_name": "nccl" - } -``` diff --git a/configs/autotuning_configs/small_tune.json b/configs/autotuning_configs/small_tune.json deleted file mode 100644 index 52c99449b..000000000 --- a/configs/autotuning_configs/small_tune.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "pipe-parallel-size": 1, - "model-parallel-size": 1, - - "num-layers": 12, - "hidden-size": 768, - "num-attention-heads": 12, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "no-weight-tying": true, - - "scaled-upper-triang-masked-softmax-fusion": false, - "bias-gelu-fusion": false, - - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8 - } - }, - - "train_micro_batch_size_per_gpu": 1, - "data-impl": "mmap", - "split": "949,50,1", - - "checkpoint-activations": true, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - - "gradient_clipping": 1.0, - "weight-decay": 0.0, - "hidden-dropout": 0.0, - "attention-dropout": 0.0, - - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train-iters": 320000, - "lr-decay-iters": 320000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "save-interval": 10000, - "eval-interval": 1000, - "eval-iters": 10, - - "log-interval": 100, - "steps_per_print": 10, - "keep-last-n-checkpoints": 4, - "wall_clock_breakdown": true, - "launcher": "slurm", - "deepspeed_slurm": true, - "comment": "neox", - "autotuning": { - "enabled": true, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", - "gradient_accumulation_steps ": "--gradient_accumulation_steps" - } - }, - "zero_optimization": { - "stage": [0, 1, 2, 3] - }, - "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], - "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"], - "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"] -} diff --git a/configs/autotuning_configs/tune.json b/configs/autotuning_configs/tune.json deleted file mode 100644 index b2f114539..000000000 --- a/configs/autotuning_configs/tune.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "pipe-parallel-size": 1, - "model-parallel-size": 1, - "num-layers": 12, - "hidden-size": 768, - "num-attention-heads": 12, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "no-weight-tying": true, - "scaled-upper-triang-masked-softmax-fusion": true, - "bias-gelu-fusion": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8 - } - }, - "zero_optimization": { - "stage": 0, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 
500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - "train_micro_batch_size_per_gpu": 1, - "autotuning_config": { - "enabled": true, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", - "gradient_accumulation_steps ": "--gradient_accumulation_steps" - } - }, - "data-impl": "mmap", - "split": "949,50,1", - "checkpoint-activations": true, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - "gradient_clipping": 1.0, - "weight-decay": 0.0, - "hidden-dropout": 0.0, - "attention-dropout": 0.0, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "train-iters": 200, - "lr-decay-iters": 320000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "save-interval": 10000, - "eval-interval": 1000, - "eval-iters": 10, - "log-interval": 100, - "steps_per_print": 10, - "keep-last-n-checkpoints": 4, - "wall_clock_breakdown": true, - "launcher": "slurm", - "deepspeed_slurm": true, - "comment": "neox" -} diff --git a/configs/autotuning_configs/tune_1-3B.json b/configs/autotuning_configs/tune_1-3B.json deleted file mode 100644 index 8207d0cfc..000000000 --- a/configs/autotuning_configs/tune_1-3B.json +++ /dev/null @@ -1,86 +0,0 @@ -{ - "pipe-parallel-size": 1, - "model-parallel-size": 1, - - "num-layers": 24, - "hidden-size": 2048, - "num-attention-heads": 16, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "no-weight-tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "attention_config": [[["flash"], 24]], - "scaled-upper-triang-masked-softmax-fusion": false, - "bias-gelu-fusion": false, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0002, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00002, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true - }, - "train_micro_batch_size_per_gpu": 1, - "autotuning": { - "enabled": true, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", - "gradient_accumulation_steps ": "--gradient_accumulation_steps" - } - }, - "data-impl": "mmap", - - "checkpoint-activations": false, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - - "gradient_clipping": 1.0, - "weight-decay": 0.1, - "hidden-dropout": 0, - "attention-dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train-iters": 320000, - "lr-decay-iters": 320000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "checkpoint-factor": 10000, - "eval-interval": 1000, - "eval-iters": 10, - "launcher": "slurm", - "deepspeed_slurm": true, - "no_ssh_check": true, - - "log-interval": 10, - "steps_per_print": 10, - "keep-last-n-checkpoints": 1, - "wall_clock_breakdown": true -} diff --git a/configs/autotuning_configs/tune_6-7B.json b/configs/autotuning_configs/tune_6-7B.json deleted file mode 100644 index 3d7aadf68..000000000 --- a/configs/autotuning_configs/tune_6-7B.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "pipe-parallel-size": 1, - 
"model-parallel-size": 8, - - "num-layers": 32, - "hidden-size": 4096, - "num-attention-heads": 32, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "no-weight-tying": true, - - "scaled-upper-triang-masked-softmax-fusion": false, - "bias-gelu-fusion": false, - - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.999], - "eps": 1.0e-8 - } - }, - - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": [0, 1, 2, 3] - }, - "data-impl": "mmap", - "split": "949,50,1", - - "checkpoint-activations": true, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - - "gradient_clipping": 1.0, - "weight-decay": 0, - "hidden-dropout": 0, - "attention-dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train-iters": 100, - "lr-decay-iters": 320000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "checkpoint-factor": 10000, - "eval-interval": 1000, - "eval-iters": 10, - "log-interval": 100, - "steps_per_print": 10, - "keep-last-n-checkpoints": 4, - "wall_clock_breakdown": true, - "launcher": "slurm", - "deepspeed_slurm": true, - "no_ssh_check": true, - "comment": "neox", - "autotuning": { - "enabled": true, - "mp_size": 8, - "arg_mappings": { - "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu", - "gradient_accumulation_steps ": "--gradient_accumulation_steps" - } - } -} diff --git a/configs/bf16_125M.yml b/configs/bf16_125M.yml deleted file mode 100644 index 93f808d35..000000000 --- a/configs/bf16_125M.yml +++ /dev/null @@ -1,78 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8, - } - }, - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 0, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - "split": "949,50,1", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.0, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - "precision": "bfloat16", - - "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32 - # misc. 
training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/bnb_125M.yml b/configs/bnb_125M.yml deleted file mode 100644 index 66f733803..000000000 --- a/configs/bnb_125M.yml +++ /dev/null @@ -1,85 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - # model settings - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "use_bnb_optimizer": true, - - # these should provide some speedup but takes a while to build, set to true if desired - "scaled_upper_triang_masked_softmax_fusion": false, - "bias_gelu_fusion": false, - - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8, - } - }, - # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training - "zero_optimization": { - "stage": 0, - "allgather_partitions": True, - "allgather_bucket_size": 500000000, - "overlap_comm": True, - "reduce_scatter": True, - "reduce_bucket_size": 500000000, - "contiguous_gradients": True, - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - "split": "949,50,1", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.0, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - # precision settings - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/cpu_mock_config.yml b/configs/cpu_mock_config.yml deleted file mode 100644 index 653aa21d8..000000000 --- a/configs/cpu_mock_config.yml +++ /dev/null @@ -1,5 +0,0 @@ -# CPU unit tests should be independent of the presence of GPUs on the test server -# host. This configuration mocks these GPU resources and other dependencies. 
-{ - "global_num_gpus": 1 -} diff --git a/configs/data_mixture.yml b/configs/data_mixture.yml new file mode 100644 index 000000000..7c6a1b9f1 --- /dev/null +++ b/configs/data_mixture.yml @@ -0,0 +1,6 @@ +{ + "train-data-paths": ["/fsx/proj-mathlm/proof-pile_llama/train/arxiv-rp/arxiv-rp_text_document", "/fsx/proj-mathlm/open-web-math-v1.2_llama/train/open-web-math/open-web-math_text_document", "/fsx/proj-mathlm/code-with-proofsteps_llama/train/code-with-proofsteps/code-with-proofsteps_text_document", "/fsx/proj-mathlm/proof-pile_llama/train/pile-sample/pile-sample_text_document", "/fsx/proj-mathlm/code-rp_llama/train/code-rp/code-rp_text_document"], + "train-data-weights": [2, 4, 1, 0.147368, 0.221053], + "valid-data-paths": ["/fsx/proj-mathlm/proof-pile_llama/validation/arxiv-rp/arxiv-rp_text_document", "/fsx/proj-mathlm/open-web-math-v1.2_llama/validation/open-web-math/open-web-math_text_document", "/fsx/proj-mathlm/code-with-proofsteps_llama/validation/code-with-proofsteps/code-with-proofsteps_text_document"], + "test-data-paths": ["/fsx/proj-mathlm/proof-pile_llama/test/arxiv-rp/arxiv-rp_text_document", "/fsx/proj-mathlm/open-web-math-v1.2_llama/test/open-web-math/open-web-math_text_document", "/fsx/proj-mathlm/code-with-proofsteps_llama/test/code-with-proofsteps/code-with-proofsteps_text_document"], +} \ No newline at end of file diff --git a/configs/eleutherai_cluster.yml b/configs/eleutherai_cluster.yml deleted file mode 100644 index 36e75d8b3..000000000 --- a/configs/eleutherai_cluster.yml +++ /dev/null @@ -1,29 +0,0 @@ -# Data paths and options when using EleutherAI cluster -{ - # you may include multiple distinct datasets if desired - "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"], - "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"], - "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"], - - # if using multiple datasets, provide weights for them to be sampled with - # "train-data-weights": [1., 2.], - # "test-data-weights": [2., 1.], - # "valid-data-weights": [0.5, 0.4], - - - # If you would like the code to create val and test datasets from your training set use the following instead - # "split" determines the relative size of train, val, and test - - # "split" 995,4,1 - # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document", - - "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json", - "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt", - "save": "/mnt/ssd-1/checkpoints", - "load": "/mnt/ssd-1/checkpoints", - "tensorboard_dir": "/mnt/ssd-1/tensorboard", - "log_dir": "/mnt/ssd-1/logs", - "wandb_team": "eleutherai", - "wandb_project": "neox", - "wandb_group": "example" -} diff --git a/configs/finetuning_configs/6-9B.yml b/configs/finetuning_configs/6-9B.yml deleted file mode 100755 index c6120d6d5..000000000 --- a/configs/finetuning_configs/6-9B.yml +++ /dev/null @@ -1,89 +0,0 @@ -{ - # finetuning option - "load": "/path/to/checkpoint", - "finetune": true, - - "pipe-parallel-size": 1, - "model-parallel-size": 2, - - "num-layers": 32, - "hidden-size": 4096, - "num-attention-heads": 32, - "seq-length": 2048, - "max-position-embeddings": 2048, - "norm": "layernorm", - "pos-emb": "rotary", - "rotary_pct": 0.25, - "no-weight-tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention-config": [[["flash"], 32]], - - "scaled-upper-triang-masked-softmax-fusion": true, - "bias-gelu-fusion": true, - - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.95], - 
"eps": 1.0e-8 - } - }, - - "min_lr": 0.000012, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params - }, - - "train_micro_batch_size_per_gpu": 8, - "gradient_accumulation_steps": 2, - "data-impl": "mmap", - - "checkpoint-activations": true, - "checkpoint-num-layers": 1, - "partition-activations": true, - "synchronize-each-layer": true, - - "gradient_clipping": 1.0, - "weight-decay": 0.1, - "hidden-dropout": 0, - "attention-dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train-iters": 143000, - "lr-decay-iters": 143000, - "distributed-backend": "nccl", - "lr-decay-style": "cosine", - "warmup": 0.01, - "checkpoint-factor": 1000, - "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval-interval": 143000, - "eval-iters": 10, - - "log-interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/gen_docs.py b/configs/gen_docs.py deleted file mode 100644 index 08431e6c6..000000000 --- a/configs/gen_docs.py +++ /dev/null @@ -1,93 +0,0 @@ -import sys -import os - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) -from megatron.neox_arguments import neox_args, deepspeed_args -from inspect import getmembers, getsource -from dataclasses import field, is_dataclass -from itertools import tee, zip_longest -import pathlib - - -def pairwise(iterable): - "s -> (s0,s1), (s1,s2), (s2, s3), ..." 
- a, b = tee(iterable) - next(b, None) - return zip_longest(a, b) - - -def get_docs(module): - ARGS_CLASSES = getmembers(module, is_dataclass) - results = {} - for name, dcls in ARGS_CLASSES: - assert is_dataclass(dcls) - src = getsource(dcls) - d = dcls() - loc = 0 - results[name] = {"doc": d.__doc__.strip(), "attributes": {}} - for cur, _next in pairwise(d.__dataclass_fields__.items()): - field_name, field_def = cur - field_type = field_def.type - if hasattr(field_type, "__name__"): - field_type = field_type.__name__ - else: - field_type = str(field_type) - - field_default = field_def.default - - # try to find the field definition - loc = src.find(f" {field_name}:", loc + len(field_name) + 1) - - if _next is not None: - next_field_name, _ = _next - # try to find the next field definition - next_loc = src.find(f"{next_field_name}:", loc + len(field_name)) - else: - next_loc = len(src) - - # try to get the docstring - _src = src[loc:next_loc].strip() - if '"""' in _src: - doc = _src.split('"""')[1].strip() - elif "'''" in _src: - doc = _src.split("'''")[1].strip() - else: - doc = "" - results[name]["attributes"][field_name] = { - "name": field_name, - "type": field_type, - "default": field_default, - "doc": doc, - } - return results - - -def to_md(docs, intro_str=""): - """ - Writes the docs dictionary to markdown format - """ - lines = [] - lines.append(intro_str) - for name, doc in docs.items(): - lines.append(f"## {name}") - lines.append(f"{doc['doc']}") - lines.append("") - for field_name, field_def in doc["attributes"].items(): - # attribute name and type - lines.append(f"- **{field_name}**: {field_def['type']}") - # default value - lines.append(f" Default = {str(field_def['default'])}") - lines.append(f" {field_def['doc']}") - lines.append("") - return "\n\n".join(lines) - - -if __name__ == "__main__": - docs = get_docs(neox_args) - docs.update(get_docs(deepspeed_args)) - intro_str = """Arguments for gpt-neox. All of the following can be specified in your .yml config file(s):\n""" - md = to_md(docs, intro_str=intro_str) - with open(f"{pathlib.Path(__file__).parent.resolve()}/neox_arguments.md", "w") as f: - f.write(md) diff --git a/configs/gmlp_small.yml b/configs/gmlp_small.yml deleted file mode 100644 index 2a5b02d60..000000000 --- a/configs/gmlp_small.yml +++ /dev/null @@ -1,72 +0,0 @@ -# GPT-2 pretraining setup -{ - # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages - # across the node boundaries ) - "pipe_parallel_size": 1, - "model_parallel_size": 1, - "attention_config": [[["gmlp"], "all"]], - - - # model settings - "num_layers": 12, - "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4 - "gmlp_attn_dim": 64, - "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention. 
- "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "none", - "no_weight_tying": true, - - # optimizer settings - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e_8, - } - }, - - # batch / data settings - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - "split": "949,50,1", - - # activation checkpointing - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": false, - "synchronize_each_layer": true, - - # regularization - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - - # precision settings - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - # misc. training settings - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - - # logging - "log_interval": 100, - "steps_per_print": 10, - "keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, -} diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml deleted file mode 100644 index 305567be1..000000000 --- a/configs/llama/13B.yml +++ /dev/null @@ -1,26 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 2, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 40, - "hidden_size": 5120, - "num_attention_heads": 40, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 1, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-6, - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", -} diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml deleted file mode 100644 index 450f8da38..000000000 --- a/configs/llama/30B.yml +++ /dev/null @@ -1,26 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 4, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 60, - "hidden_size": 6656, - "num_attention_heads": 52, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 1, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-6, - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", -} diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml deleted file mode 100644 index 85f199ce2..000000000 --- a/configs/llama/65B.yml +++ /dev/null @@ -1,26 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 8, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 80, - "hidden_size": 8192, - "num_attention_heads": 64, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 1, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-6, - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - 
"mlp_type": "llama", - "activation": "silu", -} diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml deleted file mode 100644 index ecbf187a8..000000000 --- a/configs/llama/7B.yml +++ /dev/null @@ -1,26 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - "make_vocab_size_divisible_by": 1, - - # model settings - "num_layers": 32, - "hidden_size": 4096, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 1, - "no_weight_tying": true, - "gpt_j_residual": false, - "output_layer_parallelism": "column", - "norm": "rmsnorm", - "rms_norm_epsilon": 1.0e-6, - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": false, - "use_bias_in_norms": false, - "use_bias_in_attn_linear": false, - "mlp_type": "llama", - "activation": "silu", -} diff --git a/configs/llemma_34b.yml b/configs/llemma_34b.yml new file mode 100644 index 000000000..88a714575 --- /dev/null +++ b/configs/llemma_34b.yml @@ -0,0 +1,108 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 8, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 48, + "hidden_size": 8192, + "num_attention_heads": 64, + "attention_type": "groupedquery", + "num_kv_heads": 8, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 48]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00005, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 16, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 12000, + "lr_decay_iters": 12000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "decay_lr_to": 0.033, + "warmup_iters": 500, + "checkpoint_factor": 250, + "eval_interval": 250, + "eval_iters": 25, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + + "save": "/fsx/proj-mathlm/saved-weights/34b_1epoch", + # "load": "" # set to same as "save" to resume from intermediate finetuning step + "load": "/path/to/converted/codellama_34b_weights_with_mp8", + + "finetune": true, # set to false once resuming from intermediate finetuning step + "checkpoint_validation_with_forward_pass": true, + + + "use_wandb": true, + "wandb_group": "34b-codellama-5e-5lr", + "wandb_project": "math-lm", + "wandb_team": "your-teamname-here", + 
"wandb_host": "https://api.wandb.ai", + + "launcher": "slurm", + "deepspeed_slurm": true +} \ No newline at end of file diff --git a/configs/llemma_7b.yml b/configs/llemma_7b.yml new file mode 100644 index 000000000..c77c1c841 --- /dev/null +++ b/configs/llemma_7b.yml @@ -0,0 +1,105 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 48000, + "lr_decay_iters": 48000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "decay_lr_to": 0.033, + "warmup_iters": 500, + "checkpoint_factor": 500, + "eval_interval": 250, + "eval_iters": 50, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + + "save": "/path/to/save/llema-replication", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "/path/to/converted/codellama_7b_weights_with_mp2", + + "finetune": true, # set to false once resuming from intermediate finetuning step + "checkpoint_validation_with_forward_pass": true, + + "use_wandb": true, + "wandb_group": "codellama_200btok_mp2_32node_2gas_deeperspeed", + "wandb_project": "math-lm", + "wandb_team": "your-teamname-here", + "wandb_host": "https://api.wandb.ai", + + "launcher": "slurm", + "deepspeed_slurm": true +} \ No newline at end of file diff --git a/configs/local_setup.yml b/configs/local_setup.yml deleted file mode 100644 index d031a2ad8..000000000 --- a/configs/local_setup.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Suggested data paths when using GPT-NeoX locally -{ - "data_path": "data/enwik8/enwik8_text_document", - - # or for weighted datasets: - # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"], - # "train-data-weights": [1., 2.], - # 
"test-data-weights": [2., 1.], - # "valid-data-weights": [0.5, 0.4], - - # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group. - # WARNING: setting this to True will override any user provided weights - # "weight_by_num_documents": false, - # "weighted_sampler_alpha": 0.3, - - "vocab_file": "data/gpt2-vocab.json", - "merge_file": "data/gpt2-merges.txt", - - "save": "checkpoints", - "load": "checkpoints", - "checkpoint_validation_with_forward_pass": False, - - "tensorboard_dir": "tensorboard", - "log_dir": "logs", - "use_wandb": True, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" -} diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md deleted file mode 100644 index c50e7ff01..000000000 --- a/configs/neox_arguments.md +++ /dev/null @@ -1,2023 +0,0 @@ -Arguments for gpt-neox. All of the following can be specified in your .yml config file(s): - - -## NeoXArgsLRScheduler - -LR Scheduler Arguments - - - -- **lr_decay_style**: typing.Literal['constant', 'linear', 'cosine', 'exponential'] - - Default = linear - - Learning rate decay function. Choose from 'constant', 'linear', 'cosine', 'exponential'. - - - -- **lr_decay_iters**: int - - Default = None - - Number of iterations to decay learning rate over, If None defaults to --train-iters - - - -- **min_lr**: float - - Default = 0.0 - - Minimum value for learning rate. The scheduler clips values below this threshold. - - - -- **warmup**: float - - Default = 0.01 - - Percentage of total iterations to warmup on (.01 = 1 percent of all training iters). - - - -- **override_lr_scheduler**: bool - - Default = False - - Reset the values of the scheduler (learning rate,warmup iterations, minimum learning rate, maximum number of iterations, and decay style from input arguments and ignore values from checkpoints. Note that all the above values will be reset. - - - -- **use_checkpoint_lr_scheduler**: bool - - Default = False - - Use checkpoint to set the values of the scheduler (learning rate, warmup iterations, minimum learning rate, maximum number of iterations, and decay style from checkpoint and ignore input arguments. - - - -## NeoXArgsLogging - -Logging Arguments - - - -- **use_wandb**: bool - - Default = None - - Flag indicating if wandb is to be used. - - - -- **wandb_group**: str - - Default = None - - Weights and Biases group name - used to group together "runs". - - - -- **wandb_team**: str - - Default = None - - Team name for Weights and Biases. - - - -- **wandb_project**: str - - Default = neox - - wandb project name - - - -- **wandb_host**: str - - Default = https://api.wandb.ai - - url of the wandb host - - - -- **wandb_init_all_ranks**: bool - - Default = False - - Initialize wandb on all ranks. - - - -- **git_hash**: str - - Default = d3e481c - - current git hash of repository - - - -- **log_dir**: str - - Default = None - - Directory to save logs to. - - - -- **tensorboard_dir**: str - - Default = None - - Write TensorBoard logs to this directory. - - - -- **log_interval**: int - - Default = 100 - - Interval between logging. - - - -- **log_grad_pct_zeros**: bool - - Default = False - - Log the percentage of zeros for the gradient of each parameter to wandb / tensorboard (useful for debugging). Needs wandb_init_all_ranks set to True if using pipeline parallelism to log all ranks. 
- - - -- **log_param_norm**: bool - - Default = False - - Log the frob norm of the parameters to wandb / tensorboard (useful for debugging). Needs wandb_init_all_ranks set to True if using pipeline parallelism to log all ranks. - - - -- **log_grad_norm**: bool - - Default = False - - Log the frob norm of the gradients to wandb / tensorboard (useful for debugging). - (N.B - this will only work with pp = 0 for now, as we don't have access to the gradients of the model because - deepspeed.) - - - -- **log_optimizer_states**: bool - - Default = False - - Log the frob norm of the optimizer states to wandb / tensorboard (useful for debugging). - - - -- **log_gradient_noise_scale**: bool - - Default = False - - Whether to log the gradient noise scale when training (cf. https://arxiv.org/abs/1812.06162 for explanation) - - - -- **gradient_noise_scale_n_batches**: int - - Default = 5 - - Number of batches to accumulate gradients for in the gradient noise scale logger. - - - -- **gradient_noise_scale_cpu_offload**: bool - - Default = False - - Whether to offload the buffered gradients to cpu when measuring gradient noise scale. - - - -## NeoXArgsModel - -Model Arguments - - - -- **precision**: typing.Literal['fp16', 'fp32', 'bfloat16'] - - Default = None - - description of the used precision, either one of fp16 or fp32 (and in the future bf16). - - - -- **num_layers**: int - - Default = None - - Number of transformer layers. - - - -- **hidden_size**: int - - Default = None - - Transformer hidden size. - - - -- **num_attention_heads**: int - - Default = None - - Number of transformer attention heads. - - - -- **seq_length**: int - - Default = None - - Maximum sequence length to process. - - - -- **max_position_embeddings**: int - - Default = None - - Maximum number of position embeddings to use. This is the size of position embedding. - - - -- **norm**: typing.Literal['layernorm', 'rmsnorm', 'scalenorm'] - - Default = layernorm - - Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". - - - -- **layernorm_epsilon**: float - - Default = 1e-05 - - Layer norm epsilon. - - - -- **rms_norm_epsilon**: float - - Default = 1e-08 - - Root mean squared norm epsilon - - - -- **scalenorm_epsilon**: float - - Default = 1e-08 - - Scalenorm epsilon - - - -- **pos_emb**: typing.Literal['learned', 'rotary', 'sinusoidal', 'rpe', 'alibi', 'none'] - - Default = learned - - Type of positional embedding to use - choose from 'learned', 'rotary', 'sinusoidal', 'rpe', 'none' - - - -- **rpe_num_buckets**: int - - Default = 32 - - T5 relative positional encoding number of buckets, default 32. - - - -- **rpe_max_distance**: int - - Default = 128 - - T5 relative positional encoding max distance, default 128. - - - -- **opt_pos_emb_offset**: int - - Default = 0 - - Learned position embedding offset (only used by OPT, where it should be set to 2). - - - -- **no_weight_tying**: bool - - Default = False - - Disables weight tying between embedding weights and final Linear layer - - - -- **attention_config**: list - - Default = None - - Attention configuration for gpt-neox - - The first item in the list specifies the attention type(s), and should be a list of strings. The second item - specifies the number of times to repeat those attention types in the full list. 
- - attention type choices: [global, local, sparse_fixed, sparse_variable, bslongformer, bigbird] - - So a 12 layer network with only global attention could be specified like: - [[[`global`], 12]] - - or a 12 layer network with alternating global / local like: - [[[`global`, `local`], 6]] - - If none is specified, this defaults to - [[[`global`], n_layers]] - - - -- **sparsity_config**: dict - - Default = None - - Sparsity configuration dict as defined in https://www.deepspeed.ai/docs/config-json/#sparse-attention - - Note that since neox is autoregressive, attention is always "unidirectional" and `horizontal_global_attention` is - always false. - - The main difference between our sparsity config and deepspeed's is that `mode` is ignored - since it is instead - specified in attention_config defining each layer. - - An example config is given below: - "sparse_attention": { - "block": 16, - "different_layout_per_head": true, - "num_local_blocks": 4, - "num_global_blocks": 1, - "num_different_global_patterns": 4, - "num_random_blocks": 0, - "local_window_blocks": [4], - "global_block_indices": [0], - "global_block_end_indices": None, - "num_sliding_window_blocks": 3 - } - - - -- **num_unique_layers**: int - - Default = None - - Number of unique transformer layers. num-layers should be divisible by this value. Currently only has an effect when pipe_parallel_size=0. - - - -- **param_sharing_style**: str - - Default = grouped - - Ordering of the shared parameters. For example, for a num-layers=4 and --num-unique-layers=2, we will have the following ordering for two unique layers 1 and 2-: grouped: [1, 2, 1, 2] and spaced: [1, 1, 2, 2]. - - - -- **make_vocab_size_divisible_by**: int - - Default = 128 - - Pad the vocab size to be divisible by this value. This is added for computational efficiency reasons. - - - -- **activation**: typing.Literal['gelu', 'geglu', 'relu', 'softsign', 'swish', 'mish', 'silu'] - - Default = gelu - - Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish", "silu"] - - - -- **scaled_upper_triang_masked_softmax_fusion**: bool - - Default = False - - Enable fusion of query_key_value_scaling time (upper diagonal) masking and softmax. - - - -- **scaled_masked_softmax_fusion**: bool - - Default = False - - Enable fusion of query_key_value_scaling general masking and softmax. - - - -- **bias_gelu_fusion**: bool - - Default = False - - Enable bias and gelu fusion. - - - -- **bias_dropout_fusion**: bool - - Default = False - - Enable bias and dropout fusion. - - - -- **fp16_lm_cross_entropy**: bool - - Default = False - - Move the cross entropy unreduced loss calculation for lm head to fp16. - - - -- **init_method_std**: float - - Default = 0.02 - - Standard deviation of the zero mean normal distribution used for weight initialization. - - - -- **apply_query_key_layer_scaling**: bool - - Default = False - - Scale Q * K^T by 1 / layer-number. If this flag is set, then it will automatically set attention-softmax-in-fp32 to true - - - -- **use_cpu_initialization**: bool - - Default = False - - If set, affine parallel weights initialization uses CPU - - - -- **attention_softmax_in_fp32**: bool - - Default = False - - Run attention masking and softmax in fp32. 
- - - -- **rotary_pct**: float - - Default = 1.0 - - pct of hidden dims to apply rotary positional embedding to - - - -- **rotary_emb_base**: int - - Default = 10000 - - Base for rotary positional embedding - - - -- **init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] - - Default = normal - - Init function used on all layers except ff residual outputs - choose from - ["normal", "scaled_normal", "orthogonal", "scaled_orthogonal", "xavier_uniform", "xavier_normal", "wang_init", "small_init"] - - - -- **output_layer_init_method**: typing.Literal['normal', 'scaled_normal', 'orthogonal', 'scaled_orthogonal', 'xavier_uniform', 'xavier_normal', 'wang_init', 'small_init'] - - Default = scaled_normal - - Init function used for ff residual outputs - choose from - ["normal", "scaled_normal", "orthogonal", "scaled_orthogonal", "xavier_uniform", "xavier_normal", "wang_init", "small_init"] - - - -- **gmlp_attn_dim**: int - - Default = 64 - - the dimension of the single head self attention in gmlp model (not used in gpt models). - If None - gmlp model doesn't use attention. - - - -- **gpt_j_residual**: bool - - Default = False - - If false, we use the conventional residual path: - x = x + attn(ln1(x)) - x = x + mlp(ln2(x)) - Otherwise, we use the residual path from GPT-J, which offers a slight speedup: - x = ln(x) - x = x + attn(x) + mlp(x) - - - -- **gpt_j_tied**: bool - - Default = False - - If false, we use - x = x + attn(ln1(x)) + mlp(ln2(x)) - Otherwise, we tie the layer norms - y = ln(x) - x = x + attn(y) + mlp(y) - - - -- **use_bias_in_norms**: bool - - Default = True - - If false, norms (e.g. LayerNorm) will not have bias terms - - - -- **use_bias_in_attn_linear**: bool - - Default = True - - If false, attn_linear (e.g. QKVO) will not have bias terms - - - -- **mlp_type**: str - - Default = regular - - Types: - regular: Megatron implementation - llama: LLaMA MLP (SiLU-gated MLP) - - - -- **soft_prompt_tuning**: dict - - Default = None - - Dictionary configuring the soft prompt tuning parameters. - If enabled, will train *only* the soft prompt, and freezes the rest of the model. - parameters in the dict are: - 'enabled': bool = True # enables soft prompting - 'num_tokens': int = 10 # length of the soft prompt in tokens - 'init_string': str = '' # if provided, initialize the soft prompt with the word embeddings of this string - 'init_range': float = 0.5 # if no init string is provided, initialize the soft prompt with a uniform distribution between -init_range and init_rang - - - -- **output_layer_parallelism**: typing.Literal['column'] - - Default = column - - Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column) - - - -## NeoXArgsOptimizer - -Optimizer Arguments - - - -- **optimizer_type**: typing.Literal['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd'] - - Default = adam - - Type of optimizer to use. Choose from ['adam', 'onebitadam', 'cpu_adam', 'cpu_torch_adam', 'sm3', 'madgrad_wd', 'sgd'] - NOTE: sgd will use MuSGD from Mup. Mup must be enabled for this optimizer. 
- - - -- **use_bnb_optimizer**: bool - - Default = False - - Whether to enable the bitsandbytes optimizers - - - -- **zero_stage**: typing.Union[int, typing.List[int], typing.Literal['all']] - - Default = None - - Zero Optimizer stage - - - -- **zero_reduce_scatter**: bool - - Default = None - - Zero: Uses reduce or reduce scatter instead of allreduce to average gradients - - - -- **zero_contiguous_gradients**: bool - - Default = None - - Zero: Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. Only useful when running very large models. - - - -- **zero_reduce_bucket_size**: int - - Default = None - - Zero: Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes - - - -- **zero_allgather_bucket_size**: int - - Default = None - - Zero: Number of elements allgathered at a time. Limits the memory required for the allgather for large model sizes - - - -- **lr**: float - - Default = None - - Max Learning rate during training - - - -## NeoXArgsOther - -Misc. Arguments - - - -- **distributed_backend**: str - - Default = nccl - - Which backend to use for distributed training. - - - -- **local_rank**: int - - Default = None - - local rank passed from distributed launcher. - - - -- **rank**: int - - Default = None - - global rank of process being run (passed in via distributed launcher) - - - -- **lazy_mpu_init**: bool - - Default = False - - If set to True, initialize_megatron() skips DDP initialization and returns function to complete it instead. Also turns on use-cpu-initialization flag. This is for external DDP manager. - - - -- **short_seq_prob**: float - - Default = 0.1 - - Probability of producing a short sequence. - - - -- **eod_mask_loss**: bool - - Default = False - - Mask loss for the end of document tokens. - - - -- **adlr_autoresume**: bool - - Default = False - - Enable auto-resume on adlr cluster. - - - -- **adlr_autoresume_interval**: int - - Default = 1000 - - Intervals over which check for auto-resume termination signal - - - -- **seed**: int - - Default = 1234 - - Random seed used for python, numpy, pytorch, and cuda. - - - -- **onnx_safe**: bool - - Default = False - - Use workarounds for known problems with Torch ONNX exporter - - - -- **deepscale**: bool - - Default = False - - (Deprecated) enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)' - - - -- **deepscale_config**: str - - Default = None - - (Deprecated) deepscale json configuration file. 
- - - -- **deepspeed_mpi**: bool - - Default = False - - Run via MPI, this will attempt to discover the necessary variables to initialize torch distributed from the MPI environment - - - -- **deepspeed_slurm**: bool - - Default = False - - Run via SLURM, this will attempt to discover the necessary variables to initialize torch distributed from the SLURM environment - - - -- **user_script**: str - - Default = None - - user script to be run - - - -- **iteration**: int - - Default = None - - Set during training - - - -- **do_train**: int - - Default = None - - Set during training - - - -- **do_valid**: int - - Default = None - - Set during training - - - -- **do_test**: int - - Default = None - - Set during training - - - -- **save_iters**: list - - Default = None - - Set during training - - - -- **global_num_gpus**: int - - Default = None - - Set during launching - - - -## NeoXArgsParallelism - -Parallelism Arguments - - - -- **pipe_parallel_size**: int - - Default = 0 - - Number of pipeline parallel stages. Disable with 0. - - - -- **model_parallel_size**: int - - Default = 1 - - Size of the model parallelism. - - - -- **pipe_partition_method**: str - - Default = type:transformer|mlp - - method used to distribute model layers across pipeline stages. Choose from "parameters", which balances the number - of parameters on each pipeline stage, "uniform", which naively balances the number of layers per stage, or - "type:[regex]", which balances layers whose class names match [regex] - - - -- **world_size**: int - - Default = None - - Total world size (i.e number of gpus in cluster). Configured post-launch using distributed launcher - - - -- **is_pipe_parallel**: bool - - Default = False - - flag to determine whether pipeline parallelism is on - shouldn't be set by user, is automatically determined - according to pipeline parallel size. - - - -## NeoXArgsTemplate - -NeoXArgsTemplate() - - - -## NeoXArgsTextgen - -Text Generation arguments - - - -- **text_gen_type**: str - - Default = None - - How to generate text/sample the model. - Options: `unconditional`, `input-file`, `interactive` - - - -- **temperature**: float - - Default = 0.0 - - exponential scaling output distribution ("higher == more risk") - - - -- **top_p**: float - - Default = 0.0 - - Top-p (nucleus) sampling chooses from the smallest possible set of tokens whose cumulative probability exceeds the probability top_p. - - - -- **top_k**: int - - Default = 0 - - integer between 0 and the models vocab size. Filters out any logits with a probability less than that of the top_kth token. - - - -- **return_logits**: bool - - Default = False - - Boolean for whether to return the logits for generated tokens - - - -- **maximum_tokens**: int - - Default = 64 - - maximum number of tokens to be generated - - - -- **prompt_end**: str - - Default = - - - a single prompt's end. Defaults to newline - - - -- **sample_input_file**: str - - Default = None - - Get input from file instead of interactive mode, each line is an input. - - - -- **sample_output_file**: str - - Default = samples.txt - - Output file - - - -- **num_samples**: int - - Default = 1 - - Number of samples to generate unconditionally, defaults to 1 and interactive conditional sampling - - - -- **recompute**: bool - - Default = False - - During generation recompute all attention instead of using previously computed keys/values. 
- Should be set to true for sparse attention models - - - -- **eval_results_prefix**: str - - Default = - - prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json - - - -- **eval_tasks**: list - - Default = None - - Tasks to evaluate on using lm_eval_harness - - - -## NeoXArgsTokenizer - -Tokenizer Arguments - - - -- **tokenizer_type**: typing.Literal['GPT2BPETokenizer', 'HFTokenizer', 'HFGPT2Tokenizer', 'SPMTokenizer', 'CharLevelTokenizer', 'TiktokenTokenizer'] - - Default = GPT2BPETokenizer - - Type of tokenizer to use - should be one of ["GPT2BPETokenizer", "HFTokenizer", "HFGPT2Tokenizer", "SPMTokenizer", "CharLevelTokenizer", "TiktokenTokenizer"] - - - -- **padded_vocab_size**: int - - Default = None - - Total (padded) vocabulary size of tokenizer. Configured after launching of training, - as it's dependent on the parallelism size. - - - -## NeoXArgsTraining - -Training Arguments - - - -- **data_path**: str - - Default = None - - Path to combined dataset to split. - - - -- **use_shared_fs**: bool - - Default = True - - Whether to use a shared filesystem for data loading. If False, local rank 0 on all nodes will preprocess the data, - otherwise only global rank 0 will preprocess the data. This is implemented in megatron/data/gpt2_dataset.py::_build_index_mappings. - - - -- **train_data_paths**: list - - Default = None - - List of paths to train datasets. - - - -- **label_data_paths**: list - - Default = None - - List of paths to label datasets (not shifted by 1 yet!). - - - -- **test_data_paths**: list - - Default = None - - List of paths to test datasets. - - - -- **valid_data_paths**: list - - Default = None - - List of paths to validation datasets. - - - -- **train_data_weights**: list - - Default = None - - List of 'weights' that decide how often to sample from each training dataset when blending datasets. If None, defaults to equal weighting. - Should be a list the same length as `train_data_paths` - - - -- **valid_data_weights**: list - - Default = None - - List of 'weights' that decide how often to sample from each validation dataset when blending datasets. If None, defaults to equal weighting. - Should be a list the same length as `valid_data_paths` - - - -- **test_data_weights**: list - - Default = None - - List of 'weights' that decide how often to sample from each test dataset when blending datasets. If None, defaults to equal weighting. - Should be a list the same length as `test_data_paths` - - - -- **weight_by_num_documents**: bool - - Default = False - - If True, Builds dataset weights from a multinomial distribution over groups of data according to the number of - documents in each group. - - WARNING: setting this to True will override any user provided weights - - We sample from a group according to the probability p(L) ∝ |L| ** α, - where p(L) is the probability of sampling from a given group, - |L| is the number of examples in that datapoint, - and α is a coefficient that acts to upsample data from underrepresented groups - - Hence α (`alpha`) allows us to control how much to 'boost' the probability of training on low-resource groups. - - See https://arxiv.org/abs/1911.02116 for more details - - - -- **weighted_sampler_alpha**: float - - Default = 0.3 - - Alpha value for `weight_by_num_documents`. Only has an effect if `weight_by_num_documents` = True. 
- - when alpha = 1, the probability of sampling from a given group = n_samples / total_samples - as alpha -> 0, the probability of sampling from all groups becomes equal, and number of documents has no effect - as alpha -> inf, the probability of sampling from the groups with *the most samples* -> 1 - - - -- **data_impl**: str - - Default = infer - - Implementation of indexed datasets. - - - -- **mmap_warmup**: bool - - Default = False - - Warm up mmap files. - - - -- **save**: str - - Default = None - - Output directory to save checkpoints to. - - - -- **config_files**: dict - - Default = None - - Store of original config files mapping config filename to file contents - - - -- **load**: str - - Default = None - - Directory containing a model checkpoint. - - - -- **checkpoint_validation_with_forward_pass**: bool - - Default = False - - save input and output of a forward pass with the checkpoint and validate after load - - - -- **checkpoint_scale**: typing.Literal['linear', 'log'] - - Default = linear - - How step at which checkpoints are saved should scale. "linear" implies 1 checkpoint will be saved at every multiple of `checkpoint-factor`, - while "log" implies that the number of steps between each checkpoint will be multiplied by `checkpoint-factor` at each step, starting from step 1. - - - -- **checkpoint_factor**: int - - Default = None - - Acts as a multiplier on either the "log" or "linear" checkpoint spacing. - - With `checkpoint-scale="linear"`, `checkpoint-factor=20`, and `train-iters=100`, checkpoints will be saved at - steps [20, 40, 60, 80, 100]. - - With `checkpoint-scale="log"`, `checkpoint-factor=2`, and `train-iters=100`, checkpoints will be saved at - steps [1, 2, 4, 8, 16, 32, 64, 100]. - - Note that the last checkpoint step is always saved. - - - -- **extra_save_iters**: list - - Default = None - - Additional iterations when a checkpoint should be saved. - Must be a list of ints or `None`. - - - -- **no_save_optim**: bool - - Default = False - - Do not save current optimizer. - - - -- **no_save_rng**: bool - - Default = False - - Do not save current rng state. - - - -- **no_load_optim**: bool - - Default = False - - Do not load optimizer when loading checkpoint. - - - -- **no_load_rng**: bool - - Default = False - - Do not load rng state when loading checkpoint. - - - -- **finetune**: bool - - Default = False - - Load model for finetuning. Do not load optimizer or rng state from checkpoint and set iteration to 0. Assumed when loading a release checkpoint. - - - -- **batch_size**: int - - Default = None - - training microbatch size per gpu - - - -- **train_iters**: int - - Default = None - - Number of iterations to run for training. - - - -- **eval_iters**: int - - Default = 100 - - Number of iterations to run for evaluation validation/test for. - - - -- **keep_last_n_checkpoints**: int - - Default = None - - Number of last checkpoints to keep - - - -- **eval_interval**: int - - Default = 1000 - - Interval between running evaluation on validation set. - - - -- **split**: str - - Default = 969, 30, 1 - - Comma_separated list of proportions for training, validation, and test split. For example the split 90,5,5 will use 90% of data for training, 5% for validation and 5% for test. - - - -- **vocab_file**: str - - Default = None - - Path to the vocab file. - - - -- **merge_file**: str - - Default = None - - Path to the BPE merge file. - - - -- **num_workers**: int - - Default = 2 - - Dataloader number of workers. 
- - - -- **exit_interval**: int - - Default = None - - Exit the program after the iteration is divisible by this value. - - - -- **attention_dropout**: float - - Default = 0.1 - - Post attention dropout probability. - - - -- **hidden_dropout**: float - - Default = 0.1 - - Dropout probability for hidden state transformer. - - - -- **weight_decay**: float - - Default = 0.01 - - Weight decay coefficient for L2 regularization. - - - -- **checkpoint_activations**: bool - - Default = False - - Checkpoint activation to allow for training with larger models, sequences, and batch sizes. - - - -- **checkpoint_num_layers**: int - - Default = 1 - - Chunk size (number of layers) for checkpointing. - - - -- **deepspeed_activation_checkpointing**: bool - - Default = True - - DEPRECATED - TODO: remove - Uses activation checkpointing from deepspeed - - - -- **contiguous_checkpointing**: bool - - Default = False - - Contiguous memory checkpointing for activations. - - - -- **checkpoint_in_cpu**: bool - - Default = False - - Move the activation checkpoints to CPU. - - - -- **synchronize_each_layer**: bool - - Default = False - - does a synchronize at the beginning and end of each checkpointed layer. - - - -- **profile_backward**: bool - - Default = False - - Enables backward pass profiling for checkpointed layers. - - - -- **partition_activations**: bool - - Default = False - - Partition Activations across GPUs before checkpointing. - - - -- **gas**: int - - Default = None - - gradient_accumulation_steps - - - -- **clip_grad**: float - - Default = None - - Gradient clipping based on global L2 norm. - - - -- **hysteresis**: int - - Default = 2 - - hysteresis for dynamic loss scaling - - - -- **dynamic_loss_scale**: bool - - Default = None - - flag indicating whether dynamic loss scale is used - - - -- **loss_scale**: float - - Default = None - - Static loss scaling, positive power of 2 - values can improve fp16 convergence. If None, dynamic loss scaling is used. - - - -- **loss_scale_window**: float - - Default = 1000.0 - - Window over which to raise/lower dynamic scale. - - - -- **min_scale**: float - - Default = 1.0 - - Minimum loss scale for dynamic loss scale. - - - -- **char_level_ppl**: bool - - Default = False - - Whether to calculate character level perplexity as well as token level perplexity. (may incur a time cost) - - - -- **use_mup**: bool - - Default = False - - Whether to use Microsoft's Mup https://github.com/microsoft/mup - - - -- **coord_check**: bool - - Default = False - - Whether to generate a "coord check" plot to verify mup's implementation in neox - - - -- **save_base_shapes**: bool - - Default = False - - Whether to save base shapes for mup. This will save the shapes to the path specified in base-shapes-file. - - - -- **base_shapes_file**: str - - Default = None - - Path to the base shapes to save to/load from - - - -- **mup_init_scale**: float - - Default = 1.0 - - Initialization scale: All the parameters are multiplied by this value - - - -- **mup_attn_temp**: float - - Default = 1.0 - - Attention temperature: Reciprocal of the multiplier applied to the input to attention softmax - - - -- **mup_output_temp**: float - - Default = 1.0 - - Output temperature: Reciprocal of the multiplier applied to the input to softmax that - produces the distribution over output tokens. 
- - - -- **mup_embedding_mult**: float - - Default = 1.0 - - Scalar by which we multiply the output of the embedding layer - - - -- **mup_rp_embedding_mult**: float - - Default = 1.0 - - Scalar by which we multiply vectors representing relative position - - - -- **mup_width_scale**: int - - Default = 2 - - What to scale width by when creating the delta model for mup - - - -## NeoXArgsDeepspeedConfig - -Args for deepspeed config - Every argument included here will be included in deepspeed config json - As of Mar 8 2023, up to date compared to https://www.deepspeed.ai/docs/config-json/ - - - -- **deepspeed**: bool - - Default = True - - boolean flag to enable DeepSpeed (Always True) - - - -- **train_batch_size**: int - - Default = None - - The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs. - - - -- **train_micro_batch_size_per_gpu**: int - - Default = None - - Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, gradient_accumulation_steps is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with gradient_accumulation_steps in the configuration JSON. - - - -- **gradient_accumulation_steps**: int - - Default = 1 - - Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs. Should not be concurrently specified with train_step_batch_size in the configuration JSON. - - - -- **optimizer**: dict - - Default = None - - dict containing the keys type and params - - type: The optimizer name. DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, and OneBitLamb optimizers (See here for details) and will import other optimizers from torch. - - params: Dictionary of parameters to instantiate optimizer. The parameter names must match the optimizer constructor signature (e.g., for Adam). - - - -- **scheduler**: dict - - Default = None - - dict containing the keys type and params - - type: The scheduler name. See here (https://deepspeed.readthedocs.io/en/latest/schedulers.html) for list of support schedulers. - - params: Dictionary of parameters to instantiate scheduler. The parameter names should match scheduler constructor signature. - - - -- **fp32_allreduce**: bool - - Default = False - - During gradient averaging perform allreduce with 32 bit values - - - -- **prescale_gradients**: bool - - Default = False - - Scale gradients before doing allreduce - - - -- **gradient_predivide_factor**: float - - Default = 1.0 - - Before gradient averaging predivide gradients by a specified factor, can sometimes help with fp16 stability when scaling to large numbers of GPUs - - - -- **sparse_gradients**: bool - - Default = False - - Enable sparse compression of torch.nn.Embedding gradients. - - - -- **fp16**: dict - - Default = None - - Configuration for using mixed precision/FP16 training that leverages NVIDIA’s Apex package. 
- - Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#fp16-training-options - - - -- **bf16**: dict - - Default = None - - Configuration for using bfloat16 floating-point format as an alternative to FP16. BFLOAT16 requires hardware support (e.g., NVIDIA A100). Dictionary options as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#bfloat16-training-options - - - -- **amp**: dict - - Default = None - - Configuration for using automatic mixed precision (AMP) training that leverages NVIDIA’s Apex AMP package. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options - - - -- **gradient_clipping**: float - - Default = 1.0 - - Enable gradient clipping with provided value - - - -- **zero_optimization**: dict - - Default = None - - Configuration for using ZeRO optimization. - - Multi-level dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#zero-optimization-options - - - -- **curriculum_learning**: dict - - Default = None - - - - - -- **curriculum_seqlen**: int - - Default = 0 - - Internal var for tracking the current seqlen - - - -- **steps_per_print**: int - - Default = 10 - - Print train loss every N steps. - - - -- **wall_clock_breakdown**: bool - - Default = False - - Enable timing of the latency of forward/backward/update training phases. - - - -- **dump_state**: bool - - Default = False - - Print out state information of DeepSpeed object after initialization. - - - -- **flops_profiler**: dict - - Default = None - - Configuration for using FLOPS profiler. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#flops-profiler - - - -- **communication_data_type**: bool - - Default = None - - During gradient averaging, perform communication with selected data type. By default it will be determined by selected regime - - - -- **autotuning**: dict - - Default = None - - Configuration for using autotuning. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#autotuning - - - -- **activation_checkpointing**: dict - - Default = None - - Configuration for using activation checkpointing. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#activation-checkpointing - - - -- **sparse_attention**: dict - - Default = None - - Configuration for using sparse attention. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#sparse-attention - - - -- **data_efficiency**: dict - - Default = None - - Configuration for using data efficiency. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#data-efficiency - - - -- **tensorboard**: dict - - Default = None - - Configuration for using tensorboard. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#monitoring-module-tensorboard-wandb-csv - - - -- **wandb**: dict - - Default = None - - Configuration for using wandb. - - - -- **csv_monitor**: dict - - Default = None - - Configuration for using csv_monitor. - - - -- **elasticity**: dict - - Default = None - - Configuration for using elastic training. 
- - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#elastic-training-config-v01-and-v02 - - - -- **comms_logger**: dict - - Default = None - - Configuration for using communication logger. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#communication-logging - - - -- **compression_training**: dict - - Default = None - - Configuration for using compression training. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#compression - - - -- **checkpoint**: dict - - Default = None - - Configuration for using checkpointing. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#checkpoint-options - - - -- **data_types**: dict - - Default = None - - Configuration for using data types. - - Dictionary as described in Deepspeed documentation: https://www.deepspeed.ai/docs/config-json/#data-type-options - - - -- **deepspeed_extra_args**: dict - - Default = None - - Dictionary of extra arguments to be included in the yaml config file. This can be used for any argument not included in the above list. - - - -## NeoXArgsDeepspeedRunner - -Args for deepspeed runner (deepspeed.launcher.runner). - Every argument included here will be passed as command line argument to deepspeed.launcher.runner - - - -- **hostfile**: str - - Default = None - - list of hostnames / ssh aliases and the number of GPUs per host - - example file contents: - worker-1 slots=4 - worker-2 slots=4 - 127.0.0 slots=4 - 127.0.1 slots=4 - - - -- **include**: str - - Default = None - - Specify hardware resources to use during execution. String format is `NODE_SPEC[@NODE_SPEC ...]` where `NODE_SPEC=NAME[:SLOT[,SLOT ...]]`. If `:SLOT` is omitted, include all slots on that host. Example: `"worker-0@worker-1:0,2"` will use all slots. on `worker-0` and slots `[0, 2]` on `worker-1`. - - - -- **exclude**: str - - Default = None - - Specify hardware resources to NOT use during execution. Same format as include - - - -- **num_nodes**: int - - Default = -1 - - Total number of worker nodes to run on, this will use the top N hosts from the given hostfile. -1 will use all. - - - -- **num_gpus**: int - - Default = None - - Max number of GPUs to use on each node, will use [0:N) GPU ids on each node. None / not specifying a value will use all. - - - -- **master_port**: int - - Default = 29500 - - Port used by PyTorch distributed for communication during training. - - - -- **master_addr**: str - - Default = None - - IP address of node 0, will be inferred via 'hostname -I' if not specified. - - - -- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm'] - - Default = pdsh - - Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH. - - - -- **force_multi**: bool - - Default = False - - Force multi-node training even if only one node is specified. - - - -- **detect_nvlink_pairs**: bool - - Default = False - - If true, autodetects nvlink pairs and remaps cuda visible devices to place them next to each other. This is an Eleuther addition to deepspeed, and should speed up model parallel training on setups with nvlink pairs when mp=2. - - - -- **autotuning_run**: str - - Default = None - - Either "tune", "run", or `None`. - - - -- **no_ssh_check**: bool - - Default = False - - If true, overrides the default check where DeepSpeed confirms that the headnode is accessible via ssh. 
- - - -- **comment**: str - - Default = None - - Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometime necessary for cluster rules, or so I've heard. - diff --git a/configs/pythia/1-4B.yml b/configs/pythia/1-4B.yml deleted file mode 100755 index bfb8d4bc7..000000000 --- a/configs/pythia/1-4B.yml +++ /dev/null @@ -1,85 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 24, - "hidden_size": 2048, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 24]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0002, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00002, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 16, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - "tokenizer_type": "HFTokenizer" - } diff --git a/configs/pythia/12B.yml b/configs/pythia/12B.yml deleted file mode 100755 index 21b67521d..000000000 --- a/configs/pythia/12B.yml +++ /dev/null @@ -1,84 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 4, - - "num_layers": 36, - "hidden_size": 5120, - "num_attention_heads": 40, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 36]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.000012, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 8, - "gradient_accumulation_steps": 2, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - 
"hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "log_grad_norm": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/160M.yml b/configs/pythia/160M.yml deleted file mode 100755 index 2f6abdef5..000000000 --- a/configs/pythia/160M.yml +++ /dev/null @@ -1,85 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 12]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00006, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/1B.yml b/configs/pythia/1B.yml deleted file mode 100755 index 78fc28946..000000000 --- a/configs/pythia/1B.yml +++ /dev/null @@ -1,86 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 16, - "hidden_size": 2048, - "num_attention_heads": 8, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00025, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.000025, - - "zero_optimization": { - "stage": 0, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - 
"reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "fp16": { - "enabled": true, - "type": "bfloat16", - "auto_cast": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "fp32_allreduce": true, - - "train_micro_batch_size_per_gpu": 4, - "gradient_accumulation_steps": 4, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/2-8B.yml b/configs/pythia/2-8B.yml deleted file mode 100755 index 04427e9fd..000000000 --- a/configs/pythia/2-8B.yml +++ /dev/null @@ -1,87 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 32, - "hidden_size": 2560, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 32]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00016, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.000016, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 8, - "gradient_accumulation_steps": 2, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 40000, - "eval_iters": 10, - - "log_grad_norm": true, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/410M.yml b/configs/pythia/410M.yml deleted file mode 100755 index 95afe9cd9..000000000 --- a/configs/pythia/410M.yml +++ /dev/null @@ -1,85 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 24, - "hidden_size": 1024, - "num_attention_heads": 16, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - 
"no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 24]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0003, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.00003, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/pythia/6-9B.yml b/configs/pythia/6-9B.yml deleted file mode 100755 index 869129f30..000000000 --- a/configs/pythia/6-9B.yml +++ /dev/null @@ -1,84 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 2, - - "num_layers": 32, - "hidden_size": 4096, - "num_attention_heads": 32, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 32]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00012, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - - "min_lr": 0.000012, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 1260000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 1260000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 8, - "gradient_accumulation_steps": 2, - "data_impl": "mmap", - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 143000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git 
a/configs/pythia/70M.yml b/configs/pythia/70M.yml deleted file mode 100755 index a58553b4e..000000000 --- a/configs/pythia/70M.yml +++ /dev/null @@ -1,85 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - - "num_layers": 6, - "hidden_size": 512, - "num_attention_heads": 8, - "seq_length": 2048, - "max_position_embeddings": 2048, - "pos_emb": "rotary", - "rotary_pct": 0.25, - "no_weight_tying": true, - "gpt_j_residual": true, - "output_layer_parallelism": "column", - - "attention_config": [[["flash"], 6]], - - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - - "init_method": "small_init", - "output_layer_init_method": "wang_init", - - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001, - "betas": [0.9, 0.95], - "eps": 1.0e-8 - } - }, - "min_lr": 0.0001, - - "zero_optimization": { - "stage": 1, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true, - "cpu_offload": false - }, - - "train_micro_batch_size_per_gpu": 32, - "gas": 1, - "data_impl": "mmap", - "num_workers": 1, - - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - - "gradient_clipping": 1.0, - "weight_decay": 0.1, - "hidden_dropout": 0, - "attention_dropout": 0, - - "fp16": { - "fp16": true, - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 12, - "hysteresis": 2, - "min_loss_scale": 1 - }, - - "train_iters": 143000, - "lr_decay_iters": 143000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 1000, - "extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512], - "eval_interval": 100000, - "eval_iters": 10, - - "log_interval": 10, - "steps_per_print": 10, - "wall_clock_breakdown": true, - - "tokenizer_type": "HFTokenizer" -} diff --git a/configs/slurm_125M.yml b/configs/slurm_125M.yml deleted file mode 100644 index c6f388b9d..000000000 --- a/configs/slurm_125M.yml +++ /dev/null @@ -1,64 +0,0 @@ -{ - "pipe_parallel_size": 1, - "model_parallel_size": 1, - "num_layers": 12, - "hidden_size": 768, - "num_attention_heads": 12, - "seq_length": 2048, - "max_position_embeddings": 2048, - "norm": "layernorm", - "pos_emb": "rotary", - "no_weight_tying": true, - "scaled_upper_triang_masked_softmax_fusion": true, - "bias_gelu_fusion": true, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.0006, - "betas": [0.9, 0.999], - "eps": 1.0e-8 - } - }, - "zero_optimization": { - "stage": 0, - "allgather_partitions": true, - "allgather_bucket_size": 500000000, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 500000000, - "contiguous_gradients": true - }, - "train_micro_batch_size_per_gpu": 4, - "data_impl": "mmap", - "split": "949,50,1", - "checkpoint_activations": true, - "checkpoint_num_layers": 1, - "partition_activations": true, - "synchronize_each_layer": true, - "gradient_clipping": 1.0, - "weight_decay": 0.0, - "hidden_dropout": 0.0, - "attention_dropout": 0.0, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "train_iters": 320000, - "lr_decay_iters": 320000, - "distributed_backend": "nccl", - "lr_decay_style": "cosine", - "warmup": 0.01, - "checkpoint_factor": 10000, - "eval_interval": 1000, - "eval_iters": 10, - "log_interval": 100, - "steps_per_print": 10, - 
"keep_last_n_checkpoints": 4, - "wall_clock_breakdown": true, - "launcher": "slurm", - "deepspeed_slurm": true, - "comment": "neox" -} diff --git a/configs/slurm_local.json b/configs/slurm_local.json deleted file mode 100644 index 36e16089b..000000000 --- a/configs/slurm_local.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "vocab-file": "data/gpt2-vocab.json", - "merge-file": "data/gpt2-merges.txt", - "save": "checkpoints", - "checkpoint_validation_with_forward_pass": false, - "tensorboard-dir": "tensorboard", - "log-dir": "logs", - "use_wandb": true, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" -} diff --git a/configs/slurm_local.yml b/configs/slurm_local.yml deleted file mode 100644 index 1a2b73aba..000000000 --- a/configs/slurm_local.yml +++ /dev/null @@ -1,12 +0,0 @@ -{ - "data_path": "data/enwik8/enwik8_text_document", - "vocab_file": "data/gpt2-vocab.json", - "merge_file": "data/gpt2-merges.txt", - "save": "checkpoints", - "checkpoint_validation_with_forward_pass": false, - "tensorboard_dir": "tensorboard", - "log_dir": "logs", - "use_wandb": true, - "wandb_host": "https://api.wandb.ai", - "wandb_project": "neox" -} diff --git a/configs/sparse.yml b/configs/sparse.yml deleted file mode 100644 index 7251c88b7..000000000 --- a/configs/sparse.yml +++ /dev/null @@ -1,15 +0,0 @@ -# Add this to your config for sparse attention every other layer -{ - "attention_config": [[["local", "global"], "all"]], - - # sparsity config: - # (these are the defaults for local sliding window sparsity, training will work without this here, but it's left in for - # illustrative purposes) - # see https://www.deepspeed.ai/tutorials/sparse-attention/#how-to-config-sparsity-structures for - # more detailed config instructions and available parameters - - "sparsity_config": { - "block": 16, # block size - "num_local_blocks": 32, - } -} diff --git a/configs/text_generation.yml b/configs/text_generation.yml deleted file mode 100644 index 5a49d61e5..000000000 --- a/configs/text_generation.yml +++ /dev/null @@ -1,21 +0,0 @@ -# Parameters used for text generation -# Make sure `load` is specified somewhere else -{ - # Text gen type: `input-file`, `unconditional` or `interactive` - "text_gen_type": "unconditional", - - # Params for all - "maximum_tokens": 102, - "prompt_end": "\n", - "temperature": 1.0, - "top_p": 0.0, - "top_k": 0, - "recompute": false, - - # `unconditional`: samples - "num_samples": 10, - - # input/output file - "sample_input_file": "sample_input.txt", - "sample_output_file": "sample_output.txt", -} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..e9d301ca3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,116 @@ +absl-py==1.4.0 +aiohttp==3.8.4 +aiosignal==1.3.1 +appdirs==1.4.4 +async-timeout==4.0.2 +attrs==23.1.0 +best-download==0.0.9 +boto3==1.28.22 +botocore==1.31.22 +certifi==2023.5.7 +chardet==5.1.0 +charset-normalizer==3.1.0 +click==8.1.4 +cmake==3.26.4 +colorama==0.4.6 +CPCargo @ git+https://github.com/samikama/CPCargo@efbf0a5f2ad893c0eee4caae6098001b74be62d8 +DataProperty==1.0.0 +datasets==2.13.1 +DeepSpeed @ git+https://github.com/EleutherAI/DeeperSpeed.git@new-fix#egg=deepspeed +dill==0.3.6 +docker-pycreds==0.4.0 +einops==0.6.1 +filelock==3.12.2 +flash-attn==2.0.0.post1 +frozenlist==1.3.3 +fsspec==2023.6.0 +ftfy==6.1.1 +fused-kernels @ file:///fsx/hailey/math-lm/gpt-neox/megatron/fused_kernels +gitdb==4.0.10 +GitPython==3.1.32 +hf_transfer==0.1.3 +hjson==3.1.0 +huggingface-hub==0.16.4 +idna==3.4 +Jinja2==3.1.2 +jmespath==1.0.1 +joblib==1.3.1 
+jsonlines==3.1.0 +lit==16.0.6 +lm-dataformat @ git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 +lm-eval==0.3.0 +MarkupSafe==2.1.3 +mbstrdecoder==1.1.3 +mpmath==1.3.0 +multidict==6.0.4 +multiprocess==0.70.14 +networkx==3.1 +ninja==1.11.1 +nltk==3.8.1 +numexpr==2.8.4 +numpy==1.25.0 +nvidia-cublas-cu11==11.10.3.66 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +nvidia-cuda-runtime-cu11==11.7.99 +nvidia-cudnn-cu11==8.5.0.96 +nvidia-cufft-cu11==10.9.0.58 +nvidia-curand-cu11==10.2.10.91 +nvidia-cusolver-cu11==11.4.0.1 +nvidia-cusparse-cu11==11.7.4.91 +nvidia-nccl-cu11==2.14.3 +nvidia-nvtx-cu11==11.7.91 +openai==0.27.8 +packaging==23.1 +pandas==2.0.3 +pathtools==0.1.2 +pathvalidate==3.0.0 +portalocker==2.7.0 +protobuf==4.23.4 +psutil==5.9.5 +py-cpuinfo==9.0.0 +pyarrow==12.0.1 +pybind11==2.10.4 +pycountry==22.3.5 +pydantic==1.10.11 +pytablewriter==1.0.0 +python-dateutil==2.8.2 +pytz==2023.3 +PyYAML==6.0 +regex==2023.6.3 +rehash==1.0.1 +requests==2.31.0 +rouge-score==0.1.2 +s3transfer==0.6.1 +sacrebleu==1.5.0 +safetensors==0.3.1 +scikit-learn==1.3.0 +scipy==1.11.1 +sentencepiece==0.1.99 +sentry-sdk==1.28.1 +setproctitle==1.3.2 +six==1.16.0 +smmap==5.0.0 +sqlitedict==2.1.0 +sympy==1.12 +tabledata==1.3.1 +tcolorpy==0.1.3 +threadpoolctl==3.1.0 +tiktoken==0.4.0 +tokenizers==0.13.3 +torch==2.0.1 +tqdm==4.65.0 +tqdm-multiprocess==0.0.11 +transformers==4.31.0 +triton==2.0.0 +typepy==1.3.1 +typing_extensions==4.7.1 +tzdata==2023.3 +ujson==5.8.0 +urllib3==1.26.16 +wandb==0.15.5 +watchdog==3.0.0 +wcwidth==0.2.6 +xxhash==3.2.0 +yarl==1.9.2 +zstandard==0.21.0 \ No newline at end of file diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt deleted file mode 100644 index 6c58478f3..000000000 --- a/requirements/requirements-dev.txt +++ /dev/null @@ -1,7 +0,0 @@ -autopep8>=1.5.6 -clang-format>=13.0.1 -pre-commit>=2.17.0 -pytest>=6.2.3 -pytest-cov>=2.11.1 -pytest-forked>=1.3.0 -pytest-xdist diff --git a/requirements/requirements-flashattention.txt b/requirements/requirements-flashattention.txt deleted file mode 100644 index 8397ebe75..000000000 --- a/requirements/requirements-flashattention.txt +++ /dev/null @@ -1 +0,0 @@ -flash-attn==2.0.0.post1 diff --git a/requirements/requirements-onebitadam.txt b/requirements/requirements-onebitadam.txt deleted file mode 100644 index 349e3b39a..000000000 --- a/requirements/requirements-onebitadam.txt +++ /dev/null @@ -1 +0,0 @@ -cupy-cuda111>=8.6.0 diff --git a/requirements/requirements-sparseattention.txt b/requirements/requirements-sparseattention.txt deleted file mode 100644 index 09386fdcb..000000000 --- a/requirements/requirements-sparseattention.txt +++ /dev/null @@ -1 +0,0 @@ -triton==2.0.0.dev20221202 diff --git a/requirements/requirements-tensorboard.txt b/requirements/requirements-tensorboard.txt deleted file mode 100644 index 6d5967364..000000000 --- a/requirements/requirements-tensorboard.txt +++ /dev/null @@ -1 +0,0 @@ -tensorboard==2.13.0 diff --git a/requirements/requirements-wandb.txt b/requirements/requirements-wandb.txt deleted file mode 100644 index 1df18b051..000000000 --- a/requirements/requirements-wandb.txt +++ /dev/null @@ -1 +0,0 @@ -wandb>=0.10.28 diff --git a/requirements/requirements.txt b/requirements/requirements.txt deleted file mode 100644 index 443f162e6..000000000 --- a/requirements/requirements.txt +++ /dev/null @@ -1,17 +0,0 @@ -best_download -git+https://github.com/EleutherAI/DeeperSpeed.git#egg=deepspeed -ftfy>=6.0.1 
-git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
-huggingface_hub>=0.11.0
-hf-transfer>=0.1.3
-lm_eval>=0.3.0
-mpi4py>=3.0.3
-numpy>=1.22.0
-pybind11>=2.6.2
-regex
-sentencepiece
-six
-tiktoken>=0.1.2
-tokenizers>=0.12.1
-transformers>=4.24.0
-git+https://github.com/samikama/CPCargo@main
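
A note on the new top-level requirements.txt introduced above: it is a fully pinned snapshot of the training environment rather than a loose dependency spec. It pins flash-attn 2.0.0.post1, pulls DeepSpeed from the EleutherAI DeeperSpeed fork (new-fix branch), pins CPCargo to a specific commit, and installs fused-kernels from an absolute file:///fsx/... path on the training cluster. The sketch below shows one way to recreate that environment; the virtualenv location and the step of repointing the fused-kernels entry at a local gpt-neox checkout are assumptions, not something the patch specifies.

# Minimal sketch: recreate the pinned environment from the new requirements.txt.
# The venv path is arbitrary; a CUDA-capable host is assumed for flash-attn and fused kernels.
python3 -m venv ~/venvs/llemma
source ~/venvs/llemma/bin/activate
pip install --upgrade pip

# The fused-kernels entry points at file:///fsx/hailey/math-lm/gpt-neox/megatron/fused_kernels;
# on any other machine, edit that line to point at megatron/fused_kernels in your
# local gpt-neox checkout before installing.
pip install -r requirements.txt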