diff --git a/llm/convert_c4.py b/llm/convert_c4.py deleted file mode 100644 index 5b02e5160..000000000 --- a/llm/convert_c4.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""C4 streaming dataset conversion scripts.""" - -import os -import random -from argparse import ArgumentParser, Namespace -from glob import glob -from itertools import islice -from typing import Any, Dict, Iterable, List, Tuple - -import datasets -import torch -from composer.datasets.streaming import StreamingDatasetWriter -from datasets import Dataset -from torch.utils.data import DataLoader, IterableDataset, get_worker_info - - -def parse_args() -> Namespace: - """Parse commandline arguments.""" - args = ArgumentParser() - args.add_argument('--out_root', type=str, required=True) - args.add_argument('--shard_size_limit', type=int, default=1 << 28) - args.add_argument('--tqdm', type=int, default=1) - args.add_argument('--splits', nargs='+', default=['train', 'val']) - - return args.parse_args() - - -def get(split: str) -> IterableDataset: - """Collect the samples for this dataset split. - - Args: - split (str): Split name. - - Returns: - An IterableDataset. - """ - - class ShardedC4(IterableDataset): - - def __init__(self): - self.dataset = datasets.load_dataset(path='c4', name='en', split=split, streaming=True) - - def num_shards(self): - return len(self.dataset._ex_iterable.kwargs['filepaths']) - - def __iter__(self): - worker_info = get_worker_info() - if worker_info: - num_workers = worker_info.num_workers - worker_id = worker_info.id - shards = self.dataset._ex_iterable.kwargs['filepaths'] - assert len(shards) % num_workers == 0 - self.dataset._ex_iterable.kwargs['filepaths'] = shards[worker_id::num_workers] - return iter(self.dataset) - - return ShardedC4() - - -def each(dataset: IterableDataset) -> Iterable[Dict[str, bytes]]: - """Generator over each dataset sample. - - Args: - samples (Dataset): A HF Dataset locally downloaded. - - Yields: - Sample dicts. - """ - num_workers = min(64, dataset.num_shards()) - batch_size = 512 - # If using multiple workers, configure each worker to prefetch as many samples as it can, up to the aggregate device batch size - # If not using workers, the torch DataLoader expects the default value for prefetch_factor, which non-intuitively must be 2. - prefetch_factor = max(1, 2 * batch_size // num_workers) if num_workers > 0 else 2 - - loader = DataLoader( - dataset=dataset, - sampler=None, - batch_size=batch_size, - num_workers=num_workers, - prefetch_factor=prefetch_factor, - ) - for batch in loader: - keys = list(batch.keys()) - current_bs = len(batch[keys[0]]) - for idx in range(current_bs): - yield {key: batch_values[idx].encode('utf-8') for key, batch_values in batch.items()} - - -def main(args: Namespace) -> None: - """Main: create C4 streaming dataset. - - Args: - args (Namespace): Commandline arguments. 
- """ - fields = ['text', 'timestamp', 'url'] - - for (split, split_new_name, expected_num_samples) in [ - ('train', 'train', 364868892), - ('validation', 'val', 364608), - ]: - # Only generate the splits requested - if split_new_name not in args.splits: - continue - - # Get dataset - dataset = get(split=split) - - # Write samples - with StreamingDatasetWriter(dirname=os.path.join(args.out_root, split_new_name), - fields=fields, - shard_size_limit=args.shard_size_limit, - compression=None) as out: - out.write_samples(samples=each(dataset), use_tqdm=bool(args.tqdm), total=expected_num_samples) - - -if __name__ == '__main__': - main(parse_args()) diff --git a/llm/hf_configs/final/gpt-125m-biotok.json b/llm/hf_configs/final/gpt-125m-biotok.json new file mode 100644 index 000000000..39a2e388a --- /dev/null +++ b/llm/hf_configs/final/gpt-125m-biotok.json @@ -0,0 +1,38 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 28895, + "embd_pdrop": 0.1, + "eos_token_id": 28895, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 768, + "n_head": 12, + "n_inner": null, + "n_layer": 12, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": true, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50 + } + }, + "torch_dtype": "float32", + "transformers_version": "4.21.3", + "use_cache": false, + "vocab_size": 28896 +} diff --git a/llm/hf_configs/final/gpt-1b-biotok.json b/llm/hf_configs/final/gpt-1b-biotok.json new file mode 100644 index 000000000..4c72f1e48 --- /dev/null +++ b/llm/hf_configs/final/gpt-1b-biotok.json @@ -0,0 +1,38 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 28895, + "embd_pdrop": 0.1, + "eos_token_id": 28895, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 2048, + "n_head": 16, + "n_inner": null, + "n_layer": 24, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": true, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50 + } + }, + "torch_dtype": "float32", + "transformers_version": "4.21.3", + "use_cache": false, + "vocab_size": 28896 +} diff --git a/llm/hf_configs/final/gpt-3b-biotok.json b/llm/hf_configs/final/gpt-3b-biotok.json new file mode 100644 index 000000000..f2d6bc213 --- /dev/null +++ b/llm/hf_configs/final/gpt-3b-biotok.json @@ -0,0 +1,38 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 28895, + "embd_pdrop": 0.1, + "eos_token_id": 28895, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 2560, + "n_head": 20, + "n_inner": null, + "n_layer": 32, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": true, + "scale_attn_weights": 
true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50 + } + }, + "torch_dtype": "float32", + "transformers_version": "4.21.3", + "use_cache": false, + "vocab_size": 28896 +} diff --git a/llm/hf_configs/final/gpt-5b-biotok.json b/llm/hf_configs/final/gpt-5b-biotok.json new file mode 100644 index 000000000..a1589121e --- /dev/null +++ b/llm/hf_configs/final/gpt-5b-biotok.json @@ -0,0 +1,38 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 28895, + "embd_pdrop": 0.1, + "eos_token_id": 28895, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 4096, + "n_head": 32, + "n_inner": null, + "n_layer": 24, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": true, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50 + } + }, + "torch_dtype": "float32", + "transformers_version": "4.21.3", + "use_cache": false, + "vocab_size": 28896 +} diff --git a/llm/hf_configs/final/gpt-7b-biotok.json b/llm/hf_configs/final/gpt-7b-biotok.json new file mode 100644 index 000000000..b7ec40b91 --- /dev/null +++ b/llm/hf_configs/final/gpt-7b-biotok.json @@ -0,0 +1,38 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 28895, + "embd_pdrop": 0.1, + "eos_token_id": 28895, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 4096, + "n_head": 32, + "n_inner": null, + "n_layer": 32, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": true, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50 + } + }, + "torch_dtype": "float32", + "transformers_version": "4.21.3", + "use_cache": false, + "vocab_size": 28896 +} diff --git a/llm/hf_configs/gpt-125m-ctx-1024.json b/llm/hf_configs/old/gpt-125m-ctx-1024.json similarity index 100% rename from llm/hf_configs/gpt-125m-ctx-1024.json rename to llm/hf_configs/old/gpt-125m-ctx-1024.json diff --git a/llm/hf_configs/gpt-1b-ctx-1024.json b/llm/hf_configs/old/gpt-1b-ctx-1024.json similarity index 100% rename from llm/hf_configs/gpt-1b-ctx-1024.json rename to llm/hf_configs/old/gpt-1b-ctx-1024.json diff --git a/llm/hf_configs/gpt-350m-ctx-1024.json b/llm/hf_configs/old/gpt-350m-ctx-1024.json similarity index 100% rename from llm/hf_configs/gpt-350m-ctx-1024.json rename to llm/hf_configs/old/gpt-350m-ctx-1024.json diff --git a/llm/hf_configs/gpt-3b-ctx-1024.json b/llm/hf_configs/old/gpt-3b-ctx-1024.json similarity index 100% rename from llm/hf_configs/gpt-3b-ctx-1024.json rename to llm/hf_configs/old/gpt-3b-ctx-1024.json diff --git a/llm/hf_configs/gpt-760m-ctx-1024.json b/llm/hf_configs/old/gpt-760m-ctx-1024.json similarity index 100% rename from 
llm/hf_configs/gpt-760m-ctx-1024.json rename to llm/hf_configs/old/gpt-760m-ctx-1024.json diff --git a/llm/hf_configs/gpt-7b-ctx-1024.json b/llm/hf_configs/old/gpt-7b-ctx-1024.json similarity index 100% rename from llm/hf_configs/gpt-7b-ctx-1024.json rename to llm/hf_configs/old/gpt-7b-ctx-1024.json diff --git a/llm/hf_configs/mistral_gpt2_small.json b/llm/hf_configs/old/mistral_gpt2_small.json similarity index 100% rename from llm/hf_configs/mistral_gpt2_small.json rename to llm/hf_configs/old/mistral_gpt2_small.json diff --git a/llm/hf_configs/old/noflash-gpt-125m-ctx-1024.json b/llm/hf_configs/old/noflash-gpt-125m-ctx-1024.json new file mode 100644 index 000000000..604207073 --- /dev/null +++ b/llm/hf_configs/old/noflash-gpt-125m-ctx-1024.json @@ -0,0 +1,38 @@ +{ + "activation_function": "gelu", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 768, + "n_head": 12, + "n_inner": null, + "n_layer": 12, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": false, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50 + } + }, + "torch_dtype": "float32", + "transformers_version": "4.12.3", + "use_cache": false, + "vocab_size": 50257 +} diff --git a/llm/hf_configs/pubmed_gpt2_medium.json b/llm/hf_configs/old/pubmed_gpt2_medium.json similarity index 100% rename from llm/hf_configs/pubmed_gpt2_medium.json rename to llm/hf_configs/old/pubmed_gpt2_medium.json diff --git a/llm/hf_configs/pubmed_mistral_gpt2_small.json b/llm/hf_configs/old/pubmed_mistral_gpt2_small.json similarity index 100% rename from llm/hf_configs/pubmed_mistral_gpt2_small.json rename to llm/hf_configs/old/pubmed_mistral_gpt2_small.json diff --git a/llm/hf_configs/tests/gpt-125m-ctx-1024-no-dropout.json b/llm/hf_configs/tests/gpt-125m-ctx-1024-no-dropout.json new file mode 100644 index 000000000..6ca6a66a0 --- /dev/null +++ b/llm/hf_configs/tests/gpt-125m-ctx-1024-no-dropout.json @@ -0,0 +1,38 @@ +{ + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.0, + "bos_token_id": 50256, + "embd_pdrop": 0.0, + "eos_token_id": 50256, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 768, + "n_head": 12, + "n_inner": null, + "n_layer": 12, + "n_positions": 1024, + "reorder_and_upcast_attn": false, + "resid_pdrop": 0.0, + "scale_attn_by_inverse_layer_idx": true, + "scale_attn_weights": true, + "summary_activation": null, + "summary_first_dropout": 0.0, + "summary_proj_to_labels": true, + "summary_type": "cls_index", + "summary_use_proj": true, + "task_specific_params": { + "text-generation": { + "do_sample": true, + "max_length": 50 + } + }, + "torch_dtype": "float32", + "transformers_version": "4.21.3", + "use_cache": false, + "vocab_size": 50257 +} diff --git a/llm/llm/data.py b/llm/llm/data.py deleted file mode 100644 index ed2f2aabd..000000000 --- a/llm/llm/data.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -""" -Build a StreamingC4 dataset and dataloader for training. 
-""" - -import os -import sys -from itertools import islice -from typing import Any, Dict, Iterator, Mapping, Optional - -import transformers -from composer.datasets.streaming import StreamingDataset -from torch.utils.data import DataLoader - - -class StreamingC4(StreamingDataset): - """ - Implementation of the C4 (Colossal Cleaned Common Crawl) dataset using StreamingDataset V1. - - Args: - remote (str): Remote directory (S3 or local filesystem) where dataset is stored. - local (str): Local filesystem directory where dataset is cached during operation. - split (str): The dataset split to use, either 'train' or 'val'. - shuffle (bool): Whether to shuffle the samples in this dataset. - tokenizer_name (str): The name of the HuggingFace tokenizer to use to tokenize samples. - max_seq_len (int): The max sequence length of each token sample. - group_method (str): How to group text samples into token samples. Supports 'truncate' or 'concat'. - max_retries (int): Number of download re-attempts before giving up. Default: 2. - timeout (float): How long to wait for shard to download before raising an exception. Default: 120 sec. - batch_size (Optional[int]): Hint batch_size that will be used on each device's DataLoader. Default: ``None``. - """ - - def __init__(self, - remote: str, - local: str, - split: str, - shuffle: bool, - tokenizer_name: str, - max_seq_len: int, - group_method: str = 'truncate', - max_retries: int = 2, - timeout: float = 120, - batch_size: Optional[int] = None): - # Validation - if split not in ['train', 'val']: - raise ValueError(f"split='{split}' must be one of ['train', 'val'].") - if group_method not in ['truncate', 'concat']: - raise ValueError(f"group_method='{group_method}' must be one of ['truncate', 'concat'].") - - # Build StreamingDataset - decoders = { - 'text': self._decode, - 'timestamp': self._decode, - 'url': self._decode, - } - super().__init__(remote=os.path.join(remote, split), - local=os.path.join(local, split), - shuffle=shuffle, - decoders=decoders, - max_retries=max_retries, - timeout=timeout, - batch_size=batch_size) - self.tokenizer_name = tokenizer_name - self.max_seq_len = max_seq_len - self.group_method = group_method - - # Build tokenizer - self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.tokenizer_name) - if self.tokenizer.pad_token is None: - # Some tokenizers (e.g. 
GPT2 tokenizer) have no padding token which causes bugs - self.tokenizer.pad_token = self.tokenizer.eos_token - # suppress warnings when using group_method='concat' and no truncation - self.tokenizer.model_max_length = int(1e30) - - # How to decode binary data from .mds files to python strings - def _decode(self, data: bytes) -> str: - return data.decode('utf-8') - - # How to tokenize a text sample to a token sample - def _tokenize(self, text_sample): - if self.group_method == 'truncate': - truncation = True - padding = 'max_length' - max_length = self.max_seq_len - elif self.group_method == 'concat': - truncation = False - padding = False - max_length = None - else: - raise ValueError(f"Got unknown group_method='{self.group_method}'.") - return self.tokenizer(text_sample['text'], truncation=truncation, padding=padding, max_length=max_length) - - # How to process a sample - def __getitem__(self, idx: int) -> Dict[str, Any]: - text_sample = super().__getitem__(idx) - token_sample = self._tokenize(text_sample) - return token_sample - - # Define iterable over samples - # Usually this can be left alone and inherited directly from super() class StreamingDataset, but concatenating samples is custom behavior. - # If group_method=='truncate', we simply return the token sample. - # If group_method=='concat', then we keep fetching token samples until we fill up max_seq_len. - def __iter__(self) -> Iterator[Any]: - if self.group_method == 'truncate': - iterator = super().__iter__() - yield from iterator - - elif self.group_method == 'concat': - buffer = {} - while True: - iterator = super().__iter__() - for sample in iterator: - - for k, v in sample.items(): - buffer[k] = buffer.get(k, []) + v + [self.tokenizer.eos_token_id] - if len(buffer['input_ids']) >= self.max_seq_len: - concat_sample = {} - for k, v in buffer.items(): - concat_sample[k] = v[:self.max_seq_len] - buffer[k] = v[self.max_seq_len:] - yield concat_sample - else: - raise ValueError(f"Got unknown group_method='{self.group_method}'.") - - # Define length - # Usually this can be left alone and inherited directly from super() class StreamingDataset, but concatenating samples is custom behavior. - # If group_method=='truncate', we simply return the # samples. - # If group_method=='concat', we repeat forever, and we don't have a defined length. 
- def __len__(self) -> int: - if self.group_method == 'truncate': - return super().__len__() - elif self.group_method == 'concat': - return None - else: - raise ValueError(f"Got unknown group_method='{self.group_method}'.") - - -def build_dataloader(cfg: Mapping[str, Any], device_batch_size: int): - - if cfg.dataset.name == 'streaming_c4': - dataset = StreamingC4(split=cfg.dataset.split, - remote=cfg.dataset.remote, - local=cfg.dataset.local, - shuffle=cfg.dataset.shuffle, - tokenizer_name=cfg.dataset.tokenizer_name, - max_seq_len=cfg.dataset.max_seq_len, - group_method=cfg.dataset.group_method, - batch_size=device_batch_size) - else: - raise ValueError(f'Not sure how to build dataset={cfg.dataset.name}') - - collate_fn = transformers.DataCollatorForLanguageModeling( - tokenizer=dataset.tokenizer, mlm=False) - - return DataLoader( - dataset, - collate_fn=collate_fn, - batch_size=device_batch_size, - drop_last=cfg.drop_last, - num_workers=cfg.num_workers, - pin_memory=cfg.pin_memory, - prefetch_factor=cfg.prefetch_factor, - persistent_workers=cfg.persistent_workers, - timeout=cfg.timeout, - ) - -# Helpful to test if your dataloader is working locally -# Run `python data.py [remote] [local, optional]` and verify that batches are printed out -if __name__ == '__main__': - remote = sys.argv[1] - if len(sys.argv) > 2: - local = sys.argv[2] - else: - local = remote - print (f'Reading val split from {remote} -> {local}') - - batch_size = 2 - dataset = StreamingC4(split='val', - remote=remote, - local=local, - shuffle=False, - tokenizer_name='gpt2', - max_seq_len=32, - group_method='concat', - batch_size=batch_size) - - collate_fn = transformers.DataCollatorForLanguageModeling( - tokenizer=dataset.tokenizer, mlm=False) - - loader = DataLoader( - dataset, - collate_fn=collate_fn, - batch_size=batch_size, - drop_last=False, - num_workers=4, - ) - - for batch_ix, batch in enumerate(islice(loader, 5)): - print('\n') - print ('#'*20, f'Batch {batch_ix}', '#'*20) - for k, v in batch.items(): - print (k, v.shape, v.dtype) - for sample_ix, token_sample in enumerate(batch['input_ids']): - print ('-'*20, f' Sample {sample_ix} ', '-'*20) - print (dataset.tokenizer.decode(token_sample)) - diff --git a/llm/llm/data_pubmed.py b/llm/llm/data_pubmed.py index 4d7ade35b..dcb6a3200 100644 --- a/llm/llm/data_pubmed.py +++ b/llm/llm/data_pubmed.py @@ -85,7 +85,7 @@ def __iter__(self) -> Iterator[Any]: iterator = super().__iter__() for sample in iterator: for k, v in sample.items(): - buffer[k] = buffer.get(k, []) + v + [self.tokenizer.eos_token_id] + buffer[k] = buffer.get(k, []) + v while len(buffer['input_ids']) >= self.max_seq_len: concat_sample = {} for k, v in buffer.items(): diff --git a/llm/llm/gpt.py b/llm/llm/gpt.py index f7c14744e..ee99542a3 100644 --- a/llm/llm/gpt.py +++ b/llm/llm/gpt.py @@ -15,19 +15,41 @@ from composer.models.base import ComposerModel from flash_attn.flash_attention import FlashMHA from transformers.models.gpt2 import GPT2Config +from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2LMHeadModel -from .hf_flash_gpt_2 import GPT2FlashLMHeadModel +from .hf_flash_gpt2 import GPT2FlashLMHeadModel +def prepare_hf_gpt2_model_for_fsdp(model): + # Special Case! When using the LMHeadModel, the weights of the self.lm_head and self.transformer.wte are tied. + # This tying occurs inside the `self.post_init()` function call above. 
+ # This is a hurdle for FSDP because they need to be in the same FSDP block + # These lines ensure that both modules stay together in the top-most block + model.transformer._fsdp_wrap = False + model.transformer.wte._fsdp_wrap = False + model.lm_head._fsdp_wrap = False + + + # FSDP Wrap and Activation Checkpoint every GPT2Block + model.fsdp_wrap_fn = lambda module: isinstance(module, GPT2Block) + model.activation_checkpointing_fn = lambda module: isinstance(module, GPT2Block) + class ComposerGPT(ComposerModel): - def __init__(self, cfg, device='meta'): + def __init__(self, cfg): super().__init__() # load GPT2 config from standard HF model config json hf_config = GPT2Config.from_json_file(cfg.hf_config) # build model with config - self.model = GPT2FlashLMHeadModel(hf_config) - self.model.to(device) + flash_attn = cfg.get('flash_attn', False) + if flash_attn: + self.model = GPT2FlashLMHeadModel(hf_config) + else: + self.model = GPT2LMHeadModel(hf_config) + + # Tag layers to make the model ready for FSDP + prepare_hf_gpt2_model_for_fsdp(self.model) + self.train_metrics = { 'LanguageCrossEntropy': LanguageCrossEntropy(hf_config.vocab_size), 'Perplexity': Perplexity(), @@ -43,7 +65,7 @@ def get_targets(self, batch): return targets def forward(self, batch): - return self.model(input_ids=batch['input_ids']).logits + return self.model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask']).logits def eval_forward(self, batch, outputs=None): return outputs if outputs is not None else self.forward(batch) diff --git a/llm/llm/gpt_old.py b/llm/llm/gpt_old.py deleted file mode 100644 index af5c37638..000000000 --- a/llm/llm/gpt_old.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright 2022 MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -""" -A simple, flexible implementation of a GPT model. 
-Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py -""" - -import math -from typing import Any, Mapping - -import torch -import torch.nn as nn -import torch.nn.functional as F -from composer.metrics.nlp import LanguageCrossEntropy, Perplexity -from composer.models.base import ComposerModel -from flash_attn.flash_attention import FlashMHA - - -class TorchCausalAttention(nn.Module): - def __init__(self, cfg: Mapping[str, Any], device: str = None): - super().__init__() - self.mha = nn.MultiheadAttention( - embed_dim=cfg.d_model, - num_heads=cfg.n_heads, - dropout=cfg.attn_pdrop, - bias=True, - batch_first=True, - device=device, - ) - self.register_buffer( - "mask", torch.tril(torch.ones(cfg.max_seq_len, cfg.max_seq_len))) - self.mha.out_proj._is_residual = True - - def forward(self, x, key_padding_mask): - return self.mha(x, x, x, attn_mask=self.mask, need_weights=False) - - -class FlashCausalAttention(nn.Module): - def __init__(self, cfg: Mapping[str, Any], device: str = None): - super().__init__() - self.mha = FlashMHA( - embed_dim=cfg.d_model, - num_heads=cfg.n_heads, - attention_dropout=cfg.attn_pdrop, - bias=True, - batch_first=True, - causal=True, - device=device, - ) - self.mha.out_proj._is_residual = True - - def forward(self, x, key_padding_mask): - return self.mha(x, - key_padding_mask=key_padding_mask, - need_weights=False) - - -class GPTMLP(nn.Module): - def __init__(self, cfg: Mapping[str, Any], device: str = None): - super().__init__() - self.mlp_up = nn.Linear(cfg.d_model, - cfg.mlp_ratio * cfg.d_model, - device=device) - self.mlp_act = nn.GELU(approximate='none') - self.mlp_down = nn.Linear(cfg.mlp_ratio * cfg.d_model, - cfg.d_model, - device=device) - self.mlp_down._is_residual = True - - def forward(self, x): - return self.mlp_down(self.mlp_act(self.mlp_up(x))) - - -class GPTBlock(nn.Module): - def __init__(self, cfg: Mapping[str, Any], device: str = None): - super().__init__() - self.ln_1 = nn.LayerNorm(cfg.d_model, device=device) - if cfg.attn_impl == 'torch': - self.causal_attn = TorchCausalAttention(cfg, device) - elif cfg.attn_impl == 'flash': - self.causal_attn = FlashCausalAttention(cfg, device) - else: - raise ValueError(f'Unknown attn_impl={cfg.attn_impl}') - self.ln_2 = nn.LayerNorm(cfg.d_model, device=device) - self.mlp = GPTMLP(cfg, device=device) - self.resid_attn_dropout = nn.Dropout(cfg.resid_pdrop) - self.resid_mlp_dropout = nn.Dropout(cfg.resid_pdrop) - - def forward(self, - x: torch.Tensor, - key_padding_mask: torch.ByteTensor = None) -> torch.Tensor: - a = self.ln_1(x) - b, _ = self.causal_attn(a, key_padding_mask) - x = x + self.resid_attn_dropout(b) - m = self.ln_2(x) - n = self.mlp(m) - x = x + self.resid_mlp_dropout(n) - return x - - -class GPT(nn.Module): - def __init__(self, cfg: Mapping[str, Any], device: str = 'meta'): - super().__init__() - self.cfg = cfg - self.transformer = nn.ModuleDict( - dict( - wte=nn.Embedding(cfg.vocab_size, cfg.d_model, device=device), - wpe=nn.Embedding(cfg.max_seq_len, cfg.d_model, device=device), - emb_drop=nn.Dropout(cfg.emb_pdrop), - blocks=nn.ModuleList([ - GPTBlock(cfg, device=device) for _ in range(cfg.n_layers) - ]), - ln_f=nn.LayerNorm(cfg.d_model, device=device), - )) - self.lm_head = nn.Linear(cfg.d_model, - cfg.vocab_size, - bias=False, - device=device) - - if device != 'meta': - self.apply(self.param_init_fn) - - def forward(self, - input_ids: torch.LongTensor, - key_padding_mask: torch.ByteTensor = None): - _, S = input_ids.size() - assert ( - S <= self.cfg.max_seq_len - ), f"Cannot 
forward input with seq_len={S}, this model only supports seq_len<={self.cfg.max_seq_len}" - pos = torch.arange(0, S, dtype=torch.long, - device=input_ids.device).unsqueeze(0) - - tok_emb = self.transformer.wte(input_ids) - pos_emb = self.transformer.wpe(pos) - x = self.transformer.emb_drop(tok_emb + pos_emb) - for block in self.transformer.blocks: - x = block(x, key_padding_mask) - x = self.transformer.ln_f(x) - logits = self.lm_head(x) - return logits - - # Param Initialization, needed for device='meta' fast initialization - def param_init_fn(self, module): - # Linear - if isinstance(module, nn.Linear): - torch.nn.init.normal_(module.weight, - mean=0.0, - std=self.cfg.init_std) - if module.bias is not None: - torch.nn.init.zeros_(module.bias) - - if getattr(module, '_is_residual', False): - module.weight.data.normal_( - mean=0.0, - std=(self.cfg.init_std / math.sqrt(2 * self.cfg.n_layers))) - - # Embedding - if isinstance(module, nn.Embedding): - torch.nn.init.normal_(module.weight, - mean=0.0, - std=self.cfg.init_std) - - # LayerNorm - if isinstance(module, nn.LayerNorm): - torch.nn.init.zeros_(module.bias) - torch.nn.init.ones_(module.weight) - - # FSDP Wrap function - def fsdp_wrap_fn(self, module): - return isinstance(module, GPTBlock) - - # Activation Checkpointing - def activation_checkpointing_fn(self, module): - return isinstance(module, GPTBlock) - - -class ComposerGPT(ComposerModel): - - def __init__(self, cfg, device='meta'): - super().__init__() - self.model = GPT(cfg, device=device) - self.train_metrics = { - 'LanguageCrossEntropy': LanguageCrossEntropy(cfg.vocab_size), - 'Perplexity': Perplexity(), - } - self.eval_metrics = { - 'LanguageCrossEntropy': LanguageCrossEntropy(cfg.vocab_size), - 'Perplexity': Perplexity(), - } - - def get_targets(self, batch): - targets = torch.roll(batch["labels"], shifts=-1) - targets[:, -1] = -100 - return targets - - def forward(self, batch): - return self.model(batch['input_ids'], - key_padding_mask=batch['attention_mask'].bool()) - - def eval_forward(self, batch, outputs=None): - return outputs if outputs is not None else self.forward(batch) - - def loss(self, outputs, batch): - targets = self.get_targets(batch) - return F.cross_entropy(outputs.view(-1, outputs.size(-1)), - targets.view(-1), - ignore_index=-100) - - def get_metrics(self, is_train=False): - return self.train_metrics if is_train else self.eval_metrics - - def update_metric(self, batch, outputs, metric): - outputs = outputs.view(-1, outputs.size(-1)) - targets = self.get_targets(batch).view(-1) - metric.update(outputs, targets) diff --git a/llm/llm/hf_flash_gpt.py b/llm/llm/hf_flash_gpt.py deleted file mode 100644 index 79acec75c..000000000 --- a/llm/llm/hf_flash_gpt.py +++ /dev/null @@ -1,1592 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Modified HF GPT2 w/flash attention""" - -from einops import rearrange -from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func - -import math -import os -from dataclasses import dataclass -from typing import Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.cuda.amp import autocast -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, - SequenceClassifierOutputWithPast, - TokenClassifierOutput, -) -from transformers.modeling_utils import PreTrainedModel, SequenceSummary -from transformers.pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer -from transformers.utils import ( - ModelOutput, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) -from transformers.utils.model_parallel_utils import assert_device_map, get_device_map -from transformers.models.gpt2.configuration_gpt2 import GPT2Config - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "gpt2" -_CONFIG_FOR_DOC = "GPT2Config" -_TOKENIZER_FOR_DOC = "GPT2Tokenizer" - -GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "gpt2", - "gpt2-medium", - "gpt2-large", - "gpt2-xl", - "distilgpt2", - # See all GPT-2 models at https://huggingface.co/models?filter=gpt2 -] - - -def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): - """Load tf checkpoints in a pytorch model""" - try: - import re - - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." 
- ) - raise - tf_path = os.path.abspath(gpt2_checkpoint_path) - logger.info(f"Converting TensorFlow checkpoint from {tf_path}") - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - logger.info(f"Loading TF weight {name} with shape {shape}") - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array.squeeze()) - - for name, array in zip(names, arrays): - name = name[6:] # skip "model/" - name = name.split("/") - pointer = model - for m_name in name: - if re.fullmatch(r"[A-Za-z]+\d+", m_name): - scope_names = re.split(r"(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] == "w" or scope_names[0] == "g": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "b": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "wpe" or scope_names[0] == "wte": - pointer = getattr(pointer, scope_names[0]) - pointer = getattr(pointer, "weight") - else: - pointer = getattr(pointer, scope_names[0]) - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - try: - assert ( - pointer.shape == array.shape - ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info(f"Initialize PyTorch weight {name}") - pointer.data = torch.from_numpy(array) - return model - - -class GPT2FlashAttention(nn.Module): - def __init__(self, config, is_cross_attention=False, layer_idx=None): - super().__init__() - - max_positions = config.max_position_embeddings - self.register_buffer( - "bias", - torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( - 1, 1, max_positions, max_positions - ), - ) - self.register_buffer("masked_bias", torch.tensor(-1e4)) - - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - self.split_size = self.embed_dim - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) - - self.scale_attn_weights = config.scale_attn_weights - self.is_cross_attention = is_cross_attention - - # Layer-wise attention scaling, reordering, and upcasting - self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx - self.layer_idx = layer_idx - self.reorder_and_upcast_attn = config.reorder_and_upcast_attn - - if self.is_cross_attention: - self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) - self.q_attn = Conv1D(self.embed_dim, self.embed_dim) - else: - self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) - self.c_proj = Conv1D(self.embed_dim, self.embed_dim) - - self.attn_dropout = nn.Dropout(config.attn_pdrop) - self.resid_dropout = nn.Dropout(config.resid_pdrop) - - self.pruned_heads = set() - - # FSDP Wrap function - def fsdp_wrap_fn(self, module): - return isinstance(module, GPT2FlashBlock) - - # Activation Checkpointing - def activation_checkpointing_fn(self, module): - return isinstance(module, GPT2FlashBlock) - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) - index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) - - # Prune conv1d layers - self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) - self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) - - # Update hyper params - self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) - self.num_heads = self.num_heads - len(heads) - self.pruned_heads = self.pruned_heads.union(heads) - - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - # rearrange to flash attention form - key = rearrange(key, 'b h s d -> b s h d') - value = rearrange(value, 'b h s d -> b s h d') - query = rearrange(query, 'b h s d -> b s h d') - - #assert query.dtype in [torch.float16, torch.bfloat16], f"{query.dtype}" - - # stack - qkv = torch.stack([query,key,value], dim=2) - #qkv = torch.tensor(qkv,dtype=torch.bfloat16) - assert qkv.dtype in [torch.float16, torch.bfloat16] - - # flash attention logic - batch_size = qkv.shape[0] - seqlen = qkv.shape[1] - num_heads = qkv.shape[3] - dk = qkv.shape[4] - dk_per_head = int(dk)/int(num_heads) - qkv = rearrange(qkv, 'b s ... -> (b s) ...') - max_s = seqlen - cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=qkv.device) - attn_pdrop = 0.1 - softmax_scale = 1/float(math.sqrt(dk)) - output = flash_attn_unpadded_qkvpacked_func( - qkv, cu_seqlens, max_s, attn_pdrop, - softmax_scale=softmax_scale, causal=True - ) - output = rearrange(output, '(b s) ... 
-> b s ...', b=batch_size) - output = rearrange(output, 'b s h d -> b h s d') - #output = torch.tensor(output, dtype=torch.float32) - return output, None - - - #attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - #if self.scale_attn_weights: - #attn_weights = attn_weights / torch.tensor( - #value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device - #) - - # Layer-wise attention scaling - #if self.scale_attn_by_inverse_layer_idx: - #attn_weights = attn_weights / float(self.layer_idx + 1) - - #if not self.is_cross_attention: - # if only "normal" attention layer implements causal mask - #query_length, key_length = query.size(-2), key.size(-2) - #causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) - #mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. - # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - #mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - #attn_weights = torch.where(causal_mask, attn_weights, mask_value) - - #if attention_mask is not None: - # Apply the attention mask - #attn_weights = attn_weights + attention_mask - - #attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise - #attn_weights = attn_weights.type(value.dtype) - #attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - #if head_mask is not None: - #attn_weights = attn_weights * head_mask - - #attn_output = torch.matmul(attn_weights, value) - - #return attn_output, attn_weights - - def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None): - # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM) - bsz, num_heads, q_seq_len, dk = query.size() - _, _, k_seq_len, _ = key.size() - - # Preallocate attn_weights for `baddbmm` - attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device) - - # Compute Scale Factor - scale_factor = 1.0 - if self.scale_attn_weights: - scale_factor /= float(value.size(-1)) ** 0.5 - - if self.scale_attn_by_inverse_layer_idx: - scale_factor /= float(self.layer_idx + 1) - - # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk)) - with autocast(enabled=False): - q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len) - attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor) - attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) - - if not self.is_cross_attention: - # if only "normal" attention layer implements causal mask - query_length, key_length = query.size(-2), key.size(-2) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() - mask_value = torch.finfo(attn_weights.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
- # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) - - if attention_mask is not None: - # Apply the attention mask - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise - if attn_weights.dtype != torch.float32: - raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32") - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - - return attn_output, attn_weights - - def _split_heads(self, tensor, num_heads, attn_head_size): - """ - Splits hidden_size dim into attn_head_size and num_heads - """ - new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) - tensor = tensor.view(new_shape) - return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) - - def _merge_heads(self, tensor, num_heads, attn_head_size): - """ - Merges attn_head_size dim and num_attn_heads dim into hidden_size - """ - tensor = tensor.permute(0, 2, 1, 3).contiguous() - new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) - return tensor.view(new_shape) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: - if encoder_hidden_states is not None: - if not hasattr(self, "q_attn"): - raise ValueError( - "If class is used as cross attention, the weights `q_attn` have to be defined. " - "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." 
- ) - - query = self.q_attn(hidden_states) - key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) - attention_mask = encoder_attention_mask - else: - query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) - - - if layer_past is not None: - past_key, past_value = layer_past - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - - if use_cache is True: - present = (key, value) - else: - present = None - - if self.reorder_and_upcast_attn: - attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) - else: - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - - attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) - attn_output = self.c_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weights,) - - return outputs # a, present, (attentions) - - -class GPT2MLP(nn.Module): - def __init__(self, intermediate_size, config): - super().__init__() - embed_dim = config.hidden_size - self.c_fc = Conv1D(intermediate_size, embed_dim) - self.c_proj = Conv1D(embed_dim, intermediate_size) - self.act = ACT2FN[config.activation_function] - self.dropout = nn.Dropout(config.resid_pdrop) - - def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor: - hidden_states = self.c_fc(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.c_proj(hidden_states) - hidden_states = self.dropout(hidden_states) - return hidden_states - - -class GPT2FlashBlock(nn.Module): - def __init__(self, config, layer_idx=None): - super().__init__() - hidden_size = config.hidden_size - inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size - - self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPT2FlashAttention(config, layer_idx=layer_idx) - self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - - if config.add_cross_attention: - self.crossattention = GPT2FlashAttention(config, is_cross_attention=True, layer_idx=layer_idx) - self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - - self.mlp = GPT2MLP(inner_dim, config) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]: - residual = hidden_states - hidden_states = self.ln_1(hidden_states) - attn_outputs = self.attn( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - attn_output = attn_outputs[0] # output_attn: a, present, (attentions) - outputs = attn_outputs[1:] - # residual connection - hidden_states = attn_output + residual - - if encoder_hidden_states is not None: - # add one 
self-attention block for cross-attention - if not hasattr(self, "crossattention"): - raise ValueError( - f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " - "cross-attention layers by setting `config.add_cross_attention=True`" - ) - residual = hidden_states - hidden_states = self.ln_cross_attn(hidden_states) - cross_attn_outputs = self.crossattention( - hidden_states, - attention_mask=attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - ) - attn_output = cross_attn_outputs[0] - # residual connection - hidden_states = residual + attn_output - outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights - - residual = hidden_states - hidden_states = self.ln_2(hidden_states) - feed_forward_hidden_states = self.mlp(hidden_states) - # residual connection - hidden_states = residual + feed_forward_hidden_states - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs # hidden_states, present, (attentions, cross_attentions) - - -class GPT2PreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = GPT2Config - load_tf_weights = load_tf_weights_in_gpt2 - base_model_prefix = "transformer" - is_parallelizable = True - supports_gradient_checkpointing = True - _no_split_modules = ["GPT2Block"] - - def __init__(self, *inputs, **kwargs): - super().__init__(*inputs, **kwargs) - - def param_init_fn(self, module): - self._init_weights(module) - - def _init_weights(self, module): - """Initialize the weights.""" - if isinstance(module, (nn.Linear, Conv1D)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: - # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale - # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. - # > -- GPT-2 :: https://openai.com/blog/better-language-models/ - # - # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py - for name, p in module.named_parameters(): - if name == "c_proj.weight": - # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block - p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, GPT2Model): - module.gradient_checkpointing = value - - -@dataclass -class GPT2DoubleHeadsModelOutput(ModelOutput): - """ - Base class for outputs of models predicting if two sentences are consecutive or not. 
- - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss. - mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided): - Multiple choice classification loss. - logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`): - Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). - past_key_values (`Tuple[Tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads, - sequence_length, embed_size_per_head)`). - - Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - GPT2Attentions weights after the attention softmax, used to compute the weighted average in the - self-attention heads. - """ - - loss: Optional[torch.FloatTensor] = None - mc_loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - mc_logits: torch.FloatTensor = None - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -GPT2_START_DOCSTRING = r""" - - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`GPT2Config`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -GPT2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): - `input_ids_length` = `sequence_length` if `past_key_values` is `None` else - `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input - sequence tokens in the vocabulary. - - If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as - `input_ids`. 
- - Indices can be obtained using [`GPT2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`): - Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see - `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have - their past given to this model should not be passed as `input_ids` as they have already been computed. - attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for - `past_key_values`. In other words, the `attention_mask` always has to have the length: - `len(past_key_values) + len(input_ids)` - - [What are attention masks?](../glossary#attention-mask) - token_type_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, - 1]`: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - - If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see - `past_key_values`). - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" -PARALLELIZE_DOCSTRING = r""" - This is an experimental feature and is a subject to change at a moment's notice. - - Uses a device map to distribute attention modules of the model across several devices. If no device map is given, - it will evenly distribute blocks across all devices. 
- - Args: - device_map (`Dict[int, list]`, optional, defaults to None): - A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always - automatically mapped to the first device (for esoteric reasons). That means that the first device should - have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the - following number of attention modules: - - - gpt2: 12 - - gpt2-medium: 24 - - gpt2-large: 36 - - gpt2-xl: 48 - - Example: - - ```python - # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules: - model = GPT2LMHeadModel.from_pretrained("gpt2-xl") - device_map = { - 0: [0, 1, 2, 3, 4, 5, 6, 7, 8], - 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], - 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34], - 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47], - } - model.parallelize(device_map) - ``` -""" -DEPARALLELIZE_DOCSTRING = r""" - Moves the model to cpu from a model parallel state. - - Example: - - ```python - # On a 4 GPU machine with gpt2-large: - model = GPT2LMHeadModel.from_pretrained("gpt2-large") - device_map = { - 0: [0, 1, 2, 3, 4, 5, 6, 7], - 1: [8, 9, 10, 11, 12, 13, 14, 15], - 2: [16, 17, 18, 19, 20, 21, 22, 23], - 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35], - } - model.parallelize(device_map) # Splits the model across several devices - model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() - ``` -""" - - -@add_start_docstrings( - "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", - GPT2_START_DOCSTRING, -) -class GPT2FlashModel(GPT2PreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - - def __init__(self, config): - super().__init__(config) - - self.embed_dim = config.hidden_size - - self.wte = nn.Embedding(config.vocab_size, self.embed_dim) - self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) - - self.drop = nn.Dropout(config.embd_pdrop) - self.h = nn.ModuleList([GPT2FlashBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)]) - self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - - # Model parallel - self.model_parallel = False - self.device_map = None - self.gradient_checkpointing = False - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - # Check validity of device_map - self.device_map = ( - get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map - ) - assert_device_map(self.device_map, len(self.h)) - self.model_parallel = True - self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) - self.last_device = "cuda:" + str(max(self.device_map.keys())) - self.wte = self.wte.to(self.first_device) - self.wpe = self.wpe.to(self.first_device) - # Load onto devices - for k, v in self.device_map.items(): - for block in v: - cuda_device = "cuda:" + str(k) - self.h[block] = self.h[block].to(cuda_device) - # ln_f to last - self.ln_f = self.ln_f.to(self.last_device) - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.model_parallel = False - self.device_map = None - self.first_device = "cpu" - self.last_device = "cpu" - self.wte = self.wte.to("cpu") - self.wpe = self.wpe.to("cpu") - for index in 
range(len(self.h)): - self.h[index] = self.h[index].to("cpu") - self.ln_f = self.ln_f.to("cpu") - torch.cuda.empty_cache() - - def get_input_embeddings(self): - return self.wte - - def set_input_embeddings(self, new_embeddings): - self.wte = new_embeddings - - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - """ - for layer, heads in heads_to_prune.items(): - self.h[layer].attn.prune_heads(heads) - - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPastAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - if position_ids is None: - position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - # GPT2Attention mask. - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - # We create a 3D attention mask from a 2D tensor mask. 
- # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - attention_mask = attention_mask[:, None, None, :] - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.add_cross_attention and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # head_mask has shape n_layer x batch x n_heads x N x N - head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - position_embeds = self.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds - - if token_type_ids is not None: - token_type_embeds = self.wte(token_type_ids) - hidden_states = hidden_states + token_type_embeds - - hidden_states = self.drop(hidden_states) - - output_shape = input_shape + (hidden_states.size(-1),) - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - - # Model parallel - if self.model_parallel: - torch.cuda.set_device(hidden_states.device) - # Ensure layer_past is on same device as hidden_states (might not be correct) - if layer_past is not None: - layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) - # Ensure that attention_mask is always on the same device as hidden_states - if attention_mask is not None: - attention_mask = attention_mask.to(hidden_states.device) - if isinstance(head_mask, torch.Tensor): - head_mask = head_mask.to(hidden_states.device) - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - None, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i], - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],) - - # Model Parallel: If it's the last layer for that device, put things on the next device - if self.model_parallel: - for k, v in self.device_map.items(): - if i == v[-1] and "cuda:" + str(k) != self.last_device: - hidden_states = hidden_states.to("cuda:" + str(k + 1)) - - hidden_states = self.ln_f(hidden_states) - - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] - if v is not None - ) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) - - -@add_start_docstrings( - """ - The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input - embeddings). 
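[Editor's note] On the weight tying mentioned in the docstring above: after `post_init()`, the LM head projection and the input embedding table are the same `Parameter` (the same fact motivates the FSDP-wrapping comments removed later in this diff). A minimal sketch, assuming `flash-attn` is installed (since `llm.hf_flash_gpt2` imports it at module level) and using the `GPT2FlashLMHeadModel` that module exports:

```python
# Minimal sketch of the lm_head <-> wte weight tying (assumes flash-attn is
# installed; the tiny config values are made up for illustration only).
from transformers.models.gpt2 import GPT2Config
from llm.hf_flash_gpt2 import GPT2FlashLMHeadModel

config = GPT2Config(n_layer=2, n_head=2, n_embd=64)
model = GPT2FlashLMHeadModel(config)

# post_init() ties the output projection to the embedding table, so both
# attributes reference the same Parameter object.
assert model.lm_head.weight is model.transformer.wte.weight
```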
- """, - GPT2_START_DOCSTRING, -) -class GPT2FlashLMHeadModel(GPT2PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.transformer = GPT2FlashModel(config) - self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) - - # Model parallel - self.model_parallel = False - self.device_map = None - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) - if device_map is None - else device_map - ) - assert_device_map(self.device_map, len(self.transformer.h)) - self.transformer.parallelize(self.device_map) - self.lm_head = self.lm_head.to(self.transformer.first_device) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.transformer.deparallelize() - self.transformer = self.transformer.to("cpu") - self.lm_head = self.lm_head.to("cpu") - self.model_parallel = False - torch.cuda.empty_cache() - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): - token_type_ids = kwargs.get("token_type_ids", None) - # only last token for inputs_ids if past is defined in kwargs - if past: - input_ids = input_ids[:, -1].unsqueeze(-1) - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -1].unsqueeze(-1) - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past: - position_ids = position_ids[:, -1].unsqueeze(-1) - else: - position_ids = None - return { - "input_ids": input_ids, - "past_key_values": past, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - } - - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=CausalLMOutputWithCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set - `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` - are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.transformer.first_device) - hidden_states = hidden_states.to(self.lm_head.weight.device) - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - cross_attentions=transformer_outputs.cross_attentions, - ) - - @staticmethod - def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - """ - return tuple( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) - for layer_past in past - ) - - -@add_start_docstrings( - """ -The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for -RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the -input embeddings, the classification head takes as input the input of a specified classification token index in the -input sequence). 
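[Editor's note] The `_reorder_cache` staticmethod a few lines above is easiest to see with a toy tensor: during beam search, `beam_idx` records which old beam each new beam continues from, and `index_select` shuffles the cached keys/values to match. A small illustration with invented shapes:

```python
# Toy illustration of the index_select re-ordering done by _reorder_cache.
import torch

num_beams, n_head, seq_len, head_dim = 3, 2, 4, 8
layer_past = (
    torch.randn(num_beams, n_head, seq_len, head_dim),  # cached keys
    torch.randn(num_beams, n_head, seq_len, head_dim),  # cached values
)
beam_idx = torch.tensor([2, 2, 0])  # new beams 0 and 1 both extend old beam 2

reordered = tuple(p.index_select(0, beam_idx) for p in layer_past)
assert torch.equal(reordered[0][0], layer_past[0][2])  # new beam 0 carries old beam 2's keys
```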
-""", - GPT2_START_DOCSTRING, -) -class GPT2DoubleHeadsModel(GPT2PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - config.num_labels = 1 - self.transformer = GPT2Model(config) - self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) - self.multiple_choice_head = SequenceSummary(config) - - # Model parallel - self.model_parallel = False - self.device_map = None - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings(PARALLELIZE_DOCSTRING) - def parallelize(self, device_map=None): - self.device_map = ( - get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) - if device_map is None - else device_map - ) - assert_device_map(self.device_map, len(self.transformer.h)) - self.transformer.parallelize(self.device_map) - self.lm_head = self.lm_head.to(self.transformer.first_device) - self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device) - self.model_parallel = True - - @add_start_docstrings(DEPARALLELIZE_DOCSTRING) - def deparallelize(self): - self.transformer.deparallelize() - self.transformer = self.transformer.to("cpu") - self.lm_head = self.lm_head.to("cpu") - self.multiple_choice_head = self.multiple_choice_head.to("cpu") - self.model_parallel = False - torch.cuda.empty_cache() - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): - token_type_ids = kwargs.get("token_type_ids", None) - # only last token for inputs_ids if past is defined in kwargs - if past: - input_ids = input_ids[:, -1].unsqueeze(-1) - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -1].unsqueeze(-1) - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past: - position_ids = position_ids[:, -1].unsqueeze(-1) - else: - position_ids = None - - return { - "input_ids": input_ids, - "past_key_values": past, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - } - - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - mc_token_ids: Optional[torch.LongTensor] = None, - labels: Optional[torch.LongTensor] = None, - mc_labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, GPT2DoubleHeadsModelOutput]: - r""" - mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, 
*optional*, default to index of the last token of the input): - Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) - - 1]`. - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set - `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to - `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]` - mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*): - Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` - where *num_choices* is the size of the second dimension of the input tensors. (see *input_ids* above) - - Return: - - Example: - - ```python - >>> import torch - >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel - - >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - >>> model = GPT2DoubleHeadsModel.from_pretrained("gpt2") - - >>> # Add a [CLS] to the vocabulary (we should train it also!) - >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"}) - >>> # Update the model embeddings with the new vocabulary size - >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) - - >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] - >>> encoded_choices = [tokenizer.encode(s) for s in choices] - >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] - - >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 - >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 - - >>> outputs = model(input_ids, mc_token_ids=mc_token_ids) - >>> lm_logits = outputs.logits - >>> mc_logits = outputs.mc_logits - ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - - # Set device for model parallelism - if self.model_parallel: - torch.cuda.set_device(self.transformer.first_device) - hidden_states = hidden_states.to(self.lm_head.weight.device) - - lm_logits = self.lm_head(hidden_states) - mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) - - mc_loss = None - if mc_labels is not None: - loss_fct = CrossEntropyLoss() - mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) - lm_loss = None - if labels is not None: - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - if not return_dict: - output = (lm_logits, mc_logits) + transformer_outputs[1:] - if mc_loss is not None: - output = (mc_loss,) + output - return ((lm_loss,) + output) if lm_loss is not None else output - - return GPT2DoubleHeadsModelOutput( - loss=lm_loss, - mc_loss=mc_loss, - logits=lm_logits, - mc_logits=mc_logits, - past_key_values=transformer_outputs.past_key_values, 
- hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - """ - return tuple( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) - for layer_past in past - ) - - -@add_start_docstrings( - """ - The GPT2 Model transformer with a sequence classification head on top (linear layer). - - [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-1) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - GPT2_START_DOCSTRING, -) -class GPT2ForSequenceClassification(GPT2PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.transformer = GPT2Model(config) - self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) - - # Model parallel - self.model_parallel = False - self.device_map = None - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="microsoft/DialogRPT-updown", - output_type=SequenceClassifierOutputWithPast, - config_class=_CONFIG_FOR_DOC, - expected_output="'LABEL_0'", - expected_loss=5.28, - ) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
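[Editor's note] The class docstring above explains that classification pools the logits of the last non-padding token; the `torch.ne(...).sum(-1) - 1` indexing used in the forward pass below is easy to sanity-check on a toy, right-padded batch (pad id 0 assumed for the example):

```python
# Toy check of the last-non-padding-token pooling used by the
# sequence-classification head (assumes right padding, pad_token_id = 0).
import torch

pad_token_id = 0
input_ids = torch.tensor([
    [5, 6, 7, 0, 0],  # 3 real tokens -> last real index 2
    [8, 9, 3, 4, 2],  # 5 real tokens -> last real index 4
])
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1
print(sequence_lengths.tolist())  # [2, 4]

logits = torch.randn(2, 5, 3)  # (batch, seq_len, num_labels)
pooled_logits = logits[torch.arange(2), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 3]) -- one pooled row per sequence
```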
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size, sequence_length = input_ids.shape[:2] - else: - batch_size, sequence_length = inputs_embeds.shape[:2] - - assert ( - self.config.pad_token_id is not None or batch_size == 1 - ), "Cannot handle batch sizes > 1 if no padding token is defined." - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 - else: - sequence_lengths = -1 - logger.warning( - f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " - "unexpected if using padding tokens in conjunction with `inputs_embeds.`" - ) - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -@add_start_docstrings( - """ - GPT2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for - Named-Entity-Recognition (NER) tasks. 
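[Editor's note] For the token-classification head described above, the key point is that it emits one label distribution per input token rather than one per sequence. A hypothetical usage sketch, shown with the upstream `transformers` class of the same name (the tokenizer download, label count, and example sentence are assumptions for illustration):

```python
# Hypothetical usage sketch: per-token logits from a GPT2 token-classification head.
import torch
from transformers import GPT2Config, GPT2ForTokenClassification, GPT2Tokenizer

config = GPT2Config(num_labels=5)           # 5 arbitrary entity labels
model = GPT2ForTokenClassification(config)  # randomly initialized; shape check only
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

inputs = tokenizer("PubMed abstracts mention aspirin", return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)  # (1, seq_len, 5): one label distribution per token
```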
- """, - GPT2_START_DOCSTRING, -) -class GPT2ForTokenClassification(GPT2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.transformer = GPT2Model(config) - if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: - classifier_dropout = config.classifier_dropout - elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: - classifier_dropout = config.hidden_dropout - else: - classifier_dropout = 0.1 - self.dropout = nn.Dropout(classifier_dropout) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - # Model parallel - self.model_parallel = False - self.device_map = None - - # Initialize weights and apply final processing - self.post_init() - - @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) - # fmt: off - @add_code_sample_docstrings( - processor_class=_TOKENIZER_FOR_DOC, - checkpoint="brad1141/gpt2-finetuned-comp2", - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - expected_loss=0.25, - expected_output=["Lead", "Lead", "Lead", "Position", "Lead", "Lead", "Lead", "Lead", "Lead", "Lead", "Lead", "Lead"], - ) - # fmt: on - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - hidden_states = self.dropout(hidden_states) - logits = self.classifier(hidden_states) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits,) + transformer_outputs[2:] - return ((loss,) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/llm/llm/hf_flash_gpt_2.py b/llm/llm/hf_flash_gpt2.py similarity index 75% rename from llm/llm/hf_flash_gpt_2.py rename to llm/llm/hf_flash_gpt2.py index bd095a221..87ca91f41 100644 --- a/llm/llm/hf_flash_gpt_2.py +++ b/llm/llm/hf_flash_gpt2.py @@ -15,7 +15,6 @@ # limitations under the License. """Modified HF GPT2 w/flash attention""" -import math import os from typing import Optional, Tuple, Union @@ -32,8 +31,7 @@ class GPT2FlashAttention(GPT2Attention): def __init__(self, config, is_cross_attention=False, layer_idx=None): super().__init__(config=config, is_cross_attention=is_cross_attention, layer_idx=layer_idx) - if self.reorder_and_upcast_attn: - raise ValueError('GPT2FlashAttention does not support reorder_and_upcast_attn') + self.attn_pdrop = config.attn_pdrop def _attn(self, query, key, value, attention_mask=None, head_mask=None): # rearrange to flash attention form @@ -41,34 +39,27 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None): value = rearrange(value, 'b h s d -> b s h d') query = rearrange(query, 'b h s d -> b s h d') - #assert query.dtype in [torch.float16, torch.bfloat16], f"{query.dtype}" - # stack qkv = torch.stack([query,key,value], dim=2) - #qkv = torch.tensor(qkv,dtype=torch.bfloat16) assert qkv.dtype in [torch.float16, torch.bfloat16] # flash attention logic batch_size = qkv.shape[0] seqlen = qkv.shape[1] - num_heads = qkv.shape[3] dk = qkv.shape[4] - dk_per_head = int(dk)/int(num_heads) qkv = rearrange(qkv, 'b s ... -> (b s) ...') max_s = seqlen cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=qkv.device) - if self.training: - attn_pdrop = 0.1 - else: - attn_pdrop = 0.0 - softmax_scale = 1/float(math.sqrt(dk)) + attn_pdrop = self.attn_pdrop if self.training else 0.0 + softmax_scale = (1.0 / (dk ** 0.5)) if self.scale_attn_weights else 1.0 + softmax_scale = (softmax_scale / float(self.layer_idx + 1)) if self.scale_attn_by_inverse_layer_idx else softmax_scale output = flash_attn_unpadded_qkvpacked_func( qkv, cu_seqlens, max_s, attn_pdrop, softmax_scale=softmax_scale, causal=True ) output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) output = rearrange(output, 'b s h d -> b h s d') - #output = torch.tensor(output, dtype=torch.float32) + return output, None @@ -124,24 +115,3 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - - # Special Case! 
When using the LMHeadModel, the weights of the self.lm_head and self.transformer.wte are tied. - # This tying occurs inside the `self.post_init()` function call above. - # This is a hurdle for FSDP because they need to be in the same FSDP block - # These lines ensures that both modules stay together in the top-most block - self.transformer._fsdp_wrap = False - self.transformer.wte._fsdp_wrap = False - self.lm_head._fsdp_wrap = False - - # Meta tensor param init fn - def param_init_fn(self, module): - if isinstance(module, GPT2LMHeadModel): - module.post_init() - - # FSDP Wrap function - def fsdp_wrap_fn(self, module): - return isinstance(module, GPT2Block) - - # Activation Checkpointing - def activation_checkpointing_fn(self, module): - return isinstance(module, GPT2Block) diff --git a/llm/main.py b/llm/main.py index 7125fb9d3..408816fc2 100644 --- a/llm/main.py +++ b/llm/main.py @@ -3,45 +3,28 @@ import os import sys +import warnings +from urllib.parse import urlparse from composer import Trainer from composer.callbacks import LRMonitor, MemoryMonitor, SpeedMonitor -from composer.loggers import ObjectStoreLogger, ProgressBarLogger, WandBLogger +from composer.loggers import WandBLogger from composer.optim import DecoupledAdamW -from torch.optim import AdamW from composer.optim.scheduler import (ConstantWithWarmupScheduler, - CosineAnnealingWithWarmupScheduler, LinearWithWarmupScheduler) -from composer.utils import S3ObjectStore, dist, reproducibility + CosineAnnealingWithWarmupScheduler) +from composer.utils import dist, reproducibility from omegaconf import OmegaConf as om -import wandb from llm.data_pubmed import build_dataloader from llm.gpt import ComposerGPT def build_logger(name, kwargs): - if name == 'progress_bar': - return ProgressBarLogger( - progress_bar=kwargs.get('progress_bar', True), - log_to_console=kwargs.get('log_to_console', True), - ) - elif name == 'wandb': + if name == 'wandb': return WandBLogger(**kwargs) - elif name == 's3': - object_store_logger = ObjectStoreLogger( - object_store_cls=S3ObjectStore, - object_store_kwargs=kwargs, - ) - return object_store_logger else: raise ValueError(f'Not sure how to build logger: {name}') -def build_object_store(name, kwargs): - if name == 's3': - return S3ObjectStore(**kwargs) - else: - raise ValueError(f'Not sure how to build object store: {name}') - def build_callback(name, kwargs): if name == 'lr_monitor': return LRMonitor() @@ -52,6 +35,18 @@ def build_callback(name, kwargs): else: raise ValueError(f'Not sure how to build callback: {name}') +def build_optimizer(cfg, model): + if cfg.name == 'decoupled_adamw': + return DecoupledAdamW( + model.parameters(), + lr=cfg.lr, + betas=cfg.betas, + eps=cfg.eps, + weight_decay=cfg.weight_decay + ) + else: + raise ValueError(f'Not sure how to build optimizer: {cfg.name}') + def build_scheduler(cfg): if cfg.name == 'constant_with_warmup': @@ -61,14 +56,11 @@ def build_scheduler(cfg): return CosineAnnealingWithWarmupScheduler( t_warmup=cfg.t_warmup, alpha_f=cfg.alpha_f) - elif cfg.name == 'linear_with_warmup': - return LinearWithWarmupScheduler( - t_warmup=cfg.t_warmup) else: raise ValueError(f'Not sure how to build scheduler: {cfg.name}') # Coming soon: this conversion math will be done inside Composer Trainer rather than entrypoint -def get_batch_size_info(cfg): +def update_batch_size_info(cfg): global_train_batch_size = cfg.global_train_batch_size device_train_batch_size = global_train_batch_size // dist.get_world_size() device_train_microbatch_size = cfg.device_train_microbatch_size 
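[Editor's note] Stepping back to the `GPT2FlashAttention._attn` rewrite earlier in this diff: the softmax scale now mirrors stock `GPT2Attention` behaviour (1/sqrt(d_head) when `scale_attn_weights`, further divided by `layer_idx + 1` when `scale_attn_by_inverse_layer_idx`), and attention dropout comes from `config.attn_pdrop` instead of a hard-coded 0.1. Worked numbers for the gpt-125m-biotok config (n_embd=768, n_head=12, both scaling flags true), as a sanity check rather than anything normative:

```python
# Worked example of the softmax_scale computed in GPT2FlashAttention._attn
# for the gpt-125m-biotok config (n_embd=768, n_head=12,
# scale_attn_weights=True, scale_attn_by_inverse_layer_idx=True).
dk = 768 // 12                       # per-head dimension = 64 (qkv.shape[4] in _attn)
base_scale = 1.0 / (dk ** 0.5)       # 0.125 = 1/sqrt(d_head)
for layer_idx in (0, 11):
    softmax_scale = base_scale / float(layer_idx + 1)
    print(layer_idx, softmax_scale)  # 0 -> 0.125, 11 -> ~0.0104
```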
@@ -88,76 +80,78 @@ def get_batch_size_info(cfg): raise ValueError( f'Not sure how to parse {device_train_microbatch_size=}') - return device_train_batch_size, device_train_grad_accum, device_eval_batch_size, device_eval_microbatch_size + cfg.n_gpus = dist.get_world_size() + cfg.device_train_batch_size = device_train_batch_size + cfg.device_train_grad_accum = device_train_grad_accum + cfg.device_eval_batch_size = device_eval_batch_size + cfg.device_eval_microbatch_size = device_eval_microbatch_size + return cfg + +def get_load_params(cfg): + load_path = cfg.get('load_path', None) + if load_path and load_path.startswith('wandb'): + url = urlparse(load_path) + entity, project = url.netloc.split(':') + load_object_store = WandBLogger(entity=entity, project=project) + return load_path, load_object_store + else: + return load_path, None +def log_config(cfg): + print(om.to_yaml(cfg)) + if 'wandb' in cfg.get('loggers', {}): + try: + import wandb + except ImportError as e: + raise e + if wandb.run: + wandb.config.update(om.to_container(cfg, resolve=True)) def main(cfg): - print("Training using config: ") - print(om.to_yaml(cfg)) reproducibility.seed_all(cfg.seed) + # Run Name + cfg.run_name = cfg.get('run_name', os.environ.get('COMPOSER_RUN_NAME', 'llm')) + + # Get batch size info + cfg = update_batch_size_info(cfg) + # Read FSDP Config as a dict fsdp_config = cfg.get('fsdp_config', None) fsdp_config = om.to_container(fsdp_config, resolve=True) if fsdp_config else None # Build Model - # For fast initialization, use `meta` device + # For fast initialization of MosaicGPT, use cfg.model.device='meta' print('Initializing model...') - device = 'meta' if fsdp_config else 'cuda' - model = ComposerGPT(cfg=cfg.model, device=device) - n_params = sum(p.numel() for p in model.parameters()) - print(f'{n_params=:.2e}') - - # Get batch size info - device_train_batch_size, device_train_grad_accum, device_eval_batch_size, device_eval_microbatch_size = get_batch_size_info(cfg) + warnings.filterwarnings(action='ignore', message='Torchmetrics v0.9 introduced a new argument class property') + model = ComposerGPT(cfg=cfg.model) + cfg.n_params = sum(p.numel() for p in model.parameters()) + print(f'{cfg.n_params=:.2e}') # Dataloaders print("Building train loader...") - train_loader = build_dataloader(cfg.train_loader, device_train_batch_size) + train_loader = build_dataloader(cfg.train_loader, cfg.device_train_batch_size) print("Building eval loader...") - eval_loader = build_dataloader(cfg.eval_loader, device_eval_batch_size) + eval_loader = build_dataloader(cfg.eval_loader, cfg.device_eval_batch_size) # Optimizer - if cfg.optimizer.name == 'adamw': - optimizer = AdamW( - model.parameters(), - lr=cfg.optimizer.lr, - betas=cfg.optimizer.betas, - eps=cfg.optimizer.eps, - weight_decay=cfg.optimizer.weight_decay) - elif cfg.optimizer.name == 'decoupled_adamw': - optimizer = DecoupledAdamW( - model.parameters(), - lr=cfg.optimizer.lr, - betas=cfg.optimizer.betas, - eps=cfg.optimizer.eps, - weight_decay=cfg.optimizer.weight_decay) - else: - raise ValueError(f'Requested unsupported optimizer: {cfg.optimizer.name}') - + optimizer = build_optimizer(cfg.optimizer, model) # Scheduler scheduler = build_scheduler(cfg.scheduler) # Loggers - loggers = [build_logger(name, logger_cfg) for name, logger_cfg in cfg.loggers.items()] + loggers = [build_logger(name, logger_cfg) for name, logger_cfg in cfg.get('loggers', {}).items()] # Callbacks - callbacks = [build_callback(name, callback_cfg) for name, callback_cfg in cfg.callbacks.items()] 
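[Editor's note] The batch-size bookkeeping that `update_batch_size_info` now writes back onto the config works out as in the sketch below. The per-device division is taken from the hunk above; the grad-accum step is inferred from the fields it populates (`device_train_grad_accum`, `device_train_microbatch_size`) rather than shown verbatim here, so treat that line as an assumption:

```python
# Worked example of the batch-size arithmetic, assuming 8 GPUs and the
# gpt-125m-biotok settings (global_train_batch_size=512) with a fixed
# device_train_microbatch_size of 16.
global_train_batch_size = 512
world_size = 8                         # dist.get_world_size()
device_train_microbatch_size = 16

device_train_batch_size = global_train_batch_size // world_size                    # 64 per GPU per step
device_train_grad_accum = device_train_batch_size // device_train_microbatch_size  # 4 (inferred relation)
print(device_train_batch_size, device_train_grad_accum)  # 64 4
```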
- - # (Optional) Load object store - load_object_store = cfg.get('load_object_store', None) - if load_object_store is not None: - name = list(load_object_store.keys())[0] - kwargs = load_object_store[name] - if name in ['s3']: - load_object_store = build_object_store(name, kwargs) - elif name in ['wandb']: - load_object_store = build_logger(name, kwargs) + callbacks = [build_callback(name, callback_cfg) for name, callback_cfg in cfg.get('callbacks', {}).items()] + + # Load object store + load_path, load_object_store = get_load_params(cfg) # Build the Trainer trainer = Trainer( - run_name=cfg.get('run_name', os.environ['COMPOSER_RUN_NAME']), + run_name=cfg.run_name, seed=cfg.seed, model=model, train_dataloader=train_loader, @@ -166,40 +160,34 @@ def main(cfg): schedulers=scheduler, max_duration=cfg.max_duration, eval_interval=cfg.eval_interval, + eval_subset_num_batches=cfg.eval_subset_num_batches, + progress_bar=cfg.progress_bar, + log_to_console=cfg.log_to_console, loggers=loggers, callbacks=callbacks, precision=cfg.precision, grad_clip_norm=cfg.grad_clip_norm, - grad_accum=device_train_grad_accum, + grad_accum=cfg.device_train_grad_accum, fsdp_config=fsdp_config, - checkpoint_save_path=cfg.get('checkpoint_save_path', None), - checkpoint_save_interval=cfg.get('checkpoint_save_interval', '1000ba'), - num_checkpoints_to_keep=cfg.get('num_checkpoints_to_keep', -1), - save_artifact_name=cfg.get('save_artifact_name', '{run_name}/checkpoints/ep{epoch}-ba{batch}-rank{rank}.pt'), - save_latest_artifact_name=cfg.get('save_latest_artifact_name', '{run_name}/checkpoints/latest-rank{rank}'), - load_path=cfg.get('load_path', None), + save_folder=cfg.get('save_folder', None), + save_interval=cfg.get('save_interval', '1000ba'), + save_num_checkpoints_to_keep=cfg.get('save_num_checkpoints_to_keep', -1), + load_path=load_path, load_object_store=load_object_store, load_weights_only=cfg.get('load_weights_only', False), ) print("Logging config...") - config_dict = om.to_container(cfg, resolve=True) - config_dict.update({ - 'n_gpus': dist.get_world_size(), - 'n_params': n_params, - 'device_train_batch_size': device_train_batch_size, - 'device_eval_batch_size': device_eval_batch_size, - 'device_eval_microbatch_size': device_eval_microbatch_size, - }) - if wandb.run is not None: - wandb.config.update(config_dict) + log_config(cfg) print("Starting training...") trainer.fit() if __name__ == '__main__': - conf_path = sys.argv[1] - with open(conf_path) as f: - cfg = om.load(f) + yaml_path, args_list = sys.argv[1], sys.argv[2:] + with open(yaml_path) as f: + yaml_cfg = om.load(f) + cli_cfg = om.from_cli(args_list) + cfg = om.merge(yaml_cfg, cli_cfg) main(cfg) diff --git a/llm/requirements.txt b/llm/requirements.txt index a0833e46b..0b26193eb 100644 --- a/llm/requirements.txt +++ b/llm/requirements.txt @@ -1,4 +1,7 @@ -mosaicml[streaming] @ git+https://github.com/mosaicml/composer@fsdp-alpha +torchvision<0.14 +torchtext<0.14 +torch<1.13 +mosaicml[streaming]==0.11.0 flash_attn @ git+https://github.com/HazyResearch/flash-attention.git@main transformers==4.21.3 datasets==2.4.0 diff --git a/llm/test_model.py b/llm/test_model.py new file mode 100644 index 000000000..00a40d144 --- /dev/null +++ b/llm/test_model.py @@ -0,0 +1,57 @@ +import torch +from composer.utils import reproducibility +from transformers import DataCollatorForLanguageModeling +from transformers.models.gpt2 import GPT2Config +from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel +from transformers.models.gpt2.tokenization_gpt2 import 
GPT2Tokenizer + +from llm.hf_flash_gpt2 import GPT2FlashLMHeadModel + + +def test_fwd_bkw(config_path, autocast_device, autocast_dtype): + reproducibility.seed_all(42) + + # Build both models + shared_config = GPT2Config.from_json_file(config_path) + non_flash_model = GPT2LMHeadModel(shared_config) + flash_model = GPT2FlashLMHeadModel(shared_config) + + # Initialize with same parameters + non_flash_state_dict = non_flash_model.state_dict() + flash_model.load_state_dict(non_flash_state_dict) + + # Fake inputs + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + tokenizer.add_special_tokens({'pad_token': '[PAD]'}) + fake_sample = tokenizer('Here is a fake sample of length 8') + collate_fn = DataCollatorForLanguageModeling(tokenizer, mlm=False) + fake_batch = collate_fn([fake_sample]) + + + # Move to device + non_flash_model = non_flash_model.to(autocast_device) + flash_model = flash_model.to(autocast_device) + fake_batch = { + k: v.to(autocast_device) + for k, v in fake_batch.items() + } + print (fake_batch) + + # Compare outputs + with torch.autocast(device_type=autocast_device, dtype=autocast_dtype): + non_flash_outputs = non_flash_model(**fake_batch).logits + flash_outputs = flash_model(**fake_batch).logits + + print ('#'*20) + print ('OUTPUTS') + print (non_flash_outputs) + print (flash_outputs) + print (torch.allclose(flash_outputs, non_flash_outputs, atol=5e-02)) + + + +config_path = './hf_configs/tests/gpt-125m-ctx-1024-no-dropout.json' + +autocast_device = 'cuda' +autocast_dtype = torch.bfloat16 +test_fwd_bkw(config_path, autocast_device, autocast_dtype) diff --git a/llm/yamls/final/gpt-125m-biotok.yaml b/llm/yamls/final/gpt-125m-biotok.yaml new file mode 100644 index 000000000..e70a71f78 --- /dev/null +++ b/llm/yamls/final/gpt-125m-biotok.yaml @@ -0,0 +1,106 @@ +data_remote: &data_remote s3://crfm-pubmed/pubmed-randomized +data_local: &data_local /tmp/mds-cache/pubmed-randomized +max_seq_len: &max_seq_len 1024 +tokenizer_name: &tokenizer_name stanford-crfm/pubmed_gpt_tokenizer + +# Run Name +run_name: gpt-125m + +# Model +model: + hf_config: hf_configs/final/gpt-125m-biotok.json + flash_attn: true + +# Dataloaders +train_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: train + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +eval_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: val + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: false + drop_last: false + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +# Optimization +scheduler: + name: cosine_with_warmup + t_warmup: 100ba + alpha_f: 0.1 + +optimizer: + name: decoupled_adamw + lr: 6.0e-4 + betas: + - 0.9 + - 0.95 + eps: 1.0e-08 + weight_decay: 6.0e-5 + +max_duration: 100000ba +eval_interval: 5000ba +eval_subset_num_batches: 5000 +global_train_batch_size: 512 +grad_clip_norm: 1.0 + +# System +seed: 17 +device_train_microbatch_size: auto +# device_train_microbatch_size: 16 +precision: bf16 + +# FSDP +fsdp_config: + sharding_strategy: FULL_SHARD + min_params: 1e9 + mixed_precision: FULL + activation_checkpointing: false + activation_cpu_offload: false + verbose: true + +# Logging +progress_bar: true +log_to_console: true + +callbacks: + speed_monitor: + window_size: 10 + 
lr_monitor: {} + memory_monitor: {} + +loggers: + wandb: {} + +# Checkpoint to local filesystem or remote object store +save_interval: 5000ba +save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK +save_folder: ./{run_name}/checkpoints +# save_folder: s3://crfm-pubmed/checkpoints/{run_name}/checkpoints + +# Load from local filesystem or remote object store +# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt diff --git a/llm/yamls/final/gpt-1b-biotok.yaml b/llm/yamls/final/gpt-1b-biotok.yaml new file mode 100644 index 000000000..d7f54f33f --- /dev/null +++ b/llm/yamls/final/gpt-1b-biotok.yaml @@ -0,0 +1,107 @@ +data_remote: &data_remote s3://crfm-pubmed/pubmed-randomized +data_local: &data_local /tmp/mds-cache/pubmed-randomized +max_seq_len: &max_seq_len 1024 +tokenizer_name: &tokenizer_name stanford-crfm/pubmed_gpt_tokenizer + +# Run Name +run_name: gpt-1b + +# Model +model: + hf_config: hf_configs/final/gpt-1b-biotok.json + flash_attn: true + +# Dataloaders +train_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: train + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +eval_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: val + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: false + drop_last: false + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +# Optimization +scheduler: + name: cosine_with_warmup + t_warmup: 100ba + alpha_f: 0.1 + +optimizer: + name: decoupled_adamw + lr: 2.0e-4 + betas: + - 0.9 + - 0.95 + eps: 1.0e-08 + weight_decay: 2.0e-5 + +max_duration: 100000ba +eval_interval: 5000ba +eval_subset_num_batches: 5000 +global_train_batch_size: 512 +grad_clip_norm: 1.0 + +# System +seed: 17 +# device_train_microbatch_size: auto +device_train_microbatch_size: 16 +precision: bf16 + +# FSDP +fsdp_config: + sharding_strategy: FULL_SHARD + min_params: 1e9 + mixed_precision: FULL + activation_checkpointing: true + activation_cpu_offload: false + verbose: true + +# Logging +progress_bar: true +log_to_console: true + +callbacks: + speed_monitor: + window_size: 10 + lr_monitor: {} + memory_monitor: {} + +loggers: + wandb: {} + +# Checkpoint to local filesystem or remote object store +save_interval: 5000ba +save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK +save_folder: ./{run_name}/checkpoints +# save_folder: s3://crfm-pubmed/checkpoints/{run_name}/checkpoints + +# Load from local filesystem or remote object store +# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: ./artifacts/1b-biotok-flash-fsdp-gpus-64-NbbmPS.checkpoints.ep0-ba5000-rank0.pt:v0/ep0-ba5000-rank0.pt \ No newline at end of file diff --git a/llm/yamls/final/gpt-3b-biotok-300k.yaml b/llm/yamls/final/gpt-3b-biotok-300k.yaml new file mode 100644 index 000000000..d583c3aa5 --- /dev/null +++ b/llm/yamls/final/gpt-3b-biotok-300k.yaml @@ -0,0 +1,107 @@ +data_remote: &data_remote s3://crfm-pubmed/pubmed-randomized +data_local: &data_local /tmp/mds-cache/pubmed-randomized +max_seq_len: &max_seq_len 1024 +tokenizer_name: 
&tokenizer_name stanford-crfm/pubmed_gpt_tokenizer + +# Run Name +run_name: gpt-3b + +# Model +model: + hf_config: hf_configs/final/gpt-3b-biotok.json + flash_attn: true + +# Dataloaders +train_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: train + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +eval_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: val + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: false + drop_last: false + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +# Optimization +scheduler: + name: cosine_with_warmup + t_warmup: 100ba + alpha_f: 0.1 + +optimizer: + name: decoupled_adamw + lr: 1.6e-4 + betas: + - 0.9 + - 0.95 + eps: 1.0e-08 + weight_decay: 1.6e-5 + +max_duration: 300000ba +eval_interval: 5000ba +eval_subset_num_batches: 1000 +global_train_batch_size: 1024 +grad_clip_norm: 1.0 + +# System +seed: 17 +# device_train_microbatch_size: auto +device_train_microbatch_size: 8 +precision: bf16 + +# FSDP +fsdp_config: + sharding_strategy: FULL_SHARD + min_params: 1e9 + mixed_precision: FULL + activation_checkpointing: true + activation_cpu_offload: false + verbose: true + +# Logging +progress_bar: true +log_to_console: true + +callbacks: + speed_monitor: + window_size: 10 + lr_monitor: {} + memory_monitor: {} + +loggers: + wandb: {} + +# Checkpoint to local filesystem or remote object store +save_interval: 5000ba +save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK +save_folder: ./{run_name}/checkpoints +# save_folder: s3://crfm-pubmed/checkpoints/{run_name}/checkpoints + +# Load from local filesystem or remote object store +# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: ./artifacts/1b-biotok-flash-fsdp-gpus-64-NbbmPS.checkpoints.ep0-ba5000-rank0.pt:v0/ep0-ba5000-rank0.pt \ No newline at end of file diff --git a/llm/yamls/final/gpt-5b-biotok-200k.yaml b/llm/yamls/final/gpt-5b-biotok-200k.yaml new file mode 100644 index 000000000..cdf292c4e --- /dev/null +++ b/llm/yamls/final/gpt-5b-biotok-200k.yaml @@ -0,0 +1,107 @@ +data_remote: &data_remote s3://crfm-pubmed/pubmed-randomized +data_local: &data_local /tmp/mds-cache/pubmed-randomized +max_seq_len: &max_seq_len 1024 +tokenizer_name: &tokenizer_name stanford-crfm/pubmed_gpt_tokenizer + +# Run Name +run_name: gpt-5b + +# Model +model: + hf_config: hf_configs/final/gpt-5b-biotok.json + flash_attn: true + +# Dataloaders +train_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: train + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +eval_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: val + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: false + drop_last: false + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +# Optimization +scheduler: + name: cosine_with_warmup + t_warmup: 100ba + 
alpha_f: 0.1 + +optimizer: + name: decoupled_adamw + lr: 1.4e-4 + betas: + - 0.9 + - 0.95 + eps: 1.0e-08 + weight_decay: 1.4e-5 + +max_duration: 200000ba +eval_interval: 5000ba +eval_subset_num_batches: 1000 +global_train_batch_size: 1024 +grad_clip_norm: 1.0 + +# System +seed: 17 +# device_train_microbatch_size: auto +device_train_microbatch_size: 8 +precision: bf16 + +# FSDP +fsdp_config: + sharding_strategy: FULL_SHARD + min_params: 1e9 + mixed_precision: FULL + activation_checkpointing: true + activation_cpu_offload: false + verbose: true + +# Logging +progress_bar: true +log_to_console: true + +callbacks: + speed_monitor: + window_size: 10 + lr_monitor: {} + memory_monitor: {} + +loggers: + wandb: {} + +# Checkpoint to local filesystem or remote object store +save_interval: 5000ba +save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK +save_folder: ./{run_name}/checkpoints +# save_folder: s3://crfm-pubmed/checkpoints/{run_name}/checkpoints + +# Load from local filesystem or remote object store +# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: ./artifacts/1b-biotok-flash-fsdp-gpus-64-NbbmPS.checkpoints.ep0-ba5000-rank0.pt:v0/ep0-ba5000-rank0.pt \ No newline at end of file diff --git a/llm/yamls/final/gpt-7b-biotok-150k.yaml b/llm/yamls/final/gpt-7b-biotok-150k.yaml new file mode 100644 index 000000000..dd4a89f89 --- /dev/null +++ b/llm/yamls/final/gpt-7b-biotok-150k.yaml @@ -0,0 +1,107 @@ +data_remote: &data_remote s3://crfm-pubmed/pubmed-randomized +data_local: &data_local /tmp/mds-cache/pubmed-randomized +max_seq_len: &max_seq_len 1024 +tokenizer_name: &tokenizer_name stanford-crfm/pubmed_gpt_tokenizer + +# Run Name +run_name: gpt-7b + +# Model +model: + hf_config: hf_configs/final/gpt-7b-biotok.json + flash_attn: true + +# Dataloaders +train_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: train + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: true + drop_last: true + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +eval_loader: + dataset: + name: streaming_pubmed + remote: *data_remote + local: *data_local + split: val + tokenizer_name: *tokenizer_name + max_seq_len: *max_seq_len + group_method: concat + shuffle: false + drop_last: false + num_workers: 8 + pin_memory: true + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + +# Optimization +scheduler: + name: cosine_with_warmup + t_warmup: 100ba + alpha_f: 0.1 + +optimizer: + name: decoupled_adamw + lr: 1.2e-4 + betas: + - 0.9 + - 0.95 + eps: 1.0e-08 + weight_decay: 1.2e-5 + +max_duration: 150000ba # 300B tokens +eval_interval: 5000ba +eval_subset_num_batches: 1000 +global_train_batch_size: 2048 +grad_clip_norm: 1.0 + +# System +seed: 17 +# device_train_microbatch_size: auto +device_train_microbatch_size: 4 +precision: bf16 + +# FSDP +fsdp_config: + sharding_strategy: FULL_SHARD + min_params: 1e9 + mixed_precision: FULL + activation_checkpointing: true + activation_cpu_offload: false + verbose: true + +# Logging +progress_bar: true +log_to_console: true + +callbacks: + speed_monitor: + window_size: 10 + lr_monitor: {} + memory_monitor: {} + +loggers: + wandb: {} + +# Checkpoint to local filesystem or remote object store +save_interval: 5000ba +save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK +save_folder: 
./{run_name}/checkpoints +# save_folder: s3://crfm-pubmed/checkpoints/{run_name}/checkpoints + +# Load from local filesystem or remote object store +# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: ./artifacts/1b-biotok-flash-fsdp-gpus-64-NbbmPS.checkpoints.ep0-ba5000-rank0.pt:v0/ep0-ba5000-rank0.pt \ No newline at end of file diff --git a/llm/yamls/gpt-125m-demo.yaml b/llm/yamls/old/gpt-125m-demo.yaml similarity index 100% rename from llm/yamls/gpt-125m-demo.yaml rename to llm/yamls/old/gpt-125m-demo.yaml diff --git a/llm/yamls/gpt-125m.yaml b/llm/yamls/old/gpt-125m.yaml similarity index 100% rename from llm/yamls/gpt-125m.yaml rename to llm/yamls/old/gpt-125m.yaml diff --git a/llm/yamls/gpt-13b.yaml b/llm/yamls/old/gpt-13b.yaml similarity index 100% rename from llm/yamls/gpt-13b.yaml rename to llm/yamls/old/gpt-13b.yaml diff --git a/llm/yamls/gpt-1b.yaml b/llm/yamls/old/gpt-1b.yaml similarity index 100% rename from llm/yamls/gpt-1b.yaml rename to llm/yamls/old/gpt-1b.yaml diff --git a/llm/yamls/gpt-30b.yaml b/llm/yamls/old/gpt-30b.yaml similarity index 100% rename from llm/yamls/gpt-30b.yaml rename to llm/yamls/old/gpt-30b.yaml diff --git a/llm/yamls/gpt-350m.yaml b/llm/yamls/old/gpt-350m.yaml similarity index 100% rename from llm/yamls/gpt-350m.yaml rename to llm/yamls/old/gpt-350m.yaml diff --git a/llm/yamls/gpt-3b.yaml b/llm/yamls/old/gpt-3b.yaml similarity index 100% rename from llm/yamls/gpt-3b.yaml rename to llm/yamls/old/gpt-3b.yaml diff --git a/llm/yamls/gpt-70b.yaml b/llm/yamls/old/gpt-70b.yaml similarity index 100% rename from llm/yamls/gpt-70b.yaml rename to llm/yamls/old/gpt-70b.yaml diff --git a/llm/yamls/gpt-760m.yaml b/llm/yamls/old/gpt-760m.yaml similarity index 100% rename from llm/yamls/gpt-760m.yaml rename to llm/yamls/old/gpt-760m.yaml diff --git a/llm/yamls/gpt-7b.yaml b/llm/yamls/old/gpt-7b.yaml similarity index 100% rename from llm/yamls/gpt-7b.yaml rename to llm/yamls/old/gpt-7b.yaml diff --git a/llm/yamls/gpt-mistral-125m-demo.yaml b/llm/yamls/old/gpt-mistral-125m-demo.yaml similarity index 100% rename from llm/yamls/gpt-mistral-125m-demo.yaml rename to llm/yamls/old/gpt-mistral-125m-demo.yaml diff --git a/llm/yamls/gpt-mistral-125m.yaml b/llm/yamls/old/gpt-mistral-125m.yaml similarity index 100% rename from llm/yamls/gpt-mistral-125m.yaml rename to llm/yamls/old/gpt-mistral-125m.yaml diff --git a/llm/yamls/pubmed-gpt-125m.yaml b/llm/yamls/old/pubmed-gpt-125m.yaml similarity index 100% rename from llm/yamls/pubmed-gpt-125m.yaml rename to llm/yamls/old/pubmed-gpt-125m.yaml diff --git a/llm/yamls/pubmed-gpt-350m.yaml b/llm/yamls/old/pubmed-gpt-350m.yaml similarity index 100% rename from llm/yamls/pubmed-gpt-350m.yaml rename to llm/yamls/old/pubmed-gpt-350m.yaml diff --git a/llm/yamls/pubmed-gpt-3b.yaml b/llm/yamls/old/pubmed-gpt-3b.yaml similarity index 100% rename from llm/yamls/pubmed-gpt-3b.yaml rename to llm/yamls/old/pubmed-gpt-3b.yaml diff --git a/llm/yamls/pubmed-mistral-gpt-125m.yaml b/llm/yamls/old/pubmed-mistral-gpt-125m.yaml similarity index 100% rename from llm/yamls/pubmed-mistral-gpt-125m.yaml rename to llm/yamls/old/pubmed-mistral-gpt-125m.yaml
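[Editor's note] A closing note on how these YAMLs are consumed: the new `__main__` block in main.py loads the chosen YAML and merges `key=value` command-line arguments on top of it via OmegaConf, so any field above (run_name, max_duration, data_remote, ...) can be overridden at launch without editing the file. A minimal sketch of that merge behaviour, with made-up override values:

```python
# Minimal sketch of the YAML + CLI merge performed in main.py's __main__ block
# (override values here are invented for illustration).
from omegaconf import OmegaConf as om

yaml_cfg = om.load('yamls/final/gpt-125m-biotok.yaml')               # base config
cli_cfg = om.from_cli(['max_duration=10ba', 'run_name=smoke-test'])  # dotlist-style overrides
cfg = om.merge(yaml_cfg, cli_cfg)                                    # later configs win on conflicts

print(cfg.run_name)      # 'smoke-test' (was 'gpt-125m' in the YAML)
print(cfg.max_duration)  # '10ba'       (was '100000ba')
```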