From a2b202034f6cebee217aaa726ba760c10358283b Mon Sep 17 00:00:00 2001 From: jaimemcc <99298642+jaimemcc-intel@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:29:50 -0800 Subject: [PATCH 01/17] Patch coverity scan (#1090) * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml update build command to avert empty cwd in build metrics * Update coverity_scan.yml * Update coverity_scan.yml adding verbose to debug curl * Update coverity_scan.yml debug print trace to examine build metrics xml * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml * Update coverity_scan.yml * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- .github/workflows/coverity_scan.yml | 39 ++++++++++++++++++----------- configs/neox_arguments.md | 8 +++--- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/.github/workflows/coverity_scan.yml b/.github/workflows/coverity_scan.yml index 1a5a420a1..a79d0d8fb 100644 --- a/.github/workflows/coverity_scan.yml +++ b/.github/workflows/coverity_scan.yml @@ -23,29 +23,38 @@ jobs: steps: - uses: actions/checkout@v2 + with: + path: gpt-neox - name: Install utils run: | - apt update -y && apt upgrade -y - apt install curl jq wget -y + sudo apt update -y && sudo apt upgrade -y + sudo apt install curl jq wget -y - name: Coverity Download run: | - wget https://scan.coverity.com/download/linux64 --post-data "token=$COVERITY_TOKEN&project=EleutherAI%2Fgpt-neox" -O coverity_tool.tgz - $GITHUB_WORKSPACE/bin/cov-configure --python - $GITHUB_WORKSPACE/bin/cov-configure --gcc + wget https://scan.coverity.com/download/linux64 --post-data "token=$COVERITY_TOKEN&project=$COVERITY_PROJECT" -O coverity_tool.tgz --no-verbose + mkdir $GITHUB_WORKSPACE/coverity && tar xvf coverity_tool.tgz -C $GITHUB_WORKSPACE/coverity --strip-components=1 + $GITHUB_WORKSPACE/coverity/bin/cov-configure --python + $GITHUB_WORKSPACE/coverity/bin/cov-configure --gcc - - name: Coverity Scan + - name: Coverity Scan and Upload run: | set -x - $GITHUB_WORKSPACE/bin/cov-build --dir cov-int --no-command --fs-capture-search $GITHUB_WORKSPACE - - - name: Coverity Upload - run: | + pushd $GITHUB_WORKSPACE + cd $GITHUB_WORKSPACE/gpt-neox + $GITHUB_WORKSPACE/coverity/bin/cov-build --dir $GITHUB_WORKSPACE/cov-int --no-command --fs-capture-search ./ + popd tar caf build-results.bz2 cov-int - curl --form token=$COV_PASSPHRASE \ + curl --form token=$COVERITY_TOKEN \ --form email=$COV_USER \ - --form file=@GITHUB_WORKSPACE/build-results.bz2 \ - --form version="Version" \ - --form description="Build" \ - https://scan.coverity.com/builds?project=EleutherAI%2Fgpt-neox + --form file=@build-results.bz2 \ + --form version="${{ inputs.build_version }}" \ + --form description="${{ inputs.build_description }}" \ + https://scan.coverity.com/builds?project=$COVERITY_PROJECT + + - name: Upload Scan Build as Artifact + uses: actions/upload-artifact@v3 + with: + name: coverity-build-${{ github.sha }} + path: build-results.bz2 diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index bc2e8fc57..8188a79f6 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 2da1083 + Default = efaee8d current git hash of 
repository @@ -800,7 +800,7 @@ Misc. Arguments -- **do_train**: int +- **do_train**: bool Default = None @@ -808,7 +808,7 @@ Misc. Arguments -- **do_valid**: int +- **do_valid**: bool Default = None @@ -816,7 +816,7 @@ Misc. Arguments -- **do_test**: int +- **do_test**: bool Default = None From 050f560e8d0878d3e0af9d8cdffec02d4fdc9401 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Wed, 6 Dec 2023 15:42:27 -0500 Subject: [PATCH 02/17] Corrects FLOPs formula as per 1093 (#1094) * Update logging.py * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions --- configs/neox_arguments.md | 2 +- megatron/logging.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 8188a79f6..6a9d02c9d 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = efaee8d + Default = bb1b145 current git hash of repository diff --git a/megatron/logging.py b/megatron/logging.py index 3a40864b5..afde680a5 100644 --- a/megatron/logging.py +++ b/megatron/logging.py @@ -92,17 +92,15 @@ def get_flops(neox_args, iter_time_s) -> float: hidden_size = neox_args.hidden_size num_layers = neox_args.num_layers ckpt_activations_factor = 4 if neox_args.checkpoint_activations else 3 - flops_calc1 = ( + flops_per_iteration = ( 24 * ckpt_activations_factor * batch_size * seq_len * num_layers * (hidden_size**2) - * (1.0 + (seq_len / (6.0 * hidden_size))) + * (1.0 + (seq_len / (6.0 * hidden_size)) + (vocab_size / (16.0 * num_layers * hidden_size))) ) - flops_calc2 = vocab_size / (16.0 * num_layers * hidden_size) - flops_per_iteration = flops_calc1 + flops_calc2 return flops_per_iteration / (iter_time_s * world_size) From f19b2eca1e0eb52c1307a2a9e671bd3b6ba76db6 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Tue, 19 Dec 2023 16:11:33 -0500 Subject: [PATCH 03/17] Update CODEOWNERS Remove myself as a code owner as I shouldn't be approving PRs. --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f4555efb4..3cb082e80 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @EleutherAI/pm-gptneo +* @Quentin-Anthony From 07166da70870424cdf6d7971a75207bb2763fdbd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Dec 2023 14:56:54 -0800 Subject: [PATCH 04/17] Bump transformers from 4.30.2 to 4.36.0 in /requirements (#1097) Bumps [transformers](https://github.com/huggingface/transformers) from 4.30.2 to 4.36.0. - [Release notes](https://github.com/huggingface/transformers/releases) - [Commits](https://github.com/huggingface/transformers/compare/v4.30.2...v4.36.0) --- updated-dependencies: - dependency-name: transformers dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 914664c03..a94bf9c21 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -12,4 +12,4 @@ sentencepiece six tiktoken>=0.1.2 tokenizers>=0.12.1 -transformers==4.30.2 +transformers==4.36.0 From 9283effb37e1b72acef265d17cbe37881441e88c Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Wed, 20 Dec 2023 17:57:37 -0500 Subject: [PATCH 05/17] Pins old DeeperSpeed until bug is fixed (#1095) * Pins old DeeperSpeed until bug is fixed There is a bug in upstream DeepSpeed detailed [here](https://github.com/microsoft/DeepSpeed/issues/4781) that we didn't catch before synching with main. This pins the prior commit so the bug doesn't impact users. * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions --- configs/neox_arguments.md | 2 +- requirements/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 6a9d02c9d..6003d15cc 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = bb1b145 + Default = a279fc8 current git hash of repository diff --git a/requirements/requirements.txt b/requirements/requirements.txt index a94bf9c21..137da4d81 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,5 +1,5 @@ best_download -git+https://github.com/EleutherAI/DeeperSpeed.git#egg=deepspeed +git+https://github.com/EleutherAI/DeeperSpeed.git@b9260436e7da3e297fc6bedfd27d9e69fbba6f5c#egg=deepspeed ftfy>=6.0.1 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 From 9eef954432cdfae128a9fd77e0faf91e8804f2fa Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Fri, 22 Dec 2023 03:36:50 -0500 Subject: [PATCH 06/17] Update README.md --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 528447835..28bdbafa8 100644 --- a/README.md +++ b/README.md @@ -507,10 +507,12 @@ GPT-NeoX has been used by academic and industry researchers for a variety of hig EleutherAI and our collaborators have used it in the following publications: - Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, McDonell, Jason Phang, Michael Pieler, Prashanth, Shivanshu Purohit, Laria Reynolds, Jon Tow, Ben Wang, and Samuel Weinbach. "[GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models* (2022). - Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O’Brien, Eric Hallahan, Mohammad Aflah Khan et al. "[Pythia: A suite for analyzing large language models across training and scaling](https://arxiv.org/abs/2304.01373)." In _International Conference on Machine Learning_, pp. 2397-2430. PMLR (2023). - - Zhangir Azerbayev, Bartosz Piotrowski, Hailey Schoelkopf, Edward W. Ayers, Dragomir Radev, and Jeremy Avigad. "[Proofnet: Autoformalizing and formally proving undergraduate-level mathematics](https://arxiv.org/abs/2302.12433). *arXiv preprint arXiv:2302.12433* (2023). 
+ - Zhangir Azerbayev, Bartosz Piotrowski, **Hailey Schoelkopf**, Edward W. Ayers, Dragomir Radev, and Jeremy Avigad. "[Proofnet: Autoformalizing and formally proving undergraduate-level mathematics](https://arxiv.org/abs/2302.12433). *arXiv preprint arXiv:2302.12433* (2023). - Stella Biderman, USVSN Sai Prashanth, Lintang Sutawika, Hailey Schoelkopf, Quentin Anthony, Shivanshu Purohit, and Edward Raff. "[Emergent and predictable memorization in large language models.](https://arxiv.org/abs/2304.11158)" *arXiv preprint arXiv:2304.11158* (2023). - Hyunwoong Ko, Kichang Yang, Minho Ryu, Taekyoon Choi, Seungmu Yang, and Sungho Park. "[A Technical Report for Polyglot-Ko: Open-Source Large-Scale Korean Language Models](https://arxiv.org/abs/2306.02254)." *arXiv preprint arXiv:2306.02254* (2023). - - Kshitij Gupta, Benjamin Thérien, Adam Ibrahim, Mats Leon Richter, Quentin Anthony, Eugene Belilovsky, Irina Rish, and Timothée Lesort. "[Continual Pre-Training of Large Language Models: How to re-warm your model?](https://arxiv.org/abs/2308.04014)" In _Workshop on Efficient Systems for Foundation Models @ ICML_ (2023). + - Kshitij Gupta, Benjamin Thérien, Adam Ibrahim, Mats Leon Richter, **Quentin Anthony**, Eugene Belilovsky, Irina Rish, and Timothée Lesort. "[Continual Pre-Training of Large Language Models: How to re-warm your model?](https://arxiv.org/abs/2308.04014)" In _Workshop on Efficient Systems for Foundation Models @ ICML_ (2023). + - **Zhangir Azerbayev**, **Hailey Schoelkopf**, Keiran Paster, Marco Dos Santos, Stephen McAleer, Albert Q Jiang, Jia Deng, **Stella Biderman**, and Sean Welleck. "[Llemma: An open language model for mathematics]([https://arxiv.org/abs/2308.04014](https://arxiv.org/abs/2310.10631))" In _Math-AI Workshop @ NeurIPS_ (2023). + - Alexander Havrilla, Maksym Zhuravinskyi, Duy Phung, Aman Tiwari, Jonathan Tow, **Stella Biderman**, **Quentin Anthony**, and **Louis Castricato**. "[trlX: A Framework for Large Scale Reinforcement Learning from Human Feedback](https://aclanthology.org/2023.emnlp-main.530/)." _Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing_, 2023. ### External Publications The following publications by other research groups use this library: @@ -526,6 +528,12 @@ The following publications by other research groups use this library: - Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." _arXiv:2310.01119_ (2023). - Alon Albalak, Liangming Pan, Colin Raffel, and William Yang Wang. "[Efficient Online Data Mixing For Language Model Pre-Training](https://alon-albalak.github.io/images/Online_Data_Mixing.pdf)." _preprint_ (2023). - Eghbal A. Hosseini and Evelina Fedorenko. "[Large language models implicitly learn to straighten neural sentence trajectories to construct a predictive representation of natural language](https://www.biorxiv.org/content/10.1101/2023.11.05.564832v1)." _bioRxiv_ (2023). +- Junqi Yin, Sajal Dash, Feiyi Wang, and Mallikarjun Shankar. "[FORGE: Pre-Training Open Foundation Models for Science](https://dl.acm.org/doi/abs/10.1145/3581784.3613215). _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, 1-13, 2023. +- Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." _arXiv preprint arXiv:2310.01119_, 2023. 
+- Peng Di, Jianguo Li, Hang Yu, Wei Jiang, Wenting Cai, Yang Cao, Chaoyu Chen, Dajun Chen, Hongwei Chen, Liang Chen, Gang Fan, Jie Gong, Zi Gong, Wen Hu, Tingting Guo, Zhichao Lei, Ting Li, Zheng Li, Ming Liang, Cong Liao, Bingchang Liu, Jiachen Liu, Zhiwei Liu, Shaojun Lu, Min Shen, Guangpei Wang, Huan Wang, Zhi Wang, Zhaogui Xu, Jiawei Yang, Qing Ye, Gehao Zhang, Yu Zhang, Zelin Zhao, Xunjin Zheng, Hailian Zhou, Lifu Zhu, and Xianying Zhu. "[CodeFuse-13B: A Pretrained Multi-lingual Code Large Language Model](https://arxiv.org/abs/2310.06266)." _arXiv preprint arXiv:2310.06266_, 2023. +- Nikitha Rao, Kush Jain, Uri Alon, Claire Le Goues, and Vincent J Hellendoorn. "[CAT-LM Training Language Models on Aligned Code And Tests](https://arxiv.org/abs/2310.01602)." _38th IEEE/ACM International Conference on Automated Software Engineering (ASE)_, pp. 409-420. IEEE, 2023. + + ### Models The following models were trained using this library: From a48e09e6e60409d3b49b553912d57406e0585e0f Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Fri, 22 Dec 2023 03:37:05 -0500 Subject: [PATCH 07/17] Update README.md --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 528447835..28bdbafa8 100644 --- a/README.md +++ b/README.md @@ -507,10 +507,12 @@ GPT-NeoX has been used by academic and industry researchers for a variety of hig EleutherAI and our collaborators have used it in the following publications: - Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, McDonell, Jason Phang, Michael Pieler, Prashanth, Shivanshu Purohit, Laria Reynolds, Jon Tow, Ben Wang, and Samuel Weinbach. "[GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models* (2022). - Stella Biderman, Hailey Schoelkopf, Quentin Anthony, Herbie Bradley, Kyle O’Brien, Eric Hallahan, Mohammad Aflah Khan et al. "[Pythia: A suite for analyzing large language models across training and scaling](https://arxiv.org/abs/2304.01373)." In _International Conference on Machine Learning_, pp. 2397-2430. PMLR (2023). - - Zhangir Azerbayev, Bartosz Piotrowski, Hailey Schoelkopf, Edward W. Ayers, Dragomir Radev, and Jeremy Avigad. "[Proofnet: Autoformalizing and formally proving undergraduate-level mathematics](https://arxiv.org/abs/2302.12433). *arXiv preprint arXiv:2302.12433* (2023). + - Zhangir Azerbayev, Bartosz Piotrowski, **Hailey Schoelkopf**, Edward W. Ayers, Dragomir Radev, and Jeremy Avigad. "[Proofnet: Autoformalizing and formally proving undergraduate-level mathematics](https://arxiv.org/abs/2302.12433). *arXiv preprint arXiv:2302.12433* (2023). - Stella Biderman, USVSN Sai Prashanth, Lintang Sutawika, Hailey Schoelkopf, Quentin Anthony, Shivanshu Purohit, and Edward Raff. "[Emergent and predictable memorization in large language models.](https://arxiv.org/abs/2304.11158)" *arXiv preprint arXiv:2304.11158* (2023). - Hyunwoong Ko, Kichang Yang, Minho Ryu, Taekyoon Choi, Seungmu Yang, and Sungho Park. "[A Technical Report for Polyglot-Ko: Open-Source Large-Scale Korean Language Models](https://arxiv.org/abs/2306.02254)." *arXiv preprint arXiv:2306.02254* (2023). - - Kshitij Gupta, Benjamin Thérien, Adam Ibrahim, Mats Leon Richter, Quentin Anthony, Eugene Belilovsky, Irina Rish, and Timothée Lesort. 
"[Continual Pre-Training of Large Language Models: How to re-warm your model?](https://arxiv.org/abs/2308.04014)" In _Workshop on Efficient Systems for Foundation Models @ ICML_ (2023). + - Kshitij Gupta, Benjamin Thérien, Adam Ibrahim, Mats Leon Richter, **Quentin Anthony**, Eugene Belilovsky, Irina Rish, and Timothée Lesort. "[Continual Pre-Training of Large Language Models: How to re-warm your model?](https://arxiv.org/abs/2308.04014)" In _Workshop on Efficient Systems for Foundation Models @ ICML_ (2023). + - **Zhangir Azerbayev**, **Hailey Schoelkopf**, Keiran Paster, Marco Dos Santos, Stephen McAleer, Albert Q Jiang, Jia Deng, **Stella Biderman**, and Sean Welleck. "[Llemma: An open language model for mathematics]([https://arxiv.org/abs/2308.04014](https://arxiv.org/abs/2310.10631))" In _Math-AI Workshop @ NeurIPS_ (2023). + - Alexander Havrilla, Maksym Zhuravinskyi, Duy Phung, Aman Tiwari, Jonathan Tow, **Stella Biderman**, **Quentin Anthony**, and **Louis Castricato**. "[trlX: A Framework for Large Scale Reinforcement Learning from Human Feedback](https://aclanthology.org/2023.emnlp-main.530/)." _Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing_, 2023. ### External Publications The following publications by other research groups use this library: @@ -526,6 +528,12 @@ The following publications by other research groups use this library: - Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." _arXiv:2310.01119_ (2023). - Alon Albalak, Liangming Pan, Colin Raffel, and William Yang Wang. "[Efficient Online Data Mixing For Language Model Pre-Training](https://alon-albalak.github.io/images/Online_Data_Mixing.pdf)." _preprint_ (2023). - Eghbal A. Hosseini and Evelina Fedorenko. "[Large language models implicitly learn to straighten neural sentence trajectories to construct a predictive representation of natural language](https://www.biorxiv.org/content/10.1101/2023.11.05.564832v1)." _bioRxiv_ (2023). +- Junqi Yin, Sajal Dash, Feiyi Wang, and Mallikarjun Shankar. "[FORGE: Pre-Training Open Foundation Models for Science](https://dl.acm.org/doi/abs/10.1145/3581784.3613215). _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, 1-13, 2023. +- Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." _arXiv preprint arXiv:2310.01119_, 2023. +- Peng Di, Jianguo Li, Hang Yu, Wei Jiang, Wenting Cai, Yang Cao, Chaoyu Chen, Dajun Chen, Hongwei Chen, Liang Chen, Gang Fan, Jie Gong, Zi Gong, Wen Hu, Tingting Guo, Zhichao Lei, Ting Li, Zheng Li, Ming Liang, Cong Liao, Bingchang Liu, Jiachen Liu, Zhiwei Liu, Shaojun Lu, Min Shen, Guangpei Wang, Huan Wang, Zhi Wang, Zhaogui Xu, Jiawei Yang, Qing Ye, Gehao Zhang, Yu Zhang, Zelin Zhao, Xunjin Zheng, Hailian Zhou, Lifu Zhu, and Xianying Zhu. "[CodeFuse-13B: A Pretrained Multi-lingual Code Large Language Model](https://arxiv.org/abs/2310.06266)." _arXiv preprint arXiv:2310.06266_, 2023. +- Nikitha Rao, Kush Jain, Uri Alon, Claire Le Goues, and Vincent J Hellendoorn. "[CAT-LM Training Language Models on Aligned Code And Tests](https://arxiv.org/abs/2310.01602)." _38th IEEE/ACM International Conference on Automated Software Engineering (ASE)_, pp. 409-420. IEEE, 2023. 
+ + ### Models The following models were trained using this library: From 613e5a62a491aded6d7a7f95eb38d49f066862ff Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 22 Dec 2023 08:38:04 +0000 Subject: [PATCH 08/17] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 6003d15cc..a5210bb52 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = a279fc8 + Default = a48e09e current git hash of repository From be7eeda60341f9e39354990d2629a5b8bec2fd4d Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Fri, 22 Dec 2023 03:56:47 -0500 Subject: [PATCH 09/17] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 28bdbafa8..1a3ea819e 100644 --- a/README.md +++ b/README.md @@ -523,6 +523,7 @@ The following publications by other research groups use this library: - Eghbal A. Hosseini, Martin A. Schrimpf, Yian Zhang, Samuel Bowman, Noga Zaslavsky, and Evelina Fedorenko. "[Artificial neural network language models align neurally and behaviorally with humans even after a developmentally realistic amount of training.](https://www.biorxiv.org/content/10.1101/2022.10.04.510681)" _BioRxiv_ (2022). - Byung-Doh Oh and William Schuler. "[Transformer-Based LM Surprisal Predicts Human Reading Times Best with About Two Billion Training Tokens](https://arxiv.org/abs/2304.11389)." *arXiv preprint arXiv:2304.11389* (2023). - Ta-Chung Chi, Ting-Han Fan, Alexander Rudnicky, and Peter Ramadge. "[Dissecting Transformer Length Extrapolation via the Lens of Receptive Field Analysis](https://aclanthology.org/2023.acl-long.756/)." In _Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)_, pp. 13522-13537 (2023). +- Ta-Chung Chi, Ting-Han Fan, Li-Wei Chen, Alexander Rudnicky, and Peter Ramadge. "[Latent Positional Information is in the Self-Attention Variance of Transformer Language Models Without Positional Embeddings](https://aclanthology.org/2023.acl-short.102/)." In _Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)_, pp. 13522-13537 (2023). - Xidong Feng, Yicheng Luo, Ziyan Wang, Hongrui Tang, Mengyue Yang, Kun Shao, David Mguni, Yali Du, and Jun Wang. "[ChessGPT: Bridging Policy Learning and Language Modeling.](https://arxiv.org/abs/2306.09200)" _arXiv preprint arXiv:2306.09200_ (2023). - Orion Walker Dollar, Sameera Horawalavithana, Scott Vasquez, W. James Pfaendtner, and Svitlana Volkova. "[MolJET: Multimodal Joint Embedding Transformer for Conditional de novo Molecular Design and Multi-Property Optimization.](https://openreview.net/pdf?id=7UudBVsIrr)" _preprint_ (2023). - Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." _arXiv:2310.01119_ (2023). 
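For reference, the FLOPs correction in PATCH 02/17 above folds the vocabulary-projection term into the single parenthesized factor instead of adding it as a separate term. A minimal standalone sketch of the corrected per-iteration estimate, using illustrative argument names rather than the actual `neox_args` fields:

```python
def flops_per_iteration(batch_size, seq_len, num_layers, hidden_size,
                        vocab_size, checkpoint_activations=False):
    # 4 when activation checkpointing recomputes the forward pass, else 3
    ckpt_activations_factor = 4 if checkpoint_activations else 3
    return (
        24
        * ckpt_activations_factor
        * batch_size
        * seq_len
        * num_layers
        * hidden_size**2
        * (
            1.0
            + seq_len / (6.0 * hidden_size)
            + vocab_size / (16.0 * num_layers * hidden_size)
        )
    )

# Dividing by (iteration time in seconds * world size), as the patched
# get_flops does, yields the achieved FLOPs per second per GPU.
```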
From 2117afcf00aa8b92eb3dce5ae5f4405176b4e25a Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Fri, 22 Dec 2023 13:01:06 -0500 Subject: [PATCH 10/17] Update README.md --- README.md | 190 +++++++++++++++++++++++++++++------------------------- 1 file changed, 101 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index 28bdbafa8..eaaaad55e 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ # GPT-NeoX -This repository records [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs. Our current framework is based on NVIDIA's [Megatron Language Model](https://github.com/NVIDIA/Megatron-LM) and has been augmented with techniques from [DeepSpeed](https://www.deepspeed.ai) as well as some novel optimizations. We aim to make this repo a centralized and accessible place to gather techniques for training large-scale autoregressive language models, and accelerate research into large-scale training. This library is in widespread use in [academic, industry, and government labs](https://github.com/EleutherAI/gpt-neox#adoption-and-publications), including by researchers at Oak Ridge National Lab, CarperAI, Stability AI, Carnegie Mellon University, and the University of Tokyo. Uniquely among similar libraries GPT-NeoX supports a wide variety of systems and hardwares, including launching via Slurm, MPI, and the IBM Job Step Manager, and has been run at scale on [AWS](https://aws.amazon.com/), [CoreWeave](https://www.coreweave.com/), [ORNL Summit](https://www.olcf.ornl.gov/summit/), [ORNL Frontier](https://www.olcf.ornl.gov/frontier/), [LUMI](https://www.lumi-supercomputer.eu/), and others. +This repository records [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs. Our current framework is based on NVIDIA's [Megatron Language Model](https://github.com/NVIDIA/Megatron-LM) and has been augmented with techniques from [DeepSpeed](https://www.deepspeed.ai) as well as some novel optimizations. We aim to make this repo a centralized and accessible place to gather techniques for training large-scale autoregressive language models, and accelerate research into large-scale training. This library is in widespread use in [academic, industry, and government labs](https://github.com/EleutherAI/gpt-neox#adoption-and-publications), including by researchers at Oak Ridge National Lab, CarperAI, Stability AI, Together.ai, Korea University, Carnegie Mellon University, and the University of Tokyo among others. Uniquely among similar libraries GPT-NeoX supports a wide variety of systems and hardwares, including launching via Slurm, MPI, and the IBM Job Step Manager, and has been run at scale on [AWS](https://aws.amazon.com/), [CoreWeave](https://www.coreweave.com/), [ORNL Summit](https://www.olcf.ornl.gov/summit/), [ORNL Frontier](https://www.olcf.ornl.gov/frontier/), [LUMI](https://www.lumi-supercomputer.eu/), and others. **If you are not looking to train models with billions of parameters from scratch, this is likely the wrong library to use. 
For generic inference needs, we recommend you use the Hugging Face `transformers` library instead which supports GPT-NeoX models.** @@ -13,7 +13,7 @@ GPT-NeoX leverages many of the same features and technologies as the popular Meg * Distributed training with ZeRO and 3D parallelism * A wide variety of systems and hardwares, including launching via Slurm, MPI, and the IBM Job Step Manager, and has been run at scale on [AWS](https://aws.amazon.com/), [CoreWeave](https://www.coreweave.com/), [ORNL Summit](https://www.olcf.ornl.gov/summit/), [ORNL Frontier](https://www.olcf.ornl.gov/frontier/), [LUMI](https://www.lumi-supercomputer.eu/), and others. * Cutting edge architectural innovations including rotary and alibi positional embeddings, parallel feedforward attention layers, and flash attention. -* Predefined configurations for popular architectures including Pythia, PaLM, Falcon, and LLaMA 1 & 2 +* Predefined configurations for popular architectures including Pythia, PaLM, Falcon, and LLaMA 1 \& 2 * Curriculum Learning * Easy connections with the open source ecosystem, including Hugging Face's [tokenizers](https://github.com/huggingface/tokenizers) and [transformers](https://github.com/huggingface/transformers/) libraries, logging via [WandB](https://wandb.ai/site), and evaluation via our [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). @@ -39,27 +39,43 @@ Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherA # Contents -* [Quick Start](#quick-start) +- [GPT-NeoX](#gpt-neox) + * [Why GPT-NeoX?](#why-gpt-neox) + * [News](#news) + * [Versions](#versions) +- [Contents](#contents) +- [Quick Start](#quick-start) * [Environment and Dependencies](#environment-and-dependencies) + + [Host Setup](#host-setup) + + [Flash Attention](#flash-attention) + + [Multi-Node Launching](#multi-node-launching) + + [Containerized Setup](#containerized-setup) * [Usage](#usage) -* [Configuration](#configuration) -* [Datasets](#datasets) +- [Configuration](#configuration) +- [Datasets](#datasets) * [Preconfigured Datasets](#preconfigured-datasets) * [Using Custom Data](#using-custom-data) -* [Training and Finetuning](#training-and-finetuning) - * [Select Pretrained Models](#pretrained-models) - * [GPT-NeoX-20B](#gpt-neox-20b) - * [Pythia](#pythia) - * [Polyglot](#polyglot) -* [Inference](#inference) -* [Evaluation](#evaluation) -* [Exporting to Hugging Face](#exporting-to-hugging-face) -* [Monitoring](#monitoring) - * [Weights & Biases](#wandb) +- [Training and Finetuning](#training-and-finetuning) + * [Pretrained Models](#pretrained-models) + + [GPT-NeoX-20B](#gpt-neox-20b) + + [Pythia](#pythia) + + [Polyglot](#polyglot) +- [Inference](#inference) +- [Evaluation](#evaluation) +- [Exporting to Hugging Face](#exporting-to-hugging-face) +- [Monitoring](#monitoring) + * [Weights and Biases](#weights-and-biases) * [TensorBoard](#tensorboard) -* [Administrative Notes](#administrative-notes) +- [Running on multi-node](#running-on-multi-node) +- [Adoption and Publications](#adoption-and-publications) + * [Publications](#publications) + * [Models](#models) + + [English LLMs](#english-llms) + + [Non-English LLMs](#non-english-llms) + + [Code Models](#code-models) + + [Other Modalities](#other-modalities) +- [Administrative Notes](#administrative-notes) * [Citing GPT-NeoX](#citing-gpt-neox) - * [Adoption and Publications](#adoption-and-publications) * [Licensing](#licensing) * [Acknowledgements](#acknowledgements) @@ -452,7 +468,7 @@ Note, however, that this 
compatibility is not one-to-one, and only certain confi In addition to storing logs locally, we provide built-in support for two popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site) and [TensorBoard](https://www.tensorflow.org/tensorboard/) -
Weights & Biases
+## Weights and Biases EleutherAI is currently using [Weights & Biases to record our experiments](https://wandb.ai/eleutherai/neox). If you are logged into Weights & Biases on your machine—you can do this by executing `wandb login`—your runs will automatically be recorded. There are two optional fields associated with Weights & Biases: wandb_group allows you to name the run group and wandb_team allows you to assign your runs to an organization or team account. @@ -464,6 +480,73 @@ We also support using TensorBoard via the tensorboard-dir Date: Fri, 22 Dec 2023 18:01:19 +0000 Subject: [PATCH 11/17] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index a5210bb52..c05606e05 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = a48e09e + Default = 2117afc current git hash of repository From f161245b7c3848811f7e8e092977fbaeea12d283 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 23 Dec 2023 01:05:34 +0700 Subject: [PATCH 12/17] Add QK Normalization (#1100) * add qk normalization * Update NeoXArgs docs automatically * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- configs/neox_arguments.md | 10 +++++++++- megatron/model/transformer.py | 15 +++++++++++++++ megatron/neox_arguments/neox_args.py | 5 +++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 6003d15cc..722756d6f 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = a279fc8 + Default = 1fc0521 current git hash of repository @@ -261,6 +261,14 @@ Model Arguments +- **use_qk_layernorm**: bool + + Default = False + + Use QK Normalization + + + - **layernorm_epsilon**: float Default = 1e-05 diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 63f4122e2..195e57925 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -284,6 +284,16 @@ def __init__( neox_args.num_attention_heads, world_size ) self.pos_emb = neox_args.pos_emb + self.use_qk_layernorm = neox_args.use_qk_layernorm + if self.use_qk_layernorm: + norm, eps = get_norm(neox_args) + self.qk_layernorm = norm( + [ + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ], + eps=eps, + ) # Strided linear layer. self.query_key_value = mpu.ColumnParallelLinear( @@ -639,6 +649,11 @@ def forward(self, hidden_states, attention_mask, layer_past=None): mixed_x_layer, 3 ) + # QK Normalization https://arxiv.org/abs/2302.05442 + if self.use_qk_layernorm: + query_layer = self.qk_layernorm(query_layer) + key_layer = self.qk_layernorm(key_layer) + if exists(self.rotary_emb): if exists(self.rotary_ndims): # partial rotary diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index 324a379d4..2cfed465d 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -125,6 +125,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". """ + use_qk_layernorm: bool = False + """ + Use QK Normalization + """ + layernorm_epsilon: float = 1.0e-5 """ Layer norm epsilon. 
From 7fb3b3c79bc460c12310af042e2ab9883e964af9 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Fri, 22 Dec 2023 13:07:32 -0500 Subject: [PATCH 13/17] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index eaaaad55e..a8f6f7c1e 100644 --- a/README.md +++ b/README.md @@ -577,8 +577,6 @@ To cite the 20 billion parameter model named `GPT-NeoX-20B`, please use } ``` -Citation instructions for other pretrained models can be found [in the appropriate repository](#pretrained-models). - ## Licensing This repository hosts code that is part of EleutherAI's GPT-NeoX project. Copyright (c) 2021, EleutherAI. Licensed under the Apache License: From a7509f0e076152036ce5f3e534a153ff2022c718 Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Fri, 22 Dec 2023 13:14:14 -0500 Subject: [PATCH 14/17] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a8f6f7c1e..a4724c882 100644 --- a/README.md +++ b/README.md @@ -526,7 +526,7 @@ The following models were trained using this library: - Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1) - Carnegie Mellon University's [proofGPT (1.3B and 6.7B)](https://huggingface.co/hoskinson-center/proofGPT-v0.1-6.7B) - Dampish's [StellarX (2.8B and 4B)](https://huggingface.co/Dampish/StellarX-4B-V0.2) -- Oak Ridge National Lab's [FORGE (26B)](https://dl.acm.org/doi/10.1145/3581784.3613215) +- Oak Ridge National Lab's [FORGE (26B)](https://github.com/at-aaims/forge) ### Non-English LLMs - EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean) From 4d5a8115752b342aa922cf406dae4d13a7a056c0 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 22 Dec 2023 18:15:21 +0000 Subject: [PATCH 15/17] Update NeoXArgs docs automatically --- configs/neox_arguments.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 4dacafa0a..06656d3d8 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 2117afc + Default = 8eaac4e current git hash of repository From b27e409cf6c3b78a073681d8a312b1c1477fd703 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 22 Dec 2023 21:16:58 -0500 Subject: [PATCH 16/17] Lm eval 0.4.0 support (#1101) * add lm-eval v0.4.0 * rename evaluate.py to avoid shadowing HF evaluate library * document new evaluate.py filename * Update NeoXArgs docs automatically * handle results format differently * Update NeoXArgs docs automatically * Update hanging evaluate.py scripts * Update NeoXArgs docs automatically * Add triviaqa to default eval_tasks * Update NeoXArgs docs automatically --------- Co-authored-by: github-actions Co-authored-by: Quentin Anthony --- README.md | 4 +- configs/neox_arguments.md | 2 +- evaluate.py => eval.py | 0 eval_tasks/eval_adapter.py | 143 +++++++++++++++++++-------- megatron/neox_arguments/arguments.py | 4 +- megatron/training.py | 4 +- requirements/requirements.txt | 2 +- 7 files changed, 112 insertions(+), 47 deletions(-) rename evaluate.py => eval.py (100%) diff --git a/README.md b/README.md index 490bc94aa..d9f431d20 100644 --- a/README.md +++ b/README.md @@ -238,7 +238,7 @@ All functionality should be launched using `deepy.py`, a wrapper around the `dee We currently offer three main functions: 1. 
`train.py` is used for training and finetuning models. -2. `evaluate.py` is used to evaluate a trained model using the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness). +2. `eval.py` is used to evaluate a trained model using the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness). 3. `generate.py` is used to sample text from a trained model. which can be launched with: @@ -435,7 +435,7 @@ GPT-NeoX supports evaluation on downstream tasks through the [language model eva To evaluate a trained model on the evaluation harness, simply run: ```bash -python ./deepy.py evaluate.py -d configs your_configs.yml --eval_tasks task1 task2 ... taskn +python ./deepy.py eval.py -d configs your_configs.yml --eval_tasks task1 task2 ... taskn ``` where `--eval_tasks` is a list of evaluation tasks followed by spaces, e.g `--eval_tasks lambada hellaswag piqa sciq`. For details of all tasks available, refer to the [lm-evaluation-harness repo](https://github.com/EleutherAI/lm-evaluation-harness). diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 06656d3d8..0c0f88e5b 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = 8eaac4e + Default = 79befef current git hash of repository diff --git a/evaluate.py b/eval.py similarity index 100% rename from evaluate.py rename to eval.py diff --git a/eval_tasks/eval_adapter.py b/eval_tasks/eval_adapter.py index e0a32797d..15dcdb1f2 100644 --- a/eval_tasks/eval_adapter.py +++ b/eval_tasks/eval_adapter.py @@ -13,19 +13,8 @@ # limitations under the License. from megatron.utils import is_local_main, print_rank_0 -import best_download - -# patch best_download (eval harness downloader) to only happen on the first local rank -fn = best_download.download_file - - -def _download_file(*args, **kwargs): - if is_local_main(): - fn(*args, **kwargs) - - -best_download.download_file = _download_file +import copy import os import sys import dataclasses @@ -38,13 +27,13 @@ def _download_file(*args, **kwargs): import torch import torch.nn.functional as F -from lm_eval.models.gpt2 import GPT2LM -from lm_eval import tasks, evaluator, utils, base +from lm_eval.models.huggingface import HFLM +from lm_eval import tasks, evaluator, utils, api from megatron.text_generation_utils import generate_samples_from_prompt from megatron import mpu -class EvalHarnessAdapter(GPT2LM): +class EvalHarnessAdapter(HFLM): """ An adapter to run NeoX models on LM Evaluation Harness (https://github.com/EleutherAI/lm-evaluation-harness) tasks. 
@@ -56,13 +45,13 @@ class EvalHarnessAdapter(GPT2LM): """ def __init__(self, model, forward_step_fn, neox_args, batch_size=None): - self.cache_hook = base.CacheHook(None) - self.model = model + self.cache_hook = api.model.CacheHook(None) + self._model = model self.neox_args = neox_args self.tokenizer = neox_args.tokenizer self._device = torch.device(f"cuda:{neox_args.local_rank}") self._eot_token_id = neox_args.tokenizer.eod_id - self._max_length = neox_args.max_position_embeddings // 2 + self._max_length = neox_args.max_position_embeddings self._max_gen_toks = 128 self._vocab_size = neox_args.padded_vocab_size @@ -94,8 +83,6 @@ def __init__(self, model, forward_step_fn, neox_args, batch_size=None): generate_samples_from_prompt, neox_args=neox_args, model=model, - maximum_tokens=self._max_gen_toks, - temperature=0.0, ) @property @@ -123,15 +110,23 @@ def batch_size(self): def device(self): return self._device - def tok_encode(self, string: str): + @property + def rank(self): + return 0 + + @property + def world_size(self): + return 1 + + def tok_encode(self, string: str, **kwargs): return self.tokenizer.encode(string) - def tok_decode(self, tokens): + def tok_decode(self, tokens, **kwargs): return self.tokenizer.decode(tokens) - def greedy_until(self, requests): + def generate_until(self, requests): """ - Greedy until is lm_eval harness' way to say "do greedy generation" - necessary for some tasks. + Generate until is lm_eval harness' way to say "do greedy generation" - necessary for some tasks. the eval harness dispatches requests to the model, and the model does argmax generation, the results of which are returned to the eval harness to evaluate. @@ -143,19 +138,46 @@ def greedy_until(self, requests): self.model.module.inference_mode(use_cache=True) # tell model to cache kv pairs res = [] + # get only the args from each Instance object + reqs = [req.args for req in requests] + def _collate(x): toks = self.tokenizer.encode(x[0]) return (len(toks), x[0]) - reord = utils.Reorderer(requests, _collate) - for context, until in tqdm(reord.get_reordered(), "Running greedy generation"): - if isinstance(until, str): - until = [until] + reord = utils.Reorderer(reqs, _collate) + for context, gen_kwargs in tqdm(reord.get_reordered(), "Running greedy generation"): + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [kwargs] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {kwargs}" + ) + if not until: + until = [self.tok_decode(self.eot_token_id)] + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + if "do_sample" in kwargs.keys(): + kwargs.pop("do_sample") + stop_tokens = [self.tokenizer.encode(i) for i in until] cont = self.generate( text=context, stop_tokens=stop_tokens, recompute=self.neox_args.recompute, + maximum_tokens=max_gen_toks, + **kwargs, ) if cont: s = cont[0]["text"] or "" @@ -166,7 +188,7 @@ def _collate(x): s = s.split(term)[0] # partial caching - self.cache_hook.add_partial("greedy_until", (context, until), s) + self.cache_hook.add_partial("generate_until", (context, until), s) res.append(s) @@ -366,7 +388,6 @@ def run_eval( eval_tasks=None, num_fewshot=0, bootstrap_iters=2, - description_dict=None, 
use_cache=True, name="neox", limit=None, @@ -385,8 +406,12 @@ def run_eval( "winogrande", "mathqa", "pubmedqa", + "triviaqa" ] + # register all the default tasks bundled with lm-evaluation-harness repository + tasks.initialize_tasks() + # Returns a list containing all values of the task registry that # match at least one of the patterns import fnmatch @@ -401,6 +426,8 @@ def pattern_match(patterns, source_list): eval_tasks = pattern_match(eval_tasks, tasks.ALL_TASKS) print(f"Found tasks: {eval_tasks}") + assert len(eval_tasks) > 0, "Must run at least one task" + # **HACK INCOMING**: # first get task dict on local main rank # the tasks are downloaded *as they are initialized*, and the downloads don't like multithreading. @@ -413,31 +440,67 @@ def pattern_match(patterns, source_list): task_dict = tasks.get_task_dict(eval_tasks) lm = self + if use_cache: - # TODO(jon-tow): Append a subset of `neox_args` to the cache database - # name arg to distinguish model runs that use different configurations. - lm = base.CachingLM(lm, "lm_cache/" + name + ".db") + use_cache = 'lm_cache/neox' + '_dp_rank' + str(self._dp_rank) + '_dp_group' + str(self._dp_group) + '.db' + print(f"Using cache at {use_cache}...") + lm = lm_eval.api.model.CachingLM( + lm, + use_cache + # each rank receives a different cache db. + # necessary to avoid multiple writes to cache at once + # TODO: Append a subset of `neox_args` to the cache database + # name arg to distinguish model runs that use different configurations. + ) + + # from simple_evaluate: + # override fewshot values for all tasks we can + for task_name in task_dict.keys(): + task_obj = task_dict[task_name] + if type(task_obj) == tuple: + group, task_obj = task_obj + if task_obj is None: + continue + + config = task_obj._config + + if num_fewshot is not None: + if config["num_fewshot"] == 0: + utils.eval_logger.info( + f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored." 
+ ) + else: + default_num_fewshot = config["num_fewshot"] + if not default_num_fewshot: + utils.eval_logger.warning( + f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" + ) + + task_obj._config["num_fewshot"] = num_fewshot results = evaluator.evaluate( lm=lm, - task_dict=tasks.get_task_dict(eval_tasks), - description_dict=description_dict, - num_fewshot=num_fewshot, - limit=limit, + task_dict=task_dict, + limit=10, #limit, bootstrap_iters=bootstrap_iters, + log_samples=False, ) results["config"] = { "model": name, "model_args": dataclasses.asdict(self.neox_args), - "num_fewshot": num_fewshot, "batch_size": self.batch_size, "device": str(self.device), - "no_cache": not use_cache, + "use_cache": use_cache, "limit": limit, "bootstrap_iters": bootstrap_iters, - "description_dict": description_dict, } + results["git_hash"] = utils.get_git_commit_hash() + + print(results.keys()) + for task_name in task_dict.keys(): + if "alias" in results["results"][task_name]: + results["results"][task_name].pop("alias") if was_training: self.model.train() diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 5ecba71f3..bf34df842 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -293,13 +293,13 @@ def consume_deepy_args(cls, input_args=None): type=str, nargs="+", default=None, - help="Optionally overwrite eval tasks to run for evaluate.py", + help="Optionally overwrite eval tasks to run for eval.py", ) group.add_argument( "--iteration", type=int, default=None, - help="Iteration to load checkpoint from in evaluate.py / generate.py. If None is provided, uses the latest iteration.", + help="Iteration to load checkpoint from in the eval.py and generate.py scripts. 
If None is provided, uses the latest iteration.", ) group.add_argument( "--eval_results_prefix", diff --git a/megatron/training.py b/megatron/training.py index 20c168ed3..31dc5d444 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -867,7 +867,6 @@ def train( optimizer=optimizer, lr_scheduler=lr_scheduler, ) - # Evaluation if ( neox_args.eval_interval @@ -1006,6 +1005,9 @@ def evaluate_and_print_results( string = f" {chart_name} results at {prefix} | " for k, v in total_loss_dict.items(): if isinstance(v, dict): + if neox_args.eval_tasks and "results" in v: + v = v["results"] + print(v) for k2, v2 in v.items(): k3 = "_".join([k, k2]) string += f"{k3} value: {v2:.6E} | " diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 137da4d81..807a55974 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -3,7 +3,7 @@ git+https://github.com/EleutherAI/DeeperSpeed.git@b9260436e7da3e297fc6bedfd27d9e ftfy>=6.0.1 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 -lm_eval==0.3.0 +git+https://github.com/EleutherAI/lm-evaluation-harness.git@main#egg=lm_eval mpi4py>=3.0.3 numpy>=1.22.0 pybind11>=2.6.2 From 1148a0f55b7512959aff5fbfce19aaaab6eada3e Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Sat, 23 Dec 2023 11:56:00 -0500 Subject: [PATCH 17/17] Update README.md --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d9f431d20..d8cba6ffb 100644 --- a/README.md +++ b/README.md @@ -521,7 +521,7 @@ The following publications by other research groups use this library: The following models were trained using this library: ### English LLMs -- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) and [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia) +- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b), [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia), and [LLeMMA (34B)](https://arxiv.org/abs/2310.10631) - CarperAI's [FIM-NeoX-1.3B](https://huggingface.co/CarperAI/FIM-NeoX-1.3B) - StabilityAI's [StableLM (3B and 7B)](https://github.com/Stability-AI/StableLM) - Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1) @@ -532,6 +532,7 @@ The following models were trained using this library: ### Non-English LLMs - EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean) - Korea University's [KULLM-Polyglot (5.8B and 12.8B)](https://github.com/nlpai-lab/KULLM) (Korean) +- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) - LearnItAnyway's [LLaVA-Polyglot-Ko (1.3B)](https://huggingface.co/LearnItAnyway/llava-polyglot-ko-1.3b-hf) (Korean) - Rinna Co.'s [japanese-gpt-neox-3.6b](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) (Japanese) and [bilingual-gpt-neox-4b](https://huggingface.co/rinna/bilingual-gpt-neox-4b) (English / Japanese) - CyberAgent's [Open-CLM (125M through 7B)](https://huggingface.co/cyberagent/open-calm-7b) (Japanese) @@ -540,13 +541,13 @@ The following models were trained using this library: - nolando.ai's [Hi-NOLIN (9B)](https://blog.nolano.ai/Hi-NOLIN/) (English, Hindi) ### Code Models -- Carnegie Mellon University's [PolyCoder (160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM](https://huggingface.co/nikitharao/catlm) +- Carnegie Mellon University's [PolyCoder 
(160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM (2.7B)](https://huggingface.co/nikitharao/catlm) - StabilityAI's [StableCode (1.3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding) and [StableCode-Completion-Alpha (3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding) -- CodeFuse AI's [StableCode (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B) +- CodeFuse AI's [CodeFuse (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B) ### Other Modalities - University College London's [ChessGPT-3B](https://huggingface.co/Waterhorse/chessgpt-base-v1) -- Gretel's [Text-to-Table](https://huggingface.co/gretelai/text2table) +- Gretel's [Text-to-Table (3B)](https://huggingface.co/gretelai/text2table) # Administrative Notes
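A minimal, self-contained sketch of the QK-normalization step added in PATCH 12/17 (per [arXiv:2302.05442](https://arxiv.org/abs/2302.05442)): a norm layer shaped over the (heads, head-dim) axes is applied to the query and key tensors before attention scores are computed. The shapes and names below are illustrative stand-ins, not the Megatron-internal ones:

```python
import torch
import torch.nn as nn

seq_len, batch, num_heads, head_dim = 16, 2, 8, 64

# LayerNorm over the last two dims, mirroring the
# [num_attention_heads_per_partition, hidden_size_per_attention_head] shape in the patch
qk_layernorm = nn.LayerNorm([num_heads, head_dim], eps=1e-5)

# [sq, b, np, hn] layout, as used inside the patched attention forward pass
query = torch.randn(seq_len, batch, num_heads, head_dim)
key = torch.randn(seq_len, batch, num_heads, head_dim)

# QK Normalization: normalize queries and keys before computing attention scores
query = qk_layernorm(query)
key = qk_layernorm(key)

scores = torch.einsum("qbhd,kbhd->bhqk", query, key) / head_dim**0.5
```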