From 94d31f6db0cfe17f2def6f923daae1300dfd6563 Mon Sep 17 00:00:00 2001
From: AI_WAIFU <aiwaifu@protonmail.com>
Date: Mon, 7 Oct 2024 15:05:00 +0000
Subject: [PATCH 1/4] fix typo

---
 megatron/training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/training.py b/megatron/training.py
index 5976ae6a7..ba36efb2d 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -576,7 +576,7 @@ def forward_step(
         return model.eval_batch(data_iterator, return_logits=return_logits)
 
     # Get the batch.
-    if neox_args.memory_profiling and neox_args.it:
+    if neox_args.memory_profiling and neox_args.iteration:
         torch.cuda.nvtx.range_push(f"Get batch")
     if timers is not None:
         timers("batch generator").start()

From 44a207765fef39238934829a2098baf9909923be Mon Sep 17 00:00:00 2001
From: AI_WAIFU <aiwaifu@protonmail.com>
Date: Mon, 7 Oct 2024 16:26:48 +0000
Subject: [PATCH 2/4] fix neoxargs usage test

---
 configs/neox_arguments.md              | 23 +++++++++++++++++++++++
 megatron/neox_arguments/neox_args.py   | 19 +++++++++++++++++--
 tests/neox_args/test_neoxargs_usage.py |  2 +-
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 7dec66da2..686974181 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -843,6 +843,29 @@ Model Arguments
 
 
 
+- **dim_att**: int
+
+    Default = None
+
+    Total dimension of the attention mechanism for RWKV. If not set, defaults to hidden_size.
+
+
+
+- **head_size**: int
+
+    Default = None
+
+    Size of each attention head for RWKV. Calculated as dim_att // num_attention_heads.
+
+
+
+- **ffn_dim**: int
+
+    Default = None
+
+    Dimension of the feed-forward network for RWKV. If not set, calculated based on hidden_size and expansion_factor.
+
+
 ## NeoXArgsOptimizer
 
 Optimizer Arguments
diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py
index ac313a3bb..c877c6c78 100644
--- a/megatron/neox_arguments/neox_args.py
+++ b/megatron/neox_arguments/neox_args.py
@@ -21,7 +21,7 @@
     from template import NeoXArgsTemplate
 
 try:
-    from typing import List, Literal, Union, Optional
+    from typing import List, Literal, Union, Optional, Any
 except ImportError:
     from typing_extensions import List, Literal, Union, Optional
 
@@ -502,6 +502,21 @@ class NeoXArgsModel(NeoXArgsTemplate):
     Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column)
     """
 
+    dim_att: int = None
+    """
+    Total dimension of the attention mechanism for RWKV. If not set, defaults to hidden_size.
+    """
+
+    head_size: int = None
+    """
+    Size of each attention head for RWKV. Calculated as dim_att // num_attention_heads.
+    """
+
+    ffn_dim: int = None
+    """
+    Dimension of the feed-forward network for RWKV. If not set, calculated based on hidden_size and expansion_factor.
+    """
+
 
 @dataclass
 class NeoXArgsOptimizer(NeoXArgsTemplate):
@@ -673,7 +688,7 @@ class NeoXArgsLogging(NeoXArgsTemplate):
     Custom metadata to attach to the created Comet Experiment.
     """
 
-    comet_experiment = None
+    comet_experiment: Any = None
     """
     Initialized comet experiment object used to log data
     """
diff --git a/tests/neox_args/test_neoxargs_usage.py b/tests/neox_args/test_neoxargs_usage.py
index 176151c2a..9c709d918 100644
--- a/tests/neox_args/test_neoxargs_usage.py
+++ b/tests/neox_args/test_neoxargs_usage.py
@@ -66,7 +66,7 @@ def test_neoxargs_usage():
 
         # find args matches
         matches = list(
-            re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents)
+            re.findall(r"(?<=neox_args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents)
         )
         if len(matches) == 0:
             continue

From 3bb912886b677ac82f37afaf5a41ce54574ae7c4 Mon Sep 17 00:00:00 2001
From: AI_WAIFU <aiwaifu@protonmail.com>
Date: Tue, 8 Oct 2024 13:41:16 +0000
Subject: [PATCH 3/4] skip conversion test due to multiprocessing issue

---
 tests/unit/test_format_conversion_scripts.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_format_conversion_scripts.py b/tests/unit/test_format_conversion_scripts.py
index e0801434c..32e0357b2 100644
--- a/tests/unit/test_format_conversion_scripts.py
+++ b/tests/unit/test_format_conversion_scripts.py
@@ -3,9 +3,12 @@
 from tests.common import simulate_deepy_env, save_random_model
 from megatron.neox_arguments.neox_args import NeoXArgsTokenizer
 
-
+@pytest.mark.skip(
+    reason="Conversion test is skipped until we fix the CUDA + torch multiprocessing issue."
+)
 def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path):
     # Generate random GPT-NEOX model, check we can convert to hf format
+
     model_dir = str(tmpdir)
     input_args = ["train.py", "tests/config/test_setup.yml"]
     deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args)

From a06973b3767d6d2a771397458baa56d2ae23291f Mon Sep 17 00:00:00 2001
From: Quentin Anthony <qganthony@yahoo.com>
Date: Tue, 8 Oct 2024 12:26:48 -0700
Subject: [PATCH 4/4] precommit

---
 tests/neox_args/test_neoxargs_usage.py       | 4 +++-
 tests/unit/test_format_conversion_scripts.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/neox_args/test_neoxargs_usage.py b/tests/neox_args/test_neoxargs_usage.py
index 9c709d918..5f8ba7bd2 100644
--- a/tests/neox_args/test_neoxargs_usage.py
+++ b/tests/neox_args/test_neoxargs_usage.py
@@ -66,7 +66,9 @@ def test_neoxargs_usage():
 
         # find args matches
         matches = list(
-            re.findall(r"(?<=neox_args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents)
+            re.findall(
+                r"(?<=neox_args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents
+            )
         )
         if len(matches) == 0:
             continue
diff --git a/tests/unit/test_format_conversion_scripts.py b/tests/unit/test_format_conversion_scripts.py
index 32e0357b2..6935e480a 100644
--- a/tests/unit/test_format_conversion_scripts.py
+++ b/tests/unit/test_format_conversion_scripts.py
@@ -3,6 +3,7 @@
 from tests.common import simulate_deepy_env, save_random_model
 from megatron.neox_arguments.neox_args import NeoXArgsTokenizer
 
+
 @pytest.mark.skip(
     reason="Conversion test is skipped until we fix the CUDA + torch multiprocessing issue."
 )