Add RWKV sizing arguments (dim_att, head_size, ffn_dim) to NeoXArgsModel and
document them in configs/neox_arguments.md; annotate comet_experiment as Any;
make the usage test look for `neox_args.` instead of `args.`.

Review amendment: `Any` is now imported in the typing_extensions fallback as
well. The new `comet_experiment: Any = None` annotation is evaluated in the
class body, so with the fallback branch active the name would otherwise be
unbound (NameError), unless annotations are deferred elsewhere in the module.
NOTE(review): the post-image blob hash on the neox_args.py `index` line
predates this amendment and was not recomputed.

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 7dec66da2..686974181 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -843,6 +843,29 @@ Model Arguments
 
 
 
+- **dim_att**: int
+
+    Default = None
+
+    Total dimension of the attention mechanism for RWKV. If not set, defaults to hidden_size.
+
+
+
+- **head_size**: int
+
+    Default = None
+
+    Size of each attention head for RWKV. Calculated as dim_att // num_attention_heads.
+
+
+
+- **ffn_dim**: int
+
+    Default = None
+
+    Dimension of the feed-forward network for RWKV. If not set, calculated based on hidden_size and expansion_factor.
+
+
 ## NeoXArgsOptimizer
 
 Optimizer Arguments
diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py
index ac313a3bb..c877c6c78 100644
--- a/megatron/neox_arguments/neox_args.py
+++ b/megatron/neox_arguments/neox_args.py
@@ -21,7 +21,7 @@
     from template import NeoXArgsTemplate
 
 try:
-    from typing import List, Literal, Union, Optional
+    from typing import List, Literal, Union, Optional, Any
 except ImportError:
-    from typing_extensions import List, Literal, Union, Optional
+    from typing_extensions import List, Literal, Union, Optional, Any
 
@@ -502,6 +502,21 @@ class NeoXArgsModel(NeoXArgsTemplate):
     Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column)
     """
 
+    dim_att: int = None
+    """
+    Total dimension of the attention mechanism for RWKV. If not set, defaults to hidden_size.
+    """
+
+    head_size: int = None
+    """
+    Size of each attention head for RWKV. Calculated as dim_att // num_attention_heads.
+    """
+
+    ffn_dim: int = None
+    """
+    Dimension of the feed-forward network for RWKV. If not set, calculated based on hidden_size and expansion_factor.
+    """
+
 
 @dataclass
 class NeoXArgsOptimizer(NeoXArgsTemplate):
@@ -673,7 +688,7 @@ class NeoXArgsLogging(NeoXArgsTemplate):
     Custom metadata to attach to the created Comet Experiment.
     """
 
-    comet_experiment = None
+    comet_experiment: Any = None
     """
     Initialized comet experiment object used to log data
     """
diff --git a/tests/neox_args/test_neoxargs_usage.py b/tests/neox_args/test_neoxargs_usage.py
index 176151c2a..9c709d918 100644
--- a/tests/neox_args/test_neoxargs_usage.py
+++ b/tests/neox_args/test_neoxargs_usage.py
@@ -66,7 +66,7 @@ def test_neoxargs_usage():
 
         # find args matches
         matches = list(
-            re.findall(r"(?<=args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents)
+            re.findall(r"(?<=neox_args\.).{2,}?(?=[\s\n(){}+-/*;:,=,[,\]])", file_contents)
         )
         if len(matches) == 0:
             continue