From b8889e72d1587b7012c561e5b8e5d6ae3af88075 Mon Sep 17 00:00:00 2001
From: Kunal Vaishnavi
Date: Fri, 9 Aug 2024 21:08:47 +0000
Subject: [PATCH 1/2] Add logit softcapping for Gemma-2

---
 src/python/py/models/builder.py | 9 +++++++--
 src/python/setup.py.in          | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 90a1cd2fb..3aef5a815 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -144,6 +144,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         }
 
         # LayerNorm-specific variables
+        epsilon = config.rms_norm_eps if hasattr(config, "rms_norm_eps") else 9.999999747378752e-06
         self.layernorm_attrs = {
             "simple": True,                       # Use SimplifiedLayerNorm/SkipSimplifiedLayerNorm vs. LayerNorm/SkipLayerNorm
             "first_layernorm": True,              # 1st LayerNorm = LayerNorm, then SkipLayerNorm for all subsequent LayerNorms
@@ -153,6 +154,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
             "output_0": "",                       # Output 0 for LayerNorm and SkipLayerNorm
             "output_3": "",                       # Output 3 for SkipLayerNorm
             "add_offset": 0,                      # Offset value for LayerNorm weight
+            "epsilon": epsilon,                   # Epsilon value to avoid `sqrt(0)` in LayerNorm
         }
 
         # RotaryEmbedding-specific variables
@@ -203,6 +205,8 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         }
 
         # Attention-specific variables (MHA, GQA, GQA + Rot.Emb., etc.)
+        softcap = config.attn_logit_softcapping if hasattr(config, "attn_logit_softcapping") else 0.0  # default is 0.0 in GroupQueryAttention kernel
+
         # Block-sparse attention-specific variables
         sparse_block_size = config.blocksparse_block_size if hasattr(config, "blocksparse_block_size") else 0
         kernel_block_size = config.blocksparse_triton_kernel_block_size if hasattr(config, "blocksparse_triton_kernel_block_size") else 0
@@ -212,6 +216,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         self.attention_attrs = {
             "op_type": "MultiHeadAttention",      # Attention op to use
             "scale": 1 / np.sqrt(self.head_size), # Scale value after calculating Q x K' in attention
+            "softcap": softcap,                   # Softcap value to prevent values from exploding in attention
             "use_rotemb_in_attn": False,          # Use rotary embeddings within attention (instead of a separate RotaryEmbedding op)
             "use_packed_matmul": False,           # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V)
             "block_sparse": {
@@ -825,7 +830,7 @@ def make_layernorm(self, layer_id, layernorm, skip, simple, location):
         name = f"/model/layers.{layer_id}/{location}_layernorm/{'Skip' if skip else ''}LayerNorm"
         op_type = f"{'Skip' if skip else ''}{'Simplified' if simple else ''}LayerNormalization"
 
-        kwargs = {"epsilon": 9.999999747378752e-06}
+        kwargs = {"epsilon": self.layernorm_attrs["epsilon"]}
         if not skip:
             kwargs.update({"axis": -1, "stash_type": 1})
 
@@ -1237,7 +1242,7 @@ def make_group_query_attention(self, name, **kwargs):
         self.make_node(
             "GroupQueryAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft",
             num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads, scale=self.attention_attrs["scale"], # local_window_size=self.window_size,  # Disable sliding window attribute temporarily
-            do_rotary=self.attention_attrs["use_rotemb_in_attn"], rotary_interleaved=self.rotemb_attrs["interleaved"],
+            softcap=self.attention_attrs["softcap"], do_rotary=self.attention_attrs["use_rotemb_in_attn"], rotary_interleaved=self.rotemb_attrs["interleaved"],
         )
         self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * self.num_attn_heads])

diff --git a/src/python/setup.py.in b/src/python/setup.py.in
index a12fb0ec7..74659e6a4 100644
--- a/src/python/setup.py.in
+++ b/src/python/setup.py.in
@@ -26,7 +26,7 @@ package_name = '@TARGET_NAME@'
 def _onnxruntime_dependency() -> str:
     dependency = None
     # Use dev version as default since CI tests use nightly version for testing
-    ort_version = os.environ.get("ONNXRUNTIME_VERSION", "1.19.0.dev20240805002")
+    ort_version = os.environ.get("ONNXRUNTIME_VERSION", "1.19.0")
     is_nightly = True if "dev" in ort_version else False
 
     if package_name == "onnxruntime-genai":

From 11042353a971c1cd0ace31003820fc57d13a8e53 Mon Sep 17 00:00:00 2001
From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
Date: Wed, 30 Oct 2024 13:41:09 -0700
Subject: [PATCH 2/2] Change default epsilon value

---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index e36959e91..0f064dde0 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -144,7 +144,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         }
 
         # LayerNorm-specific variables
-        epsilon = config.rms_norm_eps if hasattr(config, "rms_norm_eps") else 9.999999747378752e-06
+        epsilon = config.rms_norm_eps if hasattr(config, "rms_norm_eps") else 1e-06
         self.layernorm_attrs = {
             "simple": True,                       # Use SimplifiedLayerNorm/SkipSimplifiedLayerNorm vs. LayerNorm/SkipLayerNorm
             "first_layernorm": True,              # 1st LayerNorm = LayerNorm, then SkipLayerNorm for all subsequent LayerNorms
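
Note on the change (an illustrative sketch, not code from the patch): the `softcap`
attribute passed to GroupQueryAttention carries Gemma-2's `attn_logit_softcapping`,
which squashes raw attention scores into (-softcap, +softcap) with a scaled tanh
before the softmax, and the `epsilon` plumbed through `layernorm_attrs` is the usual
RMS-normalization guard against `sqrt(0)`. A minimal NumPy sketch of both ideas,
using hypothetical function names:

    import numpy as np

    def softcap_logits(scores: np.ndarray, softcap: float) -> np.ndarray:
        # Bound raw attention scores to (-softcap, +softcap) with a scaled tanh.
        # A softcap of 0.0 mirrors the GroupQueryAttention default, i.e. disabled.
        if softcap == 0.0:
            return scores
        return softcap * np.tanh(scores / softcap)

    def simplified_layer_norm(x: np.ndarray, weight: np.ndarray, epsilon: float = 1e-06) -> np.ndarray:
        # RMS normalization; epsilon keeps the square root away from zero.
        rms = np.sqrt(np.mean(np.square(x), axis=-1, keepdims=True) + epsilon)
        return (x / rms) * weight

    # Example: Gemma-2 configs set attn_logit_softcapping (e.g. 50.0), so very
    # large scores are squashed smoothly instead of saturating the softmax.
    scores = np.array([-300.0, -3.0, 0.0, 7.5, 300.0], dtype=np.float32)
    print(softcap_logits(scores, softcap=50.0))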