diff --git a/3rdparty/Megatron-LM b/3rdparty/Megatron-LM
index 2da43ef4c1..65720c87ba 160000
--- a/3rdparty/Megatron-LM
+++ b/3rdparty/Megatron-LM
@@ -1 +1 @@
-Subproject commit 2da43ef4c1b9e76f03b7567360cf7390e877f1b6
+Subproject commit 65720c87ba9c9d0ae8c90b1ffdbdccd2d51b1bc1
diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/attention.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/attention.py
deleted file mode 100644
index 63d93448e1..0000000000
--- a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/attention.py
+++ /dev/null
@@ -1,365 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-Apache2
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-from typing import Callable, Optional, Sequence, Union
-
-import torch
-from megatron.core import parallel_state, tensor_parallel
-from megatron.core.extensions.transformer_engine import TEDotProductAttention
-from megatron.core.packed_seq_params import PackedSeqParams
-from megatron.core.parallel_state import (
-    get_context_parallel_global_ranks,
-    get_context_parallel_group,
-    get_tensor_model_parallel_group,
-)
-from megatron.core.tensor_parallel import get_cuda_rng_tracker
-from megatron.core.transformer.dot_product_attention import DotProductAttention
-from megatron.core.transformer.enums import AttnMaskType
-from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.core.utils import get_te_version, is_te_min_version
-from torch import Tensor
-
-
-__all__: Sequence[str] = ("ESM2DotProductAttention", "ESM2TEDotProductAttention")
-
-
-class ESM2TEDotProductAttention(TEDotProductAttention):
-    """ESM2-Specific transformer engine core attention.
-
-    Override the softmax_scale to 1.0 to match the ESM2 implementation while keeping the rest from the original TEDotProductAttention.
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        layer_number: int,
-        attn_mask_type: AttnMaskType,
-        attention_type: str,
-        attention_dropout: float | None = None,
-        softmax_scale: float = 1.0,
-        k_channels: int | None = None,
-        v_channels: int | None = None,
-        cp_comm_type: str = "p2p",
-    ):
-        """Initialize ESM2TEDotProductAttention."""
-        self.config = config
-        self.te_forward_mask_type = False
-        self.qkv_format: str = "sbhd"
-
-        if self.config.apply_query_key_layer_scaling != bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))):
-            raise ValueError(
-                f"apply_query_key_layer_scaling is {self.config.apply_query_key_layer_scaling} "
-                f"but environment variable NVTE_APPLY_QK_LAYER_SCALING is "
-                f"{os.getenv('NVTE_APPLY_QK_LAYER_SCALING')}. Transformer Engine does not support "
-                f"setting query key layer scaling via argument, so these two must match."
-            )
-
-        extra_kwargs = {}
-        if is_te_min_version("0.11.0"):
-            extra_kwargs["num_gqa_groups"] = self.config.num_query_groups
-        elif self.config.num_query_groups != self.config.num_attention_heads:
-            raise ValueError(
-                f"Transformer Engine v{get_te_version()} does not support Grouped Query Attention, "
-                f"use a newer version of Transformer Engine. "
-                f"(num_query_groups ({self.config.num_query_groups}) != "
-                f"num_attention_heads ({self.config.num_attention_heads}))"
-            )
-
-        if is_te_min_version("0.10.0"):
-            extra_kwargs["attention_type"] = attention_type
-            # older version don't need attention_type
-
-        if is_te_min_version("0.12.0", check_equality=False):
-            self.te_forward_mask_type = True
-
-        # Only Transformer-Engine version >= 1.0.0 supports context parallelism
-        if is_te_min_version("1.0.0"):
-            if getattr(TEDotProductAttention, "cp_stream") is None:
-                TEDotProductAttention.cp_stream = torch.cuda.Stream()
-            extra_kwargs["cp_group"] = get_context_parallel_group(check_initialized=False)
-            extra_kwargs["cp_global_ranks"] = get_context_parallel_global_ranks(check_initialized=False)
-            extra_kwargs["cp_stream"] = TEDotProductAttention.cp_stream
-            if is_te_min_version("1.10.0"):
-                if cp_comm_type is None:
-                    extra_kwargs["cp_comm_type"] = "p2p"
-                else:
-                    extra_kwargs["cp_comm_type"] = cp_comm_type
-        else:
-            assert (
-                self.config.context_parallel_size == 1
-            ), "Only Transformer-Engine version >= 1.0.0 supports context parallelism!"
-
-        if self.config.deterministic_mode:
-            if int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1")) != 0:
-                raise RuntimeError(
-                    "deterministic_mode is on and we are using DotProductAttention from "
-                    "Transformer Engine, but NVTE_ALLOW_NONDETERMINISTIC_ALGO is not 0. "
-                    f"Currently set to: {os.getenv('NVTE_ALLOW_NONDETERMINISTIC_ALGO', 'not set')}."
-                )
-
-        if config.window_size is not None:
-            # Check version
-            assert is_te_min_version("1.2.0"), (
-                f"Transformer-Engine v{get_te_version()} must be >= 1.2.0 to support" "sliding window attention."
-            )
-            extra_kwargs["window_size"] = config.window_size
-
-        if is_te_min_version("1.10.0"):
-            # TE 1.10.0 introduces the ability to set the different k and v channels
-            kv_channels = (
-                (k_channels, v_channels)
-                if k_channels is not None and v_channels is not None
-                else self.config.kv_channels
-            )
-        else:
-            kv_channels = self.config.kv_channels
-
-        extra_kwargs["softmax_scale"] = softmax_scale
-
-        super(TEDotProductAttention, self).__init__(
-            num_attention_heads=self.config.num_attention_heads,
-            kv_channels=kv_channels,
-            attention_dropout=(self.config.attention_dropout if attention_dropout is None else attention_dropout),
-            attn_mask_type=attn_mask_type.name,
-            sequence_parallel=self.config.sequence_parallel,
-            tp_size=self.config.tensor_model_parallel_size,
-            get_rng_state_tracker=(get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None),
-            tp_group=get_tensor_model_parallel_group(check_initialized=False),
-            layer_number=layer_number,
-            **extra_kwargs,
-        )
-
-
-class ESM2DotProductAttention(DotProductAttention):
-    """ESM2-Specific core attention.
-
-    Region where selective activation recomputation is applied.
-    This region is memory intensive but less compute intensive which
-    makes activation checkpointing more efficient for LLMs (20B+).
-    See Reducing Activation Recomputation in Large Transformer Models:
-    https://arxiv.org/abs/2205.05198 for more details.
-
-    We use the following notation:
-     h: hidden size
-     n: number of attention heads
-     p: number of tensor model parallel partitions
-     b: batch size
-     s: sequence length
-    """
-
-    def __init__(
-        self,
-        config: TransformerConfig,
-        layer_number: int,
-        attn_mask_type: AttnMaskType,
-        attention_type: str,
-        attention_dropout: Optional[float] = None,
-    ) -> None:
-        """Initializes the Attention class.
-
-        Args:
-            config: The configuration object for the transformer.
-            layer_number: The layer number of the attention module.
-            attn_mask_type: The type of attention mask to be used.
-            attention_type: The type of attention mechanism.
-            attention_dropout: The dropout rate for attention weights. Defaults to None.
-        """
-        super().__init__(
-            config=config,
-            layer_number=layer_number,
-            attn_mask_type=attn_mask_type,
-            attention_type=attention_type,
-            attention_dropout=attention_dropout,
-        )
-
-    def forward(
-        self,
-        query: Tensor,
-        key: Tensor,
-        value: Tensor,
-        attention_mask: Tensor,
-        attn_mask_type: Optional[AttnMaskType] = None,
-        packed_seq_params: Optional[PackedSeqParams] = None,
-    ):
-        """Forward pass of the ESM2DotProductAttention module.
-
-        Args:
-            query: The query tensor of shape [sq, b, np, hn].
-            key: The key tensor of shape [sk, b, ng, hn].
-            value: The value tensor of shape [sk, b, ng, hn].
-            attention_mask: The attention mask tensor of shape [b, np, sq, sk].
-            attn_mask_type: The attention mask type, currently unused. Defaults to None.
-            packed_seq_params: The packed sequence parameters. These are used for context parallelism so will be needed
-                to be implemented if we want to support this. Defaults to None.
-
-        Returns:
-            Tensor: The context tensor of shape [sq, b, hp].
-        """
-        if packed_seq_params is not None:
-            raise ValueError(
-                "Packed sequence is not supported by DotProductAttention. " "Please use TEDotProductAttention instead."
-            )
-
-        # ===================================
-        # Raw attention scores. [b, n/p, s, s]
-        # ===================================
-
-        # expand the key and value [sk, b, ng, hn] -> [sk, b, np, hn]
-        # This is a noop for normal attention where ng == np. When using group query attention this
-        # creates a view that has the keys and values virtually repeated along their dimension to
-        # match the number of queries.
-
-        # attn_mask_type is not used.
-        if (np_ng := self.num_attention_heads_per_partition // self.num_query_groups_per_partition) > 1:
-            key = key.repeat_interleave(np_ng, dim=2)
-            value = value.repeat_interleave(np_ng, dim=2)
-
-        # [b, np, sq, sk]
-        b, np, sq, sk = query.size(1), query.size(2), query.size(0), key.size(0)
-
-        # [sq, b, np, hn] -> [sq, b * np, hn]
-        # This will be a simple view when doing normal attention, but in group query attention
-        # the key and value tensors are repeated to match the queries so you can't use simple strides
-        # to extract the queries.
-        query = query.reshape(sq, b * np, -1)
-        # [sk, b, np, hn] -> [sk, b * np, hn]
-        key = key.view(sk, b * np, -1)
-
-        # preallocting input tensor: [b * np, sq, sk]
-        matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor(
-            (b * np, sq, sk),
-            query.dtype,
-            "mpu",
-        )
-
-        # Raw attention scores. [b * np, sq, sk]
-        matmul_result = torch.baddbmm(
-            matmul_input_buffer,
-            query.transpose(0, 1),  # [b * np, sq, hn]
-            key.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-            beta=0.0,
-            alpha=(1.0 / self.norm_factor) if self.config.normalize_attention_scores else 1.0,
-        )
-
-        # change view to [b, np, sq, sk]
-        attention_scores = matmul_result.view(b, np, sq, sk)
-
-        # ===========================
-        # Attention probs and dropout
-        # ===========================
-
-        # attention scores and attention mask [b, np, sq, sk]
-        # ESM2 Customization
-        if self.config.use_esm_attention:
-            # NOTE: the slicing here is to make the attention_mask the same shape as the extended
-            # attention mask in ESM2. The multiplication by -3.4028e+38 (float32 min_val) is
-            # similarly motivated by ESM2's masking approach, which forces softmax of attention scores
-            # for masked entries to be close to 0. This number is replaced with min_val of the precision
-            # using min_val instead of -inf is stable in an special case where all sequence is masked
-            min_val = torch.finfo(attention_scores.dtype).min
-
-            attention_probs: Tensor = self.esm2_scale_mask_softmax(
-                attention_scores.masked_fill(attention_mask[:, :, 0:1, :].to(bool), min_val)
-            )
-        # END ESM2 Customization
-        else:
-            attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-
-        if not self.config.sequence_parallel:
-            with tensor_parallel.get_cuda_rng_tracker().fork():
-                attention_probs = self.attention_dropout(attention_probs)
-        else:
-            attention_probs = self.attention_dropout(attention_probs)
-
-        # =========================
-        # Context layer. [sq, b, hp]
-        # =========================
-
-        # value -> context layer.
-        # [sk, b, np, hn] --> [b, np, sq, hn]
-
-        # context layer shape: [b, np, sq, hn]
-        b, np, sq, hn = value.size(1), value.size(2), query.size(0), value.size(3)
-
-        # change view [sk, b * np, hn]
-        value = value.view(value.size(0), b * np, -1)
-
-        # change view [b * np, sq, sk]
-        attention_probs = attention_probs.view(b * np, sq, -1)
-
-        # matmul: [b * np, sq, hn]
-        context = torch.bmm(attention_probs, value.transpose(0, 1))
-
-        # change view [b, np, sq, hn]
-        context = context.view(b, np, sq, hn)
-
-        # [b, np, sq, hn] --> [sq, b, np, hn]
-        context = context.permute(2, 0, 1, 3).contiguous()
-
-        # [sq, b, np, hn] --> [sq, b, hp]
-        context = context.view(sq, b, self.hidden_size_per_partition)
-
-        return context
-
-    def esm2_scale_mask_softmax(
-        self,
-        input: Tensor,
-        mask: Optional[Tensor] = None,
-        scale: Optional[Union[float, int]] = None,
-        mask_func: Optional[Callable] = None,
-    ) -> Tensor:
-        """Scale Mask Softmax function.
-
-        Args:
-            input: Tensor of shape (Batch, NP, SK, SQ). The input may or may not have already
-                had a mask applied to it.
-            mask: If a mask is to be applied, it will go here.
-            scale: A scale factor that will be applied before the softmax.
-            mask_func: An optional function to apply to the mask. If None, it is assumed that
-                the input already had the mask applied to it.
-
-        Returns:
-            probs: Tensor of normalized probabilities after the softmax has been applied,
-                of shape (Batch, NP, SK, SQ).
-        """
-        if self.attn_mask_type.name != "padding":
-            raise ValueError(
-                f"self.attn_mask_type: {self.attn_mask_type} is not 'padding'. "
-                "Only 'padding' type is supported currently."
-            )
-
-        original_dtype = input.dtype  # Store original dtype
-        if (
-            original_dtype == torch.float16 or original_dtype == torch.bfloat16
-        ) and self.config.attention_softmax_in_fp32:
-            input = input.float()  # Convert to float32 for softmax
-
-        if scale is not None:
-            input = input * scale  # Apply scaling
-
-        if mask is not None and mask_func is not None:
-            input = mask_func(input, mask)  # Apply mask function if provided
-
-        probs = torch.nn.functional.softmax(input, dim=-1)  # Apply softmax
-
-        if self.config.attention_softmax_in_fp32 and original_dtype in (torch.float16, torch.bfloat16):
-            probs = probs.to(original_dtype)  # Convert back to original dtype if necessary
-
-        return probs
diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py
index d0999c2773..b9c82ed258 100644
--- a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py
+++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/model.py
@@ -35,7 +35,6 @@
 from torch.optim import Optimizer
 
 from bionemo.esm2.data.tokenizer import BioNeMoESMTokenizer
-from bionemo.esm2.model.attention import ESM2DotProductAttention, ESM2TEDotProductAttention
 from bionemo.esm2.model.embedding import ESM2Embedding
 from bionemo.llm.api import MegatronLossType
 from bionemo.llm.model.biobert.model import BioBertConfig, MegatronBioBertModel, PositionEmbeddingKinds
@@ -294,6 +293,7 @@ class ESM2GenericConfig(BioBertConfig[ESM2ModelT, MegatronLossType]):
     bias_activation_fusion: bool = True  # True degrades accuracy slightly, but is faster.
     activation_func: Callable = F.gelu  # esm_gelu_func  # ESM2 MLP
     init_method_std: float = 0.02
+    softmax_scale: float = 1.0
 
     # embedding
     token_dropout: bool = True
@@ -346,13 +346,11 @@ def __post_init__(self):
         super().__post_init__()
         if self.biobert_spec_option == BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec:
             self.apply_query_key_layer_scaling = False
-            self.core_attention_override = ESM2TEDotProductAttention
         elif self.biobert_spec_option == BiobertSpecOption.esm2_bert_layer_local_spec:
             logging.warning(
                 "BiobertSpecOption.esm2_bert_layer_local_spec is depreciated. Use BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec instead."
             )
             self.apply_query_key_layer_scaling = True
-            self.core_attention_override = ESM2DotProductAttention
         else:
             raise ValueError(f"Unknown biobert_spec_option: {self.biobert_spec_option}")
 
diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py
index 5ba6739164..ac21820875 100644
--- a/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py
+++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/run/config_models.py
@@ -25,7 +25,6 @@
 from bionemo.esm2.data.datamodule import ESMDataModule
 from bionemo.esm2.data.dataset import RandomMaskStrategy
 from bionemo.esm2.data.tokenizer import get_tokenizer
-from bionemo.esm2.model.attention import ESM2DotProductAttention, ESM2TEDotProductAttention
 from bionemo.esm2.model.model import ESM2Config
 from bionemo.llm.model.biobert.model import BiobertSpecOption
 from bionemo.llm.run.config_models import (
@@ -188,14 +187,12 @@ def validate_and_set_attention_and_scaling(self):
             )
         if self.biobert_spec_option == BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec:
             self.apply_query_key_layer_scaling = False
-            self.core_attention_override = ESM2TEDotProductAttention
         elif self.biobert_spec_option == BiobertSpecOption.esm2_bert_layer_local_spec:
             logging.warning(
                 "BiobertSpecOption.esm2_bert_layer_local_spec is deprecated. "
                 "Use BiobertSpecOption.esm2_bert_layer_with_transformer_engine_spec instead."
             )
             self.apply_query_key_layer_scaling = True
-            self.core_attention_override = ESM2DotProductAttention
         return self
 
     def model_validator(self, global_cfg: MainConfig) -> MainConfig:
diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_attention.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_attention.py
deleted file mode 100644
index 6383a04b65..0000000000
--- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_attention.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-Apache2
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import math
-
-import pytest
-import torch
-from megatron.core.transformer.enums import AttnMaskType
-
-from bionemo.esm2.api import ESM2Config
-from bionemo.esm2.model.attention import ESM2DotProductAttention, ESM2TEDotProductAttention
-from bionemo.testing import megatron_parallel_state_utils
-
-
-@pytest.fixture(scope="module")
-def config():
-    with megatron_parallel_state_utils.distributed_model_parallel_state():
-        yield ESM2Config(
-            seq_length=20,
-            hidden_size=4,
-            num_attention_heads=4,
-            attention_dropout=0.1,
-            use_esm_attention=True,
-        )
-
-
-@pytest.fixture(scope="module")
-def local_attention_layer(config: ESM2Config) -> ESM2DotProductAttention:
-    return ESM2DotProductAttention(
-        config=config,
-        layer_number=0,
-        attn_mask_type=AttnMaskType.padding,
-        attention_type="normal",
-    ).eval()
-
-
-@pytest.fixture(scope="module")
-def attention_layer(config: ESM2Config) -> ESM2TEDotProductAttention:
-    return ESM2TEDotProductAttention(
-        config=config,
-        layer_number=0,
-        attn_mask_type=AttnMaskType.padding,
-        attention_type="self",
-    ).eval()
-
-
-def test_init(attention_layer, config):
-    assert attention_layer.config.use_esm_attention
-    assert attention_layer.config == config
-
-
-@pytest.mark.skip(reason="Not implemented yet for transformer engine")
-def test_forward(attention_layer, config):
-    batch_size = 2
-    sequence_length = config.seq_length
-    hidden_size = config.hidden_size
-    device = torch.device("cuda")
-
-    query = torch.randn(sequence_length, batch_size, 1, hidden_size, device=device)
-    key = torch.randn(sequence_length, batch_size, 1, hidden_size, device=device)
-    value = torch.randn(sequence_length, batch_size, 1, hidden_size, device=device)
-    random_ints = torch.randint(0, 2, (batch_size, 1, sequence_length, sequence_length), device=device)
-    attention_mask = ((random_ints + torch.transpose(random_ints, dim0=2, dim1=3)) / 2).to(
-        dtype=torch.bool
-    )  # symmetric mask tensor
-
-    if isinstance(attention_layer, ESM2TEDotProductAttention):
-        raise NotImplementedError("TE requires reshaped input and is not implemented yet")
-    else:
-        output = attention_layer(query, key, value, attention_mask)
-        assert output.shape == (sequence_length, batch_size, hidden_size)
-
-
-@pytest.mark.skip(reason="Not implemented yet for transformer engine")
-@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.half])
-def test_attention_with_mask(attention_layer, dtype):
-    sequence_length_val = 3
-    sequence_length_query = 1
-    batch_size = 2
-    emb_dim = 4
-    device = torch.device("cuda")
-
-    # query and key such that the dot prod is an all-ones tensor
-    query = torch.ones(batch_size, sequence_length_query, 1, emb_dim, device=device, dtype=dtype) / math.sqrt(emb_dim)
-    key = torch.ones(batch_size, sequence_length_val, 1, emb_dim, device=device, dtype=dtype) / math.sqrt(emb_dim)
-
-    query = query.transpose(0, 1)
-    key = key.transpose(0, 1)
-
-    attention_mask = torch.zeros(batch_size, 1, 1, sequence_length_val, device=device, dtype=dtype)
-    attention_mask[0, :, :, 2:] = 1  # average first two tensors in val
-    attention_mask[1, :, :, 1:] = 1  # select first item from val
-
-    values = torch.stack([torch.arange(sequence_length_val)] * batch_size).to(device=device, dtype=dtype) + 1.0
-    values = torch.stack([values] * emb_dim, dim=2).unsqueeze(2).transpose(0, 1)
-
-    assert values.shape == (sequence_length_val, batch_size, 1, emb_dim)
-
-    # softmax will make the the avg first 2 tensors in vals (ones + twos)/2 and second row is just ones
-    if isinstance(attention_layer, ESM2TEDotProductAttention):
-        raise NotImplementedError("TE requires reshaped input and is not implemented yet")
-    else:
-        output = attention_layer(query, key, values, attention_mask)
-        expected_output = torch.tensor(
-            [[[1.5000, 1.5000, 1.5000, 1.5000], [1.0000, 1.0000, 1.0000, 1.0000]]], device=device, dtype=dtype
-        )
-        assert torch.equal(output, expected_output)