diff --git a/requirements-test.txt b/requirements-test.txt
index b8663cecf..567b990cb 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -8,3 +8,8 @@ awscli==1.33.33
 nbval==0.11.0
 # For NvFaidx equivalence tests
 pyfaidx==0.8.1.3
+
+# Temporary pin for pytorch-lightning until megatron callbacks in ProgressPrinter can get fixed.
+# See https://nvidia.slack.com/archives/C02A7LYGHK8/p1734727482697309
+pytorch-lightning<2.5.0
+lightning<2.5.0
diff --git a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/finetune/finetune_token_classifier.py b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/finetune/finetune_token_classifier.py
index 018ebb5ac..b0991669d 100644
--- a/sub-packages/bionemo-esm2/src/bionemo/esm2/model/finetune/finetune_token_classifier.py
+++ b/sub-packages/bionemo-esm2/src/bionemo/esm2/model/finetune/finetune_token_classifier.py
@@ -27,6 +27,7 @@
 from bionemo.esm2.api import ESM2GenericConfig, ESM2Model
 from bionemo.esm2.data import tokenizer
+from bionemo.llm.data.collate import MLM_LOSS_IGNORE_INDEX
 from bionemo.llm.data.label2id_tokenizer import Label2IDTokenizer
 from bionemo.llm.data.types import BertSample
 from bionemo.llm.model.biobert.model import BioBertOutput
@@ -230,6 +231,7 @@ def __init__(
         self.tokenizer = tokenizer
         label_tokenizer = Label2IDTokenizer()
         self.label_tokenizer = label_tokenizer.build_vocab("CHE")
+        self.label_cls_eos_id = MLM_LOSS_IGNORE_INDEX

     def __len__(self) -> int:
         """Length of dataset."""
@@ -257,13 +259,13 @@ def _tokenize_labels(self, labels_sequence: str) -> Tensor:

         # # for multi-label classification with BCEWithLogitsLoss
         # tokenized_labels = torch.nn.functional.one_hot(label_ids, num_classes=self.label_tokenizer.vocab_size)
-        # cls_eos = torch.full((1, self.label_tokenizer.vocab_size), -1, dtype=tokenized_labels.dtype)
+        # cls_eos = torch.full((1, self.label_tokenizer.vocab_size), self.label_cls_eos_id, dtype=tokenized_labels.dtype)

         # for multi-class (mutually exclusive) classification with CrossEntropyLoss
         tokenized_labels = label_ids
-        cls_eos = torch.tensor([-1], dtype=tokenized_labels.dtype)
+        cls_eos = torch.tensor([self.label_cls_eos_id], dtype=tokenized_labels.dtype)

-        # add cls / eos labels with padding value -1 to have the same shape as tokenized_sequence
+        # add cls / eos label ids with padding value -100 to have the same shape as tokenized_sequence
         labels = torch.cat((cls_eos, tokenized_labels, cls_eos))

         return labels
diff --git a/sub-packages/bionemo-llm/src/bionemo/llm/data/collate.py b/sub-packages/bionemo-llm/src/bionemo/llm/data/collate.py
index e1b018cf6..9f72d4d04 100644
--- a/sub-packages/bionemo-llm/src/bionemo/llm/data/collate.py
+++ b/sub-packages/bionemo-llm/src/bionemo/llm/data/collate.py
@@ -28,6 +28,9 @@
 _warned_once: bool = False


+MLM_LOSS_IGNORE_INDEX = -100  # This should match the masked value used in the MLM loss mask.
+
+
 def padding_collate_fn(
     batch: Sequence[_T],
     padding_values: dict[str, int],
@@ -105,7 +108,7 @@ def bert_padding_collate_fn(
         "text": padding_value,
         "types": 0,
         "attention_mask": False,
-        "labels": -100,  # This should match the masked value used in the MLM loss mask.
+        "labels": MLM_LOSS_IGNORE_INDEX,  # This should match the masked value used in the MLM loss mask.
         "loss_mask": False,
         "is_random": 0,
     }
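
Note (illustrative, not part of the patch): the diff's comments tie the new MLM_LOSS_IGNORE_INDEX constant to the value masked out of the MLM loss; -100 is also the default ignore_index of PyTorch's cross-entropy loss, so label positions padded with it (such as the cls/eos slots above) contribute nothing to a token-classification loss computed that way. A minimal sketch under that assumption; the 3-class "CHE" setup and variable names are made up for illustration and are not taken from the repository:

    import torch

    MLM_LOSS_IGNORE_INDEX = -100  # mirrors the constant added in bionemo.llm.data.collate

    logits = torch.randn(5, 3)  # 5 token positions, 3 secondary-structure classes (C, H, E)
    labels = torch.tensor([MLM_LOSS_IGNORE_INDEX, 0, 2, 1, MLM_LOSS_IGNORE_INDEX])  # cls ... eos

    # cross_entropy skips targets equal to ignore_index (default -100),
    # so the cls/eos positions drop out of the mean loss entirely.
    loss = torch.nn.functional.cross_entropy(logits, labels)
    loss_real_tokens_only = torch.nn.functional.cross_entropy(logits[1:4], labels[1:4])
    assert torch.allclose(loss, loss_real_tokens_only)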