From a1db32df6b209d99c5bb5412c3e1a28f039c8e6b Mon Sep 17 00:00:00 2001
From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com>
Date: Tue, 15 Oct 2024 12:10:27 +0200
Subject: [PATCH] [`docs`] Update the training snippets for some losses that should use the v3 Trainer (#2987)

---
 .../losses/Matryoshka2dLoss.py    | 26 ++++++++-------
 .../losses/MatryoshkaLoss.py      | 26 ++++++++-------
 .../losses/MegaBatchMarginLoss.py | 33 +++++++++++--------
 3 files changed, 47 insertions(+), 38 deletions(-)

diff --git a/sentence_transformers/losses/Matryoshka2dLoss.py b/sentence_transformers/losses/Matryoshka2dLoss.py
index 4b77b9c74..7c85884d5 100644
--- a/sentence_transformers/losses/Matryoshka2dLoss.py
+++ b/sentence_transformers/losses/Matryoshka2dLoss.py
@@ -95,21 +95,23 @@ def __init__(
         Example:
             ::
 
-                from sentence_transformers import SentenceTransformer, losses, InputExample
-                from torch.utils.data import DataLoader
+                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
+                from datasets import Dataset
 
                 model = SentenceTransformer("microsoft/mpnet-base")
-                train_examples = [
-                    InputExample(texts=['Anchor 1', 'Positive 1']),
-                    InputExample(texts=['Anchor 2', 'Positive 2']),
-                ]
-                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
-                train_loss = losses.MultipleNegativesRankingLoss(model=model)
-                train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64])
-                model.fit(
-                    [(train_dataloader, train_loss)],
-                    epochs=10,
+                train_dataset = Dataset.from_dict({
+                    "anchor": ["It's nice weather outside today.", "He drove to work."],
+                    "positive": ["It's so sunny.", "He took the car to the office."],
+                })
+                loss = losses.MultipleNegativesRankingLoss(model)
+                loss = losses.Matryoshka2dLoss(model, loss, [768, 512, 256, 128, 64])
+
+                trainer = SentenceTransformerTrainer(
+                    model=model,
+                    train_dataset=train_dataset,
+                    loss=loss,
                 )
+                trainer.train()
         """
         matryoshka_loss = MatryoshkaLoss(
             model,
diff --git a/sentence_transformers/losses/MatryoshkaLoss.py b/sentence_transformers/losses/MatryoshkaLoss.py
index e6a18aac0..997e7be0b 100644
--- a/sentence_transformers/losses/MatryoshkaLoss.py
+++ b/sentence_transformers/losses/MatryoshkaLoss.py
@@ -101,21 +101,23 @@ def __init__(
         Example:
             ::
 
-                from sentence_transformers import SentenceTransformer, losses, InputExample
-                from torch.utils.data import DataLoader
+                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
+                from datasets import Dataset
 
                 model = SentenceTransformer("microsoft/mpnet-base")
-                train_examples = [
-                    InputExample(texts=['Anchor 1', 'Positive 1']),
-                    InputExample(texts=['Anchor 2', 'Positive 2']),
-                ]
-                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
-                train_loss = losses.MultipleNegativesRankingLoss(model=model)
-                train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
-                model.fit(
-                    [(train_dataloader, train_loss)],
-                    epochs=10,
+                train_dataset = Dataset.from_dict({
+                    "anchor": ["It's nice weather outside today.", "He drove to work."],
+                    "positive": ["It's so sunny.", "He took the car to the office."],
+                })
+                loss = losses.MultipleNegativesRankingLoss(model)
+                loss = losses.MatryoshkaLoss(model, loss, [768, 512, 256, 128, 64])
+
+                trainer = SentenceTransformerTrainer(
+                    model=model,
+                    train_dataset=train_dataset,
+                    loss=loss,
                 )
+                trainer.train()
         """
         super().__init__()
         self.model = model
diff --git a/sentence_transformers/losses/MegaBatchMarginLoss.py b/sentence_transformers/losses/MegaBatchMarginLoss.py
index a964eb726..22dbbe5ea 100644
--- a/sentence_transformers/losses/MegaBatchMarginLoss.py
+++ b/sentence_transformers/losses/MegaBatchMarginLoss.py
@@ -59,25 +59,30 @@ def __init__(
         Example:
             ::
 
-                from sentence_transformers import SentenceTransformer, InputExample, losses
-                from torch.utils.data import DataLoader
+                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer, losses
+                from datasets import Dataset
 
-                model = SentenceTransformer('all-MiniLM-L6-v2')
-
-                total_examples = 500
                 train_batch_size = 250
                 train_mini_batch_size = 32
 
-                train_examples = [
-                    InputExample(texts=[f"This is sentence number {i}", f"This is sentence number {i+1}"]) for i in range(total_examples)
-                ]
-                train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
-                train_loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)
-
-                model.fit(
-                    [(train_dataloader, train_loss)],
-                    epochs=10,
+                model = SentenceTransformer('all-MiniLM-L6-v2')
+                train_dataset = Dataset.from_dict({
+                    "anchor": [f"This is sentence number {i}" for i in range(500)],
+                    "positive": [f"This is sentence number {i}" for i in range(1, 501)],
+                })
+                loss = losses.MegaBatchMarginLoss(model=model, mini_batch_size=train_mini_batch_size)
+
+                args = SentenceTransformerTrainingArguments(
+                    output_dir="output",
+                    per_device_train_batch_size=train_batch_size,
+                )
+                trainer = SentenceTransformerTrainer(
+                    model=model,
+                    args=args,
+                    train_dataset=train_dataset,
+                    loss=loss,
                 )
+                trainer.train()
         """
         super().__init__()
         self.model = model
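
Note (not part of the patch): the updated v3-style docstring snippets above can be exercised end-to-end as a quick sanity check. The sketch below follows the new MatryoshkaLoss example: the model name, dataset, and loss wrapping are taken from the diff, while the training arguments (an assumed "output" directory, a single epoch) and the final truncation check are illustrative additions, not something this PR adds:

    from datasets import Dataset
    from sentence_transformers import (
        SentenceTransformer,
        SentenceTransformerTrainer,
        SentenceTransformerTrainingArguments,
        losses,
    )

    # Model, data, and loss taken from the updated MatryoshkaLoss docstring in this patch.
    model = SentenceTransformer("microsoft/mpnet-base")
    train_dataset = Dataset.from_dict({
        "anchor": ["It's nice weather outside today.", "He drove to work."],
        "positive": ["It's so sunny.", "He took the car to the office."],
    })
    loss = losses.MultipleNegativesRankingLoss(model)
    loss = losses.MatryoshkaLoss(model, loss, [768, 512, 256, 128, 64])

    # Assumed training arguments; the docstring omits them and relies on the Trainer defaults.
    args = SentenceTransformerTrainingArguments(output_dir="output", num_train_epochs=1)

    trainer = SentenceTransformerTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        loss=loss,
    )
    trainer.train()

    # Illustrative check: Matryoshka-trained embeddings remain usable when truncated
    # to a smaller dimensionality (here the first 256 of 768 dimensions).
    embeddings = model.encode(["It's nice weather outside today.", "It's so sunny."])
    truncated = embeddings[:, :256]
    print(truncated.shape)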