Skip to content

Commit

Permalink
multi-GPU support for mine_hard_negatives
Browse files Browse the repository at this point in the history
Added support for multi-GPU encoding in sentence embeddings with model.encode_multi_process
  • Loading branch information
alperctnkaya authored Sep 29, 2024
1 parent 73c8dc3 commit 44ed026
Showing 1 changed file with 32 additions and 12 deletions.
44 changes: 32 additions & 12 deletions sentence_transformers/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,7 @@ def mine_hard_negatives(
faiss_batch_size: int = 16384,
use_faiss: bool = False,
verbose: bool = True,
    use_multiple_gpus: bool = True,
) -> Dataset:
"""
Add hard negatives to a dataset of (anchor, positive) pairs to create (anchor, positive, negative) triplets or
Expand Down Expand Up @@ -714,12 +715,21 @@ def mine_hard_negatives(
except Exception:
pass

corpus_embeddings = model.encode(
corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
)
query_embeddings = model.encode(
queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
)
if use_multiple_gpus:
pool = model.start_multi_process_pool()

corpus_embeddings = model.encode_multi_process(
corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)

query_embeddings = model.encode_multi_process(
queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
        else:
corpus_embeddings = model.encode(
corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
)
query_embeddings = model.encode(
queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
)
index.add(corpus_embeddings)

scores_list = []
Expand All @@ -735,12 +745,22 @@ def mine_hard_negatives(

else:
# Embed the corpus and the queries
corpus_embeddings = model.encode(
corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
)
query_embeddings = model.encode(
queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
)

if use_multiple_gpus:
pool = model.start_multi_process_pool()

corpus_embeddings = model.encode_multi_process(
corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)

query_embeddings = model.encode_multi_process(
queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
        else:
corpus_embeddings = model.encode(
corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
)
query_embeddings = model.encode(
queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
)
scores = model.similarity(query_embeddings, corpus_embeddings).to(device)

# Keep only the range_max + max_positives highest scores. We offset by 1 to potentially include the positive pair
Expand Down

0 comments on commit 44ed026

Please sign in to comment.