From 856211d4128357cb17daac223d78ca510b40931b Mon Sep 17 00:00:00 2001
From: "ashok.b"
Date: Fri, 17 Nov 2023 14:45:02 +0530
Subject: [PATCH] modified masking before pooling

---
 .gitignore | 4 +
 InstructorEmbedding/instructor.py | 651 ++++++++++--------
 evaluation/MTEB/examples/evaluate_model.py | 4 +-
 .../MTEB/mteb/abstasks/AbsTaskRetrieval.py | 33 +-
 evaluation/MTEB/setup.py | 2 +
 requirements.txt | 1 +
 train.py | 49 +-
 7 files changed, 414 insertions(+), 330 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9f11b75..c86c82f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,5 @@
 .idea/
+/cache
+/evaluation/MTEB/mteb.egg-info
+/**/__pycache__
+/InstructorEmbedding.egg-info
diff --git a/InstructorEmbedding/instructor.py b/InstructorEmbedding/instructor.py
index b889ce0..0ba795c 100644
--- a/InstructorEmbedding/instructor.py
+++ b/InstructorEmbedding/instructor.py
@@ -1,29 +1,31 @@
 # This script is based on the modifications from https://github.com/UKPLab/sentence-transformers
-import torch
-import os
-import json
 import importlib
+import json
+import os
+from collections import OrderedDict
+from typing import Union
+
 import numpy as np
-from tqdm.autonotebook import trange
-from torch import Tensor, device
+import torch
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.models import Transformer
-from transformers import AutoConfig
-from transformers import AutoTokenizer
-from collections import OrderedDict
-from torch import nn
+from torch import Tensor, nn
+from tqdm.autonotebook import trange
+from transformers import AutoConfig, AutoTokenizer
 
-def batch_to_device(batch, target_device: device):
+
+def batch_to_device(batch, target_device: str):
     for key in batch:
         if isinstance(batch[key], Tensor):
             batch[key] = batch[key].to(target_device)
     return batch
 
-class INSTRUCTOR_Pooling(nn.Module):
+class InstructorPooling(nn.Module):
     """Performs pooling (max or mean) on the token embeddings.
-    Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model.
+    Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding.
+    This layer also allows to use the CLS token if it is returned by the underlying word embedding model.
     You can concatenate multiple poolings together.
 
     :param word_embedding_dimension: Dimensions for the word embeddings
     :param pooling_mode: Can be a string: mean/max/cls. If set, overwrites the other pooling_mode_* settings
     :param pooling_mode_cls_token: Use the first token (CLS token) as text representations
@@ -32,35 +34,43 @@ class INSTRUCTOR_Pooling(nn.Module):
     :param pooling_mode_max_tokens: Use max in each dimension over all tokens.
     :param pooling_mode_mean_tokens: Perform mean-pooling
     :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but divide by sqrt(input_length).
- :param pooling_mode_weightedmean_tokens: Perform (position) weighted mean pooling, see https://arxiv.org/abs/2202.08904 - :param pooling_mode_lasttoken: Perform last token pooling, see https://arxiv.org/abs/2202.08904 & https://arxiv.org/abs/2201.10005 + :param pooling_mode_weightedmean_tokens: Perform (position) weighted mean pooling, + see https://arxiv.org/abs/2202.08904 + :param pooling_mode_lasttoken: Perform last token pooling, + see https://arxiv.org/abs/2202.08904 & https://arxiv.org/abs/2201.10005 """ - def __init__(self, - word_embedding_dimension: int, - pooling_mode: str = None, - pooling_mode_cls_token: bool = False, - pooling_mode_max_tokens: bool = False, - pooling_mode_mean_tokens: bool = True, - pooling_mode_mean_sqrt_len_tokens: bool = False, - pooling_mode_weightedmean_tokens: bool = False, - pooling_mode_lasttoken: bool = False, - ): - super(INSTRUCTOR_Pooling, self).__init__() - - self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', - 'pooling_mode_max_tokens', - 'pooling_mode_mean_sqrt_len_tokens', 'pooling_mode_weightedmean_tokens', - 'pooling_mode_lasttoken'] + def __init__( + self, + word_embedding_dimension: int, + pooling_mode: Union[str, None] = None, + pooling_mode_cls_token: bool = False, + pooling_mode_max_tokens: bool = False, + pooling_mode_mean_tokens: bool = True, + pooling_mode_mean_sqrt_len_tokens: bool = False, + pooling_mode_weightedmean_tokens: bool = False, + pooling_mode_lasttoken: bool = False, + ): + super().__init__() + + self.config_keys = [ + "word_embedding_dimension", + "pooling_mode_cls_token", + "pooling_mode_mean_tokens", + "pooling_mode_max_tokens", + "pooling_mode_mean_sqrt_len_tokens", + "pooling_mode_weightedmean_tokens", + "pooling_mode_lasttoken", + ] if pooling_mode is not None: # Set pooling mode by string pooling_mode = pooling_mode.lower() - assert pooling_mode in ['mean', 'max', 'cls', 'weightedmean', 'lasttoken'] - pooling_mode_cls_token = (pooling_mode == 'cls') - pooling_mode_max_tokens = (pooling_mode == 'max') - pooling_mode_mean_tokens = (pooling_mode == 'mean') - pooling_mode_weightedmean_tokens = (pooling_mode == 'weightedmean') - pooling_mode_lasttoken = (pooling_mode == 'lasttoken') + assert pooling_mode in ["mean", "max", "cls", "weightedmean", "lasttoken"] + pooling_mode_cls_token = pooling_mode == "cls" + pooling_mode_max_tokens = pooling_mode == "max" + pooling_mode_mean_tokens = pooling_mode == "mean" + pooling_mode_weightedmean_tokens = pooling_mode == "weightedmean" + pooling_mode_lasttoken = pooling_mode == "lasttoken" self.word_embedding_dimension = word_embedding_dimension self.pooling_mode_cls_token = pooling_mode_cls_token @@ -70,13 +80,22 @@ def __init__(self, self.pooling_mode_weightedmean_tokens = pooling_mode_weightedmean_tokens self.pooling_mode_lasttoken = pooling_mode_lasttoken - pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, - pooling_mode_mean_sqrt_len_tokens, pooling_mode_weightedmean_tokens, - pooling_mode_lasttoken]) - self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) + pooling_mode_multiplier = sum( + [ + pooling_mode_cls_token, + pooling_mode_max_tokens, + pooling_mode_mean_tokens, + pooling_mode_mean_sqrt_len_tokens, + pooling_mode_weightedmean_tokens, + pooling_mode_lasttoken, + ] + ) + self.pooling_output_dimension = ( + pooling_mode_multiplier * word_embedding_dimension + ) def __repr__(self): - return "Pooling({})".format(self.get_config_dict()) 
+ return f"Pooling({self.get_config_dict()})" def get_pooling_mode_str(self) -> str: """ @@ -84,42 +103,54 @@ def get_pooling_mode_str(self) -> str: """ modes = [] if self.pooling_mode_cls_token: - modes.append('cls') + modes.append("cls") if self.pooling_mode_mean_tokens: - modes.append('mean') + modes.append("mean") if self.pooling_mode_max_tokens: - modes.append('max') + modes.append("max") if self.pooling_mode_mean_sqrt_len_tokens: - modes.append('mean_sqrt_len_tokens') + modes.append("mean_sqrt_len_tokens") if self.pooling_mode_weightedmean_tokens: - modes.append('weightedmean') + modes.append("weightedmean") if self.pooling_mode_lasttoken: - modes.append('lasttoken') + modes.append("lasttoken") return "+".join(modes) def forward(self, features): # print(features.keys()) - token_embeddings = features['token_embeddings'] - attention_mask = features['attention_mask'] + token_embeddings = features["token_embeddings"] + attention_mask = features["attention_mask"] ## Pooling strategy output_vectors = [] if self.pooling_mode_cls_token: - cls_token = features.get('cls_token_embeddings', token_embeddings[:, 0]) # Take first token by default + cls_token = features.get( + "cls_token_embeddings", token_embeddings[:, 0] + ) # Take first token by default output_vectors.append(cls_token) if self.pooling_mode_max_tokens: - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) + token_embeddings[ + input_mask_expanded == 0 + ] = -1e9 # Set padding tokens to large negative value max_over_time = torch.max(token_embeddings, 1)[0] output_vectors.append(max_over_time) if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) # If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present - if 'token_weights_sum' in features: - sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) + if "token_weights_sum" in features: + sum_mask = ( + features["token_weights_sum"] + .unsqueeze(-1) + .expand(sum_embeddings.size()) + ) else: sum_mask = input_mask_expanded.sum(1) @@ -130,14 +161,17 @@ def forward(self, features): if self.pooling_mode_mean_sqrt_len_tokens: output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) if self.pooling_mode_weightedmean_tokens: - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) # token_embeddings shape: bs, seq, hidden_dim weights = ( torch.arange(start=1, end=token_embeddings.shape[1] + 1) - .unsqueeze(0) - .unsqueeze(-1) - .expand(token_embeddings.size()) - .float().to(token_embeddings.device) + .unsqueeze(0) + .unsqueeze(-1) + .expand(token_embeddings.size()) + .float() + .to(token_embeddings.device) ) assert weights.shape == token_embeddings.shape == input_mask_expanded.shape input_mask_expanded = input_mask_expanded * weights @@ -145,19 +179,26 @@ def forward(self, features): sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) # If tokens are weighted 
(by WordWeights layer), feature 'token_weights_sum' will be present - if 'token_weights_sum' in features: - sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) + if "token_weights_sum" in features: + sum_mask = ( + features["token_weights_sum"] + .unsqueeze(-1) + .expand(sum_embeddings.size()) + ) else: sum_mask = input_mask_expanded.sum(1) sum_mask = torch.clamp(sum_mask, min=1e-9) output_vectors.append(sum_embeddings / sum_mask) if self.pooling_mode_lasttoken: - bs, seq_len, hidden_dim = token_embeddings.shape + batch_size, _, hidden_dim = token_embeddings.shape # attention_mask shape: (bs, seq_len) # Get shape [bs] indices of the last token (i.e. the last token for each batch item) - # argmin gives us the index of the first 0 in the attention mask; We get the last 1 index by subtracting 1 - gather_indices = torch.argmin(attention_mask, 1, keepdim=False) - 1 # Shape [bs] + # argmin gives us the index of the first 0 in the attention mask; + # We get the last 1 index by subtracting 1 + gather_indices = ( + torch.argmin(attention_mask, 1, keepdim=False) - 1 + ) # Shape [bs] # There are empty sequences, where the index would become -1 which will crash gather_indices = torch.clamp(gather_indices, min=0) @@ -165,18 +206,22 @@ def forward(self, features): # Turn indices from shape [bs] --> [bs, 1, hidden_dim] gather_indices = gather_indices.unsqueeze(-1).repeat(1, hidden_dim) gather_indices = gather_indices.unsqueeze(1) - assert gather_indices.shape == (bs, 1, hidden_dim) + assert gather_indices.shape == (batch_size, 1, hidden_dim) # Gather along the 1st dim (seq_len) (bs, seq_len, hidden_dim -> bs, hidden_dim) # Actually no need for the attention mask as we gather the last token where attn_mask = 1 # but as we set some indices (which shouldn't be attended to) to 0 with clamp, we # use the attention mask to ignore them again - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - embedding = torch.gather(token_embeddings * input_mask_expanded, 1, gather_indices).squeeze(dim=1) + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) + embedding = torch.gather( + token_embeddings * input_mask_expanded, 1, gather_indices + ).squeeze(dim=1) output_vectors.append(embedding) output_vector = torch.cat(output_vectors, 1) - features.update({'sentence_embedding': output_vector}) + features.update({"sentence_embedding": output_vector}) return features def get_sentence_embedding_dimension(self): @@ -186,15 +231,20 @@ def get_config_dict(self): return {key: self.__dict__[key] for key in self.config_keys} def save(self, output_path): - with open(os.path.join(output_path, 'config.json'), 'w') as fOut: - json.dump(self.get_config_dict(), fOut, indent=2) + with open( + os.path.join(output_path, "config.json"), "w", encoding="UTF-8" + ) as config_file: + json.dump(self.get_config_dict(), config_file, indent=2) @staticmethod def load(input_path): - with open(os.path.join(input_path, 'config.json')) as fIn: - config = json.load(fIn) + with open( + os.path.join(input_path, "config.json"), encoding="UTF-8" + ) as config_file: + config = json.load(config_file) + + return InstructorPooling(**config) - return INSTRUCTOR_Pooling(**config) def import_from_string(dotted_path): """ @@ -202,9 +252,9 @@ def import_from_string(dotted_path): last name in the path. Raise ImportError if the import failed. 
""" try: - module_path, class_name = dotted_path.rsplit('.', 1) + module_path, class_name = dotted_path.rsplit(".", 1) except ValueError: - msg = "%s doesn't look like a module path" % dotted_path + msg = f"{dotted_path} doesn't look like a module path" raise ImportError(msg) try: @@ -215,94 +265,118 @@ def import_from_string(dotted_path): try: return getattr(module, class_name) except AttributeError: - msg = 'Module "%s" does not define a "%s" attribute/class' % (module_path, class_name) + msg = f"Module {module_path} does not define a {class_name} attribute/class" raise ImportError(msg) -class INSTRUCTOR_Transformer(Transformer): - def __init__(self, model_name_or_path: str, max_seq_length = None, - model_args = {}, cache_dir = None, - tokenizer_args = {}, do_lower_case: bool = False, - tokenizer_name_or_path : str = None): - super(Transformer, self).__init__() - self.config_keys = ['max_seq_length', 'do_lower_case'] +class InstructorTransformer(Transformer): + def __init__( + self, + model_name_or_path: str, + max_seq_length=None, + model_args=None, + cache_dir=None, + tokenizer_args=None, + do_lower_case: bool = False, + tokenizer_name_or_path: Union[str, None] = None, + load_model: bool = True, + ): + super().__init__(model_name_or_path) + if model_args is None: + model_args = {} + if tokenizer_args is None: + tokenizer_args = {} + self.config_keys = ["max_seq_length", "do_lower_case"] self.do_lower_case = do_lower_case - self.model_name_or_path = model_name_or_path - if model_name_or_path=='bi-contriever': + if model_name_or_path == "bi-contriever": model_name_or_path = "facebook/contriever" - if model_name_or_path.startswith('bigtr'): - model_name_or_path = model_name_or_path.split('#')[1] - if 'bigtr' in model_name_or_path and os.path.isdir(model_name_or_path): - config = AutoConfig.from_pretrained(os.path.join(model_name_or_path,'with_prompt'), **model_args, cache_dir=cache_dir) + if model_name_or_path.startswith("bigtr"): + model_name_or_path = model_name_or_path.split("#")[1] + if "bigtr" in model_name_or_path and os.path.isdir(model_name_or_path): + config = AutoConfig.from_pretrained( + os.path.join(model_name_or_path, "with_prompt"), + **model_args, + cache_dir=cache_dir, + ) else: - config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir) - self._load_model(self.model_name_or_path, config, cache_dir, **model_args) + config = AutoConfig.from_pretrained( + model_name_or_path, **model_args, cache_dir=cache_dir + ) - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path if tokenizer_name_or_path is not None else model_name_or_path, cache_dir=cache_dir, **tokenizer_args) + if load_model: + self._load_model(self.model_name_or_path, config, cache_dir, **model_args) + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path + if tokenizer_name_or_path is not None + else model_name_or_path, + cache_dir=cache_dir, + **tokenizer_args, + ) - #No max_seq_length set. 
Try to infer from model - # print('max_seq_length ', max_seq_length) if max_seq_length is None: - if hasattr(self.auto_model, "config") and hasattr(self.auto_model.config, "max_position_embeddings") and hasattr(self.tokenizer, "model_max_length"): - max_seq_length = min(self.auto_model.config.max_position_embeddings, self.tokenizer.model_max_length) + if ( + hasattr(self.auto_model, "config") + and hasattr(self.auto_model.config, "max_position_embeddings") + and hasattr(self.tokenizer, "model_max_length") + ): + max_seq_length = min( + self.auto_model.config.max_position_embeddings, + self.tokenizer.model_max_length, + ) self.max_seq_length = max_seq_length - - print('max_seq_length ',max_seq_length) - if tokenizer_name_or_path is not None: self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ def forward(self, features): - """Returns token_embeddings, cls_token""" - # print(features) - # exit(0) - trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']} - if 'token_type_ids' in features: - trans_features['token_type_ids'] = features['token_type_ids'] - - context_masks = None - if 'context_masks' in features: - context_masks = features['context_masks'] - output_states = self.auto_model(**trans_features, return_dict=False) + input_features = { + "input_ids": features["input_ids"], + "attention_mask": features["attention_mask"], + } + if "token_type_ids" in features: + input_features["token_type_ids"] = features["token_type_ids"] + + instruction_mask = features["instruction_mask"] + output_states = self.auto_model(**input_features, return_dict=False) output_tokens = output_states[0] - attention_mask = features['attention_mask'] - if context_masks is not None: - import torch - assert len(context_masks) == len(attention_mask) - n = len(attention_mask) - # print('n ',n) - for local_idx in range(n): - assert torch.sum(attention_mask[local_idx]).item() >= context_masks[local_idx].item(),\ - f'{attention_mask[local_idx]}, {context_masks[local_idx]}, ' \ - f'{torch.sum(attention_mask[local_idx]).item()}, {context_masks[local_idx].item()}' - attention_mask[local_idx][:context_masks[local_idx]] = 0 - - # print('forward here') - features.update({'token_embeddings': output_tokens, 'attention_mask': attention_mask}) + attention_mask = features["attention_mask"] + instruction_mask = features["instruction_mask"] + attention_mask = attention_mask * instruction_mask + features.update( + {"token_embeddings": output_tokens, "attention_mask": attention_mask} + ) if self.auto_model.config.output_hidden_states: all_layer_idx = 2 - if len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states + if ( + len(output_states) < 3 + ): # Some models only output last_hidden_states and all_hidden_states all_layer_idx = 1 - hidden_states = output_states[all_layer_idx] - features.update({'all_layer_embeddings': hidden_states}) + features.update({"all_layer_embeddings": hidden_states}) return features @staticmethod def load(input_path: str): - #Old classes used other config names than 'sentence_bert_config.json' - for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json', 'sentence_distilbert_config.json', 'sentence_camembert_config.json', 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json', 'sentence_xlnet_config.json']: + # Old classes used other config names than 'sentence_bert_config.json' + for config_name in [ + "sentence_bert_config.json", + "sentence_roberta_config.json", + 
"sentence_distilbert_config.json", + "sentence_camembert_config.json", + "sentence_albert_config.json", + "sentence_xlm-roberta_config.json", + "sentence_xlnet_config.json", + ]: sbert_config_path = os.path.join(input_path, config_name) if os.path.exists(sbert_config_path): break - with open(sbert_config_path) as fIn: - config = json.load(fIn) - return INSTRUCTOR_Transformer(model_name_or_path=input_path, **config) + with open(sbert_config_path, encoding="UTF-8") as config_file: + config = json.load(config_file) + return InstructorTransformer(model_name_or_path=input_path, **config) def tokenize(self, texts): """ @@ -311,80 +385,97 @@ def tokenize(self, texts): output = {} if isinstance(texts[0], str): to_tokenize = [texts] - to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize] # Lowercase if self.do_lower_case: to_tokenize = [[s.lower() for s in col] for col in to_tokenize] - tokenized = self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length) + input_features = self.tokenizer( + *to_tokenize, + padding="max_length", + truncation="longest_first", + return_tensors="pt", + max_length=self.max_seq_length, + ) - # elif isinstance(texts[0], dict): - # to_tokenize = [] - # output['text_keys'] = [] - # for lookup in texts: - # text_key, text = next(iter(lookup.items())) - # to_tokenize.append(text) - # output['text_keys'].append(text_key) - # to_tokenize = [to_tokenize] elif isinstance(texts[0], list): - import torch - assert isinstance(texts[0][1],str) - new_texts = [] - for s in texts: + assert isinstance(texts[0][1], str) + assert ( + len(texts[0]) == 2 + ), "The input should have both instruction and input text" + + instructions = [] + instruction_prepended_input_texts = [] + for pair in texts: + instruction = pair[0].strip() + text = pair[1].strip() if self.do_lower_case: - new_texts.append([s[0],s[1].strip().lower()]) - else: - new_texts.append([s[0], s[1].strip()]) - texts = new_texts - assert len(texts[0])==2,f'The input should have both instruction and input text' - # if len(texts[0])==3: - # print('component 3') - num = len(texts) - contexts = [] - concatenated_input_texts = [] - for local_idx in range(num): - assert len(texts[local_idx])==2 - contexts.append(texts[local_idx][0]) - concatenated_input_texts.append(''.join(texts[local_idx])) - assert isinstance(contexts[-1],str) - assert isinstance(concatenated_input_texts[-1],str) - tokenized = self.tokenize(concatenated_input_texts) - context_tok = self.tokenize(contexts) - tokenized['context_masks'] = torch.sum(context_tok['attention_mask'],dim=1) - tokenized['context_masks'] = tokenized['context_masks']-1 - for my_idx in range(len(tokenized['context_masks'])): - if tokenized['context_masks'][my_idx]<=1: - tokenized['context_masks'][my_idx] = 0 - # text_types = [pair[-1] for pair in texts] - # print(text_types) - # assert all([tid==1 for tid in text_types]) or all([tid==0 for tid in text_types]) - # tokenized['text_type'] = text_types[0] - # torch.set_printoptions(edgeitems=15) - # print(tokenized) - # exit(0) - # elif len(texts[0])==2: - # # print('component 2') - # input_texts = [pair[0] for pair in texts] - # text_types = [pair[-1] for pair in texts] - # assert all([tid == 1 for tid in text_types]) or all([tid == 0 for tid in text_types]) - # tokenized = self.tokenize(input_texts) - # tokenized['text_type'] = text_types[0] - # else: - # raise ValueError('tokenization error') + instruction = instruction.lower() + text = text.lower() + 
instructions.append(instruction) + instruction_prepended_input_texts.append("".join([instruction, text])) + + input_features = self.tokenize(instruction_prepended_input_texts) + instruction_features = self.tokenize(instructions) + input_features = Instructor.prepare_input_features( + input_features, instruction_features + ) else: - raise ValueError('not support other modes') - # batch1, batch2 = [], [] - # for text_tuple in texts: - # batch1.append(text_tuple[0]) - # batch2.append(text_tuple[1]) - # to_tokenize = [batch1, batch2] - - output.update(tokenized) + raise ValueError("not support other modes") + + output.update(input_features) return output -class INSTRUCTOR(SentenceTransformer): + +class Instructor(SentenceTransformer): + @staticmethod + def prepare_input_features( + input_features, instruction_features, return_data_type: str = "pt" + ): + if return_data_type == "np": + input_features["attention_mask"] = torch.from_numpy( + input_features["attention_mask"] + ) + instruction_features["attention_mask"] = torch.from_numpy( + instruction_features["attention_mask"] + ) + + input_attention_mask_shape = input_features["attention_mask"].shape + instruction_attention_mask = instruction_features["attention_mask"] + + # reducing the attention length by 1 in order to omit the attention corresponding to the end_token + instruction_attention_mask = instruction_attention_mask[:, 1:] + + # creating instruction attention matrix equivalent to the size of the input attention matrix + expanded_instruction_attention_mask = torch.zeros( + input_attention_mask_shape, dtype=torch.int64 + ) + # assigning the the actual instruction attention matrix to the expanded_instruction_attention_mask + # eg: + # instruction_attention_mask: 3x3 + # [[1,1,1], + # [1,1,0], + # [1,0,0]] + # expanded_instruction_attention_mask: 3x4 + # [[1,1,1,0], + # [1,1,0,0], + # [1,0,0,0]] + expanded_instruction_attention_mask[ + : instruction_attention_mask.size(0), : instruction_attention_mask.size(1) + ] = instruction_attention_mask + + # In the pooling layer we want to consider only the tokens corresponding to the input text + # and not the instruction. This is achieved by inverting the + # attention_mask corresponding to the instruction. 
+ expanded_instruction_attention_mask = 1 - expanded_instruction_attention_mask + input_features["instruction_mask"] = expanded_instruction_attention_mask + if return_data_type == "np": + input_features["attention_mask"] = input_features["attention_mask"].numpy() + instruction_features["attention_mask"] = instruction_features[ + "attention_mask" + ].numpy() + return input_features def smart_batching_collate(self, batch): num_texts = len(batch[0].texts) @@ -394,109 +485,107 @@ def smart_batching_collate(self, batch): for example in batch: for idx, text in enumerate(example.texts): texts[idx].append(text) - labels.append(example.label) labels = torch.tensor(labels) + batched_input_features = [] - - sentence_features = [] for idx in range(num_texts): assert isinstance(texts[idx][0], list) - assert len(texts[idx][0])==2,f"The input should have both instruction and input text" - # if len(texts[idx][0])==3: - # print('component 3') + assert ( + len(texts[idx][0]) == 2 + ), "The input should have both instruction and input text" + num = len(texts[idx]) - contexts = [] - concatenated_input_texts = [] + instructions = [] + instruction_prepended_input_texts = [] for local_idx in range(num): - assert len(texts[idx][local_idx])==2 - contexts.append(texts[idx][local_idx][0]) - concatenated_input_texts.append(''.join(texts[idx][local_idx])) - assert isinstance(contexts[-1],str) - assert isinstance(concatenated_input_texts[-1],str) - tokenized = self.tokenize(concatenated_input_texts) - context_tok = self.tokenize(contexts) - tokenized['context_masks'] = torch.sum(context_tok['attention_mask'],dim=1) - tokenized['context_masks'] = tokenized['context_masks'] - 1 - for my_idx in range(len(tokenized['context_masks'])): - if tokenized['context_masks'][my_idx]<=1: - tokenized['context_masks'][my_idx] = 0 - # text_types = [pair[-1] for pair in texts[idx]] - # assert all([tid==1 for tid in text_types]) or all([tid==0 for tid in text_types]) - # tokenized['text_type'] = text_types[0] - # elif len(texts[idx][0])==2: - # input_texts = [pair[0] for pair in texts[idx]] - # text_types = [pair[-1] for pair in texts[idx]] - # assert all([tid == 1 for tid in text_types]) or all([tid == 0 for tid in text_types]) - # tokenized = self.tokenize(input_texts) - # tokenized['text_type'] = text_types[0] - # else: - # raise ValueError('tokenization error') - sentence_features.append(tokenized) - - return sentence_features, labels + assert len(texts[idx][local_idx]) == 2 + instructions.append(texts[idx][local_idx][0]) + instruction_prepended_input_texts.append("".join(texts[idx][local_idx])) + assert isinstance(instructions[-1], str) + assert isinstance(instruction_prepended_input_texts[-1], str) + + input_features = self.tokenize(instruction_prepended_input_texts) + instruction_features = self.tokenize(instructions) + input_features = Instructor.prepare_input_features( + input_features, instruction_features + ) + batched_input_features.append(input_features) + + return batched_input_features, labels def _load_sbert_model(self, model_path): """ Loads a full sentence-transformers model """ # Check if the config_sentence_transformers.json file exists (exists since v2 of the framework) - config_sentence_transformers_json_path = os.path.join(model_path, 'config_sentence_transformers.json') + config_sentence_transformers_json_path = os.path.join( + model_path, "config_sentence_transformers.json" + ) if os.path.exists(config_sentence_transformers_json_path): - with open(config_sentence_transformers_json_path) as fIn: - 
self._model_config = json.load(fIn) + with open( + config_sentence_transformers_json_path, encoding="UTF-8" + ) as config_file: + self._model_config = json.load(config_file) # Check if a readme exists - model_card_path = os.path.join(model_path, 'README.md') + model_card_path = os.path.join(model_path, "README.md") if os.path.exists(model_card_path): try: - with open(model_card_path, encoding='utf8') as fIn: - self._model_card_text = fIn.read() + with open(model_card_path, encoding="utf8") as config_file: + self._model_card_text = config_file.read() except: pass # Load the modules of sentence transformer - modules_json_path = os.path.join(model_path, 'modules.json') - with open(modules_json_path) as fIn: - modules_config = json.load(fIn) + modules_json_path = os.path.join(model_path, "modules.json") + with open(modules_json_path, encoding="UTF-8") as config_file: + modules_config = json.load(config_file) modules = OrderedDict() for module_config in modules_config: - if module_config['idx']==0: - print('load INSTRUCTOR_Transformer') - module_class = INSTRUCTOR_Transformer - elif module_config['idx']==1: - module_class = INSTRUCTOR_Pooling + if module_config["idx"] == 0: + module_class = InstructorTransformer + elif module_config["idx"] == 1: + module_class = InstructorPooling else: - module_class = import_from_string(module_config['type']) - module = module_class.load(os.path.join(model_path, module_config['path'])) - modules[module_config['name']] = module + module_class = import_from_string(module_config["type"]) + module = module_class.load(os.path.join(model_path, module_config["path"])) + modules[module_config["name"]] = module return modules - def encode(self, sentences, - batch_size: int = 32, - show_progress_bar: bool = None, - output_value: str = 'sentence_embedding', - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - device: str = None, - normalize_embeddings: bool = False): + def encode( + self, + sentences, + batch_size: int = 32, + show_progress_bar: Union[bool, None] = None, + output_value: str = "sentence_embedding", + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + device: Union[str, None] = None, + normalize_embeddings: bool = False, + ): """ Computes sentence embeddings :param sentences: the sentences to embed :param batch_size: the batch size used for the computation :param show_progress_bar: Output a progress bar when encode sentences - :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values - :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. - :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy + :param output_value: Default sentence_embedding, to get sentence embeddings. + Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values + :param convert_to_numpy: If true, the output is a list of numpy vectors. + Else, it is a list of pytorch tensors. + :param convert_to_tensor: If true, you get one large tensor as return. + Overwrites any setting from convert_to_numpy :param device: Which torch.device to use for the computation - :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. 
+ :param normalize_embeddings: If set to true, returned vectors will have length 1. + In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. :return: - By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. + By default, a list of tensors is returned. If convert_to_tensor, + a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. """ self.eval() if show_progress_bar is None: @@ -505,12 +594,14 @@ def encode(self, sentences, if convert_to_tensor: convert_to_numpy = False - if output_value != 'sentence_embedding': + if output_value != "sentence_embedding": convert_to_tensor = False convert_to_numpy = False input_was_string = False - if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1 + if isinstance(sentences, str) or not hasattr( + sentences, "__len__" + ): # Cast an individual sentence to a list with length 1 sentences = [sentences] input_was_string = True @@ -520,41 +611,51 @@ def encode(self, sentences, self.to(device) all_embeddings = [] - if isinstance(sentences[0],list): + if isinstance(sentences[0], list): lengths = [] for sen in sentences: lengths.append(-self._text_length(sen[1])) length_sorted_idx = np.argsort(lengths) else: - length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) + length_sorted_idx = np.argsort( + [-self._text_length(sen) for sen in sentences] + ) sentences_sorted = [sentences[idx] for idx in length_sorted_idx] - for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): - sentences_batch = sentences_sorted[start_index:start_index+batch_size] + for start_index in trange( + 0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar + ): + sentences_batch = sentences_sorted[start_index : start_index + batch_size] features = self.tokenize(sentences_batch) features = batch_to_device(features, device) with torch.no_grad(): out_features = self.forward(features) - if output_value == 'token_embeddings': + if output_value == "token_embeddings": embeddings = [] - for token_emb, attention in zip(out_features[output_value], out_features['attention_mask']): - last_mask_id = len(attention)-1 + for token_emb, attention in zip( + out_features[output_value], out_features["attention_mask"] + ): + last_mask_id = len(attention) - 1 while last_mask_id > 0 and attention[last_mask_id].item() == 0: last_mask_id -= 1 - embeddings.append(token_emb[0:last_mask_id+1]) - elif output_value is None: #Return all outputs + embeddings.append(token_emb[0 : last_mask_id + 1]) + elif output_value is None: # Return all outputs embeddings = [] - for sent_idx in range(len(out_features['sentence_embedding'])): - row = {name: out_features[name][sent_idx] for name in out_features} + for sent_idx in range(len(out_features["sentence_embedding"])): + row = { + name: out_features[name][sent_idx] for name in out_features + } embeddings.append(row) - else: #Sentence embeddings + else: # Sentence embeddings embeddings = out_features[output_value] embeddings = embeddings.detach() if normalize_embeddings: - embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + embeddings = torch.nn.functional.normalize( + embeddings, p=2, dim=1 + ) # fixes for #522 and #487 to avoid oom problems on gpu with large datasets if convert_to_numpy: @@ -572,4 +673,4 @@ def encode(self, sentences, if input_was_string: 
all_embeddings = all_embeddings[0] - return all_embeddings \ No newline at end of file + return all_embeddings diff --git a/evaluation/MTEB/examples/evaluate_model.py b/evaluation/MTEB/examples/evaluate_model.py index 65ce9a4..8960d89 100644 --- a/evaluation/MTEB/examples/evaluate_model.py +++ b/evaluation/MTEB/examples/evaluate_model.py @@ -3,7 +3,7 @@ import logging import argparse from mteb import MTEB -from InstructorEmbedding import INSTRUCTOR +from InstructorEmbedding import Instructor if __name__ == '__main__': logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() @@ -24,7 +24,7 @@ # from functools import partialmethod # # tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) - model = INSTRUCTOR(args.model_name,cache_folder=args.cache_dir) + model = Instructor(args.model_name,cache_folder=args.cache_dir) evaluation = MTEB(tasks=[args.task_name],task_langs=["en"]) evaluation.run(model, output_folder=args.output_dir, eval_splits=[args.split],args=args,) diff --git a/evaluation/MTEB/mteb/abstasks/AbsTaskRetrieval.py b/evaluation/MTEB/mteb/abstasks/AbsTaskRetrieval.py index d2666f5..908259a 100644 --- a/evaluation/MTEB/mteb/abstasks/AbsTaskRetrieval.py +++ b/evaluation/MTEB/mteb/abstasks/AbsTaskRetrieval.py @@ -597,7 +597,7 @@ def evaluate( model, split="test", batch_size=128, - corpus_chunk_size=None, + corpus_chunk_size=50000, target_devices=None, score_function="cos_sim", **kwargs @@ -708,7 +708,7 @@ def encode_queries(self, queries: List[str], batch_size: int, **kwargs): instruction = DEFINITIONS[self.args.prompt][self.args.task_name]['query'] if self.args.prompt: for s in queries: - new_sentences.append([instruction, s, 0]) + new_sentences.append([instruction, s]) else: new_sentences = queries @@ -717,7 +717,6 @@ def encode_queries(self, queries: List[str], batch_size: int, **kwargs): def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs): self.count += 1 - # print('count: ',self.count) if type(corpus) is dict: sentences = [ (corpus["title"][i] + ' ' + corpus["text"][i]).strip() @@ -733,28 +732,26 @@ def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int, **kwargs) new_sentences = [] instruction = DEFINITIONS[self.args.prompt][self.args.task_name]['corpus'] for s in sentences: - new_sentences.append([instruction, s, 0]) - # kwargs['show_progress_bar'] = False - return self.model.encode(sentences, batch_size=128, **kwargs) + new_sentences.append([instruction, s]) + return self.model.encode(new_sentences, batch_size=128, **kwargs) def encode_corpus_parallel( self, corpus: List[Dict[str, str]], pool: Dict[str, object], batch_size: int, chunk_id: int, **kwargs ): + sentences = [] instruction = DEFINITIONS[self.args.prompt][self.args.task_name]['corpus'] if type(corpus) is dict: - sentences = [ - [instruction, (corpus["title"][i] + self.sep + corpus["text"][i]).strip()] - (corpus["title"][i] + self.sep + corpus["text"][i]).strip() - if "title" in corpus - else corpus["text"][i].strip() - for i in range(len(corpus["text"])) - ] + for i in range(len(corpus["text"])): + sentence = corpus["text"][i].strip() + if "title" in corpus: + sentence = corpus["title"][i].strip() + self.sep + sentence + sentences.append([instruction, sentence]) else: - sentences = [ - [instruction, (doc["title"] + self.sep + doc["text"]).strip()] - (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() - for doc in corpus - ] + for doc in corpus: + sentence = doc["text"].strip() + if "title" in doc: + sentence = 
doc["title"].strip() + self.sep + sentence + sentences.append([instruction, sentence]) if chunk_id is not None and chunk_id >= len(pool["processes"]): output_queue = pool["output"] diff --git a/evaluation/MTEB/setup.py b/evaluation/MTEB/setup.py index e33c523..52b462a 100644 --- a/evaluation/MTEB/setup.py +++ b/evaluation/MTEB/setup.py @@ -84,6 +84,8 @@ "torch", "tqdm", "rich", + "beir", + "evaluate==0.2.0" ], extras_require=extras, classifiers=[ diff --git a/requirements.txt b/requirements.txt index 1bc98b5..05f8986 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ sentence_transformers>=2.2.0 torch tqdm rich +tensorboard \ No newline at end of file diff --git a/train.py b/train.py index 05f9eb3..4608a05 100644 --- a/train.py +++ b/train.py @@ -13,7 +13,7 @@ import transformers from filelock import FileLock -from InstructorEmbedding import INSTRUCTOR +from InstructorEmbedding import Instructor, InstructorTransformer from transformers import ( AutoTokenizer, DataCollatorForSeq2Seq, @@ -27,6 +27,9 @@ set_seed, ) from transformers.trainer_utils import get_last_checkpoint +from transformers.trainer_callback import TrainerCallback, TrainerState, TrainerControl +from transformers.training_args import TrainingArguments + from transformers.utils import check_min_version, is_offline_mode from torch.utils.data import Dataset, SequentialSampler from torch.utils.data.distributed import DistributedSampler @@ -100,7 +103,7 @@ def compute_loss(self, model, inputs, return_outputs=False): cur_inputs = { 'input_ids': inputs[f'{k}_input_ids'], 'attention_mask': inputs[f'{k}_attention_mask'], - 'context_masks': inputs[f'{k}_context_masks'], + 'instruction_mask': inputs[f'{k}_instruction_mask'], } cur_results[k] = model(cur_inputs)['sentence_embedding'] embeddings_query = cur_results['query'] @@ -156,7 +159,6 @@ class ModelArguments: """ Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. """ - model_name_or_path: str = field( metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} ) @@ -424,13 +426,8 @@ def main(): ) # Set seed before initializing model. 
- tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) + instructor_tokenizer = InstructorTransformer(model_name_or_path=model_args.model_name_or_path, load_model=False) + tokenizer = instructor_tokenizer.tokenizer #pre-trained tokentizer set_seed(training_args.seed) with open(os.path.join(model_args.cache_dir, 'medi-data.json')) as f: @@ -443,7 +440,7 @@ def main(): real_batch_size = max(training_args.per_device_train_batch_size, training_args.per_device_train_batch_size * torch.cuda.device_count()) - # print('real_batch_size: ', real_batch_size,training_args.per_device_train_batch_size,torch.cuda.device_count()) + def get_examples_raw(old_examples_raw, total_n, real_batch_size): examples_raw = [] for idx in range(0, total_n, real_batch_size): @@ -485,13 +482,11 @@ def get_dataset(examples_raw): for i in range(total_num): cur_e = examples_raw[i] for k in ['query','pos','neg']: - for s in cur_e[k][:-1]: - assert not '!@#$%^&**!@#$%^&**' in s cur_e[k][-1] = str(cur_e[k][-1]) if not data_args.add_prompt_to_document: cur_e[k][0] = '' assert cur_e[k][0].startswith('Represent ') or cur_e[k][0]=='' - examples[k].append('!@#$%^&**!@#$%^&**'.join(cur_e[k])) + examples[k].append(cur_e[k]) if not cur_e['task_id'] in task_name_map: task_name_map[cur_e['task_id']] = task_count task_count += 1 @@ -500,36 +495,20 @@ def get_dataset(examples_raw): train_raw_datasets = DatasetDict({'train':Dataset.from_dict(get_dataset(train_examples_raw))}) - model = INSTRUCTOR(real_name_or_path, cache_folder=model_args.cache_dir) + model = Instructor(real_name_or_path, cache_folder=model_args.cache_dir) column_names = train_raw_datasets["train"].column_names def preprocess_function(examples): all_tokenized = None for key in ['query','pos','neg']: - num = len(examples[key]) - contexts = [] - concatenated_input_texts = [] - for local_idx in range(num): - splits = examples[key][local_idx].split('!@#$%^&**!@#$%^&**') - assert len(splits) == 2 - contexts.append(splits[0]) - concatenated_input_texts.append(''.join(splits)) - assert isinstance(contexts[-1], str) - assert isinstance(concatenated_input_texts[-1], str) - tokenized = tokenizer(concatenated_input_texts,padding='max_length', truncation='longest_first', return_tensors="pt", max_length=data_args.max_source_length) - context_tok = tokenizer(contexts,padding='max_length', truncation='longest_first', return_tensors="pt", max_length=data_args.max_source_length) - tokenized['context_masks'] = torch.sum(context_tok['attention_mask'], dim=1) - tokenized['context_masks'] = tokenized['context_masks'] - 1 - for my_idx in range(len(tokenized['context_masks'])): - if tokenized['context_masks'][my_idx] <= 1: - tokenized['context_masks'][my_idx] = 0 - keys = tokenized.keys() + input_features = instructor_tokenizer.tokenize(examples[key]) + keys = input_features.keys() if all_tokenized is None: - all_tokenized = tokenized.copy() + all_tokenized = input_features.copy() for k in keys: all_tokenized[k] = all_tokenized[k].tolist() for k in keys: - all_tokenized[f'{key}_{k}'] = tokenized[k].tolist() + all_tokenized[f'{key}_{k}'] = input_features[k].tolist() all_tokenized['task_id'] = examples['task_id'] return all_tokenized
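
The sketch below is illustrative only and not part of the diff: it mirrors how Instructor.prepare_input_features builds an instruction mask and how InstructorTransformer.forward multiplies it into the attention mask, so that pooling in InstructorPooling averages only the input-text tokens and ignores the instruction tokens. All tensors, sequence lengths, and the hidden size are toy values, not real tokenizer output.

import torch

# Toy batch: 2 padded sequences of length 6, hidden size 4 (values are arbitrary).
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 0],
                               [1, 1, 1, 1, 0, 0]])        # (batch, seq_len)
token_embeddings = torch.randn(2, 6, 4)                    # (batch, seq_len, hidden)

# Attention mask of the tokenized instructions alone (3 and 2 instruction tokens).
instruction_attention_mask = torch.tensor([[1, 1, 1],
                                           [1, 1, 0]])

# Drop one position for the end token, then pad up to the input width,
# mirroring prepare_input_features.
instruction_attention_mask = instruction_attention_mask[:, 1:]
expanded = torch.zeros_like(attention_mask)
expanded[:, : instruction_attention_mask.size(1)] = instruction_attention_mask

# Invert: 0 marks instruction positions, 1 marks everything else.
instruction_mask = 1 - expanded

# InstructorTransformer.forward now zeroes out instruction tokens before pooling.
pooling_mask = attention_mask * instruction_mask

# Mean pooling over the remaining tokens, as in InstructorPooling.forward.
mask = pooling_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
sentence_embedding = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
print(sentence_embedding.shape)  # torch.Size([2, 4])

At the call sites touched by this patch (for example encode_queries and encode_corpus in AbsTaskRetrieval.py, and preprocess_function in train.py), each example is now passed as a two-element list [instruction, text]; the tokenizer derives the instruction mask from the instruction alone, so the old '!@#$%^&**!@#$%^&**' separator and the context_masks bookkeeping are no longer needed.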