diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 001be6a030..41518a582a 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -21,6 +21,7 @@ def __init__(self, model_name: Optional[str] = None, encoding_name: Optional[str] = None, add_bos_token: bool = False, + add_eos_token: bool = False, unk_token: Optional[str] = '<|endoftext|>', eos_token: Optional[str] = '<|endoftext|>', bos_token: Optional[str] = '<|endoftext|>', @@ -36,6 +37,7 @@ def __init__(self, encoding_name (Optional[str], optional): The name of the encoding to load from tiktoken. Defaults to None. Either model_name or encoding_name must be set, but not both. add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False. + add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False. unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'. eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'. bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'. 
@@ -66,10 +68,12 @@ def __init__(self, 'You need to specify either model_name or encoding_name.') self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token super().__init__(model_name=model_name, encoding_name=encoding_name, add_bos_token=add_bos_token, + add_eos_token=add_eos_token, unk_token=unk_token, eos_token=eos_token, bos_token=bos_token, @@ -179,17 +183,15 @@ def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]: - if self.add_bos_token: - bos_token_ids = [self.bos_token_id] - else: - bos_token_ids = [] + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - output = bos_token_ids + token_ids_0 + output = bos_token_id + token_ids_0 + eos_token_id - if token_ids_1 is None: - return output + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id - return output + bos_token_ids + token_ids_1 + return output def get_special_tokens_mask( self, @@ -221,15 +223,13 @@ def get_special_tokens_mask( token_ids_1=token_ids_1, already_has_special_tokens=True) - if not self.add_bos_token: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, - token_ids_1=token_ids_1, - already_has_special_tokens=False) + bos_token_id = [1] if self.add_bos_token else [] + eos_token_id = [1] if self.add_eos_token else [] if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) def create_token_type_ids_from_sequences( self, diff --git a/tests/test_tiktoken.py b/tests/test_tiktoken.py index a255a5ffa7..85ff18100b 100644 --- a/tests/test_tiktoken.py +++ b/tests/test_tiktoken.py @@ -45,14 +45,19 @@ def 
get_tokenizers_for_testing( - model_name: Optional[str], encoding_name: Optional[str], - tmp_path: pathlib.Path + model_name: Optional[str], + encoding_name: Optional[str], + tmp_path: pathlib.Path, + add_bos_token: bool = False, + add_eos_token: bool = False ) -> Tuple[TiktokenTokenizerWrapper, TiktokenTokenizerWrapper, 'Encoding']: tiktoken = pytest.importorskip('tiktoken') # Construction wrapped_tokenizer = TiktokenTokenizerWrapper(model_name=model_name, - encoding_name=encoding_name) + encoding_name=encoding_name, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token) if model_name is not None: original_tokenizer = tiktoken.encoding_for_model(model_name) else: @@ -201,3 +206,29 @@ def test_tiktoken_save_from_pretrained(model_name: Optional[str], model_name, encoding_name, tmp_path) check_hf_tokenizer_equivalence(wrapped_tokenizer, reloaded_wrapped_tokenizer) + + +@pytest.mark.parametrize('model_name,encoding_name', + MODEL_ENCODING_NAME_PARAMETRIZATION) +def test_tiktoken_encode_plus(model_name: Optional[str], + encoding_name: Optional[str], + tmp_path: pathlib.Path): + # Testing encode_plus, which optionally wraps encoded inputs with bos and eos tokens + wrapped_tokenizer, _, _ = get_tokenizers_for_testing(model_name, + encoding_name, + tmp_path, + add_bos_token=True, + add_eos_token=True) + + for test_string in TEST_STRINGS: + encoded_outputs = wrapped_tokenizer.encode_plus( + test_string, + add_special_tokens=True, + return_special_tokens_mask=True) + encoded_input_ids = encoded_outputs.input_ids + assert encoded_input_ids[0] == wrapped_tokenizer.bos_token_id + assert encoded_input_ids[-1] == wrapped_tokenizer.eos_token_id + + encoded_special_mask = encoded_outputs.special_tokens_mask + assert encoded_special_mask[0] == 1 + assert encoded_special_mask[-1] == 1