Tiktoken wrapper add_eos_token option (#681)
* add add_eos_token arg to tiktoken wrapper

* add add_eos_token arg to tiktoken wrapper

* yapf

* encode_plus tests

---------

Co-authored-by: Daniel King <[email protected]>
rajammanabrolu and dakinggg authored Oct 20, 2023
1 parent 459947c commit f65b07e
Showing 2 changed files with 49 additions and 18 deletions.
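As a quick orientation before the diff, here is a minimal usage sketch of the new option. This is a sketch, not code from this commit; it assumes tiktoken is installed and uses 'gpt-4' as a stand-in model name.

    from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper

    # Both flags default to False; enabling them wraps every encoded sequence.
    tokenizer = TiktokenTokenizerWrapper(model_name='gpt-4',
                                         add_bos_token=True,
                                         add_eos_token=True)
    ids = tokenizer('hello world')['input_ids']
    assert ids[0] == tokenizer.bos_token_id   # prepended bos
    assert ids[-1] == tokenizer.eos_token_id  # appended eos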
30 changes: 15 additions & 15 deletions llmfoundry/tokenizers/tiktoken.py
@@ -21,6 +21,7 @@ def __init__(self,
                  model_name: Optional[str] = None,
                  encoding_name: Optional[str] = None,
                  add_bos_token: bool = False,
+                 add_eos_token: bool = False,
                  unk_token: Optional[str] = '<|endoftext|>',
                  eos_token: Optional[str] = '<|endoftext|>',
                  bos_token: Optional[str] = '<|endoftext|>',
@@ -36,6 +37,7 @@ def __init__(self,
             encoding_name (Optional[str], optional): The name of the encoding to load from tiktoken. Defaults to None.
                 Either model_name or encoding_name must be set, but not both.
             add_bos_token (bool, optional): Whether to add bos tokens. Defaults to False.
+            add_eos_token (bool, optional): Whether to add eos tokens. Defaults to False.
             unk_token (Optional[str], optional): The unk token. Defaults to '<|endoftext|>'.
             eos_token (Optional[str], optional): The eos token. Defaults to '<|endoftext|>'.
             bos_token (Optional[str], optional): The bos token. Defaults to '<|endoftext|>'.
@@ -66,10 +68,12 @@ def __init__(self,
                 'You need to specify either model_name or encoding_name.')

         self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token

         super().__init__(model_name=model_name,
                          encoding_name=encoding_name,
                          add_bos_token=add_bos_token,
+                         add_eos_token=add_eos_token,
                          unk_token=unk_token,
                          eos_token=eos_token,
                          bos_token=bos_token,
@@ -179,17 +183,15 @@ def build_inputs_with_special_tokens(
             self,
             token_ids_0: List[int],
             token_ids_1: Optional[List[int]] = None) -> List[int]:
-        if self.add_bos_token:
-            bos_token_ids = [self.bos_token_id]
-        else:
-            bos_token_ids = []
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

-        output = bos_token_ids + token_ids_0
+        output = bos_token_id + token_ids_0 + eos_token_id

-        if token_ids_1 is None:
-            return output
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id

-        return output + bos_token_ids + token_ids_1
+        return output

     def get_special_tokens_mask(
             self,
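The rewrite above replaces the bos-only branching with ternaries so each segment can be bracketed independently. A self-contained sketch of the resulting layout, using placeholder ids rather than the wrapper's real vocabulary:

    # Placeholder special-token ids, for illustration only.
    BOS, EOS = 0, 1

    def layout(ids_0, ids_1=None, add_bos=True, add_eos=True):
        # Mirrors the new build_inputs_with_special_tokens logic.
        bos = [BOS] if add_bos else []
        eos = [EOS] if add_eos else []
        out = bos + ids_0 + eos
        if ids_1 is not None:
            out = out + bos + ids_1 + eos
        return out

    assert layout([5, 6]) == [BOS, 5, 6, EOS]
    assert layout([5, 6], [7]) == [BOS, 5, 6, EOS, BOS, 7, EOS]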
@@ -221,15 +223,13 @@ def get_special_tokens_mask(
                 token_ids_1=token_ids_1,
                 already_has_special_tokens=True)

-        if not self.add_bos_token:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0,
-                token_ids_1=token_ids_1,
-                already_has_special_tokens=False)
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []

         if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0))
-        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
+                bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)

     def create_token_type_ids_from_sequences(
             self,
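The mask change mirrors the same layout, emitting a 1 for every bos/eos position that build_inputs_with_special_tokens would add. A quick shape check (again a sketch, not the wrapper itself):

    def mask(len_0, len_1=None, add_bos=True, add_eos=True):
        # Mirrors the new get_special_tokens_mask logic: 1 marks a special token.
        bos = [1] if add_bos else []
        eos = [1] if add_eos else []
        if len_1 is None:
            return bos + [0] * len_0 + eos
        return bos + [0] * len_0 + eos + bos + [0] * len_1 + eos

    assert mask(2) == [1, 0, 0, 1]            # matches [BOS, x, x, EOS]
    assert mask(2, 1) == [1, 0, 0, 1, 1, 0, 1]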
37 changes: 34 additions & 3 deletions tests/test_tiktoken.py
@@ -45,14 +45,19 @@


 def get_tokenizers_for_testing(
-    model_name: Optional[str], encoding_name: Optional[str],
-    tmp_path: pathlib.Path
+    model_name: Optional[str],
+    encoding_name: Optional[str],
+    tmp_path: pathlib.Path,
+    add_bos_token: bool = False,
+    add_eos_token: bool = False
 ) -> Tuple[TiktokenTokenizerWrapper, TiktokenTokenizerWrapper, 'Encoding']:
     tiktoken = pytest.importorskip('tiktoken')

     # Construction
     wrapped_tokenizer = TiktokenTokenizerWrapper(model_name=model_name,
-                                                 encoding_name=encoding_name)
+                                                 encoding_name=encoding_name,
+                                                 add_bos_token=add_bos_token,
+                                                 add_eos_token=add_eos_token)
     if model_name is not None:
         original_tokenizer = tiktoken.encoding_for_model(model_name)
     else:
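Both new keywords default to False, so existing call sites keep working. A hypothetical call exercising the extended signature might look like:

    wrapped_tokenizer, original_tokenizer, _ = get_tokenizers_for_testing(
        'gpt-4',  # model_name
        None,     # encoding_name (exactly one of the two must be set)
        tmp_path,
        add_bos_token=True,
        add_eos_token=True)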
@@ -201,3 +206,29 @@ def test_tiktoken_save_from_pretrained(model_name: Optional[str],
         model_name, encoding_name, tmp_path)
     check_hf_tokenizer_equivalence(wrapped_tokenizer,
                                    reloaded_wrapped_tokenizer)
+
+
+@pytest.mark.parametrize('model_name,encoding_name',
+                         MODEL_ENCODING_NAME_PARAMETRIZATION)
+def test_tiktoken_encode_plus(model_name: Optional[str],
+                              encoding_name: Optional[str],
+                              tmp_path: pathlib.Path):
+    # Test encode_plus, which optionally wraps the encoded ids with bos and eos tokens.
+    wrapped_tokenizer, _, _ = get_tokenizers_for_testing(model_name,
+                                                         encoding_name,
+                                                         tmp_path,
+                                                         add_bos_token=True,
+                                                         add_eos_token=True)
+
+    for test_string in TEST_STRINGS:
+        encoded_outputs = wrapped_tokenizer.encode_plus(
+            test_string,
+            add_special_tokens=True,
+            return_special_tokens_mask=True)
+        encoded_input_ids = encoded_outputs.input_ids
+        assert encoded_input_ids[0] == wrapped_tokenizer.bos_token_id
+        assert encoded_input_ids[-1] == wrapped_tokenizer.eos_token_id
+
+        encoded_special_mask = encoded_outputs.special_tokens_mask
+        assert encoded_special_mask[0] == 1
+        assert encoded_special_mask[-1] == 1
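To run just the new test locally: pytest tests/test_tiktoken.py -k test_tiktoken_encode_plus. The helper's pytest.importorskip('tiktoken') call skips it automatically when tiktoken is not installed.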
