Add special_tokens_in_strings Arg to byte_pair_tokenizer. #1546

Open · wants to merge 4 commits into base: master

Changes from all commits
21 changes: 7 additions & 14 deletions keras_nlp/models/bart/bart_tokenizer.py
@@ -44,6 +44,9 @@ class BartTokenizer(BytePairTokenizer):
it should be the file path to merge rules. The merge rule file
should have one merge rule per line. Every merge rule contains
merge entities separated by a space.
special_tokens_in_strings: bool. Whether the tokenizer should expect
special tokens in input strings; if so, they are tokenized as special
tokens and mapped to their ids. Defaults to False.

Examples:

@@ -77,6 +80,7 @@ def __init__(
self,
vocabulary=None,
merges=None,
special_tokens_in_strings=False,
**kwargs,
):
self.start_token = "<s>"
@@ -86,27 +90,19 @@ def __init__(
super().__init__(
vocabulary=vocabulary,
merges=merges,
unsplittable_tokens=[
special_tokens=[
self.start_token,
self.pad_token,
self.end_token,
],
special_tokens_in_strings=special_tokens_in_strings,
**kwargs,
)

def set_vocabulary_and_merges(self, vocabulary, merges):
super().set_vocabulary_and_merges(vocabulary, merges)

if vocabulary is not None:
# Check for necessary special tokens.
for token in [self.start_token, self.pad_token, self.end_token]:
if token not in self.vocabulary:
raise ValueError(
f"Cannot find token `'{token}'` in the provided "
f"`vocabulary`. Please provide `'{token}'` in your "
"`vocabulary` or use a pretrained `vocabulary` name."
)

self.start_token_id = self.token_to_id(self.start_token)
self.pad_token_id = self.token_to_id(self.pad_token)
self.end_token_id = self.token_to_id(self.end_token)
@@ -117,8 +113,5 @@ def set_vocabulary_and_merges(self, vocabulary, merges):

def get_config(self):
config = super().get_config()
# In the constructor, we pass the list of special tokens to the
# `unsplittable_tokens` arg of the superclass' constructor. Hence, we
# delete it from the config here.
del config["unsplittable_tokens"]
del config["special_tokens"] # Not configurable; set in __init__.
return config
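A minimal usage sketch of the new flag for BART (the preset name "bart_base_en" and the assumption that `from_preset` forwards extra kwargs to the constructor are illustrative, not confirmed by this diff): with `special_tokens_in_strings=True`, literal "<s>", "</s>" and "<pad>" markers in the input text are kept whole and mapped to their ids instead of being split by the BPE merge rules.

import keras_nlp

# Sketch only: preset name and kwarg forwarding are assumptions.
tokenizer = keras_nlp.models.BartTokenizer.from_preset(
    "bart_base_en",
    special_tokens_in_strings=True,
)

ids = tokenizer("<s> a quick fox</s><pad>")
# The first id should equal tokenizer.start_token_id, and the trailing ids
# should equal tokenizer.end_token_id and tokenizer.pad_token_id.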
11 changes: 7 additions & 4 deletions keras_nlp/models/bart/bart_tokenizer_test.py
@@ -26,7 +26,11 @@ def setUp(self):
self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
self.merges += ["Ġai r", "Ġa i", "pla ne"]
self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
self.init_kwargs = {
"vocabulary": self.vocab,
"merges": self.merges,
"special_tokens_in_strings": True,
}
self.input_data = [
"<s> airplane at airport</s><pad>",
" airplane airport",
@@ -37,10 +41,9 @@ def test_tokenizer_basics(self):
cls=BartTokenizer,
init_kwargs=self.init_kwargs,
input_data=self.input_data,
# TODO: </s> should not get tokenized as <s>
expected_output=[[0, 4, 5, 6, 4, 7, 0, 1], [4, 5, 4, 7]],
expected_output=[[0, 4, 5, 6, 4, 7, 2, 1], [4, 5, 4, 7]],
expected_detokenize_output=[
"<s> airplane at airport<s><pad>",
"<s> airplane at airport</s><pad>",
" airplane airport",
],
)
21 changes: 7 additions & 14 deletions keras_nlp/models/bloom/bloom_tokenizer.py
@@ -42,6 +42,9 @@ class BloomTokenizer(BytePairTokenizer):
it should be the file path to merge rules. The merge rule file
should have one merge rule per line. Every merge rule contains
merge entities separated by a space.
special_tokens_in_strings: bool. Whether the tokenizer should expect
special tokens in input strings; if so, they are tokenized as special
tokens and mapped to their ids. Defaults to False.

Examples:

@@ -69,6 +72,7 @@ def __init__(
self,
vocabulary=None,
merges=None,
special_tokens_in_strings=False,
**kwargs,
):
self.start_token = "<s>"
@@ -78,27 +82,19 @@ def __init__(
super().__init__(
vocabulary=vocabulary,
merges=merges,
unsplittable_tokens=[
special_tokens=[
self.start_token,
self.end_token,
self.pad_token,
],
special_tokens_in_strings=special_tokens_in_strings,
**kwargs,
)

def set_vocabulary_and_merges(self, vocabulary, merges):
super().set_vocabulary_and_merges(vocabulary, merges)

if vocabulary is not None:
# Check for necessary special tokens.
for token in [self.start_token, self.end_token, self.pad_token]:
if token not in self.get_vocabulary():
raise ValueError(
f"Cannot find token `'{token}'` in the provided "
f"`vocabulary`. Please provide `'{token}'` in "
"your `vocabulary` or use a pretrained `vocabulary` name."
)

self.start_token_id = self.token_to_id(self.start_token)
self.end_token_id = self.token_to_id(self.end_token)
self.pad_token_id = self.token_to_id(self.pad_token)
@@ -109,8 +105,5 @@ def set_vocabulary_and_merges(self, vocabulary, merges):

def get_config(self):
config = super().get_config()
# In the constructor, we pass the list of special tokens to the
# `unsplittable_tokens` arg of the superclass' constructor. Hence, we
# delete it from the config here.
del config["unsplittable_tokens"]
del config["special_tokens"] # Not configurable; set in __init__.
return config
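For contrast, a sketch of the default behavior for Bloom (the preset name "bloom_560m_multi" and kwarg forwarding through `from_preset` are assumptions): with `special_tokens_in_strings` left at False, a literal "</s>" in the input is split by the BPE merge rules like ordinary text, so its pieces do not map to tokenizer.end_token_id; opting in restores the one-to-one mapping.

import keras_nlp

# Sketch only: the preset name and kwarg forwarding are assumptions.
tokenizer = keras_nlp.models.BloomTokenizer.from_preset("bloom_560m_multi")
default_ids = tokenizer("a quick fox</s>")  # "</s>" is split into sub-tokens

tokenizer = keras_nlp.models.BloomTokenizer.from_preset(
    "bloom_560m_multi",
    special_tokens_in_strings=True,
)
special_ids = tokenizer("a quick fox</s>")  # last id == tokenizer.end_token_id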
12 changes: 8 additions & 4 deletions keras_nlp/models/bloom/bloom_tokenizer_test.py
@@ -26,18 +26,22 @@ def setUp(self):
self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
self.merges += ["Ġai r", "Ġa i", "pla ne"]
self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
self.init_kwargs = {
"vocabulary": self.vocab,
"merges": self.merges,
"special_tokens_in_strings": True,
}
self.input_data = [
"<s>airplane at airport<pad>",
"<s> airplane airport<pad>",
"<s>airplane at airport</s><pad>",
"<s> airplane airport</s><pad>",
]

def test_tokenizer_basics(self):
self.run_preprocessing_layer_test(
cls=BloomTokenizer,
init_kwargs=self.init_kwargs,
input_data=self.input_data,
expected_output=[[6, 1, 3, 4, 2, 5, 8], [6, 2, 3, 2, 5, 8]],
expected_output=[[6, 1, 3, 4, 2, 5, 7, 8], [6, 2, 3, 2, 5, 7, 8]],
)

def test_errors_missing_special_tokens(self):
20 changes: 7 additions & 13 deletions keras_nlp/models/falcon/falcon_tokenizer.py
@@ -42,6 +42,9 @@ class FalconTokenizer(BytePairTokenizer):
it should be the file path to merge rules. The merge rule file
should have one merge rule per line. Every merge rule contains
merge entities separated by a space.
special_tokens_in_strings: bool. Whether the tokenizer should expect
special tokens in input strings; if so, they are tokenized as special
tokens and mapped to their ids. Defaults to False.

Examples:

@@ -69,6 +72,7 @@ def __init__(
self,
vocabulary=None,
merges=None,
special_tokens_in_strings=False,
**kwargs,
):
# Falcon uses the same start as end token, i.e., "<|endoftext|>".
@@ -77,22 +81,15 @@ def __init__(
super().__init__(
vocabulary=vocabulary,
merges=merges,
unsplittable_tokens=[self.end_token],
special_tokens=[self.end_token],
special_tokens_in_strings=special_tokens_in_strings,
**kwargs,
)

def set_vocabulary_and_merges(self, vocabulary, merges):
super().set_vocabulary_and_merges(vocabulary, merges)

if vocabulary is not None:
# Check for necessary special tokens.
if self.end_token not in self.get_vocabulary():
raise ValueError(
f"Cannot find token `'{self.end_token}'` in the provided "
f"`vocabulary`. Please provide `'{self.end_token}'` in "
"your `vocabulary` or use a pretrained `vocabulary` name."
)

self.end_token_id = self.token_to_id(self.end_token)
self.start_token_id = self.end_token_id
self.pad_token_id = 0
@@ -103,8 +100,5 @@ def set_vocabulary_and_merges(self, vocabulary, merges):

def get_config(self):
config = super().get_config()
# In the constructor, we pass the list of special tokens to the
# `unsplittable_tokens` arg of the superclass' constructor. Hence, we
# delete it from the config here.
del config["unsplittable_tokens"]
del config["special_tokens"] # Not configurable; set in __init__.
return config
6 changes: 5 additions & 1 deletion keras_nlp/models/falcon/falcon_tokenizer_test.py
@@ -25,7 +25,11 @@ def setUp(self):
self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
self.merges += ["Ġai r", "Ġa i", "pla ne"]
self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
self.init_kwargs = {
"vocabulary": self.vocab,
"merges": self.merges,
"special_tokens_in_strings": True,
}
self.input_data = [
" airplane at airport<|endoftext|>",
" airplane airport",
20 changes: 7 additions & 13 deletions keras_nlp/models/gpt2/gpt2_tokenizer.py
@@ -42,6 +42,9 @@ class GPT2Tokenizer(BytePairTokenizer):
it should be the file path to merge rules. The merge rule file
should have one merge rule per line. Every merge rule contains
merge entities separated by a space.
special_tokens_in_strings: bool. Whether the tokenizer should expect
special tokens in input strings; if so, they are tokenized as special
tokens and mapped to their ids. Defaults to False.

Examples:

@@ -69,6 +72,7 @@ def __init__(
self,
vocabulary=None,
merges=None,
special_tokens_in_strings=False,
**kwargs,
):
# GPT2 uses the same start as end token, i.e., "<|endoftext|>".
@@ -77,22 +81,15 @@ def __init__(
super().__init__(
vocabulary=vocabulary,
merges=merges,
unsplittable_tokens=[self.end_token],
special_tokens=[self.end_token],
special_tokens_in_strings=special_tokens_in_strings,
**kwargs,
)

def set_vocabulary_and_merges(self, vocabulary, merges):
super().set_vocabulary_and_merges(vocabulary, merges)

if vocabulary is not None:
# Check for necessary special tokens.
if self.end_token not in self.get_vocabulary():
raise ValueError(
f"Cannot find token `'{self.end_token}'` in the provided "
f"`vocabulary`. Please provide `'{self.end_token}'` in "
"your `vocabulary` or use a pretrained `vocabulary` name."
)

self.end_token_id = self.token_to_id(self.end_token)
self.start_token_id = self.end_token_id
self.pad_token_id = 0
@@ -103,8 +100,5 @@ def set_vocabulary_and_merges(self, vocabulary, merges):

def get_config(self):
config = super().get_config()
# In the constructor, we pass the list of special tokens to the
# `unsplittable_tokens` arg of the superclass' constructor. Hence, we
# delete it from the config here.
del config["unsplittable_tokens"]
del config["special_tokens"] # Not configurable; set in __init__.
return config
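The same pattern applies to the tokenizers with a single special token; a sketch for GPT-2 (the preset name "gpt2_base_en" and kwarg forwarding through `from_preset` are assumptions), where "<|endoftext|>" is now kept as one token instead of being broken apart by the byte-pair merges.

import keras_nlp

# Sketch only: preset name and kwarg forwarding are assumptions.
tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset(
    "gpt2_base_en",
    special_tokens_in_strings=True,
)

ids = tokenizer(" airplane at airport<|endoftext|>")
# The final id should equal tokenizer.end_token_id, which GPT-2 also reuses
# as tokenizer.start_token_id.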
6 changes: 5 additions & 1 deletion keras_nlp/models/gpt2/gpt2_tokenizer_test.py
@@ -26,7 +26,11 @@ def setUp(self):
self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
self.merges += ["Ġai r", "Ġa i", "pla ne"]
self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
self.init_kwargs = {
"vocabulary": self.vocab,
"merges": self.merges,
"special_tokens_in_strings": True,
}
self.input_data = [
" airplane at airport<|endoftext|>",
" airplane airport",
20 changes: 7 additions & 13 deletions keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer.py
@@ -41,12 +41,16 @@ class GPTNeoXTokenizer(BytePairTokenizer):
it should be the file path to merge rules. The merge rule file
should have one merge rule per line. Every merge rule contains
merge entities separated by a space.
special_tokens_in_strings: bool. Whether the tokenizer should expect
special tokens in input strings; if so, they are tokenized as special
tokens and mapped to their ids. Defaults to False.
"""

def __init__(
self,
vocabulary=None,
merges=None,
special_tokens_in_strings=False,
**kwargs,
):
# GPTNeoX uses the same start as end token, i.e., "<|endoftext|>".
@@ -55,22 +59,15 @@ def __init__(
super().__init__(
vocabulary=vocabulary,
merges=merges,
unsplittable_tokens=[self.end_token],
special_tokens=[self.end_token],
special_tokens_in_strings=special_tokens_in_strings,
**kwargs,
)

def set_vocabulary_and_merges(self, vocabulary, merges):
super().set_vocabulary_and_merges(vocabulary, merges)

if vocabulary is not None:
# Check for necessary special tokens.
if self.end_token not in self.get_vocabulary():
raise ValueError(
f"Cannot find token `'{self.end_token}'` in the provided "
f"`vocabulary`. Please provide `'{self.end_token}'` in "
"your `vocabulary` or use a pretrained `vocabulary` name."
)

self.end_token_id = self.token_to_id(self.end_token)
self.start_token_id = self.end_token_id
self.pad_token_id = 0
@@ -81,8 +78,5 @@ def set_vocabulary_and_merges(self, vocabulary, merges):

def get_config(self):
config = super().get_config()
# In the constructor, we pass the list of special tokens to the
# `unsplittable_tokens` arg of the superclass' constructor. Hence, we
# delete it from the config here.
del config["unsplittable_tokens"]
del config["special_tokens"] # Not configurable; set in __init__.
return config
6 changes: 5 additions & 1 deletion keras_nlp/models/gpt_neo_x/gpt_neo_x_tokenizer_test.py
@@ -24,7 +24,11 @@ def setUp(self):
self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
self.merges += ["Ġai r", "Ġa i", "pla ne"]
self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
self.init_kwargs = {
"vocabulary": self.vocab,
"merges": self.merges,
"special_tokens_in_strings": True,
}
self.input_data = [
" airplane at airport<|endoftext|>",
" airplane airport",