[Bug Fix] update tokenizer utils (#3204)
* update tokenizer utils

* update example

* remove debug code

* test=document_fix

* test=document_fix

* test=document_fix
wj-Mcat authored Sep 6, 2022
1 parent ceded4b commit 9d9b00b
Showing 14 changed files with 27 additions and 15 deletions.
3 changes: 2 additions & 1 deletion examples/benchmark/clue/mrc/run_c3.py
@@ -258,7 +258,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):

new_data = tokenizer(tokens_t_list,
text_pair=tokens_c_list,
-is_split_into_words=True)
+is_split_into_words='token')

# Pad each new example for axis=2 of [batch_size, num_choices, seq_len],
# because length of each choice could be different.
@@ -305,6 +305,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
desc="Running tokenizer on train dataset")

batchify_fn = lambda samples, fn=Dict({
'input_ids':
Pad(axis=1, pad_val=tokenizer.pad_token_id), # input
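For context, a minimal usage sketch of the new value (the token lists and the `tokenizer` instance are illustrative, not taken from this diff): with `'token'`, every list element is treated as a finished token and mapped straight to an id, so the encoded length stays aligned with the input.

    # Sketch only: `tokenizer` is any PaddleNLP PretrainedTokenizer instance.
    tokens_text = ["小", "明", "在", "哪", "里", "？"]
    tokens_pair = ["小", "明", "在", "学", "校", "。"]
    encoded = tokenizer(tokens_text,
                        text_pair=tokens_pair,
                        is_split_into_words='token')
    # encoded['input_ids'] holds one id per input token plus the special tokens.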
2 changes: 1 addition & 1 deletion examples/information_extraction/DuEE/sequence_labeling.py
@@ -98,7 +98,7 @@ def convert_example_to_feature(example,
tokens, labels = example
tokenized_input = tokenizer(tokens,
return_length=True,
-is_split_into_words=True,
+is_split_into_words='token',
max_seq_len=max_seq_len)

input_ids = tokenized_input['input_ids']
2 changes: 1 addition & 1 deletion examples/information_extraction/msra_ner/eval.py
@@ -56,7 +56,7 @@ def tokenize_and_align_labels(examples):
examples['tokens'],
max_seq_len=args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
-is_split_into_words=True,
+is_split_into_words='token',
return_length=True)
labels = []

2 changes: 1 addition & 1 deletion examples/information_extraction/msra_ner/predict.py
@@ -86,7 +86,7 @@ def tokenize_and_align_labels(examples):
examples['tokens'],
max_seq_len=args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
-is_split_into_words=True,
+is_split_into_words='token',
return_length=True)
labels = []

2 changes: 1 addition & 1 deletion examples/information_extraction/msra_ner/train.py
@@ -105,7 +105,7 @@ def tokenize_and_align_labels(examples):
examples['tokens'],
max_seq_len=args.max_seq_length,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
-is_split_into_words=True,
+is_split_into_words='token',
return_length=True)
labels = []

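The three msra_ner scripts share this pattern; a condensed sketch of the idea (helper names such as `no_entity_id` are assumptions for illustration, not part of this diff):

    def tokenize_and_align_labels(example, tokenizer, no_entity_id=0, max_seq_len=512):
        # Each element of example['tokens'] carries exactly one label, so
        # 'token' preserves the one-to-one token/label mapping instead of
        # re-splitting the entries into subwords.
        tokenized = tokenizer(example['tokens'],
                              max_seq_len=max_seq_len,
                              is_split_into_words='token',
                              return_length=True)
        # '[CLS]' and '[SEP]' receive the "no entity" label; the labels are
        # truncated to the encoded length (seq_len counts both special tokens).
        labels = example['labels'][:tokenized['seq_len'] - 2]
        tokenized['labels'] = [no_entity_id] + labels + [no_entity_id]
        return tokenized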
@@ -116,7 +116,7 @@ def convert_to_features(example, tokenizer):
tokens = example[0]
tokenized_input = tokenizer(tokens,
return_length=True,
-is_split_into_words=True)
+is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O'
return tokenized_input['input_ids'], tokenized_input[
'token_type_ids'], tokenized_input['seq_len']
2 changes: 1 addition & 1 deletion examples/information_extraction/waybill_ie/run_ernie.py
@@ -40,7 +40,7 @@ def convert_to_features(example, tokenizer, label_vocab):
tokens, labels = example
tokenized_input = tokenizer(tokens,
return_length=True,
-is_split_into_words=True)
+is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O'
labels = ['O'] + labels + ['O']
tokenized_input['labels'] = [label_vocab[x] for x in labels]
@@ -41,7 +41,7 @@ def convert_to_features(example, tokenizer, label_vocab):
tokens, labels = example
tokenized_input = tokenizer(tokens,
return_length=True,
-is_split_into_words=True)
+is_split_into_words='token')
# Token '[CLS]' and '[SEP]' will get label 'O'
labels = ['O'] + labels + ['O']
tokenized_input['labels'] = [label_vocab[x] for x in labels]
2 changes: 1 addition & 1 deletion examples/sentiment_analysis/skep/predict_opinion.py
@@ -67,7 +67,7 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
tokens = example["tokens"]
encoded_inputs = tokenizer(tokens,
return_length=True,
-is_split_into_words=True,
+is_split_into_words='token',
max_seq_len=max_seq_length)
input_ids = np.array(encoded_inputs["input_ids"], dtype="int64")
token_type_ids = np.array(encoded_inputs["token_type_ids"], dtype="int64")
2 changes: 1 addition & 1 deletion examples/text_to_knowledge/ernie-ctm/data.py
@@ -37,7 +37,7 @@ def convert_example(example,
tokens = example["tokens"]
tokenized_input = tokenizer(tokens,
return_length=True,
-is_split_into_words=True,
+is_split_into_words='token',
max_seq_len=max_seq_len)

if is_test:
2 changes: 1 addition & 1 deletion examples/text_to_knowledge/nptag/data.py
@@ -57,7 +57,7 @@ def convert_example(example,
tokens = list(example["text"]) + ["是"] + ["[MASK]"] * max_cls_len
inputs = tokenzier(tokens,
return_length=True,
-is_split_into_words=True,
+is_split_into_words='token',
max_length=max_seq_len)

label_indices = list(
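The nptag example builds its prompt from characters plus "[MASK]" placeholders; a hedged illustration of why 'token' fits here (the text and `tokenizer` instance are assumptions, not from this diff):

    # Illustration only: every element, including each "[MASK]", is looked up
    # directly in the vocabulary rather than passed through a second
    # tokenization pass, so the prompt layout is preserved exactly.
    tokens = list("苹果树") + ["是"] + ["[MASK]"] * 5
    inputs = tokenizer(tokens, is_split_into_words='token', return_length=True)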
2 changes: 1 addition & 1 deletion model_zoo/gpt/run_msra_ner.py
@@ -76,7 +76,7 @@ def tokenize_and_align_labels(example,
example = example['tokens']
tokenized_input = tokenizer(example,
return_length=True,
-is_split_into_words=True,
+is_split_into_words='token',
max_seq_len=max_seq_len,
return_token_type_ids=False)

4 changes: 2 additions & 2 deletions paddlenlp/transformers/tokenizer_utils.py
@@ -986,7 +986,7 @@ def get_input_ids(text):
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
-if is_split_into_words:
+if is_split_into_words == True:
tokens = list(
itertools.chain(*(
self.tokenize(t, is_split_into_words=True, **kwargs)
@@ -1071,7 +1071,7 @@ def get_input_ids(text):
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
-if is_split_into_words:
+if is_split_into_words == True:
tokens = list(
itertools.chain(*(
self.tokenize(t, is_split_into_words=True, **kwargs)
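The strictness of `== True` is the point of this change: `'token'` is truthy, so the previous `if is_split_into_words:` would have pushed already-tokenized input through the tokenizer a second time. A minimal sketch of the dispatch, not the library's actual code:

    def get_input_ids(text, tokenizer, is_split_into_words=False):
        # Pre-split input: a non-empty list/tuple of strings.
        if isinstance(text, (list, tuple)) and text and isinstance(text[0], str):
            if is_split_into_words == True:
                # Words: each entry may still break into several subword tokens.
                tokens = [tok for word in text for tok in tokenizer.tokenize(word)]
                return tokenizer.convert_tokens_to_ids(tokens)
            # 'token' (or a plain token list): map each entry directly to an id.
            return tokenizer.convert_tokens_to_ids(text)
        # A single string: tokenize it as usual.
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))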
13 changes: 12 additions & 1 deletion paddlenlp/transformers/tokenizer_utils_base.py
@@ -2013,7 +2013,7 @@ def __call__(self,
List[List[str]]]] = None,
max_length: Optional[int] = None,
stride: int = 0,
-is_split_into_words: bool = False,
+is_split_into_words: Union[bool, str] = False,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
return_position_ids: bool = False,
@@ -2061,6 +2061,10 @@ def __call__(self,
a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample'
and 'offset_mapping' preserving the original example and position
information will be added to the returned dictionary. Defaults to 0.
+is_split_into_words (Union[bool, str], optional):
+    Set this to `True` or `'token'` when the input text is already split into words or tokens. Defaults to `False`.
+    `True`: the input is a list of words that still need to be tokenized.
+    `'token'`: the input is a list of tokens that are already tokenized and must not be tokenized again.
padding (bool, str or [PaddingStrategy], optional):
Activates and controls padding. Accepts the following values:
@@ -2201,6 +2205,13 @@ def _is_valid_text_input(t):
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples).")

+# check the `is_split_into_words` value
+if isinstance(is_split_into_words,
+              str) and is_split_into_words != 'token':
+    raise ValueError(
+        "the value of `is_split_into_words` should be one of: "
+        "{True, False, 'token'} but received: <%s>" % is_split_into_words)

if is_split_into_words:
is_batched = isinstance(text,
(list, tuple)) and text and isinstance(
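Taken together, a hedged usage sketch of the accepted values (the ErnieTokenizer and pretrained name are illustrative):

    from paddlenlp.transformers import ErnieTokenizer

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    words = ["交", "管", "部", "门"]

    tokenizer(words, is_split_into_words=True)      # words: tokenized entry by entry
    tokenizer(words, is_split_into_words='token')   # tokens: used as-is
    # Any other string, e.g. is_split_into_words='word', now raises ValueError.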
