diff --git a/examples/benchmark/clue/mrc/run_c3.py b/examples/benchmark/clue/mrc/run_c3.py
index 7d4898efc6eb..2dfa130793b6 100644
--- a/examples/benchmark/clue/mrc/run_c3.py
+++ b/examples/benchmark/clue/mrc/run_c3.py
@@ -258,7 +258,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
     new_data = tokenizer(tokens_t_list,
                          text_pair=tokens_c_list,
-                         is_split_into_words=True)
+                         is_split_into_words='token')
     # Pad each new example for axis=2 of [batch_size, num_choices, seq_len],
     # because length of each choice could be different.
@@ -305,6 +305,7 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
                             remove_columns=column_names,
                             load_from_cache_file=not args.overwrite_cache,
                             desc="Running tokenizer on train dataset")
+
     batchify_fn = lambda samples, fn=Dict({
         'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
diff --git a/examples/information_extraction/DuEE/sequence_labeling.py b/examples/information_extraction/DuEE/sequence_labeling.py
index 77e8c09d866e..fb8a80b47de1 100644
--- a/examples/information_extraction/DuEE/sequence_labeling.py
+++ b/examples/information_extraction/DuEE/sequence_labeling.py
@@ -98,7 +98,7 @@ def convert_example_to_feature(example,
     tokens, labels = example
     tokenized_input = tokenizer(tokens,
                                 return_length=True,
-                                is_split_into_words=True,
+                                is_split_into_words='token',
                                 max_seq_len=max_seq_len)
     input_ids = tokenized_input['input_ids']
diff --git a/examples/information_extraction/msra_ner/eval.py b/examples/information_extraction/msra_ner/eval.py
index fced4be407cc..c4beee30a15f 100644
--- a/examples/information_extraction/msra_ner/eval.py
+++ b/examples/information_extraction/msra_ner/eval.py
@@ -56,7 +56,7 @@ def tokenize_and_align_labels(examples):
         examples['tokens'],
         max_seq_len=args.max_seq_length,
         # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        is_split_into_words='token',
         return_length=True)
     labels = []
diff --git a/examples/information_extraction/msra_ner/predict.py b/examples/information_extraction/msra_ner/predict.py
index 176ad95f8096..83e7b437386e 100644
--- a/examples/information_extraction/msra_ner/predict.py
+++ b/examples/information_extraction/msra_ner/predict.py
@@ -86,7 +86,7 @@ def tokenize_and_align_labels(examples):
         examples['tokens'],
         max_seq_len=args.max_seq_length,
         # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        is_split_into_words='token',
         return_length=True)
     labels = []
diff --git a/examples/information_extraction/msra_ner/train.py b/examples/information_extraction/msra_ner/train.py
index cf200df6a7f5..a0eaf075d8b2 100644
--- a/examples/information_extraction/msra_ner/train.py
+++ b/examples/information_extraction/msra_ner/train.py
@@ -105,7 +105,7 @@ def tokenize_and_align_labels(examples):
         examples['tokens'],
         max_seq_len=args.max_seq_length,
         # We use this argument because the texts in our dataset are lists of words (with a label for each word).
-        is_split_into_words=True,
+        is_split_into_words='token',
         return_length=True)
     labels = []
diff --git a/examples/information_extraction/waybill_ie/deploy/python/predict.py b/examples/information_extraction/waybill_ie/deploy/python/predict.py
index 754c2ebc042d..65cb52c19ba1 100644
--- a/examples/information_extraction/waybill_ie/deploy/python/predict.py
+++ b/examples/information_extraction/waybill_ie/deploy/python/predict.py
@@ -116,7 +116,7 @@ def convert_to_features(example, tokenizer):
     tokens = example[0]
     tokenized_input = tokenizer(tokens,
                                 return_length=True,
-                                is_split_into_words=True)
+                                is_split_into_words='token')
     # Token '[CLS]' and '[SEP]' will get label 'O'
     return tokenized_input['input_ids'], tokenized_input[
         'token_type_ids'], tokenized_input['seq_len']
diff --git a/examples/information_extraction/waybill_ie/run_ernie.py b/examples/information_extraction/waybill_ie/run_ernie.py
index 83ec9dedbfa6..8dcba198d96a 100644
--- a/examples/information_extraction/waybill_ie/run_ernie.py
+++ b/examples/information_extraction/waybill_ie/run_ernie.py
@@ -40,7 +40,7 @@ def convert_to_features(example, tokenizer, label_vocab):
     tokens, labels = example
     tokenized_input = tokenizer(tokens,
                                 return_length=True,
-                                is_split_into_words=True)
+                                is_split_into_words='token')
     # Token '[CLS]' and '[SEP]' will get label 'O'
     labels = ['O'] + labels + ['O']
     tokenized_input['labels'] = [label_vocab[x] for x in labels]
diff --git a/examples/information_extraction/waybill_ie/run_ernie_crf.py b/examples/information_extraction/waybill_ie/run_ernie_crf.py
index 66853ad326e0..a4f54ae4a826 100644
--- a/examples/information_extraction/waybill_ie/run_ernie_crf.py
+++ b/examples/information_extraction/waybill_ie/run_ernie_crf.py
@@ -41,7 +41,7 @@ def convert_to_features(example, tokenizer, label_vocab):
     tokens, labels = example
     tokenized_input = tokenizer(tokens,
                                 return_length=True,
-                                is_split_into_words=True)
+                                is_split_into_words='token')
     # Token '[CLS]' and '[SEP]' will get label 'O'
     labels = ['O'] + labels + ['O']
     tokenized_input['labels'] = [label_vocab[x] for x in labels]
diff --git a/examples/sentiment_analysis/skep/predict_opinion.py b/examples/sentiment_analysis/skep/predict_opinion.py
index 8356ced33961..37d9d857638d 100644
--- a/examples/sentiment_analysis/skep/predict_opinion.py
+++ b/examples/sentiment_analysis/skep/predict_opinion.py
@@ -67,7 +67,7 @@ def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
     tokens = example["tokens"]
    encoded_inputs = tokenizer(tokens,
                                return_length=True,
-                               is_split_into_words=True,
+                               is_split_into_words='token',
                                max_seq_len=max_seq_length)
     input_ids = np.array(encoded_inputs["input_ids"], dtype="int64")
     token_type_ids = np.array(encoded_inputs["token_type_ids"], dtype="int64")
diff --git a/examples/text_to_knowledge/ernie-ctm/data.py b/examples/text_to_knowledge/ernie-ctm/data.py
index 3e0ee43e07b1..387cdf41831b 100644
--- a/examples/text_to_knowledge/ernie-ctm/data.py
+++ b/examples/text_to_knowledge/ernie-ctm/data.py
@@ -37,7 +37,7 @@ def convert_example(example,
     tokens = example["tokens"]
     tokenized_input = tokenizer(tokens,
                                 return_length=True,
-                                is_split_into_words=True,
+                                is_split_into_words='token',
                                 max_seq_len=max_seq_len)
     if is_test:
diff --git a/examples/text_to_knowledge/nptag/data.py b/examples/text_to_knowledge/nptag/data.py
index 83ebbb132f93..7f95d04c6b41 100644
--- a/examples/text_to_knowledge/nptag/data.py
+++ b/examples/text_to_knowledge/nptag/data.py
@@ -57,7 +57,7 @@ def convert_example(example,
     tokens = list(example["text"]) + ["是"] + ["[MASK]"] * max_cls_len
+ ["[MASK]"] * max_cls_len inputs = tokenzier(tokens, return_length=True, - is_split_into_words=True, + is_split_into_words='token', max_length=max_seq_len) label_indices = list( diff --git a/model_zoo/gpt/run_msra_ner.py b/model_zoo/gpt/run_msra_ner.py index 4289be90f459..d51ac8ac9560 100644 --- a/model_zoo/gpt/run_msra_ner.py +++ b/model_zoo/gpt/run_msra_ner.py @@ -76,7 +76,7 @@ def tokenize_and_align_labels(example, example = example['tokens'] tokenized_input = tokenizer(example, return_length=True, - is_split_into_words=True, + is_split_into_words='token', max_seq_len=max_seq_len, return_token_type_ids=False) diff --git a/paddlenlp/transformers/tokenizer_utils.py b/paddlenlp/transformers/tokenizer_utils.py index 94c58af0ba83..66addbccbf77 100644 --- a/paddlenlp/transformers/tokenizer_utils.py +++ b/paddlenlp/transformers/tokenizer_utils.py @@ -986,7 +986,7 @@ def get_input_ids(text): elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): - if is_split_into_words: + if is_split_into_words == True: tokens = list( itertools.chain(*( self.tokenize(t, is_split_into_words=True, **kwargs) @@ -1071,7 +1071,7 @@ def get_input_ids(text): elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance( text[0], str): - if is_split_into_words: + if is_split_into_words == True: tokens = list( itertools.chain(*( self.tokenize(t, is_split_into_words=True, **kwargs) diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index ed99b3b67533..2a22d3996733 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -2013,7 +2013,7 @@ def __call__(self, List[List[str]]]] = None, max_length: Optional[int] = None, stride: int = 0, - is_split_into_words: bool = False, + is_split_into_words: Union[bool, str] = False, padding: Union[bool, str, PaddingStrategy] = False, truncation: Union[bool, str, TruncationStrategy] = False, return_position_ids: bool = False, @@ -2061,6 +2061,10 @@ def __call__(self, a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' and 'offset_mapping' preserving the original example and position information will be added to the returned dictionary. Defaults to 0. + is_split_into_words (Union[bool, str], optional): + when the text is words or tokens, `is_split_into_words` should be True or `token`. + `True`: means that the text should be words which should be tokenized. + `token`: means that the text should be tokens which already be tokenized, so it should not be tokenized again. padding (bool, str or [PaddingStrategy], optional): Activates and controls padding. Accepts the following values: @@ -2201,6 +2205,13 @@ def _is_valid_text_input(t): "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " "or `List[List[str]]` (batch of pretokenized examples).") + # check `split_into_words` value + if isinstance(is_split_into_words, + str) and is_split_into_words != 'token': + raise ValueError( + "the value of `is_split_into_words` should be one of: {True, False, 'token'} but receive: <%s>", + is_split_into_words) + if is_split_into_words: is_batched = isinstance(text, (list, tuple)) and text and isinstance(