diff --git a/deeppavlov/configs/classifiers/emotions_xlm_roberta_base.json b/deeppavlov/configs/classifiers/emotions_xlm_roberta_base.json new file mode 100644 index 0000000000..4422ff9600 --- /dev/null +++ b/deeppavlov/configs/classifiers/emotions_xlm_roberta_base.json @@ -0,0 +1,105 @@ +{ + "dataset_reader": { + "class_name": "huggingface_dataset_reader", + "path": "cedr", + "name": "main", + "train": "train", + "test": "test" + }, + "dataset_iterator": { + "class_name": "huggingface_dataset_iterator", + "features": "text", + "label": "labels", + "seed": 42 + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 128, + "in": ["x"], + "out": ["bert_features"] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": ["y"], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": ["y"], + "out": ["y_ids"] + }, + { + "in": ["y_ids"], + "out": ["y_onehot"], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 1e-05 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["y_ids"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": ["y_pred_ids"], + "out": ["y_pred_labels"], + "ref": "classes_vocab" + } + ], + "out": ["y_pred_labels"] + }, + "train": { + "batch_size": 64, + "metrics": [ + "accuracy", + "f1_weighted", + "f1_macro" + ], + "validation_patience": 10, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "test"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "xlm-roberta-base", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/emotions_classifier/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/classifiers/emotions/emotions_xlm_roberta_base.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/deeppavlov/configs/classifiers/insults_xlm_roberta_base.json b/deeppavlov/configs/classifiers/insults_xlm_roberta_base.json new file mode 100644 index 0000000000..190b3e3f5a --- /dev/null +++ b/deeppavlov/configs/classifiers/insults_xlm_roberta_base.json @@ -0,0 +1,106 @@ +{ + "dataset_reader": { + "class_name": "basic_classification_reader", + "x": "Comment", + "y": "Class", + "data_path": "{DOWNLOADS_PATH}/insults_data" + }, + "dataset_iterator": { + "class_name": "basic_classification_iterator", + "seed": 42 + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 128, + "in": ["x"], + "out": ["bert_features"] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": ["y"], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": 
"{MODEL_PATH}/classes.dict", + "in": ["y"], + "out": ["y_ids"] + }, + { + "in": ["y_ids"], + "out": ["y_onehot"], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 1e-05 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["y_ids"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": ["y_pred_ids"], + "out": ["y_pred_labels"], + "ref": "classes_vocab" + } + ], + "out": ["y_pred_labels"] + }, + "train": { + "batch_size": 256, + "metrics": [ + "accuracy", + "f1_weighted", + "f1_macro" + ], + "validation_patience": 10, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "valid"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "xlm-roberta-base", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/insults_classifier/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/datasets/insults_data.tar.gz", + "subdir": "{DOWNLOADS_PATH}" + }, + { + "url": "http://files.deeppavlov.ai/v1/classifiers/insults/insults_xlm_roberta_base.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/deeppavlov/configs/classifiers/intents_distilbert_base_multi.json b/deeppavlov/configs/classifiers/intents_distilbert_base_multi.json new file mode 100644 index 0000000000..18089bfb23 --- /dev/null +++ b/deeppavlov/configs/classifiers/intents_distilbert_base_multi.json @@ -0,0 +1,105 @@ +{ + "dataset_reader": { + "class_name": "huggingface_dataset_reader", + "path": "AmazonScience/massive", + "name": "all", + "train": "train", + "valid": "validation", + "test": "test" + }, + "dataset_iterator": { + "class_name": "huggingface_dataset_iterator", + "features": "utt", + "label": "intent", + "seed": 42 + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 128, + "in": ["x"], + "out": ["bert_features"] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": ["y"], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": ["y"], + "out": ["y_ids"] + }, + { + "in": ["y_ids"], + "out": ["y_onehot"], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["y_ids"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": 
["y_pred_ids"], + "out": ["y_pred_labels"], + "ref": "classes_vocab" + } + ], + "out": ["y_pred_labels"] + }, + "train": { + "batch_size": 128, + "metrics": [ + "accuracy", + "f1_weighted" + ], + "validation_patience": 10, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "valid"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "distilbert-base-multilingual-cased", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/intents_classification/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/classifiers/intents/intents_classification_distilbert_multi.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/deeppavlov/configs/classifiers/intents_xlm_roberta_base.json b/deeppavlov/configs/classifiers/intents_xlm_roberta_base.json new file mode 100644 index 0000000000..f260c0e6ec --- /dev/null +++ b/deeppavlov/configs/classifiers/intents_xlm_roberta_base.json @@ -0,0 +1,106 @@ +{ + "dataset_reader": { + "class_name": "huggingface_dataset_reader", + "path": "AmazonScience/massive", + "name": "all", + "train": "train", + "valid": "validation", + "test": "test" + }, + "dataset_iterator": { + "class_name": "huggingface_dataset_iterator", + "features": "utt", + "label": "intent", + "seed": 42 + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 128, + "in": ["x"], + "out": ["bert_features"] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": ["y"], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": ["y"], + "out": ["y_ids"] + }, + { + "in": ["y_ids"], + "out": ["y_onehot"], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["y_ids"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": ["y_pred_ids"], + "out": ["y_pred_labels"], + "ref": "classes_vocab" + } + ], + "out": ["y_pred_labels"] + }, + "train": { + "batch_size": 128, + "metrics": [ + "accuracy", + "f1_weighted" + ], + "validation_patience": 10, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "valid"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "xlm-roberta-base", + "ROOT_PATH": "~/test_dir", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/intents_classification/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/tmp/classifiers/intents/intents_xlm_roberta_base.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } + } + diff --git 
a/deeppavlov/configs/classifiers/sentiments_xlm_roberta_base.json b/deeppavlov/configs/classifiers/sentiments_xlm_roberta_base.json new file mode 100644 index 0000000000..94f5dd0120 --- /dev/null +++ b/deeppavlov/configs/classifiers/sentiments_xlm_roberta_base.json @@ -0,0 +1,104 @@ +{ + "dataset_reader": { + "class_name": "huggingface_dataset_reader", + "path": "mteb/tweet_sentiment_extraction", + "train": "train", + "test": "test" + }, + "dataset_iterator": { + "class_name": "huggingface_dataset_iterator", + "features": "text", + "label": "label_text", + "seed": 42 + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 128, + "in": ["x"], + "out": ["bert_features"] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": ["y"], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": ["y"], + "out": ["y_ids"] + }, + { + "in": ["y_ids"], + "out": ["y_onehot"], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 1e-05 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["y_ids"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": ["y_pred_ids"], + "out": ["y_pred_labels"], + "ref": "classes_vocab" + } + ], + "out": ["y_pred_labels"] + }, + "train": { + "batch_size": 64, + "metrics": [ + "accuracy", + "f1_weighted", + "f1_macro" + ], + "validation_patience": 10, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "test"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "xlm-roberta-base", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/sentiments_classifier/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/classifiers/sentiments/sentiments_xlm_roberta_base.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} \ No newline at end of file diff --git a/deeppavlov/configs/classifiers/topics_xlm_roberta_base.json b/deeppavlov/configs/classifiers/topics_xlm_roberta_base.json new file mode 100644 index 0000000000..44ea6a3ac5 --- /dev/null +++ b/deeppavlov/configs/classifiers/topics_xlm_roberta_base.json @@ -0,0 +1,105 @@ +{ + "dataset_reader": { + "class_name": "basic_classification_reader", + "x": "text", + "y": "topic", + "data_path": "{DOWNLOADS_PATH}/dp_topics_downsampled_data/", + "train" : "train.csv", + "valid" : "valid.csv", + "test": "test.csv" + }, + "dataset_iterator": { + "class_name": "basic_classification_iterator", + "seed": 42 + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "torch_transformers_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 128, + "in": ["x"], + "out": ["bert_features"] + }, + { + "id": "classes_vocab", + 
"class_name": "simple_vocab", + "fit_on": ["y"], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": ["y"], + "out": ["y_ids"] + }, + { + "in": ["y_ids"], + "out": ["y_onehot"], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "torch_transformers_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "pretrained_bert": "{BASE_MODEL}", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 1e-05 + }, + "learning_rate_drop_patience": 3, + "learning_rate_drop_div": 2.0, + "in": ["bert_features"], + "in_y": ["y_ids"], + "out": ["y_pred_probas"] + }, + { + "in": ["y_pred_probas"], + "out": ["y_pred_ids"], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": ["y_pred_ids"], + "out": ["y_pred_labels"], + "ref": "classes_vocab" + } + ], + "out": ["y_pred_labels"] + }, + "train": { + "batch_size": 256, + "metrics": [ + "accuracy", + "f1_weighted", + "f1_macro" + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "valid", "test"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "xlm-roberta-base", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/topics_classification/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/classifiers/topics/topics_xlm_roberta_base.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/deeppavlov/configs/ner/ner_bert_base_address.json b/deeppavlov/configs/ner/ner_bert_base_address.json new file mode 100644 index 0000000000..31e9b3643e --- /dev/null +++ b/deeppavlov/configs/ner/ner_bert_base_address.json @@ -0,0 +1,111 @@ +{ + "dataset_reader": { + "class_name": "conll2003_reader", + "data_path": "{ROOT_PATH}/downloads/", + "provide_pos": false + }, + "dataset_iterator": { + "class_name": "data_learning_iterator" + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "sentence_delimiter", + "in": ["x"], + "out": ["x_short"] + }, + { + "class_name": "torch_transformers_ner_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 512, + "in": ["x"], + "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] + }, + { + "id": "tag_vocab", + "class_name": "simple_vocab", + "unk_token": ["O"], + "save_path": "{MODEL_PATH}/tag.dict", + "load_path": "{MODEL_PATH}/tag.dict", + "fit_on": ["y"], + "in": ["y"], + "out": ["y_ind"] + }, + { + "class_name": "torch_transformers_sequence_tagger", + "n_tags": "#tag_vocab.len", + "pretrained_bert": "{BASE_MODEL}", + "attention_probs_keep_prob": 0.5, + "use_crf": true, + "encoder_layer_ids": [-1], + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05, + "weight_decay": 1e-06, + "betas": [0.9, 0.999], + "eps": 1e-06 + }, + "clip_norm": 1.0, + "min_learning_rate": 1e-07, + "learning_rate_drop_patience": 20, + "learning_rate_drop_div": 1.5, + "load_before_drop": true, + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], + "in_y": ["y_ind"], + "out": ["y_pred_ind", 
"probas"] + }, + { + "ref": "tag_vocab", + "in": ["y_pred_ind"], + "out": ["y_pred"] + }, + { + "class_name": "sentence_concatenator", + "in": ["x_tokens", "y_pred"], + "out": ["output"] + } + ], + "out": ["x_tokens", "y_pred"] + }, + "train": { + "batch_size": 128, + "metrics": [ + { + "name": "ner_f1", + "inputs": ["y", "y_pred"] + }, + { + "name": "ner_token_f1", + "inputs": ["y", "y_pred"] + } + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "valid"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "bert-base-multilingual-cased", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/ner/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/tmp/dp_demo.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } + } \ No newline at end of file diff --git a/deeppavlov/configs/ner/ner_bert_base_mult.json b/deeppavlov/configs/ner/ner_bert_base_mult.json new file mode 100644 index 0000000000..107c5fac8e --- /dev/null +++ b/deeppavlov/configs/ner/ner_bert_base_mult.json @@ -0,0 +1,103 @@ +{ + "dataset_reader": { + "class_name": "conll2003_reader", + "data_path": "{DOWNLOADS_PATH}/conll2003/", + "dataset_name": "conll2003", + "provide_pos": false + }, + "dataset_iterator": { + "class_name": "data_learning_iterator" + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "torch_transformers_ner_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": true, + "max_seq_length": 128, + "in": ["x"], + "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] + }, + { + "id": "tag_vocab", + "class_name": "simple_vocab", + "unk_token": ["O"], + "pad_with_zeros": true, + "save_path": "{MODEL_PATH}/tag.dict", + "load_path": "{MODEL_PATH}/tag.dict", + "fit_on": ["y"], + "in": ["y"], + "out": ["y_ind"] + }, + { + "class_name": "torch_transformers_sequence_tagger", + "n_tags": "#tag_vocab.len", + "pretrained_bert": "{BASE_MODEL}", + "attention_probs_keep_prob": 0.5, + "use_crf": true, + "encoder_layer_ids": [-1], + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05, + "weight_decay": 1e-06, + "betas": [0.9, 0.999], + "eps": 1e-06 + }, + "clip_norm": 1.0, + "min_learning_rate": 1e-07, + "learning_rate_drop_patience": 20, + "learning_rate_drop_div": 1.5, + "load_before_drop": true, + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], + "in_y": ["y_ind"], + "out": ["y_pred_ind", "probas"] + }, + { + "ref": "tag_vocab", + "in": ["y_pred_ind"], + "out": ["y_pred"] + } + ], + "out": ["x_tokens", "y_pred"] + }, + "train": { + "batch_size": 256, + "metrics": [ + { + "name": "ner_f1", + "inputs": ["y", "y_pred"] + }, + { + "name": "ner_token_f1", + "inputs": ["y", "y_pred"] + } + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "valid"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "bert-base-multilingual-cased", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": 
"{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/amazon_onto_combined/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/tmp/ner/updated_ner/ner_bert_base_mult.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/deeppavlov/configs/ner/ner_mult_long_demo.json b/deeppavlov/configs/ner/ner_mult_long_demo.json new file mode 100644 index 0000000000..3d76fec0e4 --- /dev/null +++ b/deeppavlov/configs/ner/ner_mult_long_demo.json @@ -0,0 +1,113 @@ +{ + "dataset_reader": { + "class_name": "conll2003_reader", + "data_path": "{DOWNLOADS_PATH}/conll2003/", + "dataset_name": "conll2003", + "provide_pos": false + }, + "dataset_iterator": { + "class_name": "data_learning_iterator" + }, + "chainer": { + "in": ["x"], + "in_y": ["y"], + "pipe": [ + { + "class_name": "sentence_delimiter", + "in": ["x"], + "out": ["x_short"] + }, + { + "class_name": "torch_transformers_ner_preprocessor", + "vocab_file": "{BASE_MODEL}", + "do_lower_case": false, + "max_seq_length": 256, + "in": ["x_short"], + "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] + }, + { + "id": "tag_vocab", + "class_name": "simple_vocab", + "unk_token": ["O"], + "pad_with_zeros": true, + "save_path": "{MODEL_PATH}/tag.dict", + "load_path": "{MODEL_PATH}/tag.dict", + "fit_on": ["y"], + "in": ["y"], + "out": ["y_ind"] + }, + { + "class_name": "torch_transformers_sequence_tagger", + "n_tags": "#tag_vocab.len", + "pretrained_bert": "{BASE_MODEL}", + "attention_probs_keep_prob": 0.5, + "use_crf": true, + "encoder_layer_ids": [-1], + "optimizer": "AdamW", + "optimizer_parameters": { + "lr": 2e-05, + "weight_decay": 1e-06, + "betas": [0.9, 0.999], + "eps": 1e-06 + }, + "clip_norm": 1.0, + "min_learning_rate": 1e-07, + "learning_rate_drop_patience": 20, + "learning_rate_drop_div": 1.5, + "load_before_drop": true, + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], + "in_y": ["y_ind"], + "out": ["y_pred_ind", "probas"] + }, + { + "ref": "tag_vocab", + "in": ["y_pred_ind"], + "out": ["y_pred"] + }, + { + "class_name": "sentence_concatenator", + "in": ["x_tokens", "y_pred"], + "out": ["output"] + } + ], + "out": ["output"] + }, + "train": { + "batch_size": 256, + "metrics": [ + { + "name": "ner_f1", + "inputs": ["y", "y_pred"] + }, + { + "name": "ner_token_f1", + "inputs": ["y", "y_pred"] + } + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": ["train", "valid"], + "class_name": "torch_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/", + "pytest_max_batches": 2 + }, + "metadata": { + "variables": { + "BASE_MODEL": "bert-base-multilingual-cased", + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/amazon_onto_combined/{BASE_MODEL}" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/v1/ner/ner_bert_base.tar.gz", + "subdir": "{MODEL_PATH}" + } + ] + } +} diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 9995d88370..2e7c177c60 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -36,6 +36,8 @@ "line_reader": "deeppavlov.dataset_readers.line_reader:LineReader", "logit_ranker": "deeppavlov.models.doc_retrieval.logit_ranker:LogitRanker", "mask": "deeppavlov.models.preprocessors.mask:Mask", 
+ "sentence_concatenator": "deeppavlov.models.tokenizers.sentence_delimiter:SentenceConcatenator", + "sentence_delimiter": "deeppavlov.models.tokenizers.sentence_delimiter:SentenceDelimiter", "morphotagger_dataset_iterator": "deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerDatasetIterator", "morphotagger_dataset_reader": "deeppavlov.dataset_readers.morphotagging_dataset_reader:MorphotaggerDatasetReader", "multitask_reader":"deeppavlov.dataset_readers.multitask_reader:MultiTaskReader", diff --git a/deeppavlov/core/common/requirements_registry.json b/deeppavlov/core/common/requirements_registry.json index 732803debb..d1c6255599 100644 --- a/deeppavlov/core/common/requirements_registry.json +++ b/deeppavlov/core/common/requirements_registry.json @@ -204,5 +204,8 @@ ], "wikitionary_100K_vocab": [ "{DEEPPAVLOV_PATH}/requirements/lxml.txt" + ], + "sentence_delimiter": [ + "{DEEPPAVLOV_PATH}/requirements/pysbd.txt" ] } diff --git a/deeppavlov/models/tokenizers/sentence_delimiter.py b/deeppavlov/models/tokenizers/sentence_delimiter.py new file mode 100644 index 0000000000..c936e583ec --- /dev/null +++ b/deeppavlov/models/tokenizers/sentence_delimiter.py @@ -0,0 +1,24 @@ +import pysbd + +from deeppavlov.core.models.component import Component +from deeppavlov.core.common.registry import register + + +@register('sentence_delimiter') +def SentenceDelimiter(x_long): + seg = pysbd.Segmenter(clean=False) + xs = [a for a in seg.segment(x_long[0]) if len(a)>0] + return tuple(xs) + +@register('sentence_concatenator') +def SentenceConcatenator(x_long, y_long): + x_short = [] + y_short = [] + + for sent in x_long: + x_short.extend(sent) + + for sent in y_long: + y_short.extend(sent) + + return [[x_short, y_short]] \ No newline at end of file diff --git a/deeppavlov/requirements/pysbd.txt b/deeppavlov/requirements/pysbd.txt new file mode 100644 index 0000000000..c625687944 --- /dev/null +++ b/deeppavlov/requirements/pysbd.txt @@ -0,0 +1 @@ +pysbd==0.3.4 \ No newline at end of file diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index 6b5b2cc0cc..fba3353f81 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -121,6 +121,12 @@ ("classifiers/superglue/superglue_record_roberta.json", "classifiers", ('TI',)): [RECORD_ARGUMENTS_INFER_CHECK], ("classifiers/superglue/superglue_wic_bert.json", "classifiers", ('TI',)): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/topics_distilbert_base_uncased.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/topics_distilbert_base_uncased.json", "classifiers", ('TI',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/emotions_xlm_roberta_base.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/insults_xlm_roberta_base.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/intents_distilbert_base_multi.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/sentiments_xlm_roberta_base.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/topics_xlm_roberta_base.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/few_shot_roberta.json", "classifiers", ('IP',)): [ ('Dummy text', ['Dummy text Dummy text', 'Dummy class'], ('Dummy class',)) ] @@ -221,7 +227,8 @@ ("ner/ner_ontonotes_bert.json", "ner_ontonotes_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_ontonotes_bert_mult.json", "ner_ontonotes_bert_mult", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("ner/ner_rus_bert.json", "ner_rus_bert", ('IP', 
'TI')): [ONE_ARGUMENT_INFER_CHECK], - ("ner/ner_collection3_bert.json", "ner_collection3_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK] + ("ner/ner_collection3_bert.json", "ner_collection3_bert", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], + ("ner/ner_bert_base_mult.json", "ner_conll2003_bert", ('IP',)): [ONE_ARGUMENT_INFER_CHECK] }, "sentence_segmentation": { ("sentence_segmentation/sentseg_dailydialog_bert.json", "sentseg_dailydialog_bert", ('IP', 'TI')): [
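Usage note for the new classifier configs added above: once the archives listed in each config's metadata.download section are published, a pipeline can be built by config name through the standard DeepPavlov API (or interactively with python -m deeppavlov interact <config_name> -d). A minimal sketch, assuming the insults_xlm_roberta_base checkpoint from the URL above is reachable; the sample inputs are illustrative:

from deeppavlov import build_model

# Build the chainer defined in insults_xlm_roberta_base.json; download=True fetches
# the pretrained checkpoint listed under metadata.download into ~/.deeppavlov.
model = build_model('insults_xlm_roberta_base', download=True)

# The chainer takes a batch of raw strings and returns a batch of predicted
# class labels (its "out" is ["y_pred_labels"]).
print(model(['you are a complete idiot', 'have a great day']))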
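The sentence_delimiter / sentence_concatenator components registered above (deeppavlov/models/tokenizers/sentence_delimiter.py) split a long input into sentences with pysbd before tagging and then flatten the per-sentence tokens and tags back into one sequence. A standalone sketch of that behaviour using plain pysbd calls; the example text and tags are illustrative, not taken from the PR:

import pysbd

# sentence_delimiter: segment a long text into sentences; clean=False preserves
# the original character spans, as in the registered component.
segmenter = pysbd.Segmenter(language="en", clean=False)
text = "Bob lives in London. Alice moved to Berlin last year."
sentences = [s for s in segmenter.segment(text) if len(s) > 0]
print(sentences)  # one string per sentence

# sentence_concatenator: merge per-sentence token and tag lists back into single
# document-level sequences.
tokens_per_sentence = [["Bob", "lives", "in", "London", "."], ["Alice", "moved", "to", "Berlin", "last", "year", "."]]
tags_per_sentence = [["B-PER", "O", "O", "B-LOC", "O"], ["B-PER", "O", "O", "B-LOC", "O", "O", "O"]]
tokens, tags = [], []
for sent_tokens, sent_tags in zip(tokens_per_sentence, tags_per_sentence):
    tokens.extend(sent_tokens)
    tags.extend(sent_tags)
print(tokens)
print(tags)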