From 2fd55e6dc9853790e02c80c97f3e0b91017d4594 Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Wed, 25 Aug 2021 17:39:46 +0300 Subject: [PATCH 01/11] Update nn_trainer.py --- deeppavlov/core/trainers/nn_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index a749a67933..c60cbe9813 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -18,6 +18,7 @@ from itertools import islice from logging import getLogger from pathlib import Path +from tqdm import tqdm from typing import List, Tuple, Union, Optional, Iterable from deeppavlov.core.common.errors import ConfigError @@ -279,7 +280,8 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: while True: impatient = False self._send_event(event_name='before_train') - for x, y_true in iterator.gen_batches(self.batch_size, data_type='train'): + log.info('Started the model training') + for x, y_true in tqdm(iterator.gen_batches(self.batch_size, data_type='train')): self.last_result = self._chainer.train_on_batch(x, y_true) if self.last_result is None: self.last_result = {} From 0247ef4cf5bc4783b3c218afe56eb2323a3c293b Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Wed, 25 Aug 2021 17:48:58 +0300 Subject: [PATCH 02/11] Added tqdm to trainer, improved handling of float labels in classifier --- .../basic_classification_reader.py | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index c354d2dc11..a2cfebdd93 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -35,6 +35,7 @@ class BasicClassificationDatasetReader(DatasetReader): @overrides def read(self, data_path: str, url: str = None, format: str = "csv", class_sep: str 
= None, + float_labels: bool = False, *args, **kwargs) -> dict: """ Read dataset from data_path directory. @@ -92,22 +93,18 @@ def read(self, data_path: str, url: str = None, x = kwargs.get("x", "text") y = kwargs.get('y', 'labels') - if isinstance(x, list): - if class_sep is None: - # each sample is a tuple ("text", "label") - data[data_type] = [([row[x_] for x_ in x], str(row[y])) - for _, row in df.iterrows()] - else: - # each sample is a tuple ("text", ["label", "label", ...]) - data[data_type] = [([row[x_] for x_ in x], str(row[y]).split(class_sep)) - for _, row in df.iterrows()] - else: - if class_sep is None: - # each sample is a tuple ("text", "label") - data[data_type] = [(row[x], str(row[y])) for _, row in df.iterrows()] - else: - # each sample is a tuple ("text", ["label", "label", ...]) - data[data_type] = [(row[x], str(row[y]).split(class_sep)) for _, row in df.iterrows()] + data[data_type] = [] + for _, row in df.iterrows(): + if isinstance(x, list): + sample = [row[x_] for x_ in x] + else: + sample = row[x] + label = str(row[y]) + if class_sep: + label = str(row[y]).split(class_sep) + if float_labels: + label = [float(k) for k in labels] + data[data_type].append((sample, label)) else: log.warning("Cannot find {} file".format(file)) From 725b5d7142e4c2c3e61f6178793b850cbc8c4362 Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Wed, 25 Aug 2021 17:56:19 +0300 Subject: [PATCH 03/11] Fixed bug with quoting, added representing classes as float, added tqdm in trainer --- deeppavlov/dataset_readers/basic_classification_reader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index a2cfebdd93..d3ba1058a2 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -49,6 +49,8 @@ def read(self, data_path: str, url: str = None, format: extension of 
files. Set of Values: ``"csv", "json"`` class_sep: string separator of labels in column with labels sep (str): delimeter for ``"csv"`` files. Default: None -> only one class per sample + float_labels (boolean): if True and class_sep is not None, we treat all classes as float + quotechar (str): what char we consider as quote in the dataset header (int): row number to use as the column names names (array): list of column names to use orient (str): indication of expected JSON string format @@ -81,7 +83,7 @@ def read(self, data_path: str, url: str = None, file = Path(data_path).joinpath(file_name) if file.exists(): if format == 'csv': - keys = ('sep', 'header', 'names') + keys = ('sep', 'header', 'names', 'quotechar') options = {k: kwargs[k] for k in keys if k in kwargs} df = pd.read_csv(file, **options) elif format == 'json': From 7f0ca43067208b81f68a0a439893bf9d08f939fb Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Wed, 25 Aug 2021 18:57:58 +0300 Subject: [PATCH 04/11] Two classification reader fixes and tqdm in trainer --- deeppavlov/dataset_readers/basic_classification_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index d3ba1058a2..1342783ee7 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -105,7 +105,7 @@ def read(self, data_path: str, url: str = None, if class_sep: label = str(row[y]).split(class_sep) if float_labels: - label = [float(k) for k in labels] + label = [float(k) for k in label] data[data_type].append((sample, label)) else: log.warning("Cannot find {} file".format(file)) From cfad40b2796dd9db97a541ac03c640ce05162d7a Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Thu, 26 Aug 2021 13:42:55 +0300 Subject: [PATCH 05/11] One more bug fix with NAN handling --- deeppavlov/dataset_readers/basic_classification_reader.py 
| 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index 1342783ee7..16b5c01ed7 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -17,6 +17,7 @@ from pathlib import Path import pandas as pd +import numpy as np from overrides import overrides from deeppavlov.core.common.registry import register @@ -96,6 +97,7 @@ def read(self, data_path: str, url: str = None, x = kwargs.get("x", "text") y = kwargs.get('y', 'labels') data[data_type] = [] + i = 0 for _, row in df.iterrows(): if isinstance(x, list): sample = [row[x_] for x_ in x] @@ -106,7 +108,11 @@ def read(self, data_path: str, url: str = None, label = str(row[y]).split(class_sep) if float_labels: label = [float(k) for k in label] - data[data_type].append((sample, label)) + if not np.isnan(sample) and not np.isnan(label): + data[data_type].append((sample, label)) + else: + log.warning(f'Skipping NAN received in file {file} at {i} row') + i += 1 else: log.warning("Cannot find {} file".format(file)) From 822dcbb05877de9e51070780b24be2adab5824a0 Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Fri, 27 Aug 2021 23:40:51 +0300 Subject: [PATCH 06/11] Deeppavlov PR with fixes --- deeppavlov/dataset_readers/basic_classification_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index 16b5c01ed7..26dbfb909f 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -108,7 +108,7 @@ def read(self, data_path: str, url: str = None, label = str(row[y]).split(class_sep) if float_labels: label = [float(k) for k in label] - if not np.isnan(sample) and not np.isnan(label): + if sample == 
sample and label == label: data[data_type].append((sample, label)) else: log.warning(f'Skipping NAN received in file {file} at {i} row') From fc48b7a68af6fce3ab09434d0cfb3d74c93aded0 Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Fri, 27 Aug 2021 23:41:23 +0300 Subject: [PATCH 07/11] DeepPavlov PR with fixes --- deeppavlov/dataset_readers/basic_classification_reader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index 26dbfb909f..70f528a767 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -17,7 +17,6 @@ from pathlib import Path import pandas as pd -import numpy as np from overrides import overrides from deeppavlov.core.common.registry import register @@ -108,7 +107,7 @@ def read(self, data_path: str, url: str = None, label = str(row[y]).split(class_sep) if float_labels: label = [float(k) for k in label] - if sample == sample and label == label: + if sample == sample and label == label: # not NAN data[data_type].append((sample, label)) else: log.warning(f'Skipping NAN received in file {file} at {i} row') From 22f9a574aafc4b25310e924e22a76ca6f8dd6d3d Mon Sep 17 00:00:00 2001 From: Fedor Ignatov Date: Wed, 1 Sep 2021 13:29:34 +0300 Subject: [PATCH 08/11] refactor: small changes in nn_trainer.py --- deeppavlov/core/trainers/nn_trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index c60cbe9813..49dcc07fb9 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -18,9 +18,10 @@ from itertools import islice from logging import getLogger from pathlib import Path -from tqdm import tqdm from typing import List, Tuple, Union, Optional, Iterable +from tqdm import tqdm + from 
deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register from deeppavlov.core.data.data_learning_iterator import DataLearningIterator @@ -280,7 +281,7 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: while True: impatient = False self._send_event(event_name='before_train') - log.info('Started the model training') + log.info('The model training started') for x, y_true in tqdm(iterator.gen_batches(self.batch_size, data_type='train')): self.last_result = self._chainer.train_on_batch(x, y_true) if self.last_result is None: From 4e68e138912cc199f00ab1c3d0351ed908083778 Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Tue, 14 Sep 2021 15:13:12 +0300 Subject: [PATCH 09/11] Update basic_classification_reader.py --- deeppavlov/dataset_readers/basic_classification_reader.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deeppavlov/dataset_readers/basic_classification_reader.py b/deeppavlov/dataset_readers/basic_classification_reader.py index 70f528a767..10e301b611 100644 --- a/deeppavlov/dataset_readers/basic_classification_reader.py +++ b/deeppavlov/dataset_readers/basic_classification_reader.py @@ -97,6 +97,7 @@ def read(self, data_path: str, url: str = None, y = kwargs.get('y', 'labels') data[data_type] = [] i = 0 + prev_n_classes = 0 # to capture samples with different n_classes for _, row in df.iterrows(): if isinstance(x, list): sample = [row[x_] for x_ in x] @@ -105,8 +106,11 @@ def read(self, data_path: str, url: str = None, label = str(row[y]) if class_sep: label = str(row[y]).split(class_sep) + if prev_n_classes == 0: + prev_n_classes = len(label) + assert len(label) == prev_n_classes, f"Wrong class number at {i} row" if float_labels: - label = [float(k) for k in label] + label = [float(k) for k in label] if sample == sample and label == label: # not NAN data[data_type].append((sample, label)) else: From 81e2dab611473dc47df44d991004134055ca1374 Mon Sep 17 00:00:00 2001 From: 
dimakarp1996 Date: Mon, 20 Sep 2021 21:18:12 +0300 Subject: [PATCH 10/11] Update utils.py --- deeppavlov/core/data/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/core/data/utils.py b/deeppavlov/core/data/utils.py index 6d4eb88661..03f7bf3c2a 100644 --- a/deeppavlov/core/data/utils.py +++ b/deeppavlov/core/data/utils.py @@ -465,7 +465,7 @@ def flatten_str_batch(batch: Union[str, Iterable]) -> Union[list, chain]: ['a', 'b', 'c', 'd'] """ - if isinstance(batch, str): + if isinstance(batch, str) or isinstance(batch, str) or isinstance(batch, int) or isinstance(batch, float): return [batch] else: return chain(*[flatten_str_batch(sample) for sample in batch]) From 0e6c98df74bc15ac9bed7469cbc539230f2ee5bc Mon Sep 17 00:00:00 2001 From: dimakarp1996 Date: Thu, 7 Apr 2022 16:05:07 +0300 Subject: [PATCH 11/11] Update utils.py --- deeppavlov/core/data/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/core/data/utils.py b/deeppavlov/core/data/utils.py index 03f7bf3c2a..318f6e6a53 100644 --- a/deeppavlov/core/data/utils.py +++ b/deeppavlov/core/data/utils.py @@ -465,7 +465,7 @@ def flatten_str_batch(batch: Union[str, Iterable]) -> Union[list, chain]: ['a', 'b', 'c', 'd'] """ - if isinstance(batch, str) or isinstance(batch, str) or isinstance(batch, int) or isinstance(batch, float): + if isinstance(batch, str) or isinstance(batch, int) or isinstance(batch, float): return [batch] else: return chain(*[flatten_str_batch(sample) for sample in batch])