From 3f639f83304df984735ee5bd21f1c546db932021 Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Tue, 23 Nov 2021 14:16:06 +0300 Subject: [PATCH 01/18] Test: modify .json file --- .../classifiers/sentiment_twitter.json | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 0d02ec5927..a137f77ffc 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -3,7 +3,7 @@ "class_name": "basic_classification_reader", "x": "Twit", "y": "Class", - "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data" + "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data/modified_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", @@ -62,7 +62,7 @@ ], "main": true, "class_name": "keras_classification_model", - "save_path": "{MODEL_PATH}/model", + "save_path": "{MODEL_PATH}/new_model", "load_path": "{MODEL_PATH}/model", "embedding_size": "#my_embedder.dim", "n_classes": "#classes_vocab.len", @@ -100,7 +100,8 @@ ] }, "train": { - "epochs": 100, + "epochs": 5, + "log_every_n_epochs": 1, "batch_size": 64, "metrics": [ "accuracy", @@ -119,7 +120,14 @@ "valid", "test" ], - "class_name": "nn_trainer" + "class_name": "nn_trainer", + "tensorboard_log_dir": "{MODELS_PATH}/sentiment_twitter/logs", + "logger": [ + { + "name": "TensorboardLogger", + "log_dir": "{MODELS_PATH}/sentiment_twitter/logs_new" + } + ] }, "metadata": { "variables": { @@ -128,6 +136,10 @@ "MODELS_PATH": "{ROOT_PATH}/models", "MODEL_PATH": "{MODELS_PATH}/classifiers/sentiment_twitter_v6" }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/tf.txt", + "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" + ], "download": [ { "url": "http://files.deeppavlov.ai/datasets/sentiment_twitter_data.tar.gz", From a4da2b69ff915bf5d780218148bef82506e56de7 Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Wed, 24 Nov 2021 01:08:57 +0300 Subject: [PATCH 02/18] Deleting _log funtion and _validate from nn_trainer.py --- .../classifiers/sentiment_twitter.json | 2 +- deeppavlov/core/common/logging_class.py | 149 +++++++++++ deeppavlov/core/trainers/fit_trainer.py | 62 ++++- deeppavlov/core/trainers/nn_trainer.py | 243 ++++++++++-------- 4 files changed, 338 insertions(+), 118 deletions(-) create mode 100644 deeppavlov/core/common/logging_class.py diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index a137f77ffc..7add79309e 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -125,7 +125,7 @@ "logger": [ { "name": "TensorboardLogger", - "log_dir": "{MODELS_PATH}/sentiment_twitter/logs_new" + "log_dir": "{MODELS_PATH}/sentiment_twitter/Tensorboard_logs" } ] }, diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py new file mode 100644 index 0000000000..2c0b0fec6d --- /dev/null +++ b/deeppavlov/core/common/logging_class.py @@ -0,0 +1,149 @@ +from joblib import logger +import tensorflow as tf +import datetime +import time +from itertools import islice +from typing import List, Tuple, Union, Optional, Iterable +import json +from deeppavlov.core.trainers.utils import parse_metrics, NumpyArrayEncoder + + +class TrainLogger: + def __init__(self,log_dir): + pass + + def __call__(self,metrics): + pass + +class TensorboardLogger(TrainLogger): + def __init__(self, 
type, log_dir): + self.tb_writer = tf.summary.FileWriter(log_dir) + self.type = type + #self.tb_train_writer = tf.summary.FileWriter(str(log_dir / 'train_log')) + #self.tb_valid_writer = tf.summary.FileWriter(str(log_dir / 'valid_log')) + + def __call__(self, nn_trainer, iterator, tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None , log = None ): # default value for log for now = None + if self.type =='train': + print("logging Training metrics...") + nn_trainer._send_event(event_name='before_log') + if nn_trainer.log_on_k_batches == 0: + report = { + 'time_spent': str(datetime.timedelta(seconds=round(time.time() - nn_trainer.start_time + 0.5))) + } + else: + data = islice(iterator.gen_batches(nn_trainer.batch_size, data_type='train', shuffle=True), + nn_trainer.log_on_k_batches) + report = nn_trainer.test(data, nn_trainer.train_metrics, start_time=nn_trainer.start_time) + + report.update({ + 'epochs_done': nn_trainer.epoch, + 'batches_seen': nn_trainer.train_batches_seen, + 'train_examples_seen': nn_trainer.examples + }) + + metrics: List[Tuple[str, float]] = list(report.get('metrics', {}).items()) + list(nn_trainer.last_result.items()) + + report.update(nn_trainer.last_result) + if nn_trainer.losses: + report['loss'] = sum(nn_trainer.losses) / len(nn_trainer.losses) + nn_trainer.losses.clear() + metrics.append(('loss', report['loss'])) + + # if metrics and self.tensorboard_log_dir is not None: + # if metrics and nn_trainer.tensorboard_idx is not None: + # self.TensorboardLogger_train(self, metrics, tensorboard_tag, tensorboard_index) + if metrics and nn_trainer.tensorboard_idx is not None: + summary = nn_trainer._tf.Summary() + + for name, score in metrics: + summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + self.tb_writer.add_summary(summary, tensorboard_index) + self.tb_writer.flush() + # summary = self._tf.Summary() + + # for name, score in metrics: + # summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + + # self.tb_writer.add_summary(summary, tensorboard_index) + # self.tb_writer.flush() + # self.TensorboardLogger_train(summary,tensorboard_index) + # self.TensorboardLogger('train',summary,tensorboard_index) + #self.tb_train_writer.add_summary(summary, tensorboard_index) + #self.tb_train_writer.flush() + + nn_trainer._send_event(event_name='after_train_log', data=report) + report = {'train': report} + print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) + else: + print("logging Validation metrics...") + nn_trainer._send_event(event_name='before_validation') + report = nn_trainer.test(iterator.gen_batches(nn_trainer.batch_size, data_type='valid', shuffle=False), + start_time=nn_trainer.start_time) + + report['epochs_done'] = nn_trainer.epoch + report['batches_seen'] = nn_trainer.train_batches_seen + report['train_examples_seen'] = nn_trainer.examples + + metrics = list(report['metrics'].items()) + + if tensorboard_tag is not None and nn_trainer.tensorboard_idx is not None: + summary = nn_trainer._tf.Summary() + for name, score in metrics: + summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + if tensorboard_index is None: + tensorboard_index = nn_trainer.train_batches_seen + self.tb_writer.add_summary(summary, tensorboard_index) + self.tb_writer.flush() + + m_name, score = metrics[0] + + # Update the patience + if nn_trainer.score_best is None: + nn_trainer.patience = 0 + else: + if nn_trainer.improved(score, nn_trainer.score_best): + nn_trainer.patience = 0 + else: + 
nn_trainer.patience += 1 + + # Run the validation model-saving logic + if nn_trainer._is_initial_validation(): + log.info('Initial best {} of {}'.format(m_name, score)) + nn_trainer.score_best = score + elif nn_trainer._is_first_validation() and nn_trainer.score_best is None: + log.info('First best {} of {}'.format(m_name, score)) + nn_trainer.score_best = score + log.info('Saving model') + nn_trainer.save() + elif nn_trainer.improved(score, nn_trainer.score_best): + log.info('Improved best {} of {}'.format(m_name, score)) + nn_trainer.score_best = score + log.info('Saving model') + nn_trainer.save() + else: + log.info('Did not improve on the {} of {}'.format(m_name, nn_trainer.score_best)) + + report['impatience'] = nn_trainer.patience + if nn_trainer.validation_patience > 0: + report['patience_limit'] = nn_trainer.validation_patience + + nn_trainer._send_event(event_name='after_validation', data=report) + report = {'valid': report} + print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) + nn_trainer.validation_number += 1 + + + + # summary = tf.Summary() + # for name, score in metrics: + # summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + # self.tb_writer.add_summary(summary, tensorboard_index) + # self.tb_writer.flush() + + + # if train_or_valid == 'train': + # self.tb_train_writer.add_summary(summary, tensorboard_index) + # self.tb_train_writer.flush() + # else: + # self.tb_valid_writer.add_summary(summary, tensorboard_index) + # self.tb_valid_writer.flush() \ No newline at end of file diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index 0378560564..5ffd5b62eb 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -52,6 +52,7 @@ class FitTrainer: in evaluation logs (default is ``False``) tensorboard_log_dir: path to a directory where tensorboard logs can be stored, ignored if None (default is ``None``) + logger : list of dictionary of possible loggers provided in config file max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative (default is ``-1``) **kwargs: additional parameters whose names will be logged but otherwise ignored @@ -61,8 +62,9 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, metrics: Iterable[Union[str, dict]] = ('accuracy',), evaluation_targets: Iterable[str] = ('valid', 'test'), show_examples: bool = False, - tensorboard_log_dir: Optional[Union[str, Path]] = None, + # tensorboard_log_dir: Optional[Union[str, Path]] = None, max_test_batches: int = -1, + logger: list = [], **kwargs) -> None: if kwargs: log.info(f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:') @@ -75,20 +77,41 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, self.max_test_batches = None if max_test_batches < 0 else max_test_batches - self.tensorboard_log_dir: Optional[Path] = tensorboard_log_dir - if tensorboard_log_dir is not None: + # self.tensorboard_log_dir: Optional[Path] = tensorboard_log_dir + self.logger: list = logger + def get_method_idx(logger, name): + for i in range(len(logger)): + if logger[i]["name"] == name: + return i + return None + self.tensorboard_idx = get_method_idx(self.logger, "TensorboardLogger") + # self.wandb = get_method_idx(self.logger, "") + if self.tensorboard_idx is not None: try: # noinspection PyPackageRequirements # noinspection PyUnresolvedReferences import tensorflow except ImportError: 
log.warning('TensorFlow could not be imported, so tensorboard log directory' - f'`{self.tensorboard_log_dir}` will be ignored') - self.tensorboard_log_dir = None + f'`{self.logger[self.tensorboard_idx]["log_dir"]}` will be ignored') + self.logger[self.tensorboard_idx]["log_dir"] = None else: - self.tensorboard_log_dir = expand_path(tensorboard_log_dir) + self.logger[self.tensorboard_idx]["log_dir"] = expand_path(self.logger[self.tensorboard_idx]["log_dir"]) self._tf = tensorflow + # if tensorboard_log_dir is not None: + # try: + # # noinspection PyPackageRequirements + # # noinspection PyUnresolvedReferences + # import tensorflow + # except ImportError: + # log.warning('TensorFlow could not be imported, so tensorboard log directory' + # f'`{self.tensorboard_log_dir}` will be ignored') + # self.tensorboard_log_dir = None + # else: + # self.tensorboard_log_dir = expand_path(tensorboard_log_dir) + # self._tf = tensorflow + self._built = False self._saved = False self._loaded = False @@ -117,9 +140,19 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] # noinspection PyUnresolvedReferences result = component.partial_fit(*preprocessed) - if result is not None and self.tensorboard_log_dir is not None: + # if result is not None and self.tensorboard_log_dir is not None: + # if writer is None: + # writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / + # f'partial_fit_{component_index}_log')) + # for name, score in result.items(): + # summary = self._tf.Summary() + # summary.value.add(tag='partial_fit/' + name, simple_value=score) + # writer.add_summary(summary, i) + # writer.flush() + + if result is not None and self.logger[self.tensorboard_idx]["log_dir"] is not None: if writer is None: - writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / + writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / f'partial_fit_{component_index}_log')) for name, score in result.items(): summary = self._tf.Summary() @@ -132,8 +165,17 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] preprocessed = [preprocessed] result: Optional[Dict[str, Iterable[float]]] = component.fit(*preprocessed) - if result is not None and self.tensorboard_log_dir is not None: - writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / + # if result is not None and self.tensorboard_log_dir is not None: + # writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / + # f'fit_log_{component_index}')) + # for name, scores in result.items(): + # for i, score in enumerate(scores): + # summary = self._tf.Summary() + # summary.value.add(tag='fit/' + name, simple_value=score) + # writer.add_summary(summary, i) + # writer.flush() + if result is not None and self.logger[self.tensorboard_idx]["log_dir"] is not None: + writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / f'fit_log_{component_index}')) for name, scores in result.items(): for i, score in enumerate(scores): diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index 6f6fd8b4bf..5417f65620 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -26,6 +26,8 @@ from deeppavlov.core.trainers.fit_trainer import FitTrainer from deeppavlov.core.trainers.utils import parse_metrics, NumpyArrayEncoder +from deeppavlov.core.common.logging_class import * + log = getLogger(__name__) @@ -57,6 +59,7 @@ class NNTrainer(FitTrainer): in 
evaluation logs (default is ``False``) tensorboard_log_dir: path to a directory where tensorboard logs can be stored, ignored if None (default is ``None``) + logger : list of dictionary of possible loggers provided in config file validate_first: flag used to calculate metrics on the ``'valid'`` data type before starting training (default is ``True``) validation_patience: how many times in a row the validation metric has to not improve for early stopping, @@ -98,14 +101,19 @@ def __init__(self, chainer_config: dict, *, metric_optimization: str = 'maximize', evaluation_targets: Iterable[str] = ('valid', 'test'), show_examples: bool = False, - tensorboard_log_dir: Optional[Union[str, Path]] = None, + # tensorboard_log_dir: Optional[Union[str, Path]] = None, + logger : list = [], ## see FitTrainer + max_test_batches: int = -1, validate_first: bool = True, validation_patience: int = 5, val_every_n_epochs: int = -1, val_every_n_batches: int = -1, log_every_n_batches: int = -1, log_every_n_epochs: int = -1, log_on_k_batches: int = 1, + **kwargs) -> None: super().__init__(chainer_config, batch_size=batch_size, metrics=metrics, evaluation_targets=evaluation_targets, - show_examples=show_examples, tensorboard_log_dir=tensorboard_log_dir, + show_examples=show_examples, + # tensorboard_log_dir=tensorboard_log_dir, + logger = logger, max_test_batches=max_test_batches, **kwargs) if train_metrics is None: self.train_metrics = self.metrics @@ -146,9 +154,17 @@ def _improved(op): self.losses = [] self.start_time: Optional[float] = None - if self.tensorboard_log_dir is not None: - self.tb_train_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'train_log')) - self.tb_valid_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'valid_log')) + # if self.tensorboard_log_dir is not None: + # self.tb_train_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'train_log')) + # self.tb_valid_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'valid_log')) + + if self.tensorboard_idx is not None: + self.TensorboardLogger_train = TensorboardLogger('train', str(self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) + self.TensorboardLogger_valid = TensorboardLogger('valid', str(self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) + # self.TensorboardLogger = TensorboardLogger(self.logger[self.tensorboard_idx]["log_dir"]) + #self.tb_train_writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) + #self.tb_valid_writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) + def save(self) -> None: if self._loaded: @@ -162,102 +178,111 @@ def _is_initial_validation(self): def _is_first_validation(self): return self.validation_number == 1 - def _validate(self, iterator: DataLearningIterator, - tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None: - self._send_event(event_name='before_validation') - report = self.test(iterator.gen_batches(self.batch_size, data_type='valid', shuffle=False), - start_time=self.start_time) - - report['epochs_done'] = self.epoch - report['batches_seen'] = self.train_batches_seen - report['train_examples_seen'] = self.examples - - metrics = list(report['metrics'].items()) - - if tensorboard_tag is not None and self.tensorboard_log_dir is not None: - summary = self._tf.Summary() - for name, score in metrics: - summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) - if 
tensorboard_index is None: - tensorboard_index = self.train_batches_seen - self.tb_valid_writer.add_summary(summary, tensorboard_index) - self.tb_valid_writer.flush() - - m_name, score = metrics[0] - - # Update the patience - if self.score_best is None: - self.patience = 0 - else: - if self.improved(score, self.score_best): - self.patience = 0 - else: - self.patience += 1 - - # Run the validation model-saving logic - if self._is_initial_validation(): - log.info('Initial best {} of {}'.format(m_name, score)) - self.score_best = score - elif self._is_first_validation() and self.score_best is None: - log.info('First best {} of {}'.format(m_name, score)) - self.score_best = score - log.info('Saving model') - self.save() - elif self.improved(score, self.score_best): - log.info(f'Improved best {m_name} from {self.score_best} to {score}') - self.score_best = score - log.info('Saving model') - self.save() - else: - log.info('Did not improve on the {} of {}'.format(m_name, self.score_best)) - - report['impatience'] = self.patience - if self.validation_patience > 0: - report['patience_limit'] = self.validation_patience - - self._send_event(event_name='after_validation', data=report) - report = {'valid': report} - print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) - self.validation_number += 1 - - def _log(self, iterator: DataLearningIterator, - tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None: - self._send_event(event_name='before_log') - if self.log_on_k_batches == 0: - report = { - 'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5))) - } - else: - data = islice(iterator.gen_batches(self.batch_size, data_type='train', shuffle=True), - self.log_on_k_batches) - report = self.test(data, self.train_metrics, start_time=self.start_time) - - report.update({ - 'epochs_done': self.epoch, - 'batches_seen': self.train_batches_seen, - 'train_examples_seen': self.examples - }) - - metrics: List[Tuple[str, float]] = list(report.get('metrics', {}).items()) + list(self.last_result.items()) - - report.update(self.last_result) - if self.losses: - report['loss'] = sum(self.losses) / len(self.losses) - self.losses.clear() - metrics.append(('loss', report['loss'])) - - if metrics and self.tensorboard_log_dir is not None: - summary = self._tf.Summary() - - for name, score in metrics: - summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) - self.tb_train_writer.add_summary(summary, tensorboard_index) - self.tb_train_writer.flush() - - self._send_event(event_name='after_train_log', data=report) - - report = {'train': report} - print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) + # def _validate(self, iterator: DataLearningIterator, + # tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None: + # self.TensorboardLogger_valid(self , iterator, tensorboard_tag, tensorboard_index, log) + # self._send_event(event_name='before_validation') + # report = self.test(iterator.gen_batches(self.batch_size, data_type='valid', shuffle=False), + # start_time=self.start_time) + + # report['epochs_done'] = self.epoch + # report['batches_seen'] = self.train_batches_seen + # report['train_examples_seen'] = self.examples + + # metrics = list(report['metrics'].items()) + + # if tensorboard_tag is not None and self.tensorboard_log_dir is not None: + # summary = self._tf.Summary() + # for name, score in metrics: + # summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + 
# if tensorboard_index is None: + # tensorboard_index = self.train_batches_seen + # self.tb_valid_writer.add_summary(summary, tensorboard_index) + # self.tb_valid_writer.flush() + + # m_name, score = metrics[0] + + # # Update the patience + # if self.score_best is None: + # self.patience = 0 + # else: + # if self.improved(score, self.score_best): + # self.patience = 0 + # else: + # self.patience += 1 + + # # Run the validation model-saving logic + # if self._is_initial_validation(): + # log.info('Initial best {} of {}'.format(m_name, score)) + # self.score_best = score + # elif self._is_first_validation() and self.score_best is None: + # log.info('First best {} of {}'.format(m_name, score)) + # self.score_best = score + # log.info('Saving model') + # self.save() + # elif self.improved(score, self.score_best): + # log.info('Improved best {} of {}'.format(m_name, score)) + # self.score_best = score + # log.info('Saving model') + # self.save() + # else: + # log.info('Did not improve on the {} of {}'.format(m_name, self.score_best)) + + # report['impatience'] = self.patience + # if self.validation_patience > 0: + # report['patience_limit'] = self.validation_patience + + # self._send_event(event_name='after_validation', data=report) + # report = {'valid': report} + # print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) + # self.validation_number += 1 + + # def _log(self, iterator: DataLearningIterator, + # tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None: + # self.TensorboardLogger_train(self , iterator, tensorboard_tag, tensorboard_index , log) + # self._send_event(event_name='before_log') + # if self.log_on_k_batches == 0: + # report = { + # 'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5))) + # } + # else: + # data = islice(iterator.gen_batches(self.batch_size, data_type='train', shuffle=True), + # self.log_on_k_batches) + # report = self.test(data, self.train_metrics, start_time=self.start_time) + + # report.update({ + # 'epochs_done': self.epoch, + # 'batches_seen': self.train_batches_seen, + # 'train_examples_seen': self.examples + # }) + + # metrics: List[Tuple[str, float]] = list(report.get('metrics', {}).items()) + list(self.last_result.items()) + + # report.update(self.last_result) + # if self.losses: + # report['loss'] = sum(self.losses) / len(self.losses) + # self.losses.clear() + # metrics.append(('loss', report['loss'])) + + # # if metrics and self.tensorboard_log_dir is not None: + # if metrics and self.tensorboard_idx is not None: + # self.TensorboardLogger_train(self, metrics, tensorboard_tag, tensorboard_index) + # # if metrics and self.tensorboard_idx is not None: + # # summary = self._tf.Summary() + + # # for name, score in metrics: + # # summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + + # # self.TensorboardLogger_train(summary,tensorboard_index) + # # self.TensorboardLogger('train',summary,tensorboard_index) + # #self.tb_train_writer.add_summary(summary, tensorboard_index) + # #self.tb_train_writer.flush() + + # self._send_event(event_name='after_train_log', data=report) + + + # report = {'train': report} + # print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) def _send_event(self, event_name: str, data: Optional[dict] = None) -> None: report = { @@ -274,7 +299,8 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: """Train pipeline on batches using provided data iterator and initialization parameters""" self.start_time = 
time.time() if self.validate_first: - self._validate(iterator) + # self._validate(iterator) + self.TensorboardLogger_valid(self , iterator , log = log) while True: impatient = False @@ -292,11 +318,12 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: self.examples += len(x) if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0: - self._log(iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) + # self._log(iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) + self.TensorboardLogger_train(self , iterator, 'every_n_batches', self.train_batches_seen , log) # log not used for TB_train if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0: - self._validate(iterator, - tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) + # self._validate(iterator,tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) + self.TensorboardLogger_valid(self , iterator, 'every_n_batches', self.train_batches_seen, log) self._send_event(event_name='after_batch') @@ -315,10 +342,12 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: self.epoch += 1 if self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0: - self._log(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) + # self._log(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) + self.TensorboardLogger_train(self , iterator, 'every_n_epochs', self.epoch , log =log) if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0: - self._validate(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) + # self._validate(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) + self.TensorboardLogger_valid(self , iterator, 'every_n_epochs', self.epoch, log) self._send_event(event_name='after_epoch') From cd142f8c88756c9f6a85a46841d94668a018fcfe Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Wed, 1 Dec 2021 01:52:09 +0300 Subject: [PATCH 03/18] add docstrings to logging_class --- .../classifiers/sentiment_twitter.json | 29 ++- deeppavlov/core/common/logging_class.py | 218 +++++++++++++----- deeppavlov/core/trainers/fit_trainer.py | 100 ++++---- deeppavlov/core/trainers/nn_trainer.py | 183 ++++----------- 4 files changed, 257 insertions(+), 273 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 7add79309e..1846c11441 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -3,7 +3,7 @@ "class_name": "basic_classification_reader", "x": "Twit", "y": "Class", - "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data/modified_data" + "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data/modified_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", @@ -100,15 +100,17 @@ ] }, "train": { - "epochs": 5, - "log_every_n_epochs": 1, + "epochs": 10, "batch_size": 64, "metrics": [ "accuracy", "f1_macro", { "name": "roc_auc", - "inputs": ["y_onehot", "y_pred_probas"] + "inputs": [ + "y_onehot", + "y_pred_probas" + ] } ], "validation_patience": 5, @@ -121,13 +123,16 @@ "test" ], "class_name": "nn_trainer", - "tensorboard_log_dir": "{MODELS_PATH}/sentiment_twitter/logs", - "logger": [ - { - "name": "TensorboardLogger", - "log_dir": 
"{MODELS_PATH}/sentiment_twitter/Tensorboard_logs" - } - ] + "tensorboard_log_dir": "{MODELS_PATH}/sentiment_twitter/logs", + "logger": [ + { + "name": "TensorboardLogger", + "log_dir": "{MODELS_PATH}/sentiment_twitter/Tensorboard_logs" + }, + { + "name": "StdLogger" + } + ] }, "metadata": { "variables": { @@ -155,4 +160,4 @@ } ] } -} +} \ No newline at end of file diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index 2c0b0fec6d..c489355c4f 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -1,39 +1,123 @@ -from joblib import logger -import tensorflow as tf -import datetime +# Copyright 2019 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json import time +import datetime from itertools import islice -from typing import List, Tuple, Union, Optional, Iterable -import json -from deeppavlov.core.trainers.utils import parse_metrics, NumpyArrayEncoder +from abc import ABC, abstractmethod +from typing import List, Tuple, Optional +from logging import getLogger + +import tensorflow as tf + +from deeppavlov.core.trainers.utils import NumpyArrayEncoder +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator + +log = getLogger(__name__) + + +class TrainLogger(ABC): + """An abstract class for logging metrics during training process. + + There are three types of logging: + 1- StdLogger: print metrics during training + 2- TensorboardLogger: to log metrics to local file specified by log_dir in .json file. + 3- WandbLogger: Not implemented yet. + + """ + @abstractmethod + def __init__(self): + """ + The constructor for TrainLogger class. + """ + raise NotImplementedError + + @abstractmethod + def __call__(self): + """ + Call method with metrics as parameters for logging, according to chosen method. + + """ + raise NotImplementedError + + @abstractmethod + def print_info(self): + """ + Print inforamtion about logging method, like the logging directory... + + """ + raise NotImplementedError -class TrainLogger: - def __init__(self,log_dir): - pass - - def __call__(self,metrics): - pass class TensorboardLogger(TrainLogger): - def __init__(self, type, log_dir): - self.tb_writer = tf.summary.FileWriter(log_dir) - self.type = type - #self.tb_train_writer = tf.summary.FileWriter(str(log_dir / 'train_log')) - #self.tb_valid_writer = tf.summary.FileWriter(str(log_dir / 'valid_log')) + """ + TensorboardLogger class for logging metrics during training process into a local folder, later using TensorBoard tool for visualizations the logged data. + + Args: + type: 'train' for logging metrics of training process or 'valid' for validation process. + log_dir: path to local folder to log data into. 
+ + """ - def __call__(self, nn_trainer, iterator, tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None , log = None ): # default value for log for now = None - if self.type =='train': - print("logging Training metrics...") + def __init__(self, type: str, log_dir: str): + self.type = type + if log_dir is not None: + self.tb_writer = tf.summary.FileWriter(log_dir) + self.log_dir = log_dir + + def __call__(self, + nn_trainer, + iterator: DataLearningIterator, + tensorboard_tag: Optional[str] = None, + tensorboard_index: Optional[int] = None, + ) -> dict: + """ + override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. + for 'valid' logging type, log metrics of validation process to log_dir/valid_log. + for 'valid' type, 'call' function saves best score on validation data, and the model parameters corresponding to the best score. + + Args: + nn_trainer: 'NNTrainer' object which contains 'self' as variable. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' + tensorboard_index: one of two options: 'train_batches_seen', 'epoch' corresponding to 'tensorboard_tag' types respectively. + + Returns: + a report dict containing calculated metrics, spent time value, and other metrics according to 'type'. + + """ + if self.type == 'train': nn_trainer._send_event(event_name='before_log') if nn_trainer.log_on_k_batches == 0: report = { - 'time_spent': str(datetime.timedelta(seconds=round(time.time() - nn_trainer.start_time + 0.5))) + 'time_spent': + str( + datetime.timedelta( + seconds=round(time.time() - nn_trainer.start_time + + 0.5))) } else: - data = islice(iterator.gen_batches(nn_trainer.batch_size, data_type='train', shuffle=True), - nn_trainer.log_on_k_batches) - report = nn_trainer.test(data, nn_trainer.train_metrics, start_time=nn_trainer.start_time) + data = islice( + iterator.gen_batches(nn_trainer.batch_size, + data_type='train', + shuffle=True), + nn_trainer.log_on_k_batches) + report = nn_trainer.test(data, + nn_trainer.train_metrics, + start_time=nn_trainer.start_time) report.update({ 'epochs_done': nn_trainer.epoch, @@ -41,44 +125,33 @@ def __call__(self, nn_trainer, iterator, tensorboard_tag: Optional[str] = None, 'train_examples_seen': nn_trainer.examples }) - metrics: List[Tuple[str, float]] = list(report.get('metrics', {}).items()) + list(nn_trainer.last_result.items()) + metrics: List[Tuple[str, float]] = list( + report.get('metrics', {}).items()) + list( + nn_trainer.last_result.items()) report.update(nn_trainer.last_result) if nn_trainer.losses: - report['loss'] = sum(nn_trainer.losses) / len(nn_trainer.losses) + report['loss'] = sum(nn_trainer.losses) / len( + nn_trainer.losses) nn_trainer.losses.clear() metrics.append(('loss', report['loss'])) - # if metrics and self.tensorboard_log_dir is not None: - # if metrics and nn_trainer.tensorboard_idx is not None: - # self.TensorboardLogger_train(self, metrics, tensorboard_tag, tensorboard_index) if metrics and nn_trainer.tensorboard_idx is not None: + log.info(f"logging Training metrics to {self.log_dir}") summary = nn_trainer._tf.Summary() for name, score in metrics: - summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + summary.value.add(tag=f'{tensorboard_tag}/{name}', + simple_value=score) self.tb_writer.add_summary(summary, tensorboard_index) self.tb_writer.flush() - # summary = self._tf.Summary() - - # for name, 
score in metrics: - # summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) - - # self.tb_writer.add_summary(summary, tensorboard_index) - # self.tb_writer.flush() - # self.TensorboardLogger_train(summary,tensorboard_index) - # self.TensorboardLogger('train',summary,tensorboard_index) - #self.tb_train_writer.add_summary(summary, tensorboard_index) - #self.tb_train_writer.flush() nn_trainer._send_event(event_name='after_train_log', data=report) - report = {'train': report} - print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) else: - print("logging Validation metrics...") nn_trainer._send_event(event_name='before_validation') - report = nn_trainer.test(iterator.gen_batches(nn_trainer.batch_size, data_type='valid', shuffle=False), - start_time=nn_trainer.start_time) + report = nn_trainer.test(iterator.gen_batches( + nn_trainer.batch_size, data_type='valid', shuffle=False), + start_time=nn_trainer.start_time) report['epochs_done'] = nn_trainer.epoch report['batches_seen'] = nn_trainer.train_batches_seen @@ -87,9 +160,11 @@ def __call__(self, nn_trainer, iterator, tensorboard_tag: Optional[str] = None, metrics = list(report['metrics'].items()) if tensorboard_tag is not None and nn_trainer.tensorboard_idx is not None: + log.info(f"logging Validation metrics to {self.log_dir}") summary = nn_trainer._tf.Summary() for name, score in metrics: - summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + summary.value.add(tag=f'{tensorboard_tag}/{name}', + simple_value=score) if tensorboard_index is None: tensorboard_index = nn_trainer.train_batches_seen self.tb_writer.add_summary(summary, tensorboard_index) @@ -110,7 +185,8 @@ def __call__(self, nn_trainer, iterator, tensorboard_tag: Optional[str] = None, if nn_trainer._is_initial_validation(): log.info('Initial best {} of {}'.format(m_name, score)) nn_trainer.score_best = score - elif nn_trainer._is_first_validation() and nn_trainer.score_best is None: + elif nn_trainer._is_first_validation( + ) and nn_trainer.score_best is None: log.info('First best {} of {}'.format(m_name, score)) nn_trainer.score_best = score log.info('Saving model') @@ -121,29 +197,47 @@ def __call__(self, nn_trainer, iterator, tensorboard_tag: Optional[str] = None, log.info('Saving model') nn_trainer.save() else: - log.info('Did not improve on the {} of {}'.format(m_name, nn_trainer.score_best)) + log.info('Did not improve on the {} of {}'.format( + m_name, nn_trainer.score_best)) report['impatience'] = nn_trainer.patience if nn_trainer.validation_patience > 0: report['patience_limit'] = nn_trainer.validation_patience nn_trainer._send_event(event_name='after_validation', data=report) - report = {'valid': report} - print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) nn_trainer.validation_number += 1 + return report + + def print_info(self): + raise NotImplementedError + + +class StdLogger(TrainLogger): + """ + StdLogger class for printing report about current training or validation process to stdout. + + Args: + type: 'train' for printing report of training process or 'valid' for validation process. + log_true (boo): if True: print of the StdLogger is provided in .json file as logging method or not. default False. 
+ + """ + def __init__(self, type: str, log_true: bool = False): + self.type = type + self.log_true = log_true + + def __call__(self, report: dict) -> None: + if(self.log_true): + report = {self.type: report} + log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) + def print_info(self): + raise NotImplementedError - # summary = tf.Summary() - # for name, score in metrics: - # summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) - # self.tb_writer.add_summary(summary, tensorboard_index) - # self.tb_writer.flush() +class WandbLogger(TrainLogger): + def __init__(self): + raise NotImplementedError - # if train_or_valid == 'train': - # self.tb_train_writer.add_summary(summary, tensorboard_index) - # self.tb_train_writer.flush() - # else: - # self.tb_valid_writer.add_summary(summary, tensorboard_index) - # self.tb_valid_writer.flush() \ No newline at end of file + def print_info(self): + raise NotImplementedError diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index 5ffd5b62eb..ede1162148 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -18,7 +18,7 @@ from itertools import islice from logging import getLogger from pathlib import Path -from typing import Tuple, Dict, Union, Optional, Iterable, Any, Collection +from typing import List, Tuple, Dict, Union, Optional, Iterable, Any, Collection from deeppavlov.core.commands.infer import build_model from deeppavlov.core.commands.utils import expand_path @@ -64,28 +64,35 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, show_examples: bool = False, # tensorboard_log_dir: Optional[Union[str, Path]] = None, max_test_batches: int = -1, - logger: list = [], + logger: Optional[List[Dict]] = None, **kwargs) -> None: if kwargs: - log.info(f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:') + log.info( + f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:') self.chainer_config = chainer_config - self._chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y')) + self._chainer = Chainer( + chainer_config['in'], chainer_config['out'], chainer_config.get('in_y')) self.batch_size = batch_size - self.metrics = parse_metrics(metrics, self._chainer.in_y, self._chainer.out_params) + self.metrics = parse_metrics( + metrics, self._chainer.in_y, self._chainer.out_params) self.evaluation_targets = tuple(evaluation_targets) self.show_examples = show_examples self.max_test_batches = None if max_test_batches < 0 else max_test_batches # self.tensorboard_log_dir: Optional[Path] = tensorboard_log_dir - self.logger: list = logger + self.logger: Optional[List[Dict]] = logger + def get_method_idx(logger, name): - for i in range(len(logger)): - if logger[i]["name"] == name: - return i - return None + try: + for i in range(len(logger)): + if logger[i]["name"] == name: + return i + except: + return None self.tensorboard_idx = get_method_idx(self.logger, "TensorboardLogger") - # self.wandb = get_method_idx(self.logger, "") + self.stdlogger_idx = get_method_idx(self.logger, "StdLogger") + if self.tensorboard_idx is not None: try: # noinspection PyPackageRequirements @@ -94,24 +101,13 @@ def get_method_idx(logger, name): except ImportError: log.warning('TensorFlow could not be imported, so tensorboard log directory' f'`{self.logger[self.tensorboard_idx]["log_dir"]}` will be ignored') + self.tensorboard_idx = None # 
check it self.logger[self.tensorboard_idx]["log_dir"] = None else: - self.logger[self.tensorboard_idx]["log_dir"] = expand_path(self.logger[self.tensorboard_idx]["log_dir"]) + self.logger[self.tensorboard_idx]["log_dir"] = expand_path( + self.logger[self.tensorboard_idx]["log_dir"]) self._tf = tensorflow - # if tensorboard_log_dir is not None: - # try: - # # noinspection PyPackageRequirements - # # noinspection PyUnresolvedReferences - # import tensorflow - # except ImportError: - # log.warning('TensorFlow could not be imported, so tensorboard log directory' - # f'`{self.tensorboard_log_dir}` will be ignored') - # self.tensorboard_log_dir = None - # else: - # self.tensorboard_log_dir = expand_path(tensorboard_log_dir) - # self._tf = tensorflow - self._built = False self._saved = False self._loaded = False @@ -136,51 +132,37 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] writer = None for i, (x, y) in enumerate(iterator.gen_batches(self.batch_size, shuffle=False)): - preprocessed = self._chainer.compute(x, y, targets=targets) + preprocessed = self._chainer.compute( + x, y, targets=targets) # noinspection PyUnresolvedReferences result = component.partial_fit(*preprocessed) - # if result is not None and self.tensorboard_log_dir is not None: - # if writer is None: - # writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / - # f'partial_fit_{component_index}_log')) - # for name, score in result.items(): - # summary = self._tf.Summary() - # summary.value.add(tag='partial_fit/' + name, simple_value=score) - # writer.add_summary(summary, i) - # writer.flush() - if result is not None and self.logger[self.tensorboard_idx]["log_dir"] is not None: if writer is None: writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / f'partial_fit_{component_index}_log')) for name, score in result.items(): summary = self._tf.Summary() - summary.value.add(tag='partial_fit/' + name, simple_value=score) + summary.value.add( + tag='partial_fit/' + name, simple_value=score) writer.add_summary(summary, i) writer.flush() else: - preprocessed = self._chainer.compute(*iterator.get_instances(), targets=targets) + preprocessed = self._chainer.compute( + *iterator.get_instances(), targets=targets) if len(targets) == 1: preprocessed = [preprocessed] - result: Optional[Dict[str, Iterable[float]]] = component.fit(*preprocessed) - - # if result is not None and self.tensorboard_log_dir is not None: - # writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / - # f'fit_log_{component_index}')) - # for name, scores in result.items(): - # for i, score in enumerate(scores): - # summary = self._tf.Summary() - # summary.value.add(tag='fit/' + name, simple_value=score) - # writer.add_summary(summary, i) - # writer.flush() + result: Optional[Dict[str, Iterable[float]] + ] = component.fit(*preprocessed) + if result is not None and self.logger[self.tensorboard_idx]["log_dir"] is not None: writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / f'fit_log_{component_index}')) for name, scores in result.items(): for i, score in enumerate(scores): summary = self._tf.Summary() - summary.value.add(tag='fit/' + name, simple_value=score) + summary.value.add( + tag='fit/' + name, simple_value=score) writer.add_summary(summary, i) writer.flush() @@ -197,7 +179,8 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] def _load(self) -> None: if not self._loaded: self._chainer.destroy() - 
self._chainer = build_model({'chainer': self.chainer_config}, load_trained=self._saved) + self._chainer = build_model( + {'chainer': self.chainer_config}, load_trained=self._saved) self._loaded = True def get_chainer(self) -> Chainer: @@ -237,7 +220,8 @@ def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]], if metrics is None: metrics = self.metrics - expected_outputs = list(set().union(self._chainer.out_params, *[m.inputs for m in metrics])) + expected_outputs = list(set().union( + self._chainer.out_params, *[m.inputs for m in metrics])) outputs = {out: [] for out in expected_outputs} examples = 0 @@ -246,7 +230,8 @@ def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]], for x, y_true in data: examples += len(x) - y_predicted = list(self._chainer.compute(list(x), list(y_true), targets=expected_outputs)) + y_predicted = list(self._chainer.compute( + list(x), list(y_true), targets=expected_outputs)) if len(expected_outputs) == 1: y_predicted = [y_predicted] for out, val in zip(outputs.values(), y_predicted): @@ -273,7 +258,8 @@ def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]], for out_name, y_predicted_group in zip(expected_outputs, y_predicted) if out_name in self._chainer.out_params]) if len(self._chainer.out_params) == 1: - y_predicted = [y_predicted_item[0] for y_predicted_item in y_predicted] + y_predicted = [y_predicted_item[0] + for y_predicted_item in y_predicted] report['examples'] = [{ 'x': x_item, 'y_predicted': y_predicted_item, @@ -302,10 +288,12 @@ def evaluate(self, iterator: DataLearningIterator, evaluation_targets: Optional[ res = {} for data_type in evaluation_targets: - data_gen = iterator.gen_batches(self.batch_size, data_type=data_type, shuffle=False) + data_gen = iterator.gen_batches( + self.batch_size, data_type=data_type, shuffle=False) report = self.test(data_gen) res[data_type] = report if print_reports: - print(json.dumps({data_type: report}, ensure_ascii=False, cls=NumpyArrayEncoder)) + print(json.dumps({data_type: report}, + ensure_ascii=False, cls=NumpyArrayEncoder)) return res diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index 5417f65620..c91cd34d3f 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -13,20 +13,16 @@ # limitations under the License. 
import datetime -import json import time -from itertools import islice from logging import getLogger -from pathlib import Path -from typing import List, Tuple, Union, Optional, Iterable +from typing import List, Dict, Union, Optional, Iterable from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register from deeppavlov.core.data.data_learning_iterator import DataLearningIterator from deeppavlov.core.trainers.fit_trainer import FitTrainer -from deeppavlov.core.trainers.utils import parse_metrics, NumpyArrayEncoder - -from deeppavlov.core.common.logging_class import * +from deeppavlov.core.trainers.utils import parse_metrics +from deeppavlov.core.common.logging_class import TensorboardLogger, StdLogger, WandbLogger log = getLogger(__name__) @@ -59,7 +55,8 @@ class NNTrainer(FitTrainer): in evaluation logs (default is ``False``) tensorboard_log_dir: path to a directory where tensorboard logs can be stored, ignored if None (default is ``None``) - logger : list of dictionary of possible loggers provided in config file + logger : list of dictionaries of possible loggers provided in config file, ignored if None + (default is ``None``), possible loggers: TensorboardLogger and StdLogger validate_first: flag used to calculate metrics on the ``'valid'`` data type before starting training (default is ``True``) validation_patience: how many times in a row the validation metric has to not improve for early stopping, @@ -91,7 +88,7 @@ class NNTrainer(FitTrainer): """ - def __init__(self, chainer_config: dict, *, + def __init__(self, chainer_config: dict, *, batch_size: int = 1, epochs: int = -1, start_epoch_num: int = 0, @@ -102,7 +99,7 @@ def __init__(self, chainer_config: dict, *, evaluation_targets: Iterable[str] = ('valid', 'test'), show_examples: bool = False, # tensorboard_log_dir: Optional[Union[str, Path]] = None, - logger : list = [], ## see FitTrainer + logger: Optional[List[Dict]] = None, max_test_batches: int = -1, validate_first: bool = True, @@ -112,13 +109,13 @@ def __init__(self, chainer_config: dict, *, **kwargs) -> None: super().__init__(chainer_config, batch_size=batch_size, metrics=metrics, evaluation_targets=evaluation_targets, show_examples=show_examples, - # tensorboard_log_dir=tensorboard_log_dir, - logger = logger, + logger=logger, max_test_batches=max_test_batches, **kwargs) if train_metrics is None: self.train_metrics = self.metrics else: - self.train_metrics = parse_metrics(train_metrics, self._chainer.in_y, self._chainer.out_params) + self.train_metrics = parse_metrics( + train_metrics, self._chainer.in_y, self._chainer.out_params) metric_optimization = metric_optimization.strip().lower() self.score_best = None @@ -132,7 +129,8 @@ def _improved(op): elif metric_optimization == 'minimize': self.improved = _improved(lambda a, b: a < b) else: - raise ConfigError('metric_optimization has to be one of {}'.format(['maximize', 'minimize'])) + raise ConfigError('metric_optimization has to be one of {}'.format( + ['maximize', 'minimize'])) self.validate_first = validate_first self.validation_number = 0 if validate_first else 1 @@ -154,17 +152,19 @@ def _improved(op): self.losses = [] self.start_time: Optional[float] = None - # if self.tensorboard_log_dir is not None: - # self.tb_train_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'train_log')) - # self.tb_valid_writer = self._tf.summary.FileWriter(str(self.tensorboard_log_dir / 'valid_log')) - if self.tensorboard_idx is not None: - self.TensorboardLogger_train = 
TensorboardLogger('train', str(self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) - self.TensorboardLogger_valid = TensorboardLogger('valid', str(self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) - # self.TensorboardLogger = TensorboardLogger(self.logger[self.tensorboard_idx]["log_dir"]) - #self.tb_train_writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) - #self.tb_valid_writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) - + self.tensorboardlogger_train = TensorboardLogger('train', str( + self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) + self.tensorboardlogger_valid = TensorboardLogger('valid', str( + self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) + else: + self.tensorboardlogger_train = TensorboardLogger('train') + self.tensorboardlogger_valid = TensorboardLogger('valid') + + self.std_logger_train = StdLogger( + 'train', self.stdlogger_idx is not None) + self.std_logger_valid = StdLogger( + 'valid', self.stdlogger_idx is not None) def save(self) -> None: if self._loaded: @@ -178,112 +178,6 @@ def _is_initial_validation(self): def _is_first_validation(self): return self.validation_number == 1 - # def _validate(self, iterator: DataLearningIterator, - # tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None) -> None: - # self.TensorboardLogger_valid(self , iterator, tensorboard_tag, tensorboard_index, log) - # self._send_event(event_name='before_validation') - # report = self.test(iterator.gen_batches(self.batch_size, data_type='valid', shuffle=False), - # start_time=self.start_time) - - # report['epochs_done'] = self.epoch - # report['batches_seen'] = self.train_batches_seen - # report['train_examples_seen'] = self.examples - - # metrics = list(report['metrics'].items()) - - # if tensorboard_tag is not None and self.tensorboard_log_dir is not None: - # summary = self._tf.Summary() - # for name, score in metrics: - # summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) - # if tensorboard_index is None: - # tensorboard_index = self.train_batches_seen - # self.tb_valid_writer.add_summary(summary, tensorboard_index) - # self.tb_valid_writer.flush() - - # m_name, score = metrics[0] - - # # Update the patience - # if self.score_best is None: - # self.patience = 0 - # else: - # if self.improved(score, self.score_best): - # self.patience = 0 - # else: - # self.patience += 1 - - # # Run the validation model-saving logic - # if self._is_initial_validation(): - # log.info('Initial best {} of {}'.format(m_name, score)) - # self.score_best = score - # elif self._is_first_validation() and self.score_best is None: - # log.info('First best {} of {}'.format(m_name, score)) - # self.score_best = score - # log.info('Saving model') - # self.save() - # elif self.improved(score, self.score_best): - # log.info('Improved best {} of {}'.format(m_name, score)) - # self.score_best = score - # log.info('Saving model') - # self.save() - # else: - # log.info('Did not improve on the {} of {}'.format(m_name, self.score_best)) - - # report['impatience'] = self.patience - # if self.validation_patience > 0: - # report['patience_limit'] = self.validation_patience - - # self._send_event(event_name='after_validation', data=report) - # report = {'valid': report} - # print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) - # self.validation_number += 1 - - # def _log(self, iterator: DataLearningIterator, - # tensorboard_tag: 
Optional[str] = None, tensorboard_index: Optional[int] = None) -> None: - # self.TensorboardLogger_train(self , iterator, tensorboard_tag, tensorboard_index , log) - # self._send_event(event_name='before_log') - # if self.log_on_k_batches == 0: - # report = { - # 'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5))) - # } - # else: - # data = islice(iterator.gen_batches(self.batch_size, data_type='train', shuffle=True), - # self.log_on_k_batches) - # report = self.test(data, self.train_metrics, start_time=self.start_time) - - # report.update({ - # 'epochs_done': self.epoch, - # 'batches_seen': self.train_batches_seen, - # 'train_examples_seen': self.examples - # }) - - # metrics: List[Tuple[str, float]] = list(report.get('metrics', {}).items()) + list(self.last_result.items()) - - # report.update(self.last_result) - # if self.losses: - # report['loss'] = sum(self.losses) / len(self.losses) - # self.losses.clear() - # metrics.append(('loss', report['loss'])) - - # # if metrics and self.tensorboard_log_dir is not None: - # if metrics and self.tensorboard_idx is not None: - # self.TensorboardLogger_train(self, metrics, tensorboard_tag, tensorboard_index) - # # if metrics and self.tensorboard_idx is not None: - # # summary = self._tf.Summary() - - # # for name, score in metrics: - # # summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) - - # # self.TensorboardLogger_train(summary,tensorboard_index) - # # self.TensorboardLogger('train',summary,tensorboard_index) - # #self.tb_train_writer.add_summary(summary, tensorboard_index) - # #self.tb_train_writer.flush() - - # self._send_event(event_name='after_train_log', data=report) - - - # report = {'train': report} - # print(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) - def _send_event(self, event_name: str, data: Optional[dict] = None) -> None: report = { 'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5))), @@ -299,8 +193,8 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: """Train pipeline on batches using provided data iterator and initialization parameters""" self.start_time = time.time() if self.validate_first: - # self._validate(iterator) - self.TensorboardLogger_valid(self , iterator , log = log) + report_stdlogger = self.tensorboardlogger_valid(self, iterator) + self.std_logger_valid(report_stdlogger) while True: impatient = False @@ -318,13 +212,14 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: self.examples += len(x) if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0: - # self._log(iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) - self.TensorboardLogger_train(self , iterator, 'every_n_batches', self.train_batches_seen , log) # log not used for TB_train + report_stdlogger = self.tensorboardlogger_train( + self, iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) + self.std_logger_train(report_stdlogger) if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0: - # self._validate(iterator,tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) - self.TensorboardLogger_valid(self , iterator, 'every_n_batches', self.train_batches_seen, log) - + report_stdlogger = self.tensorboardlogger_valid( + self, iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) + 
self.std_logger_valid(report_stdlogger) self._send_event(event_name='after_batch') if 0 < self.max_batches <= self.train_batches_seen: @@ -342,13 +237,14 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: self.epoch += 1 if self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0: - # self._log(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) - self.TensorboardLogger_train(self , iterator, 'every_n_epochs', self.epoch , log =log) + report_stdlogger = self.tensorboardlogger_train( + self, iterator, 'every_n_epochs', self.epoch) + self.std_logger_train(report_stdlogger) if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0: - # self._validate(iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) - self.TensorboardLogger_valid(self , iterator, 'every_n_epochs', self.epoch, log) - + report_stdlogger = self.tensorboardlogger_valid( + self, iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) + self.std_logger_valid(report_stdlogger) self._send_event(event_name='after_epoch') if 0 < self.max_epochs <= self.epoch: @@ -367,7 +263,8 @@ def train(self, iterator: DataLearningIterator) -> None: except KeyboardInterrupt: log.info('Stopped training') else: - log.warning(f'Using {self.__class__.__name__} for a pipeline without batched training') + log.warning( + f'Using {self.__class__.__name__} for a pipeline without batched training') # Run the at-train-exit model-saving logic if self.validation_number < 1: From 1b9361f50b48c96704119455d8260ed702f1442d Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Wed, 1 Dec 2021 02:11:18 +0300 Subject: [PATCH 04/18] correct data_path in .json file --- deeppavlov/configs/classifiers/sentiment_twitter.json | 2 +- deeppavlov/core/common/logging_class.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 1846c11441..8a61894e08 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -3,7 +3,7 @@ "class_name": "basic_classification_reader", "x": "Twit", "y": "Class", - "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data/modified_data" + "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index c489355c4f..eb647b7151 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -69,7 +69,7 @@ class TensorboardLogger(TrainLogger): Args: type: 'train' for logging metrics of training process or 'valid' for validation process. log_dir: path to local folder to log data into. - + """ def __init__(self, type: str, log_dir: str): @@ -220,8 +220,9 @@ class StdLogger(TrainLogger): Args: type: 'train' for printing report of training process or 'valid' for validation process. log_true (boo): if True: print of the StdLogger is provided in .json file as logging method or not. default False. 
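
For reference, the stdout path that this StdLogger class implements reduces to the sketch below. It is a simplified stand-in, not the DeepPavlov class itself: the real logger prints through the package logger with NumpyArrayEncoder, which is replaced here by a plain `default=str` fallback so the snippet runs on its own.

```python
import json
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


class StdLoggerSketch:
    """Stand-in for the StdLogger above: wrap the report in its split name and print one JSON line."""

    def __init__(self, type: str, log_true: bool = False):
        self.type = type          # 'train' or 'valid'
        self.log_true = log_true  # only print when the config actually requests stdout logging

    def __call__(self, report: dict) -> None:
        if self.log_true:
            # the real implementation uses NumpyArrayEncoder; default=str is a portable substitute
            log.info(json.dumps({self.type: report}, ensure_ascii=False, default=str))


StdLoggerSketch("valid", log_true=True)({"metrics": {"accuracy": 0.71}, "epochs_done": 3})
```
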
- + """ + def __init__(self, type: str, log_true: bool = False): self.type = type self.log_true = log_true From be97470b0c982dda9d242b995e4be22af85bf5ce Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Mon, 6 Dec 2021 12:30:53 +0300 Subject: [PATCH 05/18] Add WandbLogger --- .../classifiers/sentiment_twitter.json | 5 ++ deeppavlov/core/common/logging_class.py | 65 +++++++++++++++++-- deeppavlov/core/trainers/fit_trainer.py | 2 +- deeppavlov/core/trainers/nn_trainer.py | 7 ++ requirements.txt | 1 + 5 files changed, 73 insertions(+), 7 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 8a61894e08..f4a2010786 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -131,6 +131,11 @@ }, { "name": "StdLogger" + }, + { + "name": "WandbLogger", + "API_Key":"40_chars from your setting in wandb site", + "config": {"configs": "an option for user to write his config, not implemented!"} } ] }, diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index eb647b7151..1b88fdc337 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -21,6 +21,7 @@ from logging import getLogger import tensorflow as tf +import wandb from deeppavlov.core.trainers.utils import NumpyArrayEncoder from deeppavlov.core.data.data_learning_iterator import DataLearningIterator @@ -67,8 +68,8 @@ class TensorboardLogger(TrainLogger): TensorboardLogger class for logging metrics during training process into a local folder, later using TensorBoard tool for visualizations the logged data. Args: - type: 'train' for logging metrics of training process or 'valid' for validation process. - log_dir: path to local folder to log data into. + type (str): 'train' for logging metrics of training process or 'valid' for validation process. + log_dir (str): path to local folder to log data into. """ @@ -77,6 +78,8 @@ def __init__(self, type: str, log_dir: str): if log_dir is not None: self.tb_writer = tf.summary.FileWriter(log_dir) self.log_dir = log_dir + else: + self.log_dir = None def __call__(self, nn_trainer, @@ -136,7 +139,7 @@ def __call__(self, nn_trainer.losses.clear() metrics.append(('loss', report['loss'])) - if metrics and nn_trainer.tensorboard_idx is not None: + if metrics and self.log_dir is not None: # nn_trainer.tensorboard_idx is not None log.info(f"logging Training metrics to {self.log_dir}") summary = nn_trainer._tf.Summary() @@ -159,7 +162,8 @@ def __call__(self, metrics = list(report['metrics'].items()) - if tensorboard_tag is not None and nn_trainer.tensorboard_idx is not None: + # nn_trainer.tensorboard_idx is not None: + if tensorboard_tag is not None and self.log_dir is not None: log.info(f"logging Validation metrics to {self.log_dir}") summary = nn_trainer._tf.Summary() for name, score in metrics: @@ -228,6 +232,13 @@ def __init__(self, type: str, log_true: bool = False): self.log_true = log_true def __call__(self, report: dict) -> None: + """ + Print report to stdout. + + Args: + report(dict): report to log to stdout. 
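
The `logger` block that this commit adds to the train config can be read as the Python literal below, together with the name-based lookup the trainer performs on it. The key value is a placeholder, and `get_logger_idx` is an illustrative name for the lookup helper, not necessarily the exact one in the code.

```python
logger_config = [
    {"name": "TensorboardLogger", "log_dir": "{MODELS_PATH}/sentiment_twitter/Tensorboard_logs"},
    {"name": "StdLogger"},
    {"name": "WandbLogger", "API_Key": "<40-character key from your wandb.ai settings>"},
]


def get_logger_idx(logger: list, name: str):
    """Return the position of the named backend, or None if it is not configured."""
    for i, entry in enumerate(logger):
        if entry.get("name") == name:
            return i
    return None


print(get_logger_idx(logger_config, "WandbLogger"))    # -> 2
print(get_logger_idx(logger_config, "MissingLogger"))  # -> None
```

A backend whose entry is missing simply resolves to `None` and is skipped by the trainer.
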
+ + """ if(self.log_true): report = {self.type: report} log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) @@ -237,8 +248,50 @@ def print_info(self): class WandbLogger(TrainLogger): - def __init__(self): - raise NotImplementedError + """ + WandbLogger class for logging report about current training or validation process to WandB ("https://wandb.ai/site"). + + WandB is a central dashboard to keep track of your hyperparameters, system metrics, and predictions so you can compare models live, and share your findings. + + Args: + key (string, optional): authentication key. + + """ + + def __init__(self, key: Optional[str] = None): + self.config = {"lr": 0.1} # not completed + wandb.login(key=key, relogin=True) + wandb.init(project="Deeppavlov_Test", + group="Group_3", + job_type="train", + config=self.config, + name="Test_logging" + ) + + def __call__(self, report: dict) -> None: + """" + Logging report of the training process to wandb. + + Args: + report (dict): report to log to WandB. + + Returns: + a report dict containing calculated metrics, spent time value, and other metrics according to 'type'. + + """ + for i in report.keys(): + if type(report[i]) == dict: + for j in report[i].keys(): + wandb.log({j: report[i].keys()[j]}) + else: + if (i == 'time_spent'): + t = time.strptime(report[i], '%H:%M:%S') + y_seconds = int(datetime.timedelta( + hours=t.tm_hour, minutes=t.tm_min, seconds=t.tm_sec).total_seconds()) + wandb.log({i: y_seconds}) + else: + wandb.log({i: report[i]}) def print_info(self): raise NotImplementedError + diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index ede1162148..089689383d 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -92,7 +92,7 @@ def get_method_idx(logger, name): return None self.tensorboard_idx = get_method_idx(self.logger, "TensorboardLogger") self.stdlogger_idx = get_method_idx(self.logger, "StdLogger") - + self.wandblogger_idx = get_method_idx(self.logger, "WandbLogger") if self.tensorboard_idx is not None: try: # noinspection PyPackageRequirements diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index c91cd34d3f..2fddb1ae7e 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -160,6 +160,9 @@ def _improved(op): else: self.tensorboardlogger_train = TensorboardLogger('train') self.tensorboardlogger_valid = TensorboardLogger('valid') + + if self.wandblogger_idx is not None: + self.wandblogger = WandbLogger(self.logger[self.wandblogger_idx]["API_Key"]) self.std_logger_train = StdLogger( 'train', self.stdlogger_idx is not None) @@ -215,6 +218,8 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: report_stdlogger = self.tensorboardlogger_train( self, iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) self.std_logger_train(report_stdlogger) + if self.wandblogger_idx is not None: + self.wandblogger(report_stdlogger) if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0: report_stdlogger = self.tensorboardlogger_valid( @@ -240,6 +245,8 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: report_stdlogger = self.tensorboardlogger_train( self, iterator, 'every_n_epochs', self.epoch) self.std_logger_train(report_stdlogger) + if self.wandblogger_idx is not None: + self.wandblogger(report_stdlogger) if self.val_every_n_epochs > 0 and self.epoch % 
self.val_every_n_epochs == 0: report_stdlogger = self.tensorboardlogger_valid( diff --git a/requirements.txt b/requirements.txt index 0198dfc49b..2ed76c4537 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,3 +24,4 @@ click==7.1.2 uvicorn==0.11.7 sacremoses==0.0.35 uvloop==0.14.0 +wandb==0.12.7 \ No newline at end of file From 9b6f71ec0f675f4faa5dcb7f75683ab9f68c6abd Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Wed, 22 Dec 2021 00:53:21 +0300 Subject: [PATCH 06/18] Add init --- .../classifiers/sentiment_twitter.json | 15 ++++++++++++--- deeppavlov/core/common/logging_class.py | 19 +++++++++---------- deeppavlov/core/trainers/fit_trainer.py | 1 + deeppavlov/core/trainers/nn_trainer.py | 5 +++-- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index f4a2010786..8117368bc5 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -3,7 +3,7 @@ "class_name": "basic_classification_reader", "x": "Twit", "y": "Class", - "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data" + "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data/modified_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", @@ -134,8 +134,17 @@ }, { "name": "WandbLogger", - "API_Key":"40_chars from your setting in wandb site", - "config": {"configs": "an option for user to write his config, not implemented!"} + "API_Key":"be5cac1976dae2abd87fd045a7a101248c0a0253", + "project": "DeepTest", + "group": "Group_Name", + "job_type":"Name job Type", + "run_name":"Run name", + "config": { + "description": "add any hyperprameter you want to monitor, architecture discription,..", + "learning_rate": 0.02, + "architecture": "CNN", + "dataset": "CIFAR-100" + } } ] }, diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index 1b88fdc337..74d7c713e3 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -17,7 +17,7 @@ import datetime from itertools import islice from abc import ABC, abstractmethod -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Dict from logging import getLogger import tensorflow as tf @@ -257,15 +257,14 @@ class WandbLogger(TrainLogger): key (string, optional): authentication key. 
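
With the config keys introduced here, the W&B initialisation that the logger performs amounts to roughly the call sequence below. All values are illustrative placeholders; only the parameter names follow the patch.

```python
import wandb

wandb_section = {
    "API_Key": "<your 40-character key>",
    "project": "DeepTest",
    "group": "Group_Name",
    "job_type": "train",
    "run_name": "sentiment_twitter_cnn",
    "config": {"learning_rate": 0.02, "architecture": "CNN", "dataset": "sentiment_twitter_data"},
}

# authenticate once, then open a run carrying the user-supplied metadata
wandb.login(key=wandb_section.get("API_Key"), relogin=True)
wandb.init(
    project=wandb_section.get("project"),
    group=wandb_section.get("group"),
    job_type=wandb_section.get("job_type"),
    name=wandb_section.get("run_name"),
    config=wandb_section.get("config"),
)
```
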
""" - - def __init__(self, key: Optional[str] = None): - self.config = {"lr": 0.1} # not completed - wandb.login(key=key, relogin=True) - wandb.init(project="Deeppavlov_Test", - group="Group_3", - job_type="train", - config=self.config, - name="Test_logging" + # wandb_keys = ["project","group","job_type","name","config"] + def __init__(self, wandb_init: Optional[Dict] = None): + wandb.login(key=wandb_init.get("API_Key",None), relogin=True) + wandb.init(project=wandb_init.get("project",None), + group=wandb_init.get("group",None), + job_type=wandb_init.get("job_type",None), + config=wandb_init.get("config",None), + name=wandb_init.get("run_name",None) ) def __call__(self, report: dict) -> None: diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index 089689383d..ee5a548408 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -19,6 +19,7 @@ from logging import getLogger from pathlib import Path from typing import List, Tuple, Dict, Union, Optional, Iterable, Any, Collection +from collections import defaultdict from deeppavlov.core.commands.infer import build_model from deeppavlov.core.commands.utils import expand_path diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index 2fddb1ae7e..6373bd4937 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -16,6 +16,7 @@ import time from logging import getLogger from typing import List, Dict, Union, Optional, Iterable +from collections import defaultdict from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register @@ -160,9 +161,9 @@ def _improved(op): else: self.tensorboardlogger_train = TensorboardLogger('train') self.tensorboardlogger_valid = TensorboardLogger('valid') - + if self.wandblogger_idx is not None: - self.wandblogger = WandbLogger(self.logger[self.wandblogger_idx]["API_Key"]) + self.wandblogger = WandbLogger(self.logger[self.wandblogger_idx]) self.std_logger_train = StdLogger( 'train', self.stdlogger_idx is not None) From eaf908dfab591a22f7b281ca3b85cc87d9794954 Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Wed, 22 Dec 2021 11:51:14 +0300 Subject: [PATCH 07/18] Add wandb --- .../classifiers/sentiment_twitter.json | 5 +- deeppavlov/core/common/logging_class.py | 177 ++++++++++-------- deeppavlov/core/trainers/nn_trainer.py | 3 + 3 files changed, 106 insertions(+), 79 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 8117368bc5..5796c5908d 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -100,7 +100,7 @@ ] }, "train": { - "epochs": 10, + "epochs": 30, "batch_size": 64, "metrics": [ "accuracy", @@ -143,8 +143,9 @@ "description": "add any hyperprameter you want to monitor, architecture discription,..", "learning_rate": 0.02, "architecture": "CNN", - "dataset": "CIFAR-100" + "dataset": "sentiment_twitter_data" } + } ] }, diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index 74d7c713e3..0611dfd03c 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -35,9 +35,10 @@ class TrainLogger(ABC): There are three types of logging: 1- StdLogger: print metrics during training 2- TensorboardLogger: to log metrics to local file specified by log_dir in .json file. 
- 3- WandbLogger: Not implemented yet. + 3- WandbLogger: Not implemented yet. """ + @abstractmethod def __init__(self): """ @@ -49,7 +50,7 @@ def __init__(self): @abstractmethod def __call__(self): """ - Call method with metrics as parameters for logging, according to chosen method. + Call method with metrics as parameters for logging, according to chosen method. """ raise NotImplementedError @@ -81,16 +82,17 @@ def __init__(self, type: str, log_dir: str): else: self.log_dir = None - def __call__(self, - nn_trainer, - iterator: DataLearningIterator, - tensorboard_tag: Optional[str] = None, - tensorboard_index: Optional[int] = None, - ) -> dict: + def __call__( + self, + nn_trainer, + iterator: DataLearningIterator, + tensorboard_tag: Optional[str] = None, + tensorboard_index: Optional[int] = None, + ) -> dict: """ override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. for 'valid' logging type, log metrics of validation process to log_dir/valid_log. - for 'valid' type, 'call' function saves best score on validation data, and the model parameters corresponding to the best score. + for 'valid' type, 'call' function saves best score on validation data, and the model parameters corresponding to the best score. Args: nn_trainer: 'NNTrainer' object which contains 'self' as variable. @@ -102,73 +104,82 @@ def __call__(self, a report dict containing calculated metrics, spent time value, and other metrics according to 'type'. """ - if self.type == 'train': - nn_trainer._send_event(event_name='before_log') + if self.type == "train": + nn_trainer._send_event(event_name="before_log") if nn_trainer.log_on_k_batches == 0: report = { - 'time_spent': - str( + "time_spent": str( datetime.timedelta( - seconds=round(time.time() - nn_trainer.start_time + - 0.5))) + seconds=round(time.time() - nn_trainer.start_time + 0.5) + ) + ) } else: data = islice( - iterator.gen_batches(nn_trainer.batch_size, - data_type='train', - shuffle=True), - nn_trainer.log_on_k_batches) - report = nn_trainer.test(data, - nn_trainer.train_metrics, - start_time=nn_trainer.start_time) - - report.update({ - 'epochs_done': nn_trainer.epoch, - 'batches_seen': nn_trainer.train_batches_seen, - 'train_examples_seen': nn_trainer.examples - }) + iterator.gen_batches( + nn_trainer.batch_size, data_type="train", shuffle=True + ), + nn_trainer.log_on_k_batches, + ) + report = nn_trainer.test( + data, nn_trainer.train_metrics, start_time=nn_trainer.start_time + ) + + report.update( + { + "epochs_done": nn_trainer.epoch, + "batches_seen": nn_trainer.train_batches_seen, + "train_examples_seen": nn_trainer.examples, + } + ) metrics: List[Tuple[str, float]] = list( - report.get('metrics', {}).items()) + list( - nn_trainer.last_result.items()) + report.get("metrics", {}).items() + ) + list(nn_trainer.last_result.items()) report.update(nn_trainer.last_result) if nn_trainer.losses: - report['loss'] = sum(nn_trainer.losses) / len( - nn_trainer.losses) + report["loss"] = sum(nn_trainer.losses) / len(nn_trainer.losses) nn_trainer.losses.clear() - metrics.append(('loss', report['loss'])) + metrics.append(("loss", report["loss"])) - if metrics and self.log_dir is not None: # nn_trainer.tensorboard_idx is not None + if ( + metrics and self.log_dir is not None + ): # nn_trainer.tensorboard_idx is not None log.info(f"logging Training metrics to {self.log_dir}") summary = nn_trainer._tf.Summary() for name, score in metrics: - summary.value.add(tag=f'{tensorboard_tag}/{name}', - simple_value=score) + 
summary.value.add( + tag=f"{tensorboard_tag}/{name}", simple_value=score + ) self.tb_writer.add_summary(summary, tensorboard_index) self.tb_writer.flush() - nn_trainer._send_event(event_name='after_train_log', data=report) + nn_trainer._send_event(event_name="after_train_log", data=report) else: - nn_trainer._send_event(event_name='before_validation') - report = nn_trainer.test(iterator.gen_batches( - nn_trainer.batch_size, data_type='valid', shuffle=False), - start_time=nn_trainer.start_time) + nn_trainer._send_event(event_name="before_validation") + report = nn_trainer.test( + iterator.gen_batches( + nn_trainer.batch_size, data_type="valid", shuffle=False + ), + start_time=nn_trainer.start_time, + ) - report['epochs_done'] = nn_trainer.epoch - report['batches_seen'] = nn_trainer.train_batches_seen - report['train_examples_seen'] = nn_trainer.examples + report["epochs_done"] = nn_trainer.epoch + report["batches_seen"] = nn_trainer.train_batches_seen + report["train_examples_seen"] = nn_trainer.examples - metrics = list(report['metrics'].items()) + metrics = list(report["metrics"].items()) # nn_trainer.tensorboard_idx is not None: if tensorboard_tag is not None and self.log_dir is not None: log.info(f"logging Validation metrics to {self.log_dir}") summary = nn_trainer._tf.Summary() for name, score in metrics: - summary.value.add(tag=f'{tensorboard_tag}/{name}', - simple_value=score) + summary.value.add( + tag=f"{tensorboard_tag}/{name}", simple_value=score + ) if tensorboard_index is None: tensorboard_index = nn_trainer.train_batches_seen self.tb_writer.add_summary(summary, tensorboard_index) @@ -187,28 +198,30 @@ def __call__(self, # Run the validation model-saving logic if nn_trainer._is_initial_validation(): - log.info('Initial best {} of {}'.format(m_name, score)) + log.info("Initial best {} of {}".format(m_name, score)) nn_trainer.score_best = score - elif nn_trainer._is_first_validation( - ) and nn_trainer.score_best is None: - log.info('First best {} of {}'.format(m_name, score)) + elif nn_trainer._is_first_validation() and nn_trainer.score_best is None: + log.info("First best {} of {}".format(m_name, score)) nn_trainer.score_best = score - log.info('Saving model') + log.info("Saving model") nn_trainer.save() elif nn_trainer.improved(score, nn_trainer.score_best): - log.info('Improved best {} of {}'.format(m_name, score)) + log.info("Improved best {} of {}".format(m_name, score)) nn_trainer.score_best = score - log.info('Saving model') + log.info("Saving model") nn_trainer.save() else: - log.info('Did not improve on the {} of {}'.format( - m_name, nn_trainer.score_best)) + log.info( + "Did not improve on the {} of {}".format( + m_name, nn_trainer.score_best + ) + ) - report['impatience'] = nn_trainer.patience + report["impatience"] = nn_trainer.patience if nn_trainer.validation_patience > 0: - report['patience_limit'] = nn_trainer.validation_patience + report["patience_limit"] = nn_trainer.validation_patience - nn_trainer._send_event(event_name='after_validation', data=report) + nn_trainer._send_event(event_name="after_validation", data=report) nn_trainer.validation_number += 1 return report @@ -239,7 +252,7 @@ def __call__(self, report: dict) -> None: report(dict): report to log to stdout. """ - if(self.log_true): + if self.log_true: report = {self.type: report} log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) @@ -257,18 +270,22 @@ class WandbLogger(TrainLogger): key (string, optional): authentication key. 
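
The TensorBoard side of these hunks relies on the TF1 summary API: one FileWriter per log directory and one scalar per metric. A minimal self-contained version, assuming TensorFlow 1.x is installed and using an invented path and step:

```python
import tensorflow as tf  # TF 1.x API (tf.summary.FileWriter / tf.Summary)

metrics = [("accuracy", 0.71), ("f1_macro", 0.68), ("loss", 0.55)]
writer = tf.summary.FileWriter("./logs/valid_log")  # path is illustrative

summary = tf.Summary()
for name, score in metrics:
    # one scalar per metric, grouped under the tag prefix used above
    summary.value.add(tag=f"every_n_epochs/{name}", simple_value=score)

writer.add_summary(summary, global_step=3)  # e.g. the epoch number as the step
writer.flush()
```
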
""" - # wandb_keys = ["project","group","job_type","name","config"] + def __init__(self, wandb_init: Optional[Dict] = None): - wandb.login(key=wandb_init.get("API_Key",None), relogin=True) - wandb.init(project=wandb_init.get("project",None), - group=wandb_init.get("group",None), - job_type=wandb_init.get("job_type",None), - config=wandb_init.get("config",None), - name=wandb_init.get("run_name",None) - ) + wandb.login(key=wandb_init.get("API_Key", None), relogin=True) + + wandb.init( + anonymous="allow", + project=wandb_init.get("project", None), + group=wandb_init.get("group", None), + job_type=wandb_init.get("job_type", None), + config=wandb_init.get("config", None), + name=wandb_init.get("run_name", None), + id = wandb_init.get("id",None) # to resume a run + ) def __call__(self, report: dict) -> None: - """" + """ " Logging report of the training process to wandb. Args: @@ -281,16 +298,22 @@ def __call__(self, report: dict) -> None: for i in report.keys(): if type(report[i]) == dict: for j in report[i].keys(): - wandb.log({j: report[i].keys()[j]}) + wandb.log({j: report[i].keys()[j]},commit = False) else: - if (i == 'time_spent'): - t = time.strptime(report[i], '%H:%M:%S') - y_seconds = int(datetime.timedelta( - hours=t.tm_hour, minutes=t.tm_min, seconds=t.tm_sec).total_seconds()) - wandb.log({i: y_seconds}) + if i == "time_spent": + t = time.strptime(report[i], "%H:%M:%S") + y_seconds = int( + datetime.timedelta( + hours=t.tm_hour, minutes=t.tm_min, seconds=t.tm_sec + ).total_seconds() + ) + wandb.log({i: y_seconds},commit = False) else: - wandb.log({i: report[i]}) + wandb.log({i: report[i]},commit = False) + wandb.log({},commit= True) # to log all previous logs in one step. + + def close(self): + wandb.finish() def print_info(self): raise NotImplementedError - diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index 6373bd4937..376982ab2c 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -268,6 +268,9 @@ def train(self, iterator: DataLearningIterator) -> None: if callable(getattr(self._chainer, 'train_on_batch', None)): try: self.train_on_batches(iterator) + # wandblogger will finish by itself, but finishing wandb manually here will be directly after training + if self.wandblogger_idx is not None: + self.wandblogger.close() except KeyboardInterrupt: log.info('Stopped training') else: From 968e00dbffdfef475fcda2ba2c385d59ad449e6f Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Wed, 22 Dec 2021 14:17:21 +0300 Subject: [PATCH 08/18] Testing --- deeppavlov/configs/classifiers/sentiment_twitter.json | 2 +- requirements.txt | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 5796c5908d..ca917ddf9e 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -3,7 +3,7 @@ "class_name": "basic_classification_reader", "x": "Twit", "y": "Class", - "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data/modified_data" + "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", diff --git a/requirements.txt b/requirements.txt index 2ed76c4537..b6fa7bdf47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,6 @@ click==7.1.2 uvicorn==0.11.7 sacremoses==0.0.35 uvloop==0.14.0 -wandb==0.12.7 \ No newline at end of file +wandb==0.12.7 
+pybind11==2.2 +fasttext From 2ce99d86e190cebefab430b6599151978922afb7 Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Fri, 24 Dec 2021 19:05:12 +0300 Subject: [PATCH 09/18] Testing --- .../classifiers/sentiment_twitter.json | 16 ++++++---- deeppavlov/core/common/logging_class.py | 29 +++++++++++-------- deeppavlov/core/trainers/nn_trainer.py | 2 +- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index ca917ddf9e..cc26129937 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -3,7 +3,7 @@ "class_name": "basic_classification_reader", "x": "Twit", "y": "Class", - "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data" + "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data/modified_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", @@ -115,6 +115,7 @@ ], "validation_patience": 5, "val_every_n_epochs": 1, + "val_every_n_batchs":3, "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": [ @@ -135,17 +136,20 @@ { "name": "WandbLogger", "API_Key":"be5cac1976dae2abd87fd045a7a101248c0a0253", - "project": "DeepTest", - "group": "Group_Name", - "job_type":"Name job Type", - "run_name":"Run name", - "config": { + "init":{ + "project": "Deep_Test", + "group": "Group_Name", + "job_type":"Name job Type", + "name":"Run name", + "config": { "description": "add any hyperprameter you want to monitor, architecture discription,..", "learning_rate": 0.02, "architecture": "CNN", "dataset": "sentiment_twitter_data" } + } + } ] }, diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index 0611dfd03c..cfe07eada1 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -271,18 +271,23 @@ class WandbLogger(TrainLogger): """ - def __init__(self, wandb_init: Optional[Dict] = None): - wandb.login(key=wandb_init.get("API_Key", None), relogin=True) - - wandb.init( - anonymous="allow", - project=wandb_init.get("project", None), - group=wandb_init.get("group", None), - job_type=wandb_init.get("job_type", None), - config=wandb_init.get("config", None), - name=wandb_init.get("run_name", None), - id = wandb_init.get("id",None) # to resume a run - ) + # def __init__(self, wandb_init: Optional[Dict] = None): + def __init__(self, API_Key = None, **kwargs): + print(kwargs) + # wandb.login(key=wandb_init.get("API_Key", None), relogin=True) + # wandb.login(key=kwargs.get("API_Key", None), relogin=True) + wandb.login(key=API_Key, relogin=True) + + # wandb.init( + # anonymous="allow", + # project=wandb_init.get("project", None), + # group=wandb_init.get("group", None), + # job_type=wandb_init.get("job_type", None), + # config=wandb_init.get("config", None), + # name=wandb_init.get("run_name", None), + # id = wandb_init.get("id",None) # to resume a run + # ) + wandb.init(**kwargs["init"]) def __call__(self, report: dict) -> None: """ " diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index 376982ab2c..44d923771d 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -163,7 +163,7 @@ def _improved(op): self.tensorboardlogger_valid = TensorboardLogger('valid') if self.wandblogger_idx is not None: - self.wandblogger = WandbLogger(self.logger[self.wandblogger_idx]) + self.wandblogger = 
WandbLogger(**self.logger[self.wandblogger_idx]) self.std_logger_train = StdLogger( 'train', self.stdlogger_idx is not None) From fb5f950faa9aadfce1fb5db83388977e56ebedd5 Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Wed, 5 Jan 2022 13:00:11 +0300 Subject: [PATCH 10/18] Update logging_class --- .../classifiers/sentiment_twitter.json | 12 +- deeppavlov/core/common/logging_class.py | 292 ++++++++++-------- deeppavlov/core/trainers/fit_trainer.py | 48 +-- deeppavlov/core/trainers/nn_trainer.py | 183 +++++++++-- 4 files changed, 349 insertions(+), 186 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index cc26129937..5c7156abc2 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -63,7 +63,7 @@ "main": true, "class_name": "keras_classification_model", "save_path": "{MODEL_PATH}/new_model", - "load_path": "{MODEL_PATH}/model", + "embedding_size": "#my_embedder.dim", "n_classes": "#classes_vocab.len", "kernel_sizes_cnn": [ @@ -100,7 +100,7 @@ ] }, "train": { - "epochs": 30, + "epochs": 20, "batch_size": 64, "metrics": [ "accuracy", @@ -113,10 +113,10 @@ ] } ], - "validation_patience": 5, + "validation_patience": 20, "val_every_n_epochs": 1, - "val_every_n_batchs":3, "log_every_n_epochs": 1, + "log_on_k_batches":2, "show_examples": false, "evaluation_targets": [ "train", @@ -137,8 +137,8 @@ "name": "WandbLogger", "API_Key":"be5cac1976dae2abd87fd045a7a101248c0a0253", "init":{ - "project": "Deep_Test", - "group": "Group_Name", + "project": "Deep_Test_final", + "group": "Group_Name11", "job_type":"Name job Type", "name":"Run name", "config": { diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index cfe07eada1..832fe92462 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -13,6 +13,7 @@ # limitations under the License. import json +import logging import time import datetime from itertools import islice @@ -46,71 +47,17 @@ def __init__(self): """ raise NotImplementedError - @abstractmethod - def __call__(self): - """ - Call method with metrics as parameters for logging, according to chosen method. - - """ - raise NotImplementedError - - @abstractmethod - def print_info(self): - """ - Print inforamtion about logging method, like the logging directory... - - """ - raise NotImplementedError - - -class TensorboardLogger(TrainLogger): - """ - TensorboardLogger class for logging metrics during training process into a local folder, later using TensorBoard tool for visualizations the logged data. - - Args: - type (str): 'train' for logging metrics of training process or 'valid' for validation process. - log_dir (str): path to local folder to log data into. - - """ - - def __init__(self, type: str, log_dir: str): - self.type = type - if log_dir is not None: - self.tb_writer = tf.summary.FileWriter(log_dir) - self.log_dir = log_dir - else: - self.log_dir = None - - def __call__( - self, - nn_trainer, - iterator: DataLearningIterator, - tensorboard_tag: Optional[str] = None, - tensorboard_index: Optional[int] = None, - ) -> dict: - """ - override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. - for 'valid' logging type, log metrics of validation process to log_dir/valid_log. - for 'valid' type, 'call' function saves best score on validation data, and the model parameters corresponding to the best score. 
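
From this commit on, the report that every backend receives is assembled in one place. Stripped of the trainer plumbing, the train-side report looks roughly like this; the numbers and the SimpleNamespace stand-in are invented for the example, while the field names follow the patch.

```python
import datetime
import time
from types import SimpleNamespace

# fake trainer state; in DeepPavlov these fields live on the NNTrainer instance
trainer = SimpleNamespace(
    start_time=time.time() - 97.0,
    epoch=3,
    train_batches_seen=180,
    examples=11520,
    last_result={},
    losses=[0.61, 0.58, 0.52],
)

report = {
    "time_spent": str(datetime.timedelta(seconds=round(time.time() - trainer.start_time + 0.5))),
    "metrics": {"accuracy": 0.71, "f1_macro": 0.68},  # would come from trainer.test(...)
    "epochs_done": trainer.epoch,
    "batches_seen": trainer.train_batches_seen,
    "train_examples_seen": trainer.examples,
}
report.update(trainer.last_result)
if trainer.losses:  # average and reset the accumulated batch losses
    report["loss"] = sum(trainer.losses) / len(trainer.losses)
    trainer.losses.clear()

print(report)
```
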
- - Args: - nn_trainer: 'NNTrainer' object which contains 'self' as variable. - iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation - tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' - tensorboard_index: one of two options: 'train_batches_seen', 'epoch' corresponding to 'tensorboard_tag' types respectively. - - Returns: - a report dict containing calculated metrics, spent time value, and other metrics according to 'type'. - - """ - if self.type == "train": - nn_trainer._send_event(event_name="before_log") + def get_report(self, + nn_trainer, + iterator: DataLearningIterator, type: str = None): + if type == "train": if nn_trainer.log_on_k_batches == 0: report = { "time_spent": str( datetime.timedelta( - seconds=round(time.time() - nn_trainer.start_time + 0.5) + seconds=round( + time.time() - nn_trainer.start_time + 0.5) ) ) } @@ -139,26 +86,13 @@ def __call__( report.update(nn_trainer.last_result) if nn_trainer.losses: - report["loss"] = sum(nn_trainer.losses) / len(nn_trainer.losses) + report["loss"] = sum(nn_trainer.losses) / \ + len(nn_trainer.losses) nn_trainer.losses.clear() metrics.append(("loss", report["loss"])) - if ( - metrics and self.log_dir is not None - ): # nn_trainer.tensorboard_idx is not None - log.info(f"logging Training metrics to {self.log_dir}") - summary = nn_trainer._tf.Summary() - - for name, score in metrics: - summary.value.add( - tag=f"{tensorboard_tag}/{name}", simple_value=score - ) - self.tb_writer.add_summary(summary, tensorboard_index) - self.tb_writer.flush() - - nn_trainer._send_event(event_name="after_train_log", data=report) else: - nn_trainer._send_event(event_name="before_validation") + # nn_trainer._send_event(event_name="before_validation") report = nn_trainer.test( iterator.gen_batches( nn_trainer.batch_size, data_type="valid", shuffle=False @@ -172,19 +106,6 @@ def __call__( metrics = list(report["metrics"].items()) - # nn_trainer.tensorboard_idx is not None: - if tensorboard_tag is not None and self.log_dir is not None: - log.info(f"logging Validation metrics to {self.log_dir}") - summary = nn_trainer._tf.Summary() - for name, score in metrics: - summary.value.add( - tag=f"{tensorboard_tag}/{name}", simple_value=score - ) - if tensorboard_index is None: - tensorboard_index = nn_trainer.train_batches_seen - self.tb_writer.add_summary(summary, tensorboard_index) - self.tb_writer.flush() - m_name, score = metrics[0] # Update the patience @@ -221,45 +142,146 @@ def __call__( if nn_trainer.validation_patience > 0: report["patience_limit"] = nn_trainer.validation_patience - nn_trainer._send_event(event_name="after_validation", data=report) + # nn_trainer._send_event(event_name="after_validation", data=report) + nn_trainer.validation_number += 1 + return metrics, report - return report - def print_info(self): + @abstractmethod + def __call__(self, + nn_trainer, + iterator: DataLearningIterator, + tensorboard_tag: Optional[str] = None, + tensorboard_index: Optional[int] = None, type: str = None): + """ + Call method with metrics as parameters for logging, according to chosen method. + + """ raise NotImplementedError + @abstractmethod + def print_info(self): + """ + Print inforamtion about logging method, like the logging directory... + + """ + raise NotImplementedError class StdLogger(TrainLogger): """ - StdLogger class for printing report about current training or validation process to stdout. 
+ StdLogger class for printing report about current training and validation processes to stdout. Args: type: 'train' for printing report of training process or 'valid' for validation process. log_true (boo): if True: print of the StdLogger is provided in .json file as logging method or not. default False. """ + def __init__(self, stdlogging :bool = True): + self.stdlogging = stdlogging + # self.type = type - def __init__(self, type: str, log_true: bool = False): - self.type = type - self.log_true = log_true + def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): + return super().get_report(nn_trainer, iterator, type=type) - def __call__(self, report: dict) -> None: + def __call__(self, nn_trainer, iterator: DataLearningIterator, type:str = None, report :Dict =None, metrics: Dict = None) -> None: """ Print report to stdout. Args: report(dict): report to log to stdout. + """ + if report is None: + print("Calling from StdLogger:::::::::::::::::::::::::::::::::::::::::::::::") + metrics, report = self.get_report(nn_trainer = nn_trainer, iterator = iterator, type = type) + if self.stdlogging: + log.info(json.dumps({type: report}, ensure_ascii=False, cls=NumpyArrayEncoder)) + return metrics, report + + def print_info(self): + raise NotImplementedError + + +class TensorboardLogger(TrainLogger): + """ + TensorboardLogger class for logging metrics during training process into a local folder, later using TensorBoard tool for visualizations the logged data. + + Args: + type (str): 'train' for logging metrics of training process or 'valid' for validation process. + log_dir (str): path to local folder to log data into. + + """ + + def __init__(self, log_dir: str = None): + self.train_log_dir = str(log_dir / 'train_log') + self.valid_log_dir = str(log_dir / 'valid_log') + self.tb_train_writer = tf.summary.FileWriter(self.train_log_dir) + self.tb_valid_writer = tf.summary.FileWriter(self.valid_log_dir) + + def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): + return super().get_report(nn_trainer, iterator, type=type) + + def __call__( + self, + nn_trainer, + iterator: DataLearningIterator, + type :str = None, + tensorboard_tag: Optional[str] = None, + tensorboard_index: Optional[int] = None, + report: Dict = None, + metrics :List = None, + ): + """ + override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. + for 'valid' logging type, log metrics of validation process to log_dir/valid_log. + for 'valid' type, 'call' function saves best score on validation data, and the model parameters corresponding to the best score. + + Args: + nn_trainer: 'NNTrainer' object which contains 'self' as variable. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' + tensorboard_index: one of two options: 'train_batches_seen', 'epoch' corresponding to 'tensorboard_tag' types respectively. + + Returns: + a report dict containing calculated metrics, spent time value, and other metrics according to 'type'. 
""" - if self.log_true: - report = {self.type: report} - log.info(json.dumps(report, ensure_ascii=False, cls=NumpyArrayEncoder)) + if report is None: + print("Calling from TensorboardLogger:::::::::::::::::::::::::::::::::::::::::::::::") + metrics, report = self.get_report(nn_trainer = nn_trainer, iterator = iterator, type = type) + + # logging to tensorboard: + if type == "train": + if metrics and self.train_log_dir is not None: # nn_trainer.tensorboard_idx is not None + # log.info(f"logging Training metrics to {self.train_log_dir}") + summary = nn_trainer._tf.Summary() + + for name, score in metrics: + summary.value.add( + tag=f"{tensorboard_tag}/{name}", simple_value=score + ) + # if tensorboard_index is None: + # tensorboard_index = nn_trainer.train_batches_seen + self.tb_train_writer.add_summary(summary, tensorboard_index) + self.tb_train_writer.flush() + else: + if tensorboard_tag is not None and self.valid_log_dir is not None: + summary = nn_trainer._tf.Summary() + for name, score in metrics: + summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + if tensorboard_index is None: + tensorboard_index = nn_trainer.train_batches_seen + self.tb_valid_writer.add_summary(summary, tensorboard_index) + self.tb_valid_writer.flush() + return metrics, report def print_info(self): raise NotImplementedError + + + class WandbLogger(TrainLogger): """ WandbLogger class for logging report about current training or validation process to WandB ("https://wandb.ai/site"). @@ -270,26 +292,26 @@ class WandbLogger(TrainLogger): key (string, optional): authentication key. """ + + @staticmethod + def login(API_Key: str = None): + return wandb.login(key=API_Key, relogin=True) - # def __init__(self, wandb_init: Optional[Dict] = None): - def __init__(self, API_Key = None, **kwargs): - print(kwargs) - # wandb.login(key=wandb_init.get("API_Key", None), relogin=True) - # wandb.login(key=kwargs.get("API_Key", None), relogin=True) - wandb.login(key=API_Key, relogin=True) - - # wandb.init( - # anonymous="allow", - # project=wandb_init.get("project", None), - # group=wandb_init.get("group", None), - # job_type=wandb_init.get("job_type", None), - # config=wandb_init.get("config", None), - # name=wandb_init.get("run_name", None), - # id = wandb_init.get("id",None) # to resume a run - # ) - wandb.init(**kwargs["init"]) - - def __call__(self, report: dict) -> None: + def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): + return super().get_report(nn_trainer, iterator, type=type) + + def __init__(self, log_on: str = None, commit_on_valid:bool = False, **kwargs): + self.log_on = log_on # "epochs","batches" + self.commit_on_valid = commit_on_valid + wandb.init(**kwargs) + + def __call__(self,nn_trainer, + iterator: DataLearningIterator, + type :str = None, + report: Dict = None, + metrics :List = None, + step: int = 0 + ): """ " Logging report of the training process to wandb. @@ -300,10 +322,18 @@ def __call__(self, report: dict) -> None: a report dict containing calculated metrics, spent time value, and other metrics according to 'type'. 
""" + if report is None: + print("Calling from WandbLogger:::::::::::::::::::::::::::::::::::::::::::::::") + metrics, report = self.get_report(nn_trainer = nn_trainer, iterator = iterator, type = type) + + logging_type = type +"/" for i in report.keys(): - if type(report[i]) == dict: - for j in report[i].keys(): - wandb.log({j: report[i].keys()[j]},commit = False) + if isinstance(report[i], dict): + # if type(report[i]) == dict: + for key,value in report[i].items(): + # for j in report[i].keys(): + wandb.log( + {logging_type+key: value}, commit=False, step=step) else: if i == "time_spent": t = time.strptime(report[i], "%H:%M:%S") @@ -312,12 +342,22 @@ def __call__(self, report: dict) -> None: hours=t.tm_hour, minutes=t.tm_min, seconds=t.tm_sec ).total_seconds() ) - wandb.log({i: y_seconds},commit = False) + wandb.log({logging_type+i+("(s)"): y_seconds}, + commit=False, step=step) else: - wandb.log({i: report[i]},commit = False) - wandb.log({},commit= True) # to log all previous logs in one step. + wandb.log( + {logging_type+i: report[i]}, commit=False, step=step) + + # if "val_every_n_epochs" is not None, we have to commit data on validation logging, otherwise on training. + if (self.commit_on_valid and logging_type == "valid/") or (not self.commit_on_valid and logging_type == "train/"): + # to log all previous logs in one step. + wandb.log({}, commit=True, step=step) + + return metrics, report - def close(self): + @staticmethod + def close(): + wandb.log({}, commit=True) wandb.finish() def print_info(self): diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index ee5a548408..0a6db91474 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -17,9 +17,7 @@ import time from itertools import islice from logging import getLogger -from pathlib import Path from typing import List, Tuple, Dict, Union, Optional, Iterable, Any, Collection -from collections import defaultdict from deeppavlov.core.commands.infer import build_model from deeppavlov.core.commands.utils import expand_path @@ -51,9 +49,20 @@ class FitTrainer: evaluation_targets: data types on which to evaluate trained pipeline (default is ``('valid', 'test')``) show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch in evaluation logs (default is ``False``) - tensorboard_log_dir: path to a directory where tensorboard logs can be stored, ignored if None + logger : list of dictionaries of possible loggers provided in config file. (default is ``None``) - logger : list of dictionary of possible loggers provided in config file + Possible loggers: + - TensorboardLogger: for running tesnorboard logs, keys: + "name": "TensorboardLogger", logging to tensorboard will be ignored if None + "log_dir":str or path to a directory where tensorboard logs can be stored, ignored if None + (default is ``None``) + - StdLogger: for logging report about current training and validation processes to stdout. Keys: + "name": "StdLogger". logging to stdout will be ignored if None. (default is ``None``) + - WandbLogger: logging report about current training and validation processes to WandB. with keys: + "name": "WandbLogger", logging to wandb will be ignored if None. + "API_Key": API of 40-chars from 'https://wandb.ai/home' personal account. + "init": dictionary of key:value for wandb.init configurations. 
see: 'https://docs.wandb.ai/ref/python/init' + (default is ``None``) max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative (default is ``-1``) **kwargs: additional parameters whose names will be logged but otherwise ignored @@ -63,7 +72,6 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, metrics: Iterable[Union[str, dict]] = ('accuracy',), evaluation_targets: Iterable[str] = ('valid', 'test'), show_examples: bool = False, - # tensorboard_log_dir: Optional[Union[str, Path]] = None, max_test_batches: int = -1, logger: Optional[List[Dict]] = None, **kwargs) -> None: @@ -81,19 +89,18 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, self.max_test_batches = None if max_test_batches < 0 else max_test_batches - # self.tensorboard_log_dir: Optional[Path] = tensorboard_log_dir self.logger: Optional[List[Dict]] = logger - def get_method_idx(logger, name): - try: - for i in range(len(logger)): - if logger[i]["name"] == name: - return i - except: - return None - self.tensorboard_idx = get_method_idx(self.logger, "TensorboardLogger") - self.stdlogger_idx = get_method_idx(self.logger, "StdLogger") - self.wandblogger_idx = get_method_idx(self.logger, "WandbLogger") + self.tensorboard_idx, self.stdlogger_idx, self.wandblogger_idx = None, None, None + for i in range(len(logger)): + if logger[i].get("name",None) == "TensorboardLogger" and self.logger[i].get("log_dir", None) is not None: + self.tensorboard_idx = i + # self.tensorboard_log_dir = logger[i].get("log_dir",None) + if logger[i].get("name",None) == "StdLogger": + self.stdlogger_idx = i + if logger[i].get("name",None) == "WandbLogger": + self.wandblogger_idx = i + if self.tensorboard_idx is not None: try: # noinspection PyPackageRequirements @@ -102,8 +109,8 @@ def get_method_idx(logger, name): except ImportError: log.warning('TensorFlow could not be imported, so tensorboard log directory' f'`{self.logger[self.tensorboard_idx]["log_dir"]}` will be ignored') - self.tensorboard_idx = None # check it - self.logger[self.tensorboard_idx]["log_dir"] = None + self.tensorboard_idx = None + # self.logger[self.tensorboard_idx]["log_dir"] = None else: self.logger[self.tensorboard_idx]["log_dir"] = expand_path( self.logger[self.tensorboard_idx]["log_dir"]) @@ -138,7 +145,8 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] # noinspection PyUnresolvedReferences result = component.partial_fit(*preprocessed) - if result is not None and self.logger[self.tensorboard_idx]["log_dir"] is not None: + #if result is not None and self.logger[self.tensorboard_idx]["log_dir"] is not None: + if result is not None and self.tensorboard_idx is not None: if writer is None: writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / f'partial_fit_{component_index}_log')) @@ -156,7 +164,7 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] result: Optional[Dict[str, Iterable[float]] ] = component.fit(*preprocessed) - if result is not None and self.logger[self.tensorboard_idx]["log_dir"] is not None: + if result is not None and self.tensorboard_idx is not None: writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / f'fit_log_{component_index}')) for name, scores in result.items(): diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index 44d923771d..add08b7060 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ 
b/deeppavlov/core/trainers/nn_trainer.py @@ -134,6 +134,7 @@ def _improved(op): ['maximize', 'minimize'])) self.validate_first = validate_first + self.validate_ = StdLogger(self.stdlogger_idx is not None) self.validation_number = 0 if validate_first else 1 self.validation_patience = validation_patience self.val_every_n_epochs = val_every_n_epochs @@ -153,22 +154,55 @@ def _improved(op): self.losses = [] self.start_time: Optional[float] = None + # if self.tensorboard_idx is not None: + # self.tensorboardlogger_train = TensorboardLogger(type = 'train', log_dir = str( + # self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) + # self.tensorboardlogger_valid = TensorboardLogger(type = 'valid', log_dir = str( + # self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) + # else: + # self.tensorboardlogger_train = TensorboardLogger(type = 'train') + # self.tensorboardlogger_valid = TensorboardLogger(type ='valid') + + # if self.wandblogger_idx is not None: + # self.wandblogger = WandbLogger(train_epochs_freq=self.log_every_n_epochs, + # val_epochs_freq=self.val_every_n_epochs, **self.logger[self.wandblogger_idx]) + + # self.std_logger_train = StdLogger( + # 'train', self.stdlogger_idx is not None) + # self.std_logger_valid = StdLogger( + # 'valid', self.stdlogger_idx is not None) + if self.stdlogger_idx is not None: + self.std_logger = StdLogger(stdlogging=True) + if self.tensorboard_idx is not None: - self.tensorboardlogger_train = TensorboardLogger('train', str( - self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) - self.tensorboardlogger_valid = TensorboardLogger('valid', str( - self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) - else: - self.tensorboardlogger_train = TensorboardLogger('train') - self.tensorboardlogger_valid = TensorboardLogger('valid') + self.tensorboardlogger = TensorboardLogger( + log_dir=self.logger[self.tensorboard_idx]["log_dir"]) if self.wandblogger_idx is not None: - self.wandblogger = WandbLogger(**self.logger[self.wandblogger_idx]) - - self.std_logger_train = StdLogger( - 'train', self.stdlogger_idx is not None) - self.std_logger_valid = StdLogger( - 'valid', self.stdlogger_idx is not None) + try: + wandb_login = WandbLogger.login( + self.logger[self.wandblogger_idx].get("API_Key", None)) + except: + print("error in api key") + if wandb_login: + # log to wandb if logging on epoches provided: + if self.log_every_n_epochs > 0 or self.val_every_n_epochs > 0: + self.wandb_logger = WandbLogger( + log_on="epochs", commit_on_valid = self.val_every_n_epochs > 0,**self.logger[self.wandblogger_idx].get("init", None)) + WandbLogger.log_on_train = True + elif self.log_every_n_batches > 0 or self.val_every_n_batches > 0: + self.wandb_logger = WandbLogger( + log_on="batches", commit_on_valid = self.val_every_n_batches > 0,**self.logger[self.wandblogger_idx].get("init", None)) + else: + log.info("Check API key for WandB, data will not logged in...") + # if self.tensorboard_idx is not None: + # self.tensorboardlogger_train = TensorboardLogger(type = 'train', log_dir = str( + # self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) + # self.tensorboardlogger_valid = TensorboardLogger(type = 'valid', log_dir = str( + # self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) + # else: + # self.tensorboardlogger_train = TensorboardLogger(type = 'train') + # self.tensorboardlogger_valid = TensorboardLogger(type ='valid') def save(self) -> None: if self._loaded: @@ -197,8 +231,9 @@ def train_on_batches(self, iterator: DataLearningIterator) -> 
None: """Train pipeline on batches using provided data iterator and initialization parameters""" self.start_time = time.time() if self.validate_first: - report_stdlogger = self.tensorboardlogger_valid(self, iterator) - self.std_logger_valid(report_stdlogger) + # report_stdlogger = self.tensorboardlogger_valid(self, iterator) + # self.std_logger_valid(report_stdlogger) + self.validate_(self, iterator, "valid") while True: impatient = False @@ -216,16 +251,53 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: self.examples += len(x) if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0: - report_stdlogger = self.tensorboardlogger_train( - self, iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) - self.std_logger_train(report_stdlogger) - if self.wandblogger_idx is not None: - self.wandblogger(report_stdlogger) + # report_stdlogger = self.tensorboardlogger_train( + # self, iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) + # self.std_logger_train(report_stdlogger) + + # test wandb for epochs first + # if self.wandblogger_idx is not None: + # self.wandblogger(report_stdlogger) + self._send_event(event_name="before_log") + metrics, report = None, None + if self.stdlogger_idx is not None: + metrics, report = self.std_logger( + self, iterator, "train", report, metrics) + + if self.tensorboard_idx is not None: + metrics, report = self.tensorboardlogger( + self, iterator, "train", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report, metrics=metrics) + + if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": + metrics, report = self.wandb_logger( + self, iterator=iterator, type="train", report=report, metrics=metrics, step = self.train_batches_seen // self.log_every_n_batches) + + self._send_event(event_name='after_train_log', data=report) if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0: - report_stdlogger = self.tensorboardlogger_valid( - self, iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) - self.std_logger_valid(report_stdlogger) + # report_stdlogger = self.tensorboardlogger_valid( + # self, iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) + # self.std_logger_valid(report_stdlogger) + self._send_event(event_name='before_validation') + report, metrics = None, None + if self.stdlogger_idx is not None: + metrics, report = self.std_logger( + self, iterator, "valid", report, metrics) + + if self.tensorboard_idx is not None: + metrics, report = self.tensorboardlogger( + self, iterator, "valid", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report, metrics=metrics) + + if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": + metrics, report = self.wandb_logger( + self, iterator, "valid", report=report, metrics=metrics, step = self.train_batches_seen // self.val_every_n_batches) + + self._send_event( + event_name='after_validation', data=report) + # test wandb for epochs first + # if self.wandblogger_idx is not None: + # self.wandblogger(report_stdlogger, "valid") + self._send_event(event_name='after_batch') if 0 < self.max_batches <= self.train_batches_seen: @@ -241,18 +313,59 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: break self.epoch += 1 - if self.log_every_n_epochs > 0 and self.epoch % 
self.log_every_n_epochs == 0: - report_stdlogger = self.tensorboardlogger_train( - self, iterator, 'every_n_epochs', self.epoch) - self.std_logger_train(report_stdlogger) - if self.wandblogger_idx is not None: - self.wandblogger(report_stdlogger) + self._send_event(event_name="before_log") + + report, metrics = None, None + if self.stdlogger_idx is not None: + metrics, report = self.std_logger( + self, iterator = iterator, type = "train", report = report, metrics = metrics) + # print("Report:", report) + # print("Metrics:", metrics) + if self.tensorboard_idx is not None: + metrics, report = self.tensorboardlogger( + self, + iterator = iterator, + type = "train", + tensorboard_tag='every_n_epochs', + tensorboard_index=self.epoch, + report=report, + metrics=metrics + ) + + if self.wandblogger_idx is not None and self.wandb_logger.log_on == "epochs": + metrics, report = self.wandb_logger( + self, iterator, "train", report=report, metrics=metrics, step = self.epoch // self.log_every_n_epochs) + + self._send_event(event_name='after_train_log', data=report) if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0: - report_stdlogger = self.tensorboardlogger_valid( - self, iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) - self.std_logger_valid(report_stdlogger) + # report_stdlogger = self.tensorboardlogger_valid( + # self, iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) + # self.std_logger_valid(report_stdlogger) + self._send_event(event_name='before_validation') + + report, metrics = None, None + if self.stdlogger_idx is not None: + metrics, report = self.std_logger( + self, iterator, "valid", report, metrics) + + if self.tensorboard_idx is not None: + metrics, report = self.tensorboardlogger( + self, iterator = iterator, type = "valid", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report, metrics=metrics) + + if self.wandblogger_idx is not None and self.wandb_logger.log_on == "epochs": + metrics, report = self.wandb_logger( + self, iterator, "valid", report=report, metrics=metrics, step = self.epoch // self.val_every_n_epochs) + + self._send_event(event_name='after_validation', data=report) + + # if self.wandblogger_idx is not None: + # self.wandblogger(report_stdlogger, "valid") + # if self.wandblogger_idx is not None: + # self.wandb_val_n_epochs( + # report_stdlogger, self.epoch // self.val_every_n_epochs) + self._send_event(event_name='after_epoch') if 0 < self.max_epochs <= self.epoch: @@ -269,8 +382,10 @@ def train(self, iterator: DataLearningIterator) -> None: try: self.train_on_batches(iterator) # wandblogger will finish by itself, but finishing wandb manually here will be directly after training - if self.wandblogger_idx is not None: - self.wandblogger.close() + # if self.wandblogger_idx is not None: + # self.wandblogger.close() + if self.wandblogger_idx is not None: + WandbLogger.close() except KeyboardInterrupt: log.info('Stopped training') else: From 6998d063f2dd29a2ee63609540e23e7461cadf70 Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Thu, 6 Jan 2022 03:27:24 +0300 Subject: [PATCH 11/18] fix WandB --- .../classifiers/sentiment_twitter.json | 11 +- deeppavlov/core/common/logging_class.py | 107 +++++++++------- deeppavlov/core/trainers/fit_trainer.py | 33 ++--- deeppavlov/core/trainers/nn_trainer.py | 115 ++++-------------- 4 files changed, 107 insertions(+), 159 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json 
b/deeppavlov/configs/classifiers/sentiment_twitter.json index 5c7156abc2..3992f287c7 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -100,7 +100,7 @@ ] }, "train": { - "epochs": 20, + "epochs": 10, "batch_size": 64, "metrics": [ "accuracy", @@ -113,9 +113,9 @@ ] } ], - "validation_patience": 20, + "validation_patience": 5, "val_every_n_epochs": 1, - "log_every_n_epochs": 1, + "log_every_n_epochs": 3, "log_on_k_batches":2, "show_examples": false, "evaluation_targets": [ @@ -124,7 +124,6 @@ "test" ], "class_name": "nn_trainer", - "tensorboard_log_dir": "{MODELS_PATH}/sentiment_twitter/logs", "logger": [ { "name": "TensorboardLogger", @@ -146,10 +145,8 @@ "learning_rate": 0.02, "architecture": "CNN", "dataset": "sentiment_twitter_data" + } } - - } - } ] }, diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index 832fe92462..e67af22b2e 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -15,6 +15,7 @@ import json import logging import time +from pathlib import Path import datetime from itertools import islice from abc import ABC, abstractmethod @@ -23,6 +24,7 @@ import tensorflow as tf import wandb +from wandb.errors import Error, UsageError, CommError from deeppavlov.core.trainers.utils import NumpyArrayEncoder from deeppavlov.core.data.data_learning_iterator import DataLearningIterator @@ -47,10 +49,11 @@ def __init__(self): """ raise NotImplementedError + @abstractmethod def get_report(self, - nn_trainer, - iterator: DataLearningIterator, type: str = None): + nn_trainer, + iterator: DataLearningIterator, type: str = None): if type == "train": if nn_trainer.log_on_k_batches == 0: report = { @@ -147,13 +150,13 @@ def get_report(self, nn_trainer.validation_number += 1 return metrics, report - @abstractmethod def __call__(self, nn_trainer, iterator: DataLearningIterator, + type: str = None, tensorboard_tag: Optional[str] = None, - tensorboard_index: Optional[int] = None, type: str = None): + tensorboard_index: Optional[int] = None): """ Call method with metrics as parameters for logging, according to chosen method. @@ -168,6 +171,7 @@ def print_info(self): """ raise NotImplementedError + class StdLogger(TrainLogger): """ StdLogger class for printing report about current training and validation processes to stdout. @@ -177,14 +181,14 @@ class StdLogger(TrainLogger): log_true (boo): if True: print of the StdLogger is provided in .json file as logging method or not. default False. """ - def __init__(self, stdlogging :bool = True): + + def __init__(self, stdlogging: bool = True): self.stdlogging = stdlogging - # self.type = type def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): return super().get_report(nn_trainer, iterator, type=type) - def __call__(self, nn_trainer, iterator: DataLearningIterator, type:str = None, report :Dict =None, metrics: Dict = None) -> None: + def __call__(self, nn_trainer, iterator: DataLearningIterator, type: str = None, report: Dict = None, metrics: Dict = None) -> None: """ Print report to stdout. 
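As an illustrative aside (not part of the patch itself): the stdlogging flag introduced above is what lets the same class serve two roles in nn_trainer.py. A minimal usage sketch, assuming an already constructed trainer and data iterator and the StdLogger interface as it stands at this point in the series (the call returns the computed metrics and report):

    # One logger that prints to stdout, and a "silent" one used only to run
    # validation and hand the resulting report back to the trainer.
    printing_logger = StdLogger(stdlogging=True)
    silent_validator = StdLogger(stdlogging=False)

    # Both compute the same validation report; only the first writes it to stdout.
    metrics, report = printing_logger(nn_trainer, iterator, type="valid")
    metrics, report = silent_validator(nn_trainer, iterator, type="valid")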
@@ -193,9 +197,11 @@ def __call__(self, nn_trainer, iterator: DataLearningIterator, type:str = None, """ if report is None: print("Calling from StdLogger:::::::::::::::::::::::::::::::::::::::::::::::") - metrics, report = self.get_report(nn_trainer = nn_trainer, iterator = iterator, type = type) + metrics, report = self.get_report( + nn_trainer=nn_trainer, iterator=iterator, type=type) if self.stdlogging: - log.info(json.dumps({type: report}, ensure_ascii=False, cls=NumpyArrayEncoder)) + log.info(json.dumps({type: report}, + ensure_ascii=False, cls=NumpyArrayEncoder)) return metrics, report def print_info(self): @@ -212,7 +218,7 @@ class TensorboardLogger(TrainLogger): """ - def __init__(self, log_dir: str = None): + def __init__(self, log_dir: Path = None): self.train_log_dir = str(log_dir / 'train_log') self.valid_log_dir = str(log_dir / 'valid_log') self.tb_train_writer = tf.summary.FileWriter(self.train_log_dir) @@ -220,16 +226,16 @@ def __init__(self, log_dir: str = None): def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): return super().get_report(nn_trainer, iterator, type=type) - + def __call__( self, nn_trainer, iterator: DataLearningIterator, - type :str = None, + type: str = None, tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None, report: Dict = None, - metrics :List = None, + metrics: List = None, ): """ override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. @@ -247,9 +253,11 @@ def __call__( """ if report is None: - print("Calling from TensorboardLogger:::::::::::::::::::::::::::::::::::::::::::::::") - metrics, report = self.get_report(nn_trainer = nn_trainer, iterator = iterator, type = type) - + print( + "Calling from TensorboardLogger:::::::::::::::::::::::::::::::::::::::::::::::") + metrics, report = self.get_report( + nn_trainer=nn_trainer, iterator=iterator, type=type) + # logging to tensorboard: if type == "train": if metrics and self.train_log_dir is not None: # nn_trainer.tensorboard_idx is not None @@ -268,7 +276,8 @@ def __call__( if tensorboard_tag is not None and self.valid_log_dir is not None: summary = nn_trainer._tf.Summary() for name, score in metrics: - summary.value.add(tag=f'{tensorboard_tag}/{name}', simple_value=score) + summary.value.add( + tag=f'{tensorboard_tag}/{name}', simple_value=score) if tensorboard_index is None: tensorboard_index = nn_trainer.train_batches_seen self.tb_valid_writer.add_summary(summary, tensorboard_index) @@ -279,9 +288,6 @@ def print_info(self): raise NotImplementedError - - - class WandbLogger(TrainLogger): """ WandbLogger class for logging report about current training or validation process to WandB ("https://wandb.ai/site"). @@ -292,26 +298,36 @@ class WandbLogger(TrainLogger): key (string, optional): authentication key. 
""" - + @staticmethod def login(API_Key: str = None): - return wandb.login(key=API_Key, relogin=True) + try: + wandb.login(key=API_Key, relogin=True) + wandb.init()# in case wandb.login() return True with not valid key, but this will throw an error when initializing + return True + except Exception as e: + log.warning(str(e)+", logging to WandB will be ignored") + return False def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): return super().get_report(nn_trainer, iterator, type=type) - def __init__(self, log_on: str = None, commit_on_valid:bool = False, **kwargs): - self.log_on = log_on # "epochs","batches" + def __init__(self, log_on: str = None, commit_on_valid: bool = False, **kwargs): + self.log_on = log_on # "epochs","batches" self.commit_on_valid = commit_on_valid - wandb.init(**kwargs) - - def __call__(self,nn_trainer, - iterator: DataLearningIterator, - type :str = None, - report: Dict = None, - metrics :List = None, - step: int = 0 - ): + print("HEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE") + try: + wandb.init(**kwargs, reinit=True) + except: + # set self.wandblogger_idx to None + print("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN") + + def __call__(self, nn_trainer, + iterator: DataLearningIterator, + type: str = None, + report: Dict = None, + metrics: List = None + ): """ " Logging report of the training process to wandb. @@ -323,17 +339,19 @@ def __call__(self,nn_trainer, """ if report is None: - print("Calling from WandbLogger:::::::::::::::::::::::::::::::::::::::::::::::") - metrics, report = self.get_report(nn_trainer = nn_trainer, iterator = iterator, type = type) + print( + "Calling from WandbLogger:::::::::::::::::::::::::::::::::::::::::::::::") + metrics, report = self.get_report( + nn_trainer=nn_trainer, iterator=iterator, type=type) - logging_type = type +"/" + logging_type = type + "/" for i in report.keys(): if isinstance(report[i], dict): - # if type(report[i]) == dict: - for key,value in report[i].items(): - # for j in report[i].keys(): + # if type(report[i]) == dict: + for key, value in report[i].items(): + # for j in report[i].keys(): wandb.log( - {logging_type+key: value}, commit=False, step=step) + {logging_type+key: value}, commit=False) else: if i == "time_spent": t = time.strptime(report[i], "%H:%M:%S") @@ -343,16 +361,15 @@ def __call__(self,nn_trainer, ).total_seconds() ) wandb.log({logging_type+i+("(s)"): y_seconds}, - commit=False, step=step) + commit=False) else: wandb.log( - {logging_type+i: report[i]}, commit=False, step=step) + {logging_type+i: report[i]}, commit=False) # if "val_every_n_epochs" is not None, we have to commit data on validation logging, otherwise on training. if (self.commit_on_valid and logging_type == "valid/") or (not self.commit_on_valid and logging_type == "train/"): - # to log all previous logs in one step. - wandb.log({}, commit=True, step=step) - + wandb.log({}, commit=True) + return metrics, report @staticmethod diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index 0a6db91474..cbd703a5fd 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -49,19 +49,19 @@ class FitTrainer: evaluation_targets: data types on which to evaluate trained pipeline (default is ``('valid', 'test')``) show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch in evaluation logs (default is ``False``) - logger : list of dictionaries of possible loggers provided in config file. 
+ logger : list of dictionaries describing the loggers provided in the DeepPavlov config file. (default is ``None``) Possible loggers: - - TensorboardLogger: for running tesnorboard logs, keys: + - TensorboardLogger: for logging to tensorboard. Keys: "name": "TensorboardLogger", logging to tensorboard will be ignored if None "log_dir":str or path to a directory where tensorboard logs can be stored, ignored if None (default is ``None``) - StdLogger: for logging report about current training and validation processes to stdout. Keys: "name": "StdLogger". logging to stdout will be ignored if None. (default is ``None``) - - WandbLogger: logging report about current training and validation processes to WandB. with keys: + - WandbLogger: logging report about current training and validation processes to WandB. Keys: "name": "WandbLogger", logging to wandb will be ignored if None. - "API_Key": API of 40-chars from 'https://wandb.ai/home' personal account. - "init": dictionary of key:value for wandb.init configurations. see: 'https://docs.wandb.ai/ref/python/init' + "API_Key": the 40-character API key from the personal account at 'https://wandb.ai/home'. + "init": dictionary of (key: value) arguments for wandb.init configuration. See: 'https://docs.wandb.ai/ref/python/init' (default is ``None``) max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative (default is ``-1``) @@ -92,15 +92,18 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, self.logger: Optional[List[Dict]] = logger self.tensorboard_idx, self.stdlogger_idx, self.wandblogger_idx = None, None, None - for i in range(len(logger)): - if logger[i].get("name",None) == "TensorboardLogger" and self.logger[i].get("log_dir", None) is not None: - self.tensorboard_idx = i - # self.tensorboard_log_dir = logger[i].get("log_dir",None) - if logger[i].get("name",None) == "StdLogger": - self.stdlogger_idx = i - if logger[i].get("name",None) == "WandbLogger": - self.wandblogger_idx = i - + if logger is not None: + try: + for i in range(len(logger)): + if logger[i].get("name", None) == "StdLogger": + self.stdlogger_idx = i + if logger[i].get("name", None) == "TensorboardLogger" and self.logger[i].get("log_dir", None) is not None: + self.tensorboard_idx = i + if logger[i].get("name", None) == "WandbLogger": + self.wandblogger_idx = i + except AttributeError: + log.warning( + "Check logger dictionary in configs, logging will be ignored") if self.tensorboard_idx is not None: try: # noinspection PyPackageRequirements @@ -110,7 +113,6 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, log.warning('TensorFlow could not be imported, so tensorboard log directory' f'`{self.logger[self.tensorboard_idx]["log_dir"]}` will be ignored') self.tensorboard_idx = None - # self.logger[self.tensorboard_idx]["log_dir"] = None else: self.logger[self.tensorboard_idx]["log_dir"] = expand_path( self.logger[self.tensorboard_idx]["log_dir"]) @@ -145,7 +147,6 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] # noinspection PyUnresolvedReferences result = component.partial_fit(*preprocessed) - #if result is not None and self.logger[self.tensorboard_idx]["log_dir"] is not None: if result is not None and self.tensorboard_idx is not None: if writer is None: writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / diff --git a/deeppavlov/core/trainers/nn_trainer.py index add08b7060..50523ae31f 100644 ---
a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -16,7 +16,6 @@ import time from logging import getLogger from typing import List, Dict, Union, Optional, Iterable -from collections import defaultdict from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.registry import register @@ -54,10 +53,8 @@ class NNTrainer(FitTrainer): evaluation_targets: data types on which to evaluate a trained pipeline (default is ``('valid', 'test')``) show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch in evaluation logs (default is ``False``) - tensorboard_log_dir: path to a directory where tensorboard logs can be stored, ignored if None - (default is ``None``) logger : list of dictionaries of possible loggers provided in config file, ignored if None - (default is ``None``), possible loggers: TensorboardLogger and StdLogger + (default is ``None``), possible loggers: TensorboardLogger, StdLogger and WandbLogger validate_first: flag used to calculate metrics on the ``'valid'`` data type before starting training (default is ``True``) validation_patience: how many times in a row the validation metric has to not improve for early stopping, @@ -99,9 +96,7 @@ def __init__(self, chainer_config: dict, *, metric_optimization: str = 'maximize', evaluation_targets: Iterable[str] = ('valid', 'test'), show_examples: bool = False, - # tensorboard_log_dir: Optional[Union[str, Path]] = None, logger: Optional[List[Dict]] = None, - max_test_batches: int = -1, validate_first: bool = True, validation_patience: int = 5, val_every_n_epochs: int = -1, val_every_n_batches: int = -1, @@ -154,23 +149,6 @@ def _improved(op): self.losses = [] self.start_time: Optional[float] = None - # if self.tensorboard_idx is not None: - # self.tensorboardlogger_train = TensorboardLogger(type = 'train', log_dir = str( - # self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) - # self.tensorboardlogger_valid = TensorboardLogger(type = 'valid', log_dir = str( - # self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) - # else: - # self.tensorboardlogger_train = TensorboardLogger(type = 'train') - # self.tensorboardlogger_valid = TensorboardLogger(type ='valid') - - # if self.wandblogger_idx is not None: - # self.wandblogger = WandbLogger(train_epochs_freq=self.log_every_n_epochs, - # val_epochs_freq=self.val_every_n_epochs, **self.logger[self.wandblogger_idx]) - - # self.std_logger_train = StdLogger( - # 'train', self.stdlogger_idx is not None) - # self.std_logger_valid = StdLogger( - # 'valid', self.stdlogger_idx is not None) if self.stdlogger_idx is not None: self.std_logger = StdLogger(stdlogging=True) @@ -179,30 +157,17 @@ def _improved(op): log_dir=self.logger[self.tensorboard_idx]["log_dir"]) if self.wandblogger_idx is not None: - try: - wandb_login = WandbLogger.login( - self.logger[self.wandblogger_idx].get("API_Key", None)) - except: - print("error in api key") - if wandb_login: - # log to wandb if logging on epoches provided: + if WandbLogger.login( + self.logger[self.wandblogger_idx].get("API_Key", None)): if self.log_every_n_epochs > 0 or self.val_every_n_epochs > 0: self.wandb_logger = WandbLogger( - log_on="epochs", commit_on_valid = self.val_every_n_epochs > 0,**self.logger[self.wandblogger_idx].get("init", None)) + log_on="epochs", commit_on_valid=self.val_every_n_epochs > 0, **self.logger[self.wandblogger_idx].get("init", None)) WandbLogger.log_on_train = True elif self.log_every_n_batches > 0 or 
self.val_every_n_batches > 0: self.wandb_logger = WandbLogger( - log_on="batches", commit_on_valid = self.val_every_n_batches > 0,**self.logger[self.wandblogger_idx].get("init", None)) + log_on="batches", commit_on_valid=self.val_every_n_batches > 0, **self.logger[self.wandblogger_idx].get("init", None)) else: - log.info("Check API key for WandB, data will not logged in...") - # if self.tensorboard_idx is not None: - # self.tensorboardlogger_train = TensorboardLogger(type = 'train', log_dir = str( - # self.logger[self.tensorboard_idx]["log_dir"] / 'train_log')) - # self.tensorboardlogger_valid = TensorboardLogger(type = 'valid', log_dir = str( - # self.logger[self.tensorboard_idx]["log_dir"] / 'valid_log')) - # else: - # self.tensorboardlogger_train = TensorboardLogger(type = 'train') - # self.tensorboardlogger_valid = TensorboardLogger(type ='valid') + self.wandblogger_idx = None def save(self) -> None: if self._loaded: @@ -231,9 +196,9 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: """Train pipeline on batches using provided data iterator and initialization parameters""" self.start_time = time.time() if self.validate_first: - # report_stdlogger = self.tensorboardlogger_valid(self, iterator) - # self.std_logger_valid(report_stdlogger) - self.validate_(self, iterator, "valid") + self._send_event(event_name="before_validation") + _, report = self.validate_(self, iterator, "valid") + self._send_event(event_name='after_validation', data=report) while True: impatient = False @@ -251,52 +216,40 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: self.examples += len(x) if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0: - # report_stdlogger = self.tensorboardlogger_train( - # self, iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) - # self.std_logger_train(report_stdlogger) - - # test wandb for epochs first - # if self.wandblogger_idx is not None: - # self.wandblogger(report_stdlogger) self._send_event(event_name="before_log") metrics, report = None, None if self.stdlogger_idx is not None: metrics, report = self.std_logger( - self, iterator, "train", report, metrics) + self, iterator, type="train", report=report, metrics=metrics) if self.tensorboard_idx is not None: metrics, report = self.tensorboardlogger( - self, iterator, "train", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report, metrics=metrics) + self, iterator, type="train", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report, metrics=metrics) if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": metrics, report = self.wandb_logger( - self, iterator=iterator, type="train", report=report, metrics=metrics, step = self.train_batches_seen // self.log_every_n_batches) + self, iterator=iterator, type="train", report=report, metrics=metrics) + # empty report if no logging method. 
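As an illustrative aside (not part of the patch itself), the calls above follow a report-threading pattern: the first active logger computes the report via get_report(), and each subsequent logger receives that report through its report argument so the pipeline is not evaluated again. A condensed sketch of the same idea, using a hypothetical active_loggers list and omitting the extra tensorboard_tag/tensorboard_index arguments passed to the TensorBoard logger:

    # Each configured logger reuses the report produced by the previous one;
    # get_report() only runs for the first logger, while report is still None.
    metrics, report = None, None
    for train_logger in active_loggers:  # e.g. the std, tensorboard and wandb loggers
        metrics, report = train_logger(self, iterator, type="train",
                                       report=report, metrics=metrics)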
self._send_event(event_name='after_train_log', data=report) if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0: - # report_stdlogger = self.tensorboardlogger_valid( - # self, iterator, tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen) - # self.std_logger_valid(report_stdlogger) self._send_event(event_name='before_validation') report, metrics = None, None if self.stdlogger_idx is not None: metrics, report = self.std_logger( - self, iterator, "valid", report, metrics) + self, iterator, type="valid", report=report, metrics=metrics) if self.tensorboard_idx is not None: metrics, report = self.tensorboardlogger( - self, iterator, "valid", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report, metrics=metrics) + self, iterator, type="valid", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report, metrics=metrics) if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": metrics, report = self.wandb_logger( - self, iterator, "valid", report=report, metrics=metrics, step = self.train_batches_seen // self.val_every_n_batches) + self, iterator, type="valid", report=report, metrics=metrics) self._send_event( event_name='after_validation', data=report) - # test wandb for epochs first - # if self.wandblogger_idx is not None: - # self.wandblogger(report_stdlogger, "valid") self._send_event(event_name='after_batch') @@ -319,53 +272,36 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: report, metrics = None, None if self.stdlogger_idx is not None: metrics, report = self.std_logger( - self, iterator = iterator, type = "train", report = report, metrics = metrics) - # print("Report:", report) - # print("Metrics:", metrics) + self, iterator=iterator, type="train", report=report, metrics=metrics) + if self.tensorboard_idx is not None: metrics, report = self.tensorboardlogger( - self, - iterator = iterator, - type = "train", - tensorboard_tag='every_n_epochs', - tensorboard_index=self.epoch, - report=report, - metrics=metrics - ) + self, iterator=iterator, type="train", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report, metrics=metrics) if self.wandblogger_idx is not None and self.wandb_logger.log_on == "epochs": metrics, report = self.wandb_logger( - self, iterator, "train", report=report, metrics=metrics, step = self.epoch // self.log_every_n_epochs) + self, iterator, type="train", report=report, metrics=metrics) self._send_event(event_name='after_train_log', data=report) if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0: - # report_stdlogger = self.tensorboardlogger_valid( - # self, iterator, tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch) - # self.std_logger_valid(report_stdlogger) self._send_event(event_name='before_validation') report, metrics = None, None if self.stdlogger_idx is not None: metrics, report = self.std_logger( - self, iterator, "valid", report, metrics) + self, iterator, type="valid", report=report, metrics=metrics) if self.tensorboard_idx is not None: metrics, report = self.tensorboardlogger( - self, iterator = iterator, type = "valid", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report, metrics=metrics) + self, iterator=iterator, type="valid", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report, metrics=metrics) if self.wandblogger_idx is not None and self.wandb_logger.log_on == 
"epochs": metrics, report = self.wandb_logger( - self, iterator, "valid", report=report, metrics=metrics, step = self.epoch // self.val_every_n_epochs) + self, iterator, type="valid", report=report, metrics=metrics) self._send_event(event_name='after_validation', data=report) - # if self.wandblogger_idx is not None: - # self.wandblogger(report_stdlogger, "valid") - # if self.wandblogger_idx is not None: - # self.wandb_val_n_epochs( - # report_stdlogger, self.epoch // self.val_every_n_epochs) - self._send_event(event_name='after_epoch') if 0 < self.max_epochs <= self.epoch: @@ -381,11 +317,8 @@ def train(self, iterator: DataLearningIterator) -> None: if callable(getattr(self._chainer, 'train_on_batch', None)): try: self.train_on_batches(iterator) - # wandblogger will finish by itself, but finishing wandb manually here will be directly after training - # if self.wandblogger_idx is not None: - # self.wandblogger.close() if self.wandblogger_idx is not None: - WandbLogger.close() + self.wandb_logger.close() except KeyboardInterrupt: log.info('Stopped training') else: From de8df4af3c607704c2889cf1dc4eeb49aa4fc4d9 Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Thu, 6 Jan 2022 15:33:26 +0300 Subject: [PATCH 12/18] Update WandB --- .../classifiers/sentiment_twitter.json | 12 +- deeppavlov/core/common/logging_class.py | 202 +++++++++--------- deeppavlov/core/trainers/fit_trainer.py | 3 +- deeppavlov/core/trainers/nn_trainer.py | 72 +++---- 4 files changed, 143 insertions(+), 146 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 3992f287c7..6d7d937a22 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -100,7 +100,7 @@ ] }, "train": { - "epochs": 10, + "epochs": 30, "batch_size": 64, "metrics": [ "accuracy", @@ -114,7 +114,7 @@ } ], "validation_patience": 5, - "val_every_n_epochs": 1, + "val_every_n_epochs": 4, "log_every_n_epochs": 3, "log_on_k_batches":2, "show_examples": false, @@ -125,6 +125,7 @@ ], "class_name": "nn_trainer", "logger": [ + "hello", { "name": "TensorboardLogger", "log_dir": "{MODELS_PATH}/sentiment_twitter/Tensorboard_logs" @@ -134,12 +135,13 @@ }, { "name": "WandbLogger", - "API_Key":"be5cac1976dae2abd87fd045a7a101248c0a0253", + "API_Key":"be5cac1976dae2abd87fd045a7a101248c0a0252", "init":{ - "project": "Deep_Test_final", - "group": "Group_Name11", + "project": "Deep_Test_third", + "group": "Group_Name", "job_type":"Name job Type", "name":"Run name", + "reinit": true, "config": { "description": "add any hyperprameter you want to monitor, architecture discription,..", "learning_rate": 0.02, diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index e67af22b2e..a3b45c9bd1 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -13,7 +13,6 @@ # limitations under the License. import json -import logging import time from pathlib import Path import datetime @@ -24,10 +23,10 @@ import tensorflow as tf import wandb -from wandb.errors import Error, UsageError, CommError from deeppavlov.core.trainers.utils import NumpyArrayEncoder from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.core.trainers.nn_trainer import NNTrainer log = getLogger(__name__) @@ -36,14 +35,14 @@ class TrainLogger(ABC): """An abstract class for logging metrics during training process. 
There are three types of logging: - 1- StdLogger: print metrics during training - 2- TensorboardLogger: to log metrics to local file specified by log_dir in .json file. - 3- WandbLogger: Not implemented yet. + - StdLogger: for logging report about current training and validation processes to stdout. + - TensorboardLogger: for logging to tensorboard. + - WandbLogger: for logging to WandB. """ @abstractmethod - def __init__(self): + def __init__() -> None: """ The constructor for TrainLogger class. @@ -52,8 +51,21 @@ def __init__(self): @abstractmethod def get_report(self, - nn_trainer, - iterator: DataLearningIterator, type: str = None): + nn_trainer: NNTrainer, + iterator: DataLearningIterator, type: str = None) -> dict: + """" + Get report about current process. + for 'valid' type, 'get_report' function also saves best score on validation data, and the model parameters corresponding to the best score. + + Args: + nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + type : if "train" returns report about training process, "valid" returns report about validation process. + + Returns: + dict contains data about current 'type' process. + + """ if type == "train": if nn_trainer.log_on_k_batches == 0: report = { @@ -94,8 +106,7 @@ def get_report(self, nn_trainer.losses.clear() metrics.append(("loss", report["loss"])) - else: - # nn_trainer._send_event(event_name="before_validation") + elif type == "valid": report = nn_trainer.test( iterator.gen_batches( nn_trainer.batch_size, data_type="valid", shuffle=False @@ -145,80 +156,63 @@ def get_report(self, if nn_trainer.validation_patience > 0: report["patience_limit"] = nn_trainer.validation_patience - # nn_trainer._send_event(event_name="after_validation", data=report) - nn_trainer.validation_number += 1 - return metrics, report - - @abstractmethod - def __call__(self, - nn_trainer, - iterator: DataLearningIterator, - type: str = None, - tensorboard_tag: Optional[str] = None, - tensorboard_index: Optional[int] = None): - """ - Call method with metrics as parameters for logging, according to chosen method. - - """ - raise NotImplementedError + return report @abstractmethod - def print_info(self): - """ - Print inforamtion about logging method, like the logging directory... - - """ + def __call__() -> None: raise NotImplementedError class StdLogger(TrainLogger): """ - StdLogger class for printing report about current training and validation processes to stdout. + StdLogger class for logging report about current training and validation processes to stdout. Args: - type: 'train' for printing report of training process or 'valid' for validation process. - log_true (boo): if True: print of the StdLogger is provided in .json file as logging method or not. default False. + stdlogging (bool): if True, log report to stdout. + the object of this class with stdlogging = False can be used for validation process. 
""" - def __init__(self, stdlogging: bool = True): + def __init__(self, stdlogging: bool = True) -> None: self.stdlogging = stdlogging - def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): - return super().get_report(nn_trainer, iterator, type=type) + def get_report(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None) -> dict: + return super().get_report(nn_trainer=nn_trainer, iterator=iterator, type=type) - def __call__(self, nn_trainer, iterator: DataLearningIterator, type: str = None, report: Dict = None, metrics: Dict = None) -> None: + def __call__(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None, report: Dict = None) -> dict: """ - Print report to stdout. + override call method, to log report to stdout. Args: - report(dict): report to log to stdout. + nn_trainer: NNTrainer object contains parameters required for preparing report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation. + type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. + report: dictionary contains current process information, if None, use 'get_report' method to get this report. + + Returns: + dict contains logged data to stdout. + """ if report is None: - print("Calling from StdLogger:::::::::::::::::::::::::::::::::::::::::::::::") - metrics, report = self.get_report( + report = self.get_report( nn_trainer=nn_trainer, iterator=iterator, type=type) if self.stdlogging: log.info(json.dumps({type: report}, ensure_ascii=False, cls=NumpyArrayEncoder)) - return metrics, report - - def print_info(self): - raise NotImplementedError + return report class TensorboardLogger(TrainLogger): """ - TensorboardLogger class for logging metrics during training process into a local folder, later using TensorBoard tool for visualizations the logged data. + TensorboardLogger class for logging to tesnorboard. Args: - type (str): 'train' for logging metrics of training process or 'valid' for validation process. - log_dir (str): path to local folder to log data into. + log_dir (Path): path to local folder to log data into. """ - def __init__(self, log_dir: Path = None): + def __init__(self, log_dir: Path = None) -> None: self.train_log_dir = str(log_dir / 'train_log') self.valid_log_dir = str(log_dir / 'valid_log') self.tb_train_writer = tf.summary.FileWriter(self.train_log_dir) @@ -227,52 +221,45 @@ def __init__(self, log_dir: Path = None): def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): return super().get_report(nn_trainer, iterator, type=type) - def __call__( - self, - nn_trainer, - iterator: DataLearningIterator, - type: str = None, - tensorboard_tag: Optional[str] = None, - tensorboard_index: Optional[int] = None, - report: Dict = None, - metrics: List = None, - ): + def __call__(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None, tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None, report: Dict = None) -> dict: """ override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. for 'valid' logging type, log metrics of validation process to log_dir/valid_log. - for 'valid' type, 'call' function saves best score on validation data, and the model parameters corresponding to the best score. Args: - nn_trainer: 'NNTrainer' object which contains 'self' as variable. 
+ nn_trainer: NNTrainer object contains parameters required for preparing the report. iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' tensorboard_index: one of two options: 'train_batches_seen', 'epoch' corresponding to 'tensorboard_tag' types respectively. + report: dictionary contains current process information, if None, use 'get_report' method to get this report. Returns: - a report dict containing calculated metrics, spent time value, and other metrics according to 'type'. + dict contains metrics logged to tesnorboard. """ if report is None: - print( - "Calling from TensorboardLogger:::::::::::::::::::::::::::::::::::::::::::::::") - metrics, report = self.get_report( + report = self.get_report( nn_trainer=nn_trainer, iterator=iterator, type=type) - # logging to tensorboard: if type == "train": - if metrics and self.train_log_dir is not None: # nn_trainer.tensorboard_idx is not None - # log.info(f"logging Training metrics to {self.train_log_dir}") + metrics: List[Tuple[str, float]] = list( + report.get("metrics", {}).items() + ) + list(nn_trainer.last_result.items()) + if report.get("loss", None) is not None: + metrics.append(("loss", report["loss"])) + + if metrics and self.train_log_dir is not None: summary = nn_trainer._tf.Summary() for name, score in metrics: summary.value.add( tag=f"{tensorboard_tag}/{name}", simple_value=score ) - # if tensorboard_index is None: - # tensorboard_index = nn_trainer.train_batches_seen self.tb_train_writer.add_summary(summary, tensorboard_index) self.tb_train_writer.flush() else: + metrics = list(report["metrics"].items()) if tensorboard_tag is not None and self.valid_log_dir is not None: summary = nn_trainer._tf.Summary() for name, score in metrics: @@ -282,74 +269,83 @@ def __call__( tensorboard_index = nn_trainer.train_batches_seen self.tb_valid_writer.add_summary(summary, tensorboard_index) self.tb_valid_writer.flush() - return metrics, report - - def print_info(self): - raise NotImplementedError + return report class WandbLogger(TrainLogger): """ - WandbLogger class for logging report about current training or validation process to WandB ("https://wandb.ai/site"). + WandbLogger class for logging report about current training and validation processes to WandB during training. ("https://wandb.ai/site"). WandB is a central dashboard to keep track of your hyperparameters, system metrics, and predictions so you can compare models live, and share your findings. + WandB doesn't support more than one run concurrently, so logging will be on "epochs" or "batches" + If val_every_n_epochs > 0 or log_every_n_epochs > 0 in config file, logging to wandb will be on epochs. + Otherwise if val_every_n_batches > 0 or log_every_n_batches > 0 in config file, logging to wandb will be on batches. + if none of them, logging to wandb will be ignored. Args: - key (string, optional): authentication key. + log_on (str): if "epochs": logging to wandb on epochs, if "batches: logging on batches. 
+ commit_on_valid (bool): if False, wandb.log only accumulates the passed metrics, and nothing is saved until wandb.log is called with commit=True. + Set to True when validation logging is enabled, so that training and validation reports are committed with the same step. + **kwargs: arguments for wandb initialization, more info: https://docs.wandb.ai/ref/python/init """ @staticmethod - def login(API_Key: str = None): + def login(API_Key: str = None, relogin: bool = True) -> bool: + """" + static method to log in to the wandb account; if login or init fails, logging to wandb will be ignored. + + Args: + API_Key (str): authentication key. + relogin (bool): if True, force relogin if already logged in. + + Returns: + True if the login and init processes succeed, otherwise False, in which case logging to wandb will be ignored. + + """ try: - wandb.login(key=API_Key, relogin=True) - wandb.init()# in case wandb.login() return True with not valid key, but this will throw an error when initializing + wandb.login(key=API_Key, relogin=relogin) + wandb.init() # in case wandb.login() returns True with not valid key, but this will throw an error when initializing return True except Exception as e: log.warning(str(e)+", logging to WandB will be ignored") return False - def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): - return super().get_report(nn_trainer, iterator, type=type) + def get_report(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None) -> dict: + return super().get_report(nn_trainer=nn_trainer, iterator=iterator, type=type) - def __init__(self, log_on: str = None, commit_on_valid: bool = False, **kwargs): + def __init__(self, log_on: str = "epochs", commit_on_valid: bool = False, **kwargs) -> None: self.log_on = log_on # "epochs","batches" self.commit_on_valid = commit_on_valid - print("HEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE") - try: - wandb.init(**kwargs, reinit=True) - except: - # set self.wandblogger_idx to None - print("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN") + wandb.init(**kwargs) - def __call__(self, nn_trainer, + def __call__(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None, - report: Dict = None, - metrics: List = None + report: Dict = None ): """ " Logging report of the training process to wandb. Args: - report (dict): report to log to WandB. + nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + report (dict): report for logging to WandB. If None, use 'get_report' method to get this report. + type (str) : process type, if "train" logs report about training process, else if "valid" logs report about validation process. Returns: - a report dict containing calculated metrics, spent time value, and other metrics according to 'type'. + dict contains logged data to WandB.
""" if report is None: - print( - "Calling from WandbLogger:::::::::::::::::::::::::::::::::::::::::::::::") - metrics, report = self.get_report( + report = self.get_report( nn_trainer=nn_trainer, iterator=iterator, type=type) logging_type = type + "/" for i in report.keys(): if isinstance(report[i], dict): - # if type(report[i]) == dict: for key, value in report[i].items(): - # for j in report[i].keys(): wandb.log( {logging_type+key: value}, commit=False) else: @@ -370,12 +366,10 @@ def __call__(self, nn_trainer, if (self.commit_on_valid and logging_type == "valid/") or (not self.commit_on_valid and logging_type == "train/"): wandb.log({}, commit=True) - return metrics, report + return report @staticmethod def close(): - wandb.log({}, commit=True) + """close function to commit the not commited logs and to mark a run as finished wiht wanb.finish method, and finishes uploading all data.""" + wandb.log({}, commit= True) wandb.finish() - - def print_info(self): - raise NotImplementedError diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index cbd703a5fd..c8b6d2af38 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -73,7 +73,7 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, evaluation_targets: Iterable[str] = ('valid', 'test'), show_examples: bool = False, max_test_batches: int = -1, - logger: Optional[List[Dict]] = None, + logger: Optional[List[dict]] = None, **kwargs) -> None: if kwargs: log.info( @@ -102,6 +102,7 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, if logger[i].get("name", None) == "WandbLogger": self.wandblogger_idx = i except AttributeError: + self.tensorboard_idx, self.stdlogger_idx, self.wandblogger_idx = None, None, None log.warning( "Check logger dictionary in configs, logging will be ignored") if self.tensorboard_idx is not None: diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index 50523ae31f..b1a13bc5d8 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -22,7 +22,6 @@ from deeppavlov.core.data.data_learning_iterator import DataLearningIterator from deeppavlov.core.trainers.fit_trainer import FitTrainer from deeppavlov.core.trainers.utils import parse_metrics -from deeppavlov.core.common.logging_class import TensorboardLogger, StdLogger, WandbLogger log = getLogger(__name__) @@ -153,16 +152,15 @@ def _improved(op): self.std_logger = StdLogger(stdlogging=True) if self.tensorboard_idx is not None: - self.tensorboardlogger = TensorboardLogger( + self.tensorboard_logger = TensorboardLogger( log_dir=self.logger[self.tensorboard_idx]["log_dir"]) if self.wandblogger_idx is not None: - if WandbLogger.login( - self.logger[self.wandblogger_idx].get("API_Key", None)): + if WandbLogger.login(API_Key = + self.logger[self.wandblogger_idx].get("API_Key", None), relogin = True): if self.log_every_n_epochs > 0 or self.val_every_n_epochs > 0: self.wandb_logger = WandbLogger( log_on="epochs", commit_on_valid=self.val_every_n_epochs > 0, **self.logger[self.wandblogger_idx].get("init", None)) - WandbLogger.log_on_train = True elif self.log_every_n_batches > 0 or self.val_every_n_batches > 0: self.wandb_logger = WandbLogger( log_on="batches", commit_on_valid=self.val_every_n_batches > 0, **self.logger[self.wandblogger_idx].get("init", None)) @@ -197,7 +195,7 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: self.start_time = time.time() if 
self.validate_first: self._send_event(event_name="before_validation") - _, report = self.validate_(self, iterator, "valid") + report = self.validate_(self, iterator, "valid") self._send_event(event_name='after_validation', data=report) while True: @@ -217,36 +215,36 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0: self._send_event(event_name="before_log") - metrics, report = None, None + report = None if self.stdlogger_idx is not None: - metrics, report = self.std_logger( - self, iterator, type="train", report=report, metrics=metrics) + report = self.std_logger( + self, iterator, type="train", report=report) if self.tensorboard_idx is not None: - metrics, report = self.tensorboardlogger( - self, iterator, type="train", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report, metrics=metrics) + report = self.tensorboard_logger( + self, iterator, type="train", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report) if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": - metrics, report = self.wandb_logger( - self, iterator=iterator, type="train", report=report, metrics=metrics) + report = self.wandb_logger( + self, iterator=iterator, type="train", report=report) # empty report if no logging method. self._send_event(event_name='after_train_log', data=report) if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0: self._send_event(event_name='before_validation') - report, metrics = None, None + report = None if self.stdlogger_idx is not None: - metrics, report = self.std_logger( - self, iterator, type="valid", report=report, metrics=metrics) + report = self.std_logger( + self, iterator, type="valid", report=report) if self.tensorboard_idx is not None: - metrics, report = self.tensorboardlogger( - self, iterator, type="valid", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report, metrics=metrics) + report = self.tensorboard_logger( + self, iterator, type="valid", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report) if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": - metrics, report = self.wandb_logger( - self, iterator, type="valid", report=report, metrics=metrics) + report = self.wandb_logger( + self, iterator, type="valid", report=report) self._send_event( event_name='after_validation', data=report) @@ -269,36 +267,36 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: if self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0: self._send_event(event_name="before_log") - report, metrics = None, None + report = None if self.stdlogger_idx is not None: - metrics, report = self.std_logger( - self, iterator=iterator, type="train", report=report, metrics=metrics) - - if self.tensorboard_idx is not None: - metrics, report = self.tensorboardlogger( - self, iterator=iterator, type="train", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report, metrics=metrics) + report = self.std_logger( + self, iterator=iterator, type="train", report=report) if self.wandblogger_idx is not None and self.wandb_logger.log_on == "epochs": - metrics, report = self.wandb_logger( - self, iterator, type="train", report=report, metrics=metrics) + report = self.wandb_logger( + self, iterator, type="train", 
report=report) + + if self.tensorboard_idx is not None: + report = self.tensorboard_logger( + self, iterator=iterator, type="train", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report) self._send_event(event_name='after_train_log', data=report) if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0: self._send_event(event_name='before_validation') - report, metrics = None, None + report = None if self.stdlogger_idx is not None: - metrics, report = self.std_logger( - self, iterator, type="valid", report=report, metrics=metrics) + report = self.std_logger( + self, iterator, type="valid", report=report) if self.tensorboard_idx is not None: - metrics, report = self.tensorboardlogger( - self, iterator=iterator, type="valid", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report, metrics=metrics) + report = self.tensorboard_logger( + self, iterator=iterator, type="valid", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report) if self.wandblogger_idx is not None and self.wandb_logger.log_on == "epochs": - metrics, report = self.wandb_logger( - self, iterator, type="valid", report=report, metrics=metrics) + report = self.wandb_logger( + self, iterator, type="valid", report=report) self._send_event(event_name='after_validation', data=report) @@ -329,3 +327,5 @@ def train(self, iterator: DataLearningIterator) -> None: if self.validation_number < 1: log.info('Save model to capture early training results') self.save() + +from deeppavlov.core.common.logging_class import TensorboardLogger, StdLogger, WandbLogger From 0fde139c69d4ec0123533cc2398b11053495acca Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Fri, 7 Jan 2022 12:40:38 +0300 Subject: [PATCH 13/18] Update init in W&B --- .../classifiers/sentiment_twitter.json | 22 ++++++++----------- deeppavlov/core/common/logging_class.py | 11 ++++++---- deeppavlov/core/trainers/nn_trainer.py | 4 ++++ 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 6d7d937a22..55b02dabd4 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -63,7 +63,7 @@ "main": true, "class_name": "keras_classification_model", "save_path": "{MODEL_PATH}/new_model", - + "load_path": "", "embedding_size": "#my_embedder.dim", "n_classes": "#classes_vocab.len", "kernel_sizes_cnn": [ @@ -74,7 +74,7 @@ "filters_cnn": 256, "optimizer": "Adam", "learning_rate": 0.01, - "learning_rate_decay": 0.1, + "learning_rate_decay": 0.01, "loss": "binary_crossentropy", "last_layer_activation": "softmax", "coef_reg_cnn": 1e-3, @@ -100,7 +100,7 @@ ] }, "train": { - "epochs": 30, + "epochs": 10, "batch_size": 64, "metrics": [ "accuracy", @@ -114,9 +114,8 @@ } ], "validation_patience": 5, - "val_every_n_epochs": 4, - "log_every_n_epochs": 3, - "log_on_k_batches":2, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, "show_examples": false, "evaluation_targets": [ "train", @@ -125,7 +124,6 @@ ], "class_name": "nn_trainer", "logger": [ - "hello", { "name": "TensorboardLogger", "log_dir": "{MODELS_PATH}/sentiment_twitter/Tensorboard_logs" @@ -135,13 +133,11 @@ }, { "name": "WandbLogger", - "API_Key":"be5cac1976dae2abd87fd045a7a101248c0a0252", + "API_Key":"be5cac1976dae2abd87fd045a7a101248c0a0253", "init":{ - "project": "Deep_Test_third", - "group": "Group_Name", - "job_type":"Name job Type", - "name":"Run name", - 
"reinit": true, + "project": "Tuning Hyperparameters", + "group": "Tuning lr & lr_decay", + "job_type":"lr=0.01, lr_decay=0.01", "config": { "description": "add any hyperprameter you want to monitor, architecture discription,..", "learning_rate": 0.02, diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py index a3b45c9bd1..504be8b66d 100644 --- a/deeppavlov/core/common/logging_class.py +++ b/deeppavlov/core/common/logging_class.py @@ -305,9 +305,7 @@ def login(API_Key: str = None, relogin: bool = True) -> bool: """ try: - wandb.login(key=API_Key, relogin=relogin) - wandb.init() # in case wandb.login() returns True with not valid key, but this will throw an error when initializing - return True + return wandb.login(key=API_Key, relogin=relogin) except Exception as e: log.warning(str(e)+", logging to WandB will be ignored") return False @@ -318,7 +316,12 @@ def get_report(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type def __init__(self, log_on: str = "epochs", commit_on_valid: bool = False, **kwargs) -> None: self.log_on = log_on # "epochs","batches" self.commit_on_valid = commit_on_valid - wandb.init(**kwargs) + try: + wandb.init(**kwargs) + self.init_succeed = True + except Exception as e: + log.warning(str(e)+", logging to WandB will be ignored") + self.init_succeed = False def __call__(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index b1a13bc5d8..3f6e967faa 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -161,9 +161,13 @@ def _improved(op): if self.log_every_n_epochs > 0 or self.val_every_n_epochs > 0: self.wandb_logger = WandbLogger( log_on="epochs", commit_on_valid=self.val_every_n_epochs > 0, **self.logger[self.wandblogger_idx].get("init", None)) + if self.wandb_logger.init_succeed == False: + self.wandblogger_idx = None elif self.log_every_n_batches > 0 or self.val_every_n_batches > 0: self.wandb_logger = WandbLogger( log_on="batches", commit_on_valid=self.val_every_n_batches > 0, **self.logger[self.wandblogger_idx].get("init", None)) + if self.wandb_logger.init_succeed == False: + self.wandblogger_idx = None else: self.wandblogger_idx = None From 95a090c28f71e733f8f58ae9c83feb91229a182c Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Sat, 22 Jan 2022 13:12:25 +0300 Subject: [PATCH 14/18] Create logging directory and place different logger in separate modules --- .../classifiers/sentiment_twitter.json | 12 +- .../core/common/logging/logging_class.py | 141 +++++++ deeppavlov/core/common/logging/std_logger.py | 54 +++ .../core/common/logging/tensorboard_logger.py | 87 ++++ .../core/common/logging/wandb_logger.py | 120 ++++++ deeppavlov/core/common/logging_class.py | 378 ------------------ deeppavlov/core/trainers/fit_trainer.py | 14 +- deeppavlov/core/trainers/nn_trainer.py | 264 ++++++++---- deeppavlov/requirements/wandb.txt | 3 + docs/conf.py | 2 +- requirements.txt | 5 +- 11 files changed, 592 insertions(+), 488 deletions(-) create mode 100644 deeppavlov/core/common/logging/logging_class.py create mode 100644 deeppavlov/core/common/logging/std_logger.py create mode 100644 deeppavlov/core/common/logging/tensorboard_logger.py create mode 100644 deeppavlov/core/common/logging/wandb_logger.py delete mode 100644 deeppavlov/core/common/logging_class.py create mode 100644 deeppavlov/requirements/wandb.txt diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json 
b/deeppavlov/configs/classifiers/sentiment_twitter.json index 55b02dabd4..545ab38995 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -3,7 +3,7 @@ "class_name": "basic_classification_reader", "x": "Twit", "y": "Class", - "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data/modified_data" + "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", @@ -62,8 +62,8 @@ ], "main": true, "class_name": "keras_classification_model", - "save_path": "{MODEL_PATH}/new_model", - "load_path": "", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", "embedding_size": "#my_embedder.dim", "n_classes": "#classes_vocab.len", "kernel_sizes_cnn": [ @@ -73,7 +73,7 @@ ], "filters_cnn": 256, "optimizer": "Adam", - "learning_rate": 0.01, + "learning_rate": 0.1, "learning_rate_decay": 0.01, "loss": "binary_crossentropy", "last_layer_activation": "softmax", @@ -100,7 +100,7 @@ ] }, "train": { - "epochs": 10, + "epochs": 100, "batch_size": 64, "metrics": [ "accuracy", @@ -174,4 +174,4 @@ } ] } -} \ No newline at end of file +} diff --git a/deeppavlov/core/common/logging/logging_class.py b/deeppavlov/core/common/logging/logging_class.py new file mode 100644 index 0000000000..7ec6c9bac9 --- /dev/null +++ b/deeppavlov/core/common/logging/logging_class.py @@ -0,0 +1,141 @@ +# Copyright 2022 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import datetime +from itertools import islice +from abc import ABC, abstractmethod +from typing import List, Tuple +from logging import getLogger + +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.core.trainers.nn_trainer import NNTrainer + + +log = getLogger(__name__) + + +class TrainLogger(ABC): + """An abstract class for logging metrics during training process.""" + + def get_report( + self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None + ) -> dict: + """ " + Get report about current process. + for 'valid' type, 'get_report' function also saves best score on validation data, and the model parameters corresponding to the best score. + + Args: + nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + type : if "train" returns report about training process, "valid" returns report about validation process. + + Returns: + dict contains data about current 'type' process. 
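As an illustration (not part of the patch), a "train" report assembled by this method is a plain dict whose keys match the code below; the values here are invented placeholders:

# Hypothetical example of a report returned for type == "train".
# Key names mirror those set in get_report(); the numbers are made up.
example_train_report = {
    "time_spent": "0:01:37",
    "epochs_done": 2,
    "batches_seen": 250,
    "train_examples_seen": 16000,
    "metrics": {"accuracy": 0.83},
    "loss": 0.42,
}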
+ + """ + if type == "train": + if nn_trainer.log_on_k_batches == 0: + report = { + "time_spent": str( + datetime.timedelta( + seconds=round(time.time() - nn_trainer.start_time + 0.5) + ) + ) + } + else: + data = islice( + iterator.gen_batches( + nn_trainer.batch_size, data_type="train", shuffle=True + ), + nn_trainer.log_on_k_batches, + ) + report = nn_trainer.test( + data, nn_trainer.train_metrics, start_time=nn_trainer.start_time + ) + + report.update( + { + "epochs_done": nn_trainer.epoch, + "batches_seen": nn_trainer.train_batches_seen, + "train_examples_seen": nn_trainer.examples, + } + ) + + metrics: List[Tuple[str, float]] = list( + report.get("metrics", {}).items() + ) + list(nn_trainer.last_result.items()) + + report.update(nn_trainer.last_result) + if nn_trainer.losses: + report["loss"] = sum(nn_trainer.losses) / len(nn_trainer.losses) + nn_trainer.losses.clear() + metrics.append(("loss", report["loss"])) + + elif type == "valid": + report = nn_trainer.test( + iterator.gen_batches( + nn_trainer.batch_size, data_type="valid", shuffle=False + ), + start_time=nn_trainer.start_time, + ) + + report["epochs_done"] = nn_trainer.epoch + report["batches_seen"] = nn_trainer.train_batches_seen + report["train_examples_seen"] = nn_trainer.examples + + metrics = list(report["metrics"].items()) + + m_name, score = metrics[0] + + # Update the patience + if nn_trainer.score_best is None: + nn_trainer.patience = 0 + else: + if nn_trainer.improved(score, nn_trainer.score_best): + nn_trainer.patience = 0 + else: + nn_trainer.patience += 1 + + # Run the validation model-saving logic + if nn_trainer._is_initial_validation(): + log.info("Initial best {} of {}".format(m_name, score)) + nn_trainer.score_best = score + elif nn_trainer._is_first_validation() and nn_trainer.score_best is None: + log.info("First best {} of {}".format(m_name, score)) + nn_trainer.score_best = score + log.info("Saving model") + nn_trainer.save() + elif nn_trainer.improved(score, nn_trainer.score_best): + log.info("Improved best {} of {}".format(m_name, score)) + nn_trainer.score_best = score + log.info("Saving model") + nn_trainer.save() + else: + log.info( + "Did not improve on the {} of {}".format( + m_name, nn_trainer.score_best + ) + ) + + report["impatience"] = nn_trainer.patience + if nn_trainer.validation_patience > 0: + report["patience_limit"] = nn_trainer.validation_patience + + nn_trainer.validation_number += 1 + return report + + @abstractmethod + def __call__() -> None: + raise NotImplementedError diff --git a/deeppavlov/core/common/logging/std_logger.py b/deeppavlov/core/common/logging/std_logger.py new file mode 100644 index 0000000000..1a828e4c5e --- /dev/null +++ b/deeppavlov/core/common/logging/std_logger.py @@ -0,0 +1,54 @@ +from typing import Dict +from logging import getLogger +import json + +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.core.trainers.nn_trainer import NNTrainer +from deeppavlov.core.trainers.utils import NumpyArrayEncoder +from deeppavlov.core.common.logging.logging_class import TrainLogger + +log = getLogger(__name__) + + +class StdLogger(TrainLogger): + """ + StdLogger class for logging report about current training and validation processes to stdout. + + Args: + stdlogging (bool): if True, log report to stdout. + the object of this class with stdlogging = False can be used for validation process. 
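A minimal usage sketch (not from the patch), assuming an NNTrainer instance and a data iterator already exist; it shows the two roles the stdlogging flag appears to cover, based on how NNTrainer constructs this class later in this patch:

# Sketch only: `trainer` and `iterator` are assumed to be built elsewhere.
printing_logger = StdLogger(stdlogging=True)     # logs each report as JSON via log.info
silent_validator = StdLogger(stdlogging=False)   # runs get_report() and model saving, prints nothing

valid_report = silent_validator(trainer, iterator, type="valid")  # the report dict is returned either way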
+ + """ + + def __init__(self, stdlogging: bool = True) -> None: + self.stdlogging = stdlogging + + def __call__( + self, + nn_trainer: NNTrainer, + iterator: DataLearningIterator, + type: str = None, + report: Dict = None, + ) -> dict: + """ + override call method, to log report to stdout. + + Args: + nn_trainer: NNTrainer object contains parameters required for preparing report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation. + type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. + report: dictionary contains current process information, if None, use 'get_report' method to get this report. + + Returns: + dict contains logged data to stdout. + + """ + if report is None: + report = self.get_report( + nn_trainer=nn_trainer, iterator=iterator, type=type + ) + if self.stdlogging: + log.info( + json.dumps({type: report}, ensure_ascii=False, cls=NumpyArrayEncoder) + ) + return report diff --git a/deeppavlov/core/common/logging/tensorboard_logger.py b/deeppavlov/core/common/logging/tensorboard_logger.py new file mode 100644 index 0000000000..f461712235 --- /dev/null +++ b/deeppavlov/core/common/logging/tensorboard_logger.py @@ -0,0 +1,87 @@ +from pathlib import Path +from typing import List, Tuple, Optional, Dict +from logging import getLogger + +import tensorflow as tf + +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.core.trainers.nn_trainer import NNTrainer +from deeppavlov.core.common.logging.logging_class import TrainLogger + +log = getLogger(__name__) + + +class TensorboardLogger(TrainLogger): + """ + TensorboardLogger class for logging to tesnorboard. + + Args: + log_dir (Path): path to local folder to log data into. + + """ + + def __init__(self, log_dir: Path = None) -> None: + self.train_log_dir = str(log_dir / "train_log") + self.valid_log_dir = str(log_dir / "valid_log") + self.tb_train_writer = tf.summary.FileWriter(self.train_log_dir) + self.tb_valid_writer = tf.summary.FileWriter(self.valid_log_dir) + + def __call__( + self, + nn_trainer: NNTrainer, + iterator: DataLearningIterator, + type: str = None, + tensorboard_tag: Optional[str] = None, + tensorboard_index: Optional[int] = None, + report: Dict = None, + ) -> dict: + """ + override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. + for 'valid' logging type, log metrics of validation process to log_dir/valid_log. + + Args: + nn_trainer: NNTrainer object contains parameters required for preparing the report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. + tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' + tensorboard_index: one of two options: 'train_batches_seen', 'epoch' corresponding to 'tensorboard_tag' types respectively. + report: dictionary contains current process information, if None, use 'get_report' method to get this report. + + Returns: + dict contains metrics logged to tesnorboard. 
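The writer calls above rely on the TensorFlow 1.x summary API; a self-contained sketch of the same pattern, assuming a TF 1.x installation and an invented log path, looks roughly like this:

# Rough standalone equivalent of what __call__ writes (TF 1.x API assumed; paths are placeholders).
import tensorflow as tf

writer = tf.summary.FileWriter("logs/train_log")   # same layout as log_dir / "train_log" above
summary = tf.Summary()
summary.value.add(tag="every_n_epochs/accuracy", simple_value=0.83)
writer.add_summary(summary, 2)                     # 2 = epoch number used as the step index
writer.flush()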
+ + """ + if report is None: + report = self.get_report( + nn_trainer=nn_trainer, iterator=iterator, type=type + ) + + if type == "train": + metrics: List[Tuple[str, float]] = list( + report.get("metrics", {}).items() + ) + list(nn_trainer.last_result.items()) + if report.get("loss", None) is not None: + metrics.append(("loss", report["loss"])) + + if metrics and self.train_log_dir is not None: + summary = nn_trainer._tf.Summary() + + for name, score in metrics: + summary.value.add( + tag=f"{tensorboard_tag}/{name}", simple_value=score + ) + self.tb_train_writer.add_summary(summary, tensorboard_index) + self.tb_train_writer.flush() + else: + metrics = list(report["metrics"].items()) + if tensorboard_tag is not None and self.valid_log_dir is not None: + summary = nn_trainer._tf.Summary() + for name, score in metrics: + summary.value.add( + tag=f"{tensorboard_tag}/{name}", simple_value=score + ) + if tensorboard_index is None: + tensorboard_index = nn_trainer.train_batches_seen + self.tb_valid_writer.add_summary(summary, tensorboard_index) + self.tb_valid_writer.flush() + return report diff --git a/deeppavlov/core/common/logging/wandb_logger.py b/deeppavlov/core/common/logging/wandb_logger.py new file mode 100644 index 0000000000..c3cf4cca02 --- /dev/null +++ b/deeppavlov/core/common/logging/wandb_logger.py @@ -0,0 +1,120 @@ +import time +import datetime +from typing import Dict +from logging import getLogger + +import wandb + +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.core.trainers.nn_trainer import NNTrainer +from deeppavlov.core.common.logging.logging_class import TrainLogger + + +log = getLogger(__name__) + + +class WandbLogger(TrainLogger): + """ + WandbLogger class for logging report about current training and validation processes to WandB during training. ("https://wandb.ai/site"). + + WandB is a central dashboard to keep track of your hyperparameters, system metrics, and predictions so you can compare models live, and share your findings. + WandB doesn't support more than one run concurrently, so logging will be on "epochs" or "batches" + If val_every_n_epochs > 0 or log_every_n_epochs > 0 in config file, logging to wandb will be on epochs. + Otherwise if val_every_n_batches > 0 or log_every_n_batches > 0 in config file, logging to wandb will be on batches. + if none of them, logging to wandb will be ignored. + + Args: + log_on (str): if "epochs": logging to wandb on epochs, if "batches: logging on batches. + commit_on_valid (bool): If False wandb.log just updates the current metrics dict with the row argument and metrics won't be saved until wandb.log is called with commit=True + to commit training and validation reports with the same steps, this argument is True if logging on validation required + **kwargs: arguments for wandb initialization, more info: https://docs.wandb.ai/ref/python/init + + """ + + @staticmethod + def login(API_Key: str = None, relogin: bool = True) -> bool: + """ " + static method to login to wandb account, if login or init to wandb failed, logging to wandb will be ignored. + + Args: + API_Key (str): authentication key. + relogin (bool): if True, force relogin if already logged in. + report(dict): dictionary contains current process information, if None, use 'get_report' method to get this report. + + Returns: + True if login and init processes succeed, otherwise False and logging to wandb will be ignored. 
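For reference, a hedged example of the "logger" entry that would drive this class from a training config, written here as a Python dict: the key names ("name", "API_Key", "init") follow what the trainers read in this patch, while the project, group and config values are placeholders:

# Hypothetical logger entry; only the key names come from the trainer code.
wandb_logger_entry = {
    "name": "WandbLogger",
    "API_Key": "<40-character key from https://wandb.ai/home>",
    "init": {                              # forwarded to wandb.init(**kwargs)
        "project": "Tuning Hyperparameters",
        "group": "Tuning lr & lr_decay",
        "config": {"learning_rate": 0.02},
    },
}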
+ + """ + try: + return wandb.login(key=API_Key, relogin=relogin) + except Exception as e: + log.warning(str(e) + ", logging to WandB will be ignored") + return False + + def __init__( + self, log_on: str = "epochs", commit_on_valid: bool = False, **kwargs + ) -> None: + self.log_on = log_on # "epochs","batches" + self.commit_on_valid = commit_on_valid + try: + wandb.init(**kwargs) + self.init_succeed = True + except Exception as e: + log.warning(str(e) + ", logging to WandB will be ignored") + self.init_succeed = False + + def __call__( + self, + nn_trainer: NNTrainer, + iterator: DataLearningIterator, + type: str = None, + report: Dict = None, + ): + """ " + Logging report of the training process to wandb. + + Args: + nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + report (dict): report for logging to WandB. If None, use 'get_report' method to get this report. + type (str) : process type, if "train" logs report about training process, else if "valid" logs report about validation process. + + Returns: + dict contains logged data to WandB. + + """ + if report is None: + report = self.get_report( + nn_trainer=nn_trainer, iterator=iterator, type=type + ) + + logging_type = type + "/" + for i in report.keys(): + if isinstance(report[i], dict): + for key, value in report[i].items(): + wandb.log({logging_type + key: value}, commit=False) + else: + if i == "time_spent": + t = time.strptime(report[i], "%H:%M:%S") + y_seconds = int( + datetime.timedelta( + hours=t.tm_hour, minutes=t.tm_min, seconds=t.tm_sec + ).total_seconds() + ) + wandb.log({logging_type + i + ("(s)"): y_seconds}, commit=False) + else: + wandb.log({logging_type + i: report[i]}, commit=False) + + # if "val_every_n_epochs" is not None, we have to commit data on validation logging, otherwise on training. + if (self.commit_on_valid and logging_type == "valid/") or ( + not self.commit_on_valid and logging_type == "train/" + ): + wandb.log({}, commit=True) + + return report + + @staticmethod + def close(): + """close function to commit the not commited logs and to mark a run as finished wiht wanb.finish method, and finishes uploading all data.""" + wandb.log({}, commit=True) + wandb.finish() diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py deleted file mode 100644 index 504be8b66d..0000000000 --- a/deeppavlov/core/common/logging_class.py +++ /dev/null @@ -1,378 +0,0 @@ -# Copyright 2019 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
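A small sketch of what the __call__ above effectively sends to wandb for a validation report: nested metric dicts become "valid/<name>" entries and "time_spent" is re-encoded in seconds (a toy report is used and an active wandb run is assumed):

import datetime
import time

import wandb  # a run is assumed to be active via wandb.init()

report = {"metrics": {"accuracy": 0.83}, "time_spent": "0:01:37"}  # toy report, values invented
t = time.strptime(report["time_spent"], "%H:%M:%S")
seconds = int(datetime.timedelta(hours=t.tm_hour, minutes=t.tm_min, seconds=t.tm_sec).total_seconds())
wandb.log({"valid/accuracy": report["metrics"]["accuracy"], "valid/time_spent(s)": seconds}, commit=True)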
- -import json -import time -from pathlib import Path -import datetime -from itertools import islice -from abc import ABC, abstractmethod -from typing import List, Tuple, Optional, Dict -from logging import getLogger - -import tensorflow as tf -import wandb - -from deeppavlov.core.trainers.utils import NumpyArrayEncoder -from deeppavlov.core.data.data_learning_iterator import DataLearningIterator -from deeppavlov.core.trainers.nn_trainer import NNTrainer - -log = getLogger(__name__) - - -class TrainLogger(ABC): - """An abstract class for logging metrics during training process. - - There are three types of logging: - - StdLogger: for logging report about current training and validation processes to stdout. - - TensorboardLogger: for logging to tensorboard. - - WandbLogger: for logging to WandB. - - """ - - @abstractmethod - def __init__() -> None: - """ - The constructor for TrainLogger class. - - """ - raise NotImplementedError - - @abstractmethod - def get_report(self, - nn_trainer: NNTrainer, - iterator: DataLearningIterator, type: str = None) -> dict: - """" - Get report about current process. - for 'valid' type, 'get_report' function also saves best score on validation data, and the model parameters corresponding to the best score. - - Args: - nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. - iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation - type : if "train" returns report about training process, "valid" returns report about validation process. - - Returns: - dict contains data about current 'type' process. - - """ - if type == "train": - if nn_trainer.log_on_k_batches == 0: - report = { - "time_spent": str( - datetime.timedelta( - seconds=round( - time.time() - nn_trainer.start_time + 0.5) - ) - ) - } - else: - data = islice( - iterator.gen_batches( - nn_trainer.batch_size, data_type="train", shuffle=True - ), - nn_trainer.log_on_k_batches, - ) - report = nn_trainer.test( - data, nn_trainer.train_metrics, start_time=nn_trainer.start_time - ) - - report.update( - { - "epochs_done": nn_trainer.epoch, - "batches_seen": nn_trainer.train_batches_seen, - "train_examples_seen": nn_trainer.examples, - } - ) - - metrics: List[Tuple[str, float]] = list( - report.get("metrics", {}).items() - ) + list(nn_trainer.last_result.items()) - - report.update(nn_trainer.last_result) - if nn_trainer.losses: - report["loss"] = sum(nn_trainer.losses) / \ - len(nn_trainer.losses) - nn_trainer.losses.clear() - metrics.append(("loss", report["loss"])) - - elif type == "valid": - report = nn_trainer.test( - iterator.gen_batches( - nn_trainer.batch_size, data_type="valid", shuffle=False - ), - start_time=nn_trainer.start_time, - ) - - report["epochs_done"] = nn_trainer.epoch - report["batches_seen"] = nn_trainer.train_batches_seen - report["train_examples_seen"] = nn_trainer.examples - - metrics = list(report["metrics"].items()) - - m_name, score = metrics[0] - - # Update the patience - if nn_trainer.score_best is None: - nn_trainer.patience = 0 - else: - if nn_trainer.improved(score, nn_trainer.score_best): - nn_trainer.patience = 0 - else: - nn_trainer.patience += 1 - - # Run the validation model-saving logic - if nn_trainer._is_initial_validation(): - log.info("Initial best {} of {}".format(m_name, score)) - nn_trainer.score_best = score - elif nn_trainer._is_first_validation() and nn_trainer.score_best is None: - log.info("First best {} of {}".format(m_name, score)) - nn_trainer.score_best = score - 
log.info("Saving model") - nn_trainer.save() - elif nn_trainer.improved(score, nn_trainer.score_best): - log.info("Improved best {} of {}".format(m_name, score)) - nn_trainer.score_best = score - log.info("Saving model") - nn_trainer.save() - else: - log.info( - "Did not improve on the {} of {}".format( - m_name, nn_trainer.score_best - ) - ) - - report["impatience"] = nn_trainer.patience - if nn_trainer.validation_patience > 0: - report["patience_limit"] = nn_trainer.validation_patience - - nn_trainer.validation_number += 1 - return report - - @abstractmethod - def __call__() -> None: - raise NotImplementedError - - -class StdLogger(TrainLogger): - """ - StdLogger class for logging report about current training and validation processes to stdout. - - Args: - stdlogging (bool): if True, log report to stdout. - the object of this class with stdlogging = False can be used for validation process. - - """ - - def __init__(self, stdlogging: bool = True) -> None: - self.stdlogging = stdlogging - - def get_report(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None) -> dict: - return super().get_report(nn_trainer=nn_trainer, iterator=iterator, type=type) - - def __call__(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None, report: Dict = None) -> dict: - """ - override call method, to log report to stdout. - - Args: - nn_trainer: NNTrainer object contains parameters required for preparing report. - iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation. - type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. - report: dictionary contains current process information, if None, use 'get_report' method to get this report. - - Returns: - dict contains logged data to stdout. - - """ - if report is None: - report = self.get_report( - nn_trainer=nn_trainer, iterator=iterator, type=type) - if self.stdlogging: - log.info(json.dumps({type: report}, - ensure_ascii=False, cls=NumpyArrayEncoder)) - return report - - -class TensorboardLogger(TrainLogger): - """ - TensorboardLogger class for logging to tesnorboard. - - Args: - log_dir (Path): path to local folder to log data into. - - """ - - def __init__(self, log_dir: Path = None) -> None: - self.train_log_dir = str(log_dir / 'train_log') - self.valid_log_dir = str(log_dir / 'valid_log') - self.tb_train_writer = tf.summary.FileWriter(self.train_log_dir) - self.tb_valid_writer = tf.summary.FileWriter(self.valid_log_dir) - - def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): - return super().get_report(nn_trainer, iterator, type=type) - - def __call__(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None, tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None, report: Dict = None) -> dict: - """ - override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. - for 'valid' logging type, log metrics of validation process to log_dir/valid_log. - - Args: - nn_trainer: NNTrainer object contains parameters required for preparing the report. - iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation - type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. 
- tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' - tensorboard_index: one of two options: 'train_batches_seen', 'epoch' corresponding to 'tensorboard_tag' types respectively. - report: dictionary contains current process information, if None, use 'get_report' method to get this report. - - Returns: - dict contains metrics logged to tesnorboard. - - """ - if report is None: - report = self.get_report( - nn_trainer=nn_trainer, iterator=iterator, type=type) - - if type == "train": - metrics: List[Tuple[str, float]] = list( - report.get("metrics", {}).items() - ) + list(nn_trainer.last_result.items()) - if report.get("loss", None) is not None: - metrics.append(("loss", report["loss"])) - - if metrics and self.train_log_dir is not None: - summary = nn_trainer._tf.Summary() - - for name, score in metrics: - summary.value.add( - tag=f"{tensorboard_tag}/{name}", simple_value=score - ) - self.tb_train_writer.add_summary(summary, tensorboard_index) - self.tb_train_writer.flush() - else: - metrics = list(report["metrics"].items()) - if tensorboard_tag is not None and self.valid_log_dir is not None: - summary = nn_trainer._tf.Summary() - for name, score in metrics: - summary.value.add( - tag=f'{tensorboard_tag}/{name}', simple_value=score) - if tensorboard_index is None: - tensorboard_index = nn_trainer.train_batches_seen - self.tb_valid_writer.add_summary(summary, tensorboard_index) - self.tb_valid_writer.flush() - return report - - -class WandbLogger(TrainLogger): - """ - WandbLogger class for logging report about current training and validation processes to WandB during training. ("https://wandb.ai/site"). - - WandB is a central dashboard to keep track of your hyperparameters, system metrics, and predictions so you can compare models live, and share your findings. - WandB doesn't support more than one run concurrently, so logging will be on "epochs" or "batches" - If val_every_n_epochs > 0 or log_every_n_epochs > 0 in config file, logging to wandb will be on epochs. - Otherwise if val_every_n_batches > 0 or log_every_n_batches > 0 in config file, logging to wandb will be on batches. - if none of them, logging to wandb will be ignored. - - Args: - log_on (str): if "epochs": logging to wandb on epochs, if "batches: logging on batches. - commit_on_valid (bool): If False wandb.log just updates the current metrics dict with the row argument and metrics won't be saved until wandb.log is called with commit=True - to commit training and validation reports with the same steps, this argument is True if logging on validation required - **kwargs: arguments for wandb initialization, more info: https://docs.wandb.ai/ref/python/init - - """ - - @staticmethod - def login(API_Key: str = None, relogin: bool = True) -> bool: - """" - static method to login to wandb account, if login or init to wandb failed, logging to wandb will be ignored. - - Args: - API_Key (str): authentication key. - relogin (bool): if True, force relogin if already logged in. - report(dict): dictionary contains current process information, if None, use 'get_report' method to get this report. - - Returns: - True if login and init processes succeed, otherwise False and logging to wandb will be ignored. 
- - """ - try: - return wandb.login(key=API_Key, relogin=relogin) - except Exception as e: - log.warning(str(e)+", logging to WandB will be ignored") - return False - - def get_report(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None) -> dict: - return super().get_report(nn_trainer=nn_trainer, iterator=iterator, type=type) - - def __init__(self, log_on: str = "epochs", commit_on_valid: bool = False, **kwargs) -> None: - self.log_on = log_on # "epochs","batches" - self.commit_on_valid = commit_on_valid - try: - wandb.init(**kwargs) - self.init_succeed = True - except Exception as e: - log.warning(str(e)+", logging to WandB will be ignored") - self.init_succeed = False - - def __call__(self, nn_trainer: NNTrainer, - iterator: DataLearningIterator, - type: str = None, - report: Dict = None - ): - """ " - Logging report of the training process to wandb. - - Args: - nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. - iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation - report (dict): report for logging to WandB. If None, use 'get_report' method to get this report. - type (str) : process type, if "train" logs report about training process, else if "valid" logs report about validation process. - - Returns: - dict contains logged data to WandB. - - """ - if report is None: - report = self.get_report( - nn_trainer=nn_trainer, iterator=iterator, type=type) - - logging_type = type + "/" - for i in report.keys(): - if isinstance(report[i], dict): - for key, value in report[i].items(): - wandb.log( - {logging_type+key: value}, commit=False) - else: - if i == "time_spent": - t = time.strptime(report[i], "%H:%M:%S") - y_seconds = int( - datetime.timedelta( - hours=t.tm_hour, minutes=t.tm_min, seconds=t.tm_sec - ).total_seconds() - ) - wandb.log({logging_type+i+("(s)"): y_seconds}, - commit=False) - else: - wandb.log( - {logging_type+i: report[i]}, commit=False) - - # if "val_every_n_epochs" is not None, we have to commit data on validation logging, otherwise on training. - if (self.commit_on_valid and logging_type == "valid/") or (not self.commit_on_valid and logging_type == "train/"): - wandb.log({}, commit=True) - - return report - - @staticmethod - def close(): - """close function to commit the not commited logs and to mark a run as finished wiht wanb.finish method, and finishes uploading all data.""" - wandb.log({}, commit= True) - wandb.finish() diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index c8b6d2af38..da7aa7686d 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -51,18 +51,6 @@ class FitTrainer: in evaluation logs (default is ``False``) logger : list of dictionaries of possible loggers from deeppavlov.configs files. (default is ``None``) - Possible loggers: - - TensorboardLogger: for logging to tesnorboard. Keys: - "name": "TensorboardLogger", logging to tensorboard will be ignored if None - "log_dir":str or path to a directory where tensorboard logs can be stored, ignored if None - (default is ``None``) - - StdLogger: for logging report about current training and validation processes to stdout. Keys: - "name": "StdLogger". logging to stdout will be ignored if None. (default is ``None``) - - WandbLogger: logging report about current training and validation processes to WandB. Keys: - "name": "WandbLogger", logging to wandb will be ignored if None. 
- "API_Key": API of 40 characters long from 'https://wandb.ai/home' personal account. - "init": dictionary of (key:value) for wandb.init configurations. see: 'https://docs.wandb.ai/ref/python/init' - (default is ``None``) max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative (default is ``-1``) **kwargs: additional parameters whose names will be logged but otherwise ignored @@ -105,6 +93,8 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, self.tensorboard_idx, self.stdlogger_idx, self.wandblogger_idx = None, None, None log.warning( "Check logger dictionary in configs, logging will be ignored") + if self.tensorboard_idx is None and self.wandblogger_idx is None: + self.stdlogger_idx = 1 if self.tensorboard_idx is not None: try: # noinspection PyPackageRequirements diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index 3f6e967faa..d706a8a434 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -26,7 +26,7 @@ log = getLogger(__name__) -@register('nn_trainer') +@register("nn_trainer") class NNTrainer(FitTrainer): """ | Bases :class:`~deeppavlov.core.trainers.FitTrainer` @@ -85,47 +85,67 @@ class NNTrainer(FitTrainer): """ - def __init__(self, chainer_config: dict, *, - batch_size: int = 1, - epochs: int = -1, - start_epoch_num: int = 0, - max_batches: int = -1, - metrics: Iterable[Union[str, dict]] = ('accuracy',), - train_metrics: Optional[Iterable[Union[str, dict]]] = None, - metric_optimization: str = 'maximize', - evaluation_targets: Iterable[str] = ('valid', 'test'), - show_examples: bool = False, - logger: Optional[List[Dict]] = None, - max_test_batches: int = -1, - validate_first: bool = True, - validation_patience: int = 5, val_every_n_epochs: int = -1, val_every_n_batches: int = -1, - log_every_n_batches: int = -1, log_every_n_epochs: int = -1, log_on_k_batches: int = 1, - - **kwargs) -> None: - super().__init__(chainer_config, batch_size=batch_size, metrics=metrics, evaluation_targets=evaluation_targets, - show_examples=show_examples, - logger=logger, - max_test_batches=max_test_batches, **kwargs) + def __init__( + self, + chainer_config: dict, + *, + batch_size: int = 1, + epochs: int = -1, + start_epoch_num: int = 0, + max_batches: int = -1, + metrics: Iterable[Union[str, dict]] = ("accuracy",), + train_metrics: Optional[Iterable[Union[str, dict]]] = None, + metric_optimization: str = "maximize", + evaluation_targets: Iterable[str] = ("valid", "test"), + show_examples: bool = False, + logger: Optional[List[Dict]] = None, + max_test_batches: int = -1, + validate_first: bool = True, + validation_patience: int = 5, + val_every_n_epochs: int = -1, + val_every_n_batches: int = -1, + log_every_n_batches: int = -1, + log_every_n_epochs: int = -1, + log_on_k_batches: int = 1, + **kwargs, + ) -> None: + super().__init__( + chainer_config, + batch_size=batch_size, + metrics=metrics, + evaluation_targets=evaluation_targets, + show_examples=show_examples, + logger=logger, + max_test_batches=max_test_batches, + **kwargs, + ) if train_metrics is None: self.train_metrics = self.metrics else: self.train_metrics = parse_metrics( - train_metrics, self._chainer.in_y, self._chainer.out_params) + train_metrics, self._chainer.in_y, self._chainer.out_params + ) metric_optimization = metric_optimization.strip().lower() self.score_best = None def _improved(op): - return lambda score, baseline: False if baseline is None or score is None \ + return ( + lambda 
score, baseline: False + if baseline is None or score is None else op(score, baseline) + ) - if metric_optimization == 'maximize': + if metric_optimization == "maximize": self.improved = _improved(lambda a, b: a > b) - elif metric_optimization == 'minimize': + elif metric_optimization == "minimize": self.improved = _improved(lambda a, b: a < b) else: - raise ConfigError('metric_optimization has to be one of {}'.format( - ['maximize', 'minimize'])) + raise ConfigError( + "metric_optimization has to be one of {}".format( + ["maximize", "minimize"] + ) + ) self.validate_first = validate_first self.validate_ = StdLogger(self.stdlogger_idx is not None) @@ -153,27 +173,36 @@ def _improved(op): if self.tensorboard_idx is not None: self.tensorboard_logger = TensorboardLogger( - log_dir=self.logger[self.tensorboard_idx]["log_dir"]) + log_dir=self.logger[self.tensorboard_idx]["log_dir"] + ) if self.wandblogger_idx is not None: - if WandbLogger.login(API_Key = - self.logger[self.wandblogger_idx].get("API_Key", None), relogin = True): + if WandbLogger.login( + API_Key=self.logger[self.wandblogger_idx].get("API_Key", None), + relogin=True, + ): if self.log_every_n_epochs > 0 or self.val_every_n_epochs > 0: self.wandb_logger = WandbLogger( - log_on="epochs", commit_on_valid=self.val_every_n_epochs > 0, **self.logger[self.wandblogger_idx].get("init", None)) + log_on="epochs", + commit_on_valid=self.val_every_n_epochs > 0, + **self.logger[self.wandblogger_idx].get("init", None), + ) if self.wandb_logger.init_succeed == False: - self.wandblogger_idx = None + self.wandblogger_idx = None elif self.log_every_n_batches > 0 or self.val_every_n_batches > 0: self.wandb_logger = WandbLogger( - log_on="batches", commit_on_valid=self.val_every_n_batches > 0, **self.logger[self.wandblogger_idx].get("init", None)) + log_on="batches", + commit_on_valid=self.val_every_n_batches > 0, + **self.logger[self.wandblogger_idx].get("init", None), + ) if self.wandb_logger.init_succeed == False: - self.wandblogger_idx = None + self.wandblogger_idx = None else: self.wandblogger_idx = None def save(self) -> None: if self._loaded: - raise RuntimeError('Cannot save already finalized chainer') + raise RuntimeError("Cannot save already finalized chainer") self._chainer.save() @@ -185,10 +214,12 @@ def _is_first_validation(self): def _send_event(self, event_name: str, data: Optional[dict] = None) -> None: report = { - 'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5))), - 'epochs_done': self.epoch, - 'batches_seen': self.train_batches_seen, - 'train_examples_seen': self.examples + "time_spent": str( + datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5)) + ), + "epochs_done": self.epoch, + "batches_seen": self.train_batches_seen, + "train_examples_seen": self.examples, } if data is not None: report.update(data) @@ -200,67 +231,94 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: if self.validate_first: self._send_event(event_name="before_validation") report = self.validate_(self, iterator, "valid") - self._send_event(event_name='after_validation', data=report) + self._send_event(event_name="after_validation", data=report) while True: impatient = False - self._send_event(event_name='before_train') - for x, y_true in iterator.gen_batches(self.batch_size, data_type='train'): + self._send_event(event_name="before_train") + for x, y_true in iterator.gen_batches(self.batch_size, data_type="train"): self.last_result = self._chainer.train_on_batch(x, y_true) if 
self.last_result is None: self.last_result = {} elif not isinstance(self.last_result, dict): - self.last_result = {'loss': self.last_result} - if 'loss' in self.last_result: - self.losses.append(self.last_result.pop('loss')) + self.last_result = {"loss": self.last_result} + if "loss" in self.last_result: + self.losses.append(self.last_result.pop("loss")) self.train_batches_seen += 1 self.examples += len(x) - if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0: + if ( + self.log_every_n_batches > 0 + and self.train_batches_seen % self.log_every_n_batches == 0 + ): self._send_event(event_name="before_log") report = None if self.stdlogger_idx is not None: report = self.std_logger( - self, iterator, type="train", report=report) + self, iterator, type="train", report=report + ) if self.tensorboard_idx is not None: report = self.tensorboard_logger( - self, iterator, type="train", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report) - - if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": + self, + iterator, + type="train", + tensorboard_tag="every_n_batches", + tensorboard_index=self.train_batches_seen, + report=report, + ) + + if ( + self.wandblogger_idx is not None + and self.wandb_logger.log_on == "batches" + ): report = self.wandb_logger( - self, iterator=iterator, type="train", report=report) + self, iterator=iterator, type="train", report=report + ) # empty report if no logging method. - self._send_event(event_name='after_train_log', data=report) + self._send_event(event_name="after_train_log", data=report) - if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0: - self._send_event(event_name='before_validation') + if ( + self.val_every_n_batches > 0 + and self.train_batches_seen % self.val_every_n_batches == 0 + ): + self._send_event(event_name="before_validation") report = None if self.stdlogger_idx is not None: report = self.std_logger( - self, iterator, type="valid", report=report) + self, iterator, type="valid", report=report + ) if self.tensorboard_idx is not None: report = self.tensorboard_logger( - self, iterator, type="valid", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report) - - if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": + self, + iterator, + type="valid", + tensorboard_tag="every_n_batches", + tensorboard_index=self.train_batches_seen, + report=report, + ) + + if ( + self.wandblogger_idx is not None + and self.wandb_logger.log_on == "batches" + ): report = self.wandb_logger( - self, iterator, type="valid", report=report) + self, iterator, type="valid", report=report + ) - self._send_event( - event_name='after_validation', data=report) + self._send_event(event_name="after_validation", data=report) - self._send_event(event_name='after_batch') + self._send_event(event_name="after_batch") if 0 < self.max_batches <= self.train_batches_seen: impatient = True break if 0 < self.validation_patience <= self.patience: - log.info('Ran out of patience') + log.info("Ran out of patience") impatient = True break @@ -268,68 +326,100 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: break self.epoch += 1 - if self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0: + if ( + self.log_every_n_epochs > 0 + and self.epoch % self.log_every_n_epochs == 0 + ): self._send_event(event_name="before_log") report = None if self.stdlogger_idx is not 
None: report = self.std_logger( - self, iterator=iterator, type="train", report=report) + self, iterator=iterator, type="train", report=report + ) - if self.wandblogger_idx is not None and self.wandb_logger.log_on == "epochs": + if ( + self.wandblogger_idx is not None + and self.wandb_logger.log_on == "epochs" + ): report = self.wandb_logger( - self, iterator, type="train", report=report) + self, iterator, type="train", report=report + ) if self.tensorboard_idx is not None: report = self.tensorboard_logger( - self, iterator=iterator, type="train", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report) - - self._send_event(event_name='after_train_log', data=report) - - if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0: - self._send_event(event_name='before_validation') + self, + iterator=iterator, + type="train", + tensorboard_tag="every_n_epochs", + tensorboard_index=self.epoch, + report=report, + ) + + self._send_event(event_name="after_train_log", data=report) + + if ( + self.val_every_n_epochs > 0 + and self.epoch % self.val_every_n_epochs == 0 + ): + self._send_event(event_name="before_validation") report = None if self.stdlogger_idx is not None: report = self.std_logger( - self, iterator, type="valid", report=report) + self, iterator, type="valid", report=report + ) if self.tensorboard_idx is not None: report = self.tensorboard_logger( - self, iterator=iterator, type="valid", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report) - - if self.wandblogger_idx is not None and self.wandb_logger.log_on == "epochs": + self, + iterator=iterator, + type="valid", + tensorboard_tag="every_n_epochs", + tensorboard_index=self.epoch, + report=report, + ) + + if ( + self.wandblogger_idx is not None + and self.wandb_logger.log_on == "epochs" + ): report = self.wandb_logger( - self, iterator, type="valid", report=report) + self, iterator, type="valid", report=report + ) - self._send_event(event_name='after_validation', data=report) + self._send_event(event_name="after_validation", data=report) - self._send_event(event_name='after_epoch') + self._send_event(event_name="after_epoch") if 0 < self.max_epochs <= self.epoch: break if 0 < self.validation_patience <= self.patience: - log.info('Ran out of patience') + log.info("Ran out of patience") break def train(self, iterator: DataLearningIterator) -> None: """Call :meth:`~fit_chainer` and then :meth:`~train_on_batches` with provided data iterator as an argument""" self.fit_chainer(iterator) - if callable(getattr(self._chainer, 'train_on_batch', None)): + if callable(getattr(self._chainer, "train_on_batch", None)): try: self.train_on_batches(iterator) if self.wandblogger_idx is not None: self.wandb_logger.close() except KeyboardInterrupt: - log.info('Stopped training') + log.info("Stopped training") else: log.warning( - f'Using {self.__class__.__name__} for a pipeline without batched training') + f"Using {self.__class__.__name__} for a pipeline without batched training" + ) # Run the at-train-exit model-saving logic if self.validation_number < 1: - log.info('Save model to capture early training results') + log.info("Save model to capture early training results") self.save() -from deeppavlov.core.common.logging_class import TensorboardLogger, StdLogger, WandbLogger + +from deeppavlov.core.common.logging.wandb_logger import WandbLogger +from deeppavlov.core.common.logging.std_logger import StdLogger +from deeppavlov.core.common.logging.tensorboard_logger import TensorboardLogger diff 
--git a/deeppavlov/requirements/wandb.txt b/deeppavlov/requirements/wandb.txt new file mode 100644 index 0000000000..9983212701 --- /dev/null +++ b/deeppavlov/requirements/wandb.txt @@ -0,0 +1,3 @@ +wandb==0.12.7 +pybind11==2.2 +fasttext \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index b3a4f11237..7454c1a691 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -193,7 +193,7 @@ autodoc_mock_imports = ['bert_dp', 'bs4', 'faiss', 'fastText', 'fasttext', 'gensim', 'hdt', 'kenlm', 'librosa', 'lxml', 'nemo', 'nemo_asr', 'nemo_tts', 'nltk', 'opt_einsum', 'rapidfuzz', 'rasa', 'russian_tagsets', 'sacremoses', 'sortedcontainers', 'spacy', 'tensorflow', 'tensorflow_hub', - 'torch', 'transformers', 'udapi', 'ufal_udpipe', 'whapi', 'xeger'] + 'torch', 'transformers', 'udapi', 'ufal_udpipe','wandb', 'whapi', 'xeger'] extlinks = { 'config': (f'https://github.com/deepmipt/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None) diff --git a/requirements.txt b/requirements.txt index b6fa7bdf47..4708c92e6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,4 @@ tqdm==4.62.0 click==7.1.2 uvicorn==0.11.7 sacremoses==0.0.35 -uvloop==0.14.0 -wandb==0.12.7 -pybind11==2.2 -fasttext +uvloop==0.14.0 \ No newline at end of file From c3ee83d82895fe49dabc34ca978b2d1de93d7cca Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Sat, 22 Jan 2022 13:12:25 +0300 Subject: [PATCH 15/18] Create logging directory and place different loggers in separate modules --- .../classifiers/sentiment_twitter.json | 12 +- .../core/common/logging/logging_class.py | 141 +++++++ deeppavlov/core/common/logging/std_logger.py | 54 +++ .../core/common/logging/tensorboard_logger.py | 87 ++++ .../core/common/logging/wandb_logger.py | 120 ++++++ deeppavlov/core/common/logging_class.py | 378 ------------------ deeppavlov/core/trainers/fit_trainer.py | 14 +- deeppavlov/core/trainers/nn_trainer.py | 264 ++++++++---- deeppavlov/requirements/wandb.txt | 3 + docs/conf.py | 2 +- requirements.txt | 5 +- 11 files changed, 592 insertions(+), 488 deletions(-) create mode 100644 deeppavlov/core/common/logging/logging_class.py create mode 100644 deeppavlov/core/common/logging/std_logger.py create mode 100644 deeppavlov/core/common/logging/tensorboard_logger.py create mode 100644 deeppavlov/core/common/logging/wandb_logger.py delete mode 100644 deeppavlov/core/common/logging_class.py create mode 100644 deeppavlov/requirements/wandb.txt diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 55b02dabd4..545ab38995 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -3,7 +3,7 @@ "class_name": "basic_classification_reader", "x": "Twit", "y": "Class", - "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data/modified_data" + "data_path": "{DOWNLOADS_PATH}/sentiment_twitter_data" }, "dataset_iterator": { "class_name": "basic_classification_iterator", @@ -62,8 +62,8 @@ ], "main": true, "class_name": "keras_classification_model", - "save_path": "{MODEL_PATH}/new_model", - "load_path": "", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", "embedding_size": "#my_embedder.dim", "n_classes": "#classes_vocab.len", "kernel_sizes_cnn": [ @@ -73,7 +73,7 @@ ], "filters_cnn": 256, "optimizer": "Adam", - "learning_rate": 0.01, + "learning_rate": 0.1, "learning_rate_decay": 0.01, "loss": "binary_crossentropy", "last_layer_activation": "softmax", @@ -100,7 +100,7 @@ ] }, "train": { 
- "epochs": 10, + "epochs": 100, "batch_size": 64, "metrics": [ "accuracy", @@ -174,4 +174,4 @@ } ] } -} \ No newline at end of file +} diff --git a/deeppavlov/core/common/logging/logging_class.py b/deeppavlov/core/common/logging/logging_class.py new file mode 100644 index 0000000000..7ec6c9bac9 --- /dev/null +++ b/deeppavlov/core/common/logging/logging_class.py @@ -0,0 +1,141 @@ +# Copyright 2022 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import datetime +from itertools import islice +from abc import ABC, abstractmethod +from typing import List, Tuple +from logging import getLogger + +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.core.trainers.nn_trainer import NNTrainer + + +log = getLogger(__name__) + + +class TrainLogger(ABC): + """An abstract class for logging metrics during training process.""" + + def get_report( + self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None + ) -> dict: + """ " + Get report about current process. + for 'valid' type, 'get_report' function also saves best score on validation data, and the model parameters corresponding to the best score. + + Args: + nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + type : if "train" returns report about training process, "valid" returns report about validation process. + + Returns: + dict contains data about current 'type' process. 
+ + """ + if type == "train": + if nn_trainer.log_on_k_batches == 0: + report = { + "time_spent": str( + datetime.timedelta( + seconds=round(time.time() - nn_trainer.start_time + 0.5) + ) + ) + } + else: + data = islice( + iterator.gen_batches( + nn_trainer.batch_size, data_type="train", shuffle=True + ), + nn_trainer.log_on_k_batches, + ) + report = nn_trainer.test( + data, nn_trainer.train_metrics, start_time=nn_trainer.start_time + ) + + report.update( + { + "epochs_done": nn_trainer.epoch, + "batches_seen": nn_trainer.train_batches_seen, + "train_examples_seen": nn_trainer.examples, + } + ) + + metrics: List[Tuple[str, float]] = list( + report.get("metrics", {}).items() + ) + list(nn_trainer.last_result.items()) + + report.update(nn_trainer.last_result) + if nn_trainer.losses: + report["loss"] = sum(nn_trainer.losses) / len(nn_trainer.losses) + nn_trainer.losses.clear() + metrics.append(("loss", report["loss"])) + + elif type == "valid": + report = nn_trainer.test( + iterator.gen_batches( + nn_trainer.batch_size, data_type="valid", shuffle=False + ), + start_time=nn_trainer.start_time, + ) + + report["epochs_done"] = nn_trainer.epoch + report["batches_seen"] = nn_trainer.train_batches_seen + report["train_examples_seen"] = nn_trainer.examples + + metrics = list(report["metrics"].items()) + + m_name, score = metrics[0] + + # Update the patience + if nn_trainer.score_best is None: + nn_trainer.patience = 0 + else: + if nn_trainer.improved(score, nn_trainer.score_best): + nn_trainer.patience = 0 + else: + nn_trainer.patience += 1 + + # Run the validation model-saving logic + if nn_trainer._is_initial_validation(): + log.info("Initial best {} of {}".format(m_name, score)) + nn_trainer.score_best = score + elif nn_trainer._is_first_validation() and nn_trainer.score_best is None: + log.info("First best {} of {}".format(m_name, score)) + nn_trainer.score_best = score + log.info("Saving model") + nn_trainer.save() + elif nn_trainer.improved(score, nn_trainer.score_best): + log.info("Improved best {} of {}".format(m_name, score)) + nn_trainer.score_best = score + log.info("Saving model") + nn_trainer.save() + else: + log.info( + "Did not improve on the {} of {}".format( + m_name, nn_trainer.score_best + ) + ) + + report["impatience"] = nn_trainer.patience + if nn_trainer.validation_patience > 0: + report["patience_limit"] = nn_trainer.validation_patience + + nn_trainer.validation_number += 1 + return report + + @abstractmethod + def __call__() -> None: + raise NotImplementedError diff --git a/deeppavlov/core/common/logging/std_logger.py b/deeppavlov/core/common/logging/std_logger.py new file mode 100644 index 0000000000..1a828e4c5e --- /dev/null +++ b/deeppavlov/core/common/logging/std_logger.py @@ -0,0 +1,54 @@ +from typing import Dict +from logging import getLogger +import json + +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.core.trainers.nn_trainer import NNTrainer +from deeppavlov.core.trainers.utils import NumpyArrayEncoder +from deeppavlov.core.common.logging.logging_class import TrainLogger + +log = getLogger(__name__) + + +class StdLogger(TrainLogger): + """ + StdLogger class for logging report about current training and validation processes to stdout. + + Args: + stdlogging (bool): if True, log report to stdout. + the object of this class with stdlogging = False can be used for validation process. 
+ + """ + + def __init__(self, stdlogging: bool = True) -> None: + self.stdlogging = stdlogging + + def __call__( + self, + nn_trainer: NNTrainer, + iterator: DataLearningIterator, + type: str = None, + report: Dict = None, + ) -> dict: + """ + override call method, to log report to stdout. + + Args: + nn_trainer: NNTrainer object contains parameters required for preparing report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation. + type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. + report: dictionary contains current process information, if None, use 'get_report' method to get this report. + + Returns: + dict contains logged data to stdout. + + """ + if report is None: + report = self.get_report( + nn_trainer=nn_trainer, iterator=iterator, type=type + ) + if self.stdlogging: + log.info( + json.dumps({type: report}, ensure_ascii=False, cls=NumpyArrayEncoder) + ) + return report diff --git a/deeppavlov/core/common/logging/tensorboard_logger.py b/deeppavlov/core/common/logging/tensorboard_logger.py new file mode 100644 index 0000000000..f461712235 --- /dev/null +++ b/deeppavlov/core/common/logging/tensorboard_logger.py @@ -0,0 +1,87 @@ +from pathlib import Path +from typing import List, Tuple, Optional, Dict +from logging import getLogger + +import tensorflow as tf + +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.core.trainers.nn_trainer import NNTrainer +from deeppavlov.core.common.logging.logging_class import TrainLogger + +log = getLogger(__name__) + + +class TensorboardLogger(TrainLogger): + """ + TensorboardLogger class for logging to tesnorboard. + + Args: + log_dir (Path): path to local folder to log data into. + + """ + + def __init__(self, log_dir: Path = None) -> None: + self.train_log_dir = str(log_dir / "train_log") + self.valid_log_dir = str(log_dir / "valid_log") + self.tb_train_writer = tf.summary.FileWriter(self.train_log_dir) + self.tb_valid_writer = tf.summary.FileWriter(self.valid_log_dir) + + def __call__( + self, + nn_trainer: NNTrainer, + iterator: DataLearningIterator, + type: str = None, + tensorboard_tag: Optional[str] = None, + tensorboard_index: Optional[int] = None, + report: Dict = None, + ) -> dict: + """ + override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. + for 'valid' logging type, log metrics of validation process to log_dir/valid_log. + + Args: + nn_trainer: NNTrainer object contains parameters required for preparing the report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. + tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' + tensorboard_index: one of two options: 'train_batches_seen', 'epoch' corresponding to 'tensorboard_tag' types respectively. + report: dictionary contains current process information, if None, use 'get_report' method to get this report. + + Returns: + dict contains metrics logged to tesnorboard. 
+ + """ + if report is None: + report = self.get_report( + nn_trainer=nn_trainer, iterator=iterator, type=type + ) + + if type == "train": + metrics: List[Tuple[str, float]] = list( + report.get("metrics", {}).items() + ) + list(nn_trainer.last_result.items()) + if report.get("loss", None) is not None: + metrics.append(("loss", report["loss"])) + + if metrics and self.train_log_dir is not None: + summary = nn_trainer._tf.Summary() + + for name, score in metrics: + summary.value.add( + tag=f"{tensorboard_tag}/{name}", simple_value=score + ) + self.tb_train_writer.add_summary(summary, tensorboard_index) + self.tb_train_writer.flush() + else: + metrics = list(report["metrics"].items()) + if tensorboard_tag is not None and self.valid_log_dir is not None: + summary = nn_trainer._tf.Summary() + for name, score in metrics: + summary.value.add( + tag=f"{tensorboard_tag}/{name}", simple_value=score + ) + if tensorboard_index is None: + tensorboard_index = nn_trainer.train_batches_seen + self.tb_valid_writer.add_summary(summary, tensorboard_index) + self.tb_valid_writer.flush() + return report diff --git a/deeppavlov/core/common/logging/wandb_logger.py b/deeppavlov/core/common/logging/wandb_logger.py new file mode 100644 index 0000000000..c3cf4cca02 --- /dev/null +++ b/deeppavlov/core/common/logging/wandb_logger.py @@ -0,0 +1,120 @@ +import time +import datetime +from typing import Dict +from logging import getLogger + +import wandb + +from deeppavlov.core.data.data_learning_iterator import DataLearningIterator +from deeppavlov.core.trainers.nn_trainer import NNTrainer +from deeppavlov.core.common.logging.logging_class import TrainLogger + + +log = getLogger(__name__) + + +class WandbLogger(TrainLogger): + """ + WandbLogger class for logging report about current training and validation processes to WandB during training. ("https://wandb.ai/site"). + + WandB is a central dashboard to keep track of your hyperparameters, system metrics, and predictions so you can compare models live, and share your findings. + WandB doesn't support more than one run concurrently, so logging will be on "epochs" or "batches" + If val_every_n_epochs > 0 or log_every_n_epochs > 0 in config file, logging to wandb will be on epochs. + Otherwise if val_every_n_batches > 0 or log_every_n_batches > 0 in config file, logging to wandb will be on batches. + if none of them, logging to wandb will be ignored. + + Args: + log_on (str): if "epochs": logging to wandb on epochs, if "batches: logging on batches. + commit_on_valid (bool): If False wandb.log just updates the current metrics dict with the row argument and metrics won't be saved until wandb.log is called with commit=True + to commit training and validation reports with the same steps, this argument is True if logging on validation required + **kwargs: arguments for wandb initialization, more info: https://docs.wandb.ai/ref/python/init + + """ + + @staticmethod + def login(API_Key: str = None, relogin: bool = True) -> bool: + """ " + static method to login to wandb account, if login or init to wandb failed, logging to wandb will be ignored. + + Args: + API_Key (str): authentication key. + relogin (bool): if True, force relogin if already logged in. + report(dict): dictionary contains current process information, if None, use 'get_report' method to get this report. + + Returns: + True if login and init processes succeed, otherwise False and logging to wandb will be ignored. 
+ + """ + try: + return wandb.login(key=API_Key, relogin=relogin) + except Exception as e: + log.warning(str(e) + ", logging to WandB will be ignored") + return False + + def __init__( + self, log_on: str = "epochs", commit_on_valid: bool = False, **kwargs + ) -> None: + self.log_on = log_on # "epochs","batches" + self.commit_on_valid = commit_on_valid + try: + wandb.init(**kwargs) + self.init_succeed = True + except Exception as e: + log.warning(str(e) + ", logging to WandB will be ignored") + self.init_succeed = False + + def __call__( + self, + nn_trainer: NNTrainer, + iterator: DataLearningIterator, + type: str = None, + report: Dict = None, + ): + """ " + Logging report of the training process to wandb. + + Args: + nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. + iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + report (dict): report for logging to WandB. If None, use 'get_report' method to get this report. + type (str) : process type, if "train" logs report about training process, else if "valid" logs report about validation process. + + Returns: + dict contains logged data to WandB. + + """ + if report is None: + report = self.get_report( + nn_trainer=nn_trainer, iterator=iterator, type=type + ) + + logging_type = type + "/" + for i in report.keys(): + if isinstance(report[i], dict): + for key, value in report[i].items(): + wandb.log({logging_type + key: value}, commit=False) + else: + if i == "time_spent": + t = time.strptime(report[i], "%H:%M:%S") + y_seconds = int( + datetime.timedelta( + hours=t.tm_hour, minutes=t.tm_min, seconds=t.tm_sec + ).total_seconds() + ) + wandb.log({logging_type + i + ("(s)"): y_seconds}, commit=False) + else: + wandb.log({logging_type + i: report[i]}, commit=False) + + # if "val_every_n_epochs" is not None, we have to commit data on validation logging, otherwise on training. + if (self.commit_on_valid and logging_type == "valid/") or ( + not self.commit_on_valid and logging_type == "train/" + ): + wandb.log({}, commit=True) + + return report + + @staticmethod + def close(): + """close function to commit the not commited logs and to mark a run as finished wiht wanb.finish method, and finishes uploading all data.""" + wandb.log({}, commit=True) + wandb.finish() diff --git a/deeppavlov/core/common/logging_class.py b/deeppavlov/core/common/logging_class.py deleted file mode 100644 index 504be8b66d..0000000000 --- a/deeppavlov/core/common/logging_class.py +++ /dev/null @@ -1,378 +0,0 @@ -# Copyright 2019 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import time -from pathlib import Path -import datetime -from itertools import islice -from abc import ABC, abstractmethod -from typing import List, Tuple, Optional, Dict -from logging import getLogger - -import tensorflow as tf -import wandb - -from deeppavlov.core.trainers.utils import NumpyArrayEncoder -from deeppavlov.core.data.data_learning_iterator import DataLearningIterator -from deeppavlov.core.trainers.nn_trainer import NNTrainer - -log = getLogger(__name__) - - -class TrainLogger(ABC): - """An abstract class for logging metrics during training process. - - There are three types of logging: - - StdLogger: for logging report about current training and validation processes to stdout. - - TensorboardLogger: for logging to tensorboard. - - WandbLogger: for logging to WandB. - - """ - - @abstractmethod - def __init__() -> None: - """ - The constructor for TrainLogger class. - - """ - raise NotImplementedError - - @abstractmethod - def get_report(self, - nn_trainer: NNTrainer, - iterator: DataLearningIterator, type: str = None) -> dict: - """" - Get report about current process. - for 'valid' type, 'get_report' function also saves best score on validation data, and the model parameters corresponding to the best score. - - Args: - nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. - iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation - type : if "train" returns report about training process, "valid" returns report about validation process. - - Returns: - dict contains data about current 'type' process. - - """ - if type == "train": - if nn_trainer.log_on_k_batches == 0: - report = { - "time_spent": str( - datetime.timedelta( - seconds=round( - time.time() - nn_trainer.start_time + 0.5) - ) - ) - } - else: - data = islice( - iterator.gen_batches( - nn_trainer.batch_size, data_type="train", shuffle=True - ), - nn_trainer.log_on_k_batches, - ) - report = nn_trainer.test( - data, nn_trainer.train_metrics, start_time=nn_trainer.start_time - ) - - report.update( - { - "epochs_done": nn_trainer.epoch, - "batches_seen": nn_trainer.train_batches_seen, - "train_examples_seen": nn_trainer.examples, - } - ) - - metrics: List[Tuple[str, float]] = list( - report.get("metrics", {}).items() - ) + list(nn_trainer.last_result.items()) - - report.update(nn_trainer.last_result) - if nn_trainer.losses: - report["loss"] = sum(nn_trainer.losses) / \ - len(nn_trainer.losses) - nn_trainer.losses.clear() - metrics.append(("loss", report["loss"])) - - elif type == "valid": - report = nn_trainer.test( - iterator.gen_batches( - nn_trainer.batch_size, data_type="valid", shuffle=False - ), - start_time=nn_trainer.start_time, - ) - - report["epochs_done"] = nn_trainer.epoch - report["batches_seen"] = nn_trainer.train_batches_seen - report["train_examples_seen"] = nn_trainer.examples - - metrics = list(report["metrics"].items()) - - m_name, score = metrics[0] - - # Update the patience - if nn_trainer.score_best is None: - nn_trainer.patience = 0 - else: - if nn_trainer.improved(score, nn_trainer.score_best): - nn_trainer.patience = 0 - else: - nn_trainer.patience += 1 - - # Run the validation model-saving logic - if nn_trainer._is_initial_validation(): - log.info("Initial best {} of {}".format(m_name, score)) - nn_trainer.score_best = score - elif nn_trainer._is_first_validation() and nn_trainer.score_best is None: - log.info("First best {} of {}".format(m_name, score)) - nn_trainer.score_best = score - 
log.info("Saving model") - nn_trainer.save() - elif nn_trainer.improved(score, nn_trainer.score_best): - log.info("Improved best {} of {}".format(m_name, score)) - nn_trainer.score_best = score - log.info("Saving model") - nn_trainer.save() - else: - log.info( - "Did not improve on the {} of {}".format( - m_name, nn_trainer.score_best - ) - ) - - report["impatience"] = nn_trainer.patience - if nn_trainer.validation_patience > 0: - report["patience_limit"] = nn_trainer.validation_patience - - nn_trainer.validation_number += 1 - return report - - @abstractmethod - def __call__() -> None: - raise NotImplementedError - - -class StdLogger(TrainLogger): - """ - StdLogger class for logging report about current training and validation processes to stdout. - - Args: - stdlogging (bool): if True, log report to stdout. - the object of this class with stdlogging = False can be used for validation process. - - """ - - def __init__(self, stdlogging: bool = True) -> None: - self.stdlogging = stdlogging - - def get_report(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None) -> dict: - return super().get_report(nn_trainer=nn_trainer, iterator=iterator, type=type) - - def __call__(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None, report: Dict = None) -> dict: - """ - override call method, to log report to stdout. - - Args: - nn_trainer: NNTrainer object contains parameters required for preparing report. - iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation. - type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. - report: dictionary contains current process information, if None, use 'get_report' method to get this report. - - Returns: - dict contains logged data to stdout. - - """ - if report is None: - report = self.get_report( - nn_trainer=nn_trainer, iterator=iterator, type=type) - if self.stdlogging: - log.info(json.dumps({type: report}, - ensure_ascii=False, cls=NumpyArrayEncoder)) - return report - - -class TensorboardLogger(TrainLogger): - """ - TensorboardLogger class for logging to tesnorboard. - - Args: - log_dir (Path): path to local folder to log data into. - - """ - - def __init__(self, log_dir: Path = None) -> None: - self.train_log_dir = str(log_dir / 'train_log') - self.valid_log_dir = str(log_dir / 'valid_log') - self.tb_train_writer = tf.summary.FileWriter(self.train_log_dir) - self.tb_valid_writer = tf.summary.FileWriter(self.valid_log_dir) - - def get_report(self, nn_trainer, iterator: DataLearningIterator, type: str = None): - return super().get_report(nn_trainer, iterator, type=type) - - def __call__(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None, tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None, report: Dict = None) -> dict: - """ - override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. - for 'valid' logging type, log metrics of validation process to log_dir/valid_log. - - Args: - nn_trainer: NNTrainer object contains parameters required for preparing the report. - iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation - type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. 
- tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' - tensorboard_index: one of two options: 'train_batches_seen', 'epoch' corresponding to 'tensorboard_tag' types respectively. - report: dictionary contains current process information, if None, use 'get_report' method to get this report. - - Returns: - dict contains metrics logged to tesnorboard. - - """ - if report is None: - report = self.get_report( - nn_trainer=nn_trainer, iterator=iterator, type=type) - - if type == "train": - metrics: List[Tuple[str, float]] = list( - report.get("metrics", {}).items() - ) + list(nn_trainer.last_result.items()) - if report.get("loss", None) is not None: - metrics.append(("loss", report["loss"])) - - if metrics and self.train_log_dir is not None: - summary = nn_trainer._tf.Summary() - - for name, score in metrics: - summary.value.add( - tag=f"{tensorboard_tag}/{name}", simple_value=score - ) - self.tb_train_writer.add_summary(summary, tensorboard_index) - self.tb_train_writer.flush() - else: - metrics = list(report["metrics"].items()) - if tensorboard_tag is not None and self.valid_log_dir is not None: - summary = nn_trainer._tf.Summary() - for name, score in metrics: - summary.value.add( - tag=f'{tensorboard_tag}/{name}', simple_value=score) - if tensorboard_index is None: - tensorboard_index = nn_trainer.train_batches_seen - self.tb_valid_writer.add_summary(summary, tensorboard_index) - self.tb_valid_writer.flush() - return report - - -class WandbLogger(TrainLogger): - """ - WandbLogger class for logging report about current training and validation processes to WandB during training. ("https://wandb.ai/site"). - - WandB is a central dashboard to keep track of your hyperparameters, system metrics, and predictions so you can compare models live, and share your findings. - WandB doesn't support more than one run concurrently, so logging will be on "epochs" or "batches" - If val_every_n_epochs > 0 or log_every_n_epochs > 0 in config file, logging to wandb will be on epochs. - Otherwise if val_every_n_batches > 0 or log_every_n_batches > 0 in config file, logging to wandb will be on batches. - if none of them, logging to wandb will be ignored. - - Args: - log_on (str): if "epochs": logging to wandb on epochs, if "batches: logging on batches. - commit_on_valid (bool): If False wandb.log just updates the current metrics dict with the row argument and metrics won't be saved until wandb.log is called with commit=True - to commit training and validation reports with the same steps, this argument is True if logging on validation required - **kwargs: arguments for wandb initialization, more info: https://docs.wandb.ai/ref/python/init - - """ - - @staticmethod - def login(API_Key: str = None, relogin: bool = True) -> bool: - """" - static method to login to wandb account, if login or init to wandb failed, logging to wandb will be ignored. - - Args: - API_Key (str): authentication key. - relogin (bool): if True, force relogin if already logged in. - report(dict): dictionary contains current process information, if None, use 'get_report' method to get this report. - - Returns: - True if login and init processes succeed, otherwise False and logging to wandb will be ignored. 
- - """ - try: - return wandb.login(key=API_Key, relogin=relogin) - except Exception as e: - log.warning(str(e)+", logging to WandB will be ignored") - return False - - def get_report(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None) -> dict: - return super().get_report(nn_trainer=nn_trainer, iterator=iterator, type=type) - - def __init__(self, log_on: str = "epochs", commit_on_valid: bool = False, **kwargs) -> None: - self.log_on = log_on # "epochs","batches" - self.commit_on_valid = commit_on_valid - try: - wandb.init(**kwargs) - self.init_succeed = True - except Exception as e: - log.warning(str(e)+", logging to WandB will be ignored") - self.init_succeed = False - - def __call__(self, nn_trainer: NNTrainer, - iterator: DataLearningIterator, - type: str = None, - report: Dict = None - ): - """ " - Logging report of the training process to wandb. - - Args: - nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. - iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation - report (dict): report for logging to WandB. If None, use 'get_report' method to get this report. - type (str) : process type, if "train" logs report about training process, else if "valid" logs report about validation process. - - Returns: - dict contains logged data to WandB. - - """ - if report is None: - report = self.get_report( - nn_trainer=nn_trainer, iterator=iterator, type=type) - - logging_type = type + "/" - for i in report.keys(): - if isinstance(report[i], dict): - for key, value in report[i].items(): - wandb.log( - {logging_type+key: value}, commit=False) - else: - if i == "time_spent": - t = time.strptime(report[i], "%H:%M:%S") - y_seconds = int( - datetime.timedelta( - hours=t.tm_hour, minutes=t.tm_min, seconds=t.tm_sec - ).total_seconds() - ) - wandb.log({logging_type+i+("(s)"): y_seconds}, - commit=False) - else: - wandb.log( - {logging_type+i: report[i]}, commit=False) - - # if "val_every_n_epochs" is not None, we have to commit data on validation logging, otherwise on training. - if (self.commit_on_valid and logging_type == "valid/") or (not self.commit_on_valid and logging_type == "train/"): - wandb.log({}, commit=True) - - return report - - @staticmethod - def close(): - """close function to commit the not commited logs and to mark a run as finished wiht wanb.finish method, and finishes uploading all data.""" - wandb.log({}, commit= True) - wandb.finish() diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index c8b6d2af38..da7aa7686d 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -51,18 +51,6 @@ class FitTrainer: in evaluation logs (default is ``False``) logger : list of dictionaries of possible loggers from deeppavlov.configs files. (default is ``None``) - Possible loggers: - - TensorboardLogger: for logging to tesnorboard. Keys: - "name": "TensorboardLogger", logging to tensorboard will be ignored if None - "log_dir":str or path to a directory where tensorboard logs can be stored, ignored if None - (default is ``None``) - - StdLogger: for logging report about current training and validation processes to stdout. Keys: - "name": "StdLogger". logging to stdout will be ignored if None. (default is ``None``) - - WandbLogger: logging report about current training and validation processes to WandB. Keys: - "name": "WandbLogger", logging to wandb will be ignored if None. 
- "API_Key": API of 40 characters long from 'https://wandb.ai/home' personal account. - "init": dictionary of (key:value) for wandb.init configurations. see: 'https://docs.wandb.ai/ref/python/init' - (default is ``None``) max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative (default is ``-1``) **kwargs: additional parameters whose names will be logged but otherwise ignored @@ -105,6 +93,8 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, self.tensorboard_idx, self.stdlogger_idx, self.wandblogger_idx = None, None, None log.warning( "Check logger dictionary in configs, logging will be ignored") + if self.tensorboard_idx is None and self.wandblogger_idx is None: + self.stdlogger_idx = 1 if self.tensorboard_idx is not None: try: # noinspection PyPackageRequirements diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index 3f6e967faa..d706a8a434 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -26,7 +26,7 @@ log = getLogger(__name__) -@register('nn_trainer') +@register("nn_trainer") class NNTrainer(FitTrainer): """ | Bases :class:`~deeppavlov.core.trainers.FitTrainer` @@ -85,47 +85,67 @@ class NNTrainer(FitTrainer): """ - def __init__(self, chainer_config: dict, *, - batch_size: int = 1, - epochs: int = -1, - start_epoch_num: int = 0, - max_batches: int = -1, - metrics: Iterable[Union[str, dict]] = ('accuracy',), - train_metrics: Optional[Iterable[Union[str, dict]]] = None, - metric_optimization: str = 'maximize', - evaluation_targets: Iterable[str] = ('valid', 'test'), - show_examples: bool = False, - logger: Optional[List[Dict]] = None, - max_test_batches: int = -1, - validate_first: bool = True, - validation_patience: int = 5, val_every_n_epochs: int = -1, val_every_n_batches: int = -1, - log_every_n_batches: int = -1, log_every_n_epochs: int = -1, log_on_k_batches: int = 1, - - **kwargs) -> None: - super().__init__(chainer_config, batch_size=batch_size, metrics=metrics, evaluation_targets=evaluation_targets, - show_examples=show_examples, - logger=logger, - max_test_batches=max_test_batches, **kwargs) + def __init__( + self, + chainer_config: dict, + *, + batch_size: int = 1, + epochs: int = -1, + start_epoch_num: int = 0, + max_batches: int = -1, + metrics: Iterable[Union[str, dict]] = ("accuracy",), + train_metrics: Optional[Iterable[Union[str, dict]]] = None, + metric_optimization: str = "maximize", + evaluation_targets: Iterable[str] = ("valid", "test"), + show_examples: bool = False, + logger: Optional[List[Dict]] = None, + max_test_batches: int = -1, + validate_first: bool = True, + validation_patience: int = 5, + val_every_n_epochs: int = -1, + val_every_n_batches: int = -1, + log_every_n_batches: int = -1, + log_every_n_epochs: int = -1, + log_on_k_batches: int = 1, + **kwargs, + ) -> None: + super().__init__( + chainer_config, + batch_size=batch_size, + metrics=metrics, + evaluation_targets=evaluation_targets, + show_examples=show_examples, + logger=logger, + max_test_batches=max_test_batches, + **kwargs, + ) if train_metrics is None: self.train_metrics = self.metrics else: self.train_metrics = parse_metrics( - train_metrics, self._chainer.in_y, self._chainer.out_params) + train_metrics, self._chainer.in_y, self._chainer.out_params + ) metric_optimization = metric_optimization.strip().lower() self.score_best = None def _improved(op): - return lambda score, baseline: False if baseline is None or score is None \ + return ( + lambda 
score, baseline: False + if baseline is None or score is None else op(score, baseline) + ) - if metric_optimization == 'maximize': + if metric_optimization == "maximize": self.improved = _improved(lambda a, b: a > b) - elif metric_optimization == 'minimize': + elif metric_optimization == "minimize": self.improved = _improved(lambda a, b: a < b) else: - raise ConfigError('metric_optimization has to be one of {}'.format( - ['maximize', 'minimize'])) + raise ConfigError( + "metric_optimization has to be one of {}".format( + ["maximize", "minimize"] + ) + ) self.validate_first = validate_first self.validate_ = StdLogger(self.stdlogger_idx is not None) @@ -153,27 +173,36 @@ def _improved(op): if self.tensorboard_idx is not None: self.tensorboard_logger = TensorboardLogger( - log_dir=self.logger[self.tensorboard_idx]["log_dir"]) + log_dir=self.logger[self.tensorboard_idx]["log_dir"] + ) if self.wandblogger_idx is not None: - if WandbLogger.login(API_Key = - self.logger[self.wandblogger_idx].get("API_Key", None), relogin = True): + if WandbLogger.login( + API_Key=self.logger[self.wandblogger_idx].get("API_Key", None), + relogin=True, + ): if self.log_every_n_epochs > 0 or self.val_every_n_epochs > 0: self.wandb_logger = WandbLogger( - log_on="epochs", commit_on_valid=self.val_every_n_epochs > 0, **self.logger[self.wandblogger_idx].get("init", None)) + log_on="epochs", + commit_on_valid=self.val_every_n_epochs > 0, + **self.logger[self.wandblogger_idx].get("init", None), + ) if self.wandb_logger.init_succeed == False: - self.wandblogger_idx = None + self.wandblogger_idx = None elif self.log_every_n_batches > 0 or self.val_every_n_batches > 0: self.wandb_logger = WandbLogger( - log_on="batches", commit_on_valid=self.val_every_n_batches > 0, **self.logger[self.wandblogger_idx].get("init", None)) + log_on="batches", + commit_on_valid=self.val_every_n_batches > 0, + **self.logger[self.wandblogger_idx].get("init", None), + ) if self.wandb_logger.init_succeed == False: - self.wandblogger_idx = None + self.wandblogger_idx = None else: self.wandblogger_idx = None def save(self) -> None: if self._loaded: - raise RuntimeError('Cannot save already finalized chainer') + raise RuntimeError("Cannot save already finalized chainer") self._chainer.save() @@ -185,10 +214,12 @@ def _is_first_validation(self): def _send_event(self, event_name: str, data: Optional[dict] = None) -> None: report = { - 'time_spent': str(datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5))), - 'epochs_done': self.epoch, - 'batches_seen': self.train_batches_seen, - 'train_examples_seen': self.examples + "time_spent": str( + datetime.timedelta(seconds=round(time.time() - self.start_time + 0.5)) + ), + "epochs_done": self.epoch, + "batches_seen": self.train_batches_seen, + "train_examples_seen": self.examples, } if data is not None: report.update(data) @@ -200,67 +231,94 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: if self.validate_first: self._send_event(event_name="before_validation") report = self.validate_(self, iterator, "valid") - self._send_event(event_name='after_validation', data=report) + self._send_event(event_name="after_validation", data=report) while True: impatient = False - self._send_event(event_name='before_train') - for x, y_true in iterator.gen_batches(self.batch_size, data_type='train'): + self._send_event(event_name="before_train") + for x, y_true in iterator.gen_batches(self.batch_size, data_type="train"): self.last_result = self._chainer.train_on_batch(x, y_true) if 
self.last_result is None: self.last_result = {} elif not isinstance(self.last_result, dict): - self.last_result = {'loss': self.last_result} - if 'loss' in self.last_result: - self.losses.append(self.last_result.pop('loss')) + self.last_result = {"loss": self.last_result} + if "loss" in self.last_result: + self.losses.append(self.last_result.pop("loss")) self.train_batches_seen += 1 self.examples += len(x) - if self.log_every_n_batches > 0 and self.train_batches_seen % self.log_every_n_batches == 0: + if ( + self.log_every_n_batches > 0 + and self.train_batches_seen % self.log_every_n_batches == 0 + ): self._send_event(event_name="before_log") report = None if self.stdlogger_idx is not None: report = self.std_logger( - self, iterator, type="train", report=report) + self, iterator, type="train", report=report + ) if self.tensorboard_idx is not None: report = self.tensorboard_logger( - self, iterator, type="train", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report) - - if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": + self, + iterator, + type="train", + tensorboard_tag="every_n_batches", + tensorboard_index=self.train_batches_seen, + report=report, + ) + + if ( + self.wandblogger_idx is not None + and self.wandb_logger.log_on == "batches" + ): report = self.wandb_logger( - self, iterator=iterator, type="train", report=report) + self, iterator=iterator, type="train", report=report + ) # empty report if no logging method. - self._send_event(event_name='after_train_log', data=report) + self._send_event(event_name="after_train_log", data=report) - if self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0: - self._send_event(event_name='before_validation') + if ( + self.val_every_n_batches > 0 + and self.train_batches_seen % self.val_every_n_batches == 0 + ): + self._send_event(event_name="before_validation") report = None if self.stdlogger_idx is not None: report = self.std_logger( - self, iterator, type="valid", report=report) + self, iterator, type="valid", report=report + ) if self.tensorboard_idx is not None: report = self.tensorboard_logger( - self, iterator, type="valid", tensorboard_tag='every_n_batches', tensorboard_index=self.train_batches_seen, report=report) - - if self.wandblogger_idx is not None and self.wandb_logger.log_on == "batches": + self, + iterator, + type="valid", + tensorboard_tag="every_n_batches", + tensorboard_index=self.train_batches_seen, + report=report, + ) + + if ( + self.wandblogger_idx is not None + and self.wandb_logger.log_on == "batches" + ): report = self.wandb_logger( - self, iterator, type="valid", report=report) + self, iterator, type="valid", report=report + ) - self._send_event( - event_name='after_validation', data=report) + self._send_event(event_name="after_validation", data=report) - self._send_event(event_name='after_batch') + self._send_event(event_name="after_batch") if 0 < self.max_batches <= self.train_batches_seen: impatient = True break if 0 < self.validation_patience <= self.patience: - log.info('Ran out of patience') + log.info("Ran out of patience") impatient = True break @@ -268,68 +326,100 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: break self.epoch += 1 - if self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0: + if ( + self.log_every_n_epochs > 0 + and self.epoch % self.log_every_n_epochs == 0 + ): self._send_event(event_name="before_log") report = None if self.stdlogger_idx is not 
None: report = self.std_logger( - self, iterator=iterator, type="train", report=report) + self, iterator=iterator, type="train", report=report + ) - if self.wandblogger_idx is not None and self.wandb_logger.log_on == "epochs": + if ( + self.wandblogger_idx is not None + and self.wandb_logger.log_on == "epochs" + ): report = self.wandb_logger( - self, iterator, type="train", report=report) + self, iterator, type="train", report=report + ) if self.tensorboard_idx is not None: report = self.tensorboard_logger( - self, iterator=iterator, type="train", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report) - - self._send_event(event_name='after_train_log', data=report) - - if self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0: - self._send_event(event_name='before_validation') + self, + iterator=iterator, + type="train", + tensorboard_tag="every_n_epochs", + tensorboard_index=self.epoch, + report=report, + ) + + self._send_event(event_name="after_train_log", data=report) + + if ( + self.val_every_n_epochs > 0 + and self.epoch % self.val_every_n_epochs == 0 + ): + self._send_event(event_name="before_validation") report = None if self.stdlogger_idx is not None: report = self.std_logger( - self, iterator, type="valid", report=report) + self, iterator, type="valid", report=report + ) if self.tensorboard_idx is not None: report = self.tensorboard_logger( - self, iterator=iterator, type="valid", tensorboard_tag='every_n_epochs', tensorboard_index=self.epoch, report=report) - - if self.wandblogger_idx is not None and self.wandb_logger.log_on == "epochs": + self, + iterator=iterator, + type="valid", + tensorboard_tag="every_n_epochs", + tensorboard_index=self.epoch, + report=report, + ) + + if ( + self.wandblogger_idx is not None + and self.wandb_logger.log_on == "epochs" + ): report = self.wandb_logger( - self, iterator, type="valid", report=report) + self, iterator, type="valid", report=report + ) - self._send_event(event_name='after_validation', data=report) + self._send_event(event_name="after_validation", data=report) - self._send_event(event_name='after_epoch') + self._send_event(event_name="after_epoch") if 0 < self.max_epochs <= self.epoch: break if 0 < self.validation_patience <= self.patience: - log.info('Ran out of patience') + log.info("Ran out of patience") break def train(self, iterator: DataLearningIterator) -> None: """Call :meth:`~fit_chainer` and then :meth:`~train_on_batches` with provided data iterator as an argument""" self.fit_chainer(iterator) - if callable(getattr(self._chainer, 'train_on_batch', None)): + if callable(getattr(self._chainer, "train_on_batch", None)): try: self.train_on_batches(iterator) if self.wandblogger_idx is not None: self.wandb_logger.close() except KeyboardInterrupt: - log.info('Stopped training') + log.info("Stopped training") else: log.warning( - f'Using {self.__class__.__name__} for a pipeline without batched training') + f"Using {self.__class__.__name__} for a pipeline without batched training" + ) # Run the at-train-exit model-saving logic if self.validation_number < 1: - log.info('Save model to capture early training results') + log.info("Save model to capture early training results") self.save() -from deeppavlov.core.common.logging_class import TensorboardLogger, StdLogger, WandbLogger + +from deeppavlov.core.common.logging.wandb_logger import WandbLogger +from deeppavlov.core.common.logging.std_logger import StdLogger +from deeppavlov.core.common.logging.tensorboard_logger import TensorboardLogger diff 
--git a/deeppavlov/requirements/wandb.txt b/deeppavlov/requirements/wandb.txt new file mode 100644 index 0000000000..9983212701 --- /dev/null +++ b/deeppavlov/requirements/wandb.txt @@ -0,0 +1,3 @@ +wandb==0.12.7 +pybind11==2.2 +fasttext \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index b3a4f11237..7454c1a691 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -193,7 +193,7 @@ autodoc_mock_imports = ['bert_dp', 'bs4', 'faiss', 'fastText', 'fasttext', 'gensim', 'hdt', 'kenlm', 'librosa', 'lxml', 'nemo', 'nemo_asr', 'nemo_tts', 'nltk', 'opt_einsum', 'rapidfuzz', 'rasa', 'russian_tagsets', 'sacremoses', 'sortedcontainers', 'spacy', 'tensorflow', 'tensorflow_hub', - 'torch', 'transformers', 'udapi', 'ufal_udpipe', 'whapi', 'xeger'] + 'torch', 'transformers', 'udapi', 'ufal_udpipe','wandb', 'whapi', 'xeger'] extlinks = { 'config': (f'https://github.com/deepmipt/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None) diff --git a/requirements.txt b/requirements.txt index b6fa7bdf47..4708c92e6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,4 @@ tqdm==4.62.0 click==7.1.2 uvicorn==0.11.7 sacremoses==0.0.35 -uvloop==0.14.0 -wandb==0.12.7 -pybind11==2.2 -fasttext +uvloop==0.14.0 \ No newline at end of file From 145c311603c78820b83e6068ffaca27be237a99c Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Sat, 22 Jan 2022 13:17:32 +0300 Subject: [PATCH 16/18] make std logger default logging method --- deeppavlov/core/trainers/fit_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index da7aa7686d..89ed25091b 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -94,7 +94,7 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, log.warning( "Check logger dictionary in configs, logging will be ignored") if self.tensorboard_idx is None and self.wandblogger_idx is None: - self.stdlogger_idx = 1 + self.stdlogger_idx = 1 # make std logger default if self.tensorboard_idx is not None: try: # noinspection PyPackageRequirements From 6306c0145719a3f1b43b559feebb5da4e96bceab Mon Sep 17 00:00:00 2001 From: Ihab Asaad Date: Sun, 23 Jan 2022 11:44:36 +0300 Subject: [PATCH 17/18] Add logging methods to configuration.rst --- docs/intro/configuration.rst | 132 +++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/docs/intro/configuration.rst b/docs/intro/configuration.rst index 9f873c5e9c..c5818386a6 100644 --- a/docs/intro/configuration.rst +++ b/docs/intro/configuration.rst @@ -222,6 +222,138 @@ _______ | | Default value for ``inputs`` parameter is a concatenation of chainer's ``in_y`` and ``out`` parameters. +Logging data during training process +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +| Logging data is done following two steps: + +* Add at least one of the following arguments in configuration file with strictly positive integer value: + + - ``val_every_n_batches``: how often (in batches) to validate the pipeline. + + - ``val_every_n_epochs``: how often (in epochs) to validate the pipeline. + + - ``log_every_n_epochs``: how often (in epochs) to calculate metrics on train data. + + - ``log_every_n_batches``: how often (in batches) to calculate metrics on train data. + + Logging will be ignored for negative or zero. + + Example: + + .. 
code:: python

+        "train": {
+            "log_every_n_epochs": 3,
+            "val_every_n_batches": 2
+        }
+
+    With this configuration, metrics on train data are calculated every 3 epochs and the pipeline is validated every 2 batches, using the configured logging method.
+
+* Add the logging method:
+
+    The DeepPavlov library supports three types of logging:
+
+    - StdLogger: for logging data about the current training and validation processes to stdout.
+
+        To use this logger, add a ``logger`` list containing a dictionary with ``name``: ``StdLogger`` to the configuration file.
+        For example:
+
+        .. code:: python
+
+            "train": {
+                "logger": [
+                    {
+                        "name": "StdLogger"
+                    }
+                ],
+                ...
+            }
+
+    - TensorboardLogger: for logging data to TensorBoard, stored in a local folder.
+
+        To use this logger, add the logger name along with a local directory path.
+
+        For example:
+
+        .. code:: python
+
+            "train": {
+                "logger": [
+                    {
+                        "name": "TensorboardLogger",
+                        "log_dir": "local_folder/Tensorboard_logs"
+                    }
+                ],
+                ...
+            }
+
+        In this case, training data will be stored in "local_folder/Tensorboard_logs/train_log",
+        and validation data in "local_folder/Tensorboard_logs/valid_log".
+
+        To visualize the training logs, use the following command line:
+
+        "tensorboard --logdir local_folder/Tensorboard_logs/train_log"
+
+    - WandbLogger: for logging data to the Weights & Biases platform in real time.
+
+        To use this logger, add the logger name along with an API key.
+
+        To get an API key:
+
+        Sign up at the wandb platform (https://wandb.ai/site) if you don't have an account, log in, go to Settings (upper right corner) and copy the API key.
+
+        To create a new run in W&B with specific configurations, add the ``init`` keyword with its configuration as a dictionary (see https://docs.wandb.ai/ref/python/init).
+
+        For example:
+
+        .. code:: python
+
+            "train": {
+                "logger": [
+                    {
+                        "name": "WandbLogger",
+                        "API_Key": "API of 40 characters long",
+                        "init": {
+                            "project": "project_name",
+                            "group": "group_name",
+                            "job_type": "job_type",
+                            "name": "run_name",
+                        },
+                        "config": {
+                            "learning_rate": 0.1,
+                        }
+                    }
+                ],
+                ...
+            }
+
+        Logging to W&B is done on epochs if ``log_every_n_epochs`` or ``val_every_n_epochs`` is set in the configuration file, otherwise on batches if ``log_every_n_batches`` or ``val_every_n_batches`` is set.
+
+        To view the run while training, follow the run link logged to stdout.
+
+    To use more than one logging method, add a dictionary for each of them, for example:
+
+    .. code:: python
+
+        "train": {
+            "logger": [
+                {
+                    "name": "TensorboardLogger",
+                    "log_dir": "local_folder/Tensorboard_logs"
+                },
+                {
+                    "name": "StdLogger"
+                }
+            ],
+            ...
+        }
+
+    The default logging method is ``StdLogger`` (used if ``logger`` is not provided in the configuration file); to disable logging, add ``logger`` with an empty list.
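For reference, the sketch below shows one way to launch training with a config that uses these loggers. It assumes a hypothetical config file ``my_config.json`` whose ``train`` section contains a ``logger`` list like the examples above, and relies on the standard DeepPavlov training entry point.

.. code:: python

    from deeppavlov import train_model

    # Train the pipeline described in my_config.json; during training, reports are
    # sent to every logger listed in the config's "train" section.
    model = train_model("my_config.json")

The same can be done from the command line with ``python -m deeppavlov train my_config.json``.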
+ + + + + DatasetReader ~~~~~~~~~~~~~ From 27a940b7150d313d603708e2eb06dd28afacd0ca Mon Sep 17 00:00:00 2001 From: Ihab-Asaad Date: Fri, 18 Feb 2022 21:00:22 +0300 Subject: [PATCH 18/18] delete loggers init from nn_trainer.py --- .../classifiers/sentiment_twitter.json | 2 +- .../core/common/logging/logging_class.py | 24 +- deeppavlov/core/common/logging/std_logger.py | 19 +- .../core/common/logging/tensorboard_logger.py | 43 ++-- .../core/common/logging/wandb_logger.py | 63 ++++-- deeppavlov/core/trainers/fit_trainer.py | 99 ++++----- deeppavlov/core/trainers/nn_trainer.py | 209 ++++-------------- deeppavlov/requirements/wandb.txt | 2 - requirements.txt | 2 +- 9 files changed, 173 insertions(+), 290 deletions(-) diff --git a/deeppavlov/configs/classifiers/sentiment_twitter.json b/deeppavlov/configs/classifiers/sentiment_twitter.json index 545ab38995..be1f205fe9 100644 --- a/deeppavlov/configs/classifiers/sentiment_twitter.json +++ b/deeppavlov/configs/classifiers/sentiment_twitter.json @@ -133,7 +133,7 @@ }, { "name": "WandbLogger", - "API_Key":"be5cac1976dae2abd87fd045a7a101248c0a0253", + "API_Key":"40-chars API KEY", "init":{ "project": "Tuning Hyperparameters", "group": "Tuning lr & lr_decay", diff --git a/deeppavlov/core/common/logging/logging_class.py b/deeppavlov/core/common/logging/logging_class.py index 7ec6c9bac9..8014607650 100644 --- a/deeppavlov/core/common/logging/logging_class.py +++ b/deeppavlov/core/common/logging/logging_class.py @@ -29,9 +29,7 @@ class TrainLogger(ABC): """An abstract class for logging metrics during training process.""" - def get_report( - self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None - ) -> dict: + def get_report(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None) -> dict: """ " Get report about current process. for 'valid' type, 'get_report' function also saves best score on validation data, and the model parameters corresponding to the best score. @@ -47,20 +45,11 @@ def get_report( """ if type == "train": if nn_trainer.log_on_k_batches == 0: - report = { - "time_spent": str( - datetime.timedelta( - seconds=round(time.time() - nn_trainer.start_time + 0.5) - ) - ) - } + report = {"time_spent": str(datetime.timedelta( + seconds=round(time.time() - nn_trainer.start_time + 0.5)))} else: - data = islice( - iterator.gen_batches( - nn_trainer.batch_size, data_type="train", shuffle=True - ), - nn_trainer.log_on_k_batches, - ) + data = islice(iterator.gen_batches(nn_trainer.batch_size, data_type="train", shuffle=True), + nn_trainer.log_on_k_batches,) report = nn_trainer.test( data, nn_trainer.train_metrics, start_time=nn_trainer.start_time ) @@ -139,3 +128,6 @@ def get_report( @abstractmethod def __call__() -> None: raise NotImplementedError + + def close(): + raise NotImplementedError diff --git a/deeppavlov/core/common/logging/std_logger.py b/deeppavlov/core/common/logging/std_logger.py index 1a828e4c5e..19e6a45677 100644 --- a/deeppavlov/core/common/logging/std_logger.py +++ b/deeppavlov/core/common/logging/std_logger.py @@ -17,19 +17,14 @@ class StdLogger(TrainLogger): Args: stdlogging (bool): if True, log report to stdout. the object of this class with stdlogging = False can be used for validation process. 
- + **kwargs: additional parameters whose names will be logged but otherwise ignored """ - def __init__(self, stdlogging: bool = True) -> None: + def __init__(self, stdlogging: bool = True, **kwargs) -> None: self.stdlogging = stdlogging - def __call__( - self, - nn_trainer: NNTrainer, - iterator: DataLearningIterator, - type: str = None, - report: Dict = None, - ) -> dict: + def __call__(self,nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None, report: Dict = None, + **kwargs) -> dict: """ override call method, to log report to stdout. @@ -38,7 +33,7 @@ def __call__( iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation. type : process type, if "train" logs report about training process, else if "valid" logs report about validation process. report: dictionary contains current process information, if None, use 'get_report' method to get this report. - + **kwargs: additional parameters whose names will be logged but otherwise ignored Returns: dict contains logged data to stdout. @@ -52,3 +47,7 @@ def __call__( json.dumps({type: report}, ensure_ascii=False, cls=NumpyArrayEncoder) ) return report + + @staticmethod + def close(): + log.info("Logging to Stdout completed") \ No newline at end of file diff --git a/deeppavlov/core/common/logging/tensorboard_logger.py b/deeppavlov/core/common/logging/tensorboard_logger.py index f461712235..dc99c57c4b 100644 --- a/deeppavlov/core/common/logging/tensorboard_logger.py +++ b/deeppavlov/core/common/logging/tensorboard_logger.py @@ -2,10 +2,11 @@ from typing import List, Tuple, Optional, Dict from logging import getLogger -import tensorflow as tf +from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.data.data_learning_iterator import DataLearningIterator from deeppavlov.core.trainers.nn_trainer import NNTrainer +from deeppavlov.core.trainers.fit_trainer import FitTrainer from deeppavlov.core.common.logging.logging_class import TrainLogger log = getLogger(__name__) @@ -16,25 +17,30 @@ class TensorboardLogger(TrainLogger): TensorboardLogger class for logging to tesnorboard. Args: + fit_trainer: FitTrainer object passed to set Tensorflow as one of its parameter if successful importation. log_dir (Path): path to local folder to log data into. 
""" - def __init__(self, log_dir: Path = None) -> None: - self.train_log_dir = str(log_dir / "train_log") - self.valid_log_dir = str(log_dir / "valid_log") - self.tb_train_writer = tf.summary.FileWriter(self.train_log_dir) - self.tb_valid_writer = tf.summary.FileWriter(self.valid_log_dir) + def __init__(self, fit_trainer:FitTrainer , log_dir: Path = None) -> None: + try: + # noinspection PyPackageRequirements + # noinspection PyUnresolvedReferences + import tensorflow as tf + except ImportError: + log.warning('TensorFlow could not be imported, so tensorboard log directory' + f'`{log_dir}` will be ignored') + else: + log_dir = expand_path(log_dir) + fit_trainer._tf = tf + self.train_log_dir = str(log_dir / 'train_log') + self.valid_log_dir = str(log_dir / 'valid_log') + self.tb_train_writer = tf.summary.FileWriter(self.train_log_dir) + self.tb_valid_writer = tf.summary.FileWriter(self.valid_log_dir) - def __call__( - self, - nn_trainer: NNTrainer, - iterator: DataLearningIterator, - type: str = None, - tensorboard_tag: Optional[str] = None, - tensorboard_index: Optional[int] = None, - report: Dict = None, - ) -> dict: + def __call__(self, nn_trainer: NNTrainer, iterator: DataLearningIterator, type: str = None, + tensorboard_tag: Optional[str] = None, tensorboard_index: Optional[int] = None, + report: Dict = None, **kwargs) -> dict: """ override call method, for 'train' logging type, log metircs of training process to log_dir/train_log. for 'valid' logging type, log metrics of validation process to log_dir/valid_log. @@ -46,7 +52,8 @@ def __call__( tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' tensorboard_index: one of two options: 'train_batches_seen', 'epoch' corresponding to 'tensorboard_tag' types respectively. report: dictionary contains current process information, if None, use 'get_report' method to get this report. - + **kwargs: additional parameters whose names will be logged but otherwise ignored + Returns: dict contains metrics logged to tesnorboard. @@ -85,3 +92,7 @@ def __call__( self.tb_valid_writer.add_summary(summary, tensorboard_index) self.tb_valid_writer.flush() return report + + @staticmethod + def close(): + log.info("Logging to Tensorboard completed") \ No newline at end of file diff --git a/deeppavlov/core/common/logging/wandb_logger.py b/deeppavlov/core/common/logging/wandb_logger.py index c3cf4cca02..c65c559ae6 100644 --- a/deeppavlov/core/common/logging/wandb_logger.py +++ b/deeppavlov/core/common/logging/wandb_logger.py @@ -1,6 +1,6 @@ import time import datetime -from typing import Dict +from typing import Dict, Optional from logging import getLogger import wandb @@ -24,9 +24,18 @@ class WandbLogger(TrainLogger): if none of them, logging to wandb will be ignored. Args: - log_on (str): if "epochs": logging to wandb on epochs, if "batches: logging on batches. + API_Key (str): authentication key. + relogin (bool): if True, force relogin if already logged in. 
commit_on_valid (bool): If False wandb.log just updates the current metrics dict with the row argument and metrics won't be saved until wandb.log is called with commit=True to commit training and validation reports with the same steps, this argument is True if logging on validation required + val_every_n_epochs: how often (in epochs) to validate the pipeline, ignored if negative or zero + (default is ``-1``) + val_every_n_batches: how often (in batches) to validate the pipeline, ignored if negative or zero + (default is ``-1``) + log_every_n_epochs: how often (in epochs) to calculate metrics on train data, ignored if negative or zero + (default is ``-1``) + log_every_n_batches: how often (in batches) to calculate metrics on train data, ignored if negative or zero + (default is ``-1``) **kwargs: arguments for wandb initialization, more info: https://docs.wandb.ai/ref/python/init """ @@ -39,7 +48,6 @@ def login(API_Key: str = None, relogin: bool = True) -> bool: Args: API_Key (str): authentication key. relogin (bool): if True, force relogin if already logged in. - report(dict): dictionary contains current process information, if None, use 'get_report' method to get this report. Returns: True if login and init processes succeed, otherwise False and logging to wandb will be ignored. @@ -48,34 +56,45 @@ def login(API_Key: str = None, relogin: bool = True) -> bool: try: return wandb.login(key=API_Key, relogin=relogin) except Exception as e: - log.warning(str(e) + ", logging to WandB will be ignored") + log.warning(str(e) + ', logging to WandB will be ignored') return False - def __init__( - self, log_on: str = "epochs", commit_on_valid: bool = False, **kwargs - ) -> None: - self.log_on = log_on # "epochs","batches" - self.commit_on_valid = commit_on_valid - try: - wandb.init(**kwargs) - self.init_succeed = True - except Exception as e: - log.warning(str(e) + ", logging to WandB will be ignored") + def __init__(self, API_Key: str = None, relogin: bool = True, val_every_n_epochs: int = -1, + val_every_n_batches: int = -1, log_every_n_batches: int = -1, log_every_n_epochs: int = -1, **kwargs) -> None: + if self.login(API_Key = API_Key, relogin = relogin): + try: + wandb.init(**kwargs.get('init', None)) + self.init_succeed = True + if log_every_n_epochs > 0 or val_every_n_epochs > 0: + self.log_on ='every_n_epochs' + self.commit_on_valid = val_every_n_epochs > 0 + + elif log_every_n_batches > 0 or val_every_n_batches > 0: + self.log_on ='every_n_batches' + self.commit_on_valid = val_every_n_batches > 0 + + except Exception as e: + log.warning(str(e) + ', logging to WandB will be ignored') + self.init_succeed = False + else: + log.warning('login to WandB failed') self.init_succeed = False def __call__( self, nn_trainer: NNTrainer, iterator: DataLearningIterator, + tensorboard_tag: Optional[str] = None, type: str = None, report: Dict = None, - ): - """ " + **kwargs): + """ Logging report of the training process to wandb. Args: nn_trainer: 'NNTrainer' object contains parameters required for preparing the report. iterator: :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for evaluation + tensorboard_tag: one of two options : 'every_n_batches', 'every_n_epochs' report (dict): report for logging to WandB. If None, use 'get_report' method to get this report. type (str) : process type, if "train" logs report about training process, else if "valid" logs report about validation process. @@ -83,11 +102,11 @@ def __call__( dict contains logged data to WandB. 
""" - if report is None: - report = self.get_report( - nn_trainer=nn_trainer, iterator=iterator, type=type - ) + if not self.init_succeed or tensorboard_tag != self.log_on: + return None + if report is None: + report = self.get_report(nn_trainer=nn_trainer, iterator=iterator, type=type) logging_type = type + "/" for i in report.keys(): if isinstance(report[i], dict): @@ -107,8 +126,7 @@ def __call__( # if "val_every_n_epochs" is not None, we have to commit data on validation logging, otherwise on training. if (self.commit_on_valid and logging_type == "valid/") or ( - not self.commit_on_valid and logging_type == "train/" - ): + not self.commit_on_valid and logging_type == "train/"): wandb.log({}, commit=True) return report @@ -118,3 +136,4 @@ def close(): """close function to commit the not commited logs and to mark a run as finished wiht wanb.finish method, and finishes uploading all data.""" wandb.log({}, commit=True) wandb.finish() + log.info("Logging to W&B completed") diff --git a/deeppavlov/core/trainers/fit_trainer.py b/deeppavlov/core/trainers/fit_trainer.py index 89ed25091b..595d01235c 100644 --- a/deeppavlov/core/trainers/fit_trainer.py +++ b/deeppavlov/core/trainers/fit_trainer.py @@ -20,7 +20,6 @@ from typing import List, Tuple, Dict, Union, Optional, Iterable, Any, Collection from deeppavlov.core.commands.infer import build_model -from deeppavlov.core.commands.utils import expand_path from deeppavlov.core.common.chainer import Chainer from deeppavlov.core.common.params import from_params from deeppavlov.core.common.registry import register @@ -49,10 +48,18 @@ class FitTrainer: evaluation_targets: data types on which to evaluate trained pipeline (default is ``('valid', 'test')``) show_examples: a flag used to print inputs, expected outputs and predicted outputs for the last batch in evaluation logs (default is ``False``) - logger : list of dictionaries of possible loggers from deeppavlov.configs files. + logger: list of dictionaries with train and evaluation loggers configuration. 
(default is ``None``) max_test_batches: maximum batches count for pipeline testing and evaluation, ignored if negative (default is ``-1``) + val_every_n_epochs: how often (in epochs) to validate the pipeline, ignored if negative or zero + (default is ``-1``) + val_every_n_batches: how often (in batches) to validate the pipeline, ignored if negative or zero + (default is ``-1``) + log_every_n_epochs: how often (in epochs) to calculate metrics on train data, ignored if negative or zero + (default is ``-1``) + log_every_n_batches: how often (in batches) to calculate metrics on train data, ignored if negative or zero + (default is ``-1``) **kwargs: additional parameters whose names will be logged but otherwise ignored """ @@ -62,52 +69,50 @@ def __init__(self, chainer_config: dict, *, batch_size: int = -1, show_examples: bool = False, max_test_batches: int = -1, logger: Optional[List[dict]] = None, + val_every_n_batches: int = -1, val_every_n_epochs: int = -1, + log_every_n_batches: int = -1, log_every_n_epochs: int = -1, **kwargs) -> None: if kwargs: - log.info( - f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:') + log.info(f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:') self.chainer_config = chainer_config - self._chainer = Chainer( - chainer_config['in'], chainer_config['out'], chainer_config.get('in_y')) + self._chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y')) self.batch_size = batch_size - self.metrics = parse_metrics( - metrics, self._chainer.in_y, self._chainer.out_params) + self.metrics = parse_metrics(metrics, self._chainer.in_y, self._chainer.out_params) self.evaluation_targets = tuple(evaluation_targets) self.show_examples = show_examples self.max_test_batches = None if max_test_batches < 0 else max_test_batches - self.logger: Optional[List[Dict]] = logger + from deeppavlov.core.common.logging.logging_class import TrainLogger + from deeppavlov.core.common.logging.std_logger import StdLogger + + self.logger: List[TrainLogger] = [] self.tensorboard_idx, self.stdlogger_idx, self.wandblogger_idx = None, None, None - if logger is not None: - try: - for i in range(len(logger)): - if logger[i].get("name", None) == "StdLogger": - self.stdlogger_idx = i - if logger[i].get("name", None) == "TensorboardLogger" and self.logger[i].get("log_dir", None) is not None: - self.tensorboard_idx = i - if logger[i].get("name", None) == "WandbLogger": - self.wandblogger_idx = i - except AttributeError: - self.tensorboard_idx, self.stdlogger_idx, self.wandblogger_idx = None, None, None - log.warning( - "Check logger dictionary in configs, logging will be ignored") - if self.tensorboard_idx is None and self.wandblogger_idx is None: - self.stdlogger_idx = 1 # make std logger default - if self.tensorboard_idx is not None: + + if logger is None: + logger = [{'name': 'StdLogger'}] + for logger_config in logger: + logger_name = logger_config.pop('name',None) + if logger_name is None: + raise KeyError("There is no 'name' key in logger configuration") + lgr = None try: - # noinspection PyPackageRequirements - # noinspection PyUnresolvedReferences - import tensorflow + if logger_name == 'StdLogger': + lgr = StdLogger(**logger_config) + elif logger_name == 'TensorboardLogger': + from deeppavlov.core.common.logging.tensorboard_logger import TensorboardLogger + lgr = TensorboardLogger(self, **logger_config) + elif logger_name == 'WandbLogger': + from 
deeppavlov.core.common.logging.wandb_logger import WandbLogger + lgr = WandbLogger(**logger_config, val_every_n_batches = val_every_n_batches, + val_every_n_epochs = val_every_n_epochs, + log_every_n_batches = log_every_n_batches, log_every_n_epochs=log_every_n_epochs) except ImportError: - log.warning('TensorFlow could not be imported, so tensorboard log directory' - f'`{self.logger[self.tensorboard_idx]["log_dir"]}` will be ignored') - self.tensorboard_idx = None - else: - self.logger[self.tensorboard_idx]["log_dir"] = expand_path( - self.logger[self.tensorboard_idx]["log_dir"]) - self._tf = tensorflow + log.warning(f'{logger_name} will be ignored due to import error. Check that all necessary requirements' + f'are installed') + if lgr is not None: + self.logger.append(lgr) self._built = False self._saved = False @@ -133,8 +138,7 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] writer = None for i, (x, y) in enumerate(iterator.gen_batches(self.batch_size, shuffle=False)): - preprocessed = self._chainer.compute( - x, y, targets=targets) + preprocessed = self._chainer.compute(x, y, targets=targets) # noinspection PyUnresolvedReferences result = component.partial_fit(*preprocessed) @@ -149,12 +153,10 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] writer.add_summary(summary, i) writer.flush() else: - preprocessed = self._chainer.compute( - *iterator.get_instances(), targets=targets) + preprocessed = self._chainer.compute(*iterator.get_instances(), targets=targets) if len(targets) == 1: preprocessed = [preprocessed] - result: Optional[Dict[str, Iterable[float]] - ] = component.fit(*preprocessed) + result: Optional[Dict[str, Iterable[float]]] = component.fit(*preprocessed) if result is not None and self.tensorboard_idx is not None: writer = self._tf.summary.FileWriter(str(self.logger[self.tensorboard_idx]["log_dir"] / @@ -180,8 +182,7 @@ def fit_chainer(self, iterator: Union[DataFittingIterator, DataLearningIterator] def _load(self) -> None: if not self._loaded: self._chainer.destroy() - self._chainer = build_model( - {'chainer': self.chainer_config}, load_trained=self._saved) + self._chainer = build_model({'chainer': self.chainer_config}, load_trained=self._saved) self._loaded = True def get_chainer(self) -> Chainer: @@ -221,8 +222,7 @@ def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]], if metrics is None: metrics = self.metrics - expected_outputs = list(set().union( - self._chainer.out_params, *[m.inputs for m in metrics])) + expected_outputs = list(set().union(self._chainer.out_params, *[m.inputs for m in metrics])) outputs = {out: [] for out in expected_outputs} examples = 0 @@ -231,8 +231,7 @@ def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]], for x, y_true in data: examples += len(x) - y_predicted = list(self._chainer.compute( - list(x), list(y_true), targets=expected_outputs)) + y_predicted = list(self._chainer.compute(list(x), list(y_true), targets=expected_outputs)) if len(expected_outputs) == 1: y_predicted = [y_predicted] for out, val in zip(outputs.values(), y_predicted): @@ -259,8 +258,7 @@ def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]], for out_name, y_predicted_group in zip(expected_outputs, y_predicted) if out_name in self._chainer.out_params]) if len(self._chainer.out_params) == 1: - y_predicted = [y_predicted_item[0] - for y_predicted_item in y_predicted] + y_predicted = [y_predicted_item[0] for y_predicted_item in y_predicted] 
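                # Pair each input from the last evaluated batch with its predicted and expected
                # outputs, so that the evaluation report can include per-example results when
                # show_examples is enabled.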
report['examples'] = [{ 'x': x_item, 'y_predicted': y_predicted_item, @@ -289,8 +287,7 @@ def evaluate(self, iterator: DataLearningIterator, evaluation_targets: Optional[ res = {} for data_type in evaluation_targets: - data_gen = iterator.gen_batches( - self.batch_size, data_type=data_type, shuffle=False) + data_gen = iterator.gen_batches(self.batch_size, data_type=data_type, shuffle=False) report = self.test(data_gen) res[data_type] = report if print_reports: diff --git a/deeppavlov/core/trainers/nn_trainer.py b/deeppavlov/core/trainers/nn_trainer.py index d706a8a434..ec578b51eb 100644 --- a/deeppavlov/core/trainers/nn_trainer.py +++ b/deeppavlov/core/trainers/nn_trainer.py @@ -26,7 +26,7 @@ log = getLogger(__name__) -@register("nn_trainer") +@register('nn_trainer') class NNTrainer(FitTrainer): """ | Bases :class:`~deeppavlov.core.trainers.FitTrainer` @@ -85,10 +85,7 @@ class NNTrainer(FitTrainer): """ - def __init__( - self, - chainer_config: dict, - *, + def __init__(self, chainer_config: dict, *, batch_size: int = 1, epochs: int = -1, start_epoch_num: int = 0, @@ -107,47 +104,33 @@ def __init__( log_every_n_batches: int = -1, log_every_n_epochs: int = -1, log_on_k_batches: int = 1, - **kwargs, - ) -> None: - super().__init__( - chainer_config, - batch_size=batch_size, - metrics=metrics, - evaluation_targets=evaluation_targets, - show_examples=show_examples, - logger=logger, - max_test_batches=max_test_batches, - **kwargs, - ) + **kwargs) -> None: + super().__init__(chainer_config, batch_size=batch_size, metrics=metrics, evaluation_targets=evaluation_targets, + show_examples=show_examples, logger=logger, max_test_batches=max_test_batches, + val_every_n_batches = val_every_n_batches, val_every_n_epochs = val_every_n_epochs, + log_every_n_batches = log_every_n_batches, log_every_n_epochs=log_every_n_epochs, **kwargs) + if train_metrics is None: self.train_metrics = self.metrics else: - self.train_metrics = parse_metrics( - train_metrics, self._chainer.in_y, self._chainer.out_params - ) + self.train_metrics = parse_metrics(train_metrics, self._chainer.in_y, self._chainer.out_params) metric_optimization = metric_optimization.strip().lower() self.score_best = None def _improved(op): - return ( - lambda score, baseline: False - if baseline is None or score is None + return lambda score, baseline: False if baseline is None or score is None \ else op(score, baseline) - ) if metric_optimization == "maximize": self.improved = _improved(lambda a, b: a > b) elif metric_optimization == "minimize": self.improved = _improved(lambda a, b: a < b) else: - raise ConfigError( - "metric_optimization has to be one of {}".format( - ["maximize", "minimize"] - ) - ) + raise ConfigError("metric_optimization has to be one of {}".format(["maximize", "minimize"])) self.validate_first = validate_first + from deeppavlov.core.common.logging.std_logger import StdLogger self.validate_ = StdLogger(self.stdlogger_idx is not None) self.validation_number = 0 if validate_first else 1 self.validation_patience = validation_patience @@ -168,41 +151,10 @@ def _improved(op): self.losses = [] self.start_time: Optional[float] = None - if self.stdlogger_idx is not None: - self.std_logger = StdLogger(stdlogging=True) - - if self.tensorboard_idx is not None: - self.tensorboard_logger = TensorboardLogger( - log_dir=self.logger[self.tensorboard_idx]["log_dir"] - ) - - if self.wandblogger_idx is not None: - if WandbLogger.login( - API_Key=self.logger[self.wandblogger_idx].get("API_Key", None), - relogin=True, - ): - if 
self.log_every_n_epochs > 0 or self.val_every_n_epochs > 0: - self.wandb_logger = WandbLogger( - log_on="epochs", - commit_on_valid=self.val_every_n_epochs > 0, - **self.logger[self.wandblogger_idx].get("init", None), - ) - if self.wandb_logger.init_succeed == False: - self.wandblogger_idx = None - elif self.log_every_n_batches > 0 or self.val_every_n_batches > 0: - self.wandb_logger = WandbLogger( - log_on="batches", - commit_on_valid=self.val_every_n_batches > 0, - **self.logger[self.wandblogger_idx].get("init", None), - ) - if self.wandb_logger.init_succeed == False: - self.wandblogger_idx = None - else: - self.wandblogger_idx = None def save(self) -> None: if self._loaded: - raise RuntimeError("Cannot save already finalized chainer") + raise RuntimeError('Cannot save already finalized chainer') self._chainer.save() @@ -254,60 +206,21 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: ): self._send_event(event_name="before_log") report = None - if self.stdlogger_idx is not None: - report = self.std_logger( - self, iterator, type="train", report=report - ) - - if self.tensorboard_idx is not None: - report = self.tensorboard_logger( - self, - iterator, - type="train", - tensorboard_tag="every_n_batches", - tensorboard_index=self.train_batches_seen, - report=report, - ) - - if ( - self.wandblogger_idx is not None - and self.wandb_logger.log_on == "batches" - ): - report = self.wandb_logger( - self, iterator=iterator, type="train", report=report - ) + + for lgr in self.logger: + report = lgr(self, iterator, type="train", tensorboard_tag="every_n_batches", + tensorboard_index=self.train_batches_seen, report=report) # empty report if no logging method. self._send_event(event_name="after_train_log", data=report) - if ( - self.val_every_n_batches > 0 - and self.train_batches_seen % self.val_every_n_batches == 0 - ): + if (self.val_every_n_batches > 0 and self.train_batches_seen % self.val_every_n_batches == 0): self._send_event(event_name="before_validation") report = None - if self.stdlogger_idx is not None: - report = self.std_logger( - self, iterator, type="valid", report=report - ) - - if self.tensorboard_idx is not None: - report = self.tensorboard_logger( - self, - iterator, - type="valid", - tensorboard_tag="every_n_batches", - tensorboard_index=self.train_batches_seen, - report=report, - ) - - if ( - self.wandblogger_idx is not None - and self.wandb_logger.log_on == "batches" - ): - report = self.wandb_logger( - self, iterator, type="valid", report=report - ) + + for lgr in self.logger: + report = lgr(self, iterator, type="valid",tensorboard_tag="every_n_batches", + tensorboard_index=self.train_batches_seen, report = report) self._send_event(event_name="after_validation", data=report) @@ -318,7 +231,7 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: break if 0 < self.validation_patience <= self.patience: - log.info("Ran out of patience") + log.info('Ran out of patience') impatient = True break @@ -326,67 +239,25 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: break self.epoch += 1 - if ( - self.log_every_n_epochs > 0 - and self.epoch % self.log_every_n_epochs == 0 - ): + if (self.log_every_n_epochs > 0 and self.epoch % self.log_every_n_epochs == 0): self._send_event(event_name="before_log") report = None - if self.stdlogger_idx is not None: - report = self.std_logger( - self, iterator=iterator, type="train", report=report - ) - if ( - self.wandblogger_idx is not None - and self.wandb_logger.log_on == "epochs" - ): - report 
= self.wandb_logger( - self, iterator, type="train", report=report - ) - - if self.tensorboard_idx is not None: - report = self.tensorboard_logger( - self, - iterator=iterator, - type="train", - tensorboard_tag="every_n_epochs", - tensorboard_index=self.epoch, - report=report, - ) + for lgr in self.logger: + report = lgr(self, iterator, type="train",tensorboard_tag="every_n_epochs", + tensorboard_index=self.epoch, report=report) self._send_event(event_name="after_train_log", data=report) - if ( - self.val_every_n_epochs > 0 - and self.epoch % self.val_every_n_epochs == 0 - ): + if (self.val_every_n_epochs > 0 and self.epoch % self.val_every_n_epochs == 0): self._send_event(event_name="before_validation") report = None - if self.stdlogger_idx is not None: - report = self.std_logger( - self, iterator, type="valid", report=report - ) - - if self.tensorboard_idx is not None: - report = self.tensorboard_logger( - self, - iterator=iterator, - type="valid", - tensorboard_tag="every_n_epochs", - tensorboard_index=self.epoch, - report=report, - ) - if ( - self.wandblogger_idx is not None - and self.wandb_logger.log_on == "epochs" - ): - report = self.wandb_logger( - self, iterator, type="valid", report=report - ) + for lgr in self.logger: + report = lgr(self, iterator, type="valid",tensorboard_tag="every_n_epochs", + tensorboard_index=self.epoch,report = report) self._send_event(event_name="after_validation", data=report) @@ -402,24 +273,20 @@ def train_on_batches(self, iterator: DataLearningIterator) -> None: def train(self, iterator: DataLearningIterator) -> None: """Call :meth:`~fit_chainer` and then :meth:`~train_on_batches` with provided data iterator as an argument""" self.fit_chainer(iterator) - if callable(getattr(self._chainer, "train_on_batch", None)): + if callable(getattr(self._chainer, 'train_on_batch', None)): try: self.train_on_batches(iterator) - if self.wandblogger_idx is not None: - self.wandb_logger.close() except KeyboardInterrupt: - log.info("Stopped training") + log.info('Stopped training') else: - log.warning( - f"Using {self.__class__.__name__} for a pipeline without batched training" - ) + log.warning(f'Using {self.__class__.__name__} for a pipeline without batched training') # Run the at-train-exit model-saving logic if self.validation_number < 1: - log.info("Save model to capture early training results") + log.info('Save model to capture early training results') self.save() + + for lgr in self.logger: + lgr.close() -from deeppavlov.core.common.logging.wandb_logger import WandbLogger -from deeppavlov.core.common.logging.std_logger import StdLogger -from deeppavlov.core.common.logging.tensorboard_logger import TensorboardLogger diff --git a/deeppavlov/requirements/wandb.txt b/deeppavlov/requirements/wandb.txt index 9983212701..910c81a42a 100644 --- a/deeppavlov/requirements/wandb.txt +++ b/deeppavlov/requirements/wandb.txt @@ -1,3 +1 @@ wandb==0.12.7 -pybind11==2.2 -fasttext \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4708c92e6a..0198dfc49b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,4 @@ tqdm==4.62.0 click==7.1.2 uvicorn==0.11.7 sacremoses==0.0.35 -uvloop==0.14.0 \ No newline at end of file +uvloop==0.14.0
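
Taken together, these changes replace the three hard-coded logger indices with a single list of logger objects that FitTrainer builds from the "logger" entry of the "train" section: every item is a dictionary whose "name" key selects StdLogger, TensorboardLogger or WandbLogger, and whose remaining keys are forwarded to that class as keyword arguments. The sketch below shows such a section as a Python dict purely for illustration; the paths and surrounding values are hypothetical, and any extra keys have to match what the chosen logger class actually accepts in this revision.

# Illustrative "train" section for the new logger list (hypothetical values).
train_config = {
    "class_name": "nn_trainer",
    "epochs": 5,
    "log_every_n_epochs": 1,
    "batch_size": 64,
    "logger": [
        {"name": "StdLogger"},
        # Remaining keys are passed straight to the logger class, so "log_dir"
        # must be a parameter that TensorboardLogger accepts.
        {"name": "TensorboardLogger", "log_dir": "{MODELS_PATH}/my_model/logs"},
        # A WandbLogger entry would go here; its keys depend on the WandbLogger
        # implementation in this series and are deliberately omitted.
    ],
}

An entry without a "name" key raises KeyError, and a logger whose import fails is skipped with a warning, so the same configuration degrades gracefully on machines where TensorFlow or wandb is not installed.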
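
With the dispatch in place, NNTrainer treats every configured logger uniformly: during training it calls each one with the trainer itself, the data iterator, a type of "train" or "valid", a tensorboard_tag / tensorboard_index pair, and the running report, and at the end of train() it calls close() on each of them. A minimal logger that fits this calling convention could look like the sketch below; the class name is hypothetical, and the TrainLogger interface is assumed from these call sites rather than from its final definition in the series.

from deeppavlov.core.common.logging.logging_class import TrainLogger


class PrintLogger(TrainLogger):
    """Hypothetical logger that just prints whatever report the trainer passes in."""

    def __init__(self, prefix: str = 'train') -> None:
        self.prefix = prefix

    def __call__(self, trainer, iterator, type: str = 'train',
                 tensorboard_tag: str = None, tensorboard_index: int = None,
                 report: dict = None) -> dict:
        # Loggers are chained, so the (possibly enriched) report must be returned
        # for the next logger in trainer.logger.
        print(f'[{self.prefix}] {type} {tensorboard_tag}={tensorboard_index}: {report}')
        return report

    def close(self) -> None:
        # Invoked once from NNTrainer.train() after the final save().
        pass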
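
One detail of the reworked NNTrainer.__init__ that is easy to read past is the metric_optimization handling: _improved is a small comparator factory that wraps a binary operator and refuses to call it while either the new score or the stored baseline is still None. The lines below restate that behaviour outside the class to make the maximize / minimize choice concrete; they are an illustration, not code from the patch.

def _improved(op):
    # None on either side means "no comparison possible yet", never an improvement.
    return lambda score, baseline: False if baseline is None or score is None else op(score, baseline)

maximize = _improved(lambda a, b: a > b)   # metric_optimization == "maximize"
minimize = _improved(lambda a, b: a < b)   # metric_optimization == "minimize"

assert maximize(0.81, 0.78) is True    # higher accuracy improves on the baseline
assert minimize(0.35, 0.42) is True    # lower loss improves on the baseline
assert maximize(0.81, None) is False   # the first validation only sets the baseline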