diff --git a/ci/unit_tests/test_spleen_deepedit_annotation.py b/ci/unit_tests/test_spleen_deepedit_annotation.py
index 48e2ff3f..6a6b1556 100644
--- a/ci/unit_tests/test_spleen_deepedit_annotation.py
+++ b/ci/unit_tests/test_spleen_deepedit_annotation.py
@@ -80,7 +80,7 @@ def tearDown(self):
     def test_train_config(self, override):
         override["dataset_dir"] = self.dataset_dir
         bundle_root = override["bundle_root"]
-
+        sys.path = [bundle_root] + sys.path
         trainer = ConfigWorkflow(
             workflow="train",
             config_file=os.path.join(bundle_root, "configs/train.json"),
@@ -96,7 +96,7 @@ def test_eval_config(self, override):
         bundle_root = override["bundle_root"]
         train_file = os.path.join(bundle_root, "configs/train.json")
         eval_file = os.path.join(bundle_root, "configs/evaluate.json")
-
+        sys.path = [bundle_root] + sys.path
         validator = ConfigWorkflow(
             # override train.json, thus set the workflow to "train" rather than "eval"
             workflow="train",
diff --git a/ci/unit_tests/test_spleen_deepedit_annotation_dist.py b/ci/unit_tests/test_spleen_deepedit_annotation_dist.py
index 7f11e586..d4dfa80b 100644
--- a/ci/unit_tests/test_spleen_deepedit_annotation_dist.py
+++ b/ci/unit_tests/test_spleen_deepedit_annotation_dist.py
@@ -11,6 +11,7 @@
 
 import os
 import shutil
+import sys
 import tempfile
 import unittest
 
@@ -62,6 +63,7 @@ def test_train_mgpu_config(self, override):
         mgpu_train_file = os.path.join(bundle_root, "configs/multi_gpu_train.json")
         output_path = os.path.join(bundle_root, "configs/train_override.json")
         n_gpu = torch.cuda.device_count()
+        sys.path = [bundle_root] + sys.path
         export_config_and_run_mgpu_cmd(
             config_file=[train_file, mgpu_train_file],
             logging_file=os.path.join(bundle_root, "configs/logging.conf"),
diff --git a/models/spleen_deepedit_annotation/configs/metadata.json b/models/spleen_deepedit_annotation/configs/metadata.json
index d4d46cd3..964a7a49 100644
--- a/models/spleen_deepedit_annotation/configs/metadata.json
+++ b/models/spleen_deepedit_annotation/configs/metadata.json
@@ -1,7 +1,8 @@
 {
     "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
-    "version": "0.4.9",
+    "version": "0.5.0",
     "changelog": {
+        "0.5.0": "enable finetune and early stop",
         "0.4.9": "fix orientation issue on clicks",
         "0.4.8": "Add infer transforms to manage clicks from viewer",
         "0.4.7": "fix the wrong GPU index issue of multi-node",
diff --git a/models/spleen_deepedit_annotation/configs/multi_gpu_train.json b/models/spleen_deepedit_annotation/configs/multi_gpu_train.json
index 021f3c70..fbc38d1a 100644
--- a/models/spleen_deepedit_annotation/configs/multi_gpu_train.json
+++ b/models/spleen_deepedit_annotation/configs/multi_gpu_train.json
@@ -23,7 +23,7 @@
         "shuffle": false
     },
     "validate#dataloader#sampler": "@validate#sampler",
-    "validate#evaluator#val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
+    "validate#evaluator#val_handlers": "$@validate#handlers[: -3 if dist.get_rank() > 0 else None]",
     "initialize": [
         "$import torch.distributed as dist",
         "$dist.is_initialized() or dist.init_process_group(backend='nccl')",
@@ -31,6 +31,7 @@
         "$monai.utils.set_determinism(seed=123)"
     ],
     "run": [
+        "$@validate#handlers#0.set_trainer(trainer=@train#trainer) if @early_stop else None",
         "$@train#trainer.run()"
     ],
     "finalize": [
diff --git a/models/spleen_deepedit_annotation/configs/train.json b/models/spleen_deepedit_annotation/configs/train.json
index 4361b5be..52d28e94 100644
--- a/models/spleen_deepedit_annotation/configs/train.json
+++ b/models/spleen_deepedit_annotation/configs/train.json
@@ -2,7 +2,8 @@
     "imports": [
         "$import glob",
         "$import os",
-        "$import ignite"
+        "$import ignite",
+        "$import scripts"
     ],
     "bundle_root": ".",
     "ckpt_dir": "$@bundle_root + '/models'",
@@ -14,6 +15,10 @@
         "spleen": 1,
         "background": 0
     },
+    "finetune": false,
+    "finetune_model_path": "$@bundle_root + '/models/model.pt'",
+    "early_stop": false,
+    "epochs": 500,
     "spatial_size": [
         128,
         128,
@@ -302,6 +307,14 @@
             ]
         },
         "handlers": [
+            {
+                "_target_": "CheckpointLoader",
+                "_disabled_": "$not @finetune",
+                "load_path": "@finetune_model_path",
+                "load_dict": {
+                    "model": "@network"
+                }
+            },
             {
                 "_target_": "LrScheduleHandler",
                 "lr_scheduler": "@lr_scheduler",
@@ -342,7 +355,7 @@
         "trainer": {
             "_target_": "SupervisedTrainer",
             "device": "@device",
-            "max_epochs": 500,
+            "max_epochs": "@epochs",
             "train_data_loader": "@train#dataloader",
             "network": "@network",
             "optimizer": "@optimizer",
@@ -379,6 +392,14 @@
         },
         "postprocessing": "%train#postprocessing",
         "handlers": [
+            {
+                "_target_": "EarlyStopHandler",
+                "_disabled_": "$not @early_stop",
+                "trainer": null,
+                "patience": 1,
+                "score_function": "$scripts.score_function",
+                "min_delta": 0.01
+            },
             {
                 "_target_": "StatsHandler",
                 "iteration_log": false
@@ -429,6 +450,7 @@
         "$monai.utils.set_determinism(seed=123)"
     ],
     "run": [
+        "$@validate#handlers#0.set_trainer(trainer=@train#trainer) if @early_stop else None",
         "$@train#trainer.run()"
     ]
 }
diff --git a/models/spleen_deepedit_annotation/scripts/__init__.py b/models/spleen_deepedit_annotation/scripts/__init__.py
new file mode 100644
index 00000000..17cbbfc0
--- /dev/null
+++ b/models/spleen_deepedit_annotation/scripts/__init__.py
@@ -0,0 +1 @@
+from .early_stop_score_function import score_function
diff --git a/models/spleen_deepedit_annotation/scripts/early_stop_score_function.py b/models/spleen_deepedit_annotation/scripts/early_stop_score_function.py
new file mode 100644
index 00000000..350f3ffe
--- /dev/null
+++ b/models/spleen_deepedit_annotation/scripts/early_stop_score_function.py
@@ -0,0 +1,35 @@
+import os
+
+import torch
+import torch.distributed as dist
+
+
+def score_function(engine):
+    """Return the validation mean Dice as the early-stop score.
+
+    When a torch.distributed process group is initialized, the metric is
+    summed across ranks with all_reduce and divided by the world size, so
+    the returned value is the cross-rank mean.
+
+    Args:
+        engine: object whose ``state.metrics`` holds ``"val_mean_dice"``.
+
+    Returns:
+        The stored metric unchanged when no process group is initialized,
+        otherwise the cross-rank mean as a Python float.
+    """
+    val_metric = engine.state.metrics["val_mean_dice"]
+    # is_available() guard: builds without distributed support may not
+    # expose is_initialized() at all.
+    if not (dist.is_available() and dist.is_initialized()):
+        return val_metric
+
+    # NOTE(review): LOCAL_RANK is presumably exported by the launcher
+    # (e.g. torchrun); confirm for other launch methods.
+    device = torch.device("cuda:" + os.environ["LOCAL_RANK"])
+    # Explicit float dtype keeps the in-place division valid even if the
+    # metric is integer-valued; allocate directly on the target device.
+    reduced = torch.tensor([val_metric], dtype=torch.float, device=device)
+    dist.all_reduce(reduced, op=dist.ReduceOp.SUM)
+    reduced /= dist.get_world_size()
+    return reduced.item()