Skip to content

Commit

Permalink
Enhance DeepEdit bundle to enable finetune and early stopping (Project-MONAI#504)
Browse files Browse the repository at this point in the history

### Description
This PR is used to enable early stopping and finetune for the spleen
deepedit annotation bundle.

### Status
**Ready**

### Please check all the boxes that apply:
<!--- Put an `x` in all the boxes that apply, and remove the not
applicable items -->
- [x] Codeformat tests passed locally by running `./runtests.sh
--codeformat`.
- [ ] In-line docstrings updated.
- [ ] Update `version` and `changelog` in `metadata.json` if changing an
existing bundle.
- [ ] Please ensure the naming rules in config files meet our
requirements (please refer to: `CONTRIBUTING.md`).
- [ ] Ensure versions of packages such as `monai`, `pytorch` and `numpy`
are correct in `metadata.json`.
- [ ] Descriptions should be consistent with the content, such as
`eval_metrics` of the provided weights and TorchScript modules.
- [ ] Files larger than 25MB are excluded and replaced by providing
download links in `large_file.yml`.
- [ ] Avoid using path that contains personal information within config
files (such as use `/home/your_name/` for `"bundle_root"`).

---------

Signed-off-by: Yiheng Wang <[email protected]>
  • Loading branch information
yiheng-wang-nv authored Sep 18, 2023
1 parent df9bab7 commit 2a90cff
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 6 deletions.
4 changes: 2 additions & 2 deletions ci/unit_tests/test_spleen_deepedit_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def tearDown(self):
def test_train_config(self, override):
override["dataset_dir"] = self.dataset_dir
bundle_root = override["bundle_root"]

sys.path = [bundle_root] + sys.path
trainer = ConfigWorkflow(
workflow="train",
config_file=os.path.join(bundle_root, "configs/train.json"),
Expand All @@ -96,7 +96,7 @@ def test_eval_config(self, override):
bundle_root = override["bundle_root"]
train_file = os.path.join(bundle_root, "configs/train.json")
eval_file = os.path.join(bundle_root, "configs/evaluate.json")

sys.path = [bundle_root] + sys.path
validator = ConfigWorkflow(
# override train.json, thus set the workflow to "train" rather than "eval"
workflow="train",
Expand Down
2 changes: 2 additions & 0 deletions ci/unit_tests/test_spleen_deepedit_annotation_dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import os
import shutil
import sys
import tempfile
import unittest

Expand Down Expand Up @@ -62,6 +63,7 @@ def test_train_mgpu_config(self, override):
mgpu_train_file = os.path.join(bundle_root, "configs/multi_gpu_train.json")
output_path = os.path.join(bundle_root, "configs/train_override.json")
n_gpu = torch.cuda.device_count()
sys.path = [bundle_root] + sys.path
export_config_and_run_mgpu_cmd(
config_file=[train_file, mgpu_train_file],
logging_file=os.path.join(bundle_root, "configs/logging.conf"),
Expand Down
3 changes: 2 additions & 1 deletion models/spleen_deepedit_annotation/configs/metadata.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
"version": "0.4.9",
"version": "0.5.0",
"changelog": {
"0.5.0": "enable finetune and early stop",
"0.4.9": "fix orientation issue on clicks",
"0.4.8": "Add infer transforms to manage clicks from viewer",
"0.4.7": "fix the wrong GPU index issue of multi-node",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@
"shuffle": false
},
"validate#dataloader#sampler": "@validate#sampler",
"validate#evaluator#val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
"validate#evaluator#val_handlers": "$@validate#handlers[: -3 if dist.get_rank() > 0 else None]",
"initialize": [
"$import torch.distributed as dist",
"$dist.is_initialized() or dist.init_process_group(backend='nccl')",
"$torch.cuda.set_device(@device)",
"$monai.utils.set_determinism(seed=123)"
],
"run": [
"$@validate#handlers#0.set_trainer(trainer=@train#trainer) if @early_stop else None",
"$@train#trainer.run()"
],
"finalize": [
Expand Down
26 changes: 24 additions & 2 deletions models/spleen_deepedit_annotation/configs/train.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
"imports": [
"$import glob",
"$import os",
"$import ignite"
"$import ignite",
"$import scripts"
],
"bundle_root": ".",
"ckpt_dir": "$@bundle_root + '/models'",
Expand All @@ -14,6 +15,10 @@
"spleen": 1,
"background": 0
},
"finetune": false,
"finetune_model_path": "$@bundle_root + '/models/model.pt'",
"early_stop": false,
"epochs": 500,
"spatial_size": [
128,
128,
Expand Down Expand Up @@ -302,6 +307,14 @@
]
},
"handlers": [
{
"_target_": "CheckpointLoader",
"_disabled_": "$not @finetune",
"load_path": "@finetune_model_path",
"load_dict": {
"model": "@network"
}
},
{
"_target_": "LrScheduleHandler",
"lr_scheduler": "@lr_scheduler",
Expand Down Expand Up @@ -342,7 +355,7 @@
"trainer": {
"_target_": "SupervisedTrainer",
"device": "@device",
"max_epochs": 500,
"max_epochs": "@epochs",
"train_data_loader": "@train#dataloader",
"network": "@network",
"optimizer": "@optimizer",
Expand Down Expand Up @@ -379,6 +392,14 @@
},
"postprocessing": "%train#postprocessing",
"handlers": [
{
"_target_": "EarlyStopHandler",
"_disabled_": "$not @early_stop",
"trainer": null,
"patience": 1,
"score_function": "$scripts.score_function",
"min_delta": 0.01
},
{
"_target_": "StatsHandler",
"iteration_log": false
Expand Down Expand Up @@ -429,6 +450,7 @@
"$monai.utils.set_determinism(seed=123)"
],
"run": [
"$@validate#handlers#0.set_trainer(trainer=@train#trainer) if @early_stop else None",
"$@train#trainer.run()"
]
}
1 change: 1 addition & 0 deletions models/spleen_deepedit_annotation/scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .early_stop_score_function import score_function
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os

import torch
import torch.distributed as dist


def score_function(engine):
    """Early-stopping score: mean validation dice from the engine's metrics.

    In a distributed run the local metric is averaged across all ranks so
    every process sees the same score; otherwise the local value is returned.
    """
    metric = engine.state.metrics["val_mean_dice"]
    if not dist.is_initialized():
        return metric
    # Average the metric over all ranks so early stopping is consistent.
    local_rank = os.environ["LOCAL_RANK"]
    metric_tensor = torch.tensor([metric]).to(torch.device(f"cuda:{local_rank}"))
    dist.all_reduce(metric_tensor, op=dist.ReduceOp.SUM)
    metric_tensor /= dist.get_world_size()
    return metric_tensor.item()

0 comments on commit 2a90cff

Please sign in to comment.