Skip to content

Commit

Permalink
Enhance DeepEdit bundle to enable finetune and early stopping (Project-MONAI#504)
Browse files Browse the repository at this point in the history

### Description
This PR is used to enable early stopping and finetune for the spleen
deepedit annotation bundle.

### Status
**Ready**

### Please check all the boxes that apply:
<!--- Put an `x` in all the boxes that apply, and remove the not
applicable items -->
- [x] Codeformat tests passed locally by running `./runtests.sh
--codeformat`.
- [ ] In-line docstrings updated.
- [ ] Update `version` and `changelog` in `metadata.json` if changing an
existing bundle.
- [ ] Please ensure the naming rules in config files meet our
requirements (please refer to: `CONTRIBUTING.md`).
- [ ] Ensure versions of packages such as `monai`, `pytorch` and `numpy`
are correct in `metadata.json`.
- [ ] Descriptions should be consistent with the content, such as
`eval_metrics` of the provided weights and TorchScript modules.
- [ ] Files larger than 25MB are excluded and replaced by providing
download links in `large_file.yml`.
- [ ] Avoid using path that contains personal information within config
files (such as use `/home/your_name/` for `"bundle_root"`).

---------

Signed-off-by: Yiheng Wang <[email protected]>
  • Loading branch information
yiheng-wang-nv authored Sep 18, 2023
1 parent df9bab7 commit 2a90cff
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 6 deletions.
4 changes: 2 additions & 2 deletions ci/unit_tests/test_spleen_deepedit_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def tearDown(self):
def test_train_config(self, override):
override["dataset_dir"] = self.dataset_dir
bundle_root = override["bundle_root"]

sys.path = [bundle_root] + sys.path
trainer = ConfigWorkflow(
workflow="train",
config_file=os.path.join(bundle_root, "configs/train.json"),
Expand All @@ -96,7 +96,7 @@ def test_eval_config(self, override):
bundle_root = override["bundle_root"]
train_file = os.path.join(bundle_root, "configs/train.json")
eval_file = os.path.join(bundle_root, "configs/evaluate.json")

sys.path = [bundle_root] + sys.path
validator = ConfigWorkflow(
# override train.json, thus set the workflow to "train" rather than "eval"
workflow="train",
Expand Down
2 changes: 2 additions & 0 deletions ci/unit_tests/test_spleen_deepedit_annotation_dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import os
import shutil
import sys
import tempfile
import unittest

Expand Down Expand Up @@ -62,6 +63,7 @@ def test_train_mgpu_config(self, override):
mgpu_train_file = os.path.join(bundle_root, "configs/multi_gpu_train.json")
output_path = os.path.join(bundle_root, "configs/train_override.json")
n_gpu = torch.cuda.device_count()
sys.path = [bundle_root] + sys.path
export_config_and_run_mgpu_cmd(
config_file=[train_file, mgpu_train_file],
logging_file=os.path.join(bundle_root, "configs/logging.conf"),
Expand Down
3 changes: 2 additions & 1 deletion models/spleen_deepedit_annotation/configs/metadata.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json",
"version": "0.4.9",
"version": "0.5.0",
"changelog": {
"0.5.0": "enable finetune and early stop",
"0.4.9": "fix orientation issue on clicks",
"0.4.8": "Add infer transforms to manage clicks from viewer",
"0.4.7": "fix the wrong GPU index issue of multi-node",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@
"shuffle": false
},
"validate#dataloader#sampler": "@validate#sampler",
"validate#evaluator#val_handlers": "$None if dist.get_rank() > 0 else @validate#handlers",
"validate#evaluator#val_handlers": "$@validate#handlers[: -3 if dist.get_rank() > 0 else None]",
"initialize": [
"$import torch.distributed as dist",
"$dist.is_initialized() or dist.init_process_group(backend='nccl')",
"$torch.cuda.set_device(@device)",
"$monai.utils.set_determinism(seed=123)"
],
"run": [
"$@validate#handlers#0.set_trainer(trainer=@train#trainer) if @early_stop else None",
"$@train#trainer.run()"
],
"finalize": [
Expand Down
26 changes: 24 additions & 2 deletions models/spleen_deepedit_annotation/configs/train.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
"imports": [
"$import glob",
"$import os",
"$import ignite"
"$import ignite",
"$import scripts"
],
"bundle_root": ".",
"ckpt_dir": "$@bundle_root + '/models'",
Expand All @@ -14,6 +15,10 @@
"spleen": 1,
"background": 0
},
"finetune": false,
"finetune_model_path": "$@bundle_root + '/models/model.pt'",
"early_stop": false,
"epochs": 500,
"spatial_size": [
128,
128,
Expand Down Expand Up @@ -302,6 +307,14 @@
]
},
"handlers": [
{
"_target_": "CheckpointLoader",
"_disabled_": "$not @finetune",
"load_path": "@finetune_model_path",
"load_dict": {
"model": "@network"
}
},
{
"_target_": "LrScheduleHandler",
"lr_scheduler": "@lr_scheduler",
Expand Down Expand Up @@ -342,7 +355,7 @@
"trainer": {
"_target_": "SupervisedTrainer",
"device": "@device",
"max_epochs": 500,
"max_epochs": "@epochs",
"train_data_loader": "@train#dataloader",
"network": "@network",
"optimizer": "@optimizer",
Expand Down Expand Up @@ -379,6 +392,14 @@
},
"postprocessing": "%train#postprocessing",
"handlers": [
{
"_target_": "EarlyStopHandler",
"_disabled_": "$not @early_stop",
"trainer": null,
"patience": 1,
"score_function": "$scripts.score_function",
"min_delta": 0.01
},
{
"_target_": "StatsHandler",
"iteration_log": false
Expand Down Expand Up @@ -429,6 +450,7 @@
"$monai.utils.set_determinism(seed=123)"
],
"run": [
"$@validate#handlers#0.set_trainer(trainer=@train#trainer) if @early_stop else None",
"$@train#trainer.run()"
]
}
1 change: 1 addition & 0 deletions models/spleen_deepedit_annotation/scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .early_stop_score_function import score_function
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os

import torch
import torch.distributed as dist


def score_function(engine):
    """Early-stopping score: mean validation dice from the engine's metrics.

    In a distributed run the local metric is averaged across all ranks so
    every process sees the same score; otherwise the local value is returned.
    """
    metric = engine.state.metrics["val_mean_dice"]
    if not dist.is_initialized():
        return metric
    # Average the metric over all ranks so early stopping is consistent.
    local_rank = os.environ["LOCAL_RANK"]
    metric_tensor = torch.tensor([metric]).to(torch.device(f"cuda:{local_rank}"))
    dist.all_reduce(metric_tensor, op=dist.ReduceOp.SUM)
    metric_tensor /= dist.get_world_size()
    return metric_tensor.item()

0 comments on commit 2a90cff

Please sign in to comment.