From 98c93edfd2a9f66494321ba2f24385e2e8cf111d Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:10:23 +0800 Subject: [PATCH 1/6] fix vista2d readme Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- models/vista2d/configs/metadata.json | 3 ++- models/vista2d/docs/README.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/models/vista2d/configs/metadata.json b/models/vista2d/configs/metadata.json index a1cba412..cd5ae1dc 100644 --- a/models/vista2d/configs/metadata.json +++ b/models/vista2d/configs/metadata.json @@ -1,7 +1,8 @@ { "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20240725.json", - "version": "0.2.9", + "version": "0.3.0", "changelog": { + "0.3.0": "update readme", "0.2.9": "fix unsupported data dtype in findContours", "0.2.8": "remove relative path in readme", "0.2.7": "enhance readme", diff --git a/models/vista2d/docs/README.md b/models/vista2d/docs/README.md index 4ff5cfa1..457ea6e7 100644 --- a/models/vista2d/docs/README.md +++ b/models/vista2d/docs/README.md @@ -63,7 +63,7 @@ AttributeError: partially initialized module 'cv2' has no attribute 'dnn' (most when executing. To resolve this issue, please uninstall OpenCV and then re-install `cellpose` with a command like: ```Bash -pip uninstall -y opencv && rm /usr/local/lib/python3.x/dist-packages/cv2 +pip uninstall -y opencv && rm /usr/local/lib/python3.*/dist-packages/cv2 ``` Alternatively, you can use the following command to install `cellpose` without its dependencies: From 771cfb666f1109a93b8fa273ce34ae41454895e5 Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:19:23 +0800 Subject: [PATCH 2/6] update readme Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- models/vista2d/docs/README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/models/vista2d/docs/README.md b/models/vista2d/docs/README.md index 457ea6e7..afcdb12a 100644 --- a/models/vista2d/docs/README.md +++ b/models/vista2d/docs/README.md @@ -52,7 +52,12 @@ The default dataset for training, validation, and inference is the [Cellpose](ht Additionally, all data lists are available in the `datalists.zip` file located in the root directory of the bundle. Extract the contents of the `.zip` file to access the data lists. ### Dependencies -Please refer to `required_packages_version` in `configs/metadata.json` to install all necessary dependencies before executing. +Please refer to the `required_packages_version` section in `configs/metadata.json` to install all necessary dependencies before execution. If you’re using the MONAI container, you can simply run the commands below and ignore any "opencv-python-headless not installed" error message, as this package is already included in the container. + +``` +pip install fastremap==1.15.0 roifile==2024.5.24 natsort==8.4.0 +pip install --no-deps cellpose +``` Important Note: if your environment already contains OpenCV, installing `cellpose` may lead to conflicts and produce errors such as: @@ -60,13 +65,14 @@ Important Note: if your environment already contains OpenCV, installing `cellpos AttributeError: partially initialized module 'cv2' has no attribute 'dnn' (most likely due to a circular import) ``` -when executing. To resolve this issue, please uninstall OpenCV and then re-install `cellpose` with a command like: +To resolve this, uninstall `OpenCV` first, and then install `cellpose` using the following commands: ```Bash pip uninstall -y opencv && rm /usr/local/lib/python3.*/dist-packages/cv2 ``` +Make sure to replace 3.* with your actual Python version (e.g., 3.10). -Alternatively, you can use the following command to install `cellpose` without its dependencies: +Alternatively, you can install `cellpose` without its dependencies to avoid potential conflicts: ``` pip install --no-deps cellpose From acaae9b5ba21ab804a9989c8a45d5f529173ddf5 Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:30:32 +0800 Subject: [PATCH 3/6] update readme for nccl timeout issue in pathology Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- models/pathology_tumor_detection/configs/metadata.json | 3 ++- models/pathology_tumor_detection/docs/README.md | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/models/pathology_tumor_detection/configs/metadata.json b/models/pathology_tumor_detection/configs/metadata.json index e3c7cfa0..32f7cab6 100644 --- a/models/pathology_tumor_detection/configs/metadata.json +++ b/models/pathology_tumor_detection/configs/metadata.json @@ -1,7 +1,8 @@ { "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20220324.json", - "version": "0.6.1", + "version": "0.6.2", "changelog": { + "0.6.2": "enhance readme for nccl timout issue", "0.6.1": "fix multi-gpu issue", "0.6.0": "use monai 1.4 and update large files", "0.5.9": "update to use monai 1.3.1", diff --git a/models/pathology_tumor_detection/docs/README.md b/models/pathology_tumor_detection/docs/README.md index 46cc761a..2890df64 100644 --- a/models/pathology_tumor_detection/docs/README.md +++ b/models/pathology_tumor_detection/docs/README.md @@ -135,6 +135,16 @@ torchrun --standalone --nnodes=1 --nproc_per_node=2 -m monai.bundle run --config Please note that the distributed training-related options depend on the actual running environment; thus, users may need to remove `--standalone`, modify `--nnodes`, or do some other necessary changes according to the machine used. For more details, please refer to [pytorch's official tutorial](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). +**Note:** When using a container based on [PyTorch 24.0x](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes), you may encounter random NCCL timeout errors. To address this issue, consider the following adjustments: + +- Reduce the `num_workers`: Decreasing the number of data loader workers can help minimize these errors. +- Set `pin_memory` to `False`: Disabling pinned memory may reduce the likelihood of timeouts. +- Switch to the `gloo` backend: As a workaround, you can set the distributed training backend to `gloo` to avoid NCCL-related timeouts. + +You can implement these settings by adding flags like `--train#dataloader#num_workers 0` or `--train#dataloader#pin_memory false`. + +These adjustments can improve stability when working with NCCL in distributed training. + #### Execute inference ``` From 38d76355738b57692a0a8ececfd15a8aac49c3cb Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:33:39 +0800 Subject: [PATCH 4/6] remove unnecessary notes Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- models/pathology_tumor_detection/docs/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/models/pathology_tumor_detection/docs/README.md b/models/pathology_tumor_detection/docs/README.md index 2890df64..cf04c829 100644 --- a/models/pathology_tumor_detection/docs/README.md +++ b/models/pathology_tumor_detection/docs/README.md @@ -143,8 +143,6 @@ Please note that the distributed training-related options depend on the actual r You can implement these settings by adding flags like `--train#dataloader#num_workers 0` or `--train#dataloader#pin_memory false`. -These adjustments can improve stability when working with NCCL in distributed training. - #### Execute inference ``` From 813d8e83067207a704dc2e2a3fe59267e63b82a2 Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:40:35 +0800 Subject: [PATCH 5/6] set pin_memory to false Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- .../pathology_tumor_detection/configs/train.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/models/pathology_tumor_detection/configs/train.json b/models/pathology_tumor_detection/configs/train.json index 52344bad..c2b10fb3 100644 --- a/models/pathology_tumor_detection/configs/train.json +++ b/models/pathology_tumor_detection/configs/train.json @@ -4,14 +4,14 @@ "$import ignite" ], "lr": 0.001, - "num_epochs": 4, + "num_epochs": 2, "val_interval": 1, "bundle_root": ".", "ckpt_dir": "$os.path.join(@bundle_root, 'models')", "output_dir": "$os.path.join(@bundle_root, 'log')", - "training_file": "$os.path.join(@bundle_root, 'training.csv')", - "validation_file": "$os.path.join(@bundle_root, 'validation.csv')", - "dataset_dir": "/workspace/data/medical/pathology", + "training_file": "/workspace/Code/tutorials/pathology/tumor_detection/ignite/training.csv", + "validation_file": "/workspace/Code/tutorials/pathology/tumor_detection/ignite/training.csv", + "dataset_dir": "/workspace/Code/tutorials/pathology/tumor_detection/ignite/", "wsi_reader": "cuCIM", "region_size": [ 768, @@ -173,8 +173,8 @@ "dataloader": { "_target_": "DataLoader", "dataset": "@train#dataset", - "batch_size": 128, - "pin_memory": true, + "batch_size": 50, + "pin_memory": false, "num_workers": 8 }, "inferer": { @@ -324,8 +324,8 @@ "dataloader": { "_target_": "DataLoader", "dataset": "@validate#dataset", - "batch_size": 128, - "pin_memory": true, + "batch_size": 50, + "pin_memory": false, "shuffle": false, "num_workers": 8 }, From b3542a8f6fb2920cd33375fda519ca68952b401f Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Thu, 10 Oct 2024 17:25:42 +0800 Subject: [PATCH 6/6] revert change Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- models/pathology_tumor_detection/configs/train.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/models/pathology_tumor_detection/configs/train.json b/models/pathology_tumor_detection/configs/train.json index c2b10fb3..78dba67f 100644 --- a/models/pathology_tumor_detection/configs/train.json +++ b/models/pathology_tumor_detection/configs/train.json @@ -4,14 +4,14 @@ "$import ignite" ], "lr": 0.001, - "num_epochs": 2, + "num_epochs": 4, "val_interval": 1, "bundle_root": ".", "ckpt_dir": "$os.path.join(@bundle_root, 'models')", "output_dir": "$os.path.join(@bundle_root, 'log')", - "training_file": "/workspace/Code/tutorials/pathology/tumor_detection/ignite/training.csv", - "validation_file": "/workspace/Code/tutorials/pathology/tumor_detection/ignite/training.csv", - "dataset_dir": "/workspace/Code/tutorials/pathology/tumor_detection/ignite/", + "training_file": "$os.path.join(@bundle_root, 'training.csv')", + "validation_file": "$os.path.join(@bundle_root, 'validation.csv')", + "dataset_dir": "/workspace/data/medical/pathology", "wsi_reader": "cuCIM", "region_size": [ 768, @@ -173,7 +173,7 @@ "dataloader": { "_target_": "DataLoader", "dataset": "@train#dataset", - "batch_size": 50, + "batch_size": 128, "pin_memory": false, "num_workers": 8 }, @@ -324,7 +324,7 @@ "dataloader": { "_target_": "DataLoader", "dataset": "@validate#dataset", - "batch_size": 50, + "batch_size": 128, "pin_memory": false, "shuffle": false, "num_workers": 8