Merge branch 'open-mmlab:main' into pipeline

open-mmlab · Oct 24, 2023 · 7562cba · 7562cba
2 parents d6b4f5a + c65187c
commit 7562cba
Show file tree

Hide file tree

Showing 42 changed files with 621 additions and 205 deletions.
diff --git a/.circleci/test.yml b/.circleci/test.yml
@@ -49,11 +49,38 @@ jobs:
           command: pip install -e . -v
       - run:
           name: Install unit tests dependencies
-          command: pip install -r requirements/tests.txt
+          command: pip install -r requirements/tests_lite.txt
       - run:
           name: Run unit tests
           command: pytest tests/test_config tests/test_registry tests/test_fileio tests/test_logging tests/test_utils --ignore=tests/test_utils/test_dl_utils
 
+  build_lite:
+    parameters:
+      # The python version must match available image tags in
+      # https://circleci.com/developer/images/image/cimg/python
+      python:
+        type: string
+        default: "3.7.4"
+    docker:
+      - image: cimg/python:<< parameters.python >>
+    resource_class: large
+    steps:
+      - checkout
+      - run:
+          name: Upgrade pip
+          command: |
+            pip install pip --upgrade
+            pip --version
+      - run:
+          name: Build MMEngine from source
+          command: MMENGINE_LITE=1 pip install -e . -v
+      - run:
+          name: Install unit tests dependencies
+          command: pip install -r requirements/tests_lite.txt
+      - run:
+          name: Run unit tests
+          command: pytest tests/test_config tests/test_registry tests/test_logging tests/test_utils --ignore=tests/test_utils/test_dl_utils
+
   build_cpu:
     parameters:
       # The python version must match available image tags in
@@ -110,16 +137,22 @@ jobs:
         type: string
       cuda:
         type: enum
-        enum: ["10.1", "10.2", "11.1", "11.7"]
+        enum: ["10.1", "10.2", "11.1", "11.7", "11.8"]
       cudnn:
         type: integer
         default: 7
     machine:
-      image: ubuntu-2004-cuda-11.4:202110-01
+      image: linux-cuda-11:default
       docker_layer_caching: true
-    resource_class: gpu.nvidia.small
+    resource_class: gpu.nvidia.small.multi
     steps:
       - checkout
+      - run:
+          name: Install nvidia-container-toolkit and Restart Docker
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y nvidia-container-toolkit
+            sudo systemctl restart docker
       - run:
           name: Build Docker image
           command: |
@@ -217,6 +250,10 @@ workflows:
             branches:
               ignore:
                 - main
+      - build_lite:
+          name: build lite
+          requires:
+            - lint
       - build_without_torch:
           name: build without torch
           requires:
@@ -230,8 +267,8 @@ workflows:
             - lint
       - build_cpu:
           name: maximum_version_cpu
-          torch: 2.0.0
-          torchvision: 0.15.1
+          torch: 2.1.0
+          torchvision: 0.16.0
           python: 3.9.0
           requires:
             - minimum_version_cpu
@@ -259,8 +296,8 @@ workflows:
             - hold
       - build_cuda:
           name: maximum_version_gpu
-          torch: 2.0.0
-          cuda: "11.7"
+          torch: 2.1.0
+          cuda: "11.8"
           cudnn: 8
           requires:
             - hold

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -1 +1 @@
-We appreciate all contributions to improve MMEngine. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) in MMCV for more details about the contributing guideline.
+We appreciate all contributions to improve MMEngine. Please refer to [Contributing to OpenMMLab](https://mmengine.readthedocs.io/en/latest/notes/contributing.html) for more details about the contributing guideline.
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -1,4 +1,4 @@
-Thanks for your contribution and we appreciate it a lot. The following instructions would make your pull request more healthy and more easily get feedback. If you do not understand some items, don't worry, just make the pull request and seek help from maintainers.
+Thanks for your contribution; we appreciate it a lot. The following instructions will make your pull request healthier and help it get feedback more easily. If you do not understand some items, don't worry, just make the pull request and seek help from maintainers. By the way, if you're not familiar with how to use pre-commit to fix lint issues or add unit tests, please refer to [Contributing to OpenMMLab](https://mmengine.readthedocs.io/en/latest/notes/contributing.html).
 
 ## Motivation
 
@@ -21,5 +21,5 @@ If this PR introduces a new feature, it is better to list some use cases here, a
 
 1. Pre-commit or other linting tools are used to fix the potential lint issues.
 2. The modification is covered by complete unit tests. If not, please add more unit tests to ensure the correctness.
-3. If the modification has potential influence on downstream projects, this PR should be tested with downstream projects, like MMDet or MMCls.
+3. If the modification has potential influence on downstream projects, this PR should be tested with downstream projects, like MMDetection or MMPretrain.
 4. The documentation has been modified accordingly, like docstring or example tutorials.
diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml
@@ -58,7 +58,7 @@ jobs:
     strategy:
       matrix:
         python-version: [3.7]
-        torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.0, 1.13.0]
+        torch: [1.6.0, 1.7.1, 1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.0, 1.13.0, 2.0.0, 2.1.0]
         include:
           - torch: 1.6.0
             torchvision: 0.7.0
@@ -79,6 +79,14 @@ jobs:
           - torch: 2.0.0
             torchvision: 0.15.1
             python-version: 3.8
+          - torch: 2.1.0
+            torchvision: 0.16.0
+            python-version: 3.8
+        exclude:
+          - torch: 2.0.0
+            python-version: 3.7
+          - torch: 2.1.0
+            python-version: 3.7
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
@@ -228,15 +236,17 @@ jobs:
     runs-on: macos-latest
     strategy:
       matrix:
-        python-version: [3.7]
-        torch: [1.6.0, 1.8.1, 1.13.0]
+        python-version: [3.8]
+        torch: [1.6.0, 1.8.1, 1.13.0, 2.1.0]
         include:
           - torch: 1.6.0
             torchvision: 0.7.0
           - torch: 1.8.1
             torchvision: 0.9.1
           - torch: 1.13.0
             torchvision: 0.14.0
+          - torch: 2.1.0
+            torchvision: 0.16.0
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python 3.7
@@ -273,13 +283,17 @@ jobs:
       matrix:
         python-version: [3.7]
         platform: [cpu, cu111]
-        torch: [1.8.1]
-        torchvision: [0.9.1]
+        torch: [1.8.1, 2.1.0]
+        torchvision: [0.9.1, 0.16.0]
         include:
           - python-version: 3.8
             platform: cu117
             torch: 2.0.0
             torchvision: 0.15.1
+          - python-version: 3.8
+            platform: cu118
+            torch: 2.1.0
+            torchvision: 0.16.0
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}

diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml
@@ -136,15 +136,13 @@ jobs:
     runs-on: windows-2022
     strategy:
       matrix:
-        python-version: [3.7]
-        platform: [cpu, cu111]
-        torch: [1.8.1]
-        torchvision: [0.9.1]
+        torch: [2.1.0]
+        torchvision: [0.16.0]
         include:
           - python-version: 3.8
-            platform: cu117
-            torch: 2.0.0
-            torchvision: 0.15.1
+            platform: cu118
+            torch: 2.1.0
+            torchvision: 0.16.0
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}

diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -13,3 +13,4 @@ python:
   install:
     - requirements: requirements/runtime.txt
     - requirements: requirements/docs.txt
+    - requirements: requirements/docs_extra.txt
diff --git a/README.md b/README.md
@@ -58,29 +58,17 @@ English | [简体中文](README_zh-CN.md)
 
 ## What's New
 
-v0.8.4 was released on 2023-08-03.
+v0.9.0 was released on 2023-10-10.
 
 Highlights:
 
-- Support enabling `efficient_conv_bn_eval` for efficient convolution and batch normalization. See [save memory on gpu](https://mmengine.readthedocs.io/en/latest/common_usage/save_gpu_memory.html#save-memory-on-gpu) for more details
+- Support training with [ColossalAI](https://colossalai.org/). Refer to the [Training Large Models](https://mmengine.readthedocs.io/en/latest/common_usage/large_model_training.html#colossalai) for more detailed usages.
 
-- Add an [example](./examples/llama2/) to finetune Llama2.
+- Support gradient checkpointing. Refer to the [Save Memory on GPU](https://mmengine.readthedocs.io/en/latest/common_usage/save_gpu_memory.html#gradient-checkpointing) for more details.
 
-- Support training with [FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html?highlight=fsdp) and [DeepSpeed](https://www.deepspeed.ai/). Refer to the [Training Large Models](https://mmengine.readthedocs.io/en/latest/common_usage/large_model_training.html) for more detailed usages.
+- Support multiple visualization backends, including `NeptuneVisBackend`, `DVCLiveVisBackend` and `AimVisBackend`. Refer to [Visualization Backends](https://mmengine.readthedocs.io/en/latest/common_usage/visualize_training_log.html) for more details.
 
-- Introduce the pure Python style configuration file:
-
-  - Support navigating to base configuration file in IDE
-  - Support navigating to base variable in IDE
-  - Support navigating to source code of class in IDE
-  - Support inheriting two configuration files containing the same field
-  - Load the configuration file without other third-party requirements
-
-  Refer to the [tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta) for more detailed usages.
-
-  ![new-config-en](https://github.com/open-mmlab/mmengine/assets/57566630/7eb41748-9374-488f-901e-fcd7f0d3c8a1)
-
-Read [Changelog](./docs/en/notes/changelog.md#v083-08032023) for more details.
+Read [Changelog](./docs/en/notes/changelog.md#v090-10102023) for more details.
 
 ## Table of Contents
 

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -58,29 +58,17 @@
 
 ## 最近进展
 
-最新版本 v0.8.4 在 2023.08.03 发布。
+最新版本 v0.9.0 在 2023.10.10 发布。
 
 亮点：
 
-- 支持使用 `efficient_conv_bn_eval` 参数开启更高效的 `ConvBN` 推理模式。详见[节省显存文档](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/save_gpu_memory.html)
+- 支持使用 [ColossalAI](https://colossalai.org/) 进行训练。可阅读[大模型训练](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/large_model_training.html#colossalai)了解用法。
 
-- 新增微调 Llama2 的[示例](./examples/llama2/)。
+- 支持梯度检查点。详见[用法](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/save_gpu_memory.html#id3)。
 
-- 支持使用 [FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html?highlight=fsdp) 和 [DeepSpeed](https://www.deepspeed.ai/) 进行训练。可阅读[大模型训练](https://mmengine.readthedocs.io/zh_cn/latest/common_usage/large_model_training.html)了解用法。
+- 支持多种可视化后端，包括 `NeptuneVisBackend`、`DVCLiveVisBackend` 和 `AimVisBackend`。可阅读[可视化后端](https://mmengine.readthedocs.io/zh_CN/latest/common_usage/visualize_training_log.html)了解用法。
 
-- 引入纯 Python 风格的配置文件：
-
-  - 支持在 IDE 中导航到基础配置文件
-  - 支持在 IDE 中导航到基础变量
-  - 支持在 IDE 中导航到类的源代码
-  - 支持继承包含相同字段的两个配置文件
-  - 在加载配置文件时不需要其他第三方依赖
-
-  请参考[教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/config.html#python-beta)以获取更详细的用法说明。
-
-  ![new-config-zh_cn](https://github.com/open-mmlab/mmengine/assets/57566630/c2da9a73-c911-4f78-8253-e3f29496d9f8)
-
-如果想了解更多版本更新细节和历史信息，请阅读[更新日志](./docs/en/notes/changelog.md#v083-08032023)
+如果想了解更多版本更新细节和历史信息，请阅读[更新日志](./docs/en/notes/changelog.md#v090-10102023)
 
 ## 目录
 

diff --git a/docs/en/common_usage/better_optimizers.md b/docs/en/common_usage/better_optimizers.md
@@ -96,7 +96,7 @@ runner.train()
 [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) provides `AdamW8bit`, `Adam8bit`, `Adagrad8bit`, `PagedAdam8bit`, `PagedAdamW8bit`, `LAMB8bit`, `LARS8bit`, `RMSprop8bit`, `Lion8bit`, `PagedLion8bit` and `SGD8bit` optimizers.
 
 ```{note}
-If you use the optimizer provided by bitsandbytes, you need to upgrade mmengine to `0.8.5`.
+If you use the optimizer provided by bitsandbytes, you need to upgrade mmengine to `0.9.0`.
 ```
 
 - Installation
@@ -127,7 +127,7 @@ runner.train()
 [transformers](https://github.com/huggingface/transformers) provides the `Adafactor` optimizer.
 
 ```{note}
-If you use the optimizer provided by transformers, you need to upgrade mmengine to `0.8.5`.
+If you use the optimizer provided by transformers, you need to upgrade mmengine to `0.9.0`.
 ```
 
 - Installation

diff --git a/docs/en/common_usage/debug_tricks.md b/docs/en/common_usage/debug_tricks.md
@@ -50,6 +50,56 @@ As we can see, the number of iterations has changed to `313`. Compared to before
 02/20 14:45:01 - mmengine - INFO - Epoch(train)   [1][300/313]  lr: 1.0000e-01  eta: 0:20:39  time: 0.0143  data_time: 0.0003  memory: 214  loss: 1.814
 ```
 
+## Training for a fixed number of iterations (epoch-based training)
+
+While debugging, it is sometimes necessary to train for several epochs, for example to debug the validation process or to check whether checkpoint saving works as expected. However, if the dataset is too large, it may take a long time to complete one epoch. In such cases, you can configure the `num_batch_per_epoch` parameter of the dataloader.
+
+```{note}
+The `num_batch_per_epoch` parameter is not a native parameter of PyTorch dataloaders but an additional parameter added by MMEngine to achieve this functionality.
+```
+
+Let's take the model defined in [15 minutes to get started with MMEngine](../get_started/15_minutes.md) as an example. By setting `num_batch_per_epoch=5` in both `train_dataloader` and `val_dataloader`, you can ensure that one epoch consists of only 5 iterations.
+
+```python
+train_dataloader = dict(
+    batch_size=32,
+    dataset=train_set,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    collate_fn=dict(type='default_collate'),
+    num_batch_per_epoch=5)
+val_dataloader = dict(
+    batch_size=32,
+    dataset=valid_set,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    collate_fn=dict(type='default_collate'),
+    num_batch_per_epoch=5)
+runner = Runner(
+    model=MMResNet50(),
+    work_dir='./work_dir',
+    train_dataloader=train_dataloader,
+    optim_wrapper=dict(optimizer=dict(type=SGD, lr=0.001, momentum=0.9)),
+    train_cfg=dict(by_epoch=True, max_epochs=2, val_interval=1),
+    val_dataloader=val_dataloader,
+    val_cfg=dict(),
+    val_evaluator=dict(type=Accuracy),
+    launcher=args.launcher,
+)
+runner.train()
+```
+
+As we can see, the number of iterations has been reduced to 5. Compared to the original setting, this allows you to complete one epoch more quickly.
+
+```
+08/18 20:27:22 - mmengine - INFO - Epoch(train) [1][5/5]  lr: 1.0000e-03  eta: 0:00:02  time: 0.4566  data_time: 0.0074  memory: 477  loss: 6.7576
+08/18 20:27:22 - mmengine - INFO - Saving checkpoint at 1 epochs
+08/18 20:27:22 - mmengine - WARNING - `save_param_scheduler` is True but `self.param_schedulers` is None, so skip saving parameter schedulers
+08/18 20:27:23 - mmengine - INFO - Epoch(val) [1][5/5]    accuracy: 7.5000  data_time: 0.0044  time: 0.0146
+08/18 20:27:23 - mmengine - INFO - Exp name: 20230818_202715
+08/18 20:27:23 - mmengine - INFO - Epoch(train) [2][5/5]  lr: 1.0000e-03  eta: 0:00:00  time: 0.2501  data_time: 0.0077  memory: 477  loss: 5.3044
+08/18 20:27:23 - mmengine - INFO - Saving checkpoint at 2 epochs
+08/18 20:27:24 - mmengine - INFO - Epoch(val) [2][5/5]    accuracy: 12.5000  data_time: 0.0058  time: 0.0175
+```
+
 ## Find Unused Parameters
 
 When training with multiple GPUs, if some of the model's parameters are involved in the forward computation but are not used in producing the loss, the program may throw the following error:

diff --git a/docs/en/common_usage/large_model_training.md b/docs/en/common_usage/large_model_training.md
@@ -1,4 +1,4 @@
-# Traning Big Models
+# Training Big Models
 
 When training large models, significant resources are required. The memory of a single GPU is often insufficient to meet the training needs. As a result, techniques for training large models have been developed, and one typical approach is [DeepSpeed ZeRO](https://www.deepspeed.ai/tutorials/zero/#zero-overview). DeepSpeed ZeRO supports optimizer, gradient, and parameter sharding.
 
@@ -85,7 +85,7 @@ torchrun --nproc-per-node 2 examples/distributed_training_with_flexible_runner.p
 ```
 
 <details>
-<summary>traning log</summary>
+<summary>training log</summary>
 
 ```
 07/03 13:04:17 - mmengine - INFO - Epoch(train)  [1][ 10/196]  lr: 3.3333e-04  eta: 0:13:14  time: 0.4073  data_time: 0.0335  memory: 970  loss: 6.1887
@@ -157,7 +157,7 @@ torchrun --nproc-per-node 2 examples/distributed_training_with_flexible_runner.p
 ```
 
 <details>
-<summary>traning log</summary>
+<summary>training log</summary>
 
 ```
 07/03 13:05:37 - mmengine - INFO - Epoch(train)  [1][ 10/196]  lr: 3.3333e-04  eta: 0:08:28  time: 0.2606  data_time: 0.0330  memory: 954  loss: 6.1265
@@ -185,7 +185,7 @@ torchrun --nproc-per-node 2 examples/distributed_training_with_flexible_runner.p
 
 ## ColossalAI
 
-[ColossalAI](https://colossalai.org/) is a comprehensive large-scale model training system that utilizes efficient parallelization techniques. Starting from MMEngine v0.8.5, it supports training models using optimization strategies from the ZeRO series in ColossalAI.
+[ColossalAI](https://colossalai.org/) is a comprehensive large-scale model training system that utilizes efficient parallelization techniques. Starting from MMEngine v0.9.0, it supports training models using optimization strategies from the ZeRO series in ColossalAI.
 
 Install ColossalAI with a version greater than v0.3.1. This version requirement is due to a [bug](https://github.com/hpcaitech/ColossalAI/issues/4393) in v0.3.1 that causes some program blocking, which has been fixed in later versions. If the highest available version of ColossalAI is still v0.3.1, it is recommended to install ColossalAI from the source code on the main branch.
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		We appreciate all contributions to improve MMEngine. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) in MMCV for more details about the contributing guideline.
		We appreciate all contributions to improve MMEngine. Please refer to [Contributing to OpenMMLab](https://mmengine.readthedocs.io/en/latest/notes/contributing.html) for more details about the contributing guideline.