From 43963f89ad85ff40472674d579a328fe8532202e Mon Sep 17 00:00:00 2001
From: sunjiahao1999 <578431509@qq.com>
Date: Thu, 28 Dec 2023 22:26:18 +0800
Subject: [PATCH] fix mvfcos3d config && add doc
---
.../_base_/datasets/waymoD5-mv3d-3class.py | 20 +++---
configs/mvfcos3d/README.md | 62 +++++++++++++++++++
...fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py} | 2 +-
...dcn_centerhead_16xb2_waymoD5-3d-3class.py} | 0
configs/pgd/README.md | 17 +++++
5 files changed, 90 insertions(+), 11 deletions(-)
create mode 100644 configs/mvfcos3d/README.md
rename configs/{dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py => mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py} (93%)
rename configs/{dfm/multiview-dfm_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py => mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py} (100%)
diff --git a/configs/_base_/datasets/waymoD5-mv3d-3class.py b/configs/_base_/datasets/waymoD5-mv3d-3class.py
index 43ed90f9e6..bacf804339 100644
--- a/configs/_base_/datasets/waymoD5-mv3d-3class.py
+++ b/configs/_base_/datasets/waymoD5-mv3d-3class.py
@@ -1,5 +1,5 @@
# dataset settings
-# D3 in the config name means the whole dataset is divided into 3 folds
+# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
@@ -49,9 +49,6 @@
with_label_3d=True,
with_bbox_depth=True),
dict(type='MultiViewWrapper', transforms=train_transforms),
- # randomness_keys= [
- # 'scale', 'scale_factor', 'crop_size', 'img_crop_offset', 'flip',
- # 'flip_direction']),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(
@@ -74,7 +71,10 @@
to_float32=True,
backend_args=backend_args),
dict(type='MultiViewWrapper', transforms=test_transforms),
- dict(type='Pack3DDetInputs', keys=['img'], meta_keys=[
+ dict(
+ type='Pack3DDetInputs',
+ keys=['img'],
+ meta_keys=[
'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor',
'sample_idx', 'context_name', 'timestamp', 'lidar2cam',
'num_ref_frames', 'num_views'
@@ -88,7 +88,10 @@
to_float32=True,
backend_args=backend_args),
dict(type='MultiViewWrapper', transforms=test_transforms),
- dict(type='Pack3DDetInputs', keys=['img'], meta_keys=[
+ dict(
+ type='Pack3DDetInputs',
+ keys=['img'],
+ meta_keys=[
'box_type_3d', 'img_shape', 'ori_cam2img', 'scale_factor',
'sample_idx', 'context_name', 'timestamp', 'lidar2cam',
'num_ref_frames', 'num_views'
@@ -170,11 +173,8 @@
backend_args=backend_args))
val_evaluator = dict(
type='WaymoMetric',
- ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/cam_gt.bin',
- pklfile_prefix='./mmdet3d_mvfoc3d_pred',
- convert_kitti_format=False,
metric='LET_mAP',
- backend_args=backend_args)
+ result_prefix='./mvfoc3d_pred')
test_evaluator = val_evaluator
diff --git a/configs/mvfcos3d/README.md b/configs/mvfcos3d/README.md
new file mode 100644
index 0000000000..934ec1ad6c
--- /dev/null
+++ b/configs/mvfcos3d/README.md
@@ -0,0 +1,62 @@
+# MV-FCOS3D++: Multi-View Camera-Only 4D Object Detection with Pretrained Monocular Backbones
+
+> [MV-FCOS3D++: Multi-View Camera-Only 4D Object Detection with Pretrained Monocular Backbones](https://arxiv.org/abs/2207.12716)
+
+
+
+## Abstract
+
+In this technical report, we present our solution, dubbed MV-FCOS3D++, for the Camera-Only 3D Detection track in Waymo Open Dataset Challenge 2022. For multi-view camera-only 3D detection, methods based on bird-eye-view or 3D geometric representations can leverage the stereo cues from overlapped regions between adjacent views and directly perform 3D detection without hand-crafted post-processing. However, it lacks direct semantic supervision for 2D backbones, which can be complemented by pretraining simple monocular-based detectors. Our solution is a multi-view framework for 4D detection following this paradigm. It is built upon a simple monocular detector FCOS3D++, pretrained only with object annotations of Waymo, and converts multi-view features to a 3D grid space to detect 3D objects thereon. A dual-path neck for single-frame understanding and temporal stereo matching is devised to incorporate multi-frame information. Our method finally achieves 49.75% mAPL with a single model and wins 2nd place in the WOD challenge, without any LiDAR-based depth supervision during training. The code will be released at [this https URL](https://github.com/Tai-Wang/Depth-from-Motion).
+
+
+
+
+
+## Introduction
+
+We implement multi-view FCOS3D++ and provide the results on the Waymo dataset.
+
+## Usage
+
+### Training commands
+
+1. You should train PGD first:
+
+```bash
+bash tools/dist_train.sh configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py 8
+```
+
+2. Given the pre-trained PGD backbone, you can train multi-view FCOS3D++:
+
+```bash
+bash tools/dist_train.sh configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py 8 --cfg-options load_from=${PRETRAINED_CHECKPOINT}
+```
+
+**Note**: the path of `load_from` needs to be changed to your own pre-trained PGD checkpoint.
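+
+For example, if PGD was trained with the command above and its checkpoint is stored under the default `work_dirs` layout, the full command would look roughly like this (the exact checkpoint name depends on your run):
+
+```bash
+bash tools/dist_train.sh configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py 8 --cfg-options load_from=work_dirs/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d/epoch_24.pth
+```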
+
+## Results and models
+
+### Waymo
+
+| Backbone | Load Interval | mAPL | mAP | mAPH | Download |
+| :--------------------------------------------------------------------: | :-----------: | :--: | :--: | :--: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet101+DCN](./multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py) | 5x | 38.2 | 52.9 | 49.5 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class_20231127_122815.log) |
+| above @ Car | | 56.5 | 73.3 | 72.3 | |
+| above @ Pedestrian | | 34.8 | 49.5 | 43.1 | |
+| above @ Cyclist | | 23.2 | 35.9 | 33.3 | |
+
+**Note**:
+
+Regrettably, we are unable to provide the pre-trained model weights due to the [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
+
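+If you train a checkpoint yourself, the LET metrics above can be reproduced with the standard distributed test script (the checkpoint path below is a placeholder for your own file):
+
+```bash
+bash tools/dist_test.sh configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py ${CHECKPOINT_FILE} 8
+```
+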
+## Citation
+
+```latex
+@article{wang2022mvfcos3d++,
+ title={{MV-FCOS3D++: Multi-View} Camera-Only 4D Object Detection with Pretrained Monocular Backbones},
+ author={Wang, Tai and Lian, Qing and Zhu, Chenming and Zhu, Xinge and Zhang, Wenwei},
+ journal={arXiv preprint},
+ year={2022}
+}
+```
diff --git a/configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py b/configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py
similarity index 93%
rename from configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py
rename to configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py
index cabc368da7..b75a6db5b3 100644
--- a/configs/dfm/multiview-dfm_r101-dcn_16xb2_waymoD5-3d-3class.py
+++ b/configs/mvfcos3d/multiview-fcos3d_r101-dcn_8xb2_waymoD5-3d-3class.py
@@ -44,6 +44,6 @@
)
log_level = 'INFO'
-load_from = 'work_dirs/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d/epoch_24.pth'
+load_from = None
resume = False
find_unused_parameters = True # only 1 of 4 FPN outputs is used
diff --git a/configs/dfm/multiview-dfm_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py b/configs/mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
similarity index 100%
rename from configs/dfm/multiview-dfm_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
rename to configs/mvfcos3d/multiview-fcos3d_r101-dcn_centerhead_16xb2_waymoD5-3d-3class.py
diff --git a/configs/pgd/README.md b/configs/pgd/README.md
index 2237a0f4ef..6c41522b3d 100644
--- a/configs/pgd/README.md
+++ b/configs/pgd/README.md
@@ -50,6 +50,23 @@ Note: mAP represents Car moderate 3D strict AP11 / AP40 results. Because of the
| [above w/ finetune](./pgd_r101-caffe_fpn_head-gn_16xb2-2x_nus-mono3d_finetune.py) | 2x | 9.20 | 35.8 | 42.5 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135-5ec7c1cd.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/pgd/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune/pgd_r101_caffe_fpn_gn-head_2x16_2x_nus-mono3d_finetune_20211114_162135.log.json) |
| above w/ tta | 2x | 9.20 | 36.8 | 43.1 | |
+### Waymo
+
+| Backbone | Load Interval | Camera view | mAPL | mAP | mAPH | Download |
+| :--------------------------------------------------------------------------: | :-----------: | :-----------: | :--: | :--: | :---: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet101 w/ DCN](./pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d.py) | 3x | front-of-view | 15.8 | 22.7 | 21.51 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-fov-mono3d_20231107_164117.log) |
+| above @ Car | | | 36.7 | 51.6 | 51.0 | |
+| above @ Pedestrian | | | 9.0 | 14.1 | 11.4 | |
+| above @ Cyclist | | | 1.6 | 2.5 | 2.2 | |
+| [ResNet101 w/ DCN](./pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py) | 3x | multi-view | 20.8 | 29.3 | 27.7 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d_20231120_202732.log) |
+| above @ Car | | | 41.2 | 56.1 | 55.2 | |
+| above @ Pedestrian | | | 20.0 | 29.6 | 25.8 | |
+| above @ Cyclist | | | 1.4 | 2.2 | 2.0 | |
+
+**Note**:
+
+Regrettably, we are unable to provide the pre-trained model weights due to the [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
+
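+The Waymo results above come from the two Waymo mono3d configs referenced in the table; as a rough sketch, they can be trained with the standard distributed script, e.g. for the multi-view config:
+
+```bash
+bash tools/dist_train.sh configs/pgd/pgd_r101_fpn_gn-head_dcn_8xb3-2x_waymoD3-mv-mono3d.py 8
+```
+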
## Citation
```latex