From 894c4dceafb40d0ce69e103ee2de131636186bf2 Mon Sep 17 00:00:00 2001 From: Danila Rukhovich Date: Sun, 15 Aug 2021 18:35:39 +0300 Subject: [PATCH] [Feature] Add center sampling for indoor datasets (#23) * try centerness topk configs * add perspective_ and imvoxelnet_sunrgbd configs with centerness top18 * fix typo in perspective config * try topk=27 for sunrgbd and scannet * try more regress ranges for sunrgbd * try less regress ranges for scannet * rename indoor configs to _top27 * add resnet 18 and 34 experiments * ready to merge center sampling to master * fix typo in readme * fix topk parameter to 28 --- README.md | 1 + .../imvoxelnet_perspective_sunrgbd_top27.py | 128 +++++++++++++++++ .../imvoxelnet/imvoxelnet_scannet_top27.py | 132 +++++++++++++++++ .../imvoxelnet/imvoxelnet_sunrgbd_top27.py | 126 ++++++++++++++++ .../imvoxelnet_total_sunrgbd_top27.py | 136 ++++++++++++++++++ mmdet3d/models/dense_heads/imvoxel_head.py | 20 +++ 6 files changed, 543 insertions(+) create mode 100644 configs/imvoxelnet/imvoxelnet_perspective_sunrgbd_top27.py create mode 100644 configs/imvoxelnet/imvoxelnet_scannet_top27.py create mode 100644 configs/imvoxelnet/imvoxelnet_sunrgbd_top27.py create mode 100644 configs/imvoxelnet/imvoxelnet_total_sunrgbd_top27.py diff --git a/README.md b/README.md index b49de52..c7ad869 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ # ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection **News**: + * :fire: August, 2021. We adapt center sampling for indoor detection. New configs are in [configs/imvoxelnet/*_top27.py](https://github.com/saic-vul/imvoxelnet/tree/master/configs/imvoxelnet). For example, this improves `ScanNet` `mAP` by more than 5%. Models and preprint will be updated soon. * :fire: July, 2021. We update `ScanNet` image preprocessing both [here](https://github.com/saic-vul/imvoxelnet/pull/21) and in [mmdetection3d](https://github.com/open-mmlab/mmdetection3d/pull/696). 
* :fire: June, 2021. `ImVoxelNet` for `KITTI` is now [supported](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/imvoxelnet) in [mmdetection3d](https://github.com/open-mmlab/mmdetection3d). diff --git a/configs/imvoxelnet/imvoxelnet_perspective_sunrgbd_top27.py b/configs/imvoxelnet/imvoxelnet_perspective_sunrgbd_top27.py new file mode 100644 index 0000000..3cd6ae6 --- /dev/null +++ b/configs/imvoxelnet/imvoxelnet_perspective_sunrgbd_top27.py @@ -0,0 +1,128 @@ +model = dict( + type='ImVoxelNet', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=64, + num_outs=4), + neck_3d=dict( + type='ImVoxelNeck', + channels=[64, 128, 256, 512], + out_channels=64, + down_layers=[1, 2, 3, 4], + up_layers=[3, 2, 1], + conditional=False), + bbox_head=dict( + type='SunRgbdImVoxelHead', + n_classes=30, + n_channels=64, + n_convs=0, + n_reg_outs=7, + centerness_topk=28), + n_voxels=(80, 80, 32), + voxel_size=(.08, .08, .08)) +train_cfg = dict() +test_cfg = dict( + nms_pre=1000, + nms_thr=.15, + use_rotate_nms=True, + score_thr=.0) +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +dataset_type = 'SunRgbdPerspectiveMultiViewDataset' +data_root = 'data/sunrgbd/' +class_names = ('recycle_bin', 'cpu', 'paper', 'toilet', 'stool', 'whiteboard', 'coffee_table', 'picture', + 'keyboard', 'dresser', 'painting', 'bookshelf', 'night_stand', 'endtable', 'drawer', 'sink', + 'monitor', 'computer', 'cabinet', 'shelf', 'lamp', 'garbage_bin', 'box', 'bed', 'sofa', + 'sofa_chair', 'pillow', 'desk', 'table', 'chair') + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=1, + transforms=[ + dict(type='LoadImageFromFile'), + 
dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Resize', img_scale=[(512, 384), (768, 576)], multiscale_mode='range', keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32)]), + dict(type='SunRgbdRandomFlip'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])] +test_pipeline = [ + dict( + type='MultiViewPipeline', + n_images=1, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='Resize', img_scale=(640, 480), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32)]), + dict(type='DefaultFormatBundle3D', class_names=class_names, with_label=False), + dict(type='Collect3D', keys=['img'])] +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_perspective_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + filter_empty_gt=True, + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_perspective_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_perspective_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) +optimizer_config = dict(grad_clip=dict(max_norm=35., norm_type=2)) +lr_config = dict(policy='step', step=[8, 11]) +total_epochs = 12 + +checkpoint_config = dict(interval=1, max_keep_ckpts=1) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +evaluation = 
dict(interval=1) +dist_params = dict(backend='nccl') +find_unused_parameters = True # todo: fix number of FPN outputs +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/imvoxelnet/imvoxelnet_scannet_top27.py b/configs/imvoxelnet/imvoxelnet_scannet_top27.py new file mode 100644 index 0000000..19fc411 --- /dev/null +++ b/configs/imvoxelnet/imvoxelnet_scannet_top27.py @@ -0,0 +1,132 @@ +model = dict( + type='ImVoxelNet', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=64, + num_outs=4), + neck_3d=dict( + type='ImVoxelNeck', + channels=[64, 128, 256, 512], + out_channels=64, + down_layers=[1, 2, 3, 4], + up_layers=[3, 2, 1], + conditional=False), + bbox_head=dict( + type='ScanNetImVoxelHead', + loss_bbox=dict(type='AxisAlignedIoULoss', loss_weight=1.0), + n_classes=18, + n_channels=64, + n_convs=0, + n_reg_outs=6, + centerness_topk=28), + voxel_size=(.08, .08, .08), + n_voxels=(80, 80, 32)) +train_cfg = dict() +test_cfg = dict( + nms_pre=1000, + iou_thr=.15, + score_thr=.0) +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +dataset_type = 'ScanNetMultiViewDataset' +data_root = 'data/scannet/' +class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin') + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=20, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='Resize', img_scale=(640, 480), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=(480, 640)) + ]), + 
dict(type='RandomShiftOrigin', std=(.7, .7, .0)), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='MultiViewPipeline', + n_images=50, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='Resize', img_scale=(640, 480), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=(480, 640)) + ]), + dict(type='DefaultFormatBundle3D', class_names=class_names, with_label=False), + dict(type='Collect3D', keys=['img']) +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=1, + train=dict( + type='RepeatDataset', + times=3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + filter_empty_gt=True, + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth') +) + +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) +optimizer_config = dict(grad_clip=dict(max_norm=35., norm_type=2)) +lr_config = dict(policy='step', step=[8, 11]) +total_epochs = 12 + +checkpoint_config = dict(interval=1, max_keep_ckpts=1) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=1) +dist_params = dict(backend='nccl') +find_unused_parameters = True # todo: fix number of FPN outputs +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git 
a/configs/imvoxelnet/imvoxelnet_sunrgbd_top27.py b/configs/imvoxelnet/imvoxelnet_sunrgbd_top27.py new file mode 100644 index 0000000..e5c0975 --- /dev/null +++ b/configs/imvoxelnet/imvoxelnet_sunrgbd_top27.py @@ -0,0 +1,126 @@ +model = dict( + type='ImVoxelNet', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=64, + num_outs=4), + neck_3d=dict( + type='ImVoxelNeck', + channels=[64, 128, 256, 512], + out_channels=64, + down_layers=[1, 2, 3, 4], + up_layers=[3, 2, 1], + conditional=False), + bbox_head=dict( + type='SunRgbdImVoxelHead', + n_classes=10, + n_channels=64, + n_convs=0, + n_reg_outs=7, + centerness_topk=28), + n_voxels=(80, 80, 32), + voxel_size=(.08, .08, .08)) +train_cfg = dict() +test_cfg = dict( + nms_pre=1000, + nms_thr=.15, + use_rotate_nms=True, + score_thr=.0) +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +dataset_type = 'SunRgbdMultiViewDataset' +data_root = 'data/sunrgbd/' +class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', + 'night_stand', 'bookshelf', 'bathtub') + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=1, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Resize', img_scale=[(512, 384), (768, 576)], multiscale_mode='range', keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32)]), + dict(type='SunRgbdRandomFlip'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])] +test_pipeline = [ + dict( + type='MultiViewPipeline', + n_images=1, + transforms=[ + 
dict(type='LoadImageFromFile'), + dict(type='Resize', img_scale=(640, 480), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32)]), + dict(type='DefaultFormatBundle3D', class_names=class_names, with_label=False), + dict(type='Collect3D', keys=['img'])] +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_imvoxelnet_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + filter_empty_gt=True, + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_imvoxelnet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_imvoxelnet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) +optimizer_config = dict(grad_clip=dict(max_norm=35., norm_type=2)) +lr_config = dict(policy='step', step=[8, 11]) +total_epochs = 12 + +checkpoint_config = dict(interval=1, max_keep_ckpts=1) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=1) +dist_params = dict(backend='nccl') +find_unused_parameters = True # todo: fix number of FPN outputs +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/configs/imvoxelnet/imvoxelnet_total_sunrgbd_top27.py b/configs/imvoxelnet/imvoxelnet_total_sunrgbd_top27.py new file mode 100644 index 0000000..1cb87a4 --- /dev/null +++ b/configs/imvoxelnet/imvoxelnet_total_sunrgbd_top27.py @@ -0,0 +1,136 @@ +model = dict( + 
type='ImVoxelNet', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + head_2d=dict( + type='LayoutHead', + n_channels=2048, + linear_size=256, + dropout=.0, + loss_angle=dict(type='SmoothL1Loss', loss_weight=100.), + loss_layout=dict(type='IoU3DLoss', loss_weight=1.)), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=64, + num_outs=4), + neck_3d=dict( + type='ImVoxelNeck', + channels=[64, 128, 256, 512], + out_channels=64, + down_layers=[1, 2, 3, 4], + up_layers=[3, 2, 1], + conditional=False), + bbox_head=dict( + type='SunRgbdImVoxelHead', + n_classes=33, + n_channels=64, + n_convs=0, + n_reg_outs=7, + centerness_topk=28, + regress_ranges=((-1e8, .6), (.4, 1.1), (0.9, 1e8))), + n_voxels=(80, 80, 32), + voxel_size=(.08, .08, .08)) +train_cfg = dict() +test_cfg = dict( + nms_pre=1000, + nms_thr=.15, + use_rotate_nms=True, + score_thr=.0) +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +dataset_type = 'SunRgbdTotalMultiViewDataset' +data_root = 'data/sunrgbd/' +class_names = [ + 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', + 'blinds', 'desk', 'shelves', 'curtain', 'dresser', 'pillow', 'mirror', 'clothes', 'books', + 'fridge', 'tv', 'paper', 'towel', 'shower_curtain', 'box', 'whiteboard', 'person', 'night_stand', 'toilet', + 'sink', 'lamp', 'bathtub', 'bag' +] + +train_pipeline = [ + dict(type='LoadAnnotations3D'), + dict( + type='MultiViewPipeline', + n_images=1, + transforms=[ + dict(type='SunRgbdTotalLoadImageFromFile'), + dict(type='Resize', img_scale=[(512, 384), (768, 576)], multiscale_mode='range', keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32)]), + dict(type='DefaultFormatBundle3D', 
class_names=class_names), + dict(type='Collect3D', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])] +test_pipeline = [ + dict( + type='MultiViewPipeline', + n_images=1, + transforms=[ + dict(type='LoadImageFromFile'), + dict(type='Resize', img_scale=(640, 480), keep_ratio=True), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32)]), + dict(type='DefaultFormatBundle3D', class_names=class_names, with_label=False), + dict(type='Collect3D', keys=['img'])] +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=1, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_total_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + filter_empty_gt=True, + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_total_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_total_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) +optimizer_config = dict(grad_clip=dict(max_norm=35., norm_type=2)) +lr_config = dict(policy='step', step=[8, 11]) +total_epochs = 12 + +checkpoint_config = dict(interval=1, max_keep_ckpts=1) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +evaluation = dict(interval=1) +dist_params = dict(backend='nccl') +find_unused_parameters = True # todo: fix number of FPN outputs +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/mmdet3d/models/dense_heads/imvoxel_head.py b/mmdet3d/models/dense_heads/imvoxel_head.py index 
763844b..2a15137 100644 --- a/mmdet3d/models/dense_heads/imvoxel_head.py +++ b/mmdet3d/models/dense_heads/imvoxel_head.py @@ -17,6 +17,7 @@ def __init__(self, n_channels, n_convs, n_reg_outs, + centerness_topk=-1, regress_ranges=((-1., .75), (.75, 1.5), (1.5, INF)), loss_centerness=dict( type='CrossEntropyLoss', @@ -33,6 +34,7 @@ def __init__(self, test_cfg=None): super().__init__() self.n_classes = n_classes + self.centerness_topk = centerness_topk self.regress_ranges = regress_ranges self.loss_centerness = build_loss(loss_centerness) self.loss_bbox = build_loss(loss_bbox) @@ -385,6 +387,15 @@ def get_targets(self, points, gt_bboxes, gt_labels): (max_regress_distance >= regress_ranges[..., 0]) & (max_regress_distance <= regress_ranges[..., 1])) + # condition3: limit topk locations per box by centerness (k is clamped to the size of dim 0 because torch.topk raises when k exceeds it; the strict '>' below keeps at most k - 1 locations per box) + if self.centerness_topk > 0: + centerness = compute_centerness(bbox_targets) + centerness = torch.where(inside_gt_bbox_mask, centerness, torch.ones_like(centerness) * -1) + centerness = torch.where(inside_regress_range, centerness, torch.ones_like(centerness) * -1) + top_centerness = torch.topk(centerness, min(self.centerness_topk, centerness.shape[0]), dim=0).values[-1] + inside_top_centerness = centerness > top_centerness.unsqueeze(0) + volumes[inside_top_centerness == 0] = INF + # if there are still more than one objects for a location, # we choose the one with minimal area volumes[inside_gt_bbox_mask == 0] = INF @@ -497,6 +508,15 @@ def get_targets(self, points, gt_bboxes, gt_labels): (max_regress_distance >= regress_ranges[..., 0]) & (max_regress_distance <= regress_ranges[..., 1])) + # condition3: limit topk locations per box by centerness (k is clamped to the size of dim 0 because torch.topk raises when k exceeds it; the strict '>' below keeps at most k - 1 locations per box) + if self.centerness_topk > 0: + centerness = compute_centerness(bbox_targets) + centerness = torch.where(inside_gt_bbox_mask, centerness, torch.ones_like(centerness) * -1) + centerness = torch.where(inside_regress_range, centerness, torch.ones_like(centerness) * -1) + top_centerness = torch.topk(centerness, min(self.centerness_topk, centerness.shape[0]), dim=0).values[-1]
+ inside_top_centerness = centerness > top_centerness.unsqueeze(0) + volumes[inside_top_centerness == 0] = INF + # if there are still more than one objects for a location, # we choose the one with minimal area volumes[inside_gt_bbox_mask == 0] = INF