diff --git a/configs/classification/cifar100/README.md b/configs/classification/cifar100/README.md index 389c09c3..8d53a902 100644 --- a/configs/classification/cifar100/README.md +++ b/configs/classification/cifar100/README.md @@ -80,7 +80,7 @@ We summarize mixup benchmarks in [Model Zoo](https://github.com/Westlake-AI/open **Setup** * Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). -* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. +* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. We released the trained models and logs in [vits-mix-cifar100-weights](https://github.com/Westlake-AI/openmixup/releases/tag/vits-mix-cifar100-weights). | Backbones | $Beta$ | DEiT-S(/16) | Swin-T | |---------------|:--------:|:-----------:|:----------:| @@ -98,7 +98,7 @@ We summarize mixup benchmarks in [Model Zoo](https://github.com/Westlake-AI/open | ResizeMix* | 1 | 68.45 | 80.16 | | TransMix | 0.8,1 | 76.17 | - | | AutoMix | 2 | 76.24 | 82.67 | -| SAMix* | 2 | 77.94 | | +| SAMix* | 2 | 77.94 | 82.62 | ## Citation diff --git a/configs/classification/cifar100/automix/vits/deit_s_l6_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py b/configs/classification/cifar100/automix/vits/deit_s_l6_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py index a6771dfc..70de8a86 100644 --- a/configs/classification/cifar100/automix/vits/deit_s_l6_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py +++ b/configs/classification/cifar100/automix/vits/deit_s_l6_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py @@ -36,13 +36,13 @@ head_one=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_mix=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_weights=dict( decent_weight=[], accent_weight=[], head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), diff --git a/configs/classification/cifar100/automix/vits/swin_t_l3_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py b/configs/classification/cifar100/automix/vits/swin_t_l3_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py index 658319db..c8b2819e 100644 --- a/configs/classification/cifar100/automix/vits/swin_t_l3_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py +++ b/configs/classification/cifar100/automix/vits/swin_t_l3_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py @@ -1,5 +1,5 @@ 
_base_ = [ - '../../../_base_/datasets/cifar100/sz224_swin_bs100.py', + '../../../_base_/datasets/cifar100/sz224_randaug_bs100.py', '../../../_base_/default_runtime.py', ] diff --git a/configs/classification/cifar100/mixups/vits/convnext_t_mixups_bs100.py b/configs/classification/cifar100/mixups/vits/convnext_t_mixups_bs100.py new file mode 100644 index 00000000..7a231c74 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_t_mixups_bs100.py @@ -0,0 +1,76 @@ +_base_ = [ + '../../../_base_/datasets/cifar100/sz32_randaug_bs100.py', + '../../../_base_/default_runtime.py', +] + +# model settings +model = dict( + type='MixUpClassification', + pretrained=None, + alpha=[1, 0.8], + mix_mode=['cutmix', 'mixup'], + mix_args=dict( + alignmix=dict(eps=0.1, max_iter=100), + attentivemix=dict(grid_size=32, top_k=None, beta=8), # AttentiveMix+ in this repo (use pre-trained) + automix=dict(mask_adjust=0, lam_margin=0), # require pre-trained mixblock + fmix=dict(decay_power=3, size=(32,32), max_soft=0., reformulate=False), + gridmix=dict(n_holes=(2, 6), hole_aspect_ratio=1., + cut_area_ratio=(0.5, 1), cut_aspect_ratio=(0.5, 2)), + manifoldmix=dict(layer=(0, 3)), + puzzlemix=dict(transport=True, t_batch_size=None, t_size=4, # t_size for small-scale datasets + block_num=5, beta=1.2, gamma=0.5, eta=0.2, neigh_size=4, n_labels=3, t_eps=0.8), + resizemix=dict(scope=(0.1, 0.8), use_alpha=True), + samix=dict(mask_adjust=0, lam_margin=0.08), # require pre-trained mixblock + transmix=dict(mix_mode="cutmix"), + ), + backbone=dict( + type='ConvNeXt_CIFAR', + arch='tiny', + out_indices=(3,), # x-1: stage-x + act_cfg=dict(type='GELU'), + drop_path_rate=0.3, + gap_before_final_norm=True, + ), + head=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer=['LayerNorm', 'BatchNorm'], val=1., bias=0.) 
+ ], +) + +# optimizer +optimizer = dict( + type='AdamW', + lr=1e-3, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'gamma': dict(weight_decay=0.), + }) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +# fp16 +use_fp16 = True +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict(grad_clip=None, update_interval=update_interval) + +# learning policy +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-6, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_attentivemix_a2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_attentivemix_a2_bs100_ep200.py new file mode 100644 index 00000000..bafabf82 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_attentivemix_a2_bs100_ep200.py @@ -0,0 +1,18 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + pretrained=None, + pretrained_k="torchvision://resnet50", + alpha=2, # float or list + mix_mode="attentivemix", + backbone_k=dict( # PyTorch pre-trained R-50 is required for attentivemix+ + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + style='pytorch'), +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_automix_l2_a2_near_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_automix_l2_a2_near_bs100_ep200.py new file mode 100644 index 00000000..c0228387 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_automix_l2_a2_near_bs100_ep200.py @@ -0,0 +1,120 @@ +_base_ = [ + '../../../../_base_/datasets/cifar100/sz32_randaug_bs100.py', + '../../../../_base_/default_runtime.py', +] + +# model settings +model = dict( + type='AutoMixup', + pretrained=None, + alpha=2.0, + momentum=0.999, + mask_layer=2, # downsampling to 1/16 + mask_loss=0.1, # using loss + mask_adjust=0, # none for large datasets + lam_margin=0.08, + switch_off=1.0, # switch off mixblock (fixed) + mask_up_override=None, + debug=True, + backbone=dict( + type='ConvNeXt_CIFAR', + arch='tiny', + out_indices=(2, 3), # x-1: stage-x + act_cfg=dict(type='GELU'), + drop_path_rate=0.3, + gap_before_final_norm=True, + ), + mix_block = dict( # AutoMix + type='PixelMixBlock', + in_channels=384, reduction=2, use_scale=True, + unsampling_mode=['nearest',], # str or list, train & test MixBlock, 'nearest' for AutoMix + lam_concat=True, lam_concat_v=False, # AutoMix.V1: lam cat q,k,v + lam_mul=False, lam_residual=False, lam_mul_k=-1, # SAMix lam: none + x_qk_concat=False, x_v_concat=False, # SAMix x concat: none + att_norm_cfg=None, # AutoMix: attention norm for fp16 + mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 + frozen=False), + head_one=dict( + type='VisionTransformerClsHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + head_mix=dict( + type='VisionTransformerClsHead', # mixup CE + label smooth +
loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + head_weights=dict( + decent_weight=[], accent_weight=[], + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], +) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +custom_hooks = [ + dict(type='SAVEHook', + save_interval=500 * 20, # 20 ep + iter_per_epoch=500, + ), + dict(type='CustomCosineAnnealingHook', # 0.1 to 0 + attr_name="mask_loss", attr_base=0.1, min_attr=0., by_epoch=False, # by iter + update_interval=update_interval, + ), + dict(type='CosineScheduleHook', + end_momentum=0.99996, # 0.999 to 0.99996 + adjust_scope=[0.25, 1.0], + warming_up="constant", + update_interval=update_interval, + interval=1) +] + +# optimizer +optimizer = dict( + type='AdamW', + lr=1e-3, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'gamma': dict(weight_decay=0.), + 'mix_block': dict(lr=1e-3), + }) +# # Sets `find_unused_parameters`: randomly switch off mixblock +# find_unused_parameters = True + +# fp16 +use_fp16 = False +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict(grad_clip=None, update_interval=update_interval) + +# lr scheduler +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, # 0.1 x lr + paramwise_options=['mix_block'], + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# validation hook +evaluation = dict(initial=False, save_best=None) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_cutmix_a2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_cutmix_a2_bs100_ep200.py new file mode 100644 index 00000000..cdd70245 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_cutmix_a2_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=2.0, + mix_mode="cutmix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_deit_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_deit_bs100_ep200.py new file mode 100644 index 00000000..0af832c7 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_deit_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=[1, 0.8], + mix_mode=['cutmix', 'mixup'], +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_fmix_a1_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_fmix_a1_bs100_ep200.py new file mode 100644 index 00000000..a530ac14 --- /dev/null +++
b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_fmix_a1_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=1.0, + mix_mode="fmix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_gridmix_a1_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_gridmix_a1_bs100_ep200.py new file mode 100644 index 00000000..d91847ed --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_gridmix_a1_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=1.0, + mix_mode="gridmix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_mixup_a0_8_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_mixup_a0_8_bs100_ep200.py new file mode 100644 index 00000000..354ef35f --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_mixup_a0_8_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=0.8, + mix_mode="mixup", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_puzzlemix_a2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_puzzlemix_a2_bs100_ep200.py new file mode 100644 index 00000000..3f5b243f --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_puzzlemix_a2_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=2.0, + mix_mode="puzzlemix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_resizemix_lam01_08_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_resizemix_lam01_08_bs100_ep200.py new file mode 100644 index 00000000..33c3b143 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_resizemix_lam01_08_bs100_ep200.py @@ -0,0 +1,13 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=1.0, + mix_mode="resizemix", + mix_args=dict( + resizemix=dict(scope=(0.1, 0.8), use_alpha=True), + ), +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_saliencymix_a0_2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_saliencymix_a0_2_bs100_ep200.py new file mode 100644 index 00000000..99f42e3c --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_saliencymix_a0_2_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=0.2, + mix_mode="saliencymix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_samix_l2_a2_near_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_samix_l2_a2_near_bs100_ep200.py new file mode 100644 index 
00000000..e8c6126c --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_samix_l2_a2_near_bs100_ep200.py @@ -0,0 +1,131 @@ +_base_ = [ + '../../../../_base_/datasets/cifar100/sz32_randaug_bs100.py', + '../../../../_base_/default_runtime.py', +] + +# value_neck_cfg +conv1x1=dict( + type="ConvNeck", + in_channels=384, hid_channels=192, out_channels=1, # MixBlock v + num_layers=2, kernel_size=1, + with_last_norm=False, norm_cfg=dict(type='BN'), # default + with_last_dropout=0.1, with_avg_pool=False, with_residual=False) # no res + dropout + +# model settings +model = dict( + type='AutoMixup', + pretrained=None, + alpha=2.0, + momentum=0.999, + mask_layer=2, # downsampling to 1/16 + mask_loss=0.1, # using loss + mask_adjust=0, # none for large datasets + lam_margin=0.08, + switch_off=1.0, # switch off mixblock (fixed) + mask_up_override=None, + debug=True, + backbone=dict( + type='ConvNeXt_CIFAR', + arch='tiny', + out_indices=(2, 3), # x-1: stage-x + act_cfg=dict(type='GELU'), + drop_path_rate=0.3, + gap_before_final_norm=True, + ), + mix_block = dict( # AutoMix + type='PixelMixBlock', + in_channels=384, reduction=2, use_scale=True, + unsampling_mode=['nearest',], # str or list, train & test MixBlock, 'nearest' for AutoMix + # unsampling_mode=['bilinear',], # str or list, tricks in SAMix + lam_concat=False, lam_concat_v=False, # AutoMix.V1: none + lam_mul=True, lam_residual=True, lam_mul_k=-1, # SAMix lam: mult + k=-1 (-1 for large datasets) + value_neck_cfg=conv1x1, # SAMix: non-linear value + x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k + # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also yields better performance in fp32) + mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix + frozen=False), + head_one=dict( + type='VisionTransformerClsHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + head_mix=dict( + type='VisionTransformerClsHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + head_weights=dict( + decent_weight=[], accent_weight=[], + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
+ ], +) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +custom_hooks = [ + dict(type='SAVEHook', + save_interval=500 * 20, # 20 ep + iter_per_epoch=500, + ), + dict(type='CustomCosineAnnealingHook', # 0.1 to 0 + attr_name="mask_loss", attr_base=0.1, min_attr=0., by_epoch=False, # by iter + update_interval=update_interval, + ), + dict(type='CosineScheduleHook', + end_momentum=0.99996, # 0.999 to 0.99996 + adjust_scope=[0.25, 1.0], + warming_up="constant", + update_interval=update_interval, + interval=1) +] + +# optimizer +optimizer = dict( + type='AdamW', + lr=1e-3, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'gamma': dict(weight_decay=0.), + 'mix_block': dict(lr=7.5e-4), + }) +# # Sets `find_unused_parameters`: randomly switch off mixblock +# find_unused_parameters = True + +# fp16 +use_fp16 = False +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict( + grad_clip=dict(max_norm=20.0), update_interval=update_interval) + +# lr scheduler +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, # 0.1 x lr + paramwise_options=['mix_block'], + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# validation hook +evaluation = dict(initial=False, save_best=None) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_smoothmix_a0_2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_smoothmix_a0_2_bs100_ep200.py new file mode 100644 index 00000000..21e6e8c9 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_smoothmix_a0_2_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=0.2, + mix_mode="smoothmix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_vanilla_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_vanilla_bs100_ep200.py new file mode 100644 index 00000000..f486c5d5 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_vanilla_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=1.0, + mix_mode="vanilla", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/deit_small/deit_s_automix_l6_a2_near_sz224_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/deit_small/deit_s_automix_l6_a2_near_sz224_bs100_ep200.py index d309abec..7e3f3535 100644 --- a/configs/classification/cifar100/mixups/vits/deit_small/deit_s_automix_l6_a2_near_sz224_bs100_ep200.py +++ b/configs/classification/cifar100/mixups/vits/deit_small/deit_s_automix_l6_a2_near_sz224_bs100_ep200.py @@ -36,13 +36,13 @@ head_one=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original',
loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_mix=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_weights=dict( decent_weight=[], accent_weight=[], head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), diff --git a/configs/classification/cifar100/mixups/vits/deit_small/deit_s_samix_l6_a2_near_sz224_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/deit_small/deit_s_samix_l6_a2_near_sz224_bs100_ep200.py index c05e7027..e05e28d1 100644 --- a/configs/classification/cifar100/mixups/vits/deit_small/deit_s_samix_l6_a2_near_sz224_bs100_ep200.py +++ b/configs/classification/cifar100/mixups/vits/deit_small/deit_s_samix_l6_a2_near_sz224_bs100_ep200.py @@ -46,13 +46,13 @@ head_one=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_mix=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_weights=dict( decent_weight=[], accent_weight=[], head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), diff --git a/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_automix_l3_a2_near_sz224_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_automix_l3_a2_near_sz224_bs100_ep200.py new file mode 100644 index 00000000..21247159 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_automix_l3_a2_near_sz224_bs100_ep200.py @@ -0,0 +1,121 @@ +_base_ = [ + '../../../../_base_/datasets/cifar100/sz224_randaug_bs100.py', + '../../../../_base_/default_runtime.py', +] + +# model settings +model = dict( + type='AutoMixup', + pretrained=None, + alpha=2.0, + momentum=0.999, + mask_layer=3, # downsampling to 1/16 + mask_loss=0.1, # using loss + mask_adjust=0, # none for large datasets + lam_margin=0.08, + switch_off=1.0, # switch off mixblock (fixed) + mask_up_override=None, + debug=True, + backbone=dict( + type='SwinTransformer', + arch='tiny', + img_size=224, + drop_path_rate=0.2, + out_indices=(2,3,), # x-1: stage-x + ), + mix_block = dict( # AutoMix + type='PixelMixBlock', + in_channels=768, reduction=2, use_scale=True, + unsampling_mode=['nearest',], # str or list, train & test MixBlock, 'nearest' for AutoMix + lam_concat=True, lam_concat_v=False, # AutoMix.V1: lam cat q,k,v + lam_mul=False, lam_residual=False, lam_mul_k=-1, # SAMix lam: none + x_qk_concat=False, x_v_concat=False, # SAMix x concat: none + att_norm_cfg=None, # AutoMix: attention norm for fp16 + mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 + frozen=False), + head_one=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original',
loss_weight=1.0), + with_avg_pool=True, + in_channels=768, num_classes=100), + head_mix=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=True, + in_channels=768, num_classes=100), + head_weights=dict( + decent_weight=[], accent_weight=[], + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], +) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +custom_hooks = [ + dict(type='SAVEHook', + save_interval=500 * 20, # 20 ep + iter_per_epoch=500, + ), + dict(type='CustomCosineAnnealingHook', # 0.1 to 0 + attr_name="mask_loss", attr_base=0.1, min_attr=0., by_epoch=False, # by iter + update_interval=update_interval, + ), + dict(type='CosineScheduleHook', + end_momentum=0.99996, # 0.999 to 0.99996 + adjust_scope=[0.25, 1.0], + warming_up="constant", + update_interval=update_interval, + interval=1) +] + +# optimizer +optimizer = dict( + type='AdamW', + lr=5e-4, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'absolute_pos_embed': dict(weight_decay=0.), + 'relative_position_bias_table': dict(weight_decay=0.), + 'mix_block': dict(lr=1e-3), + }) +# # Sets `find_unused_parameters`: randomly switch off mixblock +# find_unused_parameters = True + +# fp16 +use_fp16 = False +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict( + grad_clip=dict(max_norm=10.0), update_interval=update_interval) + +# lr scheduler +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, # 0.1 x lr + paramwise_options=['mix_block'], + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# validation hook +evaluation = dict(initial=False, save_best=None) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_samix_l3_a2_near_sz224_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_samix_l3_a2_near_sz224_bs100_ep200.py new file mode 100644 index 00000000..3b753af0 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_samix_l3_a2_near_sz224_bs100_ep200.py @@ -0,0 +1,131 @@ +_base_ = [ + '../../../../_base_/datasets/cifar100/sz224_randaug_bs100.py', + '../../../../_base_/default_runtime.py', +] + +# value_neck_cfg +conv1x1=dict( + type="ConvNeck", + in_channels=768, hid_channels=384, out_channels=1, # MixBlock v + num_layers=2, kernel_size=1, + with_last_norm=False, norm_cfg=dict(type='BN'), # default + with_last_dropout=0.1, with_avg_pool=False, with_residual=False) # no res + dropout + +# model settings +model = dict( + type='AutoMixup', + pretrained=None, + alpha=2.0, + momentum=0.999, + mask_layer=3, # stage-3 features, matching in_channels=768 and the l3 file name + mask_loss=0.1, # using loss + mask_adjust=0, # none for large datasets + lam_margin=0.08, + switch_off=1.0, # switch off mixblock (fixed) + mask_up_override=None, + debug=True, + backbone=dict( + type='SwinTransformer', +
arch='tiny', + img_size=224, + drop_path_rate=0.2, + out_indices=(2,3,), # x-1: stage-x + ), + mix_block = dict( # AutoMix + type='PixelMixBlock', + in_channels=768, reduction=2, use_scale=True, + unsampling_mode=['nearest',], # str or list, train & test MixBlock, 'nearest' for AutoMix + # unsampling_mode=['bilinear',], # str or list, tricks in SAMix + lam_concat=False, lam_concat_v=False, # AutoMix.V1: none + lam_mul=True, lam_residual=True, lam_mul_k=-1, # SAMix lam: mult + k=-1 (-1 for large datasets) + value_neck_cfg=conv1x1, # SAMix: non-linear value + x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k + # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also yields better performance in fp32) + mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix + frozen=False), + head_one=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=True, + in_channels=768, num_classes=100), + head_mix=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=True, + in_channels=768, num_classes=100), + head_weights=dict( + decent_weight=[], accent_weight=[], + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], +) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +custom_hooks = [ + dict(type='SAVEHook', + save_interval=500 * 20, # 20 ep + iter_per_epoch=500, + ), + dict(type='CustomCosineAnnealingHook', # 0.1 to 0 + attr_name="mask_loss", attr_base=0.1, min_attr=0., by_epoch=False, # by iter + update_interval=update_interval, + ), + dict(type='CosineScheduleHook', + end_momentum=0.99996, # 0.999 to 0.99996 + adjust_scope=[0.25, 1.0], + warming_up="constant", + update_interval=update_interval, + interval=1) +] + +# optimizer +optimizer = dict( + type='AdamW', + lr=5e-4, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'absolute_pos_embed': dict(weight_decay=0.), + 'relative_position_bias_table': dict(weight_decay=0.), + 'mix_block': dict(lr=5e-4), + }) +# Sets `find_unused_parameters`: randomly switch off mixblock +find_unused_parameters = True + +# fp16 +use_fp16 = False +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict( + grad_clip=dict(max_norm=20.0), update_interval=update_interval) + +# lr scheduler +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, # 0.1 x lr + paramwise_options=['mix_block'], + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# validation hook +evaluation = dict(initial=False, save_best=None) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/samix/vits/deit_s_l6_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py
b/configs/classification/cifar100/samix/vits/deit_s_l6_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py index ebce03e0..f7cb3e77 100644 --- a/configs/classification/cifar100/samix/vits/deit_s_l6_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py +++ b/configs/classification/cifar100/samix/vits/deit_s_l6_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py @@ -46,13 +46,13 @@ head_one=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_mix=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_weights=dict( decent_weight=[], accent_weight=[], head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), diff --git a/configs/classification/cifar100/samix/vits/swin_t_l3_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py b/configs/classification/cifar100/samix/vits/swin_t_l3_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py index 8e310681..347836bb 100644 --- a/configs/classification/cifar100/samix/vits/swin_t_l3_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py +++ b/configs/classification/cifar100/samix/vits/swin_t_l3_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py @@ -1,5 +1,5 @@ _base_ = [ - '../../../_base_/datasets/cifar100/sz224_swin_bs100.py', + '../../../_base_/datasets/cifar100/sz224_randaug_bs100.py', '../../../_base_/default_runtime.py', ] diff --git a/docs/en/mixup_benchmarks/Mixup_cifar.md b/docs/en/mixup_benchmarks/Mixup_cifar.md index 2c7a0eae..5e82226e 100644 --- a/docs/en/mixup_benchmarks/Mixup_cifar.md +++ b/docs/en/mixup_benchmarks/Mixup_cifar.md @@ -147,25 +147,25 @@ These benchmarks follow CutMix settings, training 200/400/800/1200 epochs from s **Setup** -* Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). -* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. 
- -| Backbones | $Beta$ | DEiT-S(/16) | Swin-T | -|---------------|:--------:|:-----------:|:----------:| -| Epoch | $\alpha$ | 200 epochs | 200 epochs | -| Vanilla | - | 65.81 | 78.41 | -| MixUp | 0.8 | 69.98 | 76.78 | -| CutMix | 2 | 74.12 | 80.64 | -| DeiT | 0.8,1 | 75.92 | 81.25 | -| SmoothMix | 0.2 | 67.54 | 66.69 | -| SaliencyMix | 0.2 | 69.78 | 80.40 | -| AttentiveMix+ | 2 | 75.98 | 81.13 | -| FMix* | 1 | 70.41 | 80.72 | -| GridMix | 1 | 68.86 | 78.54 | -| PuzzleMix | 2 | 73.60 | 80.33 | -| ResizeMix* | 1 | 68.45 | 80.16 | -| TransMix | 0.8,1 | 76.17 | - | -| AutoMix | 2 | 76.24 | 82.67 | -| SAMix* | 2 | 77.94 | | +* Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures, but use $32\times 32$ for ConvNeXt. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT/ConvNeXt and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). +* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. We released the trained models and logs in [vits-mix-cifar100-weights](https://github.com/Westlake-AI/openmixup/releases/tag/vits-mix-cifar100-weights). + +| Backbones | $Beta$ | DEiT-S(/16) | Swin-T | ConvNeXt-T | +|---------------|:--------:|:-----------:|:----------:|:----------:| +| Epoch | $\alpha$ | 200 epochs | 200 epochs | 200 epochs | +| Vanilla | - | 65.81 | 78.41 | | +| MixUp | 0.8 | 69.98 | 76.78 | | +| CutMix | 2 | 74.12 | 80.64 | | +| DeiT | 0.8,1 | 75.92 | 81.25 | | +| SmoothMix | 0.2 | 67.54 | 66.69 | | +| SaliencyMix | 0.2 | 69.78 | 80.40 | | +| AttentiveMix+ | 2 | 75.98 | 81.13 | | +| FMix* | 1 | 70.41 | 80.72 | | +| GridMix | 1 | 68.86 | 78.54 | | +| PuzzleMix | 2 | 73.60 | 80.33 | | +| ResizeMix* | 1 | 68.45 | 80.16 | | +| TransMix | 0.8,1 | 76.17 | - | | +| AutoMix | 2 | 76.24 | 82.67 | | +| SAMix* | 2 | 77.94 | 82.62 | |

(back to top)

\ No newline at end of file diff --git a/docs/en/model_zoos/Mixup_sup.md b/docs/en/model_zoos/Mixup_sup.md index 197c4fb3..102bfb38 100644 --- a/docs/en/model_zoos/Mixup_sup.md +++ b/docs/en/model_zoos/Mixup_sup.md @@ -277,26 +277,26 @@ These benchmarks follow CutMix settings, training 200/400/800/1200 epochs from s **Setup** -* Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). -* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. - -| Backbones | $Beta$ | DEiT-S(/16) | Swin-T | -|---------------|:--------:|:-----------:|:----------:| -| Epoch | $\alpha$ | 200 epochs | 200 epochs | -| Vanilla | - | 65.81 | 78.41 | -| MixUp | 0.8 | 69.98 | 76.78 | -| CutMix | 2 | 74.12 | 80.64 | -| DeiT | 0.8,1 | 75.92 | 81.25 | -| SmoothMix | 0.2 | 67.54 | 66.69 | -| SaliencyMix | 0.2 | 69.78 | 80.40 | -| AttentiveMix+ | 2 | 75.98 | 81.13 | -| FMix* | 1 | 70.41 | 80.72 | -| GridMix | 1 | 68.86 | 78.54 | -| PuzzleMix | 2 | 73.60 | 80.33 | -| ResizeMix* | 1 | 68.45 | 80.16 | -| TransMix | 0.8,1 | 76.17 | - | -| AutoMix | 2 | 76.24 | 82.67 | -| SAMix* | 2 | 77.94 | | +* Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures, but use $32\times 32$ for ConvNeXt. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT/ConvNeXt and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). +* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. We released the trained models and logs in [vits-mix-cifar100-weights](https://github.com/Westlake-AI/openmixup/releases/tag/vits-mix-cifar100-weights). + +| Backbones | $Beta$ | DEiT-S(/16) | Swin-T | ConvNeXt-T | +|---------------|:--------:|:-----------:|:----------:|:----------:| +| Epoch | $\alpha$ | 200 epochs | 200 epochs | 200 epochs | +| Vanilla | - | 65.81 | 78.41 | | +| MixUp | 0.8 | 69.98 | 76.78 | | +| CutMix | 2 | 74.12 | 80.64 | | +| DeiT | 0.8,1 | 75.92 | 81.25 | | +| SmoothMix | 0.2 | 67.54 | 66.69 | | +| SaliencyMix | 0.2 | 69.78 | 80.40 | | +| AttentiveMix+ | 2 | 75.98 | 81.13 | | +| FMix* | 1 | 70.41 | 80.72 | | +| GridMix | 1 | 68.86 | 78.54 | | +| PuzzleMix | 2 | 73.60 | 80.33 | | +| ResizeMix* | 1 | 68.45 | 80.16 | | +| TransMix | 0.8,1 | 76.17 | - | | +| AutoMix | 2 | 76.24 | 82.67 | | +| SAMix* | 2 | 77.94 | 82.62 | |

(back to top)
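Every entry in the tables above draws its mixing ratio $\lambda$ from $Beta(\alpha, \alpha)$ with the listed $\alpha$, and the DeiT row simply alternates between CutMix and MixUp per batch (`alpha=[1, 0.8]`, `mix_mode=['cutmix', 'mixup']` in the configs). For reference, here is a minimal stand-alone sketch of these two basic policies; the function names and the batch-level formulation are illustrative assumptions, not the OpenMixup API.

```python
# Minimal sketch of MixUp and CutMix on a batch `x` of shape [N, C, H, W]
# with integer labels `y`; all names here are illustrative only.
import numpy as np
import torch

def mixup_batch(x, y, alpha=0.8):
    """MixUp: interpolate whole images with lam ~ Beta(alpha, alpha)."""
    lam = float(np.random.beta(alpha, alpha))
    perm = torch.randperm(x.size(0), device=x.device)
    x_mix = lam * x + (1 - lam) * x[perm]
    # train with: lam * CE(pred, y) + (1 - lam) * CE(pred, y[perm])
    return x_mix, y, y[perm], lam

def cutmix_batch(x, y, alpha=2.0):
    """CutMix: paste a random box of area ratio (1 - lam), then correct lam."""
    lam = float(np.random.beta(alpha, alpha))
    perm = torch.randperm(x.size(0), device=x.device)
    H, W = x.shape[2], x.shape[3]
    cut_h, cut_w = int(H * np.sqrt(1 - lam)), int(W * np.sqrt(1 - lam))
    cy, cx = np.random.randint(H), np.random.randint(W)
    y1, y2 = np.clip(cy - cut_h // 2, 0, H), np.clip(cy + cut_h // 2, 0, H)
    x1, x2 = np.clip(cx - cut_w // 2, 0, W), np.clip(cx + cut_w // 2, 0, W)
    x_mix = x.clone()
    x_mix[:, :, y1:y2, x1:x2] = x[perm, :, y1:y2, x1:x2]
    lam = 1.0 - (y2 - y1) * (x2 - x1) / (H * W)  # area actually kept from the first image
    return x_mix, y, y[perm], lam
```

The saliency-, attention- and learning-based variants in the tables (SaliencyMix, AttentiveMix+, PuzzleMix, AutoMix, SAMix, TransMix) replace the random box or plain interpolation with content-aware mixing, which is what the configs in this diff select via `mix_mode`.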

diff --git a/openmixup/models/backbones/__init__.py b/openmixup/models/backbones/__init__.py index 95f440b2..d05cc744 100644 --- a/openmixup/models/backbones/__init__.py +++ b/openmixup/models/backbones/__init__.py @@ -2,7 +2,7 @@ from .beit import BEiTVisionTransformer from .context_cluster import ContextCluster from .convmixer import ConvMixer -from .convnext import ConvNeXt, ConvNeXt_Mix, MIMConvNeXt, ConvNeXt_CIFAR +from .convnext import ConvNeXt, ConvNeXt_Mix, MIMConvNeXt, ConvNeXt_CIFAR, ConvNeXt_Mix_CIFAR from .cspnet import CSPDarkNet, CSPNet, CSPResNet, CSPResNeXt from .davit import DaViT from .deit import DistilledVisionTransformer @@ -61,7 +61,7 @@ __all__ = [ 'AlexNet', 'BEiTViT', 'BEiTVisionTransformer', 'ContextCluster', - 'ConvNeXt', 'ConvNeXt_Mix', 'MIMConvNeXt', 'ConvNeXt_CIFAR', 'ConvMixer', + 'ConvNeXt', 'ConvNeXt_Mix', 'MIMConvNeXt', 'ConvNeXt_CIFAR', 'ConvNeXt_Mix_CIFAR', 'ConvMixer', 'CSPDarkNet', 'CSPNet', 'CSPResNet', 'CSPResNeXt', 'DaViT', 'DistilledVisionTransformer', 'DeiT3', 'DenseNet', 'DenseNet_CIFAR', 'EdgeNeXt', 'EfficientFormer', 'EfficientNet', 'EfficientNetV2', 'HorNet', 'HorNet_CIFAR', 'HRNet', diff --git a/openmixup/models/backbones/convnext.py b/openmixup/models/backbones/convnext.py index 79eab6cb..a74c8405 100644 --- a/openmixup/models/backbones/convnext.py +++ b/openmixup/models/backbones/convnext.py @@ -612,7 +612,124 @@ def __init__(self, in_channels=3, norm_cfg=dict(type='LN2d', eps=1e-6), **kwargs in_channels, self.channels[0], kernel_size=3, - stride=1), + stride=1, + padding=1), build_norm_layer(norm_cfg, self.channels[0])[1], ) self.downsample_layers[0] = stem + + +@BACKBONES.register_module() +class ConvNeXt_Mix_CIFAR(ConvNeXt): + """ConvNeXt backbone for CIFAR, support ManifoldMix and its variants + + Provide a port to mixup the latent space. + """ + def __init__(self, in_channels=3, norm_cfg=dict(type='LN2d', eps=1e-6), **kwargs): + super(ConvNeXt_Mix_CIFAR, self).__init__( + in_channels=in_channels, norm_cfg=norm_cfg, **kwargs) + + # the first stem layer + stem = nn.Sequential( + nn.Conv2d( + in_channels, + self.channels[0], + kernel_size=3, + stride=1, + padding=1), + build_norm_layer(norm_cfg, self.channels[0])[1], + ) + self.downsample_layers[0] = stem + + def _feature_mixup(self, x, mask, dist_shuffle=False, idx_shuffle_mix=None, + cross_view=False, BN_shuffle=False, idx_shuffle_BN=None, + idx_unshuffle_BN=None, **kwargs): + """ mixup two feature maps with the pixel-wise mask + + Args: + x, mask (tensor): Input x [N,C,H,W] and mixup mask [N, \*, H, W]. + dist_shuffle (bool): Whether to shuffle cross gpus. + idx_shuffle_mix (tensor): Shuffle indice of [N,1] to generate x_. + cross_view (bool): Whether to view the input x as two views [2N, C, H, W], + which is usually adopted in self-supervised and semi-supervised settings. + BN_shuffle (bool): Whether to do shuffle cross gpus for shuffle_BN. + idx_shuffle_BN (tensor): Shuffle indice to utilize shuffle_BN cross gpus. + idx_unshuffle_BN (tensor): Unshuffle indice for the shuffle_BN (in pair). 
+ """ + # adjust mixup mask + assert mask.dim() == 4 and mask.size(1) <= 2 + if mask.size(1) == 1: + mask = [mask, 1 - mask] + else: + mask = [ + mask[:, 0, :, :].unsqueeze(1), mask[:, 1, :, :].unsqueeze(1)] + # undo shuffle_BN for ssl mixup + if BN_shuffle: + assert idx_unshuffle_BN is not None and idx_shuffle_BN is not None + x = grad_batch_unshuffle_ddp(x, idx_unshuffle_BN) # 2N index if cross_view + + # shuffle input + if dist_shuffle==True: # cross gpus shuffle + assert idx_shuffle_mix is not None + if cross_view: + N = x.size(0) // 2 + detach_p = random.random() + x_ = x[N:, ...].clone().detach() if detach_p < 0.5 else x[N:, ...] + x = x[:N, ...] if detach_p < 0.5 else x[:N, ...].detach() + x_, _, _ = grad_batch_shuffle_ddp(x_, idx_shuffle_mix) + else: + x_, _, _ = grad_batch_shuffle_ddp(x, idx_shuffle_mix) + else: # within each gpu + if cross_view: + # default: the input image is shuffled + N = x.size(0) // 2 + detach_p = random.random() + x_ = x[N:, ...].clone().detach() if detach_p < 0.5 else x[N:, ...] + x = x[:N, ...] if detach_p < 0.5 else x[:N, ...].detach() + else: + x_ = x[idx_shuffle_mix, :] + assert x.size(3) == mask[0].size(3), \ + "mismatching mask x={}, mask={}.".format(x.size(), mask[0].size()) + mix = x * mask[0] + x_ * mask[1] + + # redo shuffle_BN for ssl mixup + if BN_shuffle: + mix, _, _ = grad_batch_shuffle_ddp(mix, idx_shuffle_BN) # N index + + return mix + + def forward(self, x, mix_args=None): + """ only support mask-based mixup policy """ + # latent space mixup + if mix_args is not None: + assert isinstance(mix_args, dict) + mix_layer = mix_args["layer"] # {0, 1, 2, 3} + if mix_args["BN_shuffle"]: + x, _, idx_unshuffle = grad_batch_shuffle_ddp(x) # 2N index if cross_view + else: + idx_unshuffle = None + else: + mix_layer = -1 + + # input mixup + if mix_layer == 0: + x = self._feature_mixup(x, idx_unshuffle_BN=idx_unshuffle, **mix_args) + + outs = [] + for i, stage in enumerate(self.stages): + x = self.downsample_layers[i](x) + x = stage(x) + if i in self.out_indices: + if i == 3: + norm_layer = getattr(self, f'norm{i}') + if self.gap_before_final_norm and i == 3: + gap = x.mean([-2, -1], keepdim=True) + x = norm_layer(gap).flatten(1) + else: + x = norm_layer(x) + outs.append(x) + if len(self.out_indices) == 1: + return outs + if i+1 == mix_layer: + x = self._feature_mixup(x, idx_unshuffle_BN=idx_unshuffle, **mix_args) + return outs diff --git a/requirements/optional.txt b/requirements/optional.txt index a429bd12..d6a6cbd5 100644 --- a/requirements/optional.txt +++ b/requirements/optional.txt @@ -1,5 +1,7 @@ albumentations>=0.3.2 # For Albumentations data transform -faiss-gpu>=1.6.1 # For DeepCluster and ODC -grad-cam >= 1.3.7 # For CAM visualization -requests # For torchserve +faiss-gpu>=1.6.1 # For DeepCluster and ODC +grad-cam >= 1.3.7 # For CAM visualization +gco==1.0.1 # For PuzzleMix (please install from source) +opencv-contrib-python # For SaliencyMix +requests # For torchserve scikit-image