diff --git a/configs/classification/cifar100/README.md b/configs/classification/cifar100/README.md index 389c09c3..8d53a902 100644 --- a/configs/classification/cifar100/README.md +++ b/configs/classification/cifar100/README.md @@ -80,7 +80,7 @@ We summarize mixup benchmarks in [Model Zoo](https://github.com/Westlake-AI/open **Setup** * Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). -* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. +* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. We released the trained models and logs in [vits-mix-cifar100-weights](https://github.com/Westlake-AI/openmixup/releases/tag/vits-mix-cifar100-weights). | Backbones | $Beta$ | DEiT-S(/16) | Swin-T | |---------------|:--------:|:-----------:|:----------:| @@ -98,7 +98,7 @@ We summarize mixup benchmarks in [Model Zoo](https://github.com/Westlake-AI/open | ResizeMix* | 1 | 68.45 | 80.16 | | TransMix | 0.8,1 | 76.17 | - | | AutoMix | 2 | 76.24 | 82.67 | -| SAMix* | 2 | 77.94 | | +| SAMix* | 2 | 77.94 | 82.62 | ## Citation diff --git a/configs/classification/cifar100/automix/vits/deit_s_l6_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py b/configs/classification/cifar100/automix/vits/deit_s_l6_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py index a6771dfc..70de8a86 100644 --- a/configs/classification/cifar100/automix/vits/deit_s_l6_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py +++ b/configs/classification/cifar100/automix/vits/deit_s_l6_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py @@ -36,13 +36,13 @@ head_one=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_mix=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_weights=dict( decent_weight=[], accent_weight=[], head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), diff --git a/configs/classification/cifar100/automix/vits/swin_t_l3_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py b/configs/classification/cifar100/automix/vits/swin_t_l3_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py index 658319db..c8b2819e 100644 --- a/configs/classification/cifar100/automix/vits/swin_t_l3_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py +++ b/configs/classification/cifar100/automix/vits/swin_t_l3_a2_near_L1_01_sz224_mlr1e_4_bs100_ep200.py @@ -1,5 +1,5 @@ 
_base_ = [ - '../../../_base_/datasets/cifar100/sz224_swin_bs100.py', + '../../../_base_/datasets/cifar100/sz224_randaug_bs100.py', '../../../_base_/default_runtime.py', ] diff --git a/configs/classification/cifar100/mixups/vits/convnext_t_mixups_bs100.py b/configs/classification/cifar100/mixups/vits/convnext_t_mixups_bs100.py new file mode 100644 index 00000000..7a231c74 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_t_mixups_bs100.py @@ -0,0 +1,76 @@ +_base_ = [ + '../../../_base_/datasets/cifar100/sz32_randaug_bs100.py', + '../../../_base_/default_runtime.py', +] + +# model settings +model = dict( + type='MixUpClassification', + pretrained=None, + alpha=[1, 0.8], + mix_mode=['cutmix', 'mixup'], + mix_args=dict( + alignmix=dict(eps=0.1, max_iter=100), + attentivemix=dict(grid_size=32, top_k=None, beta=8), # AttentiveMix+ in this repo (use pre-trained) + automix=dict(mask_adjust=0, lam_margin=0), # require pre-trained mixblock + fmix=dict(decay_power=3, size=(32,32), max_soft=0., reformulate=False), + gridmix=dict(n_holes=(2, 6), hole_aspect_ratio=1., + cut_area_ratio=(0.5, 1), cut_aspect_ratio=(0.5, 2)), + manifoldmix=dict(layer=(0, 3)), + puzzlemix=dict(transport=True, t_batch_size=None, t_size=4, # t_size for small-scale datasets + block_num=5, beta=1.2, gamma=0.5, eta=0.2, neigh_size=4, n_labels=3, t_eps=0.8), + resizemix=dict(scope=(0.1, 0.8), use_alpha=True), + samix=dict(mask_adjust=0, lam_margin=0.08), # require pre-trained mixblock + transmix=dict(mix_mode="cutmix"), + ), + backbone=dict( + type='ConvNeXt_CIFAR', + arch='tiny', + out_indices=(3,), # x-1: stage-x + act_cfg=dict(type='GELU'), + drop_path_rate=0.3, + gap_before_final_norm=True, + ), + head=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer=['LayerNorm', 'BatchNorm'], val=1., bias=0.) 
+ ], +) + +# optimizer +optimizer = dict( + type='AdamW', + lr=1e-3, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'gamma': dict(weight_decay=0.), + }) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +# fp16 +use_fp16 = True +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict(grad_clip=None, update_interval=update_interval) + +# learning policy +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-6, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_attentivemix_a2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_attentivemix_a2_bs100_ep200.py new file mode 100644 index 00000000..bafabf82 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_attentivemix_a2_bs100_ep200.py @@ -0,0 +1,18 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + pretrained=None, + pretrained_k="torchvision://resnet50", + alpha=2, # float or list + mix_mode="attentivemix", + backbone_k=dict( # PyTorch pre-trained R-50 is required for attentivemix+ + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + style='pytorch'), +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_automix_l2_a2_near_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_automix_l2_a2_near_bs100_ep200.py new file mode 100644 index 00000000..c0228387 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_automix_l2_a2_near_bs100_ep200.py @@ -0,0 +1,120 @@ +_base_ = [ + '../../../../_base_/datasets/cifar100/sz32_randaug_bs100.py', + '../../../../_base_/default_runtime.py', +] + +# model settings +model = dict( + type='AutoMixup', + pretrained=None, + alpha=2.0, + momentum=0.999, + mask_layer=2, # downsampling to 1/16 + mask_loss=0.1, # using loss + mask_adjust=0, # none for large datasets + lam_margin=0.08, + switch_off=1.0, # switch off mixblock (fixed) + mask_up_override=None, + debug=True, + backbone=dict( + type='ConvNeXt_CIFAR', + arch='tiny', + out_indices=(2, 3), # x-1: stage-x + act_cfg=dict(type='GELU'), + drop_path_rate=0.3, + gap_before_final_norm=True, + ), + mix_block = dict( # AutoMix + type='PixelMixBlock', + in_channels=384, reduction=2, use_scale=True, + unsampling_mode=['nearest',], # str or list, train & test MixBlock, 'nearest' for AutoMix + lam_concat=True, lam_concat_v=False, # AutoMix.V1: lam cat q,k,v + lam_mul=False, lam_residual=False, lam_mul_k=-1, # SAMix lam: none + x_qk_concat=False, x_v_concat=False, # SAMix x concat: none + att_norm_cfg=None, # AutoMix: attention norm for fp16 + mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 + frozen=False), + head_one=dict( + type='VisionTransformerClsHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + head_mix=dict( + type='VisionTransformerClsHead', # mixup CE + label smooth +
loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + head_weights=dict( + decent_weight=[], accent_weight=[], + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], +) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +custom_hooks = [ + dict(type='SAVEHook', + save_interval=500 * 20, # 20 ep + iter_per_epoch=500, + ), + dict(type='CustomCosineAnnealingHook', # 0.1 to 0 + attr_name="mask_loss", attr_base=0.1, min_attr=0., by_epoch=False, # by iter + update_interval=update_interval, + ), + dict(type='CosineScheduleHook', + end_momentum=0.99996, # 0.999 to 0.99996 + adjust_scope=[0.25, 1.0], + warming_up="constant", + update_interval=update_interval, + interval=1) +] + +# optimizer +optimizer = dict( + type='AdamW', + lr=1e-3, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'gamma': dict(weight_decay=0.), + 'mix_block': dict(lr=1e-3), + }) +# # Sets `find_unused_parameters`: randomly switch off mixblock +# find_unused_parameters = True + +# fp16 +use_fp16 = False +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict(grad_clip=None, update_interval=update_interval) + +# lr scheduler +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, # 0.1 x lr + paramwise_options=['mix_block'], + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# validation hook +evaluation = dict(initial=False, save_best=None) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_cutmix_a2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_cutmix_a2_bs100_ep200.py new file mode 100644 index 00000000..cdd70245 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_cutmix_a2_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=2.0, + mix_mode="cutmix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_deit_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_deit_bs100_ep200.py new file mode 100644 index 00000000..0af832c7 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_deit_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=[1, 0.8], + mix_mode=['cutmix', 'mixup'], +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_fmix_a1_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_fmix_a1_bs100_ep200.py new file mode 100644 index 00000000..a530ac14 --- /dev/null +++
b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_fmix_a1_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=1.0, + mix_mode="fmix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_gridmix_a1_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_gridmix_a1_bs100_ep200.py new file mode 100644 index 00000000..d91847ed --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_gridmix_a1_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=1.0, + mix_mode="gridmix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_mixup_a0_8_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_mixup_a0_8_bs100_ep200.py new file mode 100644 index 00000000..354ef35f --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_mixup_a0_8_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=0.8, + mix_mode="mixup", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_puzzlemix_a2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_puzzlemix_a2_bs100_ep200.py new file mode 100644 index 00000000..3f5b243f --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_puzzlemix_a2_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=2.0, + mix_mode="puzzlemix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_resizemix_lam01_08_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_resizemix_lam01_08_bs100_ep200.py new file mode 100644 index 00000000..33c3b143 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_resizemix_lam01_08_bs100_ep200.py @@ -0,0 +1,13 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=1.0, + mix_mode="resizemix", + mix_args=dict( + resizemix=dict(scope=(0.1, 0.8), use_alpha=True), + ), +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_saliencymix_a0_2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_saliencymix_a0_2_bs100_ep200.py new file mode 100644 index 00000000..99f42e3c --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_saliencymix_a0_2_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=0.2, + mix_mode="saliencymix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_samix_l2_a2_near_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_samix_l2_a2_near_bs100_ep200.py new file mode 100644 index 
00000000..e8c6126c --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_samix_l2_a2_near_bs100_ep200.py @@ -0,0 +1,131 @@ +_base_ = [ + '../../../../_base_/datasets/cifar100/sz32_randaug_bs100.py', + '../../../../_base_/default_runtime.py', +] + +# value_neck_cfg +conv1x1=dict( + type="ConvNeck", + in_channels=384, hid_channels=192, out_channels=1, # MixBlock v + num_layers=2, kernel_size=1, + with_last_norm=False, norm_cfg=dict(type='BN'), # default + with_last_dropout=0.1, with_avg_pool=False, with_residual=False) # no res + dropout + +# model settings +model = dict( + type='AutoMixup', + pretrained=None, + alpha=2.0, + momentum=0.999, + mask_layer=2, # downsampling to 1/16 + mask_loss=0.1, # using loss + mask_adjust=0, # none for large datasets + lam_margin=0.08, + switch_off=1.0, # switch off mixblock (fixed) + mask_up_override=None, + debug=True, + backbone=dict( + type='ConvNeXt_CIFAR', + arch='tiny', + out_indices=(2, 3), # x-1: stage-x + act_cfg=dict(type='GELU'), + drop_path_rate=0.3, + gap_before_final_norm=True, + ), + mix_block = dict( # AutoMix + type='PixelMixBlock', + in_channels=384, reduction=2, use_scale=True, + unsampling_mode=['nearest',], # str or list, train & test MixBlock, 'nearest' for AutoMix + # unsampling_mode=['bilinear',], # str or list, tricks in SAMix + lam_concat=False, lam_concat_v=False, # AutoMix.V1: none + lam_mul=True, lam_residual=True, lam_mul_k=-1, # SAMix lam: mult + k=-1 (-1 for large datasets) + value_neck_cfg=conv1x1, # SAMix: non-linear value + x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k + # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also yields better performance in fp32) + mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix + frozen=False), + head_one=dict( + type='VisionTransformerClsHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + head_mix=dict( + type='VisionTransformerClsHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=False, + in_channels=768, num_classes=100), + head_weights=dict( + decent_weight=[], accent_weight=[], + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.)
+ ], +) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +custom_hooks = [ + dict(type='SAVEHook', + save_interval=500 * 20, # 20 ep + iter_per_epoch=500, + ), + dict(type='CustomCosineAnnealingHook', # 0.1 to 0 + attr_name="mask_loss", attr_base=0.1, min_attr=0., by_epoch=False, # by iter + update_interval=update_interval, + ), + dict(type='CosineScheduleHook', + end_momentum=0.99996, # 0.999 to 0.99996 + adjust_scope=[0.25, 1.0], + warming_up="constant", + update_interval=update_interval, + interval=1) +] + +# optimizer +optimizer = dict( + type='AdamW', + lr=1e-3, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'gamma': dict(weight_decay=0.), + 'mix_block': dict(lr=7.5e-4), + }) +# # Sets `find_unused_parameters`: randomly switch off mixblock +# find_unused_parameters = True + +# fp16 +use_fp16 = False +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict( + grad_clip=dict(max_norm=20.0), update_interval=update_interval) + +# lr scheduler +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, # 0.1 x lr + paramwise_options=['mix_block'], + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# validation hook +evaluation = dict(initial=False, save_best=None) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_smoothmix_a0_2_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_smoothmix_a0_2_bs100_ep200.py new file mode 100644 index 00000000..21e6e8c9 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_smoothmix_a0_2_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=0.2, + mix_mode="smoothmix", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_vanilla_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_vanilla_bs100_ep200.py new file mode 100644 index 00000000..f486c5d5 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/convnext_tiny/convnext_t_vanilla_bs100_ep200.py @@ -0,0 +1,10 @@ +_base_ = "../convnext_t_mixups_bs100.py" + +# model settings +model = dict( + alpha=1.0, + mix_mode="vanilla", +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/deit_small/deit_s_automix_l6_a2_near_sz224_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/deit_small/deit_s_automix_l6_a2_near_sz224_bs100_ep200.py index d309abec..7e3f3535 100644 --- a/configs/classification/cifar100/mixups/vits/deit_small/deit_s_automix_l6_a2_near_sz224_bs100_ep200.py +++ b/configs/classification/cifar100/mixups/vits/deit_small/deit_s_automix_l6_a2_near_sz224_bs100_ep200.py @@ -36,13 +36,13 @@ head_one=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original',
loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_mix=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_weights=dict( decent_weight=[], accent_weight=[], head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), diff --git a/configs/classification/cifar100/mixups/vits/deit_small/deit_s_samix_l6_a2_near_sz224_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/deit_small/deit_s_samix_l6_a2_near_sz224_bs100_ep200.py index c05e7027..e05e28d1 100644 --- a/configs/classification/cifar100/mixups/vits/deit_small/deit_s_samix_l6_a2_near_sz224_bs100_ep200.py +++ b/configs/classification/cifar100/mixups/vits/deit_small/deit_s_samix_l6_a2_near_sz224_bs100_ep200.py @@ -46,13 +46,13 @@ head_one=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_mix=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_weights=dict( decent_weight=[], accent_weight=[], head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), diff --git a/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_automix_l3_a2_near_sz224_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_automix_l3_a2_near_sz224_bs100_ep200.py new file mode 100644 index 00000000..21247159 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_automix_l3_a2_near_sz224_bs100_ep200.py @@ -0,0 +1,121 @@ +_base_ = [ + '../../../../_base_/datasets/cifar100/sz224_randaug_bs100.py', + '../../../../_base_/default_runtime.py', +] + +# model settings +model = dict( + type='AutoMixup', + pretrained=None, + alpha=2.0, + momentum=0.999, + mask_layer=3, # downsampling to 1/16 + mask_loss=0.1, # using loss + mask_adjust=0, # none for large datasets + lam_margin=0.08, + switch_off=1.0, # switch off mixblock (fixed) + mask_up_override=None, + debug=True, + backbone=dict( + type='SwinTransformer', + arch='tiny', + img_size=224, + drop_path_rate=0.2, + out_indices=(2,3,), # x-1: stage-x + ), + mix_block = dict( # AutoMix + type='PixelMixBlock', + in_channels=768, reduction=2, use_scale=True, + unsampling_mode=['nearest',], # str or list, train & test MixBlock, 'nearest' for AutoMix + lam_concat=True, lam_concat_v=False, # AutoMix.V1: lam cat q,k,v + lam_mul=False, lam_residual=False, lam_mul_k=-1, # SAMix lam: none + x_qk_concat=False, x_v_concat=False, # SAMix x concat: none + att_norm_cfg=None, # AutoMix: attention norm for fp16 + mask_loss_mode="L1", mask_loss_margin=0.1, # L1 loss, 0.1 + frozen=False), + head_one=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original',
loss_weight=1.0), + with_avg_pool=True, + in_channels=768, num_classes=100), + head_mix=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=True, + in_channels=768, num_classes=100), + head_weights=dict( + decent_weight=[], accent_weight=[], + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], +) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +custom_hooks = [ + dict(type='SAVEHook', + save_interval=500 * 20, # 20 ep + iter_per_epoch=500, + ), + dict(type='CustomCosineAnnealingHook', # 0.1 to 0 + attr_name="mask_loss", attr_base=0.1, min_attr=0., by_epoch=False, # by iter + update_interval=update_interval, + ), + dict(type='CosineScheduleHook', + end_momentum=0.99996, # 0.999 to 0.99996 + adjust_scope=[0.25, 1.0], + warming_up="constant", + update_interval=update_interval, + interval=1) +] + +# optimizer +optimizer = dict( + type='AdamW', + lr=5e-4, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'absolute_pos_embed': dict(weight_decay=0.), + 'relative_position_bias_table': dict(weight_decay=0.), + 'mix_block': dict(lr=1e-3), + }) +# # Sets `find_unused_parameters`: randomly switch off mixblock +# find_unused_parameters = True + +# fp16 +use_fp16 = False +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict( + grad_clip=dict(max_norm=10.0), update_interval=update_interval) + +# lr scheduler +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, # 0.1 x lr + paramwise_options=['mix_block'], + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# validation hook +evaluation = dict(initial=False, save_best=None) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_samix_l3_a2_near_sz224_bs100_ep200.py b/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_samix_l3_a2_near_sz224_bs100_ep200.py new file mode 100644 index 00000000..3b753af0 --- /dev/null +++ b/configs/classification/cifar100/mixups/vits/swin_tiny/swin_t_samix_l3_a2_near_sz224_bs100_ep200.py @@ -0,0 +1,131 @@ +_base_ = [ + '../../../../_base_/datasets/cifar100/sz224_randaug_bs100.py', + '../../../../_base_/default_runtime.py', +] + +# value_neck_cfg +conv1x1=dict( + type="ConvNeck", + in_channels=768, hid_channels=384, out_channels=1, # MixBlock v + num_layers=2, kernel_size=1, + with_last_norm=False, norm_cfg=dict(type='BN'), # default + with_last_dropout=0.1, with_avg_pool=False, with_residual=False) # no res + dropout + +# model settings +model = dict( + type='AutoMixup', + pretrained=None, + alpha=2.0, + momentum=0.999, + mask_layer=3, # stage-3 features, matching in_channels=768 and the l3 file name + mask_loss=0.1, # using loss + mask_adjust=0, # none for large datasets + lam_margin=0.08, + switch_off=1.0, # switch off mixblock (fixed) + mask_up_override=None, + debug=True, + backbone=dict( + type='SwinTransformer', +
arch='tiny', + img_size=224, + drop_path_rate=0.2, + out_indices=(2,3,), # x-1: stage-x + ), + mix_block = dict( # AutoMix + type='PixelMixBlock', + in_channels=768, reduction=2, use_scale=True, + unsampling_mode=['nearest',], # str or list, train & test MixBlock, 'nearest' for AutoMix + # unsampling_mode=['bilinear',], # str or list, tricks in SAMix + lam_concat=False, lam_concat_v=False, # AutoMix.V1: none + lam_mul=True, lam_residual=True, lam_mul_k=-1, # SAMix lam: mult + k=-1 (-1 for large datasets) + value_neck_cfg=conv1x1, # SAMix: non-linear value + x_qk_concat=True, x_v_concat=False, # SAMix x concat: q,k + # att_norm_cfg=dict(type='BN'), # norm after q,k (design for fp16, also yields better performance in fp32) + mask_loss_mode="L1+Variance", mask_loss_margin=0.1, # L1+Var loss, tricks in SAMix + frozen=False), + head_one=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=True, + in_channels=768, num_classes=100), + head_mix=dict( + type='ClsMixupHead', # mixup CE + label smooth + loss=dict(type='LabelSmoothLoss', + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + with_avg_pool=True, + in_channels=768, num_classes=100), + head_weights=dict( + decent_weight=[], accent_weight=[], + head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], +) + +# interval for accumulate gradient +update_interval = 1 # total: 1 x bs100 x 1 accumulates = bs100 + +custom_hooks = [ + dict(type='SAVEHook', + save_interval=500 * 20, # 20 ep + iter_per_epoch=500, + ), + dict(type='CustomCosineAnnealingHook', # 0.1 to 0 + attr_name="mask_loss", attr_base=0.1, min_attr=0., by_epoch=False, # by iter + update_interval=update_interval, + ), + dict(type='CosineScheduleHook', + end_momentum=0.99996, # 0.999 to 0.99996 + adjust_scope=[0.25, 1.0], + warming_up="constant", + update_interval=update_interval, + interval=1) +] + +# optimizer +optimizer = dict( + type='AdamW', + lr=5e-4, + weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999), + paramwise_options={ + '(bn|ln|gn)(\d+)?.(weight|bias)': dict(weight_decay=0.), + 'norm': dict(weight_decay=0.), + 'bias': dict(weight_decay=0.), + 'absolute_pos_embed': dict(weight_decay=0.), + 'relative_position_bias_table': dict(weight_decay=0.), + 'mix_block': dict(lr=5e-4), + }) +# Sets `find_unused_parameters`: randomly switch off mixblock +find_unused_parameters = True + +# fp16 +use_fp16 = False +fp16 = dict(type='mmcv', loss_scale='dynamic') +optimizer_config = dict( + grad_clip=dict(max_norm=20.0), update_interval=update_interval) + +# lr scheduler +lr_config = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, + warmup='linear', + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# additional scheduler +addtional_scheduler = dict( + policy='CosineAnnealing', + by_epoch=False, min_lr=1e-4, # 0.1 x lr + paramwise_options=['mix_block'], + warmup_iters=20, warmup_by_epoch=True, + warmup_ratio=1e-5, +) + +# validation hook +evaluation = dict(initial=False, save_best=None) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/configs/classification/cifar100/samix/vits/deit_s_l6_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py
b/configs/classification/cifar100/samix/vits/deit_s_l6_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py index ebce03e0..f7cb3e77 100644 --- a/configs/classification/cifar100/samix/vits/deit_s_l6_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py +++ b/configs/classification/cifar100/samix/vits/deit_s_l6_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py @@ -46,13 +46,13 @@ head_one=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_mix=dict( type='VisionTransformerClsHead', # mixup CE + label smooth loss=dict(type='LabelSmoothLoss', - label_smooth_val=0.1, num_classes=1000, mode='original', loss_weight=1.0), - in_channels=384, num_classes=1000), + label_smooth_val=0.1, num_classes=100, mode='original', loss_weight=1.0), + in_channels=384, num_classes=100), head_weights=dict( decent_weight=[], accent_weight=[], head_mix_q=1, head_one_q=1, head_mix_k=1, head_one_k=1), diff --git a/configs/classification/cifar100/samix/vits/swin_t_l3_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py b/configs/classification/cifar100/samix/vits/swin_t_l3_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py index 8e310681..347836bb 100644 --- a/configs/classification/cifar100/samix/vits/swin_t_l3_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py +++ b/configs/classification/cifar100/samix/vits/swin_t_l3_a2_near_val_dp01_sz224_mlr1e_4_bs100_ep200.py @@ -1,5 +1,5 @@ _base_ = [ - '../../../_base_/datasets/cifar100/sz224_swin_bs100.py', + '../../../_base_/datasets/cifar100/sz224_randaug_bs100.py', '../../../_base_/default_runtime.py', ] diff --git a/docs/en/mixup_benchmarks/Mixup_cifar.md b/docs/en/mixup_benchmarks/Mixup_cifar.md index 2c7a0eae..5e82226e 100644 --- a/docs/en/mixup_benchmarks/Mixup_cifar.md +++ b/docs/en/mixup_benchmarks/Mixup_cifar.md @@ -147,25 +147,25 @@ These benchmarks follow CutMix settings, training 200/400/800/1200 epochs from s **Setup** -* Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). -* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. 
- -| Backbones | $Beta$ | DEiT-S(/16) | Swin-T | -|---------------|:--------:|:-----------:|:----------:| -| Epoch | $\alpha$ | 200 epochs | 200 epochs | -| Vanilla | - | 65.81 | 78.41 | -| MixUp | 0.8 | 69.98 | 76.78 | -| CutMix | 2 | 74.12 | 80.64 | -| DeiT | 0.8,1 | 75.92 | 81.25 | -| SmoothMix | 0.2 | 67.54 | 66.69 | -| SaliencyMix | 0.2 | 69.78 | 80.40 | -| AttentiveMix+ | 2 | 75.98 | 81.13 | -| FMix* | 1 | 70.41 | 80.72 | -| GridMix | 1 | 68.86 | 78.54 | -| PuzzleMix | 2 | 73.60 | 80.33 | -| ResizeMix* | 1 | 68.45 | 80.16 | -| TransMix | 0.8,1 | 76.17 | - | -| AutoMix | 2 | 76.24 | 82.67 | -| SAMix* | 2 | 77.94 | | +* Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures, but use $32\times 32$ for ConvNeXt. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT/ConvNeXt and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). +* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. We released the trained models and logs in [vits-mix-cifar100-weights](https://github.com/Westlake-AI/openmixup/releases/tag/vits-mix-cifar100-weights). + +| Backbones | $Beta$ | DEiT-S(/16) | Swin-T | ConvNeXt-T | +|---------------|:--------:|:-----------:|:----------:|:----------:| +| Epoch | $\alpha$ | 200 epochs | 200 epochs | 200 epochs | +| Vanilla | - | 65.81 | 78.41 | | +| MixUp | 0.8 | 69.98 | 76.78 | | +| CutMix | 2 | 74.12 | 80.64 | | +| DeiT | 0.8,1 | 75.92 | 81.25 | | +| SmoothMix | 0.2 | 67.54 | 66.69 | | +| SaliencyMix | 0.2 | 69.78 | 80.40 | | +| AttentiveMix+ | 2 | 75.98 | 81.13 | | +| FMix* | 1 | 70.41 | 80.72 | | +| GridMix | 1 | 68.86 | 78.54 | | +| PuzzleMix | 2 | 73.60 | 80.33 | | +| ResizeMix* | 1 | 68.45 | 80.16 | | +| TransMix | 0.8,1 | 76.17 | - | | +| AutoMix | 2 | 76.24 | 82.67 | | +| SAMix* | 2 | 77.94 | 82.62 | |

(back to top)

\ No newline at end of file diff --git a/docs/en/model_zoos/Mixup_sup.md b/docs/en/model_zoos/Mixup_sup.md index 197c4fb3..102bfb38 100644 --- a/docs/en/model_zoos/Mixup_sup.md +++ b/docs/en/model_zoos/Mixup_sup.md @@ -277,26 +277,26 @@ These benchmarks follow CutMix settings, training 200/400/800/1200 epochs from s **Setup** -* Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). -* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. - -| Backbones | $Beta$ | DEiT-S(/16) | Swin-T | -|---------------|:--------:|:-----------:|:----------:| -| Epoch | $\alpha$ | 200 epochs | 200 epochs | -| Vanilla | - | 65.81 | 78.41 | -| MixUp | 0.8 | 69.98 | 76.78 | -| CutMix | 2 | 74.12 | 80.64 | -| DeiT | 0.8,1 | 75.92 | 81.25 | -| SmoothMix | 0.2 | 67.54 | 66.69 | -| SaliencyMix | 0.2 | 69.78 | 80.40 | -| AttentiveMix+ | 2 | 75.98 | 81.13 | -| FMix* | 1 | 70.41 | 80.72 | -| GridMix | 1 | 68.86 | 78.54 | -| PuzzleMix | 2 | 73.60 | 80.33 | -| ResizeMix* | 1 | 68.45 | 80.16 | -| TransMix | 0.8,1 | 76.17 | - | -| AutoMix | 2 | 76.24 | 82.67 | -| SAMix* | 2 | 77.94 | | +* Since the original resolutions of CIFAR-100 are too small for ViTs, we resize the input images to $224\times 224$ (training and testing) while not modifying the ViT architectures, but use $32\times 32$ for ConvNeXt. This benchmark uses DeiT setup and trains the model for 200 epochs with a batch size of 100 on CIFAR-100. The basic learning rate of DeiT/ConvNeXt and Swin are $1e-3$ and $5e-4$, which is the optimal setup in our experiments. We search and report $\alpha$ in $Beta(\alpha, \alpha)$ for all compared methods. View config files in [mixups/vits](https://github.com/Westlake-AI/openmixup/tree/main/configs/classification/cifar100/mixups/vits/). +* The **best** of top-1 accuracy in the last 10 training epochs is reported for ViT architectures. Notice that 📖 denotes original results reproduced by official implementations. We released the trained models and logs in [vits-mix-cifar100-weights](https://github.com/Westlake-AI/openmixup/releases/tag/vits-mix-cifar100-weights). + +| Backbones | $Beta$ | DEiT-S(/16) | Swin-T | ConvNeXt-T | +|---------------|:--------:|:-----------:|:----------:|:----------:| +| Epoch | $\alpha$ | 200 epochs | 200 epochs | 200 epochs | +| Vanilla | - | 65.81 | 78.41 | | +| MixUp | 0.8 | 69.98 | 76.78 | | +| CutMix | 2 | 74.12 | 80.64 | | +| DeiT | 0.8,1 | 75.92 | 81.25 | | +| SmoothMix | 0.2 | 67.54 | 66.69 | | +| SaliencyMix | 0.2 | 69.78 | 80.40 | | +| AttentiveMix+ | 2 | 75.98 | 81.13 | | +| FMix* | 1 | 70.41 | 80.72 | | +| GridMix | 1 | 68.86 | 78.54 | | +| PuzzleMix | 2 | 73.60 | 80.33 | | +| ResizeMix* | 1 | 68.45 | 80.16 | | +| TransMix | 0.8,1 | 76.17 | - | | +| AutoMix | 2 | 76.24 | 82.67 | | +| SAMix* | 2 | 77.94 | 82.62 | |

(back to top)
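Every entry in the tables above draws its mixing ratio $\lambda$ from $Beta(\alpha, \alpha)$ with the listed $\alpha$, and the DeiT row simply alternates between CutMix and MixUp per batch (`alpha=[1, 0.8]`, `mix_mode=['cutmix', 'mixup']` in the configs). For reference, here is a minimal stand-alone sketch of these two basic policies; the function names and the batch-level formulation are illustrative assumptions, not the OpenMixup API.

```python
# Minimal sketch of MixUp and CutMix on a batch `x` of shape [N, C, H, W]
# with integer labels `y`; all names here are illustrative only.
import numpy as np
import torch

def mixup_batch(x, y, alpha=0.8):
    """MixUp: interpolate whole images with lam ~ Beta(alpha, alpha)."""
    lam = float(np.random.beta(alpha, alpha))
    perm = torch.randperm(x.size(0), device=x.device)
    x_mix = lam * x + (1 - lam) * x[perm]
    # train with: lam * CE(pred, y) + (1 - lam) * CE(pred, y[perm])
    return x_mix, y, y[perm], lam

def cutmix_batch(x, y, alpha=2.0):
    """CutMix: paste a random box of area ratio (1 - lam), then correct lam."""
    lam = float(np.random.beta(alpha, alpha))
    perm = torch.randperm(x.size(0), device=x.device)
    H, W = x.shape[2], x.shape[3]
    cut_h, cut_w = int(H * np.sqrt(1 - lam)), int(W * np.sqrt(1 - lam))
    cy, cx = np.random.randint(H), np.random.randint(W)
    y1, y2 = np.clip(cy - cut_h // 2, 0, H), np.clip(cy + cut_h // 2, 0, H)
    x1, x2 = np.clip(cx - cut_w // 2, 0, W), np.clip(cx + cut_w // 2, 0, W)
    x_mix = x.clone()
    x_mix[:, :, y1:y2, x1:x2] = x[perm, :, y1:y2, x1:x2]
    lam = 1.0 - (y2 - y1) * (x2 - x1) / (H * W)  # area actually kept from the first image
    return x_mix, y, y[perm], lam
```

The saliency-, attention- and learning-based variants in the tables (SaliencyMix, AttentiveMix+, PuzzleMix, AutoMix, SAMix, TransMix) replace the random box or plain interpolation with content-aware mixing, which is what the configs in this diff select via `mix_mode`.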

diff --git a/openmixup/models/backbones/__init__.py b/openmixup/models/backbones/__init__.py index 95f440b2..d05cc744 100644 --- a/openmixup/models/backbones/__init__.py +++ b/openmixup/models/backbones/__init__.py @@ -2,7 +2,7 @@ from .beit import BEiTVisionTransformer from .context_cluster import ContextCluster from .convmixer import ConvMixer -from .convnext import ConvNeXt, ConvNeXt_Mix, MIMConvNeXt, ConvNeXt_CIFAR +from .convnext import ConvNeXt, ConvNeXt_Mix, MIMConvNeXt, ConvNeXt_CIFAR, ConvNeXt_Mix_CIFAR from .cspnet import CSPDarkNet, CSPNet, CSPResNet, CSPResNeXt from .davit import DaViT from .deit import DistilledVisionTransformer @@ -61,7 +61,7 @@ __all__ = [ 'AlexNet', 'BEiTViT', 'BEiTVisionTransformer', 'ContextCluster', - 'ConvNeXt', 'ConvNeXt_Mix', 'MIMConvNeXt', 'ConvNeXt_CIFAR', 'ConvMixer', + 'ConvNeXt', 'ConvNeXt_Mix', 'MIMConvNeXt', 'ConvNeXt_CIFAR', 'ConvNeXt_Mix_CIFAR', 'ConvMixer', 'CSPDarkNet', 'CSPNet', 'CSPResNet', 'CSPResNeXt', 'DaViT', 'DistilledVisionTransformer', 'DeiT3', 'DenseNet', 'DenseNet_CIFAR', 'EdgeNeXt', 'EfficientFormer', 'EfficientNet', 'EfficientNetV2', 'HorNet', 'HorNet_CIFAR', 'HRNet', diff --git a/openmixup/models/backbones/convnext.py b/openmixup/models/backbones/convnext.py index 79eab6cb..a74c8405 100644 --- a/openmixup/models/backbones/convnext.py +++ b/openmixup/models/backbones/convnext.py @@ -612,7 +612,124 @@ def __init__(self, in_channels=3, norm_cfg=dict(type='LN2d', eps=1e-6), **kwargs in_channels, self.channels[0], kernel_size=3, - stride=1), + stride=1, + padding=1), build_norm_layer(norm_cfg, self.channels[0])[1], ) self.downsample_layers[0] = stem + + +@BACKBONES.register_module() +class ConvNeXt_Mix_CIFAR(ConvNeXt): + """ConvNeXt backbone for CIFAR, support ManifoldMix and its variants + + Provide a port to mixup the latent space. + """ + def __init__(self, in_channels=3, norm_cfg=dict(type='LN2d', eps=1e-6), **kwargs): + super(ConvNeXt_Mix_CIFAR, self).__init__( + in_channels=in_channels, norm_cfg=norm_cfg, **kwargs) + + # the first stem layer + stem = nn.Sequential( + nn.Conv2d( + in_channels, + self.channels[0], + kernel_size=3, + stride=1, + padding=1), + build_norm_layer(norm_cfg, self.channels[0])[1], + ) + self.downsample_layers[0] = stem + + def _feature_mixup(self, x, mask, dist_shuffle=False, idx_shuffle_mix=None, + cross_view=False, BN_shuffle=False, idx_shuffle_BN=None, + idx_unshuffle_BN=None, **kwargs): + """ mixup two feature maps with the pixel-wise mask + + Args: + x, mask (tensor): Input x [N,C,H,W] and mixup mask [N, \*, H, W]. + dist_shuffle (bool): Whether to shuffle cross gpus. + idx_shuffle_mix (tensor): Shuffle indice of [N,1] to generate x_. + cross_view (bool): Whether to view the input x as two views [2N, C, H, W], + which is usually adopted in self-supervised and semi-supervised settings. + BN_shuffle (bool): Whether to do shuffle cross gpus for shuffle_BN. + idx_shuffle_BN (tensor): Shuffle indice to utilize shuffle_BN cross gpus. + idx_unshuffle_BN (tensor): Unshuffle indice for the shuffle_BN (in pair). 
+ """ + # adjust mixup mask + assert mask.dim() == 4 and mask.size(1) <= 2 + if mask.size(1) == 1: + mask = [mask, 1 - mask] + else: + mask = [ + mask[:, 0, :, :].unsqueeze(1), mask[:, 1, :, :].unsqueeze(1)] + # undo shuffle_BN for ssl mixup + if BN_shuffle: + assert idx_unshuffle_BN is not None and idx_shuffle_BN is not None + x = grad_batch_unshuffle_ddp(x, idx_unshuffle_BN) # 2N index if cross_view + + # shuffle input + if dist_shuffle==True: # cross gpus shuffle + assert idx_shuffle_mix is not None + if cross_view: + N = x.size(0) // 2 + detach_p = random.random() + x_ = x[N:, ...].clone().detach() if detach_p < 0.5 else x[N:, ...] + x = x[:N, ...] if detach_p < 0.5 else x[:N, ...].detach() + x_, _, _ = grad_batch_shuffle_ddp(x_, idx_shuffle_mix) + else: + x_, _, _ = grad_batch_shuffle_ddp(x, idx_shuffle_mix) + else: # within each gpu + if cross_view: + # default: the input image is shuffled + N = x.size(0) // 2 + detach_p = random.random() + x_ = x[N:, ...].clone().detach() if detach_p < 0.5 else x[N:, ...] + x = x[:N, ...] if detach_p < 0.5 else x[:N, ...].detach() + else: + x_ = x[idx_shuffle_mix, :] + assert x.size(3) == mask[0].size(3), \ + "mismatching mask x={}, mask={}.".format(x.size(), mask[0].size()) + mix = x * mask[0] + x_ * mask[1] + + # redo shuffle_BN for ssl mixup + if BN_shuffle: + mix, _, _ = grad_batch_shuffle_ddp(mix, idx_shuffle_BN) # N index + + return mix + + def forward(self, x, mix_args=None): + """ only support mask-based mixup policy """ + # latent space mixup + if mix_args is not None: + assert isinstance(mix_args, dict) + mix_layer = mix_args["layer"] # {0, 1, 2, 3} + if mix_args["BN_shuffle"]: + x, _, idx_unshuffle = grad_batch_shuffle_ddp(x) # 2N index if cross_view + else: + idx_unshuffle = None + else: + mix_layer = -1 + + # input mixup + if mix_layer == 0: + x = self._feature_mixup(x, idx_unshuffle_BN=idx_unshuffle, **mix_args) + + outs = [] + for i, stage in enumerate(self.stages): + x = self.downsample_layers[i](x) + x = stage(x) + if i in self.out_indices: + if i == 3: + norm_layer = getattr(self, f'norm{i}') + if self.gap_before_final_norm and i == 3: + gap = x.mean([-2, -1], keepdim=True) + x = norm_layer(gap).flatten(1) + else: + x = norm_layer(x) + outs.append(x) + if len(self.out_indices) == 1: + return outs + if i+1 == mix_layer: + x = self._feature_mixup(x, idx_unshuffle_BN=idx_unshuffle, **mix_args) + return outs diff --git a/requirements/optional.txt b/requirements/optional.txt index a429bd12..d6a6cbd5 100644 --- a/requirements/optional.txt +++ b/requirements/optional.txt @@ -1,5 +1,7 @@ albumentations>=0.3.2 # For Albumentations data transform -faiss-gpu>=1.6.1 # For DeepCluster and ODC -grad-cam >= 1.3.7 # For CAM visualization -requests # For torchserve +faiss-gpu>=1.6.1 # For DeepCluster and ODC +grad-cam >= 1.3.7 # For CAM visualization +gco==1.0.1 # For PuzzleMix (please install from source) +opencv-contrib-python # For SaliencyMix +requests # For torchserve scikit-image