diff --git a/configs/beit/beit_base_p16_224_ft_1k.yaml b/configs/beit/beit_base_p16_224_ft_1k.yaml new file mode 100644 index 00000000..4628cbc3 --- /dev/null +++ b/configs/beit/beit_base_p16_224_ft_1k.yaml @@ -0,0 +1,123 @@ +epochs: 100 +output_dir: output_dir +seed: 0 +device: gpu + +model: + name: BEiTFTWrapper + architecture: + name: VisionTransformerForFinetune + img_size: 224 + embed_dim: 768 + patch_size: 16 + depth: 12 + num_heads: 12 + mlp_ratio: 4 + qkv_bias: True + drop_path_rate: 0.1 + init_values: 0.1 + use_abs_pos_emb: False + use_rel_pos_bias: True + head: + name: BEiTFTHead + num_classes: 1000 + in_channels: 768 + +dataloader: + train: + loader: + num_workers: 8 + use_shared_memory: True + sampler: + batch_size: 128 + shuffle: True + drop_last: True + dataset: + name: ImageNet + dataroot: data/ILSVRC2012/train/ + return_label: True + transforms: + - name: RandomResizedCrop + size: 224 + scale: [0.08, 1.] + interpolation: 'bicubic' + - name: RandomHorizontalFlip + - name: AutoAugment + config_str: 'rand-m9-mstd0.5-inc1' + interpolation: 'bicubic' + img_size: 224 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + - name: Transpose + - name: NormalizeImage + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + - name: RandomErasing + prob: 0.25 + mode: 'pixel' + max_count: 1 + batch_transforms: + - name: Mixup + mixup_alpha: 0.8 + prob: 1. + switch_prob: 0.5 + mode: 'batch' + cutmix_alpha: 1.0 + val: + loader: + num_workers: 8 + use_shared_memory: True + sampler: + batch_size: 64 + shuffle: false + drop_last: false + dataset: + name: ImageNet + dataroot: data/ILSVRC2012/val + return_label: True + transforms: + - name: Resize + size: 256 + interpolation: 'bicubic' + - name: CenterCrop + size: 224 + - name: Transpose + - name: Normalize + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + +lr_scheduler: + name: LinearWarmup + learning_rate: + name: CosineAnnealingDecay + learning_rate: 4e-3 + T_max: 100 + eta_min: 1e-6 + warmup_steps: 20 + start_lr: 0 + end_lr: 4e-3 + +optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + weight_decay: 0.05 + epsilon: 1e-8 + exclude_from_weight_decay: ["pos_embed","cls_token",".bias","norm","gamma"] + layer_decay: 0.65 + +log_config: + name: LogHook + interval: 10 + +checkpoint: + name: CheckpointHook + by_epoch: true + interval: 1 + +custom_config: + - name: EvaluateHook + +vdl_config: + name: VisualHook diff --git a/configs/beit/beit_base_p16_224_pt_1k.yaml b/configs/beit/beit_base_p16_224_pt_1k.yaml new file mode 100644 index 00000000..3ff214d3 --- /dev/null +++ b/configs/beit/beit_base_p16_224_pt_1k.yaml @@ -0,0 +1,104 @@ +epochs: 800 +output_dir: output_dir +seed: 0 +device: gpu + +model: + name: BEiTPTWrapper + architecture: + name: VisionTransformerForMaskedImageModeling + img_size: 224 + embed_dim: 768 + patch_size: 16 + depth: 12 + num_heads: 12 + mlp_ratio: 4 + use_abs_pos_emb: False + use_rel_pos_bias: False + use_shared_rel_pos_bias: True + init_values: 0.1 + drop_path_rate: 0.1 + head: + name: BEiTPTHead + num_classes: 1000 + in_channels: 768 + d_vae: + name: dall-e + weight_path: 'dvae/' + image_size: 112 + +dataloader: + train: + loader: + num_workers: 0 + use_shared_memory: False + sampler: + batch_size: 128 + shuffle: True + drop_last: True + dataset: + name: BEiT_ImageNet + dataroot: data/ILSVRC2012/train/ + common_transforms: + - name: ToRGB + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: RandomHorizontalFlip + - name: 
RandomResizedCropAndInterpolationWithTwoPic + size: 224 + second_size: 112 + interpolation: 'bicubic' + second_interpolation: 'lanczos' + patch_transforms: + - name: Transpose + - name: NormalizeImage + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + visual_token_transforms: + - name: Transpose + - name: VisualTokenMap + mode: 'map_pixels' + scale: 255 + masking_generator: + input_size: 14 + num_masking_patches: 75 + max_num_patches: None + min_num_patches: 16 + +lr_scheduler: + name: LinearWarmup + learning_rate: + name: CosineAnnealingDecay + learning_rate: 3e-3 + T_max: 800 + eta_min: 1e-5 + warmup_steps: 10 + start_lr: 0 + end_lr: 3e-3 + +optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + weight_decay: 0.05 + epsilon: 1e-8 + exclude_from_weight_decay: ["pos_embed","cls_token",".bias","norm","gamma"] + grad_clip: + name: global_norm + value: 3.0 + +log_config: + name: LogHook + interval: 1 + +checkpoint: + name: CheckpointHook + by_epoch: True + interval: 1 + +vdl_config: + name: VisualHook diff --git a/configs/byol/byol_clas_r50.yaml b/configs/byol/byol_clas_r50.yaml index 3fa7cd11..70965f6d 100644 --- a/configs/byol/byol_clas_r50.yaml +++ b/configs/byol/byol_clas_r50.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: ByolClassification @@ -16,7 +18,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -30,11 +34,13 @@ dataloader: - name: RandomHorizontalFlip - name: Resize size: [224,224] - interpolation: bicubic + interpolation: bicubic - name: ByolNormalize - name: Clip val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false @@ -49,7 +55,7 @@ dataloader: - name: CenterCrop size: 224 - name: ByolNormalize - - name: Clip + - name: Clip lr_scheduler: name: ByolLRScheduler diff --git a/configs/byol/byol_r50_IM.yaml b/configs/byol/byol_r50_IM.yaml index 379bae08..56ae2970 100644 --- a/configs/byol/byol_r50_IM.yaml +++ b/configs/byol/byol_r50_IM.yaml @@ -1,8 +1,11 @@ epochs: 300 use_byol_iters: True total_images: 1281167 -global_batch_size: 4096 # 128 * 4 * 8 +global_batch_size: 4096 output_dir: output_dir +seed: 0 +device: gpu + model: name: BYOL backbone: @@ -33,7 +36,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True diff --git a/configs/cait/cait_m36_384.yaml b/configs/cait/cait_m36_384.yaml index e8097250..4b56d5f1 100644 --- a/configs/cait/cait_m36_384.yaml +++ b/configs/cait/cait_m36_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_m48_448.yaml b/configs/cait/cait_m48_448.yaml index d4d5b6b3..163d3d86 100644 --- a/configs/cait/cait_m48_448.yaml +++ b/configs/cait/cait_m48_448.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_s24_224.yaml b/configs/cait/cait_s24_224.yaml index 1e8c0299..9734f6d6 100644 --- a/configs/cait/cait_s24_224.yaml +++ b/configs/cait/cait_s24_224.yaml @@ -1,5 +1,7 @@ epochs: 300 
output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_s24_384.yaml b/configs/cait/cait_s24_384.yaml index 830bd0c5..2f3c3500 100644 --- a/configs/cait/cait_s24_384.yaml +++ b/configs/cait/cait_s24_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_s36_384.yaml b/configs/cait/cait_s36_384.yaml index 2b1f1959..c138ea6e 100644 --- a/configs/cait/cait_s36_384.yaml +++ b/configs/cait/cait_s36_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_xs24_384.yaml b/configs/cait/cait_xs24_384.yaml index d855eef7..197653e8 100644 --- a/configs/cait/cait_xs24_384.yaml +++ b/configs/cait/cait_xs24_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/clip/vit-b-32.yaml b/configs/clip/vit-b-32.yaml index 549ff58b..0e40fd6d 100644 --- a/configs/clip/vit-b-32.yaml +++ b/configs/clip/vit-b-32.yaml @@ -1,5 +1,7 @@ epochs: 10 output_dir: output_dir +seed: 0 +device: gpu model: name: CLIPWrapper @@ -24,7 +26,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -38,7 +42,7 @@ dataloader: size: 224 scale: [0.75, 1.] ratio: [1., 1.] 
- - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] diff --git a/configs/convnext/convnext_small_224.yaml b/configs/convnext/convnext_small_224.yaml index 78c28c31..fb772704 100644 --- a/configs/convnext/convnext_small_224.yaml +++ b/configs/convnext/convnext_small_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -14,7 +16,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/convnext/convnext_tiny_224.yaml b/configs/convnext/convnext_tiny_224.yaml index 03001502..3d4b9829 100644 --- a/configs/convnext/convnext_tiny_224.yaml +++ b/configs/convnext/convnext_tiny_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -14,7 +16,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cvt/cvt_13_224.yaml b/configs/cvt/cvt_13_224.yaml index 3e23777d..69617d29 100644 --- a/configs/cvt/cvt_13_224.yaml +++ b/configs/cvt/cvt_13_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CvTWrapper @@ -15,7 +17,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cvt/cvt_21_224.yaml b/configs/cvt/cvt_21_224.yaml index 46f31b20..44156ad3 100644 --- a/configs/cvt/cvt_21_224.yaml +++ b/configs/cvt/cvt_21_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CvTWrapper @@ -15,7 +17,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cvt/cvt_21_384.yaml b/configs/cvt/cvt_21_384.yaml index 466500e9..cf8412ae 100644 --- a/configs/cvt/cvt_21_384.yaml +++ b/configs/cvt/cvt_21_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CvTWrapper @@ -15,7 +17,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cvt/cvt_w24_384.yaml b/configs/cvt/cvt_w24_384.yaml index 03b88756..79f2a9ce 100644 --- a/configs/cvt/cvt_w24_384.yaml +++ b/configs/cvt/cvt_w24_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CvTWrapper @@ -15,7 +17,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/deit/deit-base-p16-pt_in1k-224_2n16c_fp16_o1_dp.yaml b/configs/deit/deit-base-p16-pt_in1k-224_2n16c_fp16_o1_dp.yaml index 1f0a67b9..1135ab80 100644 --- a/configs/deit/deit-base-p16-pt_in1k-224_2n16c_fp16_o1_dp.yaml +++ b/configs/deit/deit-base-p16-pt_in1k-224_2n16c_fp16_o1_dp.yaml @@ -1,5 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir +seed: 0 +device: gpu use_amp: True AMP: @@ -16,29 +18,31 @@ AMP: model: name: DeiTWrapper architecture: - name: DeiTVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: DeiTVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.0 - 
drop_path_rate : 0.1 + drop_path_rate : 0.1 dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: name: DistributedRepeatedAugSampler - batch_size: 64 + batch_size: 64 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -53,7 +57,7 @@ dataloader: - name: AutoAugment config_str: 'rand-m9-mstd0.5-inc1' interpolation: 'bicubic' - img_size: 224 + img_size: 224 - name: Normalize data_format: 'HWC' mean: [123.675, 116.28, 103.53] @@ -74,7 +78,9 @@ dataloader: num_classes: 1000 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -103,7 +109,7 @@ lr_config: unit: 'epoch' lr_scheduler: - name: TimmCosine + name: TimmCosine learning_rate: 1e-3 eta_min: 1e-5 warmup_epoch: 5 diff --git a/configs/mae/mae_vit_b_finetune.yaml b/configs/mae/mae_vit_b_finetune.yaml index a8a2a001..7d0e1af7 100644 --- a/configs/mae/mae_vit_b_finetune.yaml +++ b/configs/mae/mae_vit_b_finetune.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: MAE_FINETUNE @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/mae/mae_vit_b_pretrain.yaml b/configs/mae/mae_vit_b_pretrain.yaml index c6aab60d..b9381abf 100644 --- a/configs/mae/mae_vit_b_pretrain.yaml +++ b/configs/mae/mae_vit_b_pretrain.yaml @@ -1,5 +1,7 @@ epochs: 800 output_dir: output_dir +seed: 0 +device: gpu model: name: MAE_PRETRAIN @@ -14,10 +16,11 @@ model: decoder_num_heads: 16 mlp_ratio: 4 - dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/mlp_mixer/mlp-mixer_b16_224.yaml b/configs/mlp_mixer/mlp-mixer_b16_224.yaml index 025d591e..f58b5851 100644 --- a/configs/mlp_mixer/mlp-mixer_b16_224.yaml +++ b/configs/mlp_mixer/mlp-mixer_b16_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: MlpMixerWrapper @@ -16,7 +18,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/mlp_mixer/mlp-mixer_l16_224.yaml b/configs/mlp_mixer/mlp-mixer_l16_224.yaml index 54582f0a..bf709450 100644 --- a/configs/mlp_mixer/mlp-mixer_l16_224.yaml +++ b/configs/mlp_mixer/mlp-mixer_l16_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: MlpMixerWrapper @@ -16,7 +18,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/moco/moco_clas_r50.yaml b/configs/moco/moco_clas_r50.yaml index 33d86461..fb1369d2 100644 --- a/configs/moco/moco_clas_r50.yaml +++ b/configs/moco/moco_clas_r50.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -14,7 +16,9 @@ model: dataloader: train: - num_workers: 6 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: true @@ -33,7 +37,9 @@ dataloader: mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] val: - num_workers: 4 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: false diff --git a/configs/moco/moco_v1_r50.yaml b/configs/moco/moco_v1_r50.yaml index 70ce6611..38dfb443 100644 --- 
a/configs/moco/moco_v1_r50.yaml +++ b/configs/moco/moco_v1_r50.yaml @@ -1,5 +1,7 @@ epochs: 200 output_dir: output_dir +seed: 0 +device: gpu model: name: MoCo @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: true diff --git a/configs/moco/moco_v2_r50.yaml b/configs/moco/moco_v2_r50.yaml index 8d675aa3..6478fc7c 100644 --- a/configs/moco/moco_v2_r50.yaml +++ b/configs/moco/moco_v2_r50.yaml @@ -1,5 +1,7 @@ epochs: 200 output_dir: output_dir +seed: 0 +device: gpu model: name: MoCo @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: true diff --git a/configs/moco_byol/moco_byol_r50_IM.yaml b/configs/moco_byol/moco_byol_r50_IM.yaml index 67dfc2a7..25da1042 100644 --- a/configs/moco_byol/moco_byol_r50_IM.yaml +++ b/configs/moco_byol/moco_byol_r50_IM.yaml @@ -3,6 +3,8 @@ use_byol_iters: True total_images: 1281167 global_batch_size: 4096 # 128 * 4 * 8 output_dir: output_dir +seed: 0 +device: gpu model: name: MoCoBYOL @@ -32,7 +34,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/moco_byol/moco_byol_r50_IM_clas.yaml b/configs/moco_byol/moco_byol_r50_IM_clas.yaml index 381db80f..72a052f6 100644 --- a/configs/moco_byol/moco_byol_r50_IM_clas.yaml +++ b/configs/moco_byol/moco_byol_r50_IM_clas.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -16,7 +18,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -28,7 +32,7 @@ dataloader: transforms: - name: RandomResizedCrop size: 224 - interpolation: bicubic + interpolation: bicubic - name: RandomHorizontalFlip - name: Transpose - name: NormalizeImage @@ -36,7 +40,9 @@ dataloader: mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: false @@ -48,7 +54,7 @@ dataloader: transforms: - name: Resize size: 256 - interpolation: bicubic + interpolation: bicubic - name: CenterCrop size: 224 - name: Transpose @@ -73,4 +79,3 @@ log_config: custom_config: - name: EvaluateHook - diff --git a/configs/pixpro/pixpro_base_r50_100ep.yaml b/configs/pixpro/pixpro_base_r50_100ep.yaml index 401d89ad..45138a52 100644 --- a/configs/pixpro/pixpro_base_r50_100ep.yaml +++ b/configs/pixpro/pixpro_base_r50_100ep.yaml @@ -1,6 +1,8 @@ epochs: 100 output_dir: output_dir total_images: 1281167 +seed: 0 +device: gpu model: name: PixPro @@ -27,7 +29,9 @@ model: dataloader: train: - num_workers: 4 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True diff --git a/configs/pixpro/pixpro_base_r50_100ep_IM_clas.yaml b/configs/pixpro/pixpro_base_r50_100ep_IM_clas.yaml index 253756fb..987e2dcc 100644 --- a/configs/pixpro/pixpro_base_r50_100ep_IM_clas.yaml +++ b/configs/pixpro/pixpro_base_r50_100ep_IM_clas.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -14,7 +16,9 @@ model: dataloader: train: - num_workers: 6 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: true @@ -32,7 +36,9 @@ dataloader: mean: [123.675, 116.28, 103.53] std: [58.395, 57.12, 57.375] val: - num_workers: 4 + loader: + num_workers: 8 + 
use_shared_memory: True sampler: batch_size: 64 shuffle: false diff --git a/configs/swin_transformer/SwinTransformer_base_patch4_window7_224.yaml b/configs/swin_transformer/SwinTransformer_base_patch4_window7_224.yaml index 63d79201..9672bbc8 100644 --- a/configs/swin_transformer/SwinTransformer_base_patch4_window7_224.yaml +++ b/configs/swin_transformer/SwinTransformer_base_patch4_window7_224.yaml @@ -1,6 +1,7 @@ epochs: 300 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -19,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -54,7 +57,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/swin_transformer/SwinTransformer_giant_patch4_window7_224.yaml b/configs/swin_transformer/SwinTransformer_giant_patch4_window7_224.yaml index 2b20d15f..c91af03a 100644 --- a/configs/swin_transformer/SwinTransformer_giant_patch4_window7_224.yaml +++ b/configs/swin_transformer/SwinTransformer_giant_patch4_window7_224.yaml @@ -2,6 +2,7 @@ epochs: 300 output_dir: output_dir seed: 16 use_amp: True +device: gpu AMP: level: 'O1' @@ -19,6 +20,7 @@ hybrid: mp_degree: 1 pp_degree: 1 +# Make sure your paddle version is develop to use sharding. sharding: sharding_stage: 2 offload: False @@ -42,7 +44,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 8 shuffle: true @@ -78,7 +82,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: false diff --git a/configs/swin_transformer/SwinTransformer_small_patch4_window7_224.yaml b/configs/swin_transformer/SwinTransformer_small_patch4_window7_224.yaml index da9cd628..f7e70893 100644 --- a/configs/swin_transformer/SwinTransformer_small_patch4_window7_224.yaml +++ b/configs/swin_transformer/SwinTransformer_small_patch4_window7_224.yaml @@ -1,6 +1,7 @@ epochs: 300 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -19,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -39,7 +42,7 @@ dataloader: interpolation: 'bicubic' img_size: 224 - name: Transpose - - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false @@ -71,7 +76,7 @@ dataloader: - name: CenterCrop size: 224 - name: Transpose - - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] diff --git a/configs/swin_transformer/SwinTransformer_tiny_patch4_window7_224.yaml b/configs/swin_transformer/SwinTransformer_tiny_patch4_window7_224.yaml index 47f902db..8ec068cc 100644 --- a/configs/swin_transformer/SwinTransformer_tiny_patch4_window7_224.yaml +++ b/configs/swin_transformer/SwinTransformer_tiny_patch4_window7_224.yaml @@ -1,6 +1,7 @@ epochs: 300 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -19,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -39,7 +42,7 @@ 
dataloader: interpolation: 'bicubic' img_size: 224 - name: Transpose - - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false @@ -71,7 +76,7 @@ dataloader: - name: CenterCrop size: 224 - name: Transpose - - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] diff --git a/configs/t2t_vit/t2t_vit_14.yaml b/configs/t2t_vit/t2t_vit_14.yaml index 4fe7c7bc..607fa892 100644 --- a/configs/t2t_vit/t2t_vit_14.yaml +++ b/configs/t2t_vit/t2t_vit_14.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_19.yaml b/configs/t2t_vit/t2t_vit_19.yaml index d3ff014e..07c1cc9e 100644 --- a/configs/t2t_vit/t2t_vit_19.yaml +++ b/configs/t2t_vit/t2t_vit_19.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_24.yaml b/configs/t2t_vit/t2t_vit_24.yaml index b107632b..4fbe27a9 100644 --- a/configs/t2t_vit/t2t_vit_24.yaml +++ b/configs/t2t_vit/t2t_vit_24.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_t_14.yaml b/configs/t2t_vit/t2t_vit_t_14.yaml index 8769024b..1f1112ab 100644 --- a/configs/t2t_vit/t2t_vit_t_14.yaml +++ b/configs/t2t_vit/t2t_vit_t_14.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_t_19.yaml b/configs/t2t_vit/t2t_vit_t_19.yaml index f546cc4a..eebe0364 100644 --- a/configs/t2t_vit/t2t_vit_t_19.yaml +++ b/configs/t2t_vit/t2t_vit_t_19.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_t_24.yaml b/configs/t2t_vit/t2t_vit_t_24.yaml index c634fd44..18a3b9da 100644 --- a/configs/t2t_vit/t2t_vit_t_24.yaml +++ b/configs/t2t_vit/t2t_vit_t_24.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/vision_transformer/vit-base-p16-ft_in1k-384.yaml b/configs/vision_transformer/vit-base-p16-ft_in1k-384.yaml index eabe6e68..3cfcd2bc 100644 --- a/configs/vision_transformer/vit-base-p16-ft_in1k-384.yaml +++ b/configs/vision_transformer/vit-base-p16-ft_in1k-384.yaml @@ -1,37 +1,41 @@ -epochs: 300 
+epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: ViTWrapper architecture: - name: VisionTransformer - img_size: 384 - patch_size: 16 - width: 768 - depth: 8 - num_heads: 8 - mlp_ratio: 3 - qkv_bias: True + name: VisionTransformer + img_size: 384 + patch_size: 16 + width: 768 + depth: 8 + num_heads: 8 + mlp_ratio: 3 + qkv_bias: True head: name: VisionTransformerClsHead num_classes: 1000 - in_channels: 768 + in_channels: 768 dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: true drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: - name: ToRGB - name: RandomResizedCrop - size: 384 + size: 384 scale: [0.75, 1.] ratio: [1., 1.] interpolation: 'bicubic' @@ -41,12 +45,12 @@ dataloader: std: [127.5, 127.5, 127.5] lr_scheduler: - name: CosineWarmup + name: CosineWarmup learning_rate: 12.28 T_max: 93835 - warmup_steps: 10000 + warmup_steps: 10000 start_lr: 0.01228 - end_lr: 12.28 + end_lr: 12.28 optimizer: name: AdamW diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_dp.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_dp.yaml index bf5468cd..2583f46f 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_dp.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_dp.yaml @@ -1,6 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu use_amp: True AMP: @@ -17,14 +18,14 @@ AMP: model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -33,13 +34,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -54,10 +57,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -78,11 +83,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_sharding.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_sharding.yaml index a3ff7554..72ff6433 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_sharding.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_sharding.yaml @@ -1,6 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu use_amp: True AMP: @@ -14,6 +15,7 @@ AMP: "sigmoid_cross_entropy_with_logits", "elementwise_div"] level: 'O1' +# Make sure your paddle version is develop to use sharding. 
sharding: sharding_stage: 2 offload: False @@ -22,14 +24,14 @@ sharding: model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -38,13 +40,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -59,10 +63,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -83,11 +89,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_dp.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_dp.yaml index 94ae3f3d..d8066c55 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_dp.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_dp.yaml @@ -1,6 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu use_amp: True AMP: @@ -17,14 +18,14 @@ AMP: model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -33,13 +34,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -54,10 +57,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -78,11 +83,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_sharding.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_sharding.yaml index 2073c57c..c905c250 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_sharding.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_sharding.yaml @@ -1,6 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu use_amp: True AMP: @@ -14,6 +15,7 @@ AMP: 
"sigmoid_cross_entropy_with_logits", "elementwise_div"] level: 'O2' +# Make sure your paddle version is develop to use sharding. sharding: sharding_stage: 2 offload: False @@ -22,14 +24,14 @@ sharding: model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -38,13 +40,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -59,10 +63,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -83,11 +89,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp32_dp.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp32_dp.yaml index fae2842f..eb19f00d 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp32_dp.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp32_dp.yaml @@ -1,18 +1,19 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -21,13 +22,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -42,10 +45,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -66,11 +71,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p32_ft_in1k-384.yaml b/configs/vision_transformer/vit-base-p32_ft_in1k-384.yaml index 901060fc..23d4c346 100644 --- a/configs/vision_transformer/vit-base-p32_ft_in1k-384.yaml +++ b/configs/vision_transformer/vit-base-p32_ft_in1k-384.yaml @@ -1,37 +1,41 @@ -epochs: 300 +epochs: 300 output_dir: output_dir +seed: 16 +device: gpu model: name: ViTWrapper architecture: - name: VisionTransformer - img_size: 384 - 
patch_size: 32 - width: 768 - depth: 12 - num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + name: VisionTransformer + img_size: 384 + patch_size: 32 + width: 768 + depth: 12 + num_heads: 12 + mlp_ratio: 4 + qkv_bias: True head: name: VisionTransformerClsHead num_classes: 1000 - in_channels: 768 + in_channels: 768 dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: true drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: - name: ToRGB - name: RandomResizedCrop - size: 384 + size: 384 scale: [0.75, 1.] ratio: [1., 1.] interpolation: 'bicubic' @@ -41,12 +45,12 @@ dataloader: std: [127.5, 127.5, 127.5] lr_scheduler: - name: CosineWarmup - learning_rate: 0.003 + name: CosineWarmup + learning_rate: 0.003 T_max: 93835 - warmup_steps: 10000 + warmup_steps: 10000 start_lr: 0.00003 - end_lr: 0.003 + end_lr: 0.003 optimizer: name: AdamW diff --git a/configs/vision_transformer/vit-g-p14-pt_in1k-224_1n8c.yaml b/configs/vision_transformer/vit-g-p14-pt_in1k-224_1n8c.yaml index 8554a48f..f41d6bfc 100644 --- a/configs/vision_transformer/vit-g-p14-pt_in1k-224_1n8c.yaml +++ b/configs/vision_transformer/vit-g-p14-pt_in1k-224_1n8c.yaml @@ -1,30 +1,32 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 14 - embed_dim: 1664 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 14 + embed_dim: 1664 depth: 48 num_heads: 16 mlp_ratio: 4.9231 - qkv_bias: True + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 representation_size: 768 label_smoothing: 0.0001 - + +# Make sure your paddle version is develop to use sharding. 
sharding: sharding_stage: 2 - offload: False - accumulate_grad: False + offload: False + accumulate_grad: False -use_amp: True +use_amp: True AMP: level: 'O2' save_dtype: 'float32' @@ -38,13 +40,15 @@ AMP: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 16 + batch_size: 16 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -59,10 +63,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -83,11 +89,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/xcit/xcit_large_24_p16_224.yaml b/configs/xcit/xcit_large_24_p16_224.yaml index 54372a26..a479c215 100755 --- a/configs/xcit/xcit_large_24_p16_224.yaml +++ b/configs/xcit/xcit_large_24_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 16 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_large_24_p8_224.yaml b/configs/xcit/xcit_large_24_p8_224.yaml index 4e5df1c2..0c95f65e 100755 --- a/configs/xcit/xcit_large_24_p8_224.yaml +++ b/configs/xcit/xcit_large_24_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 16 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_large_24_p8_224_dist.yaml b/configs/xcit/xcit_large_24_p8_224_dist.yaml index 995a14bf..3afc2b3f 100755 --- a/configs/xcit/xcit_large_24_p8_224_dist.yaml +++ b/configs/xcit/xcit_large_24_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_medium_24_p16_224.yaml b/configs/xcit/xcit_medium_24_p16_224.yaml index b0469af1..86e4e68f 100755 --- a/configs/xcit/xcit_medium_24_p16_224.yaml +++ b/configs/xcit/xcit_medium_24_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + 
loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 16 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_medium_24_p8_224.yaml b/configs/xcit/xcit_medium_24_p8_224.yaml index 1b886155..a908a65f 100755 --- a/configs/xcit/xcit_medium_24_p8_224.yaml +++ b/configs/xcit/xcit_medium_24_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 16 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_medium_24_p8_224_dist.yaml b/configs/xcit/xcit_medium_24_p8_224_dist.yaml index 8cd14a50..ee815158 100755 --- a/configs/xcit/xcit_medium_24_p8_224_dist.yaml +++ b/configs/xcit/xcit_medium_24_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_nano_12_p16_224.yaml b/configs/xcit/xcit_nano_12_p16_224.yaml index 77728d19..b33f9909 100755 --- a/configs/xcit/xcit_nano_12_p16_224.yaml +++ b/configs/xcit/xcit_nano_12_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_nano_12_p8_224.yaml b/configs/xcit/xcit_nano_12_p8_224.yaml index 32436ab6..561fe541 100755 --- a/configs/xcit/xcit_nano_12_p8_224.yaml +++ b/configs/xcit/xcit_nano_12_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_nano_12_p8_224_dist.yaml b/configs/xcit/xcit_nano_12_p8_224_dist.yaml index 3393e0d2..e1702f9a 100755 --- a/configs/xcit/xcit_nano_12_p8_224_dist.yaml +++ b/configs/xcit/xcit_nano_12_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -39,7 +40,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -49,7 +50,9 @@ model: dataloader: 
train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -84,7 +87,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_small_12_p16_224.yaml b/configs/xcit/xcit_small_12_p16_224.yaml index 41cfde05..f2593217 100755 --- a/configs/xcit/xcit_small_12_p16_224.yaml +++ b/configs/xcit/xcit_small_12_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_small_12_p8_224.yaml b/configs/xcit/xcit_small_12_p8_224.yaml index fb9761c7..6c95d5dd 100755 --- a/configs/xcit/xcit_small_12_p8_224.yaml +++ b/configs/xcit/xcit_small_12_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_small_12_p8_224_dist.yaml b/configs/xcit/xcit_small_12_p8_224_dist.yaml index f792496f..538b0e3f 100755 --- a/configs/xcit/xcit_small_12_p8_224_dist.yaml +++ b/configs/xcit/xcit_small_12_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_small_24_p16_224.yaml b/configs/xcit/xcit_small_24_p16_224.yaml index cdc9c495..8e379e26 100755 --- a/configs/xcit/xcit_small_24_p16_224.yaml +++ b/configs/xcit/xcit_small_24_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_small_24_p8_224.yaml b/configs/xcit/xcit_small_24_p8_224.yaml index eca56782..9ba9b579 100755 --- a/configs/xcit/xcit_small_24_p8_224.yaml +++ b/configs/xcit/xcit_small_24_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 
'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_small_24_p8_224_dist.yaml b/configs/xcit/xcit_small_24_p8_224_dist.yaml index f792496f..538b0e3f 100755 --- a/configs/xcit/xcit_small_24_p8_224_dist.yaml +++ b/configs/xcit/xcit_small_24_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_tiny_12_p16_224.yaml b/configs/xcit/xcit_tiny_12_p16_224.yaml index f2f96680..c834b468 100755 --- a/configs/xcit/xcit_tiny_12_p16_224.yaml +++ b/configs/xcit/xcit_tiny_12_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_tiny_12_p8_224.yaml b/configs/xcit/xcit_tiny_12_p8_224.yaml index 056b96a5..3de640a2 100755 --- a/configs/xcit/xcit_tiny_12_p8_224.yaml +++ b/configs/xcit/xcit_tiny_12_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_tiny_12_p8_224_dist.yaml b/configs/xcit/xcit_tiny_12_p8_224_dist.yaml index 28706276..ec62efba 100755 --- a/configs/xcit/xcit_tiny_12_p8_224_dist.yaml +++ b/configs/xcit/xcit_tiny_12_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -39,7 +40,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -49,7 +50,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -84,7 +87,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_tiny_24_p16_224.yaml b/configs/xcit/xcit_tiny_24_p16_224.yaml index 54da3860..de6efbb9 100755 --- a/configs/xcit/xcit_tiny_24_p16_224.yaml +++ b/configs/xcit/xcit_tiny_24_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True @@ -55,7 +58,9 @@ 
dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_tiny_24_p8_224.yaml b/configs/xcit/xcit_tiny_24_p8_224.yaml index bb9c3fcc..ee3572fd 100755 --- a/configs/xcit/xcit_tiny_24_p8_224.yaml +++ b/configs/xcit/xcit_tiny_24_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_tiny_24_p8_224_dist.yaml b/configs/xcit/xcit_tiny_24_p8_224_dist.yaml index 1fe78db0..9f961164 100755 --- a/configs/xcit/xcit_tiny_24_p8_224_dist.yaml +++ b/configs/xcit/xcit_tiny_24_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/passl/datasets/__init__.py b/passl/datasets/__init__.py index 41ecaaed..1a9a9dbc 100644 --- a/passl/datasets/__init__.py +++ b/passl/datasets/__init__.py @@ -14,6 +14,7 @@ from .imagenet import ImageNet from .imagenet import ImageNetCoord +from .beitdataset import BEiT_ImageNet from .cifar import CIFAR10, CIFAR100 from .textimagedataset import TextImageDataset diff --git a/passl/datasets/beitdataset.py b/passl/datasets/beitdataset.py new file mode 100644 index 00000000..269596e8 --- /dev/null +++ b/passl/datasets/beitdataset.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
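+# BEiT_ImageNet: dataset for BEiT masked image modeling pre-training. Each sample yields +# two augmented views of one image (a patch-encoder view and a d-VAE visual-token view) +# together with a random block-wise mask produced by MaskingGenerator.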
+ +import paddle +from .folder import DatasetFolder + +from .preprocess import build_transforms, MaskingGenerator +from .builder import DATASETS +from ..utils.misc import accuracy + + +@DATASETS.register() +class BEiT_ImageNet(DatasetFolder): + cls_filter = None + + def __init__(self, + dataroot, + common_transforms=None, + patch_transforms=None, + visual_token_transforms=None, + masking_generator=None): + super(BEiT_ImageNet, self).__init__(dataroot, + cls_filter=self.cls_filter) + + self.common_transform = build_transforms(common_transforms) + self.patch_transform = build_transforms(patch_transforms) + self.visual_token_transform = build_transforms(visual_token_transforms) + self.masked_position_generator = MaskingGenerator(**masking_generator) + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (patches, visual_tokens, masked_positions) - the patch-transformed view, the visual-token view and the block-wise mask for the sampled image. + """ + path, target = self.samples[index] + # Only used for debugging the data augmentation module. + #path = 'data/ILSVRC2012/train/n13040303/n13040303_1206.jpeg' + #target = 14 + sample = self.loader(path) + for_patches, for_visual_tokens = self.common_transform(sample) + return \ + self.patch_transform(for_patches), \ + self.visual_token_transform(for_visual_tokens), \ + self.masked_position_generator() + + def evaluate(self, preds, labels, topk=(1, 5)): + + eval_res = {} + eval_res['acc1'], eval_res['acc5'] = accuracy(preds, labels, topk) + + return eval_res diff --git a/passl/datasets/builder.py b/passl/datasets/builder.py index 9c4ec62a..6d6e0004 100644 --- a/passl/datasets/builder.py +++ b/passl/datasets/builder.py @@ -24,11 +24,11 @@ DATASETS = Registry("DATASET") + class DistributedRepeatedAugSampler(DistributedBatchSampler): """ based on https://github.com/facebookresearch/deit/blob/main/samplers.py """ - def __init__(self, dataset, batch_size, @@ -36,10 +36,10 @@ def __init__(self, rank=None, shuffle=False, drop_last=False): - super(DistributedRepeatedAugSampler, self).__init__( - dataset, batch_size, num_replicas, rank, shuffle, drop_last) - self.num_samples = int( - math.ceil(len(self.dataset) * 3.0 / self.nranks)) + super(DistributedRepeatedAugSampler, + self).__init__(dataset, batch_size, num_replicas, rank, shuffle, + drop_last) + self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.nranks)) self.total_size = self.num_samples * self.nranks self.num_selected_samples = int( math.floor(len(self.dataset) // 256 * 256 / self.nranks)) @@ -79,8 +79,9 @@ def build_dataset(cfg): return build_from_config(cfg, DATASETS) -def build_dataloader(cfg): +def build_dataloader(cfg, device): cfg_ = copy.deepcopy(cfg) + loader_cfg = cfg_.pop('loader') dataset_cfg = cfg_.pop('dataset') sampler_cfg = cfg_.pop('sampler') @@ -90,10 +91,13 @@ def build_dataloader(cfg): dataset = build_dataset(dataset_cfg) sampler_name = sampler_cfg.pop('name', 'DistributedBatchSampler') - + sampler = eval("{}".format(sampler_name))(dataset, **sampler_cfg) - dataloader = paddle.io.DataLoader(dataset, batch_sampler=sampler, **cfg_) + dataloader = paddle.io.DataLoader(dataset, + batch_sampler=sampler, + places=device, + **loader_cfg) #setup mixup / cutmix mixup_fn = None diff --git a/passl/datasets/preprocess/__init__.py b/passl/datasets/preprocess/__init__.py index 693a1289..b1280179 100644 --- a/passl/datasets/preprocess/__init__.py +++ b/passl/datasets/preprocess/__init__.py @@ -13,4 +13,5 @@ # limitations under the License.
from .builder import build_transform, build_transforms +from .masking_generator import MaskingGenerator from .transforms import RandomApply, RandomGrayscale, GaussianBlur, Solarization diff --git a/passl/datasets/preprocess/masking_generator.py b/passl/datasets/preprocess/masking_generator.py new file mode 100644 index 00000000..8e3fbc90 --- /dev/null +++ b/passl/datasets/preprocess/masking_generator.py @@ -0,0 +1,107 @@ +# --------------------------------------------------------' +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -------------------------------------------------------- +# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) +# Github source: https://github.com/microsoft/unilm/tree/master/beit +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] + +import random +import math +import numpy as np + + +class MaskingGenerator(object): + def __init__(self, + input_size, + num_masking_patches, + min_num_patches=4, + max_num_patches=None, + min_aspect=0.3, + max_aspect=None): + if not isinstance(input_size, tuple): + input_size = (input_size, ) * 2 + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.num_masking_patches = num_masking_patches + + self.min_num_patches = min_num_patches + self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches + + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + + def get_shape(self): + return self.height, self.width + + def _mask(self, mask, max_mask_patches): + delta = 0 + for attempt in range(10): + target_area = random.uniform(self.min_num_patches, max_mask_patches) + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < self.width and h < self.height: + top = random.randint(0, self.height - h) + left = random.randint(0, self.width - w) + + num_masked = mask[top:top + h, left:left + w].sum() + # Overlap + if 0 < h * w - num_masked <= max_mask_patches: + for i in range(top, top + h): + for j in range(left, left + w): + if mask[i, j] == 0: + mask[i, j] = 1 + delta += 1 + + if delta > 0: + break + return delta + + def __call__(self): + mask = np.zeros(shape=self.get_shape(), dtype=np.int) + mask_count = 0 + while mask_count < self.num_masking_patches: + max_mask_patches = self.num_masking_patches - mask_count + max_mask_patches = min(max_mask_patches, self.max_num_patches) + + delta = self._mask(mask, max_mask_patches) + if delta == 0: + break + else: + mask_count += delta + + return mask + + +class RandomMaskingGenerator(object): + def __init__(self, input_size, mask_ratio): + if not isinstance(input_size, tuple): + input_size = (input_size, ) * 2 + + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.num_mask = 
int(mask_ratio * self.num_patches) + + def __call__(self): + mask = np.hstack([ + np.zeros(self.num_patches - self.num_mask), + np.ones(self.num_mask), + ]) + np.random.shuffle(mask) + return mask # [196] diff --git a/passl/datasets/preprocess/transforms.py b/passl/datasets/preprocess/transforms.py index 6ba3ed0a..a717e32a 100644 --- a/passl/datasets/preprocess/transforms.py +++ b/passl/datasets/preprocess/transforms.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import cv2 import math import random -from PIL import ImageFilter, Image, ImageOps -import cv2 +import warnings import numpy as np from functools import partial +from PIL import ImageFilter, Image, ImageOps import paddle import paddle.vision.transforms as PT @@ -26,6 +27,7 @@ from .mixup import Mixup from .builder import TRANSFORMS, build_transform from .random_erasing import RandomErasing +from .masking_generator import MaskingGenerator, RandomMaskingGenerator from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT from .auto_augment import rand_augment_transform, augment_and_mix_transform, auto_augment_transform from .cv2_trans import ByolRandomHorizontalFlip, ByolColorJitter, ByolRandomGrayscale, ByolNormalize, \ @@ -41,6 +43,7 @@ TRANSFORMS.register(PT.CenterCrop) TRANSFORMS.register(PT.ToTensor) +# BYOL Augmentation TRANSFORMS.register(ByolRandomHorizontalFlip) TRANSFORMS.register(ByolColorJitter) TRANSFORMS.register(ByolRandomGrayscale) @@ -53,9 +56,16 @@ TRANSFORMS.register(RandomErasing) TRANSFORMS.register(Mixup) +# PixPro TRANSFORMS.register(RandomResizedCropCoord) TRANSFORMS.register(RandomHorizontalFlipCoord) +# BEiT +TRANSFORMS.register(MaskingGenerator) + +_RANDOM_INTERPOLATION = ('bilinear', 'bicubic') + + @TRANSFORMS.register() class Clip(): def __init__(self, min_val=0.0, max_val=1.0): @@ -252,7 +262,7 @@ def _apply_image(self, img): if not is_pil: img = np.asarray(img) return img - + class UnifiedResize(object): """ @@ -292,20 +302,17 @@ def _pil_resize(src, size, resample): interpolation = _pil_interp_from_str[interpolation.lower()] self.resize_func = partial(_pil_resize, resample=interpolation) else: - logger.warning( - f"The backend of Resize only support \"cv2\" or \"PIL\". \"f{backend}\" is unavailable. Use \"cv2\" instead." - ) self.resize_func = cv2.resize def __call__(self, src, size): return self.resize_func(src, size) - + + @TRANSFORMS.register() class RandCropImage(object): """ random crop image https://github.com/PaddlePaddle/PaddleClas/blob/release/2.3/ppcls/data/preprocess/ops/operators.py """ - def __init__(self, size, scale=None, @@ -320,8 +327,8 @@ def __init__(self, self.scale = [0.08, 1.0] if scale is None else scale self.ratio = [3. / 4., 4. / 3.] 
if ratio is None else ratio - self._resize_func = UnifiedResize( - interpolation=interpolation, backend=backend) + self._resize_func = UnifiedResize(interpolation=interpolation, + backend=backend) def __call__(self, img): size = self.size @@ -350,13 +357,13 @@ def __call__(self, img): img = img[j:j + h, i:i + w, :] return self._resize_func(img, size) - + + @TRANSFORMS.register() class ResizeImage(object): """ resize image https://github.com/PaddlePaddle/PaddleClas/blob/release/2.3/ppcls/data/preprocess/ops/operators.py """ - def __init__(self, size=None, resize_short=None, @@ -371,11 +378,11 @@ def __init__(self, self.w = size if type(size) is int else size[0] self.h = size if type(size) is int else size[1] else: - raise OperatorParamError("invalid params for ReisizeImage for '\ + raise ValueError("invalid params for ReisizeImage for '\ 'both 'size' and 'resize_short' are None") - self._resize_func = UnifiedResize( - interpolation=interpolation, backend=backend) + self._resize_func = UnifiedResize(interpolation=interpolation, + backend=backend) def __call__(self, img): img_h, img_w = img.shape[:2] @@ -398,7 +405,7 @@ class NormalizeImage(PT.Normalize): scale (float): Normalize input value to [0, 1]. mean (int|float|list|tuple): Sequence of means for each channel. std (int|float|list|tuple): Sequence of standard deviations for each channel. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. to_rgb (bool, optional): Whether to convert to rgb. Default: False. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. @@ -411,7 +418,7 @@ class NormalizeImage(PT.Normalize): A callable object of Normalize. Examples: - + .. code-block:: python import numpy as np @@ -419,7 +426,7 @@ class NormalizeImage(PT.Normalize): from paddle.vision.transforms import Normalize normalize = NormalizeImage(scale=1./255., - mean=[127.5, 127.5, 127.5], + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], data_format='HWC') @@ -428,9 +435,8 @@ class NormalizeImage(PT.Normalize): fake_img = normalize(fake_img) print(fake_img.shape) print(fake_img.max, fake_img.max) - - """ + """ def __init__(self, scale=None, mean=0.0, @@ -441,11 +447,154 @@ def __init__(self, keys=None): super(NormalizeImage, self).__init__(mean=mean, std=std, keys=keys) self.scale = eval(scale) - self.dtype = dtype + self.dtype = dtype def _apply_image(self, img): if self.scale is not None: img = img * self.scale img = F.normalize(img, self.mean, self.std, self.data_format, - self.to_rgb) + self.to_rgb) return img.astype(self.dtype) + + +@TRANSFORMS.register() +class RandomResizedCropAndInterpolationWithTwoPic(PT.RandomResizedCrop): + """Crop the given PIL Image to random size and aspect ratio with random interpolation. + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. + This crop is finally resized to given size. + This is popularly used to train the Inception networks. + + Args: + size: expected output size of each edge + second size: second expected output size of each edge + scale: range of size of the origin size cropped + ratio: range of aspect ratio of the origin aspect ratio cropped + interpolation: Default: PIL.Image.BILINEAR + second_interpolation: Default: PIL.Image.LANCZOS + """ + def __init__(self, + size, + second_size=None, + scale=(0.08, 1.0), + ratio=(3. / 4., 4. 
/ 3.), + interpolation='bilinear', + second_interpolation='lanczos', + keys=None): + super(RandomResizedCropAndInterpolationWithTwoPic, self).__init__(keys) + if isinstance(size, list): + self.size = size + else: + self.size = [size, size] + if second_size is not None: + if isinstance(second_size, list): + self.second_size = second_size + else: + self.second_size = [second_size, second_size] + else: + self.second_size = None + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + warnings.warn("range should be of kind (min, max)") + + if interpolation == 'random': + self.interpolation = _RANDOM_INTERPOLATION + else: + self.interpolation = interpolation + self.second_interpolation = second_interpolation + self.scale = scale + self.ratio = ratio + + def get_params(self, img, scale, ratio): + """Get parameters for ``crop`` for a random sized crop. + + Args: + img (PIL Image): Image to be cropped. + scale (tuple): range of size of the origin size cropped + ratio (tuple): range of aspect ratio of the origin aspect ratio cropped + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. + """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def _apply_image(self, img): + """ + Args: + img (PIL Image): Image to be cropped and resized. + + Returns: + PIL Image: Randomly cropped and resized image. 
+ """ + i, j, h, w = self.get_params(img, self.scale, self.ratio) + if isinstance(self.interpolation, (tuple, list)): + interpolation = random.choice(self.interpolation) + else: + interpolation = self.interpolation + cropped_img = F.crop(img, i, j, h, w) + if self.second_size is None: + return F.resize(cropped_img, self.size, interpolation) + else: + return F.resize(img, self.size, interpolation), \ + F.resize(img, self.second_size, self.second_interpolation) + + +@TRANSFORMS.register() +class VisualTokenMap(object): + def __init__(self, mode='map_pixel', scale=None): + self.mode = mode + self.scale = scale + self.logit_laplace_eps = 0.1 + + def map_pixels(self, x): + if self.scale is not None: + try: + x = paddle.to_tensor(x).astype('float32') / self.scale + except: + import pdb + + return (1 - 2 * self.logit_laplace_eps) * x + self.logit_laplace_eps + + def unmap_pixels(self, x): + if len(x.shape) != 4: + raise ValueError('expected input to be 4d') + if x.dtype != paddle.float32: + raise ValueError('expected input to have type float') + + return paddle.clamp( + (x - self.logit_laplace_eps) / (1 - 2 * self.logit_laplace_eps), 0, + 1) + + def __call__(self, x): + if self.mode == "map_pixels": + return self.map_pixels(x) + elif self.mode == "unmap_pixels": + return self.unmap_pixels(x) diff --git a/passl/engine/trainer.py b/passl/engine/trainer.py index d0373035..bf3ab04a 100644 --- a/passl/engine/trainer.py +++ b/passl/engine/trainer.py @@ -106,6 +106,12 @@ def __init__(self, cfg): np.random.seed(seed) random.seed(seed) + # set device + assert cfg['device'] in ['cpu', 'gpu', 'xpu', 'npu'] + self.device = paddle.set_device(cfg['device']) + self.logger.info('train with paddle {} on {} device'.format( + paddle.__version__, self.device)) + self.start_epoch = 0 self.current_epoch = 0 self.current_iter = 0 @@ -133,7 +139,7 @@ def __init__(self, cfg): # build train dataloader self.train_dataloader, self.mixup_fn = build_dataloader( - cfg.dataloader.train) + cfg.dataloader.train, self.device) self.iters_per_epoch = len(self.train_dataloader) # use byol iters @@ -168,7 +174,11 @@ def __init__(self, cfg): mp_rank = hcg.get_model_parallel_rank() pp_rank = hcg.get_stage_id() dp_rank = hcg.get_data_parallel_rank() - set_hyrbid_parallel_seed(seed, 0, mp_rank, pp_rank) + set_hyrbid_parallel_seed(seed, + 0, + mp_rank, + pp_rank, + device=self.device) # amp training self.use_amp = cfg.get('use_amp', @@ -324,7 +334,7 @@ def train(self): def val(self, **kargs): if not hasattr(self, 'val_dataloader'): self.val_dataloader, mixup_fn = build_dataloader( - self.cfg.dataloader.val) + self.cfg.dataloader.val, self.device) self.logger.info( 'start evaluate on epoch {} ..'.format(self.current_epoch + 1)) diff --git a/passl/modeling/architectures/BEiTWrapper.py b/passl/modeling/architectures/BEiTWrapper.py new file mode 100644 index 00000000..0a886c58 --- /dev/null +++ b/passl/modeling/architectures/BEiTWrapper.py @@ -0,0 +1,187 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist + +from .builder import MODELS +from .builder import create_d_vae +from ..heads import build_head +from ..backbones import build_backbone + + +@MODELS.register() +class BEiTWrapper(nn.Layer): + def __init__(self, architecture=None, head=None): + """A wrapper for a BEiT supervised model. + + Args: + architecture (dict): A dictionary containing the BEiT instantiation parameters. + """ + super().__init__() + + self.backbone = build_backbone(architecture) + self.automatic_optimization = False + self.head = build_head(head) + + def backbone_forward(self, x): + x = self.backbone(x) + return x + + def train_iter(self, *inputs, **kwargs): + img, label = inputs + cls_token = self.backbone_forward(img) + outs = self.head(cls_token) + loss_inputs = (outs, label) + outputs = self.head.loss(*loss_inputs) + return outputs + + def forward(self, *inputs, mode='train', **kwargs): + if mode == 'train': + return self.train_iter(*inputs, **kwargs) + elif mode == 'test': + return self.test_iter(*inputs, **kwargs) + elif mode == 'extract': + return self.backbone(*inputs) + else: + raise Exception("No such mode: {}".format(mode)) + + def validation_step(self, val_batch, idx): + image, text = val_batch + image_logits, text_logits = self.forward(image, text) + ground_truth = paddle.arange(len(image_logits)) + loss = (self.image_loss(image_logits, ground_truth) + + self.text_loss(text_logits, ground_truth)).div(2) + self.log('val_loss', loss) + + +@MODELS.register() +class BEiTPTWrapper(nn.Layer): + def __init__(self, architecture=None, head=None, d_vae=None): + """A wrapper for a BEiT Pretrain. + + Args: + architecture (dict): A dictionary containing the BEiT instantiation parameters. + """ + super().__init__() + + self.backbone = build_backbone(architecture) + self.automatic_optimization = False + self.head = build_head(head) + with paddle.no_grad(): + self.d_vae = create_d_vae(d_vae) + + def get_codebook_indices(self, images): + with paddle.no_grad(): + logits = self.d_vae.encoder(images) + codebook_indices = logits.argmax(axis=1) + return codebook_indices + + def backbone_forward(self, + x, + bool_masked_pos=None, + return_all_tokens=False): + x = self.backbone(x, + bool_masked_pos=bool_masked_pos, + return_all_tokens=return_all_tokens) + return x + + def train_iter(self, *inputs, **kwargs): + samples, images, bool_masked_pos = inputs + + with paddle.no_grad(): + input_ids = self.get_codebook_indices(images).flatten(1) + bool_masked_pos = bool_masked_pos.flatten(1).astype( + 'bool') # to bool. 
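+            # the mask covers the ViT patch grid (14x14 for 224px inputs with patch size 16),
+            # which matches the token grid the d-VAE produces for its lower-resolution view
+            # (the encoder downsamples by 8, e.g. 112 / 8 = 14), so the same flattened mask can
+            # select token ids below and mark patch embeddings inside the backbone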
+ labels = input_ids[bool_masked_pos] + + outputs = self.backbone_forward(samples, + bool_masked_pos=bool_masked_pos, + return_all_tokens=False) + loss = self.head(outputs, labels) + return loss + + def test_iter(self, *inputs, **kwargs): + with paddle.no_grad(): + img, label = inputs + x = self.backbone_forward(img) + outs = self.head(x) + + return outs + + def forward(self, *inputs, mode='train', **kwargs): + if mode == 'train': + return self.train_iter(*inputs, **kwargs) + elif mode == 'test': + return self.test_iter(*inputs, **kwargs) + elif mode == 'extract': + return self.backbone(*inputs) + else: + raise Exception("No such mode: {}".format(mode)) + + def validation_step(self, val_batch, idx): + image, text = val_batch + image_logits, text_logits = self.forward(image, text) + ground_truth = paddle.arange(len(image_logits)) + loss = (self.image_loss(image_logits, ground_truth) + + self.text_loss(text_logits, ground_truth)).div(2) + self.log('val_loss', loss) + + +@MODELS.register() +class BEiTFTWrapper(nn.Layer): + def __init__(self, architecture=None, head=None): + """A wrapper for a BEiT Finetune. + + Args: + architecture (dict): A dictionary containing the BEiT instantiation parameters. + """ + super().__init__() + self.backbone = build_backbone(architecture) + self.head = build_head(head) + + def backbone_forward(self, x): + x = self.backbone(x) + return x + + def train_iter(self, *inputs, **kwargs): + img, label = inputs + mixup_fn = kwargs['mixup_fn'] + if mixup_fn is not None: + img, label = mixup_fn(img, label) + + x = self.backbone_forward(img) + outputs = self.head(x) + outputs = self.head.loss(outputs, label) + return outputs + + def test_iter(self, *inputs, **kwargs): + with paddle.no_grad(): + img, _ = inputs + x = self.backbone_forward(img) + outs = self.head(x) + return outs # self.head.loss(outs, label, soft=False) + + def forward(self, *inputs, mode='train', **kwargs): + if mode == 'train': + return self.train_iter(*inputs, **kwargs) + elif mode == 'test': + return self.test_iter(*inputs, **kwargs) + elif mode == 'extract': + return self.backbone(*inputs) + else: + raise Exception("No such mode: {}".format(mode)) diff --git a/passl/modeling/architectures/BeitWrapper.py b/passl/modeling/architectures/BeitWrapper.py deleted file mode 100644 index 7c968ff9..00000000 --- a/passl/modeling/architectures/BeitWrapper.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import paddle.distributed as dist - -from ..backbones import build_backbone -from ..heads import build_head -from .builder import MODELS - - -@MODELS.register() -class BeitWrapper(nn.Layer): - def __init__(self, architecture=None, head=None): - """A wrapper for a ViT model as specified in the paper. - - Args: - architecture (dict): A dictionary containing the ViT instantiation parameters. 
- """ - super().__init__() - - self.backbone = build_backbone(architecture) - self.automatic_optimization = False - self.head = build_head(head) - - def backbone_forward(self, x): - x = self.backbone(x) - return x - - def train_iter(self, *inputs, **kwargs): - img, label = inputs - cls_token = self.backbone_forward(img) - outs = self.head(cls_token) - loss_inputs = (outs, label) - outputs = self.head.loss(*loss_inputs) - return outputs - - def forward(self, *inputs, mode='train', **kwargs): - if mode == 'train': - return self.train_iter(*inputs, **kwargs) - elif mode == 'test': - return self.test_iter(*inputs, **kwargs) - elif mode == 'extract': - return self.backbone(*inputs) - else: - raise Exception("No such mode: {}".format(mode)) - - def validation_step(self, val_batch, idx): - image, text = val_batch - image_logits, text_logits = self.forward(image, text) - ground_truth = paddle.arange(len(image_logits)) - loss = (self.image_loss(image_logits, ground_truth) + - self.text_loss(text_logits, ground_truth)).div(2) - self.log('val_loss', loss) diff --git a/passl/modeling/architectures/__init__.py b/passl/modeling/architectures/__init__.py index 4eb0338d..9d2356ff 100644 --- a/passl/modeling/architectures/__init__.py +++ b/passl/modeling/architectures/__init__.py @@ -12,26 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .moco import MoCo +from .builder import build_model +from .byol_clas import ByolClassification from .clas import Classification -from .BYOL import BYOL -from .MoCoBYOL import MoCoBYOL -from .CLIPWrapper import CLIPWrapper +from .moco import MoCo from .simclr import SimCLR -from .byol_clas import ByolClassification -from .ViTWrapper import ViTWrapper -from .SwinWrapper import SwinWrapper -from .builder import build_model - -from .BeitWrapper import BeitWrapper +from .pixpro import PixPro -from .T2TViTWrapper import T2TViTWrapper +from .BEiTWrapper import BEiTWrapper, BEiTPTWrapper, BEiTFTWrapper +from .BYOL import BYOL from .CaiTWrapper import CaiTWrapper -from .MlpMixerWrapper import MlpMixerWrapper +from .CLIPWrapper import CLIPWrapper from .CvTWrapper import CvTWrapper - from .DeiTWrapper import DeiTWrapper -from .pixpro import PixPro - -from .MAE import MAE_PRETRAIN, MAE_FINETUNE from .DistillationWrapper import DistillationWrapper +from .MAE import MAE_PRETRAIN, MAE_FINETUNE +from .MoCoBYOL import MoCoBYOL +from .MlpMixerWrapper import MlpMixerWrapper +from .SwinWrapper import SwinWrapper +from .T2TViTWrapper import T2TViTWrapper +from .ViTWrapper import ViTWrapper diff --git a/passl/modeling/architectures/builder.py b/passl/modeling/architectures/builder.py index 2dfc1ae4..12e477a3 100644 --- a/passl/modeling/architectures/builder.py +++ b/passl/modeling/architectures/builder.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os +import paddle +from copy import deepcopy +from ..backbones.discrete_vae import Dalle_VAE, DiscreteVAE, load_model, Encoder, Decoder from ...utils.registry import Registry, build_from_config MODELS = Registry("MODEL") @@ -19,3 +23,49 @@ def build_model(cfg): return build_from_config(cfg, MODELS) + + +def create_d_vae(cfg): + cfg = deepcopy(cfg) + name = cfg.pop('name') + if name == "dall-e": + return get_dalle_vae(cfg) + elif name == "customized": + return get_d_vae(cfg) + else: + raise NotImplementedError() + + +def get_dalle_vae(cfg): + cfg = deepcopy(cfg) + image_size = cfg.pop('image_size') + weight_path = cfg.pop('weight_path') + with paddle.no_grad(): + vae = Dalle_VAE(image_size) + vae.encoder = load_model('encoder', model_dir=weight_path) + vae.decoder = load_model('decoder', model_dir=weight_path) + return vae + + +def get_d_vae(cfg): + cfg = deepcopy(cfg) + image_size = cfg.pop('image_size') + weight_path = cfg.pop('weight_path') + NUM_TOKENS = 8192 + NUM_LAYERS = 3 + EMB_DIM = 512 + HID_DIM = 256 + + state_dict = paddle.load(os.path.join(weight_path, "pytorch_model.bin"), + map_location="cpu")["weights"] + + model = DiscreteVAE( + image_size=image_size, + num_layers=NUM_LAYERS, + num_tokens=NUM_TOKENS, + codebook_dim=EMB_DIM, + hidden_dim=HID_DIM, + ) + + model.load_state_dict(state_dict) + return model diff --git a/passl/modeling/backbones/__init__.py b/passl/modeling/backbones/__init__.py index 47227db2..c7bf315a 100644 --- a/passl/modeling/backbones/__init__.py +++ b/passl/modeling/backbones/__init__.py @@ -1,18 +1,21 @@ -from .resnet import ResNet -from .clip import CLIP -from .builder import build_backbone -from .resnetcifar import ResNet as ResNetCifar -from .resnetsimclr import ResNetsimclr -from .vision_transformer import VisionTransformer -from .vit import GoogleVisionTransformer -from .swin_transformer import SwinTransformer from .beit import Beit -from .t2t_vit import T2TViT +from .beit_ft import VisionTransformerForFinetune +from .beit_pt import VisionTransformerForMaskedImageModeling +from .builder import build_backbone from .cait import Cait -from .mlp_mixer import MlpMixer +from .clip import CLIP +from .convnext import ConvNeXt from .cvt import CvT from .deit import DeiTVisionTransformer, DistilledVisionTransformer -from .convnext import ConvNeXt +from .discrete_vae import Dalle_VAE, DiscreteVAE from .mae import MAE, MAE_ViT -from .xcit import XCiT +from .mlp_mixer import MlpMixer from .regnet import RegNet +from .resnet import ResNet +from .resnetcifar import ResNet as ResNetCifar +from .resnetsimclr import ResNetsimclr +from .swin_transformer import SwinTransformer +from .t2t_vit import T2TViT +from .vision_transformer import VisionTransformer +from .vit import GoogleVisionTransformer +from .xcit import XCiT diff --git a/passl/modeling/backbones/beit_ft.py b/passl/modeling/backbones/beit_ft.py new file mode 100644 index 00000000..e07cb4c2 --- /dev/null +++ b/passl/modeling/backbones/beit_ft.py @@ -0,0 +1,533 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
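Assuming converted DALL-E encoder/decoder weights are available locally (the directory below is a placeholder, not a path shipped with the repo), the tokenizer built by `create_d_vae` above is used roughly like this; the 8x downsampling of the encoder is what makes the lower-resolution view line up with the backbone's patch grid:

import paddle
from passl.modeling.architectures.builder import create_d_vae

d_vae = create_d_vae({'name': 'dall-e',
                      'image_size': 112,
                      'weight_path': 'path/to/dvae_weights'})  # placeholder path
images = paddle.rand([2, 3, 112, 112])            # the d-VAE view produced by the dataloader
with paddle.no_grad():
    logits = d_vae.encoder(images)                # [2, 8192, 14, 14] codebook logits
    token_ids = logits.argmax(axis=1).flatten(1)  # [2, 196] visual-token targets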
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/microsoft/unilm/tree/master/beit + +import copy +import math +from functools import partial + +import paddle +import torch +from timm.models.layers import trunc_normal_ as trunc_normal__ +import paddle.nn as nn +import paddle.nn.functional as F + +from .builder import BACKBONES + +__all__ = ["VisionTransformerForMaskedImageModeling"] + +trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) +zeros_ = nn.initializer.Constant(value=0.0) +ones_ = nn.initializer.Constant(value=1.0) + + +def drop_path(x, drop_prob=0.0, training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Layer): + """MLP module + MLP using nn.Linear and activation is GELU, dropout is applied. + Ops: fc1 -> act -> dropout -> fc2 -> dropout + """ + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class PatchEmbed(nn.Layer): + """2D Image to Patch Embedding + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. + """ + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + ): + super().__init__() + img_size = (img_size, img_size) + patch_size = (patch_size, patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2D(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else Identity() + + def forward(self, x): + B, C, H, W = x.shape + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})" + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose((0, 2, 1)) # BCHW -> BNC + x = self.norm(x) + return x + + +class Identity(nn.Layer): + """Identity layer + The output of this layer is the input without any change. 
+ Use this layer to avoid if condition in some forward methods + """ + def __init__(self): + super().__init__() + + def forward(self, inputs): + return inputs + + +class Attention(nn.Layer): + """Attention Layer""" + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = head_dim**-0.5 if qk_scale is None else qk_scale + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias_attr=False) + if qkv_bias: + self.q_bias = paddle.create_parameter(shape=[all_head_dim], + dtype="float32", + default_initializer=zeros_) + + self.v_bias = paddle.create_parameter(shape=[all_head_dim], + dtype="float32", + default_initializer=zeros_) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - + 1) * (2 * window_size[1] - 1) + 3 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[self.num_relative_distance, num_heads], + dtype="float32", + default_initializer=zeros_, + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze( + axis=1) # 2, Wh*Ww, Wh*Ww #?? + relative_coords = relative_coords.transpose([1, 2, + 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = paddle.zeros( + [ + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1, + ], + dtype=relative_coords.dtype, + ) + # Wh*Ww, Wh*Ww + relative_position_index[1:, 1:] = relative_coords.sum(-1) + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = paddle.concat( + (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) + + qkv = F.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) + + qkv = qkv.reshape([B, N, 3, self.num_heads, + -1]).transpose([2, 0, 3, 1, 4]) + # make torchscript happy (cannot use tensor as tuple) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + + attn = q @ k.transpose([0, 1, 3, 2]) + + if self.relative_position_bias_table is not None: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + 
]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + attn = attn + relative_position_bias.unsqueeze(axis=0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, -1]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size, + attn_head_dim=attn_head_dim, + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if init_values: + self.gamma_1 = paddle.create_parameter( + shape=[dim], + dtype="float32", + default_initializer=nn.initializer.Constant(value=init_values), + ) + self.gamma_2 = paddle.create_parameter( + shape=[dim], + dtype="float32", + default_initializer=nn.initializer.Constant(value=init_values), + ) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias): + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class RelativePositionBias(nn.Layer): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - + 1) * (2 * window_size[1] - 1) + 3 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[self.num_relative_distance, num_heads], + dtype="float32", + default_initializer=zeros_, + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze(axis=1) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, + 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = paddle.zeros([ + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1 + ], + dtype=relative_coords.dtype) + relative_position_index[1:, + 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + 
relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ]) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + +@BACKBONES.register() +class VisionTransformerForFinetune(nn.Layer): + """ BEiT Finetune + This model is mainly used for pretraining ImageNet-22K + code base on https://github.com/microsoft/unilm/blob/master/beit/modeling_finetune.py + """ + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + init_values=None, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + use_mean_pooling=True, + init_scale=0.001): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + wa = torch.ones(size=(1, 1, embed_dim)) + trunc_normal__(wa, std=0.02) + wa = wa.cpu().numpy() + + self.cls_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype="float32", + #default_initializer=wa.cpu().numpy(), + ) + self.cls_token.set_value(wa) + #self.mask_token = paddle.create_parameter( + # shape=[1, 1, embed_dim], + # dtype="float32", + # default_initializer=trunc_normal_, + #) + if use_abs_pos_emb: + self.pos_embed = paddle.create_parameter( + shape=[1, num_patches + 1, embed_dim], + dtype="float32", + default_initializer=trunc_normal_, + ) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.grid_size, num_heads=num_heads) + else: + self.rel_pos_bias = None + + # stochastic depth decay rule + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth)] + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.grid_size + if use_rel_pos_bias else None) for i in range(depth) + ]) + self.norm = Identity() if use_mean_pooling else norm_layer(embed_dim) + self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None + #self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else Identity() + + #trunc_normal_(self.head.weight) + self.apply(self._init_weights) + self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + param.scale(1. 
/ math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, layer_id + 1) + rescale(layer.mlp.fc2.weight, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def get_num_layers(self): + return len(self.blocks) + + def forward_features(self, x): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand([batch_size, -1, -1]) + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias( + ) if self.rel_pos_bias is not None else None + for blk in self.blocks: + x = blk(x, rel_pos_bias=rel_pos_bias) + + x = self.norm(x) + if self.fc_norm is not None: + t = x[:, 1:, :] + return self.fc_norm(t.mean(1)) + else: + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + #x = self.head(x) + return x diff --git a/passl/modeling/backbones/beit_pt.py b/passl/modeling/backbones/beit_pt.py new file mode 100644 index 00000000..af08e9d2 --- /dev/null +++ b/passl/modeling/backbones/beit_pt.py @@ -0,0 +1,534 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/microsoft/unilm/tree/master/beit + +import copy +import math +from functools import partial + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .builder import BACKBONES + +__all__ = ["VisionTransformerForMaskedImageModeling"] + +trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) +zeros_ = nn.initializer.Constant(value=0.0) +ones_ = nn.initializer.Constant(value=1.0) + + +def drop_path(x, drop_prob=0.0, training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Layer): + """MLP module + MLP using nn.Linear and activation is GELU, dropout is applied. + Ops: fc1 -> act -> dropout -> fc2 -> dropout + """ + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class PatchEmbed(nn.Layer): + """2D Image to Patch Embedding + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. + """ + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + ): + super().__init__() + img_size = (img_size, img_size) + patch_size = (patch_size, patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2D(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else Identity() + + def forward(self, x): + B, C, H, W = x.shape + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})" + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose((0, 2, 1)) # BCHW -> BNC + x = self.norm(x) + return x + + +class Identity(nn.Layer): + """Identity layer + The output of this layer is the input without any change. 
+ Use this layer to avoid if condition in some forward methods + """ + def __init__(self): + super().__init__() + + def forward(self, inputs): + return inputs + + +class Attention(nn.Layer): + """Attention Layer""" + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = head_dim**-0.5 if qk_scale is None else qk_scale + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias_attr=False) + if qkv_bias: + self.q_bias = paddle.create_parameter(shape=[all_head_dim], + dtype="float32", + default_initializer=zeros_) + + self.v_bias = paddle.create_parameter(shape=[all_head_dim], + dtype="float32", + default_initializer=zeros_) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - + 1) * (2 * window_size[1] - 1) + 3 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[self.num_relative_distance, num_heads], + dtype="float32", + default_initializer=zeros_, + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze( + axis=1) # 2, Wh*Ww, Wh*Ww #?? + relative_coords = relative_coords.transpose([1, 2, + 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = paddle.zeros( + [ + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1, + ], + dtype=relative_coords.dtype, + ) + # Wh*Ww, Wh*Ww + relative_position_index[1:, 1:] = relative_coords.sum(-1) + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = paddle.concat( + (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) + + qkv = F.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) + + qkv = qkv.reshape([B, N, 3, self.num_heads, + -1]).transpose([2, 0, 3, 1, 4]) + # make torchscript happy (cannot use tensor as tuple) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + + attn = q @ k.transpose([0, 1, 3, 2]) + + if self.relative_position_bias_table is not None: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + 
]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + attn = attn + relative_position_bias.unsqueeze(axis=0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, -1]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size, + attn_head_dim=attn_head_dim, + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if init_values: + self.gamma_1 = paddle.create_parameter( + shape=[dim], + dtype="float32", + default_initializer=nn.initializer.Constant(value=init_values), + ) + self.gamma_2 = paddle.create_parameter( + shape=[dim], + dtype="float32", + default_initializer=nn.initializer.Constant(value=init_values), + ) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias): + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class RelativePositionBias(nn.Layer): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - + 1) * (2 * window_size[1] - 1) + 3 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[self.num_relative_distance, num_heads], + dtype="float32", + default_initializer=zeros_, + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze(axis=1) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, + 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = paddle.zeros([ + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1 + ], + dtype=relative_coords.dtype) + relative_position_index[1:, + 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + 
relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ]) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + +@BACKBONES.register() +class VisionTransformerForMaskedImageModeling(nn.Layer): + """ BEiT Pretrain + This model is mainly used for pretraining ImageNet-22K + code base on https://github.com/microsoft/unilm/blob/master/beit/modeling_pretrain.py + """ + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + vocab_size=8192, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + init_values=None, + attn_head_dim=None, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + init_std=0.02): + super(VisionTransformerForMaskedImageModeling, self).__init__() + # num_features for consistency with other models + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype="float32", + default_initializer=trunc_normal_, + ) + self.mask_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype="float32", + default_initializer=trunc_normal_, + ) + + if use_abs_pos_emb: + self.pos_embed = paddle.create_parameter( + shape=[1, num_patches + 1, embed_dim], + dtype="float32", + default_initializer=trunc_normal_, + ) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.grid_size, num_heads=num_heads) + else: + self.rel_pos_bias = None + + # stochastic depth decay rule + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth)] + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.grid_size + if use_rel_pos_bias else None, + attn_head_dim=attn_head_dim, + ) for i in range(depth) + ]) + self.norm = norm_layer(embed_dim) + + self.init_std = init_std + self.lm_head = nn.Linear(embed_dim, vocab_size) + trunc_normal_(self.lm_head.weight) + self.apply(self._init_weights) + self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + param.scale(1. 
/ math.sqrt(2.0 * layer_id))
+
+        for layer_id, layer in enumerate(self.blocks):
+            rescale(layer.attn.proj.weight, layer_id + 1)
+            rescale(layer.mlp.fc2.weight, layer_id + 1)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+        elif isinstance(m, nn.Conv2D):
+            trunc_normal_(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+
+    def forward_features(self, x, bool_masked_pos):
+        x = self.patch_embed(x)
+        batch_size, seq_len, _ = x.shape
+
+        cls_tokens = self.cls_token.expand([batch_size, -1, -1])
+        mask_token = self.mask_token.expand([batch_size, seq_len, -1])
+
+        # replace the embeddings of masked patches with the learnable mask token
+        w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype)
+        x = x * (1 - w) + mask_token * w
+
+        x = paddle.concat((cls_tokens, x), axis=1)
+        if self.pos_embed is not None:
+            x = x + self.pos_embed
+        x = self.pos_drop(x)
+
+        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+        for blk in self.blocks:
+            x = blk(x, rel_pos_bias=rel_pos_bias)
+
+        x = self.norm(x)
+
+        return x
+
+    def forward(self, x, bool_masked_pos, return_all_tokens=False):
+        x = self.forward_features(x, bool_masked_pos=bool_masked_pos)
+        x = x[:, 1:]
+        if return_all_tokens:
+            return self.lm_head(x)
+        else:
+            # return only the predictions for the masked patches
+            return self.lm_head(x[bool_masked_pos])
diff --git a/passl/modeling/backbones/discrete_vae.py b/passl/modeling/backbones/discrete_vae.py
new file mode 100644
index 00000000..257cd5d4
--- /dev/null
+++ b/passl/modeling/backbones/discrete_vae.py
@@ -0,0 +1,718 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
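The two backbones in this patch cooperate at pretraining time: the ViT above predicts a visual-token id for every masked patch, while the frozen dVAE defined in this file supplies the target ids. A minimal shape-check sketch, assuming both classes are importable from this package and using random weights and inputs in place of the real dVAE checkpoint and data pipeline:

    import paddle

    vit = VisionTransformerForMaskedImageModeling(
        img_size=224, patch_size=16, vocab_size=8192,
        use_abs_pos_emb=False, use_shared_rel_pos_bias=True, init_values=0.1)
    d_vae = Dalle_VAE(image_size=112)  # real runs load the pretrained dVAE weights

    images = paddle.randn([2, 3, 224, 224])       # first view, fed to the ViT
    token_images = paddle.rand([2, 3, 112, 112])  # second view, fed to the dVAE tokenizer
    bool_masked_pos = paddle.randint(0, 2, [2, 196]).astype('bool')  # one flag per 16x16 patch

    logits = vit(images, bool_masked_pos)                 # [num_masked, 8192]
    input_ids = d_vae.get_codebook_indices(token_images)  # [2, 14, 14] codebook ids
    labels = input_ids.flatten(1)[bool_masked_pos]        # ids of the masked patches
    loss = paddle.nn.functional.cross_entropy(logits, labels)  # the loss BEiTPTHead applies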
+# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) +# Github source: https://github.com/microsoft/unilm/tree/master/beit +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# By Hangbo Bao +# Based on OpenAI DALL-E and lucidrains' DALLE-pytorch code bases +# https://github.com/openai/DALL-E +# https://github.com/lucidrains/DALLE-pytorch +import os +#import wget +import paddle +import paddle.nn as nn + + +# +#logit_laplace_eps = 0.1 +# +# +#def map_pixels(x): +# return (1 - 2 * logit_laplace_eps) * x + logit_laplace_eps +# +# +#def unmap_pixels(x): +# return paddle.clip((x - logit_laplace_eps) / (1 - 2 * logit_laplace_eps), 0, 1) +# +# +# +#class Identity(nn.Layer): +# def __init__(self): +# super(Identity, self).__init__() +# +# def forward(self, inputs): +# return inputs +# +# +class EncoderBlock(nn.Layer): + def __init__(self, n_in, n_out, n_layers): + super(EncoderBlock, self).__init__() + n_hid = n_out // 4 + self.post_gain = 1 / (n_layers**2) + + self.id_path = nn.Conv2D(n_in, n_out, + 1) if n_in != n_out else Identity() + self.res_path = nn.Sequential( + ('relu_1', nn.ReLU()), + ('conv_1', nn.Conv2D(n_in, n_hid, 3, padding=1)), + ('relu_2', nn.ReLU()), + ('conv_2', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_3', nn.ReLU()), + ('conv_3', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_4', nn.ReLU()), ('conv_4', nn.Conv2D(n_hid, n_out, 1))) + + def forward(self, x): + return self.id_path(x) + self.post_gain * self.res_path(x) + + +class Encoder(nn.Layer): + def __init__(self, + group_count=4, + n_hid=256, + n_blk_per_group=2, + input_channels=3, + vocab_size=8192): + super(Encoder, self).__init__() + self.vocab_size = vocab_size + + blk_range = range(n_blk_per_group) + n_layers = group_count * n_blk_per_group + + self.blocks = nn.Sequential( + ('input', nn.Conv2D(input_channels, 1 * n_hid, 7, padding=3)), + ('group_1', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(1 * n_hid, 1 * n_hid, n_layers=n_layers)) + for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_2', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(1 * n_hid if i == 0 else 2 * n_hid, + 2 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_3', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(2 * n_hid if i == 0 else 4 * n_hid, + 4 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_4', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(4 * n_hid if i == 0 else 8 * n_hid, + 8 * n_hid, + n_layers=n_layers)) for i in blk_range], )), + ('output', + nn.Sequential( + ('relu', nn.ReLU()), + ('conv', nn.Conv2D(8 * n_hid, vocab_size, 1)), + )), + ) + + def forward(self, x): + return self.blocks(x) + + +class DecoderBlock(nn.Layer): + def __init__(self, n_in, n_out, n_layers): + super(DecoderBlock, self).__init__() + n_hid = n_out // 4 + self.post_gain = 1 / (n_layers**2) + + self.id_path = nn.Conv2D(n_in, n_out, + 1) if n_in != n_out else Identity() + self.res_path = nn.Sequential( + ('relu_1', nn.ReLU()), ('conv_1', nn.Conv2D(n_in, n_hid, 1)), + ('relu_2', nn.ReLU()), + ('conv_2', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_3', nn.ReLU()), + ('conv_3', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_4', nn.ReLU()), + ('conv_4', nn.Conv2D(n_hid, n_out, 3, padding=1))) + + def forward(self, x): + return self.id_path(x) + self.post_gain * self.res_path(x) + + +class Decoder(nn.Layer): + 
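+    # (Decoder overview) A 1x1 conv maps the 8192-way one-hot token grid to
+    # n_init channels, four groups of DecoderBlocks restore spatial resolution
+    # with 2x nearest-neighbour upsampling after the first three groups, and the
+    # output conv emits 2 * output_channels maps (in DALL-E these parameterise
+    # the logit-Laplace reconstruction: a mean and a log-scale per RGB channel).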
def __init__(self, + group_count=4, + n_init=128, + n_hid=256, + n_blk_per_group=2, + output_channels=3, + vocab_size=8192): + super(Decoder, self).__init__() + self.vocab_size = vocab_size + + blk_range = range(n_blk_per_group) + n_layers = group_count * n_blk_per_group + + self.blocks = nn.Sequential( + ('input', nn.Conv2D(vocab_size, n_init, 1)), + ('group_1', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(n_init if i == 0 else 8 * n_hid, + 8 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_2', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(8 * n_hid if i == 0 else 4 * n_hid, + 4 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_3', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(4 * n_hid if i == 0 else 2 * n_hid, + 2 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_4', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(2 * n_hid if i == 0 else 1 * n_hid, + 1 * n_hid, + n_layers=n_layers)) for i in blk_range], )), + ('output', + nn.Sequential( + ('relu', nn.ReLU()), + ('conv', nn.Conv2D(1 * n_hid, 2 * output_channels, 1)), + )), + ) + + def forward(self, x): + return self.blocks(x) + + +model_dict = { + 'encoder': [ + 'Encoder', + r'https://passl.bj.bcebos.com/vision_transformers/beit/encoder.pdparams', + 'encoder.pdparams' + ], + 'decoder': [ + 'Decoder', + r'https://passl.bj.bcebos.com/vision_transformers/beit/decoder.pdparams', + 'decoder.pdparams' + ] +} + + +def load_model(model_name, model_dir): + model_fn, url, file_name = model_dict[model_name] + model = eval(model_fn)() + + model_path = os.path.join(model_dir, file_name) + if not os.path.exists(model_path): + if not os.path.exists(model_dir): + os.makedirs(model_dir) + #wget.download(url, out=model_path) + params = paddle.load(model_path) + model.set_state_dict(params) + model.eval() + return model + + +# +# +#class DalleVAE(nn.Layer): +# def __init__(self, group_count=4, n_init=128, n_hid=256, n_blk_per_group=2, input_channels=3, output_channels=3, vocab_size=8192): +# super(DiscreteVAE, self).__init__() +# self.vocab_size = vocab_size +# self.encoder = Encoder() +# self.decoder = Decoder() +# self.l1_loss = paddle.nn.loss.L1Loss(reduction='none') +# +# def encode(self, x): +# return self.encoder(x) +# +# def decode(self, z): +# return self.decoder(z) +# +# +# def logit_laplace_loss(self, x, x_stats): +# ## x [ B, 3, 256, 256 ] +# ## x_stats [ B, 6, 256, 256 ] +# # mu +# mu = x_stats[:,:3] +# # +# lnb = x_stats[:,3:] +# log_norm = -paddle.log(x * (1 - x)) - lnb - paddle.log(paddle.to_tensor(2.0)) +# #print("log_norm", log_norm) +# log_compare = -self.l1_loss(paddle.log(x/(1-x)), mu) / paddle.exp(lnb) +# #print("log_compare", log_compare) +# return -(log_norm+log_compare) +# +# def gumbel_softmax(self, z_logits, temperature): +# +# def sample_gumbel(shape, eps=1e-20): +# U = paddle.fluid.layers.uniform_random(shape,min=0,max=1) +# return -paddle.log(-paddle.log(U + eps) + eps) +# +# def gumbel_softmax_sample(logits, temperature): +# y = logits + sample_gumbel(logits.shape) +# return nn.functional.softmax( y / temperature, axis=1) +# +# return gumbel_softmax_sample(z_logits, temperature) +# +# +# def forward(self, x, temperature): +# # [B, vocab_size, 32, 32] +# z_logits = self.encoder(x) +# q_y = nn.functional.softmax(z_logits, axis=1) +# log_q_y = 
paddle.log(q_y+1e-20) +# kl_loss = q_y*(log_q_y-paddle.log(paddle.to_tensor(1.0/self.vocab_size))) +# # to [B, 32, 32] +# kl_loss = paddle.sum(kl_loss, axis=[1]) +# # to [B] +# kl_loss = paddle.mean(kl_loss, axis=[1,2]) +# #print(kl_loss) +# +# z = self.gumbel_softmax(z_logits, temperature) +# x_stats = self.decoder(z) +# recon_loss = self.logit_laplace_loss(x, x_stats) +# recon_loss = paddle.mean(recon_loss, axis=[1, 2, 3]) +# #print(recon_loss) +# +# return recon_loss, kl_loss +# + +# +# +#def load_model(model_name, pretrained=False): +# model_fn, url, file_name = model_dict[model_name] +# model = model_fn() +# +# if pretrained: +# model_path = os.path.join('pretrained_models', file_name) +# if not os.path.isfile(model_path): +# if not os.path.exists('pretrained_models'): +# os.mkdir('pretrained_models') +# wget.download(url, out=model_path) +# params = paddle.load(model_path) +# model.set_dict(params) +# +# model.eval() +# return model + +from math import sqrt +import os +import paddle +from paddle import nn, einsum +import paddle.nn.functional as F +from einops import rearrange + +from .builder import BACKBONES + + +def top_k(logits, thres=0.5): + num_logits = logits.shape[-1] + k = max(int((1 - thres) * num_logits), 1) + val, ind = paddle.topk(logits, k) + probs = paddle.full_like(logits, float('-inf')) + probs.scatter_(1, ind, val) + return probs + + +def exists(val): + return val is not None + + +def default(val, d): + return val if exists(val) else d + + +def eval_decorator(fn): + def inner(model, *args, **kwargs): + was_training = model.training + model.eval() + out = fn(model, *args, **kwargs) + model.train(was_training) + return out + + return inner + + +class BasicVAE(nn.Layer): + def get_codebook_indices(self, images): + raise NotImplementedError() + + def decode(self, img_seq): + raise NotImplementedError() + + def get_codebook_probs(self, img_seq): + raise NotImplementedError() + + def get_image_tokens_size(self): + pass + + def get_image_size(self): + pass + + +class ResBlock(nn.Layer): + def __init__(self, chan_in, hidden_size, chan_out): + super().__init__() + self.net = nn.Sequential( + nn.Conv2D(chan_in, hidden_size, 3, padding=1), nn.ReLU(), + nn.Conv2D(hidden_size, hidden_size, 3, padding=1), nn.ReLU(), + nn.Conv2D(hidden_size, chan_out, 1)) + + def forward(self, x): + return self.net(x) + x + + +@BACKBONES.register() +class DiscreteVAE(BasicVAE): + def __init__(self, + image_size=256, + num_tokens=512, + codebook_dim=512, + num_layers=3, + hidden_dim=64, + channels=3, + smooth_l1_loss=False, + temperature=0.9, + straight_through=False, + kl_div_loss_weight=0.): + super().__init__() + # assert log2(image_size).is_integer(), 'image size must be a power of 2' + assert num_layers >= 1, 'number of layers must be greater than or equal to 1' + + self.image_size = image_size + self.num_tokens = num_tokens + self.num_layers = num_layers + self.temperature = temperature + self.straight_through = straight_through + self.codebook = nn.Embedding(num_tokens, codebook_dim) + + enc_layers = [] + dec_layers = [] + + enc_in = channels + dec_in = codebook_dim + + for layer_id in range(num_layers): + enc_layers.append( + nn.Sequential( + nn.Conv2D(enc_in, hidden_dim, 4, stride=2, padding=1), + nn.ReLU())) + enc_layers.append( + ResBlock(chan_in=hidden_dim, + hidden_size=hidden_dim, + chan_out=hidden_dim)) + enc_in = hidden_dim + dec_layers.append( + nn.Sequential( + nn.ConvTranspose2D(dec_in, + hidden_dim, + 4, + stride=2, + padding=1), nn.ReLU())) + dec_layers.append( + 
ResBlock(chan_in=hidden_dim, + hidden_size=hidden_dim, + chan_out=hidden_dim)) + dec_in = hidden_dim + + enc_layers.append(nn.Conv2D(hidden_dim, num_tokens, 1)) + dec_layers.append(nn.Conv2D(hidden_dim, channels, 1)) + + self.encoder = nn.Sequential(*enc_layers) + self.decoder = nn.Sequential(*dec_layers) + + self.loss_fn = F.smooth_l1_loss if smooth_l1_loss else F.mse_loss + self.kl_div_loss_weight = kl_div_loss_weight + + def get_image_size(self): + return self.image_size + + def get_image_tokens_size(self): + return self.image_size // 8 + + @paddle.no_grad() + @eval_decorator + def get_codebook_indices(self, images): + logits = self.forward(images, return_logits=True) + codebook_indices = logits.argmax(dim=1) + return codebook_indices + + @paddle.no_grad() + @eval_decorator + def get_codebook_probs(self, images): + logits = self.forward(images, return_logits=True) + return nn.Softmax(dim=1)(logits) + + def decode(self, img_seq): + image_embeds = self.codebook(img_seq) + b, n, d = image_embeds.shape + h = w = int(sqrt(n)) + + image_embeds = rearrange(image_embeds, 'b (h w) d -> b d h w', h=h, w=w) + images = self.decoder(image_embeds) + return images + + def forward(self, + img, + return_loss=False, + return_recons=False, + return_logits=False, + temp=None): + device, num_tokens, image_size, kl_div_loss_weight = img.device, self.num_tokens, self.image_size, self.kl_div_loss_weight + assert img.shape[-1] == image_size and img.shape[ + -2] == image_size, f'input must have the correct image size {image_size}' + + logits = self.encoder(img) + + if return_logits: + return logits # return logits for getting hard image indices for DALL-E training + + temp = default(temp, self.temperature) + soft_one_hot = F.gumbel_softmax(logits, + tau=temp, + dim=1, + hard=self.straight_through) + sampled = einsum('b n h w, n d -> b d h w', soft_one_hot, + self.codebook.weight) + out = self.decoder(sampled) + + if not return_loss: + return out + + # reconstruction loss + + recon_loss = self.loss_fn(img, out) + + # kl divergence + + logits = rearrange(logits, 'b n h w -> b (h w) n') + qy = F.softmax(logits, dim=-1) + + log_qy = paddle.log(qy + 1e-10) + log_uniform = paddle.log( + paddle.to_tensor([1. 
/ num_tokens], device=device)) + kl_div = F.kl_div(log_uniform, + log_qy, + None, + None, + 'batchmean', + log_target=True) + + loss = recon_loss + (kl_div * kl_div_loss_weight) + + if not return_recons: + return loss + + return loss, out + + +@BACKBONES.register() +class Dalle_VAE(BasicVAE): + def __init__(self, image_size): + super().__init__() + self.encoder = Encoder() + self.decoder = Decoder() + self.image_size = image_size + + def decode(self, img_seq): + bsz = img_seq.size()[0] + img_seq = img_seq.view(bsz, self.image_size // 8, self.image_size // 8) + z = F.one_hot(img_seq, + num_classes=self.encoder.vocab_size).permute(0, 3, 1, + 2).float() + return self.decoder(z).float() + + def get_codebook_indices(self, images): + z_logits = self.encoder(images) + return paddle.argmax(z_logits, axis=1) + + def get_codebook_probs(self, images): + z_logits = self.encoder(images) + return nn.Softmax(dim=1)(z_logits) + + def forward(self, img_seq_prob, no_process=False): + if no_process: + return self.decoder(img_seq_prob.float()).float() + else: + bsz, seq_len, num_class = img_seq_prob.size() + z = img_seq_prob.view(bsz, self.image_size // 8, + self.image_size // 8, self.encoder.vocab_size) + return self.decoder(z.permute(0, 3, 1, 2).float()).float() + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, inputs): + return inputs + + +class EncoderBlock(nn.Layer): + def __init__(self, n_in, n_out, n_layers): + super(EncoderBlock, self).__init__() + n_hid = n_out // 4 + self.post_gain = 1 / (n_layers**2) + + self.id_path = nn.Conv2D(n_in, n_out, + 1) if n_in != n_out else Identity() + self.res_path = nn.Sequential( + ('relu_1', nn.ReLU()), + ('conv_1', nn.Conv2D(n_in, n_hid, 3, padding=1)), + ('relu_2', nn.ReLU()), + ('conv_2', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_3', nn.ReLU()), + ('conv_3', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_4', nn.ReLU()), ('conv_4', nn.Conv2D(n_hid, n_out, 1))) + + def forward(self, x): + return self.id_path(x) + self.post_gain * self.res_path(x) + + +class Encoder(nn.Layer): + def __init__(self, + group_count=4, + n_hid=256, + n_blk_per_group=2, + input_channels=3, + vocab_size=8192): + super(Encoder, self).__init__() + self.vocab_size = vocab_size + + blk_range = range(n_blk_per_group) + n_layers = group_count * n_blk_per_group + + self.blocks = nn.Sequential( + ('input', nn.Conv2D(input_channels, 1 * n_hid, 7, padding=3)), + ('group_1', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(1 * n_hid, 1 * n_hid, n_layers=n_layers)) + for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_2', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(1 * n_hid if i == 0 else 2 * n_hid, + 2 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_3', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(2 * n_hid if i == 0 else 4 * n_hid, + 4 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_4', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(4 * n_hid if i == 0 else 8 * n_hid, + 8 * n_hid, + n_layers=n_layers)) for i in blk_range], )), + ('output', + nn.Sequential( + ('relu', nn.ReLU()), + ('conv', nn.Conv2D(8 * n_hid, vocab_size, 1)), + )), + ) + + def forward(self, x): + return self.blocks(x) + + +class DecoderBlock(nn.Layer): + def __init__(self, n_in, n_out, n_layers): + super(DecoderBlock, self).__init__() + n_hid = n_out // 4 + self.post_gain = 1 / 
(n_layers**2) + + self.id_path = nn.Conv2D(n_in, n_out, + 1) if n_in != n_out else Identity() + self.res_path = nn.Sequential( + ('relu_1', nn.ReLU()), ('conv_1', nn.Conv2D(n_in, n_hid, 1)), + ('relu_2', nn.ReLU()), + ('conv_2', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_3', nn.ReLU()), + ('conv_3', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_4', nn.ReLU()), + ('conv_4', nn.Conv2D(n_hid, n_out, 3, padding=1))) + + def forward(self, x): + return self.id_path(x) + self.post_gain * self.res_path(x) + + +class Decoder(nn.Layer): + def __init__(self, + group_count=4, + n_init=128, + n_hid=256, + n_blk_per_group=2, + output_channels=3, + vocab_size=8192): + super(Decoder, self).__init__() + self.vocab_size = vocab_size + + blk_range = range(n_blk_per_group) + n_layers = group_count * n_blk_per_group + + self.blocks = nn.Sequential( + ('input', nn.Conv2D(vocab_size, n_init, 1)), + ('group_1', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(n_init if i == 0 else 8 * n_hid, + 8 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_2', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(8 * n_hid if i == 0 else 4 * n_hid, + 4 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_3', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(4 * n_hid if i == 0 else 2 * n_hid, + 2 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_4', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(2 * n_hid if i == 0 else 1 * n_hid, + 1 * n_hid, + n_layers=n_layers)) for i in blk_range], )), + ('output', + nn.Sequential( + ('relu', nn.ReLU()), + ('conv', nn.Conv2D(1 * n_hid, 2 * output_channels, 1)), + )), + ) + + def forward(self, x): + return self.blocks(x) diff --git a/passl/modeling/heads/__init__.py b/passl/modeling/heads/__init__.py index 5f86aaf3..9b443691 100644 --- a/passl/modeling/heads/__init__.py +++ b/passl/modeling/heads/__init__.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .beit_head import BEiTClsHead, BEiTPTHead, BEiTFTHead +from .builder import build_head +from .cait_head import CaitClsHead from .contrastive_head import ContrastiveHead from .clas_head import ClasHead +from .clip_head import CLIPHead +from .cvt_head import CvTClsHead from .l2_head import L2Head from .mb_head import MBHead -from .clip_head import CLIPHead -from .builder import build_head +from .mlp_mixer_head import MlpMixerClsHead +from .pixpro_head import PixProHead from .simclr_contrastive_head import SimCLRContrastiveHead -from .vision_transformer_head import VisionTransformerClsHead from .swin_transformer_head import SwinTransformerClsHead -from .beit_head import BeitClsHead from .t2t_vit_head import T2TViTClsHead -from .cait_head import CaitClsHead -from .mlp_mixer_head import MlpMixerClsHead -from .cvt_head import CvTClsHead -from .pixpro_head import PixProHead +from .vision_transformer_head import VisionTransformerClsHead diff --git a/passl/modeling/heads/beit_head.py b/passl/modeling/heads/beit_head.py index c9e6d629..c852e6b4 100644 --- a/passl/modeling/heads/beit_head.py +++ b/passl/modeling/heads/beit_head.py @@ -12,18 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
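With Mixup/CutMix enabled for finetuning, labels reach the head as soft probability vectors, so the BEiTFTHead added below uses a soft-target cross-entropy rather than nn.CrossEntropyLoss. A minimal, self-contained rendering of that expression with made-up values:

    import paddle
    import paddle.nn.functional as F

    logits = paddle.to_tensor([[2.0, 0.5, -1.0]])      # head output for one sample
    soft_labels = paddle.to_tensor([[0.7, 0.3, 0.0]])  # mixed targets produced by Mixup

    # same formula as BEiTFTHead.loss(..., soft=True)
    loss = paddle.sum(-soft_labels * F.log_softmax(logits, axis=-1), axis=-1).mean()
    # i.e. 0.7 * (-log p[0]) + 0.3 * (-log p[1]) for this sample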
+import sys +import math import paddle import paddle.nn as nn +from paddle import multiply +from paddle.nn import Identity +import paddle.nn.functional as F from .builder import HEADS trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) zeros_ = nn.initializer.Constant(value=0.0) +ones_ = nn.initializer.Constant(value=1.0) @HEADS.register() -class BeitClsHead(nn.Layer): - """Swin Transformer classifier head. +class BEiTClsHead(nn.Layer): + """BEiT classifier head. Args: in_channels (int): Number of channels in the input feature map. @@ -73,3 +79,111 @@ def accuracy(output, target, topk=(1, )): correct_k = correct[:k].reshape([-1]).sum(0, keepdim=True) res.append(correct_k * 100.0 / batch_size) return res + + +@HEADS.register() +class BEiTPTHead(nn.Layer): + """BEiT Pretrain Head. + + Args: + in_channels (int): Number of channels in the input feature map. + num_classes (int): Number of categories excluding the background category. + """ + def __init__(self, in_channels=None, num_classes=None, init_scale=0.001): + super().__init__() + self.criterion = nn.CrossEntropyLoss() + + def forward(self, cls_score, labels): + losses = dict() + losses["loss"] = self.criterion(cls_score, labels) + loss_value = losses["loss"].item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + #lossmlm_acc = (cls_score.max(-1) == labels).astype('float32').mean().item() + + losses["mlm_acc"] = accuracy(cls_score, labels)[0] + return losses + + +def accuracy(output, target, topk=(1, )): + """Computes the accuracy over the k top predictions for the specified values of k""" + with paddle.no_grad(): + maxk = max(topk) + batch_size = target.shape[0] + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = paddle.cast(pred == target.reshape([1, -1]).expand_as(pred), + "float32") + + res = [] + for k in topk: + correct_k = correct[:k].reshape([-1]).sum(0, keepdim=True) + res.append(correct_k * 100.0 / batch_size) + return res + + +@HEADS.register() +class BEiTFTHead(nn.Layer): + """BEiT Finetune Head. + + Args: + in_channels (int): Number of channels in the input feature map. + num_classes (int): Number of categories excluding the background category. 
+ """ + def __init__(self, in_channels=None, num_classes=None, init_scale=0.001): + super(BEiTFTHead, self).__init__() + self.head = nn.Linear(in_channels, + num_classes) if num_classes > 0 else Identity() + self.criterion = nn.CrossEntropyLoss() + trunc_normal_(self.head.weight) + self.apply(self._init_weights) + + self.head.weight.set_value( + multiply(self.head.weight, paddle.to_tensor(init_scale))) + self.head.bias.set_value( + multiply(self.head.bias, paddle.to_tensor(init_scale))) + + def forward(self, x): + x = self.head(x) + return x + + def loss(self, x, labels, soft=True): + losses = dict() + if soft: + losses['loss'] = paddle.sum(-labels * F.log_softmax(x, axis=-1), + axis=-1).mean() + else: + losses["loss"] = self.criterion(x, labels) + losses['acc1'], losses['acc5'] = accuracy(x, labels, topk=(1, 5)) + return losses + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + +def accuracy(output, target, topk=(1, )): + """Computes the accuracy over the k top predictions for the specified values of k""" + with paddle.no_grad(): + maxk = max(topk) + if target.dim() > 1: + target = target.argmax(axis=-1) + batch_size = target.shape[0] + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = paddle.cast(pred == target.reshape([1, -1]).expand_as(pred), + 'float32') + + res = [] + for k in topk: + correct_k = correct[:k].reshape([-1]).sum(0, keepdim=True) + res.append(correct_k * 100.0 / batch_size) + return res diff --git a/passl/solver/builder.py b/passl/solver/builder.py index b29275d1..3f4573ee 100644 --- a/passl/solver/builder.py +++ b/passl/solver/builder.py @@ -48,14 +48,12 @@ def build_lr_scheduler_simclr(cfg, iters_per_epoch, batch_size, epochs, # FIXME: if have a better way if cfg.name == 'CosineAnnealingDecay': - cfg.T_max = T_max cfg.T_max *= iters_per_epoch elif cfg.name == 'MultiStepDecay': cfg.milestones = [x * iters_per_epoch for x in cfg.milestones] elif cfg.name == 'Cosinesimclr': cfg.iters_per_epoch = iters_per_epoch cfg.epochs = epochs - cfg.T_max = T_max elif cfg.name == 'simclrCosineWarmup': cfg.step_each_epoch = iters_per_epoch cfg.epochs = epochs @@ -93,9 +91,85 @@ def build_clip_optimizer(cfg, lr_scheduler, parameters=None): return OPTIMIZERS.get(name)(lr_scheduler, **cfg) +def get_num_layer_for_vit(var_name, num_max_layer): + if var_name in ("backbone.cls_token", "backbone.mask_token", + "backbone.pos_embed"): + return 0 + elif var_name.startswith("backbone.patch_embed"): + return 0 + elif var_name.startswith("backbone.rel_pos_bias"): + return num_max_layer - 1 + elif var_name.startswith("backbone.blocks"): + layer_id = int(var_name.split('.')[2]) + return layer_id + 1 + else: + return num_max_layer - 1 + + +class LayerDecayValueAssigner(object): + def __init__(self, values): + self.values = values + + def get_scale(self, layer_id): + return self.values[layer_id] + + def get_layer_id(self, var_name): + return get_num_layer_for_vit(var_name, len(self.values)) + + +def get_parameter_groups(cfg, + model, + skip_list=(), + get_num_layer=None, + get_layer_scale=None): + weight_decay = cfg['weight_decay'] + parameter_group_names = {} + parameter_group_vars = {} + + for name, param in model.named_parameters(): + if param.stop_gradient: + continue # frozen weights + if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: + group_name = "no_decay" + 
this_weight_decay = 0. + else: + group_name = "decay" + this_weight_decay = weight_decay + if get_num_layer is not None: + layer_id = get_num_layer(name) + group_name = "layer_%d_%s" % (layer_id, group_name) + else: + layer_id = None + + if group_name not in parameter_group_names: + if get_layer_scale is not None: + scale = get_layer_scale(layer_id) + else: + scale = 1. + + parameter_group_names[group_name] = { + "weight_decay": this_weight_decay, + "params": [], + "learning_rate": scale + } + parameter_group_vars[group_name] = { + "weight_decay": this_weight_decay, + "params": [], + "learning_rate": scale + } + parameter_group_vars[group_name]["params"].append(param) + parameter_group_names[group_name]["params"].append(name) + return list(parameter_group_vars.values()) + + def build_optimizer(cfg, lr_scheduler, model_list=None): cfg = copy.deepcopy(cfg) name = cfg.pop('name') + if 'layer_decay' in cfg: + layer_decay = cfg.pop('layer_decay') + assert isinstance(layer_decay, float) + if layer_decay is None: + layer_decay = 1.0 # step 1 clip grad if 'grad_clip' in cfg: @@ -107,8 +181,21 @@ def build_optimizer(cfg, lr_scheduler, model_list=None): clip_norm = grad_clip_cfg['value'] cfg['grad_clip'] = ClipGradByNorm(clip_norm=clip_norm) - parameters = sum([m.parameters() - for m in model_list], []) if model_list else None + if layer_decay < 1.0: + num_layers = model_list[0].backbone.get_num_layers() + assigner = LayerDecayValueAssigner( + list(layer_decay**(num_layers + 1 - i) + for i in range(num_layers + 2))) + else: + assigner = None + if assigner is not None: + parameters = get_parameter_groups(cfg, + model_list[0], + get_num_layer=assigner.get_layer_id, + get_layer_scale=assigner.get_scale) + else: + parameters = sum([m.parameters() + for m in model_list], []) if model_list else None # step 2 Adapt Lars and Lamb optimizer parameter argument. if 'Lars' in name or 'Lamb' in name:
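For orientation, the layer-wise decay above turns a single layer_decay value into one learning-rate scale per parameter group (group 0 = patch/position/cls embeddings, groups 1..depth = transformer blocks, the last group = everything else, including the head). Evaluating the same expression in plain Python for a 12-block backbone with layer_decay = 0.65:

    layer_decay = 0.65
    num_layers = 12
    scales = [layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2)]

    # scales[0]  ~= 0.0037  -> pos_embed / cls_token / patch_embed
    # scales[1]  ~= 0.0057  -> blocks.0
    # scales[12] == 0.65    -> blocks.11
    # scales[13] == 1.0     -> head, final norm and other top-level parameters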