diff --git a/configs/beit/beit_base_p16_224_ft_1k.yaml b/configs/beit/beit_base_p16_224_ft_1k.yaml new file mode 100644 index 00000000..4628cbc3 --- /dev/null +++ b/configs/beit/beit_base_p16_224_ft_1k.yaml @@ -0,0 +1,123 @@ +epochs: 100 +output_dir: output_dir +seed: 0 +device: gpu + +model: + name: BEiTFTWrapper + architecture: + name: VisionTransformerForFinetune + img_size: 224 + embed_dim: 768 + patch_size: 16 + depth: 12 + num_heads: 12 + mlp_ratio: 4 + qkv_bias: True + drop_path_rate: 0.1 + init_values: 0.1 + use_abs_pos_emb: False + use_rel_pos_bias: True + head: + name: BEiTFTHead + num_classes: 1000 + in_channels: 768 + +dataloader: + train: + loader: + num_workers: 8 + use_shared_memory: True + sampler: + batch_size: 128 + shuffle: True + drop_last: True + dataset: + name: ImageNet + dataroot: data/ILSVRC2012/train/ + return_label: True + transforms: + - name: RandomResizedCrop + size: 224 + scale: [0.08, 1.] + interpolation: 'bicubic' + - name: RandomHorizontalFlip + - name: AutoAugment + config_str: 'rand-m9-mstd0.5-inc1' + interpolation: 'bicubic' + img_size: 224 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + - name: Transpose + - name: NormalizeImage + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + - name: RandomErasing + prob: 0.25 + mode: 'pixel' + max_count: 1 + batch_transforms: + - name: Mixup + mixup_alpha: 0.8 + prob: 1. + switch_prob: 0.5 + mode: 'batch' + cutmix_alpha: 1.0 + val: + loader: + num_workers: 8 + use_shared_memory: True + sampler: + batch_size: 64 + shuffle: false + drop_last: false + dataset: + name: ImageNet + dataroot: data/ILSVRC2012/val + return_label: True + transforms: + - name: Resize + size: 256 + interpolation: 'bicubic' + - name: CenterCrop + size: 224 + - name: Transpose + - name: Normalize + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + +lr_scheduler: + name: LinearWarmup + learning_rate: + name: CosineAnnealingDecay + learning_rate: 4e-3 + T_max: 100 + eta_min: 1e-6 + warmup_steps: 20 + start_lr: 0 + end_lr: 4e-3 + +optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + weight_decay: 0.05 + epsilon: 1e-8 + exclude_from_weight_decay: ["pos_embed","cls_token",".bias","norm","gamma"] + layer_decay: 0.65 + +log_config: + name: LogHook + interval: 10 + +checkpoint: + name: CheckpointHook + by_epoch: true + interval: 1 + +custom_config: + - name: EvaluateHook + +vdl_config: + name: VisualHook diff --git a/configs/beit/beit_base_p16_224_pt_1k.yaml b/configs/beit/beit_base_p16_224_pt_1k.yaml new file mode 100644 index 00000000..3ff214d3 --- /dev/null +++ b/configs/beit/beit_base_p16_224_pt_1k.yaml @@ -0,0 +1,104 @@ +epochs: 800 +output_dir: output_dir +seed: 0 +device: gpu + +model: + name: BEiTPTWrapper + architecture: + name: VisionTransformerForMaskedImageModeling + img_size: 224 + embed_dim: 768 + patch_size: 16 + depth: 12 + num_heads: 12 + mlp_ratio: 4 + use_abs_pos_emb: False + use_rel_pos_bias: False + use_shared_rel_pos_bias: True + init_values: 0.1 + drop_path_rate: 0.1 + head: + name: BEiTPTHead + num_classes: 1000 + in_channels: 768 + d_vae: + name: dall-e + weight_path: 'dvae/' + image_size: 112 + +dataloader: + train: + loader: + num_workers: 0 + use_shared_memory: False + sampler: + batch_size: 128 + shuffle: True + drop_last: True + dataset: + name: BEiT_ImageNet + dataroot: data/ILSVRC2012/train/ + common_transforms: + - name: ToRGB + - name: ColorJitter + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - name: RandomHorizontalFlip + - name: 
RandomResizedCropAndInterpolationWithTwoPic + size: 224 + second_size: 112 + interpolation: 'bicubic' + second_interpolation: 'lanczos' + patch_transforms: + - name: Transpose + - name: NormalizeImage + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + visual_token_transforms: + - name: Transpose + - name: VisualTokenMap + mode: 'map_pixels' + scale: 255 + masking_generator: + input_size: 14 + num_masking_patches: 75 + max_num_patches: None + min_num_patches: 16 + +lr_scheduler: + name: LinearWarmup + learning_rate: + name: CosineAnnealingDecay + learning_rate: 3e-3 + T_max: 800 + eta_min: 1e-5 + warmup_steps: 10 + start_lr: 0 + end_lr: 3e-3 + +optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + weight_decay: 0.05 + epsilon: 1e-8 + exclude_from_weight_decay: ["pos_embed","cls_token",".bias","norm","gamma"] + grad_clip: + name: global_norm + value: 3.0 + +log_config: + name: LogHook + interval: 1 + +checkpoint: + name: CheckpointHook + by_epoch: True + interval: 1 + +vdl_config: + name: VisualHook diff --git a/configs/byol/byol_clas_r50.yaml b/configs/byol/byol_clas_r50.yaml index 3fa7cd11..70965f6d 100644 --- a/configs/byol/byol_clas_r50.yaml +++ b/configs/byol/byol_clas_r50.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: ByolClassification @@ -16,7 +18,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -30,11 +34,13 @@ dataloader: - name: RandomHorizontalFlip - name: Resize size: [224,224] - interpolation: bicubic + interpolation: bicubic - name: ByolNormalize - name: Clip val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false @@ -49,7 +55,7 @@ dataloader: - name: CenterCrop size: 224 - name: ByolNormalize - - name: Clip + - name: Clip lr_scheduler: name: ByolLRScheduler diff --git a/configs/byol/byol_r50_IM.yaml b/configs/byol/byol_r50_IM.yaml index 379bae08..56ae2970 100644 --- a/configs/byol/byol_r50_IM.yaml +++ b/configs/byol/byol_r50_IM.yaml @@ -1,8 +1,11 @@ epochs: 300 use_byol_iters: True total_images: 1281167 -global_batch_size: 4096 # 128 * 4 * 8 +global_batch_size: 4096 output_dir: output_dir +seed: 0 +device: gpu + model: name: BYOL backbone: @@ -33,7 +36,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True diff --git a/configs/cait/cait_m36_384.yaml b/configs/cait/cait_m36_384.yaml index e8097250..4b56d5f1 100644 --- a/configs/cait/cait_m36_384.yaml +++ b/configs/cait/cait_m36_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_m48_448.yaml b/configs/cait/cait_m48_448.yaml index d4d5b6b3..163d3d86 100644 --- a/configs/cait/cait_m48_448.yaml +++ b/configs/cait/cait_m48_448.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_s24_224.yaml b/configs/cait/cait_s24_224.yaml index 1e8c0299..9734f6d6 100644 --- a/configs/cait/cait_s24_224.yaml +++ b/configs/cait/cait_s24_224.yaml @@ -1,5 +1,7 @@ epochs: 300 
output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_s24_384.yaml b/configs/cait/cait_s24_384.yaml index 830bd0c5..2f3c3500 100644 --- a/configs/cait/cait_s24_384.yaml +++ b/configs/cait/cait_s24_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_s36_384.yaml b/configs/cait/cait_s36_384.yaml index 2b1f1959..c138ea6e 100644 --- a/configs/cait/cait_s36_384.yaml +++ b/configs/cait/cait_s36_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cait/cait_xs24_384.yaml b/configs/cait/cait_xs24_384.yaml index d855eef7..197653e8 100644 --- a/configs/cait/cait_xs24_384.yaml +++ b/configs/cait/cait_xs24_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CaitWrapper @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/clip/vit-b-32.yaml b/configs/clip/vit-b-32.yaml index 549ff58b..0e40fd6d 100644 --- a/configs/clip/vit-b-32.yaml +++ b/configs/clip/vit-b-32.yaml @@ -1,5 +1,7 @@ epochs: 10 output_dir: output_dir +seed: 0 +device: gpu model: name: CLIPWrapper @@ -24,7 +26,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -38,7 +42,7 @@ dataloader: size: 224 scale: [0.75, 1.] ratio: [1., 1.] 
- - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] diff --git a/configs/convnext/convnext_small_224.yaml b/configs/convnext/convnext_small_224.yaml index 78c28c31..fb772704 100644 --- a/configs/convnext/convnext_small_224.yaml +++ b/configs/convnext/convnext_small_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -14,7 +16,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/convnext/convnext_tiny_224.yaml b/configs/convnext/convnext_tiny_224.yaml index 03001502..3d4b9829 100644 --- a/configs/convnext/convnext_tiny_224.yaml +++ b/configs/convnext/convnext_tiny_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -14,7 +16,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cvt/cvt_13_224.yaml b/configs/cvt/cvt_13_224.yaml index 3e23777d..69617d29 100644 --- a/configs/cvt/cvt_13_224.yaml +++ b/configs/cvt/cvt_13_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CvTWrapper @@ -15,7 +17,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cvt/cvt_21_224.yaml b/configs/cvt/cvt_21_224.yaml index 46f31b20..44156ad3 100644 --- a/configs/cvt/cvt_21_224.yaml +++ b/configs/cvt/cvt_21_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CvTWrapper @@ -15,7 +17,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cvt/cvt_21_384.yaml b/configs/cvt/cvt_21_384.yaml index 466500e9..cf8412ae 100644 --- a/configs/cvt/cvt_21_384.yaml +++ b/configs/cvt/cvt_21_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CvTWrapper @@ -15,7 +17,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/cvt/cvt_w24_384.yaml b/configs/cvt/cvt_w24_384.yaml index 03b88756..79f2a9ce 100644 --- a/configs/cvt/cvt_w24_384.yaml +++ b/configs/cvt/cvt_w24_384.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: CvTWrapper @@ -15,7 +17,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/deit/deit-base-p16-pt_in1k-224_2n16c_fp16_o1_dp.yaml b/configs/deit/deit-base-p16-pt_in1k-224_2n16c_fp16_o1_dp.yaml index 1f0a67b9..1135ab80 100644 --- a/configs/deit/deit-base-p16-pt_in1k-224_2n16c_fp16_o1_dp.yaml +++ b/configs/deit/deit-base-p16-pt_in1k-224_2n16c_fp16_o1_dp.yaml @@ -1,5 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir +seed: 0 +device: gpu use_amp: True AMP: @@ -16,29 +18,31 @@ AMP: model: name: DeiTWrapper architecture: - name: DeiTVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: DeiTVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.0 - 
drop_path_rate : 0.1 + drop_path_rate : 0.1 dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: name: DistributedRepeatedAugSampler - batch_size: 64 + batch_size: 64 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -53,7 +57,7 @@ dataloader: - name: AutoAugment config_str: 'rand-m9-mstd0.5-inc1' interpolation: 'bicubic' - img_size: 224 + img_size: 224 - name: Normalize data_format: 'HWC' mean: [123.675, 116.28, 103.53] @@ -74,7 +78,9 @@ dataloader: num_classes: 1000 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -103,7 +109,7 @@ lr_config: unit: 'epoch' lr_scheduler: - name: TimmCosine + name: TimmCosine learning_rate: 1e-3 eta_min: 1e-5 warmup_epoch: 5 diff --git a/configs/mae/mae_vit_b_finetune.yaml b/configs/mae/mae_vit_b_finetune.yaml index a8a2a001..7d0e1af7 100644 --- a/configs/mae/mae_vit_b_finetune.yaml +++ b/configs/mae/mae_vit_b_finetune.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: MAE_FINETUNE @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/mae/mae_vit_b_pretrain.yaml b/configs/mae/mae_vit_b_pretrain.yaml index c6aab60d..b9381abf 100644 --- a/configs/mae/mae_vit_b_pretrain.yaml +++ b/configs/mae/mae_vit_b_pretrain.yaml @@ -1,5 +1,7 @@ epochs: 800 output_dir: output_dir +seed: 0 +device: gpu model: name: MAE_PRETRAIN @@ -14,10 +16,11 @@ model: decoder_num_heads: 16 mlp_ratio: 4 - dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/mlp_mixer/mlp-mixer_b16_224.yaml b/configs/mlp_mixer/mlp-mixer_b16_224.yaml index 025d591e..f58b5851 100644 --- a/configs/mlp_mixer/mlp-mixer_b16_224.yaml +++ b/configs/mlp_mixer/mlp-mixer_b16_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: MlpMixerWrapper @@ -16,7 +18,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/mlp_mixer/mlp-mixer_l16_224.yaml b/configs/mlp_mixer/mlp-mixer_l16_224.yaml index 54582f0a..bf709450 100644 --- a/configs/mlp_mixer/mlp-mixer_l16_224.yaml +++ b/configs/mlp_mixer/mlp-mixer_l16_224.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: MlpMixerWrapper @@ -16,7 +18,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/moco/moco_clas_r50.yaml b/configs/moco/moco_clas_r50.yaml index 33d86461..fb1369d2 100644 --- a/configs/moco/moco_clas_r50.yaml +++ b/configs/moco/moco_clas_r50.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -14,7 +16,9 @@ model: dataloader: train: - num_workers: 6 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: true @@ -33,7 +37,9 @@ dataloader: mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] val: - num_workers: 4 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: false diff --git a/configs/moco/moco_v1_r50.yaml b/configs/moco/moco_v1_r50.yaml index 70ce6611..38dfb443 100644 --- 
a/configs/moco/moco_v1_r50.yaml +++ b/configs/moco/moco_v1_r50.yaml @@ -1,5 +1,7 @@ epochs: 200 output_dir: output_dir +seed: 0 +device: gpu model: name: MoCo @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: true diff --git a/configs/moco/moco_v2_r50.yaml b/configs/moco/moco_v2_r50.yaml index 8d675aa3..6478fc7c 100644 --- a/configs/moco/moco_v2_r50.yaml +++ b/configs/moco/moco_v2_r50.yaml @@ -1,5 +1,7 @@ epochs: 200 output_dir: output_dir +seed: 0 +device: gpu model: name: MoCo @@ -18,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: true diff --git a/configs/moco_byol/moco_byol_r50_IM.yaml b/configs/moco_byol/moco_byol_r50_IM.yaml index 67dfc2a7..25da1042 100644 --- a/configs/moco_byol/moco_byol_r50_IM.yaml +++ b/configs/moco_byol/moco_byol_r50_IM.yaml @@ -3,6 +3,8 @@ use_byol_iters: True total_images: 1281167 global_batch_size: 4096 # 128 * 4 * 8 output_dir: output_dir +seed: 0 +device: gpu model: name: MoCoBYOL @@ -32,7 +34,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/moco_byol/moco_byol_r50_IM_clas.yaml b/configs/moco_byol/moco_byol_r50_IM_clas.yaml index 381db80f..72a052f6 100644 --- a/configs/moco_byol/moco_byol_r50_IM_clas.yaml +++ b/configs/moco_byol/moco_byol_r50_IM_clas.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -16,7 +18,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -28,7 +32,7 @@ dataloader: transforms: - name: RandomResizedCrop size: 224 - interpolation: bicubic + interpolation: bicubic - name: RandomHorizontalFlip - name: Transpose - name: NormalizeImage @@ -36,7 +40,9 @@ dataloader: mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: false @@ -48,7 +54,7 @@ dataloader: transforms: - name: Resize size: 256 - interpolation: bicubic + interpolation: bicubic - name: CenterCrop size: 224 - name: Transpose @@ -73,4 +79,3 @@ log_config: custom_config: - name: EvaluateHook - diff --git a/configs/pixpro/pixpro_base_r50_100ep.yaml b/configs/pixpro/pixpro_base_r50_100ep.yaml index 401d89ad..45138a52 100644 --- a/configs/pixpro/pixpro_base_r50_100ep.yaml +++ b/configs/pixpro/pixpro_base_r50_100ep.yaml @@ -1,6 +1,8 @@ epochs: 100 output_dir: output_dir total_images: 1281167 +seed: 0 +device: gpu model: name: PixPro @@ -27,7 +29,9 @@ model: dataloader: train: - num_workers: 4 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True diff --git a/configs/pixpro/pixpro_base_r50_100ep_IM_clas.yaml b/configs/pixpro/pixpro_base_r50_100ep_IM_clas.yaml index 253756fb..987e2dcc 100644 --- a/configs/pixpro/pixpro_base_r50_100ep_IM_clas.yaml +++ b/configs/pixpro/pixpro_base_r50_100ep_IM_clas.yaml @@ -1,5 +1,7 @@ epochs: 100 output_dir: output_dir +seed: 0 +device: gpu model: name: Classification @@ -14,7 +16,9 @@ model: dataloader: train: - num_workers: 6 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: true @@ -32,7 +36,9 @@ dataloader: mean: [123.675, 116.28, 103.53] std: [58.395, 57.12, 57.375] val: - num_workers: 4 + loader: + num_workers: 8 + 
use_shared_memory: True sampler: batch_size: 64 shuffle: false diff --git a/configs/swin_transformer/SwinTransformer_base_patch4_window7_224.yaml b/configs/swin_transformer/SwinTransformer_base_patch4_window7_224.yaml index 63d79201..9672bbc8 100644 --- a/configs/swin_transformer/SwinTransformer_base_patch4_window7_224.yaml +++ b/configs/swin_transformer/SwinTransformer_base_patch4_window7_224.yaml @@ -1,6 +1,7 @@ epochs: 300 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -19,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -54,7 +57,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/swin_transformer/SwinTransformer_giant_patch4_window7_224.yaml b/configs/swin_transformer/SwinTransformer_giant_patch4_window7_224.yaml index 2b20d15f..c91af03a 100644 --- a/configs/swin_transformer/SwinTransformer_giant_patch4_window7_224.yaml +++ b/configs/swin_transformer/SwinTransformer_giant_patch4_window7_224.yaml @@ -2,6 +2,7 @@ epochs: 300 output_dir: output_dir seed: 16 use_amp: True +device: gpu AMP: level: 'O1' @@ -19,6 +20,7 @@ hybrid: mp_degree: 1 pp_degree: 1 +# Make sure your paddle version is develop to use sharding. sharding: sharding_stage: 2 offload: False @@ -42,7 +44,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 8 shuffle: true @@ -78,7 +82,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: false diff --git a/configs/swin_transformer/SwinTransformer_small_patch4_window7_224.yaml b/configs/swin_transformer/SwinTransformer_small_patch4_window7_224.yaml index da9cd628..f7e70893 100644 --- a/configs/swin_transformer/SwinTransformer_small_patch4_window7_224.yaml +++ b/configs/swin_transformer/SwinTransformer_small_patch4_window7_224.yaml @@ -1,6 +1,7 @@ epochs: 300 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -19,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -39,7 +42,7 @@ dataloader: interpolation: 'bicubic' img_size: 224 - name: Transpose - - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false @@ -71,7 +76,7 @@ dataloader: - name: CenterCrop size: 224 - name: Transpose - - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] diff --git a/configs/swin_transformer/SwinTransformer_tiny_patch4_window7_224.yaml b/configs/swin_transformer/SwinTransformer_tiny_patch4_window7_224.yaml index 47f902db..8ec068cc 100644 --- a/configs/swin_transformer/SwinTransformer_tiny_patch4_window7_224.yaml +++ b/configs/swin_transformer/SwinTransformer_tiny_patch4_window7_224.yaml @@ -1,6 +1,7 @@ epochs: 300 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -19,7 +20,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true @@ -39,7 +42,7 @@ 
dataloader: interpolation: 'bicubic' img_size: 224 - name: Transpose - - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false @@ -71,7 +76,7 @@ dataloader: - name: CenterCrop size: 224 - name: Transpose - - name: NormalizeImage + - name: NormalizeImage scale: 1.0/255.0 mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] diff --git a/configs/t2t_vit/t2t_vit_14.yaml b/configs/t2t_vit/t2t_vit_14.yaml index 4fe7c7bc..607fa892 100644 --- a/configs/t2t_vit/t2t_vit_14.yaml +++ b/configs/t2t_vit/t2t_vit_14.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_19.yaml b/configs/t2t_vit/t2t_vit_19.yaml index d3ff014e..07c1cc9e 100644 --- a/configs/t2t_vit/t2t_vit_19.yaml +++ b/configs/t2t_vit/t2t_vit_19.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_24.yaml b/configs/t2t_vit/t2t_vit_24.yaml index b107632b..4fbe27a9 100644 --- a/configs/t2t_vit/t2t_vit_24.yaml +++ b/configs/t2t_vit/t2t_vit_24.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_t_14.yaml b/configs/t2t_vit/t2t_vit_t_14.yaml index 8769024b..1f1112ab 100644 --- a/configs/t2t_vit/t2t_vit_t_14.yaml +++ b/configs/t2t_vit/t2t_vit_t_14.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_t_19.yaml b/configs/t2t_vit/t2t_vit_t_19.yaml index f546cc4a..eebe0364 100644 --- a/configs/t2t_vit/t2t_vit_t_19.yaml +++ b/configs/t2t_vit/t2t_vit_t_19.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/t2t_vit/t2t_vit_t_24.yaml b/configs/t2t_vit/t2t_vit_t_24.yaml index c634fd44..18a3b9da 100644 --- a/configs/t2t_vit/t2t_vit_t_24.yaml +++ b/configs/t2t_vit/t2t_vit_t_24.yaml @@ -1,5 +1,7 @@ epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: T2TViTWrapper @@ -17,7 +19,9 @@ model: dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: true diff --git a/configs/vision_transformer/vit-base-p16-ft_in1k-384.yaml b/configs/vision_transformer/vit-base-p16-ft_in1k-384.yaml index eabe6e68..3cfcd2bc 100644 --- a/configs/vision_transformer/vit-base-p16-ft_in1k-384.yaml +++ b/configs/vision_transformer/vit-base-p16-ft_in1k-384.yaml @@ -1,37 +1,41 @@ -epochs: 300 
+epochs: 300 output_dir: output_dir +seed: 0 +device: gpu model: name: ViTWrapper architecture: - name: VisionTransformer - img_size: 384 - patch_size: 16 - width: 768 - depth: 8 - num_heads: 8 - mlp_ratio: 3 - qkv_bias: True + name: VisionTransformer + img_size: 384 + patch_size: 16 + width: 768 + depth: 8 + num_heads: 8 + mlp_ratio: 3 + qkv_bias: True head: name: VisionTransformerClsHead num_classes: 1000 - in_channels: 768 + in_channels: 768 dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: true drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: - name: ToRGB - name: RandomResizedCrop - size: 384 + size: 384 scale: [0.75, 1.] ratio: [1., 1.] interpolation: 'bicubic' @@ -41,12 +45,12 @@ dataloader: std: [127.5, 127.5, 127.5] lr_scheduler: - name: CosineWarmup + name: CosineWarmup learning_rate: 12.28 T_max: 93835 - warmup_steps: 10000 + warmup_steps: 10000 start_lr: 0.01228 - end_lr: 12.28 + end_lr: 12.28 optimizer: name: AdamW diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_dp.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_dp.yaml index bf5468cd..2583f46f 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_dp.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_dp.yaml @@ -1,6 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu use_amp: True AMP: @@ -17,14 +18,14 @@ AMP: model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -33,13 +34,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -54,10 +57,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -78,11 +83,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_sharding.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_sharding.yaml index a3ff7554..72ff6433 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_sharding.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o1_sharding.yaml @@ -1,6 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu use_amp: True AMP: @@ -14,6 +15,7 @@ AMP: "sigmoid_cross_entropy_with_logits", "elementwise_div"] level: 'O1' +# Make sure your paddle version is develop to use sharding. 
sharding: sharding_stage: 2 offload: False @@ -22,14 +24,14 @@ sharding: model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -38,13 +40,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -59,10 +63,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -83,11 +89,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_dp.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_dp.yaml index 94ae3f3d..d8066c55 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_dp.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_dp.yaml @@ -1,6 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu use_amp: True AMP: @@ -17,14 +18,14 @@ AMP: model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -33,13 +34,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -54,10 +57,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -78,11 +83,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_sharding.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_sharding.yaml index 2073c57c..c905c250 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_sharding.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp16_o2_sharding.yaml @@ -1,6 +1,7 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu use_amp: True AMP: @@ -14,6 +15,7 @@ AMP: 
"sigmoid_cross_entropy_with_logits", "elementwise_div"] level: 'O2' +# Make sure your paddle version is develop to use sharding. sharding: sharding_stage: 2 offload: False @@ -22,14 +24,14 @@ sharding: model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -38,13 +40,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -59,10 +63,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -83,11 +89,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp32_dp.yaml b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp32_dp.yaml index fae2842f..eb19f00d 100644 --- a/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp32_dp.yaml +++ b/configs/vision_transformer/vit-base-p16-pt_in1k-224_4n32c_fp32_dp.yaml @@ -1,18 +1,19 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 16 - embed_dim: 768 - depth: 12 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 16 + embed_dim: 768 + depth: 12 num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + mlp_ratio: 4 + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 @@ -21,13 +22,15 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -42,10 +45,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -66,11 +71,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/vision_transformer/vit-base-p32_ft_in1k-384.yaml b/configs/vision_transformer/vit-base-p32_ft_in1k-384.yaml index 901060fc..23d4c346 100644 --- a/configs/vision_transformer/vit-base-p32_ft_in1k-384.yaml +++ b/configs/vision_transformer/vit-base-p32_ft_in1k-384.yaml @@ -1,37 +1,41 @@ -epochs: 300 +epochs: 300 output_dir: output_dir +seed: 16 +device: gpu model: name: ViTWrapper architecture: - name: VisionTransformer - img_size: 384 - 
patch_size: 32 - width: 768 - depth: 12 - num_heads: 12 - mlp_ratio: 4 - qkv_bias: True + name: VisionTransformer + img_size: 384 + patch_size: 32 + width: 768 + depth: 12 + num_heads: 12 + mlp_ratio: 4 + qkv_bias: True head: name: VisionTransformerClsHead num_classes: 1000 - in_channels: 768 + in_channels: 768 dataloader: train: - num_workers: 0 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 128 + batch_size: 128 shuffle: true drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: - name: ToRGB - name: RandomResizedCrop - size: 384 + size: 384 scale: [0.75, 1.] ratio: [1., 1.] interpolation: 'bicubic' @@ -41,12 +45,12 @@ dataloader: std: [127.5, 127.5, 127.5] lr_scheduler: - name: CosineWarmup - learning_rate: 0.003 + name: CosineWarmup + learning_rate: 0.003 T_max: 93835 - warmup_steps: 10000 + warmup_steps: 10000 start_lr: 0.00003 - end_lr: 0.003 + end_lr: 0.003 optimizer: name: AdamW diff --git a/configs/vision_transformer/vit-g-p14-pt_in1k-224_1n8c.yaml b/configs/vision_transformer/vit-g-p14-pt_in1k-224_1n8c.yaml index 8554a48f..f41d6bfc 100644 --- a/configs/vision_transformer/vit-g-p14-pt_in1k-224_1n8c.yaml +++ b/configs/vision_transformer/vit-g-p14-pt_in1k-224_1n8c.yaml @@ -1,30 +1,32 @@ -epochs: 300 +epochs: 300 output_dir: output_dir seed: 2021 +device: gpu model: name: ViTWrapper architecture: - name: GoogleVisionTransformer - img_size: 224 - patch_size: 14 - embed_dim: 1664 + name: GoogleVisionTransformer + img_size: 224 + patch_size: 14 + embed_dim: 1664 depth: 48 num_heads: 16 mlp_ratio: 4.9231 - qkv_bias: True + qkv_bias: True epsilon: 1e-6 class_num: 1000 drop_rate: 0.1 representation_size: 768 label_smoothing: 0.0001 - + +# Make sure your paddle version is develop to use sharding. 
sharding: sharding_stage: 2 - offload: False - accumulate_grad: False + offload: False + accumulate_grad: False -use_amp: True +use_amp: True AMP: level: 'O2' save_dtype: 'float32' @@ -38,13 +40,15 @@ AMP: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: - batch_size: 16 + batch_size: 16 shuffle: True drop_last: True dataset: - name: ImageNet + name: ImageNet dataroot: data/ILSVRC2012/train/ return_label: True transforms: @@ -59,10 +63,12 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 256 shuffle: False @@ -83,11 +89,11 @@ dataloader: - name: Normalize data_format: 'HWC' mean: [127.5, 127.5, 127.5] - std: [127.5, 127.5, 127.5] + std: [127.5, 127.5, 127.5] - name: Transpose lr_scheduler: - name: ViTLRScheduler + name: ViTLRScheduler learning_rate: 3e-3 decay_type: cosine warmup_steps: 10000 diff --git a/configs/xcit/xcit_large_24_p16_224.yaml b/configs/xcit/xcit_large_24_p16_224.yaml index 54372a26..a479c215 100755 --- a/configs/xcit/xcit_large_24_p16_224.yaml +++ b/configs/xcit/xcit_large_24_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 16 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_large_24_p8_224.yaml b/configs/xcit/xcit_large_24_p8_224.yaml index 4e5df1c2..0c95f65e 100755 --- a/configs/xcit/xcit_large_24_p8_224.yaml +++ b/configs/xcit/xcit_large_24_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 16 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_large_24_p8_224_dist.yaml b/configs/xcit/xcit_large_24_p8_224_dist.yaml index 995a14bf..3afc2b3f 100755 --- a/configs/xcit/xcit_large_24_p8_224_dist.yaml +++ b/configs/xcit/xcit_large_24_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_medium_24_p16_224.yaml b/configs/xcit/xcit_medium_24_p16_224.yaml index b0469af1..86e4e68f 100755 --- a/configs/xcit/xcit_medium_24_p16_224.yaml +++ b/configs/xcit/xcit_medium_24_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + 
loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 16 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_medium_24_p8_224.yaml b/configs/xcit/xcit_medium_24_p8_224.yaml index 1b886155..a908a65f 100755 --- a/configs/xcit/xcit_medium_24_p8_224.yaml +++ b/configs/xcit/xcit_medium_24_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 16 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_medium_24_p8_224_dist.yaml b/configs/xcit/xcit_medium_24_p8_224_dist.yaml index 8cd14a50..ee815158 100755 --- a/configs/xcit/xcit_medium_24_p8_224_dist.yaml +++ b/configs/xcit/xcit_medium_24_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_nano_12_p16_224.yaml b/configs/xcit/xcit_nano_12_p16_224.yaml index 77728d19..b33f9909 100755 --- a/configs/xcit/xcit_nano_12_p16_224.yaml +++ b/configs/xcit/xcit_nano_12_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_nano_12_p8_224.yaml b/configs/xcit/xcit_nano_12_p8_224.yaml index 32436ab6..561fe541 100755 --- a/configs/xcit/xcit_nano_12_p8_224.yaml +++ b/configs/xcit/xcit_nano_12_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_nano_12_p8_224_dist.yaml b/configs/xcit/xcit_nano_12_p8_224_dist.yaml index 3393e0d2..e1702f9a 100755 --- a/configs/xcit/xcit_nano_12_p8_224_dist.yaml +++ b/configs/xcit/xcit_nano_12_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -39,7 +40,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -49,7 +50,9 @@ model: dataloader: 
train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -84,7 +87,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_small_12_p16_224.yaml b/configs/xcit/xcit_small_12_p16_224.yaml index 41cfde05..f2593217 100755 --- a/configs/xcit/xcit_small_12_p16_224.yaml +++ b/configs/xcit/xcit_small_12_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_small_12_p8_224.yaml b/configs/xcit/xcit_small_12_p8_224.yaml index fb9761c7..6c95d5dd 100755 --- a/configs/xcit/xcit_small_12_p8_224.yaml +++ b/configs/xcit/xcit_small_12_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_small_12_p8_224_dist.yaml b/configs/xcit/xcit_small_12_p8_224_dist.yaml index f792496f..538b0e3f 100755 --- a/configs/xcit/xcit_small_12_p8_224_dist.yaml +++ b/configs/xcit/xcit_small_12_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_small_24_p16_224.yaml b/configs/xcit/xcit_small_24_p16_224.yaml index cdc9c495..8e379e26 100755 --- a/configs/xcit/xcit_small_24_p16_224.yaml +++ b/configs/xcit/xcit_small_24_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_small_24_p8_224.yaml b/configs/xcit/xcit_small_24_p8_224.yaml index eca56782..9ba9b579 100755 --- a/configs/xcit/xcit_small_24_p8_224.yaml +++ b/configs/xcit/xcit_small_24_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 32 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 
'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_small_24_p8_224_dist.yaml b/configs/xcit/xcit_small_24_p8_224_dist.yaml index f792496f..538b0e3f 100755 --- a/configs/xcit/xcit_small_24_p8_224_dist.yaml +++ b/configs/xcit/xcit_small_24_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_tiny_12_p16_224.yaml b/configs/xcit/xcit_tiny_12_p16_224.yaml index f2f96680..c834b468 100755 --- a/configs/xcit/xcit_tiny_12_p16_224.yaml +++ b/configs/xcit/xcit_tiny_12_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_tiny_12_p8_224.yaml b/configs/xcit/xcit_tiny_12_p8_224.yaml index 056b96a5..3de640a2 100755 --- a/configs/xcit/xcit_tiny_12_p8_224.yaml +++ b/configs/xcit/xcit_tiny_12_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_tiny_12_p8_224_dist.yaml b/configs/xcit/xcit_tiny_12_p8_224_dist.yaml index 28706276..ec62efba 100755 --- a/configs/xcit/xcit_tiny_12_p8_224_dist.yaml +++ b/configs/xcit/xcit_tiny_12_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -39,7 +40,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -49,7 +50,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -84,7 +87,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/configs/xcit/xcit_tiny_24_p16_224.yaml b/configs/xcit/xcit_tiny_24_p16_224.yaml index 54da3860..de6efbb9 100755 --- a/configs/xcit/xcit_tiny_24_p16_224.yaml +++ b/configs/xcit/xcit_tiny_24_p16_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True @@ -55,7 +58,9 @@ 
dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_tiny_24_p8_224.yaml b/configs/xcit/xcit_tiny_24_p8_224.yaml index bb9c3fcc..ee3572fd 100755 --- a/configs/xcit/xcit_tiny_24_p8_224.yaml +++ b/configs/xcit/xcit_tiny_24_p8_224.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: SwinWrapper @@ -20,7 +21,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 64 shuffle: True @@ -55,7 +58,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: False diff --git a/configs/xcit/xcit_tiny_24_p8_224_dist.yaml b/configs/xcit/xcit_tiny_24_p8_224_dist.yaml index 1fe78db0..9f961164 100755 --- a/configs/xcit/xcit_tiny_24_p8_224_dist.yaml +++ b/configs/xcit/xcit_tiny_24_p8_224_dist.yaml @@ -1,6 +1,7 @@ epochs: 400 output_dir: output_dir seed: 0 +device: gpu model: name: DistillationWrapper @@ -38,7 +39,7 @@ model: pretrained_list: - regnety_160.pdparams - null - freeze_params_list: + freeze_params_list: - True - False infer_model_key: Student @@ -48,7 +49,9 @@ model: dataloader: train: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: True @@ -83,7 +86,9 @@ dataloader: mode: 'batch' cutmix_alpha: 1.0 val: - num_workers: 8 + loader: + num_workers: 8 + use_shared_memory: True sampler: batch_size: 128 shuffle: false diff --git a/passl/datasets/__init__.py b/passl/datasets/__init__.py index 41ecaaed..1a9a9dbc 100644 --- a/passl/datasets/__init__.py +++ b/passl/datasets/__init__.py @@ -14,6 +14,7 @@ from .imagenet import ImageNet from .imagenet import ImageNetCoord +from .beitdataset import BEiT_ImageNet from .cifar import CIFAR10, CIFAR100 from .textimagedataset import TextImageDataset diff --git a/passl/datasets/beitdataset.py b/passl/datasets/beitdataset.py new file mode 100644 index 00000000..269596e8 --- /dev/null +++ b/passl/datasets/beitdataset.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
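+# BEiT_ImageNet: dataset for BEiT masked image modeling pre-training. Each sample yields +# two augmented views of one image (a patch-encoder view and a d-VAE visual-token view) +# together with a random block-wise mask produced by MaskingGenerator.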
+ +import paddle +from .folder import DatasetFolder + +from .preprocess import build_transforms, MaskingGenerator +from .builder import DATASETS +from ..utils.misc import accuracy + + +@DATASETS.register() +class BEiT_ImageNet(DatasetFolder): + cls_filter = None + + def __init__(self, + dataroot, + common_transforms=None, + patch_transforms=None, + visual_token_transforms=None, + masking_generator=None): + super(BEiT_ImageNet, self).__init__(dataroot, + cls_filter=self.cls_filter) + + self.common_transform = build_transforms(common_transforms) + self.patch_transform = build_transforms(patch_transforms) + self.visual_token_transform = build_transforms(visual_token_transforms) + self.masked_position_generator = MaskingGenerator(**masking_generator) + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (patches, visual_tokens, masked_positions) - the patch-transformed view, the visual-token view and the block-wise mask for the sampled image. + """ + path, target = self.samples[index] + # Only used for debugging the data augmentation module. + #path = 'data/ILSVRC2012/train/n13040303/n13040303_1206.jpeg' + #target = 14 + sample = self.loader(path) + for_patches, for_visual_tokens = self.common_transform(sample) + return \ + self.patch_transform(for_patches), \ + self.visual_token_transform(for_visual_tokens), \ + self.masked_position_generator() + + def evaluate(self, preds, labels, topk=(1, 5)): + + eval_res = {} + eval_res['acc1'], eval_res['acc5'] = accuracy(preds, labels, topk) + + return eval_res diff --git a/passl/datasets/builder.py b/passl/datasets/builder.py index 9c4ec62a..6d6e0004 100644 --- a/passl/datasets/builder.py +++ b/passl/datasets/builder.py @@ -24,11 +24,11 @@ DATASETS = Registry("DATASET") + class DistributedRepeatedAugSampler(DistributedBatchSampler): """ based on https://github.com/facebookresearch/deit/blob/main/samplers.py """ - def __init__(self, dataset, batch_size, @@ -36,10 +36,10 @@ def __init__(self, rank=None, shuffle=False, drop_last=False): - super(DistributedRepeatedAugSampler, self).__init__( - dataset, batch_size, num_replicas, rank, shuffle, drop_last) - self.num_samples = int( - math.ceil(len(self.dataset) * 3.0 / self.nranks)) + super(DistributedRepeatedAugSampler, + self).__init__(dataset, batch_size, num_replicas, rank, shuffle, + drop_last) + self.num_samples = int(math.ceil(len(self.dataset) * 3.0 / self.nranks)) self.total_size = self.num_samples * self.nranks self.num_selected_samples = int( math.floor(len(self.dataset) // 256 * 256 / self.nranks)) @@ -79,8 +79,9 @@ def build_dataset(cfg): return build_from_config(cfg, DATASETS) -def build_dataloader(cfg): +def build_dataloader(cfg, device): cfg_ = copy.deepcopy(cfg) + loader_cfg = cfg_.pop('loader') dataset_cfg = cfg_.pop('dataset') sampler_cfg = cfg_.pop('sampler') @@ -90,10 +91,13 @@ def build_dataloader(cfg): dataset = build_dataset(dataset_cfg) sampler_name = sampler_cfg.pop('name', 'DistributedBatchSampler') - + sampler = eval("{}".format(sampler_name))(dataset, **sampler_cfg) - dataloader = paddle.io.DataLoader(dataset, batch_sampler=sampler, **cfg_) + dataloader = paddle.io.DataLoader(dataset, + batch_sampler=sampler, + places=device, + **loader_cfg) #setup mixup / cutmix mixup_fn = None diff --git a/passl/datasets/preprocess/__init__.py b/passl/datasets/preprocess/__init__.py index 693a1289..b1280179 100644 --- a/passl/datasets/preprocess/__init__.py +++ b/passl/datasets/preprocess/__init__.py @@ -13,4 +13,5 @@ # limitations under the License.
from .builder import build_transform, build_transforms +from .masking_generator import MaskingGenerator from .transforms import RandomApply, RandomGrayscale, GaussianBlur, Solarization diff --git a/passl/datasets/preprocess/masking_generator.py b/passl/datasets/preprocess/masking_generator.py new file mode 100644 index 00000000..8e3fbc90 --- /dev/null +++ b/passl/datasets/preprocess/masking_generator.py @@ -0,0 +1,107 @@ +# --------------------------------------------------------' +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -------------------------------------------------------- +# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) +# Github source: https://github.com/microsoft/unilm/tree/master/beit +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] + +import random +import math +import numpy as np + + +class MaskingGenerator(object): + def __init__(self, + input_size, + num_masking_patches, + min_num_patches=4, + max_num_patches=None, + min_aspect=0.3, + max_aspect=None): + if not isinstance(input_size, tuple): + input_size = (input_size, ) * 2 + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.num_masking_patches = num_masking_patches + + self.min_num_patches = min_num_patches + self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches + + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + + def get_shape(self): + return self.height, self.width + + def _mask(self, mask, max_mask_patches): + delta = 0 + for attempt in range(10): + target_area = random.uniform(self.min_num_patches, max_mask_patches) + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < self.width and h < self.height: + top = random.randint(0, self.height - h) + left = random.randint(0, self.width - w) + + num_masked = mask[top:top + h, left:left + w].sum() + # Overlap + if 0 < h * w - num_masked <= max_mask_patches: + for i in range(top, top + h): + for j in range(left, left + w): + if mask[i, j] == 0: + mask[i, j] = 1 + delta += 1 + + if delta > 0: + break + return delta + + def __call__(self): + mask = np.zeros(shape=self.get_shape(), dtype=np.int) + mask_count = 0 + while mask_count < self.num_masking_patches: + max_mask_patches = self.num_masking_patches - mask_count + max_mask_patches = min(max_mask_patches, self.max_num_patches) + + delta = self._mask(mask, max_mask_patches) + if delta == 0: + break + else: + mask_count += delta + + return mask + + +class RandomMaskingGenerator(object): + def __init__(self, input_size, mask_ratio): + if not isinstance(input_size, tuple): + input_size = (input_size, ) * 2 + + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.num_mask = 
int(mask_ratio * self.num_patches) + + def __call__(self): + mask = np.hstack([ + np.zeros(self.num_patches - self.num_mask), + np.ones(self.num_mask), + ]) + np.random.shuffle(mask) + return mask # [196] diff --git a/passl/datasets/preprocess/transforms.py b/passl/datasets/preprocess/transforms.py index 6ba3ed0a..a717e32a 100644 --- a/passl/datasets/preprocess/transforms.py +++ b/passl/datasets/preprocess/transforms.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import cv2 import math import random -from PIL import ImageFilter, Image, ImageOps -import cv2 +import warnings import numpy as np from functools import partial +from PIL import ImageFilter, Image, ImageOps import paddle import paddle.vision.transforms as PT @@ -26,6 +27,7 @@ from .mixup import Mixup from .builder import TRANSFORMS, build_transform from .random_erasing import RandomErasing +from .masking_generator import MaskingGenerator, RandomMaskingGenerator from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT from .auto_augment import rand_augment_transform, augment_and_mix_transform, auto_augment_transform from .cv2_trans import ByolRandomHorizontalFlip, ByolColorJitter, ByolRandomGrayscale, ByolNormalize, \ @@ -41,6 +43,7 @@ TRANSFORMS.register(PT.CenterCrop) TRANSFORMS.register(PT.ToTensor) +# BYOL Augmentation TRANSFORMS.register(ByolRandomHorizontalFlip) TRANSFORMS.register(ByolColorJitter) TRANSFORMS.register(ByolRandomGrayscale) @@ -53,9 +56,16 @@ TRANSFORMS.register(RandomErasing) TRANSFORMS.register(Mixup) +# PixPro TRANSFORMS.register(RandomResizedCropCoord) TRANSFORMS.register(RandomHorizontalFlipCoord) +# BEiT +TRANSFORMS.register(MaskingGenerator) + +_RANDOM_INTERPOLATION = ('bilinear', 'bicubic') + + @TRANSFORMS.register() class Clip(): def __init__(self, min_val=0.0, max_val=1.0): @@ -252,7 +262,7 @@ def _apply_image(self, img): if not is_pil: img = np.asarray(img) return img - + class UnifiedResize(object): """ @@ -292,20 +302,17 @@ def _pil_resize(src, size, resample): interpolation = _pil_interp_from_str[interpolation.lower()] self.resize_func = partial(_pil_resize, resample=interpolation) else: - logger.warning( - f"The backend of Resize only support \"cv2\" or \"PIL\". \"f{backend}\" is unavailable. Use \"cv2\" instead." - ) self.resize_func = cv2.resize def __call__(self, src, size): return self.resize_func(src, size) - + + @TRANSFORMS.register() class RandCropImage(object): """ random crop image https://github.com/PaddlePaddle/PaddleClas/blob/release/2.3/ppcls/data/preprocess/ops/operators.py """ - def __init__(self, size, scale=None, @@ -320,8 +327,8 @@ def __init__(self, self.scale = [0.08, 1.0] if scale is None else scale self.ratio = [3. / 4., 4. / 3.] 
if ratio is None else ratio - self._resize_func = UnifiedResize( - interpolation=interpolation, backend=backend) + self._resize_func = UnifiedResize(interpolation=interpolation, + backend=backend) def __call__(self, img): size = self.size @@ -350,13 +357,13 @@ def __call__(self, img): img = img[j:j + h, i:i + w, :] return self._resize_func(img, size) - + + @TRANSFORMS.register() class ResizeImage(object): """ resize image https://github.com/PaddlePaddle/PaddleClas/blob/release/2.3/ppcls/data/preprocess/ops/operators.py """ - def __init__(self, size=None, resize_short=None, @@ -371,11 +378,11 @@ def __init__(self, self.w = size if type(size) is int else size[0] self.h = size if type(size) is int else size[1] else: - raise OperatorParamError("invalid params for ReisizeImage for '\ + raise ValueError("invalid params for ReisizeImage for '\ 'both 'size' and 'resize_short' are None") - self._resize_func = UnifiedResize( - interpolation=interpolation, backend=backend) + self._resize_func = UnifiedResize(interpolation=interpolation, + backend=backend) def __call__(self, img): img_h, img_w = img.shape[:2] @@ -398,7 +405,7 @@ class NormalizeImage(PT.Normalize): scale (float): Normalize input value to [0, 1]. mean (int|float|list|tuple): Sequence of means for each channel. std (int|float|list|tuple): Sequence of standard deviations for each channel. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. to_rgb (bool, optional): Whether to convert to rgb. Default: False. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. @@ -411,7 +418,7 @@ class NormalizeImage(PT.Normalize): A callable object of Normalize. Examples: - + .. code-block:: python import numpy as np @@ -419,7 +426,7 @@ class NormalizeImage(PT.Normalize): from paddle.vision.transforms import Normalize normalize = NormalizeImage(scale=1./255., - mean=[127.5, 127.5, 127.5], + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], data_format='HWC') @@ -428,9 +435,8 @@ class NormalizeImage(PT.Normalize): fake_img = normalize(fake_img) print(fake_img.shape) print(fake_img.max, fake_img.max) - - """ + """ def __init__(self, scale=None, mean=0.0, @@ -441,11 +447,154 @@ def __init__(self, keys=None): super(NormalizeImage, self).__init__(mean=mean, std=std, keys=keys) self.scale = eval(scale) - self.dtype = dtype + self.dtype = dtype def _apply_image(self, img): if self.scale is not None: img = img * self.scale img = F.normalize(img, self.mean, self.std, self.data_format, - self.to_rgb) + self.to_rgb) return img.astype(self.dtype) + + +@TRANSFORMS.register() +class RandomResizedCropAndInterpolationWithTwoPic(PT.RandomResizedCrop): + """Crop the given PIL Image to random size and aspect ratio with random interpolation. + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. + This crop is finally resized to given size. + This is popularly used to train the Inception networks. + + Args: + size: expected output size of each edge + second size: second expected output size of each edge + scale: range of size of the origin size cropped + ratio: range of aspect ratio of the origin aspect ratio cropped + interpolation: Default: PIL.Image.BILINEAR + second_interpolation: Default: PIL.Image.LANCZOS + """ + def __init__(self, + size, + second_size=None, + scale=(0.08, 1.0), + ratio=(3. / 4., 4. 
/ 3.), + interpolation='bilinear', + second_interpolation='lanczos', + keys=None): + super(RandomResizedCropAndInterpolationWithTwoPic, self).__init__(keys) + if isinstance(size, list): + self.size = size + else: + self.size = [size, size] + if second_size is not None: + if isinstance(second_size, list): + self.second_size = second_size + else: + self.second_size = [second_size, second_size] + else: + self.second_size = None + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + warnings.warn("range should be of kind (min, max)") + + if interpolation == 'random': + self.interpolation = _RANDOM_INTERPOLATION + else: + self.interpolation = interpolation + self.second_interpolation = second_interpolation + self.scale = scale + self.ratio = ratio + + def get_params(self, img, scale, ratio): + """Get parameters for ``crop`` for a random sized crop. + + Args: + img (PIL Image): Image to be cropped. + scale (tuple): range of size of the origin size cropped + ratio (tuple): range of aspect ratio of the origin aspect ratio cropped + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. + """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def _apply_image(self, img): + """ + Args: + img (PIL Image): Image to be cropped and resized. + + Returns: + PIL Image: Randomly cropped and resized image. 
+ """ + i, j, h, w = self.get_params(img, self.scale, self.ratio) + if isinstance(self.interpolation, (tuple, list)): + interpolation = random.choice(self.interpolation) + else: + interpolation = self.interpolation + cropped_img = F.crop(img, i, j, h, w) + if self.second_size is None: + return F.resize(cropped_img, self.size, interpolation) + else: + return F.resize(img, self.size, interpolation), \ + F.resize(img, self.second_size, self.second_interpolation) + + +@TRANSFORMS.register() +class VisualTokenMap(object): + def __init__(self, mode='map_pixel', scale=None): + self.mode = mode + self.scale = scale + self.logit_laplace_eps = 0.1 + + def map_pixels(self, x): + if self.scale is not None: + try: + x = paddle.to_tensor(x).astype('float32') / self.scale + except: + import pdb + + return (1 - 2 * self.logit_laplace_eps) * x + self.logit_laplace_eps + + def unmap_pixels(self, x): + if len(x.shape) != 4: + raise ValueError('expected input to be 4d') + if x.dtype != paddle.float32: + raise ValueError('expected input to have type float') + + return paddle.clamp( + (x - self.logit_laplace_eps) / (1 - 2 * self.logit_laplace_eps), 0, + 1) + + def __call__(self, x): + if self.mode == "map_pixels": + return self.map_pixels(x) + elif self.mode == "unmap_pixels": + return self.unmap_pixels(x) diff --git a/passl/engine/trainer.py b/passl/engine/trainer.py index d0373035..bf3ab04a 100644 --- a/passl/engine/trainer.py +++ b/passl/engine/trainer.py @@ -106,6 +106,12 @@ def __init__(self, cfg): np.random.seed(seed) random.seed(seed) + # set device + assert cfg['device'] in ['cpu', 'gpu', 'xpu', 'npu'] + self.device = paddle.set_device(cfg['device']) + self.logger.info('train with paddle {} on {} device'.format( + paddle.__version__, self.device)) + self.start_epoch = 0 self.current_epoch = 0 self.current_iter = 0 @@ -133,7 +139,7 @@ def __init__(self, cfg): # build train dataloader self.train_dataloader, self.mixup_fn = build_dataloader( - cfg.dataloader.train) + cfg.dataloader.train, self.device) self.iters_per_epoch = len(self.train_dataloader) # use byol iters @@ -168,7 +174,11 @@ def __init__(self, cfg): mp_rank = hcg.get_model_parallel_rank() pp_rank = hcg.get_stage_id() dp_rank = hcg.get_data_parallel_rank() - set_hyrbid_parallel_seed(seed, 0, mp_rank, pp_rank) + set_hyrbid_parallel_seed(seed, + 0, + mp_rank, + pp_rank, + device=self.device) # amp training self.use_amp = cfg.get('use_amp', @@ -324,7 +334,7 @@ def train(self): def val(self, **kargs): if not hasattr(self, 'val_dataloader'): self.val_dataloader, mixup_fn = build_dataloader( - self.cfg.dataloader.val) + self.cfg.dataloader.val, self.device) self.logger.info( 'start evaluate on epoch {} ..'.format(self.current_epoch + 1)) diff --git a/passl/modeling/architectures/BEiTWrapper.py b/passl/modeling/architectures/BEiTWrapper.py new file mode 100644 index 00000000..0a886c58 --- /dev/null +++ b/passl/modeling/architectures/BEiTWrapper.py @@ -0,0 +1,187 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist + +from .builder import MODELS +from .builder import create_d_vae +from ..heads import build_head +from ..backbones import build_backbone + + +@MODELS.register() +class BEiTWrapper(nn.Layer): + def __init__(self, architecture=None, head=None): + """A wrapper for a BEiT supervised model. + + Args: + architecture (dict): A dictionary containing the BEiT instantiation parameters. + """ + super().__init__() + + self.backbone = build_backbone(architecture) + self.automatic_optimization = False + self.head = build_head(head) + + def backbone_forward(self, x): + x = self.backbone(x) + return x + + def train_iter(self, *inputs, **kwargs): + img, label = inputs + cls_token = self.backbone_forward(img) + outs = self.head(cls_token) + loss_inputs = (outs, label) + outputs = self.head.loss(*loss_inputs) + return outputs + + def forward(self, *inputs, mode='train', **kwargs): + if mode == 'train': + return self.train_iter(*inputs, **kwargs) + elif mode == 'test': + return self.test_iter(*inputs, **kwargs) + elif mode == 'extract': + return self.backbone(*inputs) + else: + raise Exception("No such mode: {}".format(mode)) + + def validation_step(self, val_batch, idx): + image, text = val_batch + image_logits, text_logits = self.forward(image, text) + ground_truth = paddle.arange(len(image_logits)) + loss = (self.image_loss(image_logits, ground_truth) + + self.text_loss(text_logits, ground_truth)).div(2) + self.log('val_loss', loss) + + +@MODELS.register() +class BEiTPTWrapper(nn.Layer): + def __init__(self, architecture=None, head=None, d_vae=None): + """A wrapper for a BEiT Pretrain. + + Args: + architecture (dict): A dictionary containing the BEiT instantiation parameters. + """ + super().__init__() + + self.backbone = build_backbone(architecture) + self.automatic_optimization = False + self.head = build_head(head) + with paddle.no_grad(): + self.d_vae = create_d_vae(d_vae) + + def get_codebook_indices(self, images): + with paddle.no_grad(): + logits = self.d_vae.encoder(images) + codebook_indices = logits.argmax(axis=1) + return codebook_indices + + def backbone_forward(self, + x, + bool_masked_pos=None, + return_all_tokens=False): + x = self.backbone(x, + bool_masked_pos=bool_masked_pos, + return_all_tokens=return_all_tokens) + return x + + def train_iter(self, *inputs, **kwargs): + samples, images, bool_masked_pos = inputs + + with paddle.no_grad(): + input_ids = self.get_codebook_indices(images).flatten(1) + bool_masked_pos = bool_masked_pos.flatten(1).astype( + 'bool') # to bool. 
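+            # the mask covers the ViT patch grid (14x14 for 224px inputs with patch size 16),
+            # which matches the token grid the d-VAE produces for its lower-resolution view
+            # (the encoder downsamples by 8, e.g. 112 / 8 = 14), so the same flattened mask can
+            # select token ids below and mark patch embeddings inside the backbone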
+ labels = input_ids[bool_masked_pos] + + outputs = self.backbone_forward(samples, + bool_masked_pos=bool_masked_pos, + return_all_tokens=False) + loss = self.head(outputs, labels) + return loss + + def test_iter(self, *inputs, **kwargs): + with paddle.no_grad(): + img, label = inputs + x = self.backbone_forward(img) + outs = self.head(x) + + return outs + + def forward(self, *inputs, mode='train', **kwargs): + if mode == 'train': + return self.train_iter(*inputs, **kwargs) + elif mode == 'test': + return self.test_iter(*inputs, **kwargs) + elif mode == 'extract': + return self.backbone(*inputs) + else: + raise Exception("No such mode: {}".format(mode)) + + def validation_step(self, val_batch, idx): + image, text = val_batch + image_logits, text_logits = self.forward(image, text) + ground_truth = paddle.arange(len(image_logits)) + loss = (self.image_loss(image_logits, ground_truth) + + self.text_loss(text_logits, ground_truth)).div(2) + self.log('val_loss', loss) + + +@MODELS.register() +class BEiTFTWrapper(nn.Layer): + def __init__(self, architecture=None, head=None): + """A wrapper for a BEiT Finetune. + + Args: + architecture (dict): A dictionary containing the BEiT instantiation parameters. + """ + super().__init__() + self.backbone = build_backbone(architecture) + self.head = build_head(head) + + def backbone_forward(self, x): + x = self.backbone(x) + return x + + def train_iter(self, *inputs, **kwargs): + img, label = inputs + mixup_fn = kwargs['mixup_fn'] + if mixup_fn is not None: + img, label = mixup_fn(img, label) + + x = self.backbone_forward(img) + outputs = self.head(x) + outputs = self.head.loss(outputs, label) + return outputs + + def test_iter(self, *inputs, **kwargs): + with paddle.no_grad(): + img, _ = inputs + x = self.backbone_forward(img) + outs = self.head(x) + return outs # self.head.loss(outs, label, soft=False) + + def forward(self, *inputs, mode='train', **kwargs): + if mode == 'train': + return self.train_iter(*inputs, **kwargs) + elif mode == 'test': + return self.test_iter(*inputs, **kwargs) + elif mode == 'extract': + return self.backbone(*inputs) + else: + raise Exception("No such mode: {}".format(mode)) diff --git a/passl/modeling/architectures/BeitWrapper.py b/passl/modeling/architectures/BeitWrapper.py deleted file mode 100644 index 7c968ff9..00000000 --- a/passl/modeling/architectures/BeitWrapper.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import paddle.distributed as dist - -from ..backbones import build_backbone -from ..heads import build_head -from .builder import MODELS - - -@MODELS.register() -class BeitWrapper(nn.Layer): - def __init__(self, architecture=None, head=None): - """A wrapper for a ViT model as specified in the paper. - - Args: - architecture (dict): A dictionary containing the ViT instantiation parameters. 
- """ - super().__init__() - - self.backbone = build_backbone(architecture) - self.automatic_optimization = False - self.head = build_head(head) - - def backbone_forward(self, x): - x = self.backbone(x) - return x - - def train_iter(self, *inputs, **kwargs): - img, label = inputs - cls_token = self.backbone_forward(img) - outs = self.head(cls_token) - loss_inputs = (outs, label) - outputs = self.head.loss(*loss_inputs) - return outputs - - def forward(self, *inputs, mode='train', **kwargs): - if mode == 'train': - return self.train_iter(*inputs, **kwargs) - elif mode == 'test': - return self.test_iter(*inputs, **kwargs) - elif mode == 'extract': - return self.backbone(*inputs) - else: - raise Exception("No such mode: {}".format(mode)) - - def validation_step(self, val_batch, idx): - image, text = val_batch - image_logits, text_logits = self.forward(image, text) - ground_truth = paddle.arange(len(image_logits)) - loss = (self.image_loss(image_logits, ground_truth) + - self.text_loss(text_logits, ground_truth)).div(2) - self.log('val_loss', loss) diff --git a/passl/modeling/architectures/__init__.py b/passl/modeling/architectures/__init__.py index 4eb0338d..9d2356ff 100644 --- a/passl/modeling/architectures/__init__.py +++ b/passl/modeling/architectures/__init__.py @@ -12,26 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .moco import MoCo +from .builder import build_model +from .byol_clas import ByolClassification from .clas import Classification -from .BYOL import BYOL -from .MoCoBYOL import MoCoBYOL -from .CLIPWrapper import CLIPWrapper +from .moco import MoCo from .simclr import SimCLR -from .byol_clas import ByolClassification -from .ViTWrapper import ViTWrapper -from .SwinWrapper import SwinWrapper -from .builder import build_model - -from .BeitWrapper import BeitWrapper +from .pixpro import PixPro -from .T2TViTWrapper import T2TViTWrapper +from .BEiTWrapper import BEiTWrapper, BEiTPTWrapper, BEiTFTWrapper +from .BYOL import BYOL from .CaiTWrapper import CaiTWrapper -from .MlpMixerWrapper import MlpMixerWrapper +from .CLIPWrapper import CLIPWrapper from .CvTWrapper import CvTWrapper - from .DeiTWrapper import DeiTWrapper -from .pixpro import PixPro - -from .MAE import MAE_PRETRAIN, MAE_FINETUNE from .DistillationWrapper import DistillationWrapper +from .MAE import MAE_PRETRAIN, MAE_FINETUNE +from .MoCoBYOL import MoCoBYOL +from .MlpMixerWrapper import MlpMixerWrapper +from .SwinWrapper import SwinWrapper +from .T2TViTWrapper import T2TViTWrapper +from .ViTWrapper import ViTWrapper diff --git a/passl/modeling/architectures/builder.py b/passl/modeling/architectures/builder.py index 2dfc1ae4..12e477a3 100644 --- a/passl/modeling/architectures/builder.py +++ b/passl/modeling/architectures/builder.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os +import paddle +from copy import deepcopy +from ..backbones.discrete_vae import Dalle_VAE, DiscreteVAE, load_model, Encoder, Decoder from ...utils.registry import Registry, build_from_config MODELS = Registry("MODEL") @@ -19,3 +23,49 @@ def build_model(cfg): return build_from_config(cfg, MODELS) + + +def create_d_vae(cfg): + cfg = deepcopy(cfg) + name = cfg.pop('name') + if name == "dall-e": + return get_dalle_vae(cfg) + elif name == "customized": + return get_d_vae(cfg) + else: + raise NotImplementedError() + + +def get_dalle_vae(cfg): + cfg = deepcopy(cfg) + image_size = cfg.pop('image_size') + weight_path = cfg.pop('weight_path') + with paddle.no_grad(): + vae = Dalle_VAE(image_size) + vae.encoder = load_model('encoder', model_dir=weight_path) + vae.decoder = load_model('decoder', model_dir=weight_path) + return vae + + +def get_d_vae(cfg): + cfg = deepcopy(cfg) + image_size = cfg.pop('image_size') + weight_path = cfg.pop('weight_path') + NUM_TOKENS = 8192 + NUM_LAYERS = 3 + EMB_DIM = 512 + HID_DIM = 256 + + state_dict = paddle.load(os.path.join(weight_path, "pytorch_model.bin"), + map_location="cpu")["weights"] + + model = DiscreteVAE( + image_size=image_size, + num_layers=NUM_LAYERS, + num_tokens=NUM_TOKENS, + codebook_dim=EMB_DIM, + hidden_dim=HID_DIM, + ) + + model.load_state_dict(state_dict) + return model diff --git a/passl/modeling/backbones/__init__.py b/passl/modeling/backbones/__init__.py index 47227db2..c7bf315a 100644 --- a/passl/modeling/backbones/__init__.py +++ b/passl/modeling/backbones/__init__.py @@ -1,18 +1,21 @@ -from .resnet import ResNet -from .clip import CLIP -from .builder import build_backbone -from .resnetcifar import ResNet as ResNetCifar -from .resnetsimclr import ResNetsimclr -from .vision_transformer import VisionTransformer -from .vit import GoogleVisionTransformer -from .swin_transformer import SwinTransformer from .beit import Beit -from .t2t_vit import T2TViT +from .beit_ft import VisionTransformerForFinetune +from .beit_pt import VisionTransformerForMaskedImageModeling +from .builder import build_backbone from .cait import Cait -from .mlp_mixer import MlpMixer +from .clip import CLIP +from .convnext import ConvNeXt from .cvt import CvT from .deit import DeiTVisionTransformer, DistilledVisionTransformer -from .convnext import ConvNeXt +from .discrete_vae import Dalle_VAE, DiscreteVAE from .mae import MAE, MAE_ViT -from .xcit import XCiT +from .mlp_mixer import MlpMixer from .regnet import RegNet +from .resnet import ResNet +from .resnetcifar import ResNet as ResNetCifar +from .resnetsimclr import ResNetsimclr +from .swin_transformer import SwinTransformer +from .t2t_vit import T2TViT +from .vision_transformer import VisionTransformer +from .vit import GoogleVisionTransformer +from .xcit import XCiT diff --git a/passl/modeling/backbones/beit_ft.py b/passl/modeling/backbones/beit_ft.py new file mode 100644 index 00000000..e07cb4c2 --- /dev/null +++ b/passl/modeling/backbones/beit_ft.py @@ -0,0 +1,533 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
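Assuming converted DALL-E encoder/decoder weights are available locally (the directory below is a placeholder, not a path shipped with the repo), the tokenizer built by `create_d_vae` above is used roughly like this; the 8x downsampling of the encoder is what makes the lower-resolution view line up with the backbone's patch grid:

import paddle
from passl.modeling.architectures.builder import create_d_vae

d_vae = create_d_vae({'name': 'dall-e',
                      'image_size': 112,
                      'weight_path': 'path/to/dvae_weights'})  # placeholder path
images = paddle.rand([2, 3, 112, 112])            # the d-VAE view produced by the dataloader
with paddle.no_grad():
    logits = d_vae.encoder(images)                # [2, 8192, 14, 14] codebook logits
    token_ids = logits.argmax(axis=1).flatten(1)  # [2, 196] visual-token targets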
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/microsoft/unilm/tree/master/beit + +import copy +import math +from functools import partial + +import paddle +import torch +from timm.models.layers import trunc_normal_ as trunc_normal__ +import paddle.nn as nn +import paddle.nn.functional as F + +from .builder import BACKBONES + +__all__ = ["VisionTransformerForMaskedImageModeling"] + +trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) +zeros_ = nn.initializer.Constant(value=0.0) +ones_ = nn.initializer.Constant(value=1.0) + + +def drop_path(x, drop_prob=0.0, training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Layer): + """MLP module + MLP using nn.Linear and activation is GELU, dropout is applied. + Ops: fc1 -> act -> dropout -> fc2 -> dropout + """ + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class PatchEmbed(nn.Layer): + """2D Image to Patch Embedding + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. + """ + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + ): + super().__init__() + img_size = (img_size, img_size) + patch_size = (patch_size, patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2D(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else Identity() + + def forward(self, x): + B, C, H, W = x.shape + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})" + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose((0, 2, 1)) # BCHW -> BNC + x = self.norm(x) + return x + + +class Identity(nn.Layer): + """Identity layer + The output of this layer is the input without any change. 
+ Use this layer to avoid if condition in some forward methods + """ + def __init__(self): + super().__init__() + + def forward(self, inputs): + return inputs + + +class Attention(nn.Layer): + """Attention Layer""" + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = head_dim**-0.5 if qk_scale is None else qk_scale + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias_attr=False) + if qkv_bias: + self.q_bias = paddle.create_parameter(shape=[all_head_dim], + dtype="float32", + default_initializer=zeros_) + + self.v_bias = paddle.create_parameter(shape=[all_head_dim], + dtype="float32", + default_initializer=zeros_) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - + 1) * (2 * window_size[1] - 1) + 3 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[self.num_relative_distance, num_heads], + dtype="float32", + default_initializer=zeros_, + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze( + axis=1) # 2, Wh*Ww, Wh*Ww #?? + relative_coords = relative_coords.transpose([1, 2, + 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = paddle.zeros( + [ + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1, + ], + dtype=relative_coords.dtype, + ) + # Wh*Ww, Wh*Ww + relative_position_index[1:, 1:] = relative_coords.sum(-1) + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = paddle.concat( + (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) + + qkv = F.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) + + qkv = qkv.reshape([B, N, 3, self.num_heads, + -1]).transpose([2, 0, 3, 1, 4]) + # make torchscript happy (cannot use tensor as tuple) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + + attn = q @ k.transpose([0, 1, 3, 2]) + + if self.relative_position_bias_table is not None: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + 
]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + attn = attn + relative_position_bias.unsqueeze(axis=0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, -1]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size, + attn_head_dim=attn_head_dim, + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if init_values: + self.gamma_1 = paddle.create_parameter( + shape=[dim], + dtype="float32", + default_initializer=nn.initializer.Constant(value=init_values), + ) + self.gamma_2 = paddle.create_parameter( + shape=[dim], + dtype="float32", + default_initializer=nn.initializer.Constant(value=init_values), + ) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias): + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class RelativePositionBias(nn.Layer): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - + 1) * (2 * window_size[1] - 1) + 3 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[self.num_relative_distance, num_heads], + dtype="float32", + default_initializer=zeros_, + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze(axis=1) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, + 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = paddle.zeros([ + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1 + ], + dtype=relative_coords.dtype) + relative_position_index[1:, + 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + 
relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ]) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + +@BACKBONES.register() +class VisionTransformerForFinetune(nn.Layer): + """ BEiT Finetune + This model is mainly used for pretraining ImageNet-22K + code base on https://github.com/microsoft/unilm/blob/master/beit/modeling_finetune.py + """ + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + init_values=None, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + use_mean_pooling=True, + init_scale=0.001): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + wa = torch.ones(size=(1, 1, embed_dim)) + trunc_normal__(wa, std=0.02) + wa = wa.cpu().numpy() + + self.cls_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype="float32", + #default_initializer=wa.cpu().numpy(), + ) + self.cls_token.set_value(wa) + #self.mask_token = paddle.create_parameter( + # shape=[1, 1, embed_dim], + # dtype="float32", + # default_initializer=trunc_normal_, + #) + if use_abs_pos_emb: + self.pos_embed = paddle.create_parameter( + shape=[1, num_patches + 1, embed_dim], + dtype="float32", + default_initializer=trunc_normal_, + ) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.grid_size, num_heads=num_heads) + else: + self.rel_pos_bias = None + + # stochastic depth decay rule + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth)] + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.grid_size + if use_rel_pos_bias else None) for i in range(depth) + ]) + self.norm = Identity() if use_mean_pooling else norm_layer(embed_dim) + self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None + #self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else Identity() + + #trunc_normal_(self.head.weight) + self.apply(self._init_weights) + self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + param.scale(1. 
/ math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, layer_id + 1) + rescale(layer.mlp.fc2.weight, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def get_num_layers(self): + return len(self.blocks) + + def forward_features(self, x): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand([batch_size, -1, -1]) + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias( + ) if self.rel_pos_bias is not None else None + for blk in self.blocks: + x = blk(x, rel_pos_bias=rel_pos_bias) + + x = self.norm(x) + if self.fc_norm is not None: + t = x[:, 1:, :] + return self.fc_norm(t.mean(1)) + else: + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + #x = self.head(x) + return x diff --git a/passl/modeling/backbones/beit_pt.py b/passl/modeling/backbones/beit_pt.py new file mode 100644 index 00000000..af08e9d2 --- /dev/null +++ b/passl/modeling/backbones/beit_pt.py @@ -0,0 +1,534 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/microsoft/unilm/tree/master/beit + +import copy +import math +from functools import partial + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .builder import BACKBONES + +__all__ = ["VisionTransformerForMaskedImageModeling"] + +trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) +zeros_ = nn.initializer.Constant(value=0.0) +ones_ = nn.initializer.Constant(value=1.0) + + +def drop_path(x, drop_prob=0.0, training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Layer): + """MLP module + MLP using nn.Linear and activation is GELU, dropout is applied. + Ops: fc1 -> act -> dropout -> fc2 -> dropout + """ + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class PatchEmbed(nn.Layer): + """2D Image to Patch Embedding + Apply patch embeddings on input images. Embeddings is implemented using a Conv2D op. + """ + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + ): + super().__init__() + img_size = (img_size, img_size) + patch_size = (patch_size, patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2D(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else Identity() + + def forward(self, x): + B, C, H, W = x.shape + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})" + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose((0, 2, 1)) # BCHW -> BNC + x = self.norm(x) + return x + + +class Identity(nn.Layer): + """Identity layer + The output of this layer is the input without any change. 
+ Use this layer to avoid if condition in some forward methods + """ + def __init__(self): + super().__init__() + + def forward(self, inputs): + return inputs + + +class Attention(nn.Layer): + """Attention Layer""" + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = head_dim**-0.5 if qk_scale is None else qk_scale + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias_attr=False) + if qkv_bias: + self.q_bias = paddle.create_parameter(shape=[all_head_dim], + dtype="float32", + default_initializer=zeros_) + + self.v_bias = paddle.create_parameter(shape=[all_head_dim], + dtype="float32", + default_initializer=zeros_) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - + 1) * (2 * window_size[1] - 1) + 3 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[self.num_relative_distance, num_heads], + dtype="float32", + default_initializer=zeros_, + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze( + axis=1) # 2, Wh*Ww, Wh*Ww #?? + relative_coords = relative_coords.transpose([1, 2, + 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = paddle.zeros( + [ + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1, + ], + dtype=relative_coords.dtype, + ) + # Wh*Ww, Wh*Ww + relative_position_index[1:, 1:] = relative_coords.sum(-1) + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = paddle.concat( + (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) + + qkv = F.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) + + qkv = qkv.reshape([B, N, 3, self.num_heads, + -1]).transpose([2, 0, 3, 1, 4]) + # make torchscript happy (cannot use tensor as tuple) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + + attn = q @ k.transpose([0, 1, 3, 2]) + + if self.relative_position_bias_table is not None: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + 
]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + attn = attn + relative_position_bias.unsqueeze(axis=0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose([0, 2, 1, 3]).reshape([B, N, -1]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size, + attn_head_dim=attn_head_dim, + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if init_values: + self.gamma_1 = paddle.create_parameter( + shape=[dim], + dtype="float32", + default_initializer=nn.initializer.Constant(value=init_values), + ) + self.gamma_2 = paddle.create_parameter( + shape=[dim], + dtype="float32", + default_initializer=nn.initializer.Constant(value=init_values), + ) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias): + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class RelativePositionBias(nn.Layer): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - + 1) * (2 * window_size[1] - 1) + 3 + + self.relative_position_bias_table = paddle.create_parameter( + shape=[self.num_relative_distance, num_heads], + dtype="float32", + default_initializer=zeros_, + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten.unsqueeze( + axis=2) - coords_flatten.unsqueeze(axis=1) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, + 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = paddle.zeros([ + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1 + ], + dtype=relative_coords.dtype) + relative_position_index[1:, + 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + 
relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, + -1, + ]) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + +@BACKBONES.register() +class VisionTransformerForMaskedImageModeling(nn.Layer): + """ BEiT Pretrain + This model is mainly used for pretraining ImageNet-22K + code base on https://github.com/microsoft/unilm/blob/master/beit/modeling_pretrain.py + """ + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + vocab_size=8192, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + norm_layer=partial(nn.LayerNorm, epsilon=1e-6), + init_values=None, + attn_head_dim=None, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + init_std=0.02): + super(VisionTransformerForMaskedImageModeling, self).__init__() + # num_features for consistency with other models + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype="float32", + default_initializer=trunc_normal_, + ) + self.mask_token = paddle.create_parameter( + shape=[1, 1, embed_dim], + dtype="float32", + default_initializer=trunc_normal_, + ) + + if use_abs_pos_emb: + self.pos_embed = paddle.create_parameter( + shape=[1, num_patches + 1, embed_dim], + dtype="float32", + default_initializer=trunc_normal_, + ) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.grid_size, num_heads=num_heads) + else: + self.rel_pos_bias = None + + # stochastic depth decay rule + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth)] + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.grid_size + if use_rel_pos_bias else None, + attn_head_dim=attn_head_dim, + ) for i in range(depth) + ]) + self.norm = norm_layer(embed_dim) + + self.init_std = init_std + self.lm_head = nn.Linear(embed_dim, vocab_size) + trunc_normal_(self.lm_head.weight) + self.apply(self._init_weights) + self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + param.scale(1. 
/ math.sqrt(2.0 * layer_id))
+
+        for layer_id, layer in enumerate(self.blocks):
+            rescale(layer.attn.proj.weight, layer_id + 1)
+            rescale(layer.mlp.fc2.weight, layer_id + 1)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+        elif isinstance(m, nn.Conv2D):
+            trunc_normal_(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+
+    def forward_features(self, x, bool_masked_pos):
+        x = self.patch_embed(x)
+        batch_size, seq_len, _ = x.shape
+
+        cls_tokens = self.cls_token.expand([batch_size, -1, -1])
+        mask_token = self.mask_token.expand([batch_size, seq_len, -1])
+
+        # replace the embeddings of masked patches with the learnable mask token
+        w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype)
+        x = x * (1 - w) + mask_token * w
+
+        x = paddle.concat((cls_tokens, x), axis=1)
+        if self.pos_embed is not None:
+            x = x + self.pos_embed
+        x = self.pos_drop(x)
+
+        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+        for blk in self.blocks:
+            x = blk(x, rel_pos_bias=rel_pos_bias)
+
+        x = self.norm(x)
+
+        return x
+
+    def forward(self, x, bool_masked_pos, return_all_tokens=False):
+        x = self.forward_features(x, bool_masked_pos=bool_masked_pos)
+        x = x[:, 1:]
+        if return_all_tokens:
+            return self.lm_head(x)
+        else:
+            # return only the predictions for the masked patches
+            return self.lm_head(x[bool_masked_pos])
diff --git a/passl/modeling/backbones/discrete_vae.py b/passl/modeling/backbones/discrete_vae.py
new file mode 100644
index 00000000..257cd5d4
--- /dev/null
+++ b/passl/modeling/backbones/discrete_vae.py
@@ -0,0 +1,718 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
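The two backbones in this patch cooperate at pretraining time: the ViT above predicts a visual-token id for every masked patch, while the frozen dVAE defined in this file supplies the target ids. A minimal shape-check sketch, assuming both classes are importable from this package and using random weights and inputs in place of the real dVAE checkpoint and data pipeline:

    import paddle

    vit = VisionTransformerForMaskedImageModeling(
        img_size=224, patch_size=16, vocab_size=8192,
        use_abs_pos_emb=False, use_shared_rel_pos_bias=True, init_values=0.1)
    d_vae = Dalle_VAE(image_size=112)  # real runs load the pretrained dVAE weights

    images = paddle.randn([2, 3, 224, 224])       # first view, fed to the ViT
    token_images = paddle.rand([2, 3, 112, 112])  # second view, fed to the dVAE tokenizer
    bool_masked_pos = paddle.randint(0, 2, [2, 196]).astype('bool')  # one flag per 16x16 patch

    logits = vit(images, bool_masked_pos)                 # [num_masked, 8192]
    input_ids = d_vae.get_codebook_indices(token_images)  # [2, 14, 14] codebook ids
    labels = input_ids.flatten(1)[bool_masked_pos]        # ids of the masked patches
    loss = paddle.nn.functional.cross_entropy(logits, labels)  # the loss BEiTPTHead applies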
+# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) +# Github source: https://github.com/microsoft/unilm/tree/master/beit +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# By Hangbo Bao +# Based on OpenAI DALL-E and lucidrains' DALLE-pytorch code bases +# https://github.com/openai/DALL-E +# https://github.com/lucidrains/DALLE-pytorch +import os +#import wget +import paddle +import paddle.nn as nn + + +# +#logit_laplace_eps = 0.1 +# +# +#def map_pixels(x): +# return (1 - 2 * logit_laplace_eps) * x + logit_laplace_eps +# +# +#def unmap_pixels(x): +# return paddle.clip((x - logit_laplace_eps) / (1 - 2 * logit_laplace_eps), 0, 1) +# +# +# +#class Identity(nn.Layer): +# def __init__(self): +# super(Identity, self).__init__() +# +# def forward(self, inputs): +# return inputs +# +# +class EncoderBlock(nn.Layer): + def __init__(self, n_in, n_out, n_layers): + super(EncoderBlock, self).__init__() + n_hid = n_out // 4 + self.post_gain = 1 / (n_layers**2) + + self.id_path = nn.Conv2D(n_in, n_out, + 1) if n_in != n_out else Identity() + self.res_path = nn.Sequential( + ('relu_1', nn.ReLU()), + ('conv_1', nn.Conv2D(n_in, n_hid, 3, padding=1)), + ('relu_2', nn.ReLU()), + ('conv_2', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_3', nn.ReLU()), + ('conv_3', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_4', nn.ReLU()), ('conv_4', nn.Conv2D(n_hid, n_out, 1))) + + def forward(self, x): + return self.id_path(x) + self.post_gain * self.res_path(x) + + +class Encoder(nn.Layer): + def __init__(self, + group_count=4, + n_hid=256, + n_blk_per_group=2, + input_channels=3, + vocab_size=8192): + super(Encoder, self).__init__() + self.vocab_size = vocab_size + + blk_range = range(n_blk_per_group) + n_layers = group_count * n_blk_per_group + + self.blocks = nn.Sequential( + ('input', nn.Conv2D(input_channels, 1 * n_hid, 7, padding=3)), + ('group_1', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(1 * n_hid, 1 * n_hid, n_layers=n_layers)) + for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_2', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(1 * n_hid if i == 0 else 2 * n_hid, + 2 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_3', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(2 * n_hid if i == 0 else 4 * n_hid, + 4 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_4', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(4 * n_hid if i == 0 else 8 * n_hid, + 8 * n_hid, + n_layers=n_layers)) for i in blk_range], )), + ('output', + nn.Sequential( + ('relu', nn.ReLU()), + ('conv', nn.Conv2D(8 * n_hid, vocab_size, 1)), + )), + ) + + def forward(self, x): + return self.blocks(x) + + +class DecoderBlock(nn.Layer): + def __init__(self, n_in, n_out, n_layers): + super(DecoderBlock, self).__init__() + n_hid = n_out // 4 + self.post_gain = 1 / (n_layers**2) + + self.id_path = nn.Conv2D(n_in, n_out, + 1) if n_in != n_out else Identity() + self.res_path = nn.Sequential( + ('relu_1', nn.ReLU()), ('conv_1', nn.Conv2D(n_in, n_hid, 1)), + ('relu_2', nn.ReLU()), + ('conv_2', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_3', nn.ReLU()), + ('conv_3', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_4', nn.ReLU()), + ('conv_4', nn.Conv2D(n_hid, n_out, 3, padding=1))) + + def forward(self, x): + return self.id_path(x) + self.post_gain * self.res_path(x) + + +class Decoder(nn.Layer): + 
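+    # (Decoder overview) A 1x1 conv maps the 8192-way one-hot token grid to
+    # n_init channels, four groups of DecoderBlocks restore spatial resolution
+    # with 2x nearest-neighbour upsampling after the first three groups, and the
+    # output conv emits 2 * output_channels maps (in DALL-E these parameterise
+    # the logit-Laplace reconstruction: a mean and a log-scale per RGB channel).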
def __init__(self, + group_count=4, + n_init=128, + n_hid=256, + n_blk_per_group=2, + output_channels=3, + vocab_size=8192): + super(Decoder, self).__init__() + self.vocab_size = vocab_size + + blk_range = range(n_blk_per_group) + n_layers = group_count * n_blk_per_group + + self.blocks = nn.Sequential( + ('input', nn.Conv2D(vocab_size, n_init, 1)), + ('group_1', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(n_init if i == 0 else 8 * n_hid, + 8 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_2', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(8 * n_hid if i == 0 else 4 * n_hid, + 4 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_3', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(4 * n_hid if i == 0 else 2 * n_hid, + 2 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_4', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(2 * n_hid if i == 0 else 1 * n_hid, + 1 * n_hid, + n_layers=n_layers)) for i in blk_range], )), + ('output', + nn.Sequential( + ('relu', nn.ReLU()), + ('conv', nn.Conv2D(1 * n_hid, 2 * output_channels, 1)), + )), + ) + + def forward(self, x): + return self.blocks(x) + + +model_dict = { + 'encoder': [ + 'Encoder', + r'https://passl.bj.bcebos.com/vision_transformers/beit/encoder.pdparams', + 'encoder.pdparams' + ], + 'decoder': [ + 'Decoder', + r'https://passl.bj.bcebos.com/vision_transformers/beit/decoder.pdparams', + 'decoder.pdparams' + ] +} + + +def load_model(model_name, model_dir): + model_fn, url, file_name = model_dict[model_name] + model = eval(model_fn)() + + model_path = os.path.join(model_dir, file_name) + if not os.path.exists(model_path): + if not os.path.exists(model_dir): + os.makedirs(model_dir) + #wget.download(url, out=model_path) + params = paddle.load(model_path) + model.set_state_dict(params) + model.eval() + return model + + +# +# +#class DalleVAE(nn.Layer): +# def __init__(self, group_count=4, n_init=128, n_hid=256, n_blk_per_group=2, input_channels=3, output_channels=3, vocab_size=8192): +# super(DiscreteVAE, self).__init__() +# self.vocab_size = vocab_size +# self.encoder = Encoder() +# self.decoder = Decoder() +# self.l1_loss = paddle.nn.loss.L1Loss(reduction='none') +# +# def encode(self, x): +# return self.encoder(x) +# +# def decode(self, z): +# return self.decoder(z) +# +# +# def logit_laplace_loss(self, x, x_stats): +# ## x [ B, 3, 256, 256 ] +# ## x_stats [ B, 6, 256, 256 ] +# # mu +# mu = x_stats[:,:3] +# # +# lnb = x_stats[:,3:] +# log_norm = -paddle.log(x * (1 - x)) - lnb - paddle.log(paddle.to_tensor(2.0)) +# #print("log_norm", log_norm) +# log_compare = -self.l1_loss(paddle.log(x/(1-x)), mu) / paddle.exp(lnb) +# #print("log_compare", log_compare) +# return -(log_norm+log_compare) +# +# def gumbel_softmax(self, z_logits, temperature): +# +# def sample_gumbel(shape, eps=1e-20): +# U = paddle.fluid.layers.uniform_random(shape,min=0,max=1) +# return -paddle.log(-paddle.log(U + eps) + eps) +# +# def gumbel_softmax_sample(logits, temperature): +# y = logits + sample_gumbel(logits.shape) +# return nn.functional.softmax( y / temperature, axis=1) +# +# return gumbel_softmax_sample(z_logits, temperature) +# +# +# def forward(self, x, temperature): +# # [B, vocab_size, 32, 32] +# z_logits = self.encoder(x) +# q_y = nn.functional.softmax(z_logits, axis=1) +# log_q_y = 
paddle.log(q_y+1e-20) +# kl_loss = q_y*(log_q_y-paddle.log(paddle.to_tensor(1.0/self.vocab_size))) +# # to [B, 32, 32] +# kl_loss = paddle.sum(kl_loss, axis=[1]) +# # to [B] +# kl_loss = paddle.mean(kl_loss, axis=[1,2]) +# #print(kl_loss) +# +# z = self.gumbel_softmax(z_logits, temperature) +# x_stats = self.decoder(z) +# recon_loss = self.logit_laplace_loss(x, x_stats) +# recon_loss = paddle.mean(recon_loss, axis=[1, 2, 3]) +# #print(recon_loss) +# +# return recon_loss, kl_loss +# + +# +# +#def load_model(model_name, pretrained=False): +# model_fn, url, file_name = model_dict[model_name] +# model = model_fn() +# +# if pretrained: +# model_path = os.path.join('pretrained_models', file_name) +# if not os.path.isfile(model_path): +# if not os.path.exists('pretrained_models'): +# os.mkdir('pretrained_models') +# wget.download(url, out=model_path) +# params = paddle.load(model_path) +# model.set_dict(params) +# +# model.eval() +# return model + +from math import sqrt +import os +import paddle +from paddle import nn, einsum +import paddle.nn.functional as F +from einops import rearrange + +from .builder import BACKBONES + + +def top_k(logits, thres=0.5): + num_logits = logits.shape[-1] + k = max(int((1 - thres) * num_logits), 1) + val, ind = paddle.topk(logits, k) + probs = paddle.full_like(logits, float('-inf')) + probs.scatter_(1, ind, val) + return probs + + +def exists(val): + return val is not None + + +def default(val, d): + return val if exists(val) else d + + +def eval_decorator(fn): + def inner(model, *args, **kwargs): + was_training = model.training + model.eval() + out = fn(model, *args, **kwargs) + model.train(was_training) + return out + + return inner + + +class BasicVAE(nn.Layer): + def get_codebook_indices(self, images): + raise NotImplementedError() + + def decode(self, img_seq): + raise NotImplementedError() + + def get_codebook_probs(self, img_seq): + raise NotImplementedError() + + def get_image_tokens_size(self): + pass + + def get_image_size(self): + pass + + +class ResBlock(nn.Layer): + def __init__(self, chan_in, hidden_size, chan_out): + super().__init__() + self.net = nn.Sequential( + nn.Conv2D(chan_in, hidden_size, 3, padding=1), nn.ReLU(), + nn.Conv2D(hidden_size, hidden_size, 3, padding=1), nn.ReLU(), + nn.Conv2D(hidden_size, chan_out, 1)) + + def forward(self, x): + return self.net(x) + x + + +@BACKBONES.register() +class DiscreteVAE(BasicVAE): + def __init__(self, + image_size=256, + num_tokens=512, + codebook_dim=512, + num_layers=3, + hidden_dim=64, + channels=3, + smooth_l1_loss=False, + temperature=0.9, + straight_through=False, + kl_div_loss_weight=0.): + super().__init__() + # assert log2(image_size).is_integer(), 'image size must be a power of 2' + assert num_layers >= 1, 'number of layers must be greater than or equal to 1' + + self.image_size = image_size + self.num_tokens = num_tokens + self.num_layers = num_layers + self.temperature = temperature + self.straight_through = straight_through + self.codebook = nn.Embedding(num_tokens, codebook_dim) + + enc_layers = [] + dec_layers = [] + + enc_in = channels + dec_in = codebook_dim + + for layer_id in range(num_layers): + enc_layers.append( + nn.Sequential( + nn.Conv2D(enc_in, hidden_dim, 4, stride=2, padding=1), + nn.ReLU())) + enc_layers.append( + ResBlock(chan_in=hidden_dim, + hidden_size=hidden_dim, + chan_out=hidden_dim)) + enc_in = hidden_dim + dec_layers.append( + nn.Sequential( + nn.ConvTranspose2D(dec_in, + hidden_dim, + 4, + stride=2, + padding=1), nn.ReLU())) + dec_layers.append( + 
ResBlock(chan_in=hidden_dim, + hidden_size=hidden_dim, + chan_out=hidden_dim)) + dec_in = hidden_dim + + enc_layers.append(nn.Conv2D(hidden_dim, num_tokens, 1)) + dec_layers.append(nn.Conv2D(hidden_dim, channels, 1)) + + self.encoder = nn.Sequential(*enc_layers) + self.decoder = nn.Sequential(*dec_layers) + + self.loss_fn = F.smooth_l1_loss if smooth_l1_loss else F.mse_loss + self.kl_div_loss_weight = kl_div_loss_weight + + def get_image_size(self): + return self.image_size + + def get_image_tokens_size(self): + return self.image_size // 8 + + @paddle.no_grad() + @eval_decorator + def get_codebook_indices(self, images): + logits = self.forward(images, return_logits=True) + codebook_indices = logits.argmax(dim=1) + return codebook_indices + + @paddle.no_grad() + @eval_decorator + def get_codebook_probs(self, images): + logits = self.forward(images, return_logits=True) + return nn.Softmax(dim=1)(logits) + + def decode(self, img_seq): + image_embeds = self.codebook(img_seq) + b, n, d = image_embeds.shape + h = w = int(sqrt(n)) + + image_embeds = rearrange(image_embeds, 'b (h w) d -> b d h w', h=h, w=w) + images = self.decoder(image_embeds) + return images + + def forward(self, + img, + return_loss=False, + return_recons=False, + return_logits=False, + temp=None): + device, num_tokens, image_size, kl_div_loss_weight = img.device, self.num_tokens, self.image_size, self.kl_div_loss_weight + assert img.shape[-1] == image_size and img.shape[ + -2] == image_size, f'input must have the correct image size {image_size}' + + logits = self.encoder(img) + + if return_logits: + return logits # return logits for getting hard image indices for DALL-E training + + temp = default(temp, self.temperature) + soft_one_hot = F.gumbel_softmax(logits, + tau=temp, + dim=1, + hard=self.straight_through) + sampled = einsum('b n h w, n d -> b d h w', soft_one_hot, + self.codebook.weight) + out = self.decoder(sampled) + + if not return_loss: + return out + + # reconstruction loss + + recon_loss = self.loss_fn(img, out) + + # kl divergence + + logits = rearrange(logits, 'b n h w -> b (h w) n') + qy = F.softmax(logits, dim=-1) + + log_qy = paddle.log(qy + 1e-10) + log_uniform = paddle.log( + paddle.to_tensor([1. 
/ num_tokens], device=device)) + kl_div = F.kl_div(log_uniform, + log_qy, + None, + None, + 'batchmean', + log_target=True) + + loss = recon_loss + (kl_div * kl_div_loss_weight) + + if not return_recons: + return loss + + return loss, out + + +@BACKBONES.register() +class Dalle_VAE(BasicVAE): + def __init__(self, image_size): + super().__init__() + self.encoder = Encoder() + self.decoder = Decoder() + self.image_size = image_size + + def decode(self, img_seq): + bsz = img_seq.size()[0] + img_seq = img_seq.view(bsz, self.image_size // 8, self.image_size // 8) + z = F.one_hot(img_seq, + num_classes=self.encoder.vocab_size).permute(0, 3, 1, + 2).float() + return self.decoder(z).float() + + def get_codebook_indices(self, images): + z_logits = self.encoder(images) + return paddle.argmax(z_logits, axis=1) + + def get_codebook_probs(self, images): + z_logits = self.encoder(images) + return nn.Softmax(dim=1)(z_logits) + + def forward(self, img_seq_prob, no_process=False): + if no_process: + return self.decoder(img_seq_prob.float()).float() + else: + bsz, seq_len, num_class = img_seq_prob.size() + z = img_seq_prob.view(bsz, self.image_size // 8, + self.image_size // 8, self.encoder.vocab_size) + return self.decoder(z.permute(0, 3, 1, 2).float()).float() + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, inputs): + return inputs + + +class EncoderBlock(nn.Layer): + def __init__(self, n_in, n_out, n_layers): + super(EncoderBlock, self).__init__() + n_hid = n_out // 4 + self.post_gain = 1 / (n_layers**2) + + self.id_path = nn.Conv2D(n_in, n_out, + 1) if n_in != n_out else Identity() + self.res_path = nn.Sequential( + ('relu_1', nn.ReLU()), + ('conv_1', nn.Conv2D(n_in, n_hid, 3, padding=1)), + ('relu_2', nn.ReLU()), + ('conv_2', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_3', nn.ReLU()), + ('conv_3', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_4', nn.ReLU()), ('conv_4', nn.Conv2D(n_hid, n_out, 1))) + + def forward(self, x): + return self.id_path(x) + self.post_gain * self.res_path(x) + + +class Encoder(nn.Layer): + def __init__(self, + group_count=4, + n_hid=256, + n_blk_per_group=2, + input_channels=3, + vocab_size=8192): + super(Encoder, self).__init__() + self.vocab_size = vocab_size + + blk_range = range(n_blk_per_group) + n_layers = group_count * n_blk_per_group + + self.blocks = nn.Sequential( + ('input', nn.Conv2D(input_channels, 1 * n_hid, 7, padding=3)), + ('group_1', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(1 * n_hid, 1 * n_hid, n_layers=n_layers)) + for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_2', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(1 * n_hid if i == 0 else 2 * n_hid, + 2 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_3', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(2 * n_hid if i == 0 else 4 * n_hid, + 4 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('pool', nn.MaxPool2D(kernel_size=2)), + )), + ('group_4', + nn.Sequential( + *[(f'block_{i + 1}', + EncoderBlock(4 * n_hid if i == 0 else 8 * n_hid, + 8 * n_hid, + n_layers=n_layers)) for i in blk_range], )), + ('output', + nn.Sequential( + ('relu', nn.ReLU()), + ('conv', nn.Conv2D(8 * n_hid, vocab_size, 1)), + )), + ) + + def forward(self, x): + return self.blocks(x) + + +class DecoderBlock(nn.Layer): + def __init__(self, n_in, n_out, n_layers): + super(DecoderBlock, self).__init__() + n_hid = n_out // 4 + self.post_gain = 1 / 
(n_layers**2) + + self.id_path = nn.Conv2D(n_in, n_out, + 1) if n_in != n_out else Identity() + self.res_path = nn.Sequential( + ('relu_1', nn.ReLU()), ('conv_1', nn.Conv2D(n_in, n_hid, 1)), + ('relu_2', nn.ReLU()), + ('conv_2', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_3', nn.ReLU()), + ('conv_3', nn.Conv2D(n_hid, n_hid, 3, padding=1)), + ('relu_4', nn.ReLU()), + ('conv_4', nn.Conv2D(n_hid, n_out, 3, padding=1))) + + def forward(self, x): + return self.id_path(x) + self.post_gain * self.res_path(x) + + +class Decoder(nn.Layer): + def __init__(self, + group_count=4, + n_init=128, + n_hid=256, + n_blk_per_group=2, + output_channels=3, + vocab_size=8192): + super(Decoder, self).__init__() + self.vocab_size = vocab_size + + blk_range = range(n_blk_per_group) + n_layers = group_count * n_blk_per_group + + self.blocks = nn.Sequential( + ('input', nn.Conv2D(vocab_size, n_init, 1)), + ('group_1', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(n_init if i == 0 else 8 * n_hid, + 8 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_2', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(8 * n_hid if i == 0 else 4 * n_hid, + 4 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_3', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(4 * n_hid if i == 0 else 2 * n_hid, + 2 * n_hid, + n_layers=n_layers)) for i in blk_range], + ('upsample', nn.Upsample(scale_factor=2, mode='nearest')), + )), + ('group_4', + nn.Sequential( + *[(f'block_{i + 1}', + DecoderBlock(2 * n_hid if i == 0 else 1 * n_hid, + 1 * n_hid, + n_layers=n_layers)) for i in blk_range], )), + ('output', + nn.Sequential( + ('relu', nn.ReLU()), + ('conv', nn.Conv2D(1 * n_hid, 2 * output_channels, 1)), + )), + ) + + def forward(self, x): + return self.blocks(x) diff --git a/passl/modeling/heads/__init__.py b/passl/modeling/heads/__init__.py index 5f86aaf3..9b443691 100644 --- a/passl/modeling/heads/__init__.py +++ b/passl/modeling/heads/__init__.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .beit_head import BEiTClsHead, BEiTPTHead, BEiTFTHead +from .builder import build_head +from .cait_head import CaitClsHead from .contrastive_head import ContrastiveHead from .clas_head import ClasHead +from .clip_head import CLIPHead +from .cvt_head import CvTClsHead from .l2_head import L2Head from .mb_head import MBHead -from .clip_head import CLIPHead -from .builder import build_head +from .mlp_mixer_head import MlpMixerClsHead +from .pixpro_head import PixProHead from .simclr_contrastive_head import SimCLRContrastiveHead -from .vision_transformer_head import VisionTransformerClsHead from .swin_transformer_head import SwinTransformerClsHead -from .beit_head import BeitClsHead from .t2t_vit_head import T2TViTClsHead -from .cait_head import CaitClsHead -from .mlp_mixer_head import MlpMixerClsHead -from .cvt_head import CvTClsHead -from .pixpro_head import PixProHead +from .vision_transformer_head import VisionTransformerClsHead diff --git a/passl/modeling/heads/beit_head.py b/passl/modeling/heads/beit_head.py index c9e6d629..c852e6b4 100644 --- a/passl/modeling/heads/beit_head.py +++ b/passl/modeling/heads/beit_head.py @@ -12,18 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
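With Mixup/CutMix enabled for finetuning, labels reach the head as soft probability vectors, so the BEiTFTHead added below uses a soft-target cross-entropy rather than nn.CrossEntropyLoss. A minimal, self-contained rendering of that expression with made-up values:

    import paddle
    import paddle.nn.functional as F

    logits = paddle.to_tensor([[2.0, 0.5, -1.0]])      # head output for one sample
    soft_labels = paddle.to_tensor([[0.7, 0.3, 0.0]])  # mixed targets produced by Mixup

    # same formula as BEiTFTHead.loss(..., soft=True)
    loss = paddle.sum(-soft_labels * F.log_softmax(logits, axis=-1), axis=-1).mean()
    # i.e. 0.7 * (-log p[0]) + 0.3 * (-log p[1]) for this sample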
+import sys +import math import paddle import paddle.nn as nn +from paddle import multiply +from paddle.nn import Identity +import paddle.nn.functional as F from .builder import HEADS trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) zeros_ = nn.initializer.Constant(value=0.0) +ones_ = nn.initializer.Constant(value=1.0) @HEADS.register() -class BeitClsHead(nn.Layer): - """Swin Transformer classifier head. +class BEiTClsHead(nn.Layer): + """BEiT classifier head. Args: in_channels (int): Number of channels in the input feature map. @@ -73,3 +79,111 @@ def accuracy(output, target, topk=(1, )): correct_k = correct[:k].reshape([-1]).sum(0, keepdim=True) res.append(correct_k * 100.0 / batch_size) return res + + +@HEADS.register() +class BEiTPTHead(nn.Layer): + """BEiT Pretrain Head. + + Args: + in_channels (int): Number of channels in the input feature map. + num_classes (int): Number of categories excluding the background category. + """ + def __init__(self, in_channels=None, num_classes=None, init_scale=0.001): + super().__init__() + self.criterion = nn.CrossEntropyLoss() + + def forward(self, cls_score, labels): + losses = dict() + losses["loss"] = self.criterion(cls_score, labels) + loss_value = losses["loss"].item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + #lossmlm_acc = (cls_score.max(-1) == labels).astype('float32').mean().item() + + losses["mlm_acc"] = accuracy(cls_score, labels)[0] + return losses + + +def accuracy(output, target, topk=(1, )): + """Computes the accuracy over the k top predictions for the specified values of k""" + with paddle.no_grad(): + maxk = max(topk) + batch_size = target.shape[0] + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = paddle.cast(pred == target.reshape([1, -1]).expand_as(pred), + "float32") + + res = [] + for k in topk: + correct_k = correct[:k].reshape([-1]).sum(0, keepdim=True) + res.append(correct_k * 100.0 / batch_size) + return res + + +@HEADS.register() +class BEiTFTHead(nn.Layer): + """BEiT Finetune Head. + + Args: + in_channels (int): Number of channels in the input feature map. + num_classes (int): Number of categories excluding the background category. 
+ """ + def __init__(self, in_channels=None, num_classes=None, init_scale=0.001): + super(BEiTFTHead, self).__init__() + self.head = nn.Linear(in_channels, + num_classes) if num_classes > 0 else Identity() + self.criterion = nn.CrossEntropyLoss() + trunc_normal_(self.head.weight) + self.apply(self._init_weights) + + self.head.weight.set_value( + multiply(self.head.weight, paddle.to_tensor(init_scale))) + self.head.bias.set_value( + multiply(self.head.bias, paddle.to_tensor(init_scale))) + + def forward(self, x): + x = self.head(x) + return x + + def loss(self, x, labels, soft=True): + losses = dict() + if soft: + losses['loss'] = paddle.sum(-labels * F.log_softmax(x, axis=-1), + axis=-1).mean() + else: + losses["loss"] = self.criterion(x, labels) + losses['acc1'], losses['acc5'] = accuracy(x, labels, topk=(1, 5)) + return losses + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + +def accuracy(output, target, topk=(1, )): + """Computes the accuracy over the k top predictions for the specified values of k""" + with paddle.no_grad(): + maxk = max(topk) + if target.dim() > 1: + target = target.argmax(axis=-1) + batch_size = target.shape[0] + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = paddle.cast(pred == target.reshape([1, -1]).expand_as(pred), + 'float32') + + res = [] + for k in topk: + correct_k = correct[:k].reshape([-1]).sum(0, keepdim=True) + res.append(correct_k * 100.0 / batch_size) + return res diff --git a/passl/solver/builder.py b/passl/solver/builder.py index b29275d1..3f4573ee 100644 --- a/passl/solver/builder.py +++ b/passl/solver/builder.py @@ -48,14 +48,12 @@ def build_lr_scheduler_simclr(cfg, iters_per_epoch, batch_size, epochs, # FIXME: if have a better way if cfg.name == 'CosineAnnealingDecay': - cfg.T_max = T_max cfg.T_max *= iters_per_epoch elif cfg.name == 'MultiStepDecay': cfg.milestones = [x * iters_per_epoch for x in cfg.milestones] elif cfg.name == 'Cosinesimclr': cfg.iters_per_epoch = iters_per_epoch cfg.epochs = epochs - cfg.T_max = T_max elif cfg.name == 'simclrCosineWarmup': cfg.step_each_epoch = iters_per_epoch cfg.epochs = epochs @@ -93,9 +91,85 @@ def build_clip_optimizer(cfg, lr_scheduler, parameters=None): return OPTIMIZERS.get(name)(lr_scheduler, **cfg) +def get_num_layer_for_vit(var_name, num_max_layer): + if var_name in ("backbone.cls_token", "backbone.mask_token", + "backbone.pos_embed"): + return 0 + elif var_name.startswith("backbone.patch_embed"): + return 0 + elif var_name.startswith("backbone.rel_pos_bias"): + return num_max_layer - 1 + elif var_name.startswith("backbone.blocks"): + layer_id = int(var_name.split('.')[2]) + return layer_id + 1 + else: + return num_max_layer - 1 + + +class LayerDecayValueAssigner(object): + def __init__(self, values): + self.values = values + + def get_scale(self, layer_id): + return self.values[layer_id] + + def get_layer_id(self, var_name): + return get_num_layer_for_vit(var_name, len(self.values)) + + +def get_parameter_groups(cfg, + model, + skip_list=(), + get_num_layer=None, + get_layer_scale=None): + weight_decay = cfg['weight_decay'] + parameter_group_names = {} + parameter_group_vars = {} + + for name, param in model.named_parameters(): + if param.stop_gradient: + continue # frozen weights + if len(param.shape) == 1 or name.endswith(".bias") or name in skip_list: + group_name = "no_decay" + 
this_weight_decay = 0. + else: + group_name = "decay" + this_weight_decay = weight_decay + if get_num_layer is not None: + layer_id = get_num_layer(name) + group_name = "layer_%d_%s" % (layer_id, group_name) + else: + layer_id = None + + if group_name not in parameter_group_names: + if get_layer_scale is not None: + scale = get_layer_scale(layer_id) + else: + scale = 1. + + parameter_group_names[group_name] = { + "weight_decay": this_weight_decay, + "params": [], + "learning_rate": scale + } + parameter_group_vars[group_name] = { + "weight_decay": this_weight_decay, + "params": [], + "learning_rate": scale + } + parameter_group_vars[group_name]["params"].append(param) + parameter_group_names[group_name]["params"].append(name) + return list(parameter_group_vars.values()) + + def build_optimizer(cfg, lr_scheduler, model_list=None): cfg = copy.deepcopy(cfg) name = cfg.pop('name') + if 'layer_decay' in cfg: + layer_decay = cfg.pop('layer_decay') + assert isinstance(layer_decay, float) + if layer_decay is None: + layer_decay = 1.0 # step 1 clip grad if 'grad_clip' in cfg: @@ -107,8 +181,21 @@ def build_optimizer(cfg, lr_scheduler, model_list=None): clip_norm = grad_clip_cfg['value'] cfg['grad_clip'] = ClipGradByNorm(clip_norm=clip_norm) - parameters = sum([m.parameters() - for m in model_list], []) if model_list else None + if layer_decay < 1.0: + num_layers = model_list[0].backbone.get_num_layers() + assigner = LayerDecayValueAssigner( + list(layer_decay**(num_layers + 1 - i) + for i in range(num_layers + 2))) + else: + assigner = None + if assigner is not None: + parameters = get_parameter_groups(cfg, + model_list[0], + get_num_layer=assigner.get_layer_id, + get_layer_scale=assigner.get_scale) + else: + parameters = sum([m.parameters() + for m in model_list], []) if model_list else None # step 2 Adapt Lars and Lamb optimizer parameter argument. if 'Lars' in name or 'Lamb' in name:
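For orientation, the layer-wise decay above turns a single layer_decay value into one learning-rate scale per parameter group (group 0 = patch/position/cls embeddings, groups 1..depth = transformer blocks, the last group = everything else, including the head). Evaluating the same expression in plain Python for a 12-block backbone with layer_decay = 0.65:

    layer_decay = 0.65
    num_layers = 12
    scales = [layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2)]

    # scales[0]  ~= 0.0037  -> pos_embed / cls_token / patch_embed
    # scales[1]  ~= 0.0057  -> blocks.0
    # scales[12] == 0.65    -> blocks.11
    # scales[13] == 1.0     -> head, final norm and other top-level parameters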