Commit 16f9ca5

add vit-b/16 finetune (#729) (#786)
1 parent f5568c3 commit 16f9ca5

File tree

3 files changed: +154 -2 lines changed

ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml

+122
@@ -0,0 +1,122 @@
```yaml
_base_: ../base.yaml

Global:
  device: gpu
  seed: 2021

Engine:
  run_mode: 'epoch'
  num_train_epochs: 8
  eval_freq: 1
  accumulate_steps: 1
  logging_freq: 10
  mix_precision:
    use_pure_fp16: True
    scale_loss: 32768.0
    custom_black_list: ["reduce_sum", "elementwise_div"]
    custom_white_list: []
  save_load:
    save_epoch: 1
    output_dir: ./output
    ckpt_dir:

Distributed:
  dp_degree:

Model:
  module: "GeneralClsModule"
  model:
    name: "ViT_base_patch16_384"
    class_num: 1000
    drop_rate: 0.1
    pretrained:
      prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-224
      finetune: True
  loss:
    train:
      name: 'CELoss'
    eval:
      name: 'CELoss'
  metric:
    train:
      name: 'TopkAcc'
      topk: [1, 5]
    eval:
      name: 'TopkAcc'
      topk: [1, 5]

Optimizer:
  name: Momentum
  weight_decay: 0.0001
  momentum: 0.9
  lr:
    name: ViTLRScheduler
    learning_rate: 0.03
    decay_type: cosine
    warmup_steps: 500
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0

Data:
  Train:
    dataset:
      name: GeneralClsDataset
      image_root: ./dataset/ILSVRC2012/
      class_num: 1000
      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - RandCropImage:
            size: 384
            scale: [0.05, 1.0]
            interpolation: bilinear
            backend: pil
        - RandFlipImage:
            flip_code: 1
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.5, 0.5, 0.5]
            std: [0.5, 0.5, 0.5]
            order: ''
        - ToCHWImage:

    sampler:
      name: DistributedBatchSampler
      batch_size: 32 # total batchsize 512
      drop_last: True
      shuffle: True
    loader:
      num_workers: 8
      use_shared_memory: True

  Eval:
    dataset:
      name: GeneralClsDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - ResizeImage:
            size: 384
            interpolation: bilinear
            backend: pil
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.5, 0.5, 0.5]
            std: [0.5, 0.5, 0.5]
            order: ''
        - ToCHWImage:

    sampler:
      name: DistributedBatchSampler
      batch_size: 256
      drop_last: False
      shuffle: False
    loader:
      num_workers: 8
      use_shared_memory: True
```
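The config loads the 224-resolution pre-trained weights from `prefix_path: ./pretrained/vit/imagenet2012-ViT-B_16-224` before fine-tuning at 384. Below is a minimal sketch of staging that checkpoint, assuming the `.pdparams` URL listed in the model table of projects/vit/README.md and assuming the loader resolves `prefix_path` by appending the `.pdparams` suffix:

```shell
# Place the ImageNet2012 ViT-B/16 (224) pre-trained weights where the
# finetune config's prefix_path expects them. The URL comes from the
# model table in projects/vit/README.md; appending .pdparams to
# prefix_path is an assumption about how the checkpoint is resolved.
mkdir -p ./pretrained/vit
wget -O ./pretrained/vit/imagenet2012-ViT-B_16-224.pdparams \
    https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams
```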

projects/vit/README.md

+16 -2
@@ -3,7 +3,7 @@
This project implements the Vision Transformer proposed by Google in [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929).

-## How to pretrain from scratch on imagenet 1k
+## How to pretrain from scratch on imagenet2012

### Go to the main repo directory
All commands are executed in the home directory.
@@ -36,6 +36,20 @@ Note: ViT-B/16 needs to run on 2 nodes with 16 A100 GPUs. If you only have a low-me
The following commands need to be run on each node.

```shell
python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml
```

## How to finetune on imagenet2012
Fine-tuning is similar to pre-training on the ImageNet2012 dataset; we have provided a pre-configured yaml file.

```shell
python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml
```

## Model

| Model | Phase | Size | Dataset | Resolution | GPUs | Img/sec | Top1 Acc | Pre-trained checkpoint | Fine-tuned checkpoint | Log |
|----------|----------|--------|--------------|------------|------------|---------|----------|------------------------|-----------------------|-----|
| ViT-B_16 | pretrain | 174MiB | ImageNet2012 | 224 | A100*N2C16 | 7350 | 74.55% | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams) | - | [log](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.log) |
| ViT-B_16 | finetune | | ImageNet2012 | 384 | A100*N2C16 | 1363 | | [download](https://paddlefleetx.bj.bcebos.com/model/vision/vit/imagenet2012-ViT-B_16-224.pdparams) | | |
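The table reports throughput on 2 nodes with 16 A100 GPUs (A100*N2C16). A minimal sketch of a two-node launch, assuming placeholder node IPs and that the node list is passed to `paddle.distributed.launch` via `--ips`, with the same command executed on each node as noted above:

```shell
# Hypothetical node addresses; replace with the real IPs of the two nodes.
# Run this same command on every node.
python -m paddle.distributed.launch \
    --ips="192.168.0.1,192.168.0.2" \
    --gpus="0,1,2,3,4,5,6,7" \
    tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml
```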

projects/vit/run_finetune.sh

+16
@@ -0,0 +1,16 @@
```shell
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml
```
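Since the config path in the script is relative, it is meant to be launched from the repository root (the README notes that all commands are executed in the home directory), and on a multi-node setup it is run on each node. A usage sketch:

```shell
# Run from the repository root on each node so the relative config path resolves.
bash projects/vit/run_finetune.sh
```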
