Commit 58c9d17

ViT-L/16 finetune code
* Add 0.0001 weight decay for Momentum optimizer
* Top1 Acc 85.03% based on jax checkpoint
1 parent 8c48b20 commit 58c9d17

11 files changed: +596 -116 lines

plsc/configs/VisionTransformer/ViT_large_patch16_224_in22k_4n32c_dp_fp16o2.yaml

Lines changed: 6 additions & 11 deletions
@@ -27,9 +27,8 @@ DistributedStrategy:
 # model architecture
 Model:
   name: ViT_large_patch16_224
-  class_num: 10450
+  class_num: 21841
   drop_rate: 0.1
-  representation_size: 768
 
 # loss function config for traing/eval process
 Loss:
@@ -53,20 +52,16 @@ Optimizer:
   epsilon: 1e-8
   weight_decay: 0.03
   exp_avg_force_fp32: True
-  grad_clip:
-    name: ClipGradByGlobalNorm
-    clip_norm: 1.0
 
 
 # data loader for train and eval
 DataLoader:
   Train:
     dataset:
       name: ImageNetDataset
-      image_root: ./dataset/ImageNet21K/
-      multi_label: True
-      class_num: 10450
-      cls_label_path: ./dataset/ImageNet21K/multi_label_train_list.txt
+      image_root: ./dataset/ImageNet22K/
+      class_num: 21841
+      cls_label_path: ./dataset/ImageNet22K/train_list.txt
       transform_ops:
         - DecodeImage:
             to_rgb: True
@@ -97,8 +92,8 @@ DataLoader:
   Eval:
     dataset:
       name: ImageNetDataset
-      image_root: ./dataset/ImageNet21K/
-      cls_label_path: ./dataset/ImageNet21K/val_list.txt
+      image_root: ./dataset/ImageNet22K/
+      cls_label_path: ./dataset/ImageNet22K/val_list.txt
       transform_ops:
         - DecodeImage:
             to_rgb: True
Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
# global configs
Global:
  checkpoint: null
  finetune: True
  pretrained_model: ./pretrained/vit_jax/imagenet21k-ViT-L_16
  output_dir: ./output/
  device: gpu
  save_interval: 1
  max_num_latest_checkpoint: 0
  eval_during_train: True
  eval_interval: 1
  eval_unit: "epoch"
  accum_steps: 8
  epochs: 8
  print_batch_step: 10
  use_visualdl: False
  seed: 2021

# FP16 setting
FP16:
  level: O2
  GradScaler:
    init_loss_scaling: 65536.0

DistributedStrategy:
  data_parallel: True

# model architecture
Model:
  name: ViT_large_patch16_384
  class_num: 1000
  drop_rate: 0.1

# loss function config for traing/eval process
Loss:
  Train:
    - ViTCELoss:
        type: softmax
        weight: 1.0
  Eval:
    - CELoss:
        weight: 1.0

LRScheduler:
  name: ViTLRScheduler
  learning_rate: 0.03
  decay_type: cosine
  warmup_steps: 500

Optimizer:
  name: Momentum
  momentum: 0.9
  weight_decay: 0.0001
  grad_clip:
    name: ClipGradByGlobalNorm
    clip_norm: 1.0


# data loader for train and eval
DataLoader:
  Train:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      class_num: 1000
      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - RandCropImage:
            size: 384
            scale: [0.05, 1.0]
            interpolation: bilinear
            backend: pil
        - RandFlipImage:
            flip_code: 1
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.5, 0.5, 0.5]
            std: [0.5, 0.5, 0.5]
            order: ''
        - ToCHWImage:

    sampler:
      name: DistributedBatchSampler
      batch_size: 64 # total batchsize 512
      drop_last: True
      shuffle: True
    loader:
      num_workers: 8
      use_shared_memory: True

  Eval:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - ResizeImage:
            size: 384
            interpolation: bilinear
            backend: pil
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.5, 0.5, 0.5]
            std: [0.5, 0.5, 0.5]
            order: ''
        - ToCHWImage:

    sampler:
      name: DistributedBatchSampler
      batch_size: 256
      drop_last: False
      shuffle: False
    loader:
      num_workers: 8
      use_shared_memory: True

Metric:
  Eval:
    - TopkAcc:
        topk: [1, 5]

Export:
  export_type: paddle
  input_shape: [None, 3, 224, 224]
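
Both new fine-tune configs drive training with ViTLRScheduler (learning_rate: 0.03, decay_type: cosine, warmup_steps: 500). As a rough illustration of that shape, here is a minimal sketch of a linear-warmup-then-cosine-decay schedule with those numbers plugged in; treating this as exactly what ViTLRScheduler computes is an assumption, not something stated in the diff.

    import math

    def warmup_cosine_lr(step, total_steps, base_lr=0.03, warmup_steps=500):
        """Illustrative only: linear warmup to base_lr, then cosine decay to zero."""
        if step < warmup_steps:
            return base_lr * step / max(1, warmup_steps)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return 0.5 * base_lr * (1.0 + math.cos(math.pi * progress))

    # peak LR is reached at step 500; LR is back near zero at the last step
    print(warmup_cosine_lr(500, 20000), warmup_cosine_lr(20000, 20000))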
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
# global configs
Global:
  checkpoint: null
  finetune: True
  pretrained_model: ./pretrained/vit_jax/imagenet21k-ViT-L_16
  output_dir: ./output/
  device: gpu
  save_interval: 1
  max_num_latest_checkpoint: 0
  eval_during_train: True
  eval_interval: 1
  eval_unit: "epoch"
  accum_steps: 1
  epochs: 8
  print_batch_step: 10
  use_visualdl: False
  seed: 2021

# FP16 setting
FP16:
  level: O2
  GradScaler:
    init_loss_scaling: 65536.0

DistributedStrategy:
  data_parallel: True

# model architecture
Model:
  name: ViT_large_patch16_384
  class_num: 1000
  drop_rate: 0.1

# loss function config for traing/eval process
Loss:
  Train:
    - ViTCELoss:
        type: softmax
        weight: 1.0
  Eval:
    - CELoss:
        weight: 1.0

LRScheduler:
  name: ViTLRScheduler
  learning_rate: 0.03
  decay_type: cosine
  warmup_steps: 500

Optimizer:
  name: Momentum
  momentum: 0.9
  weight_decay: 0.0001
  grad_clip:
    name: ClipGradByGlobalNorm
    clip_norm: 1.0

# data loader for train and eval
DataLoader:
  Train:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      class_num: 1000
      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - RandCropImage:
            size: 384
            scale: [0.05, 1.0]
            interpolation: bilinear
            backend: pil
        - RandFlipImage:
            flip_code: 1
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.5, 0.5, 0.5]
            std: [0.5, 0.5, 0.5]
            order: ''
        - ToCHWImage:

    sampler:
      name: DistributedBatchSampler
      batch_size: 16 # total batchsize 512
      drop_last: True
      shuffle: True
    loader:
      num_workers: 8
      use_shared_memory: True

  Eval:
    dataset:
      name: ImageNetDataset
      image_root: ./dataset/ILSVRC2012/
      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
      transform_ops:
        - DecodeImage:
            to_rgb: True
            channel_first: False
        - ResizeImage:
            size: 384
            interpolation: bilinear
            backend: pil
        - NormalizeImage:
            scale: 1.0/255.0
            mean: [0.5, 0.5, 0.5]
            std: [0.5, 0.5, 0.5]
            order: ''
        - ToCHWImage:

    sampler:
      name: DistributedBatchSampler
      batch_size: 256
      drop_last: False
      shuffle: False
    loader:
      num_workers: 8
      use_shared_memory: True

Metric:
  Eval:
    - TopkAcc:
        topk: [1, 5]

Export:
  export_type: paddle
  input_shape: [None, 3, 224, 224]
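
The two new configs are the same recipe at two hardware scales: the first reaches the sampler comment's 512 total batch through gradient accumulation (batch_size: 64 with accum_steps: 8), while the second does it with more cards and no accumulation (batch_size: 16 with accum_steps: 1). A minimal sketch of that arithmetic follows; the card counts (1 and 32, the latter matching the existing 4n32c naming) are inferred, not stated in the configs.

    def effective_batch(per_card, cards, accum_steps):
        # global batch per optimizer update = per-card batch * cards * accumulation steps
        return per_card * cards * accum_steps

    assert effective_batch(64, 1, 8) == 512   # first config, assumed single card
    assert effective_batch(16, 32, 1) == 512  # second config, assumed 4 nodes x 8 cards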

plsc/core/grad_clip.py

Lines changed: 54 additions & 0 deletions
@@ -15,6 +15,7 @@
 import warnings
 import paddle
 from paddle import _legacy_C_ops as _C_ops
+from plsc.utils import logger


 def _squared_l2_norm(x):
@@ -94,3 +95,56 @@ def __call__(self, params):
                         'Y': clip_coef},
                 outputs={'Out': param.grad},
                 attrs={'axis': -1})
+
+
+@paddle.no_grad()
+def clip_grad_norm_(parameters,
+                    max_norm: float,
+                    norm_type: float=2.0,
+                    error_if_nonfinite: bool=False):
+    r"""Clips gradient norm of an iterable of parameters.
+
+    The norm is computed over all gradients together, as if they were
+    concatenated into a single vector. Gradients are modified in-place.
+
+    Args:
+        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+            single Tensor that will have gradients normalized
+        max_norm (float or int): max norm of the gradients
+        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
+            infinity norm.
+        error_if_nonfinite (bool): if True, an error is thrown if the total
+            norm of the gradients from :attr:`parameters` is ``nan``,
+            ``inf``, or ``-inf``. Default: False (will switch to True in the future)
+
+    Returns:
+        Total norm of the parameters (viewed as a single vector).
+    """
+    if isinstance(parameters, paddle.Tensor):
+        parameters = [parameters]
+    parameters = [p for p in parameters if p.grad is not None]
+    max_norm = float(max_norm)
+    norm_type = float(norm_type)
+    if len(parameters) == 0:
+        return paddle.to_tensor([0.])
+
+    total_norm = paddle.norm(
+        paddle.stack([paddle.norm(p.grad, norm_type) for p in parameters]),
+        norm_type)
+    if error_if_nonfinite and paddle.logical_or(total_norm.isnan(),
+                                                total_norm.isinf()):
+        raise RuntimeError(
+            f'The total norm of order {norm_type} for gradients from '
+            '`parameters` is non-finite, so it cannot be clipped. To disable '
+            'this error and scale the gradients by the non-finite norm anyway, '
+            'set `error_if_nonfinite=False`')
+    clip_coef = max_norm / (total_norm + 1e-6)
+    clip_coef_clamped = paddle.clip(clip_coef, max=1.0)
+    for p in parameters:
+        paddle.fluid.framework._dygraph_tracer().trace_op(
+            type="elementwise_mul",
+            inputs={'X': p.grad,
+                    'Y': clip_coef_clamped},
+            outputs={'Out': p.grad},
+            attrs={'axis': -1})
+    return total_norm
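
The new clip_grad_norm_ helper mirrors its PyTorch namesake: it computes one global norm over all gradients and rescales them in place. Below is a minimal usage sketch for a dynamic-graph training step, assuming the helper is meant to be called manually between backward() and step(); the tiny Linear model, random data, and hyperparameters are placeholders, not part of the commit.

    import paddle
    from plsc.core.grad_clip import clip_grad_norm_  # helper added in this commit

    model = paddle.nn.Linear(16, 2)                  # placeholder model
    optimizer = paddle.optimizer.Momentum(
        learning_rate=0.03, momentum=0.9, weight_decay=0.0001,
        parameters=model.parameters())

    for _ in range(3):
        loss = model(paddle.randn([8, 16])).mean()   # placeholder batch
        loss.backward()
        # clip the global gradient norm in place (cf. clip_norm: 1.0 in the configs)
        total_norm = clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.clear_grad()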
