[AMP OP&Test] Support fp16/bf16 for cumsum (#51694)

DesmonDay · web-flow · commit 01eeba5e4532 · 2023-03-21T10:31:14.000+08:00
* add fp16 unittest

* support bf16 and add unittest

* fix according to review
diff --git a/paddle/phi/kernels/gpu/cum_grad_kernel.cu b/paddle/phi/kernels/gpu/cum_grad_kernel.cu
@@ -29,6 +29,7 @@ namespace cub = hipcub;
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -82,5 +83,6 @@ PD_REGISTER_KERNEL(cumsum_grad,
                    int16_t,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 #endif
diff --git a/paddle/phi/kernels/gpu/cum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu
@@ -28,6 +28,7 @@ namespace cub = hipcub;
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -217,7 +218,8 @@ __global__ void BlockScanKernel(T* d_out,
 }
 
 template <typename Context, typename T>
-typename std::enable_if<!std::is_same<T, phi::dtype::float16>::value>::type
+typename std::enable_if<!std::is_same<T, phi::dtype::float16>::value &&
+                        !std::is_same<T, phi::dtype::bfloat16>::value>::type
 ThrustCumsumKernel(const Context& dev_ctx,
                    const T* in_data,
                    T* out_data,
@@ -261,6 +263,15 @@ ThrustCumsumKernel(const Context& dev_ctx,
                    bool reverse,
                    bool exclusive) {}
 
+template <typename Context, typename T>
+typename std::enable_if<std::is_same<T, phi::dtype::bfloat16>::value>::type
+ThrustCumsumKernel(const Context& dev_ctx,
+                   const phi::dtype::bfloat16* in_data,
+                   phi::dtype::bfloat16* out_data,
+                   int64_t size,
+                   bool reverse,
+                   bool exclusive) {}  // No-op: bf16 skips the thrust path
+
 template <typename T, typename Context, typename Op>
 void ScanKernel(const Context& dev_ctx,
                 const DenseTensor& x,
@@ -301,6 +312,7 @@ void ScanKernel(const Context& dev_ctx,
   // Use thrust for parallel acceleration when the input size is equal to the
   // length of the ‘axis’ dimension.
   if (!std::is_same<T, phi::dtype::float16>::value &&
+      !std::is_same<T, phi::dtype::bfloat16>::value &&
       std::is_same<Op, cub::Sum>::value && size == out_dims[axis]) {
     ThrustCumsumKernel<Context, T>(
         dev_ctx, in_data, out_data, size, reverse, exclusive);
@@ -440,7 +452,8 @@ PD_REGISTER_KERNEL(cumsum,
                    int16_t,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(logcumsumexp,
                    GPU,
diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py