Commit 65bcaeb

[ROCM] update fluid operators for rocm (part5), test=develop (#31258)
* [ROCM] update fluid operators for rocm (part5), test=develop
* address review comments, test=develop
* fix typo, test=develop
1 parent 2111d91 commit 65bcaeb

19 files changed: +214 −39 lines

cmake/hip.cmake  (+1)

@@ -45,6 +45,7 @@ set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP)
 # define HIP_CXX_FLAGS
 list(APPEND HIP_CXX_FLAGS -fPIC)
 list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1)
+# Note(qili93): HIP has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer
 list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
 list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined)
 list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override)
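The added comment explains why the flag matters: platform::float16 provides overloads around type traits such as std::is_floating_point, and HIP's implicit __half conversions collide with them, so -D__HIP_NO_HALF_CONVERSIONS__=1 turns the implicit conversions off. A minimal sketch of what that implies for fp16 code (illustrative only, not part of the commit): conversions have to go through the explicit intrinsics.

    // Illustrative sketch: with -D__HIP_NO_HALF_CONVERSIONS__=1 the implicit
    // __half <-> float conversions are gone, so explicit intrinsics are used.
    // Build (assumed): hipcc -D__HIP_NO_HALF_CONVERSIONS__=1 half_demo.cc
    #include <hip/hip_fp16.h>
    #include <cstdio>

    int main() {
      __half h = __float2half(1.5f);  // explicit float -> half
      float f = __half2float(h);      // explicit half -> float
      std::printf("%f\n", f);         // prints 1.500000
      return 0;
    }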

paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h  (+3 −3)

@@ -737,7 +737,7 @@ x.second );
   }
 
   int assign_async(const concurrent_unordered_map& other,
-                   gpuStream_t stream = 0) {
+                   cudaStream_t stream = 0) {
     m_collisions = other.m_collisions;
     if (other.m_hashtbl_size <= m_hashtbl_capacity) {
       m_hashtbl_size = other.m_hashtbl_size;
@@ -754,7 +754,7 @@ x.second );
     return 0;
   }
 
-  void clear_async(gpuStream_t stream = 0) {
+  void clear_async(cudaStream_t stream = 0) {
     constexpr int block_size = 128;
     init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size, 0,
                    stream>>>(m_hashtbl_values, m_hashtbl_size, unused_key,
@@ -771,7 +771,7 @@ x.second );
     }
   }
 
-  int prefetch(const int dev_id, gpuStream_t stream = 0) {
+  int prefetch(const int dev_id, cudaStream_t stream = 0) {
     cudaPointerAttributes hashtbl_values_ptr_attributes;
     cudaError_t status = cudaPointerGetAttributes(
        &hashtbl_values_ptr_attributes, m_hashtbl_values);
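All three members move back from gpuStream_t to cudaStream_t: this heter_ps header calls CUDA-only APIs such as cudaPointerGetAttributes directly, which suggests it stays on the CUDA toolchain, so the portable stream alias is not needed here. For context, gpuStream_t elsewhere in the codebase resolves to the backend's native stream type roughly like this (a sketch; the exact header is not shown in this diff):

    // Hedged sketch of the portability alias used elsewhere; the surrounding
    // header is an assumption, not part of this diff.
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    using gpuStream_t = hipStream_t;
    #else
    #include <cuda_runtime.h>
    using gpuStream_t = cudaStream_t;
    #endif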

paddle/fluid/operators/array_to_lod_tensor_op.cc  (+1 −1)

@@ -51,7 +51,7 @@ struct ArrayToLoDFunctor : public boost::static_visitor<void> {
   if (std::is_same<Place, platform::CPUPlace>::value) {
     Apply(static_cast<platform::CPUDeviceContext *>(pool.Get(place)));
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     Apply(static_cast<platform::CUDADeviceContext *>(pool.Get(place)));
 #else
     PADDLE_THROW(

paddle/fluid/operators/assign_op.cc  (+1 −1)

@@ -164,7 +164,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
                                ops::AssignKernel, plat::float16,
                                ops::AssignKernel);
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double,
                                 ops::AssignKernel, int, ops::AssignKernel,
                                 int64_t, ops::AssignKernel, bool,
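assign_op.cc and array_to_lod_tensor_op.cc get the same mechanical change: the CUDA-only guard widens to #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) so the GPU kernel registration and the CUDADeviceContext branch also compile in ROCm builds. The compound guard recurs throughout this series; a sketch of how it could be spelled once and reused (the helper macro below is hypothetical, not something this commit adds):

    // Hypothetical helper shown only to illustrate the recurring guard; the
    // commit itself keeps the explicit #if at each call site.
    #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    #define PADDLE_WITH_GPU_BACKEND 1
    #endif

    #ifdef PADDLE_WITH_GPU_BACKEND
    inline const char *GpuBackendName() {
    #ifdef PADDLE_WITH_HIP
      return "ROCm/HIP";
    #else
      return "CUDA";
    #endif
    }
    #endif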

paddle/fluid/operators/math/bert_encoder_functor.cu  (+29 −8)

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cuda_runtime.h>
 #include <algorithm>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
@@ -145,6 +144,8 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids,
   LayerNorm<T, TPB>(thread_data, hidden, out_offset, bias, scale, output, eps);
 }
 
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#ifndef __HIPCC__  // @{ Half kernel: EmbEltwiseLayernormKernel
 template <>
 __global__ void EmbEltwiseLayernormKernel<half, 256>(
     int hidden, const int64_t *ids, const float *scale, const float *bias,
@@ -188,12 +189,13 @@ __global__ void EmbEltwiseLayernormKernel<half, 256>(
       eps);
 #endif
 }
+#endif  // @} End Half kernel: EmbEltwiseLayernormKernel
 
 template <typename T>
 void EmbEltwiseLayerNormFunctor<T>::operator()(
     int batch, int seq_len, int hidden, const int64_t *ids, const float *scale,
     const float *bias, const int64_t *embs, T *output, float eps, int input_num,
-    cudaStream_t stream) {
+    gpuStream_t stream) {
   const unsigned tpb = 256;
   const dim3 grid(seq_len, batch, 1);
   const dim3 block(tpb, 1, 1);
@@ -205,7 +207,8 @@ void EmbEltwiseLayerNormFunctor<T>::operator()(
 template class EmbEltwiseLayerNormFunctor<float>;
 
 // device function 'operator()' is not supportted until cuda 10.0
-#if CUDA_VERSION >= 10000
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
 template class EmbEltwiseLayerNormFunctor<half>;
 #endif
 
@@ -230,6 +233,8 @@ __global__ void SoftmaxKernelWithEltadd(T *qk_buf_, const T *bias_qk_,
   qk_buf_[threadIdx.x + qk_offset] = (T)(qk_tmp / sum_val);
 }
 
+// HIP defined __HIP_NO_HALF_CONVERSIONS__
+#ifndef __HIPCC__  // @{ Half kernel: SoftmaxKernelWithEltadd
 template <>
 __global__ void SoftmaxKernelWithEltadd<half>(
     half *qk_buf_, const half *bias_qk_, const int batch_size,
@@ -251,6 +256,7 @@ __global__ void SoftmaxKernelWithEltadd<half>(
   qk_buf_[threadIdx.x + qk_offset] = (half)(qk_tmp / sum_val);
 #endif
 }
+#endif  // @} End Half kernel: SoftmaxKernelWithEltadd
 
 template <typename T>
 __global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_,
@@ -282,7 +288,9 @@ __global__ void SoftmaxKernelWithEltadd2<half2>(
     half2 *qk_buf_, const half2 *bias_qk_, const int batch_size,
     const int head_num, const int seq_len, const unsigned mask) {
 // operator "+" of half only suppotted after cuda version 10.0
-#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#if defined(PADDLE_WITH_CUDA) || \
+    (CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000)
   int qk_offset = blockIdx.x * seq_len;
   int idx = threadIdx.x;
   assert(blockDim.x % 32 == 0);
@@ -398,7 +406,8 @@ void MultiHeadGPUComputeFunctor<T>::operator()(
 template class MultiHeadGPUComputeFunctor<float>;
 
 // device function 'operator()' is not supportted until cuda 10.0
-#if CUDA_VERSION >= 10000
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#if defined(PADDLE_WITH_CUDA) || CUDA_VERSION >= 10000
 template class MultiHeadGPUComputeFunctor<half>;
 #endif
 
@@ -422,6 +431,8 @@ __global__ void SkipLayerNormSmallKernel(int num, int hidden, const T *input1,
       eps);
 }
 
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#ifndef __HIPCC__  // @{ Half kernel: SkipLayerNormSmallKernel
 template <>
 __global__ void SkipLayerNormSmallKernel<half, 32>(
     int num, int hidden, const half *input1, const half *input2, half *output,
@@ -484,6 +495,7 @@ __global__ void SkipLayerNormSmallKernel<half, 384>(
       eps);
 #endif
 }
+#endif  // @} End Half kernel: SkipLayerNormSmallKernel
 
 template <typename T, unsigned TPB>
 __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1,
@@ -505,6 +517,8 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1,
   LayerNorm<T, TPB>(thread_data, hidden, offset, bias, scale, output, eps);
 }
 
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#ifndef __HIPCC__  // @{ Half kernel: SkipLayerNormKernel
 template <>
 __global__ void SkipLayerNormKernel<half, 256>(int num, int hidden,
                                                const half *input1,
@@ -527,6 +541,7 @@ __global__ void SkipLayerNormKernel<half, 256>(int num, int hidden,
   LayerNorm<half, 256>(thread_data, hidden, offset, bias, scale, output, eps);
 #endif
 }
+#endif  // @} End Half kernel: SkipLayerNormKernel
 
 template <typename T, typename T2, unsigned TPB>
 __global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1,
@@ -549,6 +564,8 @@ __global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1,
   LayerNorm2<T, T2, TPB>(thread_data, hidden, offset, bias, scale, output, eps);
 }
 
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#ifndef __HIPCC__  // @{ Half kernel: SkipLayerNormKernel2
 template <>
 __global__ void SkipLayerNormKernel2<half, half2, 256>(
     int num, int hidden, const half2 *input1, const half2 *input2,
@@ -572,13 +589,13 @@ __global__ void SkipLayerNormKernel2<half, half2, 256>(
       eps);
 #endif
 }
+#endif  // @} End Half kernel: SkipLayerNormKernel2
 
 template <typename T>
 void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
                                          const T *input1, const T *input2,
                                          const float *scale, const float *bias,
-                                         T *output, T eps,
-                                         cudaStream_t stream) {
+                                         T *output, T eps, gpuStream_t stream) {
   int block = num / hidden;
   if (hidden <= 32) {
     const int threads = 32;
@@ -603,6 +620,8 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
         reinterpret_cast<float2 *>(output),
         reinterpret_cast<const float2 *>(scale),
         reinterpret_cast<const float2 *>(bias), eps);
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#ifndef __HIPCC__
   } else if (std::is_same<T, __half>::value) {
     SkipLayerNormKernel2<__half, __half2,
                          threads><<<block, threads, 0, stream>>>(
@@ -611,6 +630,7 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
         reinterpret_cast<__half2 *>(output),
         reinterpret_cast<const float2 *>(scale),
         reinterpret_cast<const float2 *>(bias), eps);
+#endif
   } else {
     assert(false);
     // should not be here
@@ -625,7 +645,8 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
 template class SkipLayerNormFunctor<float>;
 
 // device function 'operator()' is not supportted until cuda 10.0
-#if CUDA_VERSION >= 10000
+// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake
+#if defined(PADDLE_WITH_CUDA) || CUDA_VERSION >= 10000
 template class SkipLayerNormFunctor<half>;
 #endif
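The same pattern repeats through the file: because hip.cmake now passes __HIP_NO_HALF_CONVERSIONS__, every half specialization that leans on implicit half<->float conversions is fenced off with #ifndef __HIPCC__, the half instantiations of the functors are restricted to CUDA builds, and the float paths switch their stream parameter to the portable gpuStream_t. A condensed sketch of the guarding pattern (kernel name and body are illustrative, not taken from the file):

    #ifndef __HIPCC__
    #include <cuda_fp16.h>  // half type plus __hadd / __float2half intrinsics
    #endif

    // Generic kernel: fine for float on both toolchains.
    template <typename T>
    __global__ void AddEps(T *x, float eps) {
      x[threadIdx.x] = x[threadIdx.x] + static_cast<T>(eps);
    }

    // Half specialization: CUDA only, since __HIP_NO_HALF_CONVERSIONS__ removes
    // the implicit conversions an unguarded half path would rely on.
    #ifndef __HIPCC__
    template <>
    __global__ void AddEps<half>(half *x, float eps) {
    #if __CUDA_ARCH__ >= 530  // half arithmetic needs sm_53 or newer
      x[threadIdx.x] = __hadd(x[threadIdx.x], __float2half(eps));
    #endif
    }
    #endif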

paddle/fluid/operators/math/bert_encoder_functor.h  (+12 −3)

@@ -13,9 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cub/cub.cuh>  // NOLINT
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -36,7 +45,7 @@ struct CUDATypeTraits<float> {
   typedef float TYPE;
 };
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // This functor involves a fusion calculation in Ernie or Bert.
 // The fusion mode is as follows:
 //
@@ -55,7 +64,7 @@ class EmbEltwiseLayerNormFunctor {
  public:
   void operator()(int batch, int seq_len, int hidden, const int64_t *ids,
                   const float *scale, const float *bias, const int64_t *embs,
-                  T *output, float eps, int input_num, cudaStream_t stream);
+                  T *output, float eps, int input_num, gpuStream_t stream);
 };
 
 // This functor involves a fusion calculation in Ernie or Bert.
@@ -97,7 +106,7 @@ class SkipLayerNormFunctor {
  public:
   void operator()(const int num, const int hidden, const T *input1,
                   const T *input2, const float *scale, const float *bias,
-                  T *output, T eps, cudaStream_t stream);
+                  T *output, T eps, gpuStream_t stream);
 };
 #endif
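For ROCm builds the header now includes hipcub and aliases it as cub, so device code written against the cub API compiles unchanged under hipcc, and the functor declarations swap cudaStream_t for the portable gpuStream_t. A small sketch of why the alias is sufficient (kernel and names are illustrative, not from the header):

    #ifdef __HIPCC__
    #include <hipcub/hipcub.hpp>
    namespace cub = hipcub;
    #else
    #include <cub/cub.cuh>
    #endif

    // A plain cub-style block reduction: with the alias in place the same
    // source builds with nvcc and with hipcc.
    template <int TPB>
    __global__ void BlockSum(const float *in, float *out) {
      using BlockReduce = cub::BlockReduce<float, TPB>;
      __shared__ typename BlockReduce::TempStorage temp;
      float v = in[blockIdx.x * TPB + threadIdx.x];
      float sum = BlockReduce(temp).Sum(v);
      if (threadIdx.x == 0) out[blockIdx.x] = sum;
    }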

paddle/fluid/operators/math/depthwise_conv.cu  (+14 −1)

@@ -14,7 +14,13 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
-#include "cub/cub.cuh"
+#ifdef __NVCC__
+#include <cub/cub.cuh>
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
@@ -27,7 +33,14 @@ template <typename T>
 __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
   typedef cub::WarpReduce<T> WarpReduce;
   typename WarpReduce::TempStorage temp_storage;
+
+#ifdef __HIPCC__
+  int block_size = min(blockDim.x * blockDim.y * blockDim.z, warpSize);
+  value = WarpReduce(temp_storage).Sum(value, block_size);
+#else
   value = WarpReduce(temp_storage).Sum(value);
+#endif
+
   if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
 }
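The HIP branch of CudaAtomicAddWithWarp passes an explicit item count to WarpReduce::Sum. AMD wavefronts are 64 lanes wide, so a block smaller than warpSize would otherwise let the reduction read lanes that never held a value; clamping to the actual block size keeps the sum correct. A standalone sketch of the same idea (function name and float-only signature are illustrative):

    #ifdef __HIPCC__
    #include <hipcub/hipcub.hpp>
    namespace cub = hipcub;
    #else
    #include <cub/cub.cuh>
    #endif

    // Warp-level sum followed by one atomic per warp; on HIP only the lanes
    // that actually exist in the block are reduced (warpSize is 64 there).
    __device__ void WarpAtomicAddFloat(float *sum, float value) {
      typedef cub::WarpReduce<float> WarpReduce;
      typename WarpReduce::TempStorage temp_storage;
    #ifdef __HIPCC__
      int valid = min(static_cast<int>(blockDim.x * blockDim.y * blockDim.z),
                      warpSize);
      value = WarpReduce(temp_storage).Sum(value, valid);
    #else
      value = WarpReduce(temp_storage).Sum(value);  // full 32-lane warp
    #endif
      if (cub::LaneId() == 0) atomicAdd(sum, value);
    }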
