Skip to content

Commit c6d6f9f

Browse files
committed
[XPU] update xhpc to improve performance of strided_copy, update interface of prelu and rsqrt
1 parent 988669d commit c6d6f9f

File tree

8 files changed

+68
-114
lines changed

8 files changed

+68
-114
lines changed

cmake/external/xpu.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ set(XPU_FFT_LIB_NAME "libcufft.so")
3333
add_compile_definitions(XPUAPI_NOT_INCLUDE_DEPRECATED)
3434

3535
if(NOT DEFINED XPU_XHPC_BASE_DATE)
36-
set(XPU_XHPC_BASE_DATE "dev/20250417")
36+
set(XPU_XHPC_BASE_DATE "dev/20250520")
3737
endif()
3838
set(XPU_XCCL_BASE_VERSION "3.0.2.5") # For XRE5
3939
if(NOT DEFINED XPU_XFT_BASE_VERSION)

paddle/phi/kernels/xpu/activation_grad_kernel.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -606,8 +606,19 @@ struct XPURsqrtGradFunctor : public funcs::BaseActivationFunctor<T> {
606606
const DenseTensor* out,
607607
const DenseTensor* dout,
608608
DenseTensor* dx) const {
609-
int r = xpu_activation_backward<Context, T, XPUType>(
610-
dev_ctx, x, out, dout, dx, xpu::rsqrt_grad<XPUType>);
609+
dev_ctx.template Alloc<T>(dx);
610+
const XPUType* out_data = nullptr;
611+
const XPUType* dout_data = nullptr;
612+
if (out != nullptr) {
613+
out_data = reinterpret_cast<const XPUType*>(out->data<T>());
614+
}
615+
if (dout != nullptr) {
616+
dout_data = reinterpret_cast<const XPUType*>(dout->data<T>());
617+
}
618+
XPUType* dx_data = reinterpret_cast<XPUType*>(dx->data<T>());
619+
620+
int r = xpu::rsqrt_grad(
621+
dev_ctx.x_context(), out_data, dout_data, dx_data, dx->numel());
611622
PADDLE_ENFORCE_XDNN_SUCCESS(r, "rsqrt_grad");
612623
}
613624
};

paddle/phi/kernels/xpu/flash_attn_utils.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,3 @@ static void GenerateRNGState(
8484
}
8585
}
8686
} // namespace phi
87-
#

paddle/phi/kernels/xpu/p_recv_kernel.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ void PRecvKernel(const Context& dev_ctx,
5353
#else
5454
PADDLE_THROW(common::errors::PreconditionNotMet(
5555
"PaddlePaddle is not compiled with DWITH_XPU_BKCL, please recompile with "
56-
"DWITH_XPU_BKCL for using p_recv_kernel."));
56+
"DWITH_XPU_BKCL for using p_recv kernel."));
5757
#endif
5858
}
5959

@@ -80,7 +80,7 @@ void PRecvArrayKernel(const Context& dev_ctx,
8080
#else
8181
PADDLE_THROW(common::errors::PreconditionNotMet(
8282
"PaddlePaddle is not compiled with DWITH_XPU_BKCL, please recompile with "
83-
"DWITH_XPU_BKCL for using p_recv_kernel."));
83+
"DWITH_XPU_BKCL for using p_recv_array kernel."));
8484
#endif
8585
}
8686

paddle/phi/kernels/xpu/p_send_kernel.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ void PSendKernel(const Context& dev_ctx,
4545
#else
4646
PADDLE_THROW(common::errors::PreconditionNotMet(
4747
"PaddlePaddle is not compiled with DWITH_XPU_BKCL, please recompile with "
48-
"DWITH_XPU_BKCL for using p_send_kernel."));
48+
"DWITH_XPU_BKCL for using p_send kernel."));
4949
#endif
5050
}
5151

@@ -68,7 +68,7 @@ void PSendArrayKernel(const Context& dev_ctx,
6868
#else
6969
PADDLE_THROW(common::errors::PreconditionNotMet(
7070
"PaddlePaddle is not compiled with DWITH_XPU_BKCL, please recompile with "
71-
"DWITH_XPU_BKCL for using p_send_kernel."));
71+
"DWITH_XPU_BKCL for using p_send_array kernel."));
7272
#endif
7373
}
7474

paddle/phi/kernels/xpu/prelu_grad_kernel.cc

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -38,48 +38,48 @@ void PReluGradKernel(const Context& dev_ctx,
3838

3939
auto x_dim = x.dims();
4040
auto x_rank = x_dim.size();
41-
4241
std::vector<int64_t> x_shape(x_rank);
4342
if (x_rank == 0) {
4443
x_shape = std::vector<int64_t>({1});
4544
} else {
46-
for (int i = 0; i < x_rank; i++) {
47-
x_shape[i] = x_dim[i];
48-
}
45+
x_shape = common::vectorize<int64_t>(x_dim);
4946
}
5047

51-
// mode = 0: channel_nchw, slope_shape = {c}, default. meanwhile, xshape = {n,
52-
// c, h, w}
53-
// mode = 1, channel_nhwc, slope_shape = {c}, meanwhile, xshape = {n, h, w, c}
54-
// mode = 2, elementwise, slope_shape = {c*h*w}
55-
// mode = 3, single slope, slope_shape = {1}
48+
// mode = 0: channel_nchw, xshape = {n, c, h, w}, alpha_shape = {c}
49+
// mode = 1, channel_nhwc, xshape = {n, h, w, c}, alpha_shape = {c}
50+
// mode = 2, elementwise, deprecated in Paddle 2.x
51+
// mode = 3, alpha_shape = {} or {1}
5652

5753
int xpu_mode = 0;
5854

5955
if (mode == "channel") {
6056
if (data_format == "NCHW") {
6157
xpu_mode = 0;
62-
} else {
63-
// NHWC
58+
if (x_rank == 2) { // special case for NC shape, use channel last mode
59+
xpu_mode == 1;
60+
}
61+
} else { // NHWC, channel last
6462
xpu_mode = 1;
6563
}
6664
} else if (mode == "element") {
6765
xpu_mode = 2;
68-
} else {
66+
} else if (mode == "all") {
6967
xpu_mode = 3;
68+
} else {
69+
PADDLE_THROW(common::errors::InvalidArgument(
70+
"Expected mode of prelu kernel is 'channel' or 'all', But got "
71+
"unsupported mode: %s.",
72+
mode));
7073
}
7174

72-
int r = xpu::prelu_grad(
73-
dev_ctx.x_context(),
74-
reinterpret_cast<const XPUType*>(x_ptr),
75-
reinterpret_cast<const XPUType*>(
76-
out_grad_ptr), /* const T* y, not used in xpu kernel */
77-
reinterpret_cast<const XPUType*>(alpha_ptr),
78-
reinterpret_cast<const XPUType*>(out_grad_ptr),
79-
reinterpret_cast<XPUType*>(x_grad_ptr),
80-
reinterpret_cast<XPUType*>(alpha_grad_ptr),
81-
x_shape,
82-
xpu_mode);
75+
int r = xpu::prelu_grad(dev_ctx.x_context(),
76+
reinterpret_cast<const XPUType*>(x_ptr),
77+
reinterpret_cast<const XPUType*>(alpha_ptr),
78+
reinterpret_cast<const XPUType*>(out_grad_ptr),
79+
reinterpret_cast<XPUType*>(x_grad_ptr),
80+
reinterpret_cast<XPUType*>(alpha_grad_ptr),
81+
x_shape,
82+
xpu_mode);
8383

8484
PADDLE_ENFORCE_XDNN_SUCCESS(r, "prelu_grad");
8585
}

paddle/phi/kernels/xpu/prelu_kernel.cc

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -35,33 +35,45 @@ void PReluKernel(const Context& dev_ctx,
3535
auto x_dim = x.dims();
3636
auto x_rank = x_dim.size();
3737
std::vector<int64_t> x_shape(x_rank);
38-
3938
if (x_rank == 0) {
4039
x_shape = std::vector<int64_t>({1});
4140
} else {
42-
for (int i = 0; i < x_rank; i++) {
43-
x_shape[i] = x_dim[i];
44-
}
41+
x_shape = common::vectorize<int64_t>(x_dim);
4542
}
4643

47-
auto alpha_dim = alpha.dims();
48-
auto alpha_rank = alpha_dim.size();
49-
std::vector<int64_t> alpha_shape(x_rank, 1); // same size with x_shape
44+
// mode = 0: channel_nchw, xshape = {n, c, h, w}, alpha_shape = {c}
45+
// mode = 1, channel_nhwc, xshape = {n, h, w, c}, alpha_shape = {c}
46+
// mode = 2, elementwise, deprecated in Paddle 2.x
47+
// mode = 3, alpha_shape = {} or {1}
5048

51-
if (x_rank == 0) {
52-
alpha_shape = std::vector<int64_t>({1});
53-
} else {
54-
for (int i = 0; i < alpha_rank; i++) {
55-
alpha_shape[i] = alpha_dim[i];
49+
int xpu_mode = 0;
50+
51+
if (mode == "channel") {
52+
if (data_format == "NCHW") {
53+
xpu_mode = 0;
54+
if (x_rank == 2) { // special case for NC shape, use channel last mode
55+
xpu_mode == 1;
56+
}
57+
} else { // NHWC, channel last
58+
xpu_mode = 1;
5659
}
60+
} else if (mode == "element") {
61+
xpu_mode = 2;
62+
} else if (mode == "all") {
63+
xpu_mode = 3;
64+
} else {
65+
PADDLE_THROW(common::errors::InvalidArgument(
66+
"Expected mode of prelu kernel is 'channel' or 'all', But got "
67+
"unsupported mode: %s.",
68+
mode));
5769
}
5870

5971
int r = xpu::prelu(dev_ctx.x_context(),
6072
reinterpret_cast<const XPUType*>(x_ptr),
6173
reinterpret_cast<const XPUType*>(alpha_ptr),
6274
reinterpret_cast<XPUType*>(y_ptr),
6375
x_shape,
64-
alpha_shape);
76+
xpu_mode);
6577

6678
PADDLE_ENFORCE_XDNN_SUCCESS(r, "prelu");
6779
}

paddle/phi/kernels/xpu/strided_copy_kernel.cc

Lines changed: 2 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,6 @@ void StridedCopyKernel(const Context& dev_ctx,
5454
"StridedCopyKernel's out tensor must complete "
5555
"mutable data before call kernel."));
5656

57-
// The following XPU operators have performance issues and are temporarily
58-
// disabled. A temporary workaround has been implemented: "First copy data to
59-
// CPU, perform computation using CPU operator logic, then copy results back
60-
// to XPU".
61-
/*
6257
// use XPUCopyTypeTrait to deal with double and int16_t copy instead of
6358
// XPUTypeTrait
6459
using XPUType = typename XPUCopyTypeTrait<T>::Type;
@@ -74,80 +69,17 @@ void StridedCopyKernel(const Context& dev_ctx,
7469
r = xpu::copy<XPUType>(dev_ctx.x_context(), input_data, output_data, 1);
7570
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
7671
} else {
72+
int64_t data_size = input.Holder()->size() - input.meta().offset;
7773
r = xpu::strided_copy<XPUType>(dev_ctx.x_context(),
7874
input_data,
7975
output_data,
76+
data_size,
8077
common::vectorize<int64_t>(input.dims()),
8178
common::vectorize<int64_t>(out->dims()),
8279
common::vectorize<int64_t>(input.strides()),
8380
common::vectorize<int64_t>(out->strides()));
8481
PADDLE_ENFORCE_XDNN_SUCCESS(r, "strided_copy");
8582
}
86-
*/
87-
88-
// wait before copy
89-
dev_ctx.Wait();
90-
91-
// CPU buffer for input
92-
char* input_on_cpu = new char[input.Holder()->size()];
93-
memory_utils::Copy(CPUPlace(),
94-
static_cast<void*>(input_on_cpu),
95-
dev_ctx.GetPlace(),
96-
static_cast<const void*>(input.Holder()->ptr()),
97-
input.Holder()->size());
98-
99-
// CPU buffer for out
100-
char* output_on_cpu = new char[out->Holder()->size()];
101-
memory_utils::Copy(CPUPlace(),
102-
static_cast<void*>(output_on_cpu),
103-
dev_ctx.GetPlace(),
104-
static_cast<const void*>(out->Holder()->ptr()),
105-
out->Holder()->size());
106-
107-
// wait after copy
108-
dev_ctx.Wait();
109-
110-
// follow paddle/phi/kernels/cpu/strided_copy_kernel.cc
111-
const T* input_data =
112-
reinterpret_cast<T*>(input_on_cpu + input.meta().offset);
113-
int input_rank = input.dims().size();
114-
const int64_t* input_dims = input.dims().Get();
115-
const int64_t* input_stride = input.strides().Get();
116-
117-
T* output_data = reinterpret_cast<T*>(output_on_cpu + offset);
118-
int output_rank = meta.dims.size();
119-
const int64_t* output_dims = meta.dims.Get();
120-
const int64_t* output_stride = meta.strides.Get();
121-
122-
auto numel = input.numel();
123-
124-
for (int64_t i = 0; i < numel; i++) {
125-
int64_t input_offset = 0;
126-
int64_t index_tmp = i;
127-
for (int dim = input_rank - 1; dim >= 0; --dim) {
128-
input_offset += (index_tmp % input_dims[dim]) * input_stride[dim];
129-
index_tmp = index_tmp / input_dims[dim];
130-
}
131-
int64_t output_offset = 0;
132-
index_tmp = i;
133-
for (int dim = output_rank - 1; dim >= 0; --dim) {
134-
output_offset += (index_tmp % output_dims[dim]) * output_stride[dim];
135-
index_tmp = index_tmp / output_dims[dim];
136-
}
137-
output_data[output_offset] = input_data[input_offset];
138-
}
139-
140-
// copy out tensor, from cpu to xpu
141-
memory_utils::Copy(dev_ctx.GetPlace(),
142-
static_cast<void*>(out->Holder()->ptr()),
143-
CPUPlace(),
144-
static_cast<const void*>(output_on_cpu),
145-
out->Holder()->size());
146-
// wait after copy
147-
dev_ctx.Wait();
148-
149-
delete[] input_on_cpu;
150-
delete[] output_on_cpu;
15183
}
15284

15385
} // namespace phi

0 commit comments

Comments
 (0)