Skip to content

Commit ec5e4df

Browse files
authored
[XPU]Support complex for multiply (#72982)
* add multiply complex * add 0size * fix test
1 parent 213e076 commit ec5e4df

File tree

4 files changed

+345
-4
lines changed

4 files changed

+345
-4
lines changed

paddle/phi/backends/xpu/xpu3_op_list.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,11 +460,17 @@ XPUOpMap& get_kl3_ops() {
460460
{"elementwise_mul_grad",
461461
XPUKernelSet({phi::DataType::FLOAT32,
462462
phi::DataType::FLOAT16,
463+
#ifdef PADDLE_WITH_XPU_FFT
464+
phi::DataType::COMPLEX64,
465+
#endif
463466
phi::DataType::BFLOAT16})},
464467
{"elementwise_mul",
465468
XPUKernelSet({phi::DataType::FLOAT32,
466469
phi::DataType::FLOAT16,
467470
phi::DataType::BFLOAT16,
471+
#ifdef PADDLE_WITH_XPU_FFT
472+
phi::DataType::COMPLEX64,
473+
#endif
468474
phi::DataType::INT32,
469475
phi::DataType::INT64})},
470476
{"elementwise_pow",

paddle/phi/kernels/xpu/elementwise_multiply_grad_kernel.cc

Lines changed: 137 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@
1919

2020
#include "paddle/phi/backends/xpu/xpu_context.h"
2121
#include "paddle/phi/core/kernel_registry.h"
22+
#include "paddle/phi/kernels/complex_kernel.h"
23+
#include "paddle/phi/kernels/elementwise_add_kernel.h"
24+
#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
25+
#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
26+
#include "paddle/phi/kernels/expand_grad_kernel.h"
27+
#include "paddle/phi/kernels/full_kernel.h"
2228
#include "paddle/phi/kernels/funcs/elementwise_base.h"
2329
#include "paddle/phi/kernels/xpu/elementwise.h"
2430

@@ -33,6 +39,25 @@ void MultiplyGradKernel(const Context& dev_ctx,
3339
DenseTensor* dx,
3440
DenseTensor* dy) {
3541
using XPUType = typename XPUTypeTrait<T>::Type;
42+
if (dout.numel() == 0) {
43+
if (dx) {
44+
if (dx->numel() == 0) {
45+
dev_ctx.template Alloc<T>(dx);
46+
} else {
47+
phi::Full<T, Context>(
48+
dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx);
49+
}
50+
}
51+
if (dy) {
52+
if (dy->numel() == 0) {
53+
dev_ctx.template Alloc<T>(dy);
54+
} else {
55+
phi::Full<T, Context>(
56+
dev_ctx, phi::IntArray(common::vectorize(dy->dims())), 0, dy);
57+
}
58+
}
59+
return;
60+
}
3661
funcs::ElementwiseGradPreProcess(dout, dx);
3762
auto f = [](xpu::Context* ctx,
3863
const XPUType* x,
@@ -50,6 +75,113 @@ void MultiplyGradKernel(const Context& dev_ctx,
5075
XPUElementwiseGrad<T, XPUType>(dev_ctx, x, y, dout, axis, dx, dy, f, true);
5176
}
5277

78+
#ifdef PADDLE_WITH_XPU_FFT
79+
template <>
80+
void MultiplyGradKernel<phi::dtype::complex<float>, XPUContext>(
81+
const XPUContext& dev_ctx,
82+
const DenseTensor& x,
83+
const DenseTensor& y,
84+
const DenseTensor& dout,
85+
int axis,
86+
DenseTensor* dx,
87+
DenseTensor* dy) {
88+
using T = phi::dtype::complex<float>;
89+
if (dout.numel() == 0) {
90+
if (dx) {
91+
if (dx->numel() == 0) {
92+
dev_ctx.template Alloc<T>(dx);
93+
} else {
94+
phi::Full<T, XPUContext>(
95+
dev_ctx, phi::IntArray(common::vectorize(dx->dims())), T(0), dx);
96+
}
97+
}
98+
if (dy) {
99+
if (dy->numel() == 0) {
100+
dev_ctx.template Alloc<T>(dy);
101+
} else {
102+
phi::Full<T, XPUContext>(
103+
dev_ctx, phi::IntArray(common::vectorize(dy->dims())), T(0), dy);
104+
}
105+
}
106+
return;
107+
}
108+
funcs::ElementwiseGradPreProcess(dout, dx);
109+
// The current complex number implementation uses separate real/imaginary
110+
// parts, resulting in redundant operations and performance
111+
// penalties. Optimization should address this in future iterations.
112+
DenseTensor dout_real = Real<T, XPUContext>(dev_ctx, dout);
113+
DenseTensor dout_imag = Imag<T, XPUContext>(dev_ctx, dout);
114+
115+
if (dx) {
116+
DenseTensor y_real = Real<T, XPUContext>(dev_ctx, y);
117+
DenseTensor y_imag = Imag<T, XPUContext>(dev_ctx, y);
118+
DenseTensor dx_real = Add<float, XPUContext>(
119+
dev_ctx,
120+
Multiply<float, XPUContext>(dev_ctx, dout_real, y_real),
121+
Multiply<float, XPUContext>(dev_ctx, dout_imag, y_imag));
122+
DenseTensor dx_imag = Subtract<float, XPUContext>(
123+
dev_ctx,
124+
Multiply<float, XPUContext>(dev_ctx, dout_imag, y_real),
125+
Multiply<float, XPUContext>(dev_ctx, dout_real, y_imag));
126+
dev_ctx.template Alloc<T>(dx);
127+
if (x.dims() == dout.dims()) {
128+
phi::ComplexKernel<float>(dev_ctx, dx_real, dx_imag, dx);
129+
} else {
130+
DenseTensor dx_real_expanded, dx_imag_expanded;
131+
dx_real_expanded.Resize(dx->dims());
132+
dx_imag_expanded.Resize(dx->dims());
133+
ExpandGradKernel<float, XPUContext>(
134+
dev_ctx,
135+
x,
136+
dx_real,
137+
phi::IntArray(phi::vectorize(x.dims())),
138+
&dx_real_expanded);
139+
ExpandGradKernel<float, XPUContext>(
140+
dev_ctx,
141+
x,
142+
dx_imag,
143+
phi::IntArray(phi::vectorize(x.dims())),
144+
&dx_imag_expanded);
145+
phi::ComplexKernel<float>(
146+
dev_ctx, dx_real_expanded, dx_imag_expanded, dx);
147+
}
148+
}
149+
if (dy) {
150+
DenseTensor x_real = Real<T, XPUContext>(dev_ctx, x);
151+
DenseTensor x_imag = Imag<T, XPUContext>(dev_ctx, x);
152+
DenseTensor dy_real = Add<float, XPUContext>(
153+
dev_ctx,
154+
Multiply<float, XPUContext>(dev_ctx, dout_real, x_real),
155+
Multiply<float, XPUContext>(dev_ctx, dout_imag, x_imag));
156+
DenseTensor dy_imag = Subtract<float, XPUContext>(
157+
dev_ctx,
158+
Multiply<float, XPUContext>(dev_ctx, dout_imag, x_real),
159+
Multiply<float, XPUContext>(dev_ctx, dout_real, x_imag));
160+
dev_ctx.template Alloc<T>(dy);
161+
if (y.dims() == dout.dims()) {
162+
phi::ComplexKernel<float>(dev_ctx, dy_real, dy_imag, dy);
163+
} else {
164+
DenseTensor dy_real_expanded, dy_imag_expanded;
165+
dy_real_expanded.Resize(dy->dims());
166+
dy_imag_expanded.Resize(dy->dims());
167+
ExpandGradKernel<float, XPUContext>(
168+
dev_ctx,
169+
y,
170+
dy_real,
171+
phi::IntArray(phi::vectorize(y.dims())),
172+
&dy_real_expanded);
173+
ExpandGradKernel<float, XPUContext>(
174+
dev_ctx,
175+
y,
176+
dy_imag,
177+
phi::IntArray(phi::vectorize(y.dims())),
178+
&dy_imag_expanded);
179+
phi::ComplexKernel<float>(
180+
dev_ctx, dy_real_expanded, dy_imag_expanded, dy);
181+
}
182+
}
183+
}
184+
#endif
53185
} // namespace phi
54186

55187
PD_REGISTER_KERNEL(multiply_grad,
@@ -58,4 +190,8 @@ PD_REGISTER_KERNEL(multiply_grad,
58190
phi::MultiplyGradKernel,
59191
phi::dtype::float16,
60192
phi::dtype::bfloat16,
61-
float) {}
193+
#ifdef PADDLE_WITH_XPU_FFT
194+
phi::dtype::complex<float>,
195+
#endif
196+
float) {
197+
}

paddle/phi/kernels/xpu/elementwise_multiply_kernel.cc

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919

2020
#include "paddle/phi/backends/xpu/xpu_context.h"
2121
#include "paddle/phi/core/kernel_registry.h"
22+
#include "paddle/phi/kernels/complex_kernel.h"
23+
#include "paddle/phi/kernels/elementwise_add_kernel.h"
24+
#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
2225
#include "paddle/phi/kernels/funcs/elementwise_base.h"
2326
#include "paddle/phi/kernels/xpu/elementwise.h"
2427

@@ -30,6 +33,10 @@ void MultiplyKernel(const Context& dev_ctx,
3033
const DenseTensor& y,
3134
DenseTensor* out) {
3235
using XPUType = typename XPUTypeTrait<T>::Type;
36+
if (out->numel() == 0) {
37+
dev_ctx.template Alloc<T>(out);
38+
return;
39+
}
3340
auto f = [](xpu::Context* ctx,
3441
const XPUType* x,
3542
const XPUType* y,
@@ -42,6 +49,37 @@ void MultiplyKernel(const Context& dev_ctx,
4249
XPUElementwise<T, XPUType>(dev_ctx, x, y, -1, out, f);
4350
}
4451

52+
#ifdef PADDLE_WITH_XPU_FFT
53+
template <>
54+
void MultiplyKernel<phi::dtype::complex<float>, XPUContext>(
55+
const XPUContext& dev_ctx,
56+
const DenseTensor& x,
57+
const DenseTensor& y,
58+
DenseTensor* out) {
59+
using T = phi::dtype::complex<float>;
60+
if (out->numel() == 0) {
61+
dev_ctx.template Alloc<T>(out);
62+
return;
63+
}
64+
// The current complex number implementation uses separate real/imaginary
65+
// parts, resulting in redundant operations and performance
66+
// penalties. Optimization should address this in future iterations.
67+
const DenseTensor x_real = Real<T, XPUContext>(dev_ctx, x);
68+
const DenseTensor x_imag = Imag<T, XPUContext>(dev_ctx, x);
69+
const DenseTensor y_real = Real<T, XPUContext>(dev_ctx, y);
70+
const DenseTensor y_imag = Imag<T, XPUContext>(dev_ctx, y);
71+
DenseTensor real_out = Subtract<float, XPUContext>(
72+
dev_ctx,
73+
Multiply<float, XPUContext>(dev_ctx, x_real, y_real),
74+
Multiply<float, XPUContext>(dev_ctx, x_imag, y_imag));
75+
DenseTensor imag_out = Add<float, XPUContext>(
76+
dev_ctx,
77+
Multiply<float, XPUContext>(dev_ctx, x_real, y_imag),
78+
Multiply<float, XPUContext>(dev_ctx, x_imag, y_real));
79+
phi::ComplexKernel<float>(dev_ctx, real_out, imag_out, out);
80+
}
81+
#endif
82+
4583
} // namespace phi
4684

4785
PD_REGISTER_KERNEL(multiply,
@@ -50,6 +88,10 @@ PD_REGISTER_KERNEL(multiply,
5088
phi::MultiplyKernel,
5189
phi::dtype::float16,
5290
phi::dtype::bfloat16,
91+
#ifdef PADDLE_WITH_XPU_FFT
92+
phi::dtype::complex<float>,
93+
#endif
5394
float,
5495
int,
55-
int64_t) {}
96+
int64_t) {
97+
}

0 commit comments

Comments
 (0)