PaddlePaddle · HydrogenSulfate · May 15, 2025 · May 9, 2025 · May 12, 2025 · May 12, 2025
diff --git a/cmake/phi.cmake b/cmake/phi.cmake
@@ -110,6 +110,13 @@ function(kernel_declare TARGET_LIST)
           set(first_registry "")
         endif()
       endif()
+      # Kernels whose registration carries the xpufft marker require
+      # WITH_XPU_FFT; when building for XPU without it, drop the registration
+      # so that no declaration is generated for them.
+      if(WITH_XPU AND NOT WITH_XPU_FFT)
+        string(FIND "${first_registry}" "xpufft" pos)
+        # string(FIND) yields -1 when the substring is absent and a 0-based
+        # index otherwise, so test against -1 instead of an arbitrary offset.
+        if(NOT pos EQUAL -1)
+          set(first_registry "")
+        endif()
+      endif()
 
       if(NOT first_registry STREQUAL "")
         string(
@@ -141,6 +148,7 @@ function(kernel_declare TARGET_LIST)
         string(REPLACE "," ";" kernel_msg "${kernel_msg}")
         string(REGEX REPLACE "[ \\\t\r\n]+" "" kernel_msg "${kernel_msg}")
         string(REGEX REPLACE "//cuda_only" "" kernel_msg "${kernel_msg}")
+        # Remove the (already whitespace-stripped) xpufft marker comment, the
+        # same way the cuda_only marker is removed above, so it does not end up
+        # inside any element of the kernel_msg list read below.
+        string(REGEX REPLACE "//xpufft" "" kernel_msg "${kernel_msg}")
 
         list(GET kernel_msg 0 kernel_name)
         if(NOT is_all_backend STREQUAL "")

diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc
@@ -1822,6 +1822,15 @@ XPUOpMap& get_kl3_ops() {
                      phi::DataType::INT16,
                      phi::DataType::INT64,
                      phi::DataType::INT32})},
+#ifdef PADDLE_WITH_XPU_FFT
+      // Complex-number ops; these entries exist only when the build defines
+      // PADDLE_WITH_XPU_FFT.
+      // NOTE(review): the conj kernel registration in
+      // paddle/phi/kernels/xpu/complex_kernel.cc lists more dtypes than the
+      // single COMPLEX64 allowed here — confirm the restriction is intended.
+      {"conj", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"real", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"real_grad", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"imag", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"imag_grad", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"complex", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"complex_grad", XPUKernelSet({phi::DataType::FLOAT32})},
+#endif
   };
 
   return s_xpu3_kernels;

diff --git a/paddle/phi/kernels/xpu/complex_grad_kernel.cc b/paddle/phi/kernels/xpu/complex_grad_kernel.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_XPU_FFT
+#include "paddle/phi/kernels/complex_grad_kernel.h"
+
+#include "fft/cuComplex.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/type_traits.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/expand_grad_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+// Hand-written forward declarations of the routines the kernels below call;
+// presumably they are provided by the XFFT library — confirm.
+// NOTE(review): call sites pass cuFloatComplex* where float2* is declared, so
+// cuFloatComplex is presumably an alias of float2 in fft/cuComplex.h — confirm.
+namespace xfft_internal::xpu {
+// Presumably packs N (real, imag) float pairs into N complex values — confirm.
+int combine_as_complex(int N, float* real, float* imag, float2* out);
+// Presumably unpacks N complex values into separate real/imag arrays — confirm.
+int complex_spilt_float(int N, float2* in, float* real, float* imag);
+}  // namespace xfft_internal::xpu
+
+namespace phi {
+
+// Returns a freshly allocated tensor of the requested shape in which every
+// element equals `fill_value`.
+//
+//   ctx        - device context used for allocation and for the fill.
+//   shape      - dimensions of the tensor to create.
+//   fill_value - value written to every element.
+template <class T, class Context>
+static DenseTensor Fill(const Context& ctx,
+                        std::vector<int> shape,
+                        T fill_value) {
+  DenseTensor filled;
+  filled.Resize(common::make_ddim(shape));
+  ctx.template Alloc<T>(&filled);
+  funcs::SetConstant<Context, T> set_constant;
+  set_constant(ctx, &filled, fill_value);
+  return filled;
+}
+
+// Gradient of real(): builds dx by pairing dout (as the real part) with a
+// zero-filled imaginary part.
+//
+//   dout - incoming gradient, read as the real counterpart of T.
+//   dx   - output gradient of complex type T, allocated here.
+template <typename T, typename Context>
+void RealGradKernel(const Context& dev_ctx,
+                    const DenseTensor& dout,
+                    DenseTensor* dx) {
+  using RealT = phi::dtype::Real<T>;
+
+  auto element_count = dout.numel();
+  T* grad_ptr = dev_ctx.template Alloc<T>(
+      dx, static_cast<size_t>(element_count * sizeof(T)));
+
+  // Zero tensor with the same shape as dout, used as the imaginary part.
+  DenseTensor zero_imag = Fill<RealT, Context>(
+      dev_ctx, common::vectorize<int>(dout.dims()), RealT(0.0));
+
+  // The callee is declared with non-const pointers, hence the const_cast.
+  RealT* dout_ptr = const_cast<RealT*>(dout.data<RealT>());
+  RealT* zero_ptr = zero_imag.data<RealT>();
+
+  int status = xfft_internal::xpu::combine_as_complex(
+      element_count,
+      dout_ptr,
+      zero_ptr,
+      reinterpret_cast<cuFloatComplex*>(grad_ptr));
+  PADDLE_ENFORCE_XPU_SUCCESS(status);
+}
+
+// Gradient of imag(): builds dx by pairing a zero-filled real part with dout
+// (as the imaginary part).
+//
+//   dout - incoming gradient, read as the real counterpart of T.
+//   dx   - output gradient of complex type T, allocated here.
+template <typename T, typename Context>
+void ImagGradKernel(const Context& dev_ctx,
+                    const DenseTensor& dout,
+                    DenseTensor* dx) {
+  using RealT = phi::dtype::Real<T>;
+
+  auto element_count = dout.numel();
+  T* grad_ptr = dev_ctx.template Alloc<T>(
+      dx, static_cast<size_t>(element_count * sizeof(T)));
+
+  // Zero tensor with the same shape as dout, used as the real part.
+  DenseTensor zero_real = Fill<RealT, Context>(
+      dev_ctx, common::vectorize<int>(dout.dims()), RealT(0.0));
+
+  // The callee is declared with non-const pointers, hence the const_cast.
+  RealT* zero_ptr = zero_real.data<RealT>();
+  RealT* dout_ptr = const_cast<RealT*>(dout.data<RealT>());
+
+  int status = xfft_internal::xpu::combine_as_complex(
+      element_count,
+      zero_ptr,
+      dout_ptr,
+      reinterpret_cast<cuFloatComplex*>(grad_ptr));
+  PADDLE_ENFORCE_XPU_SUCCESS(status);
+}
+
+// Gradient of complex(x, y): splits dout into its real and imaginary parts and
+// routes them to dx and dy respectively.
+//
+//   x, y   - forward inputs; only their dims are inspected here, apart from
+//            being forwarded to ExpandGradKernel.
+//   dout   - incoming gradient, read as phi::dtype::complex<T>.
+//   dx, dy - output gradients; either may be null, in which case it is skipped.
+template <typename T, typename Context>
+void ComplexGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       const DenseTensor& dout,
+                       DenseTensor* dx,
+                       DenseTensor* dy) {
+  using C = phi::dtype::complex<T>;
+  auto element_count = dout.numel();
+
+  // Scratch tensors that receive the two halves of dout.
+  DenseTensor real_part;
+  DenseTensor imag_part;
+  real_part.Resize(dout.dims());
+  imag_part.Resize(dout.dims());
+  T* real_ptr = dev_ctx.template Alloc<T>(&real_part);
+  T* imag_ptr = dev_ctx.template Alloc<T>(&imag_part);
+
+  // The callee is declared with a non-const input pointer, hence the
+  // const_cast.
+  C* dout_ptr = const_cast<C*>(dout.data<C>());
+  int status = xfft_internal::xpu::complex_spilt_float(
+      element_count,
+      reinterpret_cast<cuFloatComplex*>(dout_ptr),
+      real_ptr,
+      imag_ptr);
+  PADDLE_ENFORCE_XPU_SUCCESS(status);
+
+  // Hands `part` to `grad`: shared directly when the operand has the same dims
+  // as dout, otherwise passed through ExpandGradKernel with the operand's
+  // shape.
+  auto assign_grad = [&dev_ctx, &dout](const DenseTensor& operand,
+                                       const DenseTensor& part,
+                                       DenseTensor* grad) {
+    if (!grad) {
+      return;
+    }
+    if (operand.dims() == dout.dims()) {
+      grad->ShareDataWith(part);
+      return;
+    }
+    ExpandGradKernel<T, Context>(
+        dev_ctx,
+        operand,
+        part,
+        phi::IntArray(phi::vectorize(operand.dims())),
+        grad);
+  };
+
+  assign_grad(x, real_part, dx);
+  assign_grad(y, imag_part, dy);
+}
+}  // namespace phi
+
+// XPU kernel registrations. The marker comment that follows each kernel name
+// is read by cmake/phi.cmake and must stay where it is.
+//
+// imag_grad is keyed on the complex dtype, while its input 0 (dout) is read
+// as the matching real dtype, so that input is re-typed below.
+PD_REGISTER_KERNEL(imag_grad,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ImagGradKernel,
+                   phi::dtype::complex<float>) {
+  kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
+}
+
+// real_grad follows the same scheme: complex key, real-typed input 0 (dout).
+PD_REGISTER_KERNEL(real_grad,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::RealGradKernel,
+                   phi::dtype::complex<float>) {
+  kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
+}
+
+// complex_grad is keyed on float, while its input 2 (dout, the third kernel
+// argument) is read as the matching complex dtype.
+PD_REGISTER_KERNEL(complex_grad,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ComplexGradKernel,
+                   float) {
+  kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
+}
+#endif  // PADDLE_WITH_XPU_FFT
diff --git a/paddle/phi/kernels/xpu/complex_kernel.cc b/paddle/phi/kernels/xpu/complex_kernel.cc
@@ -0,0 +1,171 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_XPU_FFT
+#include "paddle/phi/kernels/complex_kernel.h"
+
+#include "fft/cuComplex.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/type_traits.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/expand_kernel.h"
+#include "paddle/phi/kernels/funcs/common_infer_shape_functions.h"
+#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h"
+
+// Hand-written forward declarations of the routines the kernels below call;
+// presumably they are provided by the XFFT library — confirm.
+// NOTE(review): call sites pass cuFloatComplex* where float2* is declared, so
+// cuFloatComplex is presumably an alias of float2 in fft/cuComplex.h — confirm.
+namespace xfft_internal::xpu {
+// Presumably packs N (real, imag) float pairs into N complex values — confirm.
+int combine_as_complex(int N, float* real, float* imag, float2* out);
+// Presumably unpacks N complex values into separate real/imag arrays — confirm.
+int complex_spilt_float(int N, float2* in, float* real, float* imag);
+// Presumably writes the complex conjugate of each of the N inputs — confirm.
+int Conj(int N, float2* input, float2* output);
+}  // namespace xfft_internal::xpu
+
+namespace phi {
+// Conjugate kernel for XPU.
+//
+// For phi::dtype::complex<float> the work is delegated to the Conj routine
+// declared above. For every other instantiated type the input is copied to
+// `out` unchanged.
+//
+//   x   - input tensor with element type T.
+//   out - output tensor, allocated here with element type T.
+template <typename T, typename Context>
+void ConjKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  // The condition is a compile-time constant, so select the branch with
+  // `if constexpr`: each instantiation then only compiles the code path that
+  // applies to its T (no cuFloatComplex reinterpret_cast for plain types, no
+  // XPUCopyTypeTrait lookup for the complex type).
+  if constexpr (std::is_same<T, phi::dtype::complex<float>>::value) {
+    // The callee is declared with a non-const input pointer, hence the
+    // const_cast.
+    int r = xfft_internal::xpu::Conj(
+        x.numel(),
+        reinterpret_cast<cuFloatComplex*>(const_cast<T*>(x.data<T>())),
+        reinterpret_cast<cuFloatComplex*>(out->data<T>()));
+    PADDLE_ENFORCE_XPU_SUCCESS(r);
+  } else {
+    using XPUType = typename XPUCopyTypeTrait<T>::Type;
+    const auto* input_data = x.data<T>();
+    int r = xpu::copy<XPUType>(dev_ctx.x_context(),
+                               reinterpret_cast<const XPUType*>(input_data),
+                               reinterpret_cast<XPUType*>(out->data<T>()),
+                               x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+  }
+}
+
+// Extracts the real part of a complex tensor.
+//
+//   x   - input tensor with complex element type T.
+//   out - output tensor, allocated here with the real counterpart of T.
+template <typename T, typename Context>
+void RealKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                DenseTensor* out) {
+  using RealT = phi::dtype::Real<T>;
+
+  RealT* real_ptr = dev_ctx.template Alloc<RealT>(out);
+
+  // The split routine always writes both halves, so a throw-away buffer is
+  // needed for the imaginary part. This allocation is redundant and could be
+  // optimized away.
+  phi::DenseTensor discarded_imag;
+  discarded_imag.Resize(x.dims());
+  RealT* imag_ptr = dev_ctx.template Alloc<RealT>(&discarded_imag);
+
+  // The callee is declared with a non-const input pointer, hence the
+  // const_cast.
+  T* x_ptr = const_cast<T*>(x.data<T>());
+  int status = xfft_internal::xpu::complex_spilt_float(
+      out->numel(), reinterpret_cast<cuFloatComplex*>(x_ptr), real_ptr, imag_ptr);
+  PADDLE_ENFORCE_XPU_SUCCESS(status);
+}
+
+// Extracts the imaginary part of a complex tensor.
+//
+//   x   - input tensor with complex element type T.
+//   out - output tensor, allocated here with the real counterpart of T.
+template <typename T, typename Context>
+void ImagKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                DenseTensor* out) {
+  using RealT = phi::dtype::Real<T>;
+
+  RealT* imag_ptr = dev_ctx.template Alloc<RealT>(out);
+
+  // The split routine always writes both halves, so a throw-away buffer is
+  // needed for the real part. This allocation is redundant and could be
+  // optimized away.
+  phi::DenseTensor discarded_real;
+  discarded_real.Resize(x.dims());
+  RealT* real_ptr = dev_ctx.template Alloc<RealT>(&discarded_real);
+
+  // The callee is declared with a non-const input pointer, hence the
+  // const_cast.
+  T* x_ptr = const_cast<T*>(x.data<T>());
+  int status = xfft_internal::xpu::complex_spilt_float(
+      out->numel(), reinterpret_cast<cuFloatComplex*>(x_ptr), real_ptr, imag_ptr);
+  PADDLE_ENFORCE_XPU_SUCCESS(status);
+}
+
+// Builds a complex tensor from a real part `x` and an imaginary part `y`.
+//
+// The output dims are BroadcastTwoDims(x.dims(), y.dims()). An input whose
+// dims already equal the output dims is used in place; otherwise it is first
+// expanded into a temporary tensor of the output shape.
+//
+//   x, y - inputs with element type T.
+//   out  - output tensor, allocated here as phi::dtype::complex<T>.
+template <typename T, typename Context>
+void ComplexKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& y,
+                   DenseTensor* out) {
+  using C = phi::dtype::complex<T>;
+
+  auto lhs_dims = x.dims();
+  auto rhs_dims = y.dims();
+  auto out_dims = phi::funcs::BroadcastTwoDims(lhs_dims, rhs_dims);
+  std::vector<int64_t> out_shape = phi::vectorize(out_dims);
+
+  // Returns a pointer to `src` data laid out with the output dims, expanding
+  // into `scratch` only when the dims differ.
+  auto as_out_shape = [&](const DenseTensor& src, DenseTensor* scratch) -> T* {
+    if (src.dims() == out_dims) {
+      // The callee below is declared with non-const pointers, hence the
+      // const_cast.
+      return const_cast<T*>(src.data<T>());
+    }
+    scratch->Resize(out_dims);
+    dev_ctx.template Alloc<T>(scratch);
+    ExpandKernel<T, Context>(dev_ctx, src, phi::IntArray(out_shape), scratch);
+    return scratch->data<T>();
+  };
+
+  // The temporaries must outlive the combine call that reads from them.
+  DenseTensor expanded_x;
+  DenseTensor expanded_y;
+  T* x_data = as_out_shape(x, &expanded_x);
+  T* y_data = as_out_shape(y, &expanded_y);
+
+  C* out_ptr = dev_ctx.template Alloc<C>(out);
+  int status = xfft_internal::xpu::combine_as_complex(
+      out->numel(), x_data, y_data, reinterpret_cast<cuFloatComplex*>(out_ptr));
+  PADDLE_ENFORCE_XPU_SUCCESS(status);
+}
+}  // namespace phi
+
+// XPU kernel registrations. The marker comment that follows each kernel name
+// is read by cmake/phi.cmake and must stay where it is.
+//
+// conj is registered for the dtypes listed below and needs no extra setup.
+PD_REGISTER_KERNEL(conj,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ConjKernel,
+                   bool,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>) {}
+
+// real is keyed on the complex dtype, while output 0 is written as the
+// matching real dtype, so that output is re-typed below.
+PD_REGISTER_KERNEL(real,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::RealKernel,
+                   phi::dtype::complex<float>) {
+  kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
+}
+
+// imag follows the same scheme: complex key, real-typed output 0.
+PD_REGISTER_KERNEL(imag,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ImagKernel,
+                   phi::dtype::complex<float>) {
+  kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
+}
+
+// complex is keyed on float, while output 0 is written as the matching
+// complex dtype.
+PD_REGISTER_KERNEL(complex,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ComplexKernel,
+                   float) {
+  kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
+}
+#endif  // PADDLE_WITH_XPU_FFT