PaddlePaddle · HydrogenSulfate · May 15, 2025 · May 9, 2025 · May 12, 2025 · May 12, 2025
diff --git a/cmake/phi.cmake b/cmake/phi.cmake
@@ -110,6 +110,13 @@ function(kernel_declare TARGET_LIST)
           set(first_registry "")
         endif()
       endif()
+      # Ignore "xpufft"-tagged registrations unless WITH_XPU_FFT is enabled.
+      if(WITH_XPU AND NOT WITH_XPU_FFT)
+        string(FIND "${first_registry}" "xpufft" pos)
+        if(pos GREATER -1) # string(FIND) yields -1 when the tag is absent
+          set(first_registry "")
+        endif()
+      endif()
 
       if(NOT first_registry STREQUAL "")
         string(
@@ -141,6 +148,7 @@ function(kernel_declare TARGET_LIST)
         string(REPLACE "," ";" kernel_msg "${kernel_msg}")
         string(REGEX REPLACE "[ \\\t\r\n]+" "" kernel_msg "${kernel_msg}")
         string(REGEX REPLACE "//cuda_only" "" kernel_msg "${kernel_msg}")
+        string(REGEX REPLACE "//xpufft" "" kernel_msg "${kernel_msg}")
 
         list(GET kernel_msg 0 kernel_name)
         if(NOT is_all_backend STREQUAL "")

diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc
@@ -298,8 +298,6 @@ XPUOpMap& get_kl3_ops() {
        XPUKernelSet({phi::DataType::FLOAT32,
                      phi::DataType::FLOAT16,
                      phi::DataType::BFLOAT16})},
-      {"complex", XPUKernelSet({phi::DataType::FLOAT32})},
-      {"complex_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"concat_grad",
        XPUKernelSet({phi::DataType::FLOAT32,
                      phi::DataType::FLOAT16,
@@ -803,8 +801,6 @@ XPUOpMap& get_kl3_ops() {
       {"huber_loss", XPUKernelSet({phi::DataType::FLOAT32})},
       {"kldiv_loss", XPUKernelSet({phi::DataType::FLOAT32})},
       {"kldiv_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})},
-      {"imag", XPUKernelSet({phi::DataType::COMPLEX64})},
-      {"imag_grad", XPUKernelSet({phi::DataType::COMPLEX64})},
       {"increment",
        XPUKernelSet({phi::DataType::FLOAT32,
                      phi::DataType::INT32,
@@ -1102,8 +1098,6 @@ XPUOpMap& get_kl3_ops() {
                      phi::DataType::INT64,
                      phi::DataType::FLOAT32,
                      phi::DataType::FLOAT64})},
-      {"real", XPUKernelSet({phi::DataType::COMPLEX64})},
-      {"real_grad", XPUKernelSet({phi::DataType::COMPLEX64})},
       {"reciprocal", XPUKernelSet({phi::DataType::FLOAT32})},
       {"reciprocal_grad",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
@@ -1828,6 +1822,15 @@ XPUOpMap& get_kl3_ops() {
                      phi::DataType::INT16,
                      phi::DataType::INT64,
                      phi::DataType::INT32})},
+#ifdef PADDLE_WITH_XPU_FFT
+      {"conj", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"real", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"real_grad", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"imag", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"imag_grad", XPUKernelSet({phi::DataType::COMPLEX64})},
+      {"complex", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"complex_grad", XPUKernelSet({phi::DataType::FLOAT32})},
+#endif
   };
 
   return s_xpu3_kernels;

diff --git a/paddle/phi/kernels/xpu/complex_grad_kernel.cc b/paddle/phi/kernels/xpu/complex_grad_kernel.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifdef PADDLE_WITH_XPU_FFT
 #include "paddle/phi/kernels/complex_grad_kernel.h"
 
 #include "fft/cuComplex.h"
@@ -54,7 +55,7 @@ void RealGradKernel(const Context& dev_ctx,
       const_cast<phi::dtype::Real<T>*>(dout.data<phi::dtype::Real<T>>()),
       imag.data<phi::dtype::Real<T>>(),
       reinterpret_cast<cuFloatComplex*>(dx_data));
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "real_grad");
+  PADDLE_ENFORCE_XPU_SUCCESS(r);
 }
 
 template <typename T, typename Context>
@@ -71,7 +72,7 @@ void ImagGradKernel(const Context& dev_ctx,
       real.data<phi::dtype::Real<T>>(),
       const_cast<phi::dtype::Real<T>*>(dout.data<phi::dtype::Real<T>>()),
       reinterpret_cast<cuFloatComplex*>(dx_data));
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "imag_grad");
+  PADDLE_ENFORCE_XPU_SUCCESS(r);
 }
 
 template <typename T, typename Context>
@@ -94,7 +95,7 @@ void ComplexGradKernel(const Context& dev_ctx,
       reinterpret_cast<cuFloatComplex*>(const_cast<C*>(dout.data<C>())),
       real_data,
       imag_data);
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "complex_grad");
+  PADDLE_ENFORCE_XPU_SUCCESS(r);
 
   if (dx) {
     if (x.dims() == dout.dims()) {
@@ -116,23 +117,27 @@ void ComplexGradKernel(const Context& dev_ctx,
 }
 }  // namespace phi
 
-PD_REGISTER_KERNEL(imag_grad,
+PD_REGISTER_KERNEL(imag_grad,  // xpufft
                    XPU,
                    ALL_LAYOUT,
                    phi::ImagGradKernel,
                    phi::dtype::complex<float>) {
   kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
 }
 
-PD_REGISTER_KERNEL(real_grad,
+PD_REGISTER_KERNEL(real_grad,  // xpufft
                    XPU,
                    ALL_LAYOUT,
                    phi::RealGradKernel,
                    phi::dtype::complex<float>) {
   kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
 }
 
-PD_REGISTER_KERNEL(
-    complex_grad, XPU, ALL_LAYOUT, phi::ComplexGradKernel, float) {
+PD_REGISTER_KERNEL(complex_grad,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ComplexGradKernel,
+                   float) {
   kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
 }
+#endif  // PADDLE_WITH_XPU_FFT
diff --git a/paddle/phi/kernels/xpu/complex_kernel.cc b/paddle/phi/kernels/xpu/complex_kernel.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifdef PADDLE_WITH_XPU_FFT
 #include "paddle/phi/kernels/complex_kernel.h"
 
 #include "fft/cuComplex.h"
@@ -21,6 +22,7 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/expand_kernel.h"
 #include "paddle/phi/kernels/funcs/common_infer_shape_functions.h"
+#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h"
 
 namespace xfft_internal::xpu {
 int combine_as_complex(int N, float* real, float* imag, float2* out);
@@ -34,18 +36,29 @@ void ConjKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 DenseTensor* out) {
   dev_ctx.template Alloc<T>(out);
-  int r = xfft_internal::xpu::Conj(
-      x.numel(),
-      reinterpret_cast<cuFloatComplex*>(const_cast<T*>(x.data<T>())),
-      reinterpret_cast<cuFloatComplex*>(out->data<T>()));
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "conj");
+  if constexpr (std::is_same<T, phi::dtype::complex<float>>::value) {
+    int r = xfft_internal::xpu::Conj(
+        x.numel(),
+        reinterpret_cast<cuFloatComplex*>(const_cast<T*>(x.data<T>())),
+        reinterpret_cast<cuFloatComplex*>(out->data<T>()));
+    PADDLE_ENFORCE_XPU_SUCCESS(r);
+  } else {
+    // conj is the identity for non-complex dtypes, so a device copy suffices.
+    using XPUType = typename XPUCopyTypeTrait<T>::Type;
+    int r = xpu::copy<XPUType>(dev_ctx.x_context(),
+                               reinterpret_cast<const XPUType*>(x.data<T>()),
+                               reinterpret_cast<XPUType*>(out->data<T>()),
+                               x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
+  }
 }
 
 template <typename T, typename Context>
 void RealKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 DenseTensor* out) {
   dev_ctx.template Alloc<phi::dtype::Real<T>>(out);
+  // The allocation of imag here is redundant and could be optimized.
   phi::DenseTensor imag;
   imag.Resize(x.dims());
   dev_ctx.template Alloc<phi::dtype::Real<T>>(&imag);
@@ -54,14 +67,15 @@ void RealKernel(const Context& dev_ctx,
       reinterpret_cast<cuFloatComplex*>(const_cast<T*>(x.data<T>())),
       out->data<phi::dtype::Real<T>>(),
       imag.data<phi::dtype::Real<T>>());
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "real");
+  PADDLE_ENFORCE_XPU_SUCCESS(r);
 }
 
 template <typename T, typename Context>
 void ImagKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 DenseTensor* out) {
   dev_ctx.template Alloc<phi::dtype::Real<T>>(out);
+  // The allocation of 'real' here is redundant and could be optimized.
   phi::DenseTensor real;
   real.Resize(x.dims());
   dev_ctx.template Alloc<phi::dtype::Real<T>>(&real);
@@ -70,7 +84,7 @@ void ImagKernel(const Context& dev_ctx,
       reinterpret_cast<cuFloatComplex*>(const_cast<T*>(x.data<T>())),
       real.data<phi::dtype::Real<T>>(),
       out->data<phi::dtype::Real<T>>());
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "imag");
+  PADDLE_ENFORCE_XPU_SUCCESS(r);
 }
 
 template <typename T, typename Context>
@@ -114,24 +128,44 @@ void ComplexKernel(const Context& dev_ctx,
       x_data,
       y_data,
       reinterpret_cast<cuFloatComplex*>(out->data<C>()));
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "complex");
+  PADDLE_ENFORCE_XPU_SUCCESS(r);
 }
 }  // namespace phi
 
-PD_REGISTER_KERNEL(
-    conj, XPU, ALL_LAYOUT, phi::ConjKernel, float, phi::dtype::complex<float>) {
-}
+PD_REGISTER_KERNEL(conj,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ConjKernel,
+                   bool,
+                   int,
+                   int64_t,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>) {}
 
-PD_REGISTER_KERNEL(
-    real, XPU, ALL_LAYOUT, phi::RealKernel, phi::dtype::complex<float>) {
+PD_REGISTER_KERNEL(real,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::RealKernel,
+                   phi::dtype::complex<float>) {
   kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
 }
 
-PD_REGISTER_KERNEL(
-    imag, XPU, ALL_LAYOUT, phi::ImagKernel, phi::dtype::complex<float>) {
+PD_REGISTER_KERNEL(imag,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ImagKernel,
+                   phi::dtype::complex<float>) {
   kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
 }
 
-PD_REGISTER_KERNEL(complex, XPU, ALL_LAYOUT, phi::ComplexKernel, float) {
+PD_REGISTER_KERNEL(complex,  // xpufft
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ComplexKernel,
+                   float) {
   kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
 }
+#endif  // PADDLE_WITH_XPU_FFT
diff --git a/test/xpu/test_conj_op_xpu.py b/test/xpu/test_conj_op_xpu.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from get_test_cover_info import (
+    XPUOpTestWrapper,
+    create_test_class,
+    get_xpu_op_support_types,
+)
+from op_test_xpu import XPUOpTest
+
+import paddle
+
+paddle.enable_static()
+
+
+class XPUTestConjOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'conj'
+        self.use_dynamic_create_class = False
+
+    class TestConjOp(XPUOpTest):
+        def setUp(self):
+            self.op_type = "conj"
+            self.python_api = paddle.tensor.conj
+            self.init_dtype_type()
+            self.init_input()
+            self.inputs = {'X': self.x}
+            out = np.conj(self.x)
+            self.outputs = {'Out': out}
+
+        def init_dtype_type(self):
+            self.dtype = np.complex64
+
+        def init_input(self):
+            self.x = (
+                np.random.random((12, 14)) + 1j * np.random.random((12, 14))
+            ).astype(self.dtype)
+
+        def test_check_output(self):
+            if paddle.is_compiled_with_xpu():
+                place = paddle.XPUPlace(0)
+                self.check_output_with_place(place)
+
+        def test_check_grad(self):
+            if paddle.is_compiled_with_xpu():
+                place = paddle.XPUPlace(0)
+                self.check_grad_with_place(
+                    place,
+                    ['X'],
+                    'Out',
+                )
+
+    class TestConjOp1(TestConjOp):
+        def init_input(self):
+            self.x = (
+                np.random.random([2, 20, 2, 3])
+                + 1j * np.random.random([2, 20, 2, 3])
+            ).astype(self.dtype)
+
+    class TestConjOp2(TestConjOp):
+        def init_input(self):
+            self.x = (
+                np.random.random([2, 2, 3]) + 1j * np.random.random([2, 2, 3])
+            ).astype(self.dtype)
+
+    class TestConjOp3(TestConjOp):
+        def init_input(self):
+            self.x = np.random.random([2, 2, 3]).astype(np.int32)
+
+    class TestConjOp4(TestConjOp):
+        def init_input(self):
+            self.x = np.random.random([2, 2, 3]).astype(np.int64)
+
+    class TestConjOp5(TestConjOp):
+        def init_input(self):
+            self.x = np.random.random([2, 2, 3]).astype(np.float16)
+
+    class TestConjOp6(TestConjOp):
+        def init_input(self):
+            self.x = np.random.random([2, 2, 3]).astype(np.uint16)
+
+    class TestConjOp7(TestConjOp):
+        def init_input(self):
+            self.x = np.random.random([2, 2, 3]).astype(np.float32)
+
+    class TestConjOp8(TestConjOp):
+        def init_input(self):
+            self.x = np.random.random([2, 2, 3]).astype(np.float64)
+
+
+support_types = get_xpu_op_support_types('conj')
+for stype in support_types:
+    create_test_class(globals(), XPUTestConjOp, stype)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/xpu/xpu_op_test b/test/xpu/xpu_op_test
@@ -0,0 +1,7 @@
+complex_float32
+complex_grad_float32
+conj_complex64
+conj_complex64
+conj_complex64
+conj_complex64
+conj_complex64