
Commit b888f04

[XPU] update StridedCopyKernel (PaddlePaddle#72030)
* [XPU] update StridedCopyKernel
* fix ut env
* follow comments
1 parent 2ded875 commit b888f04

File tree: 3 files changed, +96 -17 lines changed

  paddle/phi/kernels/xpu/strided_copy_kernel.cc
  test/indexing/CMakeLists.txt
  test/indexing/test_setitem_appendix.py

paddle/phi/kernels/xpu/strided_copy_kernel.cc  (+93 -13)

@@ -1,20 +1,21 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #include "paddle/phi/kernels/strided_copy_kernel.h"
-#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h"
-
 namespace phi {
 
 template <typename T, typename Context>
@@ -44,6 +45,20 @@ void StridedCopyKernel(const Context& dev_ctx,
                         input.numel(),
                         out->numel()));
 
+  if (input.numel() <= 0) {
+    return;
+  }
+
+  PADDLE_ENFORCE_NOT_NULL(out->data<T>(),
+                          common::errors::InvalidArgument(
+                              "StridedCopyKernel's out tensor must complete "
+                              "mutable data before call kernel."));
+
+  // The following XPU operators have performance issues and are temporarily
+  // disabled. A temporary workaround has been implemented: "First copy data to
+  // CPU, perform computation using CPU operator logic, then copy results back
+  // to XPU".
+  /*
   // use XPUCopyTypeTrait to deal with double and int16_t copy instead of
   // XPUTypeTrait
   using XPUType = typename XPUCopyTypeTrait<T>::Type;
@@ -68,6 +83,71 @@ void StridedCopyKernel(const Context& dev_ctx,
                                        common::vectorize<int64_t>(out->strides()));
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "strided_copy");
   }
+  */
+
+  // wait before copy
+  dev_ctx.Wait();
+
+  // CPU buffer for input
+  char* input_on_cpu = new char[input.Holder()->size()];
+  memory_utils::Copy(CPUPlace(),
+                     static_cast<void*>(input_on_cpu),
+                     dev_ctx.GetPlace(),
+                     static_cast<const void*>(input.Holder()->ptr()),
+                     input.Holder()->size());
+
+  // CPU buffer for out
+  char* output_on_cpu = new char[out->Holder()->size()];
+  memory_utils::Copy(CPUPlace(),
+                     static_cast<void*>(output_on_cpu),
+                     dev_ctx.GetPlace(),
+                     static_cast<const void*>(out->Holder()->ptr()),
+                     out->Holder()->size());
+
+  // wait after copy
+  dev_ctx.Wait();
+
+  // follow paddle/phi/kernels/cpu/strided_copy_kernel.cc
+  const T* input_data =
+      reinterpret_cast<T*>(input_on_cpu + input.meta().offset);
+  int input_rank = input.dims().size();
+  const int64_t* input_dims = input.dims().Get();
+  const int64_t* input_stride = input.strides().Get();
+
+  T* output_data = reinterpret_cast<T*>(output_on_cpu + offset);
+  int output_rank = meta.dims.size();
+  const int64_t* output_dims = meta.dims.Get();
+  const int64_t* output_stride = meta.strides.Get();
+
+  auto numel = input.numel();
+
+  for (int64_t i = 0; i < numel; i++) {
+    int64_t input_offset = 0;
+    int64_t index_tmp = i;
+    for (int dim = input_rank - 1; dim >= 0; --dim) {
+      input_offset += (index_tmp % input_dims[dim]) * input_stride[dim];
+      index_tmp = index_tmp / input_dims[dim];
+    }
+    int64_t output_offset = 0;
+    index_tmp = i;
+    for (int dim = output_rank - 1; dim >= 0; --dim) {
+      output_offset += (index_tmp % output_dims[dim]) * output_stride[dim];
+      index_tmp = index_tmp / output_dims[dim];
+    }
+    output_data[output_offset] = input_data[input_offset];
+  }
+
+  // copy out tensor, from cpu to xpu
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     static_cast<void*>(out->Holder()->ptr()),
+                     CPUPlace(),
+                     static_cast<const void*>(output_on_cpu),
+                     out->Holder()->size());
+  // wait after copy
+  dev_ctx.Wait();
+
+  delete[] input_on_cpu;
+  delete[] output_on_cpu;
 }
 
 }  // namespace phi
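
The core of the change above is the CPU fallback: each of the input's numel() elements has its linear index decomposed into per-dimension coordinates using the dims, and those coordinates are re-projected through the input and output strides to obtain the source and destination memory offsets. A minimal, self-contained Python sketch of that mapping (illustration only, not part of the commit; the helper names are made up):

def strided_offset(linear_index, dims, strides):
    """Map a linear element index to a strided memory offset (in elements)."""
    offset = 0
    remaining = linear_index
    for dim in range(len(dims) - 1, -1, -1):  # innermost dimension first
        offset += (remaining % dims[dim]) * strides[dim]
        remaining //= dims[dim]
    return offset

def strided_copy(src, src_dims, src_strides, dst, dst_dims, dst_strides):
    numel = 1
    for d in src_dims:
        numel *= d
    for i in range(numel):
        dst[strided_offset(i, dst_dims, dst_strides)] = \
            src[strided_offset(i, src_dims, src_strides)]

# Example: copy a contiguous 2x3 block into a transposed (column-major) layout.
src = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]   # dims (2, 3), strides (3, 1)
dst = [0.0] * 6                        # dims (2, 3), strides (1, 2)
strided_copy(src, (2, 3), (3, 1), dst, (2, 3), (1, 2))
print(dst)  # [0.0, 3.0, 1.0, 4.0, 2.0, 5.0]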

test/indexing/CMakeLists.txt  (+3)

@@ -7,3 +7,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 foreach(TEST_OP ${TEST_OPS})
   py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach()
+
+set_tests_properties(test_setitem_appendix
+                     PROPERTIES ENVIRONMENT "FLAGS_use_stride_kernel=1")
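
The property above makes CTest export FLAGS_use_stride_kernel=1 for test_setitem_appendix, so the strided (view) kernel path, including the XPU StridedCopyKernel changed in this commit, is actually exercised. A rough sketch of reproducing that setup by hand (illustration only; it assumes the flag is read from the environment at startup and is also writable at runtime):

import os

# Mirror the ENVIRONMENT property from CMakeLists.txt before paddle is imported.
os.environ["FLAGS_use_stride_kernel"] = "1"

import paddle

# Assumed alternative: exported flags can usually also be toggled at runtime.
paddle.set_flags({"FLAGS_use_stride_kernel": True})
print(paddle.get_flags(["FLAGS_use_stride_kernel"]))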

test/indexing/test_setitem_appendix.py  (-4)

@@ -194,10 +194,6 @@ def test_tensor(self):
         self.accuracy_check(x, y)
 
 
-@unittest.skipIf(
-    paddle.core.is_compiled_with_xpu(),
-    "There are some bugs on XPU.",
-)
 class TestSetitemDygraphCombinedIndex(unittest.TestCase):
     def accuracy_check(self, numpy_array, paddle_t):
         np.testing.assert_allclose(numpy_array, paddle_t.numpy())
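
Removing the skipIf means TestSetitemDygraphCombinedIndex now also runs on XPU builds. The test body is outside this diff; as a hypothetical illustration of the kind of check accuracy_check() performs, a combined-index assignment could be verified against NumPy like this (the tensor shapes and index pattern are assumptions, not taken from the test):

import numpy as np
import paddle

# Hypothetical combined index: an integer tensor on axis 0 together with
# slices on the remaining axes; the same assignment is replayed in numpy.
x = paddle.arange(24, dtype="float32").reshape([2, 3, 4])
x_np = x.numpy()

rows = paddle.to_tensor([0, 1])
x[rows, :, 2] = 7.0
x_np[np.array([0, 1]), :, 2] = 7.0

# Same comparison as accuracy_check() in the test file.
np.testing.assert_allclose(x_np, x.numpy())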
