From 9d518d3ad46bfc0ebfa3957f8eacd784ee27bcdb Mon Sep 17 00:00:00 2001
From: houj04 <houj04@foxmail.com>
Date: Wed, 2 Apr 2025 14:34:06 +0800
Subject: [PATCH 1/3] [XPU] StridedCopyKernel: temporarily fall back to a CPU round-trip copy

---
 paddle/phi/kernels/xpu/strided_copy_kernel.cc | 68 +++++++++++++++++++
 paddle/scripts/paddle_build.sh                |  1 +
 test/indexing/test_setitem_appendix.py        |  4 --
 3 files changed, 69 insertions(+), 4 deletions(-)
diff --git a/paddle/phi/kernels/xpu/strided_copy_kernel.cc b/paddle/phi/kernels/xpu/strided_copy_kernel.cc
index 34245ed4f6cfe..2918bc969c283 100644
--- a/paddle/phi/kernels/xpu/strided_copy_kernel.cc
+++ b/paddle/phi/kernels/xpu/strided_copy_kernel.cc
@@ -44,6 +44,17 @@ void StridedCopyKernel(const Context& dev_ctx,
                         input.numel(),
                         out->numel()));
 
+  if (input.numel() <= 0) {
+    return;
+  }
+
+  PADDLE_ENFORCE_NOT_NULL(out->data<T>(),
+                          common::errors::InvalidArgument(
+                              "StridedCopyKernel's out tensor must complete "
+                              "mutable data before call kernel."));
+
+  // 下述XPU算子有性能问题，因此暂时禁用掉，改成“先拷贝到CPU，按照CPU算子逻辑计算，再拷贝回XPU”的临时方案
+  /*
   // use XPUCopyTypeTrait to deal with double and int16_t copy instead of
   // XPUTypeTrait
   using XPUType = typename XPUCopyTypeTrait<T>::Type;
@@ -68,6 +79,63 @@ void StridedCopyKernel(const Context& dev_ctx,
                                    common::vectorize<int64_t>(out->strides()));
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "strided_copy");
   }
+  */
+
+  // CPU buffer for input
+  char* input_on_cpu = new char[input.Holder()->size()];
+  memory_utils::Copy(CPUPlace(),
+                     static_cast<void*>(input_on_cpu),
+                     dev_ctx.GetPlace(),
+                     static_cast<const void*>(input.Holder()->ptr()),
+                     input.Holder()->size());
+
+  // CPU buffer for out
+  char* output_on_cpu = new char[out->Holder()->size()];
+  memory_utils::Copy(CPUPlace(),
+                     static_cast<void*>(output_on_cpu),
+                     dev_ctx.GetPlace(),
+                     static_cast<const void*>(out->Holder()->ptr()),
+                     out->Holder()->size());
+
+  // follow paddle/phi/kernels/cpu/strided_copy_kernel.cc
+  const T* input_data =
+      reinterpret_cast<T*>(input_on_cpu + input.meta().offset);
+  int input_rank = input.dims().size();
+  const int64_t* input_dims = input.dims().Get();
+  const int64_t* input_stride = input.strides().Get();
+
+  T* output_data = reinterpret_cast<T*>(output_on_cpu + offset);
+  int output_rank = meta.dims.size();
+  const int64_t* output_dims = meta.dims.Get();
+  const int64_t* output_stride = meta.strides.Get();
+
+  auto numel = input.numel();
+
+  for (int64_t i = 0; i < numel; i++) {
+    int64_t input_offset = 0;
+    int64_t index_tmp = i;
+    for (int dim = input_rank - 1; dim >= 0; --dim) {
+      input_offset += (index_tmp % input_dims[dim]) * input_stride[dim];
+      index_tmp = index_tmp / input_dims[dim];
+    }
+    int64_t output_offset = 0;
+    index_tmp = i;
+    for (int dim = output_rank - 1; dim >= 0; --dim) {
+      output_offset += (index_tmp % output_dims[dim]) * output_stride[dim];
+      index_tmp = index_tmp / output_dims[dim];
+    }
+    output_data[output_offset] = input_data[input_offset];
+  }
+
+  // copy out tensor, from cpu to xpu
+  memory_utils::Copy(dev_ctx.GetPlace(),
+                     static_cast<void*>(out->Holder()->ptr()),
+                     CPUPlace(),
+                     static_cast<const void*>(output_on_cpu),
+                     out->Holder()->size());
+
+  delete[] input_on_cpu;
+  delete[] output_on_cpu;
 }
 
 }  // namespace phi
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 96f97e1c0613a..2c06edc0ed0fc 100644
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -2449,6 +2449,7 @@ EOF
 }
 
 function parallel_test_base_xpu() {
+    unset FLAGS_use_stride_kernel
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
     if [ ${WITH_TESTING:-ON} == "ON" ] ; then
diff --git a/test/indexing/test_setitem_appendix.py b/test/indexing/test_setitem_appendix.py
index b920db2d9515c..e6329ec1b7bba 100644
--- a/test/indexing/test_setitem_appendix.py
+++ b/test/indexing/test_setitem_appendix.py
@@ -194,10 +194,6 @@ def test_tensor(self):
         self.accuracy_check(x, y)
 
 
-@unittest.skipIf(
-    paddle.core.is_compiled_with_xpu(),
-    "There are some bugs on XPU.",
-)
 class TestSetitemDygraphCombinedIndex(unittest.TestCase):
     def accuracy_check(self, numpy_array, paddle_t):
         np.testing.assert_allclose(numpy_array, paddle_t.numpy())

From f4a81ebba8253bc46a7c3894e599810233fcdeca Mon Sep 17 00:00:00 2001
From: houj04 <houj04@foxmail.com>
Date: Wed, 2 Apr 2025 17:17:20 +0800
Subject: [PATCH 2/3] Fix unit-test environment: set FLAGS_use_stride_kernel=1 for test_setitem_appendix via CMake

---
 paddle/scripts/paddle_build.sh | 1 -
 test/indexing/CMakeLists.txt   | 3 +++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 2c06edc0ed0fc..96f97e1c0613a 100644
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -2449,7 +2449,6 @@ EOF
 }
 
 function parallel_test_base_xpu() {
-    unset FLAGS_use_stride_kernel
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
     if [ ${WITH_TESTING:-ON} == "ON" ] ; then
diff --git a/test/indexing/CMakeLists.txt b/test/indexing/CMakeLists.txt
index 95739040ef4af..a37113cbfe02b 100644
--- a/test/indexing/CMakeLists.txt
+++ b/test/indexing/CMakeLists.txt
@@ -7,3 +7,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 foreach(TEST_OP ${TEST_OPS})
   py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach()
+
+set_tests_properties(test_setitem_appendix
+                     PROPERTIES ENVIRONMENT "FLAGS_use_stride_kernel=1")

From ed3dc69f6ebf59f1681c4caf918b447ce51adddc Mon Sep 17 00:00:00 2001
From: houj04 <houj04@foxmail.com>
Date: Thu, 3 Apr 2025 12:26:58 +0800
Subject: [PATCH 3/3] Address review comments: add dev_ctx.Wait() around copies, translate comment, tidy includes

---
 paddle/phi/kernels/xpu/strided_copy_kernel.cc | 40 ++++++++++++-------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/paddle/phi/kernels/xpu/strided_copy_kernel.cc b/paddle/phi/kernels/xpu/strided_copy_kernel.cc
index 2918bc969c283..6a0e25d5e4665 100644
--- a/paddle/phi/kernels/xpu/strided_copy_kernel.cc
+++ b/paddle/phi/kernels/xpu/strided_copy_kernel.cc
@@ -1,20 +1,21 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #include "paddle/phi/kernels/strided_copy_kernel.h"
-#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h"
-
 namespace phi {
 
 template <typename T, typename Context>
@@ -53,7 +54,10 @@ void StridedCopyKernel(const Context& dev_ctx,
                               "StridedCopyKernel's out tensor must complete "
                               "mutable data before call kernel."));
 
-  // 下述XPU算子有性能问题，因此暂时禁用掉，改成“先拷贝到CPU，按照CPU算子逻辑计算，再拷贝回XPU”的临时方案
+  // The following XPU operators have performance issues and are temporarily
+  // disabled. A temporary workaround has been implemented: "First copy data to
+  // CPU, perform computation using CPU operator logic, then copy results back
+  // to XPU".
   /*
   // use XPUCopyTypeTrait to deal with double and int16_t copy instead of
   // XPUTypeTrait
@@ -81,6 +85,9 @@ void StridedCopyKernel(const Context& dev_ctx,
   }
   */
 
+  // wait before copy
+  dev_ctx.Wait();
+
   // CPU buffer for input
   char* input_on_cpu = new char[input.Holder()->size()];
   memory_utils::Copy(CPUPlace(),
@@ -97,6 +104,9 @@ void StridedCopyKernel(const Context& dev_ctx,
                      static_cast<const void*>(out->Holder()->ptr()),
                      out->Holder()->size());
 
+  // wait after copy
+  dev_ctx.Wait();
+
   // follow paddle/phi/kernels/cpu/strided_copy_kernel.cc
   const T* input_data =
       reinterpret_cast<T*>(input_on_cpu + input.meta().offset);
@@ -133,6 +143,8 @@ void StridedCopyKernel(const Context& dev_ctx,
                      CPUPlace(),
                      static_cast<const void*>(output_on_cpu),
                      out->Holder()->size());
+  // wait after copy
+  dev_ctx.Wait();
 
   delete[] input_on_cpu;
   delete[] output_on_cpu;