From 9d518d3ad46bfc0ebfa3957f8eacd784ee27bcdb Mon Sep 17 00:00:00 2001 From: houj04 Date: Wed, 2 Apr 2025 14:34:06 +0800 Subject: [PATCH 1/3] [XPU] update StridedCopyKernel --- paddle/phi/kernels/xpu/strided_copy_kernel.cc | 68 +++++++++++++++++++ paddle/scripts/paddle_build.sh | 1 + test/indexing/test_setitem_appendix.py | 4 -- 3 files changed, 69 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/xpu/strided_copy_kernel.cc b/paddle/phi/kernels/xpu/strided_copy_kernel.cc index 34245ed4f6cfe..2918bc969c283 100644 --- a/paddle/phi/kernels/xpu/strided_copy_kernel.cc +++ b/paddle/phi/kernels/xpu/strided_copy_kernel.cc @@ -44,6 +44,17 @@ void StridedCopyKernel(const Context& dev_ctx, input.numel(), out->numel())); + if (input.numel() <= 0) { + return; + } + + PADDLE_ENFORCE_NOT_NULL(out->data(), + common::errors::InvalidArgument( + "StridedCopyKernel's out tensor must complete " + "mutable data before call kernel.")); + + // 下述XPU算子有性能问题,因此暂时禁用掉,改成“先拷贝到CPU,按照CPU算子逻辑计算,再拷贝回XPU”的临时方案 + /* // use XPUCopyTypeTrait to deal with double and int16_t copy instead of // XPUTypeTrait using XPUType = typename XPUCopyTypeTrait::Type; @@ -68,6 +79,63 @@ void StridedCopyKernel(const Context& dev_ctx, common::vectorize(out->strides())); PADDLE_ENFORCE_XDNN_SUCCESS(r, "strided_copy"); } + */ + + // CPU buffer for input + char* input_on_cpu = new char[input.Holder()->size()]; + memory_utils::Copy(CPUPlace(), + static_cast(input_on_cpu), + dev_ctx.GetPlace(), + static_cast(input.Holder()->ptr()), + input.Holder()->size()); + + // CPU buffer for out + char* output_on_cpu = new char[out->Holder()->size()]; + memory_utils::Copy(CPUPlace(), + static_cast(output_on_cpu), + dev_ctx.GetPlace(), + static_cast(out->Holder()->ptr()), + out->Holder()->size()); + + // follow paddle/phi/kernels/cpu/strided_copy_kernel.cc + const T* input_data = + reinterpret_cast(input_on_cpu + input.meta().offset); + int input_rank = input.dims().size(); + const int64_t* input_dims = input.dims().Get(); + const int64_t* input_stride = input.strides().Get(); + + T* output_data = reinterpret_cast(output_on_cpu + offset); + int output_rank = meta.dims.size(); + const int64_t* output_dims = meta.dims.Get(); + const int64_t* output_stride = meta.strides.Get(); + + auto numel = input.numel(); + + for (int64_t i = 0; i < numel; i++) { + int64_t input_offset = 0; + int64_t index_tmp = i; + for (int dim = input_rank - 1; dim >= 0; --dim) { + input_offset += (index_tmp % input_dims[dim]) * input_stride[dim]; + index_tmp = index_tmp / input_dims[dim]; + } + int64_t output_offset = 0; + index_tmp = i; + for (int dim = output_rank - 1; dim >= 0; --dim) { + output_offset += (index_tmp % output_dims[dim]) * output_stride[dim]; + index_tmp = index_tmp / output_dims[dim]; + } + output_data[output_offset] = input_data[input_offset]; + } + + // copy out tensor, from cpu to xpu + memory_utils::Copy(dev_ctx.GetPlace(), + static_cast(out->Holder()->ptr()), + CPUPlace(), + static_cast(output_on_cpu), + out->Holder()->size()); + + delete[] input_on_cpu; + delete[] output_on_cpu; } } // namespace phi diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 96f97e1c0613a..2c06edc0ed0fc 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2449,6 +2449,7 @@ EOF } function parallel_test_base_xpu() { + unset FLAGS_use_stride_kernel mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build if [ ${WITH_TESTING:-ON} == "ON" ] ; then diff --git a/test/indexing/test_setitem_appendix.py b/test/indexing/test_setitem_appendix.py index b920db2d9515c..e6329ec1b7bba 100644 --- a/test/indexing/test_setitem_appendix.py +++ b/test/indexing/test_setitem_appendix.py @@ -194,10 +194,6 @@ def test_tensor(self): self.accuracy_check(x, y) -@unittest.skipIf( - paddle.core.is_compiled_with_xpu(), - "There are some bugs on XPU.", -) class TestSetitemDygraphCombinedIndex(unittest.TestCase): def accuracy_check(self, numpy_array, paddle_t): np.testing.assert_allclose(numpy_array, paddle_t.numpy()) From f4a81ebba8253bc46a7c3894e599810233fcdeca Mon Sep 17 00:00:00 2001 From: houj04 Date: Wed, 2 Apr 2025 17:17:20 +0800 Subject: [PATCH 2/3] fix ut env --- paddle/scripts/paddle_build.sh | 1 - test/indexing/CMakeLists.txt | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2c06edc0ed0fc..96f97e1c0613a 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2449,7 +2449,6 @@ EOF } function parallel_test_base_xpu() { - unset FLAGS_use_stride_kernel mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build if [ ${WITH_TESTING:-ON} == "ON" ] ; then diff --git a/test/indexing/CMakeLists.txt b/test/indexing/CMakeLists.txt index 95739040ef4af..a37113cbfe02b 100644 --- a/test/indexing/CMakeLists.txt +++ b/test/indexing/CMakeLists.txt @@ -7,3 +7,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach() + +set_tests_properties(test_setitem_appendix + PROPERTIES ENVIRONMENT "FLAGS_use_stride_kernel=1") From ed3dc69f6ebf59f1681c4caf918b447ce51adddc Mon Sep 17 00:00:00 2001 From: houj04 Date: Thu, 3 Apr 2025 12:26:58 +0800 Subject: [PATCH 3/3] follow comments --- paddle/phi/kernels/xpu/strided_copy_kernel.cc | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/paddle/phi/kernels/xpu/strided_copy_kernel.cc b/paddle/phi/kernels/xpu/strided_copy_kernel.cc index 2918bc969c283..6a0e25d5e4665 100644 --- a/paddle/phi/kernels/xpu/strided_copy_kernel.cc +++ b/paddle/phi/kernels/xpu/strided_copy_kernel.cc @@ -1,20 +1,21 @@ -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/phi/kernels/strided_copy_kernel.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" - namespace phi { template @@ -53,7 +54,10 @@ void StridedCopyKernel(const Context& dev_ctx, "StridedCopyKernel's out tensor must complete " "mutable data before call kernel.")); - // 下述XPU算子有性能问题,因此暂时禁用掉,改成“先拷贝到CPU,按照CPU算子逻辑计算,再拷贝回XPU”的临时方案 + // The following XPU operators have performance issues and are temporarily + // disabled. A temporary workaround has been implemented: "First copy data to + // CPU, perform computation using CPU operator logic, then copy results back + // to XPU". /* // use XPUCopyTypeTrait to deal with double and int16_t copy instead of // XPUTypeTrait @@ -81,6 +85,9 @@ void StridedCopyKernel(const Context& dev_ctx, } */ + // wait before copy + dev_ctx.Wait(); + // CPU buffer for input char* input_on_cpu = new char[input.Holder()->size()]; memory_utils::Copy(CPUPlace(), @@ -97,6 +104,9 @@ void StridedCopyKernel(const Context& dev_ctx, static_cast(out->Holder()->ptr()), out->Holder()->size()); + // wait after copy + dev_ctx.Wait(); + // follow paddle/phi/kernels/cpu/strided_copy_kernel.cc const T* input_data = reinterpret_cast(input_on_cpu + input.meta().offset); @@ -133,6 +143,8 @@ void StridedCopyKernel(const Context& dev_ctx, CPUPlace(), static_cast(output_on_cpu), out->Holder()->size()); + // wait after copy + dev_ctx.Wait(); delete[] input_on_cpu; delete[] output_on_cpu;