Skip to content

Commit bc67fa1

Browse files
committed
[XPU] update xhpc to improve performance of strided_copy
1 parent 2615043 commit bc67fa1

File tree

3 files changed

+17
-74
lines changed

3 files changed

+17
-74
lines changed

cmake/external/xpu.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ set(XPU_XFA_LIB_NAME "libxpu_flash_attention.so")
3030
set(XPU_XPUDNN_LIB_NAME "libxpu_dnn.so")
3131

3232
if(NOT DEFINED XPU_XHPC_BASE_DATE)
33-
set(XPU_XHPC_BASE_DATE "dev/20250417")
33+
set(XPU_XHPC_BASE_DATE "dev/20250514")
3434
endif()
3535
set(XPU_XCCL_BASE_VERSION "3.0.2.5") # For XRE5
3636
if(NOT DEFINED XPU_XFT_BASE_VERSION)

paddle/phi/kernels/xpu/activation_grad_kernel.cc

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ int xpu_activation_backward(const Context& dev_ctx,
144144
const XPUType*,
145145
const XPUType*,
146146
XPUType*,
147-
int)> func) {
147+
int64_t)> func) {
148148
/* TODO: relu tanh sigmoid are inplace */
149149
const XPUType* x_data = nullptr;
150150
const XPUType* y_data = nullptr;
@@ -606,8 +606,19 @@ struct XPURsqrtGradFunctor : public funcs::BaseActivationFunctor<T> {
606606
const DenseTensor* out,
607607
const DenseTensor* dout,
608608
DenseTensor* dx) const {
609-
int r = xpu_activation_backward<Context, T, XPUType>(
610-
dev_ctx, x, out, dout, dx, xpu::rsqrt_grad<XPUType>);
609+
dev_ctx.template Alloc<T>(dx);
610+
const XPUType* out_data = nullptr;
611+
const XPUType* dout_data = nullptr;
612+
if (out != nullptr) {
613+
out_data = reinterpret_cast<const XPUType*>(out->data<T>());
614+
}
615+
if (dout != nullptr) {
616+
dout_data = reinterpret_cast<const XPUType*>(dout->data<T>());
617+
}
618+
XPUType* dx_data = reinterpret_cast<XPUType*>(dx->data<T>());
619+
620+
int r = xpu::rsqrt_grad(
621+
dev_ctx.x_context(), out_data, dout_data, dx_data, dx->numel());
611622
PADDLE_ENFORCE_XDNN_SUCCESS(r, "rsqrt_grad");
612623
}
613624
};

paddle/phi/kernels/xpu/strided_copy_kernel.cc

Lines changed: 2 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,6 @@ void StridedCopyKernel(const Context& dev_ctx,
5454
"StridedCopyKernel's out tensor must complete "
5555
"mutable data before call kernel."));
5656

57-
// The following XPU operators have performance issues and are temporarily
58-
// disabled. A temporary workaround has been implemented: "First copy data to
59-
// CPU, perform computation using CPU operator logic, then copy results back
60-
// to XPU".
61-
/*
6257
// use XPUCopyTypeTrait to deal with double and int16_t copy instead of
6358
// XPUTypeTrait
6459
using XPUType = typename XPUCopyTypeTrait<T>::Type;
@@ -74,80 +69,17 @@ void StridedCopyKernel(const Context& dev_ctx,
7469
r = xpu::copy<XPUType>(dev_ctx.x_context(), input_data, output_data, 1);
7570
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
7671
} else {
72+
int64_t data_size = input.Holder()->size() - input.meta().offset;
7773
r = xpu::strided_copy<XPUType>(dev_ctx.x_context(),
7874
input_data,
7975
output_data,
76+
data_size,
8077
common::vectorize<int64_t>(input.dims()),
8178
common::vectorize<int64_t>(out->dims()),
8279
common::vectorize<int64_t>(input.strides()),
8380
common::vectorize<int64_t>(out->strides()));
8481
PADDLE_ENFORCE_XDNN_SUCCESS(r, "strided_copy");
8582
}
86-
*/
87-
88-
// wait before copy
89-
dev_ctx.Wait();
90-
91-
// CPU buffer for input
92-
char* input_on_cpu = new char[input.Holder()->size()];
93-
memory_utils::Copy(CPUPlace(),
94-
static_cast<void*>(input_on_cpu),
95-
dev_ctx.GetPlace(),
96-
static_cast<const void*>(input.Holder()->ptr()),
97-
input.Holder()->size());
98-
99-
// CPU buffer for out
100-
char* output_on_cpu = new char[out->Holder()->size()];
101-
memory_utils::Copy(CPUPlace(),
102-
static_cast<void*>(output_on_cpu),
103-
dev_ctx.GetPlace(),
104-
static_cast<const void*>(out->Holder()->ptr()),
105-
out->Holder()->size());
106-
107-
// wait after copy
108-
dev_ctx.Wait();
109-
110-
// follow paddle/phi/kernels/cpu/strided_copy_kernel.cc
111-
const T* input_data =
112-
reinterpret_cast<T*>(input_on_cpu + input.meta().offset);
113-
int input_rank = input.dims().size();
114-
const int64_t* input_dims = input.dims().Get();
115-
const int64_t* input_stride = input.strides().Get();
116-
117-
T* output_data = reinterpret_cast<T*>(output_on_cpu + offset);
118-
int output_rank = meta.dims.size();
119-
const int64_t* output_dims = meta.dims.Get();
120-
const int64_t* output_stride = meta.strides.Get();
121-
122-
auto numel = input.numel();
123-
124-
for (int64_t i = 0; i < numel; i++) {
125-
int64_t input_offset = 0;
126-
int64_t index_tmp = i;
127-
for (int dim = input_rank - 1; dim >= 0; --dim) {
128-
input_offset += (index_tmp % input_dims[dim]) * input_stride[dim];
129-
index_tmp = index_tmp / input_dims[dim];
130-
}
131-
int64_t output_offset = 0;
132-
index_tmp = i;
133-
for (int dim = output_rank - 1; dim >= 0; --dim) {
134-
output_offset += (index_tmp % output_dims[dim]) * output_stride[dim];
135-
index_tmp = index_tmp / output_dims[dim];
136-
}
137-
output_data[output_offset] = input_data[input_offset];
138-
}
139-
140-
// copy out tensor, from cpu to xpu
141-
memory_utils::Copy(dev_ctx.GetPlace(),
142-
static_cast<void*>(out->Holder()->ptr()),
143-
CPUPlace(),
144-
static_cast<const void*>(output_on_cpu),
145-
out->Holder()->size());
146-
// wait after copy
147-
dev_ctx.Wait();
148-
149-
delete[] input_on_cpu;
150-
delete[] output_on_cpu;
15183
}
15284

15385
} // namespace phi

0 commit comments

Comments
 (0)