@@ -54,11 +54,6 @@ void StridedCopyKernel(const Context& dev_ctx,
                         "StridedCopyKernel's out tensor must complete "
                         "mutable data before call kernel."));
 
-  // The following XPU operators have performance issues and are temporarily
-  // disabled. A temporary workaround has been implemented: "First copy data to
-  // CPU, perform computation using CPU operator logic, then copy results back
-  // to XPU".
-  /*
   // use XPUCopyTypeTrait to deal with double and int16_t copy instead of
   // XPUTypeTrait
   using XPUType = typename XPUCopyTypeTrait<T>::Type;
@@ -74,80 +69,17 @@ void StridedCopyKernel(const Context& dev_ctx,
     r = xpu::copy<XPUType>(dev_ctx.x_context(), input_data, output_data, 1);
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
   } else {
+    int64_t data_size = input.Holder()->size() - input.meta().offset;
     r = xpu::strided_copy<XPUType>(dev_ctx.x_context(),
                                    input_data,
                                    output_data,
+                                   data_size,
                                    common::vectorize<int64_t>(input.dims()),
                                    common::vectorize<int64_t>(out->dims()),
                                    common::vectorize<int64_t>(input.strides()),
                                    common::vectorize<int64_t>(out->strides()));
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "strided_copy");
   }
-  */
-
-  // wait before copy
-  dev_ctx.Wait();
-
-  // CPU buffer for input
-  char* input_on_cpu = new char[input.Holder()->size()];
-  memory_utils::Copy(CPUPlace(),
-                     static_cast<void*>(input_on_cpu),
-                     dev_ctx.GetPlace(),
-                     static_cast<const void*>(input.Holder()->ptr()),
-                     input.Holder()->size());
-
-  // CPU buffer for out
-  char* output_on_cpu = new char[out->Holder()->size()];
-  memory_utils::Copy(CPUPlace(),
-                     static_cast<void*>(output_on_cpu),
-                     dev_ctx.GetPlace(),
-                     static_cast<const void*>(out->Holder()->ptr()),
-                     out->Holder()->size());
-
-  // wait after copy
-  dev_ctx.Wait();
-
-  // follow paddle/phi/kernels/cpu/strided_copy_kernel.cc
-  const T* input_data =
-      reinterpret_cast<T*>(input_on_cpu + input.meta().offset);
-  int input_rank = input.dims().size();
-  const int64_t* input_dims = input.dims().Get();
-  const int64_t* input_stride = input.strides().Get();
-
-  T* output_data = reinterpret_cast<T*>(output_on_cpu + offset);
-  int output_rank = meta.dims.size();
-  const int64_t* output_dims = meta.dims.Get();
-  const int64_t* output_stride = meta.strides.Get();
-
-  auto numel = input.numel();
-
-  for (int64_t i = 0; i < numel; i++) {
-    int64_t input_offset = 0;
-    int64_t index_tmp = i;
-    for (int dim = input_rank - 1; dim >= 0; --dim) {
-      input_offset += (index_tmp % input_dims[dim]) * input_stride[dim];
-      index_tmp = index_tmp / input_dims[dim];
-    }
-    int64_t output_offset = 0;
-    index_tmp = i;
-    for (int dim = output_rank - 1; dim >= 0; --dim) {
-      output_offset += (index_tmp % output_dims[dim]) * output_stride[dim];
-      index_tmp = index_tmp / output_dims[dim];
-    }
-    output_data[output_offset] = input_data[input_offset];
-  }
-
-  // copy out tensor, from cpu to xpu
-  memory_utils::Copy(dev_ctx.GetPlace(),
-                     static_cast<void*>(out->Holder()->ptr()),
-                     CPUPlace(),
-                     static_cast<const void*>(output_on_cpu),
-                     out->Holder()->size());
-  // wait after copy
-  dev_ctx.Wait();
-
-  delete[] input_on_cpu;
-  delete[] output_on_cpu;
 }
 
 }  // namespace phi
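
For context: the hunk above re-enables the native XDNN path (xpu::copy / xpu::strided_copy, now also passing data_size, computed from the holder size and the tensor's offset) and deletes the temporary CPU round-trip fallback. That fallback copied both buffers to host and, for every element, decomposed its linear index against the tensor dims and projected it through the input and output strides. Below is a minimal, self-contained sketch of that index arithmetic with hypothetical names (StridedCopySketch; this is not Paddle or XDNN code), included only to document what the removed loop computed:

// strided_copy_sketch.cc -- hypothetical standalone example, not Paddle code.
#include <cstdint>
#include <vector>

// Copies `numel` elements from `in` to `out`. Each tensor is described by its
// dims and per-dimension strides (in elements, last dim fastest), mirroring
// the loop in the removed CPU fallback above.
template <typename T>
void StridedCopySketch(const T* in,
                       const std::vector<int64_t>& in_dims,
                       const std::vector<int64_t>& in_strides,
                       T* out,
                       const std::vector<int64_t>& out_dims,
                       const std::vector<int64_t>& out_strides,
                       int64_t numel) {
  for (int64_t i = 0; i < numel; ++i) {
    // Decompose the linear index against the input shape and project it
    // through the input strides to locate the source element.
    int64_t in_off = 0, idx = i;
    for (int dim = static_cast<int>(in_dims.size()) - 1; dim >= 0; --dim) {
      in_off += (idx % in_dims[dim]) * in_strides[dim];
      idx /= in_dims[dim];
    }
    // Do the same against the output shape and strides for the destination.
    int64_t out_off = 0;
    idx = i;
    for (int dim = static_cast<int>(out_dims.size()) - 1; dim >= 0; --dim) {
      out_off += (idx % out_dims[dim]) * out_strides[dim];
      idx /= out_dims[dim];
    }
    out[out_off] = in[in_off];
  }
}

For example, an input with dims {2, 3} and strides {1, 2} (a transposed view of a contiguous {3, 2} buffer) copied to a contiguous output (dims {2, 3}, strides {3, 1}) gathers the transposed elements into contiguous order, which is the case the kernel handles when input and output layouts differ.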