@@ -22,29 +22,13 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 namespace phi {
 namespace funcs {
 
-template <typename T, typename IndexT = int>
-__global__ void GatherCUDAKernel(const T* params,
-                                 const IndexT* indices,
-                                 T* output,
-                                 size_t index_size,
-                                 size_t slice_size,
-                                 int64_t index_dim_size) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
-    int64_t indices_i = i / slice_size;
-    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT gather_i =
-        (indices[indices_i] < 0 ? (indices[indices_i] + index_dim_size)
-                                : indices[indices_i]);
-    int64_t params_i = gather_i * slice_size + slice_i;
-    *(output + i) = *(params + params_i);
-  }
-}
-
 template <typename T, typename IndexT = int>
 __global__ void GatherNdCUDAKernel(const T* input,
                                    const Dim<DDim::kMaxRank> input_dims,
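Note on the deletion above: GatherCUDAKernel was the dedicated axis-0 gather. It treated the source as index_dim_size contiguous slices of slice_size elements and copied slice indices[i] into output slice i, wrapping a negative index once by adding index_dim_size. A minimal host-side reference of those semantics, for comparison against the vectorized path that replaces it; GatherAxis0Ref and its signature are illustrative, not part of the codebase:

#include <cstdint>
#include <vector>

// Reference semantics of the removed kernel: output slice i is params slice
// gather_i, where gather_i wraps a negative indices[i] once.
template <typename T, typename IndexT = int>
void GatherAxis0Ref(const std::vector<T>& params,
                    const std::vector<IndexT>& indices,
                    int64_t slice_size,
                    int64_t index_dim_size,
                    std::vector<T>* output) {
  output->resize(indices.size() * slice_size);
  for (size_t i = 0; i < indices.size(); ++i) {
    int64_t gather_i =
        indices[i] < 0 ? indices[i] + index_dim_size : indices[i];
    for (int64_t s = 0; s < slice_size; ++s) {
      (*output)[i * slice_size + s] = params[gather_i * slice_size + s];
    }
  }
}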
@@ -81,48 +65,6 @@ __global__ void GatherNdCUDAKernel(const T* input,
   }
 }
 
-/**
- * A thin wrapper on gpu tensor
- * Return a new tensor from source tensor, gathered according to index
- * input[src]: type-T source Tensor
- * input[index]: type-IndexT index Tensor (1-D)
- * return: output tensor
- */
-template <typename T, typename IndexT = int>
-void GPUGather(const phi::GPUContext& ctx,
-               const DenseTensor& src,
-               const DenseTensor& index,
-               DenseTensor* output) {
-  if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(
-        index.dims()[1],
-        1,
-        common::errors::InvalidArgument("If the index's rank of gather_op is 2,"
-                                        " the second dimension should be 1."));
-  }
-
-  // index size
-  int64_t index_size = index.dims().size() == 0 ? 1 : index.dims()[0];
-
-  auto src_dims = src.dims();
-
-  // slice size
-  int64_t slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-
-  const T* p_src = src.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  int block = 512;
-  int64_t n = slice_size * index_size;
-  dim3 grid = dim3((n + block - 1) / block);
-  phi::backends::gpu::LimitGridDim(ctx, &grid);
-
-  GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
-      p_src, p_index, p_output, index_size, slice_size, src_dims[0]);
-}
-
 template <typename T, typename IndexT = int>
 void GPUGatherNd(const phi::GPUContext& ctx,
                  const DenseTensor& input,
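GPUGather is removed here only as an implementation; the last hunk of this commit reintroduces it as a thin delegation to GatherV2CUDAFunction with axis 0, so existing callers keep the same entry point. The delegation preserves the old indexing exactly; hand-checking the new kernel's arithmetic for axis 0 (this derivation is mine, not part of the diff):

// Axis-0 specialization of GatherGPUKernel's index arithmetic:
//   outer_dim_size     == slice_size   (product of dims after axis 0)
//   out_index_dim_size == index_size
//   outer_size         == slice_size * index_size == size
// hence for every idx < size:
//   inner_dim_index = idx / outer_size == 0
//   index_dim_index = idx / slice_size           (old indices_i)
//   out_dim_index   = idx % slice_size           (old slice_i)
//   input_index     = index_val * slice_size + out_dim_index
// which is exactly params_i in the deleted GatherCUDAKernel.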
@@ -170,42 +112,36 @@ void GPUGatherNd(const phi::GPUContext& ctx,
                                      end_size);
 }
 
-template <typename T, typename U>
+template <typename T, typename U, int VecSize>
 __global__ void GatherGPUKernel(const T* input,
                                 const U* index,
                                 T* out,
                                 int64_t outer_dim_size,
-                                int64_t inner_dim_size,
                                 int64_t out_index_dim_size,
                                 int64_t input_index_dim_size,
                                 int64_t size) {
-  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int64_t block_size = blockDim.x;
+  int64_t idx = (blockIdx.x * block_size + threadIdx.x) * VecSize;
   int64_t outer_size = outer_dim_size * out_index_dim_size;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
+  for (; idx < size; idx += gridDim.x * block_size * VecSize) {
     int64_t inner_dim_index = idx / outer_size;
-    int64_t next_idx = idx - outer_size * inner_dim_index;
+    int64_t next_idx = idx % outer_size;
     int64_t index_dim_index = next_idx / outer_dim_size;
     U index_val = index[index_dim_index];
 
-    PADDLE_ENFORCE(
-        index_val >= -input_index_dim_size && index_val < input_index_dim_size,
-        "The index is out of bounds, "
-        "please check whether the dimensions of index and "
-        "input meet the requirements. It should "
-        "be less than [%ld] and greater than or equal to [%ld], but "
-        "received [%ld]",
-        input_index_dim_size,
-        -input_index_dim_size,
-        index_val);
     if (index_val < 0) {
       index_val += input_index_dim_size;
     }
 
-    int64_t out_dim_index = next_idx - outer_dim_size * index_dim_index;
+    int64_t out_dim_index = next_idx % outer_dim_size;
     int64_t input_index =
        inner_dim_index * (outer_dim_size * input_index_dim_size) +
        index_val * outer_dim_size + out_dim_index;
-    out[idx] = input[input_index];
+
+    using VecType = kps::details::VectorType<T, VecSize>;
+    const VecType* src = reinterpret_cast<const VecType*>(&input[input_index]);
+    VecType* dst = reinterpret_cast<VecType*>(&out[idx]);
+    *dst = *src;
   }
 }
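The added lines replace the per-element store with one vector transaction covering VecSize contiguous elements. As far as I can tell, kps::details::VectorType is an over-aligned array wrapper; the sketch below reproduces the pattern in a self-contained form. VectorType and VecCopy here are stand-ins, and the kernel assumes n % VecSize == 0 and pointers aligned to sizeof(T) * VecSize, which is what the host-side divisibility check on outer_dim_size guarantees in the real code:

#include <cstdint>

// Stand-in for kps::details::VectorType: VecSize elements packed with the
// alignment a single wide load/store requires.
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) VectorType {
  T val[VecSize];
};

// Each thread moves VecSize contiguous elements per iteration in a
// grid-stride loop, mirroring the vectorized body of GatherGPUKernel.
template <typename T, int VecSize>
__global__ void VecCopy(const T* in, T* out, int64_t n) {
  using VecT = VectorType<T, VecSize>;
  int64_t idx =
      (static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize;
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x * VecSize;
  for (; idx < n; idx += stride) {
    *reinterpret_cast<VecT*>(&out[idx]) =
        *reinterpret_cast<const VecT*>(&in[idx]);
  }
}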
@@ -248,12 +184,10 @@ void GatherV2CUDAFunction(const DenseTensor* input,
   int axis_index = axis;
   int64_t index_dim_size = input_dim[axis_index];
 
-  int64_t inner_dim_size = 1;
   int64_t outer_dim_size = 1;
   std::vector<int64_t> out_dim_vec;
 
   for (int i = 0; i < axis_index; i++) {
-    inner_dim_size *= input_dim[i];
     out_dim_vec.push_back(input_dim[i]);
   }
   if (index->dims().size() != 0) {
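The inner_dim_size removals are dead-code cleanup: the kernel never read that parameter, since it derives inner_dim_index directly as idx / outer_size. A worked decomposition with illustrative numbers (mine, not taken from the diff) shows the arithmetic the kernel actually uses:

// input shape [2, 3, 4], axis = 1, index length 5:
//   outer_dim_size       = 4        (product of dims after axis)
//   input_index_dim_size = 3        (input_dim[axis])
//   out_index_dim_size   = 5, so out shape [2, 5, 4] and size = 40
//   outer_size           = 4 * 5 = 20
// For output idx = 27:
//   inner_dim_index = 27 / 20 = 1
//   next_idx        = 27 % 20 = 7
//   index_dim_index = 7 / 4   = 1   -> index_val = index[1]
//   out_dim_index   = 7 % 4   = 3
//   input_index     = 1 * (4 * 3) + index_val * 4 + 3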
@@ -270,18 +204,54 @@ void GatherV2CUDAFunction(const DenseTensor* input,
   int64_t out_size = out->numel();
   if (out_size == 0) return;
 
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
+  int vec_size = 4;
+  vec_size = std::min(phi::GetVectorizedSize(input), vec_size);
+  vec_size = std::min(phi::GetVectorizedSize(out), vec_size);
+  while (vec_size > 1 && outer_dim_size % vec_size != 0) {
+    vec_size /= 2;
+  }
+
+  constexpr int loop_count = 4;
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+      ctx, out_size, vec_size * loop_count);
   auto stream = ctx.stream();
-  GatherGPUKernel<T, U>
-      <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-          input_data,
-          index_data,
-          out_data,
-          outer_dim_size,
-          inner_dim_size,
-          index_size,
-          index_dim_size,
-          out_size);
+
+  switch (vec_size) {
+#define CASE_VEC_SIZE(__Sz)                                              \
+  case __Sz:                                                             \
+    GatherGPUKernel<T, U, __Sz>                                          \
+        <<<config.block_per_grid, config.thread_per_block, 0, stream>>>( \
+            input_data,                                                  \
+            index_data,                                                  \
+            out_data,                                                    \
+            outer_dim_size,                                              \
+            index_size,                                                  \
+            index_dim_size,                                              \
+            out_size);                                                   \
+    break
+    CASE_VEC_SIZE(4);
+    CASE_VEC_SIZE(2);
+    CASE_VEC_SIZE(1);
+#undef CASE_VEC_SIZE
+    default:
+      PADDLE_THROW(common::errors::Unimplemented(
+          "Unsupported vectorized size: %d", vec_size));
+  }
+}
+
+/**
+ * A thin wrapper on gpu tensor
+ * Return a new tensor from source tensor, gathered according to index
+ * input[src]: type-T source Tensor
+ * input[index]: type-IndexT index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T, typename IndexT = int>
+void GPUGather(const phi::GPUContext& ctx,
+               const DenseTensor& src,
+               const DenseTensor& index,
+               DenseTensor* output) {
+  GatherV2CUDAFunction<T, IndexT>(&src, &index, /* axis= */ 0, output, ctx);
 }
 
 template <typename T, typename U>
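For reference, the launch-side change above picks the widest vector width that both tensors' alignment and the innermost contiguous run allow, then sizes the grid for vec_size * loop_count elements per thread. A standalone sketch of that selection; ChooseVecSize and its stand-in arguments are illustrative, while the real phi::GetVectorizedSize inspects pointer alignment of the tensors:

#include <algorithm>
#include <cstdint>

// Mirror of the host-side vec_size selection in GatherV2CUDAFunction.
int ChooseVecSize(int input_vec, int out_vec, int64_t outer_dim_size) {
  int vec_size = std::min({4, input_vec, out_vec});
  // A vectorized access spans vec_size consecutive elements of one slice,
  // so the innermost contiguous run (outer_dim_size) must divide evenly.
  while (vec_size > 1 && outer_dim_size % vec_size != 0) {
    vec_size /= 2;
  }
  return vec_size;
}
// e.g. ChooseVecSize(4, 4, 6) == 2: 4-wide alignment is available, but a
// slice of 6 elements only tiles with 2-wide vectors.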