Commit 0498fa7

[PHI] Optimize Gather kernel with vectorization
1 parent 0a85d41 commit 0498fa7
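
This commit deletes the scalar GatherCUDAKernel / GPUGather pair and reroutes GPUGather through GatherV2CUDAFunction with axis = 0. GatherGPUKernel gains a compile-time VecSize parameter so each thread copies VecSize contiguous elements with a single aligned vector load/store. The launcher probes the widest vec_size (up to 4) that the input/output pointers and outer_dim_size allow, then dispatches to the matching instantiation; the unused inner_dim_size argument and the in-kernel PADDLE_ENFORCE bounds check are dropped along the way.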

File tree

1 file changed (+60 -90 lines)


paddle/phi/kernels/funcs/gather.cu.h (+60 -90)
@@ -22,29 +22,13 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"

 namespace phi {
 namespace funcs {

-template <typename T, typename IndexT = int>
-__global__ void GatherCUDAKernel(const T* params,
-                                 const IndexT* indices,
-                                 T* output,
-                                 size_t index_size,
-                                 size_t slice_size,
-                                 int64_t index_dim_size) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
-    int64_t indices_i = i / slice_size;
-    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT gather_i =
-        (indices[indices_i] < 0 ? (indices[indices_i] + index_dim_size)
-                                : indices[indices_i]);
-    int64_t params_i = gather_i * slice_size + slice_i;
-    *(output + i) = *(params + params_i);
-  }
-}
-
 template <typename T, typename IndexT = int>
 __global__ void GatherNdCUDAKernel(const T* input,
                                    const Dim<DDim::kMaxRank> input_dims,
@@ -81,48 +65,6 @@ __global__ void GatherNdCUDAKernel(const T* input,
   }
 }

-/**
- * A thin wrapper on gpu tensor
- * Return a new tensor from source tensor, gathered according to index
- * input[src]: type-T source Tensor
- * input[index]: type-IndexT index Tensor (1-D)
- * return: output tensor
- */
-template <typename T, typename IndexT = int>
-void GPUGather(const phi::GPUContext& ctx,
-               const DenseTensor& src,
-               const DenseTensor& index,
-               DenseTensor* output) {
-  if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(
-        index.dims()[1],
-        1,
-        common::errors::InvalidArgument("If the index's rank of gather_op is 2,"
-                                        " the second dimension should be 1."));
-  }
-
-  // index size
-  int64_t index_size = index.dims().size() == 0 ? 1 : index.dims()[0];
-
-  auto src_dims = src.dims();
-
-  // slice size
-  int64_t slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-
-  const T* p_src = src.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  int block = 512;
-  int64_t n = slice_size * index_size;
-  dim3 grid = dim3((n + block - 1) / block);
-  phi::backends::gpu::LimitGridDim(ctx, &grid);
-
-  GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
-      p_src, p_index, p_output, index_size, slice_size, src_dims[0]);
-}
-
 template <typename T, typename IndexT = int>
 void GPUGatherNd(const phi::GPUContext& ctx,
                  const DenseTensor& input,
@@ -170,42 +112,36 @@ void GPUGatherNd(const phi::GPUContext& ctx,
       end_size);
 }

-template <typename T, typename U>
+template <typename T, typename U, int VecSize>
 __global__ void GatherGPUKernel(const T* input,
                                 const U* index,
                                 T* out,
                                 int64_t outer_dim_size,
-                                int64_t inner_dim_size,
                                 int64_t out_index_dim_size,
                                 int64_t input_index_dim_size,
                                 int64_t size) {
-  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int64_t block_size = blockDim.x;
+  int64_t idx = (blockIdx.x * block_size + threadIdx.x) * VecSize;
   int64_t outer_size = outer_dim_size * out_index_dim_size;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
+  for (; idx < size; idx += gridDim.x * block_size * VecSize) {
     int64_t inner_dim_index = idx / outer_size;
-    int64_t next_idx = idx - outer_size * inner_dim_index;
+    int64_t next_idx = idx % outer_size;
     int64_t index_dim_index = next_idx / outer_dim_size;
     U index_val = index[index_dim_index];

-    PADDLE_ENFORCE(
-        index_val >= -input_index_dim_size && index_val < input_index_dim_size,
-        "The index is out of bounds, "
-        "please check whether the dimensions of index and "
-        "input meet the requirements. It should "
-        "be less than [%ld] and greater than or equal to [%ld], but "
-        "received [%ld]",
-        input_index_dim_size,
-        -input_index_dim_size,
-        index_val);
     if (index_val < 0) {
       index_val += input_index_dim_size;
     }

-    int64_t out_dim_index = next_idx - outer_dim_size * index_dim_index;
+    int64_t out_dim_index = next_idx % outer_dim_size;
     int64_t input_index =
         inner_dim_index * (outer_dim_size * input_index_dim_size) +
         index_val * outer_dim_size + out_dim_index;
-    out[idx] = input[input_index];
+
+    using VecType = kps::details::VectorType<T, VecSize>;
+    const VecType* src = reinterpret_cast<const VecType*>(&input[input_index]);
+    VecType* dst = reinterpret_cast<VecType*>(&out[idx]);
+    *dst = *src;
   }
 }

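The heart of the change is the vectorized copy at the bottom of the loop. As a rough standalone sketch of that idiom (AlignedVec and VecCopy are illustrative names, not Paddle APIs; in the real kernel kps::details::VectorType plays the role of AlignedVec), each thread moves VecSize contiguous elements with one aligned load/store instead of VecSize scalar accesses:

// Hypothetical sketch of the vectorized-copy pattern used by the kernel.
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) AlignedVec {
  T val[VecSize];
};

template <typename T, int VecSize>
__global__ void VecCopy(const T* __restrict__ in,
                        T* __restrict__ out,
                        int64_t n) {
  using VecT = AlignedVec<T, VecSize>;
  int64_t i =
      (blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x) * VecSize;
  // Grid-stride loop over whole vectors; with T = float and VecSize = 4,
  // each iteration issues a single 16-byte transaction per thread.
  for (; i < n; i += static_cast<int64_t>(gridDim.x) * blockDim.x * VecSize) {
    *reinterpret_cast<VecT*>(&out[i]) = *reinterpret_cast<const VecT*>(&in[i]);
  }
}

The reinterpret_cast in GatherGPUKernel is safe only because idx and input_index always advance in whole multiples of VecSize along the contiguous outer_dim_size axis; that is exactly the condition the launcher below enforces by halving vec_size until outer_dim_size % vec_size == 0.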
@@ -248,12 +184,10 @@ void GatherV2CUDAFunction(const DenseTensor* input,
   int axis_index = axis;
   int64_t index_dim_size = input_dim[axis_index];

-  int64_t inner_dim_size = 1;
   int64_t outer_dim_size = 1;
   std::vector<int64_t> out_dim_vec;

   for (int i = 0; i < axis_index; i++) {
-    inner_dim_size *= input_dim[i];
     out_dim_vec.push_back(input_dim[i]);
   }
   if (index->dims().size() != 0) {
@@ -270,18 +204,54 @@ void GatherV2CUDAFunction(const DenseTensor* input,
   int64_t out_size = out->numel();
   if (out_size == 0) return;

-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
+  int vec_size = 4;
+  vec_size = std::min(phi::GetVectorizedSize(input), vec_size);
+  vec_size = std::min(phi::GetVectorizedSize(out), vec_size);
+  while (vec_size > 1 && outer_dim_size % vec_size != 0) {
+    vec_size /= 2;
+  }
+
+  constexpr int loop_count = 4;
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+      ctx, out_size, vec_size * loop_count);
   auto stream = ctx.stream();
-  GatherGPUKernel<T, U>
-      <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-          input_data,
-          index_data,
-          out_data,
-          outer_dim_size,
-          inner_dim_size,
-          index_size,
-          index_dim_size,
-          out_size);
+
+  switch (vec_size) {
+#define CASE_VEC_SIZE(__Sz)                                              \
+  case __Sz:                                                             \
+    GatherGPUKernel<T, U, __Sz>                                          \
+        <<<config.block_per_grid, config.thread_per_block, 0, stream>>>( \
+            input_data,                                                  \
+            index_data,                                                  \
+            out_data,                                                    \
+            outer_dim_size,                                              \
+            index_size,                                                  \
+            index_dim_size,                                              \
+            out_size);                                                   \
+    break
+    CASE_VEC_SIZE(4);
+    CASE_VEC_SIZE(2);
+    CASE_VEC_SIZE(1);
+#undef CASE_VEC_SIZE
+    default:
+      PADDLE_THROW(common::errors::Unimplemented(
+          "Unsupported vectorized size: %d", vec_size));
+  }
+}
+
+/**
+ * A thin wrapper on gpu tensor
+ * Return a new tensor from source tensor, gathered according to index
+ * input[src]: type-T source Tensor
+ * input[index]: type-IndexT index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T, typename IndexT = int>
+void GPUGather(const phi::GPUContext& ctx,
+               const DenseTensor& src,
+               const DenseTensor& index,
+               DenseTensor* output) {
+  GatherV2CUDAFunction<T, IndexT>(&src, &index, /* axis= */ 0, output, ctx);
 }

 template <typename T, typename U>

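A note on the dispatch: the vector width must be a template parameter (the kernel indexes and casts with it at compile time), yet alignment is only known at run time, so the launcher uses the standard switch-to-template idiom. A condensed, self-contained sketch with illustrative names (ChooseVecSize, LaunchGather, and DispatchGather are not Paddle APIs):

#include <cstdint>

template <int VecSize>
void LaunchGather();  // stand-in for the templated kernel launch above

// Probe the widest usable width: halve until both pointers are aligned for
// a vec-wide access and the contiguous inner extent splits into whole
// vectors. In the real code, phi::GetVectorizedSize performs the alignment
// half of this probe and the outer_dim_size % vec_size loop the divisibility
// half.
inline int ChooseVecSize(const void* in,
                         const void* out,
                         std::int64_t inner_extent,
                         int elem_bytes) {
  int vec = 4;  // widest width tried first: 16 bytes when elem_bytes == 4
  while (vec > 1 &&
         (reinterpret_cast<std::uintptr_t>(in) % (vec * elem_bytes) != 0 ||
          reinterpret_cast<std::uintptr_t>(out) % (vec * elem_bytes) != 0 ||
          inner_extent % vec != 0)) {
    vec /= 2;
  }
  return vec;
}

void DispatchGather(const void* in, void* out, std::int64_t inner_extent) {
  switch (ChooseVecSize(in, out, inner_extent, /*elem_bytes=*/4)) {
    case 4: LaunchGather<4>(); break;
    case 2: LaunchGather<2>(); break;
    default: LaunchGather<1>(); break;
  }
}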
0 commit comments
