[PHI] Optimize Gather kernel with vectorization

lshpku · lshpku · commit 5ad51a73b9f2 · 2025-04-14T11:58:59.000Z
diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h
@@ -22,29 +22,13 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 namespace phi {
 namespace funcs {
 
-template <typename T, typename IndexT = int>
-__global__ void GatherCUDAKernel(const T* params,
-                                 const IndexT* indices,
-                                 T* output,
-                                 size_t index_size,
-                                 size_t slice_size,
-                                 int64_t index_dim_size) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
-    int64_t indices_i = i / slice_size;
-    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT gather_i =
-        (indices[indices_i] < 0 ? (indices[indices_i] + index_dim_size)
-                                : indices[indices_i]);
-    int64_t params_i = gather_i * slice_size + slice_i;
-    *(output + i) = *(params + params_i);
-  }
-}
-
 template <typename T, typename IndexT = int>
 __global__ void GatherNdCUDAKernel(const T* input,
                                    const Dim<DDim::kMaxRank> input_dims,
@@ -81,48 +65,6 @@ __global__ void GatherNdCUDAKernel(const T* input,
   }
 }
 
-/**
- * A thin wrapper on gpu tensor
- * Return a new tensor from source tensor, gathered according to index
- * input[src]: type-T source Tensor
- * input[index]: type-IndexT index Tensor (1-D)
- * return: output tensor
- */
-template <typename T, typename IndexT = int>
-void GPUGather(const phi::GPUContext& ctx,
-               const DenseTensor& src,
-               const DenseTensor& index,
-               DenseTensor* output) {
-  if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(
-        index.dims()[1],
-        1,
-        common::errors::InvalidArgument("If the index's rank of gather_op is 2,"
-                                        " the second dimension should be 1."));
-  }
-
-  // index size
-  int64_t index_size = index.dims().size() == 0 ? 1 : index.dims()[0];
-
-  auto src_dims = src.dims();
-
-  // slice size
-  int64_t slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-
-  const T* p_src = src.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  int block = 512;
-  int64_t n = slice_size * index_size;
-  dim3 grid = dim3((n + block - 1) / block);
-  phi::backends::gpu::LimitGridDim(ctx, &grid);
-
-  GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
-      p_src, p_index, p_output, index_size, slice_size, src_dims[0]);
-}
-
 template <typename T, typename IndexT = int>
 void GPUGatherNd(const phi::GPUContext& ctx,
                  const DenseTensor& input,
@@ -170,20 +112,20 @@ void GPUGatherNd(const phi::GPUContext& ctx,
                                                                   end_size);
 }
 
-template <typename T, typename U>
+template <typename T, typename U, int VecSize>
 __global__ void GatherGPUKernel(const T* input,
                                 const U* index,
                                 T* out,
                                 int64_t outer_dim_size,
-                                int64_t inner_dim_size,
                                 int64_t out_index_dim_size,
                                 int64_t input_index_dim_size,
                                 int64_t size) {
-  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int64_t block_size = blockDim.x;
+  int64_t idx = (blockIdx.x * block_size + threadIdx.x) * VecSize;
   int64_t outer_size = outer_dim_size * out_index_dim_size;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
+  for (; idx < size; idx += gridDim.x * block_size * VecSize) {
     int64_t inner_dim_index = idx / outer_size;
-    int64_t next_idx = idx - outer_size * inner_dim_index;
+    int64_t next_idx = idx % outer_size;
     int64_t index_dim_index = next_idx / outer_dim_size;
     U index_val = index[index_dim_index];
 
@@ -201,11 +143,15 @@ __global__ void GatherGPUKernel(const T* input,
       index_val += input_index_dim_size;
     }
 
-    int64_t out_dim_index = next_idx - outer_dim_size * index_dim_index;
+    int64_t out_dim_index = next_idx % outer_dim_size;
     int64_t input_index =
         inner_dim_index * (outer_dim_size * input_index_dim_size) +
         index_val * outer_dim_size + out_dim_index;
-    out[idx] = input[input_index];
+
+    using VecType = kps::details::VectorType<T, VecSize>;
+    const VecType* src = reinterpret_cast<const VecType*>(&input[input_index]);
+    VecType* dst = reinterpret_cast<VecType*>(&out[idx]);
+    *dst = *src;
   }
 }
 
@@ -248,12 +194,10 @@ void GatherV2CUDAFunction(const DenseTensor* input,
   int axis_index = axis;
   int64_t index_dim_size = input_dim[axis_index];
 
-  int64_t inner_dim_size = 1;
   int64_t outer_dim_size = 1;
   std::vector<int64_t> out_dim_vec;
 
   for (int i = 0; i < axis_index; i++) {
-    inner_dim_size *= input_dim[i];
     out_dim_vec.push_back(input_dim[i]);
   }
   if (index->dims().size() != 0) {
@@ -270,18 +214,54 @@ void GatherV2CUDAFunction(const DenseTensor* input,
   int64_t out_size = out->numel();
   if (out_size == 0) return;
 
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
+  int vec_size = 4;
+  vec_size = std::min(phi::GetVectorizedSize(input), vec_size);
+  vec_size = std::min(phi::GetVectorizedSize(out), vec_size);
+  while (vec_size > 1 && outer_dim_size % vec_size != 0) {
+    vec_size /= 2;
+  }
+
+  constexpr int loop_count = 4;
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+      ctx, out_size, vec_size * loop_count);
   auto stream = ctx.stream();
-  GatherGPUKernel<T, U>
-      <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-          input_data,
-          index_data,
-          out_data,
-          outer_dim_size,
-          inner_dim_size,
-          index_size,
-          index_dim_size,
-          out_size);
+
+  switch (vec_size) {
+#define CASE_VEC_SIZE(__Sz)                                              \
+  case __Sz:                                                             \
+    GatherGPUKernel<T, U, __Sz>                                          \
+        <<<config.block_per_grid, config.thread_per_block, 0, stream>>>( \
+            input_data,                                                  \
+            index_data,                                                  \
+            out_data,                                                    \
+            outer_dim_size,                                              \
+            index_size,                                                  \
+            index_dim_size,                                              \
+            out_size);                                                   \
+    break
+    CASE_VEC_SIZE(4);
+    CASE_VEC_SIZE(2);
+    CASE_VEC_SIZE(1);
+#undef CASE_VEC_SIZE
+    default:
+      PADDLE_THROW(common::errors::Unimplemented(
+          "Unsupported vectorized size: %d", vec_size));
+  }
+}
+
+/**
+ * A thin wrapper on gpu tensor
+ * Return a new tensor from source tensor, gathered according to index
+ * input[src]: type-T source Tensor
+ * input[index]: type-IndexT index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T, typename IndexT = int>
+void GPUGather(const phi::GPUContext& ctx,
+               const DenseTensor& src,
+               const DenseTensor& index,
+               DenseTensor* output) {
+  GatherV2CUDAFunction<T, IndexT>(&src, &index, /* axis= */ 0, output, ctx);
 }
 
 template <typename T, typename U>
diff --git a/test/ir/inference/test_trt_convert_gather.py b/test/ir/inference/test_trt_convert_gather.py
@@ -49,59 +49,57 @@ def generate_input3(axis):
             return np.array([axis]).astype(np.int32)
 
         for shape in [[32], [16, 64], [32, 16, 16], [32, 64, 16, 32]]:
-            for index in [[1, 4], [4, 8]]:
+            for index in [[0, 1]]:
                 for axis in [0, 1, 2, 3]:
-                    for overwrite in [True, False]:
-                        for input in [
-                            {"X": ["input_data"], "Index": ["index_data"]},
-                        ]:
-                            for index_type_int32 in [True, False]:
-                                self.shape = shape
-                                self.axis = axis
-                                self.input_num = len(input)
-                                self.index_type_int32 = index_type_int32
-                                dics = [{"overwrite": overwrite, "axis": axis}]
-                                ops_config = [
+                    for input in [
+                        {"X": ["input_data"], "Index": ["index_data"]},
+                    ]:
+                        for index_type_int32 in [True, False]:
+                            self.shape = shape
+                            self.axis = axis
+                            self.input_num = len(input)
+                            self.index_type_int32 = index_type_int32
+                            ops_config = [
+                                {
+                                    "op_type": "gather",
+                                    "op_inputs": input,
+                                    "op_outputs": {"Out": ["output_data"]},
+                                    "op_attrs": {"axis": axis},
+                                }
+                            ]
+                            ops = self.generate_op_config(ops_config)
+
+                            program_config = ProgramConfig(
+                                ops=ops,
+                                weights={},
+                                inputs=(
                                     {
-                                        "op_type": "gather",
-                                        "op_inputs": input,
-                                        "op_outputs": {"Out": ["output_data"]},
-                                        "op_attrs": dics[0],
+                                        "index_data": TensorConfig(
+                                            data_gen=partial(
+                                                (
+                                                    generate_input2
+                                                    if index_type_int32
+                                                    else generate_input4
+                                                ),
+                                                index,
+                                            )
+                                        ),
+                                        "input_data": TensorConfig(
+                                            data_gen=partial(
+                                                generate_input1, shape
+                                            )
+                                        ),
                                     }
-                                ]
-                                ops = self.generate_op_config(ops_config)
-
-                                program_config = ProgramConfig(
-                                    ops=ops,
-                                    weights={},
-                                    inputs=(
-                                        {
-                                            "index_data": TensorConfig(
-                                                data_gen=partial(
-                                                    (
-                                                        generate_input2
-                                                        if index_type_int32
-                                                        else generate_input4
-                                                    ),
-                                                    index,
-                                                )
-                                            ),
-                                            "input_data": TensorConfig(
-                                                data_gen=partial(
-                                                    generate_input1, shape
-                                                )
-                                            ),
-                                        }
-                                    ),
-                                    outputs=["output_data"],
-                                )
-
-                                yield program_config
+                                ),
+                                outputs=["output_data"],
+                            )
+
+                            yield program_config
 
     def generate_dynamic_shape(self):
         if len(self.shape) == 1:
             self.dynamic_shape.min_input_shape = {
-                "input_data": [1],
+                "input_data": [2],
                 "index_data": [2],
             }
             self.dynamic_shape.max_input_shape = {