Commit 0498fa7

[PHI] Optimize Gather kernel with vectorization
1 parent 0a85d41 commit 0498fa7
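
This commit deletes the scalar GatherCUDAKernel / GPUGather pair and reroutes GPUGather through GatherV2CUDAFunction with axis = 0. GatherGPUKernel gains a compile-time VecSize parameter so each thread copies VecSize contiguous elements with a single aligned vector load/store. The launcher probes the widest vec_size (up to 4) that the input/output pointers and outer_dim_size allow, then dispatches to the matching instantiation; the unused inner_dim_size argument and the in-kernel PADDLE_ENFORCE bounds check are dropped along the way.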

File tree

1 file changed (+60 -90 lines)


paddle/phi/kernels/funcs/gather.cu.h (+60 -90)
@@ -22,29 +22,13 @@ limitations under the License. */
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/primitive/kernel_primitives.h"

 namespace phi {
 namespace funcs {

-template <typename T, typename IndexT = int>
-__global__ void GatherCUDAKernel(const T* params,
-                                 const IndexT* indices,
-                                 T* output,
-                                 size_t index_size,
-                                 size_t slice_size,
-                                 int64_t index_dim_size) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
-    int64_t indices_i = i / slice_size;
-    int64_t slice_i = i - indices_i * slice_size;  // offset inside the slice
-    IndexT gather_i =
-        (indices[indices_i] < 0 ? (indices[indices_i] + index_dim_size)
-                                : indices[indices_i]);
-    int64_t params_i = gather_i * slice_size + slice_i;
-    *(output + i) = *(params + params_i);
-  }
-}
-
 template <typename T, typename IndexT = int>
 __global__ void GatherNdCUDAKernel(const T* input,
                                    const Dim<DDim::kMaxRank> input_dims,
@@ -81,48 +65,6 @@ __global__ void GatherNdCUDAKernel(const T* input,
   }
 }

-/**
- * A thin wrapper on gpu tensor
- * Return a new tensor from source tensor, gathered according to index
- * input[src]: type-T source Tensor
- * input[index]: type-IndexT index Tensor (1-D)
- * return: output tensor
- */
-template <typename T, typename IndexT = int>
-void GPUGather(const phi::GPUContext& ctx,
-               const DenseTensor& src,
-               const DenseTensor& index,
-               DenseTensor* output) {
-  if (index.dims().size() == 2) {
-    PADDLE_ENFORCE_EQ(
-        index.dims()[1],
-        1,
-        common::errors::InvalidArgument("If the index's rank of gather_op is 2,"
-                                        " the second dimension should be 1."));
-  }
-
-  // index size
-  int64_t index_size = index.dims().size() == 0 ? 1 : index.dims()[0];
-
-  auto src_dims = src.dims();
-
-  // slice size
-  int64_t slice_size = 1;
-  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
-
-  const T* p_src = src.data<T>();
-  const IndexT* p_index = index.data<IndexT>();
-  T* p_output = output->data<T>();
-
-  int block = 512;
-  int64_t n = slice_size * index_size;
-  dim3 grid = dim3((n + block - 1) / block);
-  phi::backends::gpu::LimitGridDim(ctx, &grid);
-
-  GatherCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
-      p_src, p_index, p_output, index_size, slice_size, src_dims[0]);
-}
-
 template <typename T, typename IndexT = int>
 void GPUGatherNd(const phi::GPUContext& ctx,
                  const DenseTensor& input,
@@ -170,42 +112,36 @@ void GPUGatherNd(const phi::GPUContext& ctx,
       end_size);
 }

-template <typename T, typename U>
+template <typename T, typename U, int VecSize>
 __global__ void GatherGPUKernel(const T* input,
                                 const U* index,
                                 T* out,
                                 int64_t outer_dim_size,
-                                int64_t inner_dim_size,
                                 int64_t out_index_dim_size,
                                 int64_t input_index_dim_size,
                                 int64_t size) {
-  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int64_t block_size = blockDim.x;
+  int64_t idx = (blockIdx.x * block_size + threadIdx.x) * VecSize;
   int64_t outer_size = outer_dim_size * out_index_dim_size;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
+  for (; idx < size; idx += gridDim.x * block_size * VecSize) {
     int64_t inner_dim_index = idx / outer_size;
-    int64_t next_idx = idx - outer_size * inner_dim_index;
+    int64_t next_idx = idx % outer_size;
     int64_t index_dim_index = next_idx / outer_dim_size;
     U index_val = index[index_dim_index];

-    PADDLE_ENFORCE(
-        index_val >= -input_index_dim_size && index_val < input_index_dim_size,
-        "The index is out of bounds, "
-        "please check whether the dimensions of index and "
-        "input meet the requirements. It should "
-        "be less than [%ld] and greater than or equal to [%ld], but "
-        "received [%ld]",
-        input_index_dim_size,
-        -input_index_dim_size,
-        index_val);
     if (index_val < 0) {
       index_val += input_index_dim_size;
     }

-    int64_t out_dim_index = next_idx - outer_dim_size * index_dim_index;
+    int64_t out_dim_index = next_idx % outer_dim_size;
     int64_t input_index =
         inner_dim_index * (outer_dim_size * input_index_dim_size) +
         index_val * outer_dim_size + out_dim_index;
-    out[idx] = input[input_index];
+
+    using VecType = kps::details::VectorType<T, VecSize>;
+    const VecType* src = reinterpret_cast<const VecType*>(&input[input_index]);
+    VecType* dst = reinterpret_cast<VecType*>(&out[idx]);
+    *dst = *src;
   }
 }

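The heart of the change is the vectorized copy at the bottom of the loop. As a rough standalone sketch of that idiom (AlignedVec and VecCopy are illustrative names, not Paddle APIs; in the real kernel kps::details::VectorType plays the role of AlignedVec), each thread moves VecSize contiguous elements with one aligned load/store instead of VecSize scalar accesses:

// Hypothetical sketch of the vectorized-copy pattern used by the kernel.
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) AlignedVec {
  T val[VecSize];
};

template <typename T, int VecSize>
__global__ void VecCopy(const T* __restrict__ in,
                        T* __restrict__ out,
                        int64_t n) {
  using VecT = AlignedVec<T, VecSize>;
  int64_t i =
      (blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x) * VecSize;
  // Grid-stride loop over whole vectors; with T = float and VecSize = 4,
  // each iteration issues a single 16-byte transaction per thread.
  for (; i < n; i += static_cast<int64_t>(gridDim.x) * blockDim.x * VecSize) {
    *reinterpret_cast<VecT*>(&out[i]) = *reinterpret_cast<const VecT*>(&in[i]);
  }
}

The reinterpret_cast in GatherGPUKernel is safe only because idx and input_index always advance in whole multiples of VecSize along the contiguous outer_dim_size axis; that is exactly the condition the launcher below enforces by halving vec_size until outer_dim_size % vec_size == 0.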
@@ -248,12 +184,10 @@ void GatherV2CUDAFunction(const DenseTensor* input,
   int axis_index = axis;
   int64_t index_dim_size = input_dim[axis_index];

-  int64_t inner_dim_size = 1;
   int64_t outer_dim_size = 1;
   std::vector<int64_t> out_dim_vec;

   for (int i = 0; i < axis_index; i++) {
-    inner_dim_size *= input_dim[i];
     out_dim_vec.push_back(input_dim[i]);
   }
   if (index->dims().size() != 0) {
@@ -270,18 +204,54 @@ void GatherV2CUDAFunction(const DenseTensor* input,
   int64_t out_size = out->numel();
   if (out_size == 0) return;

-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
+  int vec_size = 4;
+  vec_size = std::min(phi::GetVectorizedSize(input), vec_size);
+  vec_size = std::min(phi::GetVectorizedSize(out), vec_size);
+  while (vec_size > 1 && outer_dim_size % vec_size != 0) {
+    vec_size /= 2;
+  }
+
+  constexpr int loop_count = 4;
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+      ctx, out_size, vec_size * loop_count);
   auto stream = ctx.stream();
-  GatherGPUKernel<T, U>
-      <<<config.block_per_grid, config.thread_per_block, 0, stream>>>(
-          input_data,
-          index_data,
-          out_data,
-          outer_dim_size,
-          inner_dim_size,
-          index_size,
-          index_dim_size,
-          out_size);
+
+  switch (vec_size) {
+#define CASE_VEC_SIZE(__Sz)                                              \
+  case __Sz:                                                             \
+    GatherGPUKernel<T, U, __Sz>                                          \
+        <<<config.block_per_grid, config.thread_per_block, 0, stream>>>( \
+            input_data,                                                  \
+            index_data,                                                  \
+            out_data,                                                    \
+            outer_dim_size,                                              \
+            index_size,                                                  \
+            index_dim_size,                                              \
+            out_size);                                                   \
+    break
+    CASE_VEC_SIZE(4);
+    CASE_VEC_SIZE(2);
+    CASE_VEC_SIZE(1);
+#undef CASE_VEC_SIZE
+    default:
+      PADDLE_THROW(common::errors::Unimplemented(
+          "Unsupported vectorized size: %d", vec_size));
+  }
+}
+
+/**
+ * A thin wrapper on gpu tensor
+ * Return a new tensor from source tensor, gathered according to index
+ * input[src]: type-T source Tensor
+ * input[index]: type-IndexT index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T, typename IndexT = int>
+void GPUGather(const phi::GPUContext& ctx,
+               const DenseTensor& src,
+               const DenseTensor& index,
+               DenseTensor* output) {
+  GatherV2CUDAFunction<T, IndexT>(&src, &index, /* axis= */ 0, output, ctx);
 }

 template <typename T, typename U>

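A note on the dispatch: the vector width must be a template parameter (the kernel indexes and casts with it at compile time), yet alignment is only known at run time, so the launcher uses the standard switch-to-template idiom. A condensed, self-contained sketch with illustrative names (ChooseVecSize, LaunchGather, and DispatchGather are not Paddle APIs):

#include <cstdint>

template <int VecSize>
void LaunchGather();  // stand-in for the templated kernel launch above

// Probe the widest usable width: halve until both pointers are aligned for
// a vec-wide access and the contiguous inner extent splits into whole
// vectors. In the real code, phi::GetVectorizedSize performs the alignment
// half of this probe and the outer_dim_size % vec_size loop the divisibility
// half.
inline int ChooseVecSize(const void* in,
                         const void* out,
                         std::int64_t inner_extent,
                         int elem_bytes) {
  int vec = 4;  // widest width tried first: 16 bytes when elem_bytes == 4
  while (vec > 1 &&
         (reinterpret_cast<std::uintptr_t>(in) % (vec * elem_bytes) != 0 ||
          reinterpret_cast<std::uintptr_t>(out) % (vec * elem_bytes) != 0 ||
          inner_extent % vec != 0)) {
    vec /= 2;
  }
  return vec;
}

void DispatchGather(const void* in, void* out, std::int64_t inner_extent) {
  switch (ChooseVecSize(in, out, inner_extent, /*elem_bytes=*/4)) {
    case 4: LaunchGather<4>(); break;
    case 2: LaunchGather<2>(); break;
    default: LaunchGather<1>(); break;
  }
}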
0 commit comments
