Skip to content

Commit 588f0ec

Browse files
committed
add index_elementwise_get_grad kernel
1 parent 108db2c commit 588f0ec

File tree

10 files changed

+292
-62
lines changed

10 files changed

+292
-62
lines changed

paddle/fluid/pybind/slice_utils.h

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,15 @@ static inline common::DDim infer_size_symdimvector(common::DDim a,
5454
auto sizeA = (dimA >= 0) ? a[dimA] : 1;
5555
auto sizeB = (dimB >= 0) ? b[dimB] : 1;
5656

57-
PADDLE_ENFORCE(sizeA == sizeB || sizeA == 1 || sizeB == 1,
58-
common::errors::Fatal("The size of tensor a (",
59-
sizeA,
60-
") must match the size of tensor b (",
61-
sizeB,
62-
") at non-singleton dimension ",
63-
i));
57+
PADDLE_ENFORCE_EQ(
58+
sizeA == sizeB || sizeA == 1 || sizeB == 1,
59+
true,
60+
common::errors::Fatal("The size of tensor a (",
61+
sizeA,
62+
") must match the size of tensor b (",
63+
sizeB,
64+
") at non-singleton dimension ",
65+
i));
6466

6567
// 1s map to the other size (even 0).
6668
expandedSizes[i] = sizeA == 1 ? sizeB : sizeA;

paddle/phi/infermeta/backward.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1887,4 +1887,18 @@ void SetValueGradInferMeta(const MetaTensor& out_grad,
18871887
value_grad->share_lod(values);
18881888
}
18891889
}
1890+
1891+
void IndexElementwiseGetGradInferMeta(
1892+
const MetaTensor& x,
1893+
const std::vector<const MetaTensor*>& index,
1894+
const MetaTensor& out_grad,
1895+
const std::vector<int64_t>& input_dims,
1896+
const std::vector<int64_t>& input_strides,
1897+
const std::vector<int64_t>& index_dims,
1898+
const std::vector<int64_t>& index_strides,
1899+
MetaTensor* x_grad) {
1900+
if (x_grad) {
1901+
x_grad->share_meta(x);
1902+
}
1903+
}
18901904
} // namespace phi

paddle/phi/infermeta/backward.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,4 +680,14 @@ void SetValueGradInferMeta(const MetaTensor& out_grad,
680680
MetaTensor* x_grad,
681681
MetaTensor* value_grad);
682682

683+
// Infer-meta for the backward of index_elementwise_get: x_grad shares x's
// meta (dims/dtype/layout).  The dims/strides attributes mirror the kernel
// signature and do not affect shape inference.
void IndexElementwiseGetGradInferMeta(
    const MetaTensor& x,
    const std::vector<const MetaTensor*>& index,
    const MetaTensor& out_grad,
    const std::vector<int64_t>& input_dims,
    const std::vector<int64_t>& input_strides,
    const std::vector<int64_t>& index_dims,
    const std::vector<int64_t>& index_strides,
    MetaTensor* x_grad);
692+
683693
} // namespace phi

paddle/phi/kernels/funcs/index_elementwise.cu.h

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ constexpr int MAX_DIMS = 16;
3636
#else
3737
constexpr int MAX_DIMS = 25;
3838
#endif
39+
constexpr int MAX_DIMS = 9;  // NOTE(review): MAX_DIMS is already defined by the #if/#else block directly above — this redefinition will not compile. Delete this line, or remove the old definitions if 9 is the intended limit.
3940

4041
static constexpr int launch_bound2 = 4;
4142
static constexpr int launch_size_nd = 128;
@@ -91,9 +92,11 @@ struct OffsetCalculator {
9192
const int64_t* const* strides,
9293
const int64_t* element_sizes = nullptr)
9394
: dims(dims) {
94-
PADDLE_ENFORCE(dims <= MAX_DIMS,
95-
"The number of dimensions (%d) exceeds MAX_DIMS.",
96-
dims);
95+
PADDLE_ENFORCE_LE(
96+
dims,
97+
MAX_DIMS,
98+
common::errors::InvalidArgument(
99+
"Tensor has too many dims. Maximum dim is %d.", MAX_DIMS));
97100
for (int i = 0; i < dims; i++) {
98101
sizes_[i] = IntDivider<index_t>(sizes[i]);
99102
for (int arg = 0; arg < NARGS; arg++) {
@@ -144,10 +147,12 @@ std::array<char*, DDim::kMaxRank> GetIndexDataPtrs(
144147
for (size_t i = 0; i < index.size(); ++i) {
145148
const IndexT* p_index = index[i]->data<IndexT>();
146149

147-
PADDLE_ENFORCE(p_index != nullptr,
148-
"The pointer p_index must not be nullptr. "
149-
"Please ensure the index tensor is valid and its data "
150-
"is correctly initialized.");
150+
PADDLE_ENFORCE_NOT_NULL(
151+
p_index,
152+
::common::errors::InvalidArgument(
153+
"The pointer p_index is nullptr, "
154+
"please check whether the index tensor is valid and "
155+
"its data is correctly initialized."));
151156

152157
index_ptrs[i] = reinterpret_cast<char*>(const_cast<IndexT*>(p_index));
153158
}

paddle/phi/kernels/funcs/stride_utils.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,9 +197,10 @@ static inline void reorder_dimensions(const std::vector<int64_t> stride_size,
197197
permute_dimensions<N>(stride_size, perm_, strides_array, shape_);
198198
}
199199

200-
std::vector<int64_t> compatible_stride(const std::vector<int64_t>* shape_,
201-
const int64_t ndim,
202-
const int64_t element_size) {
200+
static inline std::vector<int64_t> compatible_stride(
201+
const std::vector<int64_t>* shape_,
202+
const int64_t ndim,
203+
const int64_t element_size) {
203204
std::vector<int64_t> stride;
204205
int64_t next_stride = element_size;
205206

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/phi/kernels/index_elementwise_get_grad_kernel.h"
16+
17+
#include "paddle/phi/backends/gpu/gpu_context.h"
18+
#include "paddle/phi/common/bfloat16.h"
19+
#include "paddle/phi/core/kernel_registry.h"
20+
#include "paddle/phi/kernels/funcs/eigen/common.h"
21+
#include "paddle/phi/kernels/funcs/index_elementwise.cu.h"
22+
#include "paddle/phi/kernels/funcs/stride_utils.h"
23+
24+
namespace phi {
25+
26+
template <typename T, typename IndexT = int>
27+
void GPUIndexElementwisePutKernel(const phi::GPUContext& ctx,
28+
const DenseTensor& input,
29+
const DenseTensor& value,
30+
const std::vector<const DenseTensor*>& index,
31+
const std::vector<int64_t>& input_dims,
32+
const std::vector<int64_t>& input_strides,
33+
const std::vector<int64_t>& index_dims,
34+
const std::vector<int64_t>& index_strides,
35+
DenseTensor* output) {
36+
int64_t numel = 0;
37+
38+
auto num_indices = index_dims.size();
39+
40+
auto sizes = std::array<int64_t, 25>{};
41+
auto strides = std::array<int64_t, 25>{};
42+
for (unsigned i = 0; i < num_indices; i++) {
43+
sizes[i] = index_dims[i];
44+
strides[i] = index_strides[i];
45+
}
46+
auto index_ptrs = funcs::GetIndexDataPtrs<IndexT>(index);
47+
48+
std::array<int64_t*, 3> strides_array;
49+
std::vector<int64_t> desired_shape;
50+
51+
funcs::IndexPutStride<3>(input_dims,
52+
input_strides,
53+
phi::SizeOf(input.dtype()),
54+
std::vector<int64_t>(),
55+
std::vector<int64_t>(),
56+
phi::SizeOf(value.dtype()),
57+
common::vectorize<int64_t>(index[0]->dims()),
58+
common::vectorize<int64_t>(index[0]->strides()),
59+
phi::SizeOf(index[0]->dtype()),
60+
&desired_shape,
61+
&strides_array,
62+
&numel);
63+
64+
const int64_t* template_stride = strides_array[2];
65+
PADDLE_ENFORCE_NOT_NULL(template_stride,
66+
::common::errors::InvalidArgument(
67+
"strides_array[2] should not be nullptr in "
68+
"GPUIndexElementwiseGetKernel"));
69+
70+
size_t stride_size = desired_shape.size();
71+
std::vector<std::vector<int64_t>> strides_vector;
72+
strides_vector.reserve(num_indices + 2);
73+
74+
for (int i = 0; i < 2; ++i) {
75+
if (i < strides_array.size() && strides_array[i] != nullptr) {
76+
strides_vector.emplace_back(strides_array[i],
77+
strides_array[i] + stride_size);
78+
} else {
79+
strides_vector.emplace_back(stride_size, 0);
80+
}
81+
}
82+
83+
std::vector<int64_t> template_vec(template_stride,
84+
template_stride + stride_size);
85+
for (size_t i = 0; i < num_indices; ++i) {
86+
strides_vector.push_back(template_vec);
87+
}
88+
89+
auto offset_calc = funcs::make_offset_calculator<3>(
90+
desired_shape.size(), desired_shape.data(), strides_vector);
91+
92+
const int64_t N = numel;
93+
PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
94+
"Output numel be in the range [0, "
95+
"std::numeric_limits<int32_t>::max()]");
96+
97+
constexpr int nt = 128;
98+
constexpr int vt = 4;
99+
const dim3 block(nt);
100+
const dim3 grid((N + block.x * vt - 1) / (block.x * vt));
101+
auto stream = ctx.stream();
102+
103+
using dtype = funcs::OpaqueType<sizeof(T)>;
104+
105+
const char* in_ptr = reinterpret_cast<const char*>(value.data<T>());
106+
char* out_ptr = reinterpret_cast<char*>(output->data<T>());
107+
108+
funcs::index_elementwise_kernel<nt, vt>
109+
<<<grid, block, 0, stream>>>(N, [=] __device__(int idx) {
110+
const auto offsets = offset_calc.get(idx);
111+
char* const out_data = out_ptr + offsets[0];
112+
const char* const in_data = in_ptr + offsets[1];
113+
114+
int64_t offset = 0;
115+
#pragma unroll
116+
for (int i = 0; i < num_indices; i++) {
117+
int64_t index =
118+
*reinterpret_cast<int64_t*>(index_ptrs[i] + offsets[2]);
119+
PADDLE_ENFORCE(-sizes[i] <= index && index < sizes[i],
120+
"index out of bounds");
121+
if (index < 0) {
122+
index += sizes[i];
123+
}
124+
offset += index * strides[i];
125+
}
126+
*reinterpret_cast<dtype*>(out_data + offset) =
127+
*reinterpret_cast<const dtype*>(in_data);
128+
});
129+
}
130+
131+
template <typename T, typename Context>
132+
void IndexElementwiseGetGradKernel(const Context& ctx,
133+
const DenseTensor& x,
134+
const std::vector<const DenseTensor*>& index,
135+
const DenseTensor& out_grad,
136+
const std::vector<int64_t>& input_dims,
137+
const std::vector<int64_t>& input_strides,
138+
const std::vector<int64_t>& index_dims,
139+
const std::vector<int64_t>& index_strides,
140+
DenseTensor* x_grad) {
141+
ctx.template Alloc<T>(x_grad);
142+
auto dxt = phi::EigenVector<T>::Flatten(*x_grad);
143+
auto& place = *ctx.eigen_device();
144+
dxt.device(place) = dxt.constant(static_cast<T>(0));
145+
if (out_grad.numel() == 0) return;
146+
147+
const auto& index_type = index[0]->dtype();
148+
PADDLE_ENFORCE_EQ(
149+
index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64,
150+
true,
151+
common::errors::InvalidArgument(
152+
"Index holds the wrong type, it holds [%s], but "
153+
"desires to be [%s] or [%s].",
154+
index_type,
155+
phi::DataType::INT32,
156+
phi::DataType::INT64));
157+
158+
if (index_type == phi::DataType::INT32) {
159+
GPUIndexElementwisePutKernel<T, int>(ctx,
160+
x,
161+
out_grad,
162+
index,
163+
input_dims,
164+
input_strides,
165+
index_dims,
166+
index_strides,
167+
x_grad);
168+
} else if (index_type == phi::DataType::INT64) {
169+
GPUIndexElementwisePutKernel<T, int64_t>(ctx,
170+
x,
171+
out_grad,
172+
index,
173+
input_dims,
174+
input_strides,
175+
index_dims,
176+
index_strides,
177+
x_grad);
178+
}
179+
}
180+
181+
} // namespace phi
182+
// Register the GPU backward kernel for index_elementwise_get over all
// layouts and the dtypes listed below (matching the forward op's coverage).
PD_REGISTER_KERNEL(index_elementwise_get_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::IndexElementwiseGetGradKernel,
                   bool,
                   float,
                   double,
                   int,
                   int8_t,
                   int64_t,
                   int16_t,
                   uint8_t,
                   phi::dtype::float16,
                   phi::dtype::bfloat16,
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once
16+
17+
#include "paddle/phi/core/dense_tensor.h"
18+
#include "paddle/phi/core/tensor_array.h"
19+
20+
namespace phi {
21+
22+
// Computes x_grad for index_elementwise_get by zero-filling x_grad and
// scattering out_grad back through the index tensors.  The dims/strides
// vectors carry the pre-computed layouts of the input and index tensors.
template <typename T, typename Context>
void IndexElementwiseGetGradKernel(const Context& ctx,
                                   const DenseTensor& x,
                                   const std::vector<const DenseTensor*>& index,
                                   const DenseTensor& out_grad,
                                   const std::vector<int64_t>& input_dims,
                                   const std::vector<int64_t>& input_strides,
                                   const std::vector<int64_t>& index_dims,
                                   const std::vector<int64_t>& index_strides,
                                   DenseTensor* x_grad);
32+
33+
} // namespace phi

paddle/phi/ops/yaml/backward.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1672,6 +1672,18 @@
16721672
inplace : (out_grad -> x_grad)
16731673
backward : index_add_double_grad
16741674

1675+
- backward_op : index_elementwise_get_grad
1676+
forward : index_elementwise_get (Tensor x, Tensor[] index, int64_t[] input_dims, int64_t[] input_strides, int64_t[] index_dims, int64_t[] index_stride) -> Tensor(out)
1677+
args : (Tensor x, Tensor[] index, Tensor out_grad, int64_t[] input_dims, int64_t[] input_strides, int64_t[] index_dims, int64_t[] index_stride)
1678+
output : Tensor(x_grad)
1679+
infer_meta :
1680+
func : IndexElementwiseGetGradInferMeta
1681+
kernel :
1682+
func : index_elementwise_get_grad
1683+
data_type : out_grad
1684+
data_transform :
1685+
skip_transform : index
1686+
16751687
- backward_op : index_put_double_grad
16761688
forward : index_put_grad (Tensor x, Tensor[] indices, Tensor value, Tensor grad_out, bool accumulate=false) -> Tensor(grad_x), Tensor(grad_value)
16771689
args : (Tensor x, Tensor[] indices, Tensor value, Tensor grad_x_grad, Tensor grad_value_grad, bool accumulate=false)

paddle/phi/ops/yaml/ops.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2761,9 +2761,7 @@
27612761
kernel :
27622762
func : index_elementwise_get
27632763
data_type : x
2764-
# backward : index_elementwise_grad
2765-
# interfaces : paddle::dialect::InferSymbolicShapeInterface
2766-
traits : paddle::dialect::ForwardOnlyTrait
2764+
backward : index_elementwise_get_grad
27672765

27682766
- op : index_put
27692767
args : (Tensor x, Tensor[] indices, Tensor value, bool accumulate=false)

0 commit comments

Comments
 (0)