// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/index_elementwise_get_grad_kernel.h"

#include <array>
#include <limits>
#include <vector>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/index_elementwise.cu.h"
#include "paddle/phi/kernels/funcs/stride_utils.h"

namespace phi {

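// Scatters elements of `value` into `output` at the positions selected by the
// `index` tensors, using the precomputed input/index dims and strides. Used as
// the backward of index_elementwise_get: each gradient element is written back
// (plain store, no accumulation) to the location it was gathered from.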
template <typename T, typename IndexT = int>
void GPUIndexElementwisePutKernel(const phi::GPUContext& ctx,
                                  const DenseTensor& input,
                                  const DenseTensor& value,
                                  const std::vector<const DenseTensor*>& index,
                                  const std::vector<int64_t>& input_dims,
                                  const std::vector<int64_t>& input_strides,
                                  const std::vector<int64_t>& index_dims,
                                  const std::vector<int64_t>& index_strides,
                                  DenseTensor* output) {
  int64_t numel = 0;

  auto num_indices = index_dims.size();

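  // Stage the size and stride of each indexed dimension in fixed-capacity
  // arrays (up to 25 dims) so the device lambda below can capture them by
  // value.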
  auto sizes = std::array<int64_t, 25>{};
  auto strides = std::array<int64_t, 25>{};
  for (unsigned i = 0; i < num_indices; i++) {
    sizes[i] = index_dims[i];
    strides[i] = index_strides[i];
  }
  auto index_ptrs = funcs::GetIndexDataPtrs<IndexT>(index);

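  // Collapse the output, value, and index layouts into one common iteration
  // shape. IndexPutStride fills desired_shape and numel, plus three stride
  // arrays over that shape, which this kernel applies to the output ([0]),
  // the value ([1]), and every index tensor ([2]).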
  std::array<int64_t*, 3> strides_array;
  std::vector<int64_t> desired_shape;

  funcs::IndexPutStride<3>(input_dims,
                           input_strides,
                           phi::SizeOf(input.dtype()),
                           std::vector<int64_t>(),
                           std::vector<int64_t>(),
                           phi::SizeOf(value.dtype()),
                           common::vectorize<int64_t>(index[0]->dims()),
                           common::vectorize<int64_t>(index[0]->strides()),
                           phi::SizeOf(index[0]->dtype()),
                           &desired_shape,
                           &strides_array,
                           &numel);

  const int64_t* template_stride = strides_array[2];
  PADDLE_ENFORCE_NOT_NULL(
      template_stride,
      common::errors::PreconditionNotMet(
          "strides_array[2] should not be nullptr in "
          "GPUIndexElementwisePutKernel."));
  size_t stride_size = desired_shape.size();
  std::vector<std::vector<int64_t>> strides_vector;
  strides_vector.reserve(num_indices + 2);

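  // strides_vector holds one stride set per operand: entry 0 for the output,
  // entry 1 for the value (zero-filled when absent), followed by one copy of
  // the index stride pattern for each index tensor.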
  for (size_t i = 0; i < 2; ++i) {
    if (i < strides_array.size() && strides_array[i] != nullptr) {
      strides_vector.emplace_back(strides_array[i],
                                  strides_array[i] + stride_size);
    } else {
      strides_vector.emplace_back(stride_size, 0);
    }
  }

  std::vector<int64_t> template_vec(template_stride,
                                    template_stride + stride_size);
  for (size_t i = 0; i < num_indices; ++i) {
    strides_vector.push_back(template_vec);
  }

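  // offset_calc maps a flat element id to byte offsets into the output
  // (offsets[0]), the value (offsets[1]), and the index tensors (offsets[2]).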
  auto offset_calc = funcs::make_offset_calculator<3>(
      desired_shape.size(), desired_shape.data(), strides_vector);

  const int64_t N = numel;
  PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
                 "The element count must be non-negative and fit in int32 "
                 "for the index_elementwise kernel launch.");

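  // Launch configuration: nt threads per block, each thread handling vt
  // elements, so the grid covers N elements in ceil(N / (nt * vt)) blocks.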
  constexpr int nt = 128;
  constexpr int vt = 4;
  const dim3 block(nt);
  const dim3 grid((N + block.x * vt - 1) / (block.x * vt));
  auto stream = ctx.stream();

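  // `dtype` is an opaque sizeof(T)-byte type used to copy elements as raw
  // bytes; a pure scatter needs no arithmetic on T, so this also covers
  // complex and low-precision types.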
  using dtype = funcs::OpaqueType<sizeof(T)>;

  const char* in_ptr = reinterpret_cast<const char*>(value.data<T>());
  char* out_ptr = reinterpret_cast<char*>(output->data<T>());

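  // For every element: resolve the base byte offsets of the destination,
  // source, and index entries, then fold the bounds-checked, wrap-around
  // index values into a final destination offset and store the value there.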
  funcs::index_elementwise_kernel<nt, vt>
      <<<grid, block, 0, stream>>>(N, [=] __device__(int idx) {
        const auto offsets = offset_calc.get(idx);
        char* const out_data = out_ptr + offsets[0];
        const char* const in_data = in_ptr + offsets[1];

        int64_t offset = 0;
#pragma unroll
        for (size_t i = 0; i < num_indices; i++) {
          // Read the index with its actual element type before widening it,
          // so int32 index tensors are interpreted correctly.
          int64_t index_value = static_cast<int64_t>(
              *reinterpret_cast<const IndexT*>(index_ptrs[i] + offsets[2]));
          PADDLE_ENFORCE(-sizes[i] <= index_value && index_value < sizes[i],
                         "index out of bounds");
          if (index_value < 0) {
            index_value += sizes[i];
          }
          offset += index_value * strides[i];
        }
        *reinterpret_cast<dtype*>(out_data + offset) =
            *reinterpret_cast<const dtype*>(in_data);
      });
}

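// Backward of index_elementwise_get: x_grad is zero-initialized and the
// incoming out_grad is scattered back to the gathered positions, dispatching
// on the index dtype (int32 or int64).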
template <typename T, typename Context>
void IndexElementwiseGetGradKernel(const Context& ctx,
                                   const DenseTensor& x,
                                   const std::vector<const DenseTensor*>& index,
                                   const DenseTensor& out_grad,
                                   const std::vector<int64_t>& input_dims,
                                   const std::vector<int64_t>& input_strides,
                                   const std::vector<int64_t>& index_dims,
                                   const std::vector<int64_t>& index_strides,
                                   DenseTensor* x_grad) {
  ctx.template Alloc<T>(x_grad);
  auto dxt = phi::EigenVector<T>::Flatten(*x_grad);
  auto& place = *ctx.eigen_device();
  dxt.device(place) = dxt.constant(static_cast<T>(0));
  if (out_grad.numel() == 0) return;

  const auto& index_type = index[0]->dtype();
  PADDLE_ENFORCE_EQ(
      index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64,
      true,
      common::errors::InvalidArgument(
          "The index tensor holds the wrong type: it holds [%s], but only "
          "[%s] or [%s] is supported.",
          index_type,
          phi::DataType::INT32,
          phi::DataType::INT64));

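  // Route to the put kernel instantiated for the matching index element type.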
  if (index_type == phi::DataType::INT32) {
    GPUIndexElementwisePutKernel<T, int>(ctx,
                                         x,
                                         out_grad,
                                         index,
                                         input_dims,
                                         input_strides,
                                         index_dims,
                                         index_strides,
                                         x_grad);
  } else if (index_type == phi::DataType::INT64) {
    GPUIndexElementwisePutKernel<T, int64_t>(ctx,
                                             x,
                                             out_grad,
                                             index,
                                             input_dims,
                                             input_strides,
                                             index_dims,
                                             index_strides,
                                             x_grad);
  }
}

}  // namespace phi

PD_REGISTER_KERNEL(index_elementwise_get_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::IndexElementwiseGetGradKernel,
                   bool,
                   float,
                   double,
                   int,
                   int8_t,
                   int64_t,
                   int16_t,
                   uint8_t,
                   phi::dtype::float16,
                   phi::dtype::bfloat16,
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}