
Commit 4b03294

[PHI] implement index_elementwise op (#72942)
* [PHI] implement index_elementwise op
* update ComputeStrides
* update
* Add stride mechanism and unit tests
* add index_elementwise_get_grad kernel
* fix AdvancedIndex bug
1 parent 5172626 commit 4b03294

14 files changed (+1022 / -191 lines)

paddle/fluid/pybind/slice_utils.h

Lines changed: 195 additions & 145 deletions
@@ -39,6 +39,159 @@ namespace py = pybind11;
 
 namespace paddle {
 namespace pybind {
+static inline common::DDim infer_size_symdimvector(common::DDim a,
+                                                   common::DDim b) {
+  // Use ptrdiff_t to ensure signed comparison.
+  auto dimsA = a.size();
+  auto dimsB = b.size();
+  auto ndim = dimsA > dimsB ? dimsA : dimsB;
+  common::DDim expandedSizes = common::make_ddim(std::vector<int64_t>(ndim, 0));
+
+  for (int64_t i = ndim - 1; i >= 0; --i) {
+    int64_t offset = ndim - 1 - i;
+    int64_t dimA = dimsA - 1 - offset;
+    int64_t dimB = dimsB - 1 - offset;
+    auto sizeA = (dimA >= 0) ? a[dimA] : 1;
+    auto sizeB = (dimB >= 0) ? b[dimB] : 1;
+
+    PADDLE_ENFORCE_EQ(
+        sizeA == sizeB || sizeA == 1 || sizeB == 1,
+        true,
+        common::errors::Fatal("The size of tensor a (",
+                              sizeA,
+                              ") must match the size of tensor b (",
+                              sizeB,
+                              ") at non-singleton dimension ",
+                              i));
+
+    // 1s map to the other size (even 0).
+    expandedSizes[i] = sizeA == 1 ? sizeB : sizeA;
+  }
+
+  return expandedSizes;
+}
+
+static inline std::vector<paddle::Tensor> expand_outplace(
+    std::vector<paddle::Tensor> to_expand) {
+  // expands a list of Tensors; ignores undefined (null) tensors
+  bool first = true;
+  common::DDim sizes;
+  for (size_t i = 0; i < to_expand.size(); i++) {
+    if (!to_expand[i].initialized()) {
+      continue;
+    } else if (first) {
+      sizes = to_expand[i].dims();
+      first = false;
+    } else {
+      sizes = infer_size_symdimvector(sizes, to_expand[i].dims());
+    }
+  }
+
+  std::vector<paddle::Tensor> result(to_expand.size());
+  for (size_t i = 0; i < to_expand.size(); i++) {
+    if (!to_expand[i].initialized()) {
+      continue;
+    } else if (to_expand[i].dims() == sizes) {
+      result[i] = to_expand[i];
+    } else {
+      result[i] =
+          expand_ad_func(to_expand[i], common::vectorize<int64_t>(sizes));
+    }
+  }
+  return result;
+}
+
+struct AdvancedIndex {
+  AdvancedIndex(paddle::Tensor src,
+                std::vector<paddle::Tensor> indices,
+                bool bool_case);
+
+  paddle::Tensor src;
+  std::vector<paddle::Tensor> indices;
+  std::vector<int64_t> indexed_sizes;
+  std::vector<int64_t> indexed_strides;
+  std::vector<int64_t> src_sizes;
+  std::vector<int64_t> src_strides;
+  int64_t dims_before;
+  int64_t dims_after;
+  bool bool_case;
+};
+
+inline static void restride_src(std::vector<int64_t>* shape,
+                                std::vector<int64_t>* strides,
+                                int64_t dims_before,
+                                int64_t dims_indexed,
+                                std::vector<int64_t> replacement_shape) {
+  int64_t end = dims_before + dims_indexed;
+  shape->erase(shape->begin() + dims_before, shape->begin() + end);
+  strides->erase(strides->begin() + dims_before, strides->begin() + end);
+  shape->insert(shape->begin() + dims_before,
+                replacement_shape.begin(),
+                replacement_shape.end());
+  strides->insert(strides->begin() + dims_before, replacement_shape.size(), 0);
+}
+
+// move to cuda kernel
+inline static paddle::Tensor reshape_indexer(paddle::Tensor* index,
+                                             int64_t dims_before,
+                                             int64_t dims_after) {
+  auto orig_shape = common::vectorize<int64_t>(index->dims());
+  auto shape = std::vector<int64_t>{};
+  shape.insert(shape.end(), dims_before, 1);
+  shape.insert(shape.end(), orig_shape.begin(), orig_shape.end());
+  shape.insert(shape.end(), dims_after, 1);
+  *index = reshape_ad_func(*index, shape);
+  return *index;
+}
+
+inline AdvancedIndex::AdvancedIndex(paddle::Tensor src,
+                                    std::vector<paddle::Tensor> indices_list,
+                                    bool bool_case = false) {
+  uint32_t element_size_bytes = phi::SizeOf(src.dtype());
+  int64_t dims_before = 0, dims_after = 0, dims_indexed = 0;
+  std::vector<int64_t> shape_vec = common::vectorize<int64_t>(src.dims());
+  std::vector<int64_t> stride_vec = common::vectorize<int64_t>(src.strides());
+  std::vector<int64_t> replacement_shape;
+  std::vector<int64_t> idx_shape_vec = {};
+  std::vector<int64_t> idx_stride_vec = {};
+
+  for (size_t dim = 0; dim < indices_list.size(); dim++) {
+    if (!indices_list[dim].defined() || indices_list[dim].dims().size() == 0) {
+      if (dims_indexed == 0) {
+        dims_before++;
+      } else {
+        dims_after++;
+      }
+    } else {
+      dims_indexed++;
+      replacement_shape = common::vectorize<int64_t>(indices_list[dim].dims());
+      if (bool_case && !replacement_shape.empty() &&
+          replacement_shape.back() == 1) {
+        replacement_shape.pop_back();
+      }
+
+      idx_shape_vec.push_back(shape_vec[dim]);
+      idx_stride_vec.push_back(stride_vec[dim] * element_size_bytes);
+    }
+  }
+
+  this->dims_before = dims_before;
+  this->dims_after = dims_after;
+  restride_src(
+      &shape_vec, &stride_vec, dims_before, dims_indexed, replacement_shape);
+  this->src_sizes = shape_vec;
+  this->src_strides = stride_vec;
+
+  this->indexed_sizes = idx_shape_vec;
+  this->indexed_strides = idx_stride_vec;
+
+  // use dims_before and dims_after / move to cuda kernel
+  for (auto& index : indices_list) {
+    if (index.defined() && index.dims().size() > 0) {
+      this->indices.push_back(reshape_indexer(&index, dims_before, dims_after));
+    }
+  }
+}
 
 template <typename T>
 inline T GetDenseTensorValue(const phi::DenseTensor* x) {
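
The two shape rules in the hunk above can be hard to read straight out of the diff, so here is a minimal standalone sketch of them (not part of the commit): it re-implements the right-aligned broadcast rule of infer_size_symdimvector and the dimension-replacement step of restride_src over plain std::vector<int64_t>. The names broadcast_shape and restride, and the concrete shapes in main, are illustrative assumptions, not Paddle APIs.

// Standalone sketch of the broadcast and restride rules used by AdvancedIndex.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Right-aligned broadcasting: each pair of trailing dims must match or one of
// them must be 1; a 1 maps to the other size.
std::vector<int64_t> broadcast_shape(const std::vector<int64_t>& a,
                                     const std::vector<int64_t>& b) {
  const int64_t ndim = std::max(a.size(), b.size());
  std::vector<int64_t> out(ndim, 0);
  for (int64_t i = ndim - 1; i >= 0; --i) {
    int64_t offset = ndim - 1 - i;
    int64_t dimA = static_cast<int64_t>(a.size()) - 1 - offset;
    int64_t dimB = static_cast<int64_t>(b.size()) - 1 - offset;
    int64_t sizeA = (dimA >= 0) ? a[dimA] : 1;
    int64_t sizeB = (dimB >= 0) ? b[dimB] : 1;
    assert(sizeA == sizeB || sizeA == 1 || sizeB == 1);
    out[i] = (sizeA == 1) ? sizeB : sizeA;
  }
  return out;
}

// Replace the indexed dims [dims_before, dims_before + dims_indexed) of the
// source shape with the (broadcast) index shape, and give those new dims
// stride 0 so iterating over them does not advance the source pointer.
void restride(std::vector<int64_t>* shape,
              std::vector<int64_t>* strides,
              int64_t dims_before,
              int64_t dims_indexed,
              const std::vector<int64_t>& replacement_shape) {
  int64_t end = dims_before + dims_indexed;
  shape->erase(shape->begin() + dims_before, shape->begin() + end);
  strides->erase(strides->begin() + dims_before, strides->begin() + end);
  shape->insert(shape->begin() + dims_before,
                replacement_shape.begin(), replacement_shape.end());
  strides->insert(strides->begin() + dims_before, replacement_shape.size(), 0);
}

int main() {
  // Broadcasting: [3, 1] and [4] -> [3, 4].
  auto s = broadcast_shape({3, 1}, {4});
  std::cout << s[0] << " " << s[1] << "\n";  // prints: 3 4

  // Restriding a contiguous [2, 3, 4] source whose middle dim is indexed by a
  // [5]-shaped index: shape becomes [2, 5, 4], the new dim gets stride 0.
  std::vector<int64_t> shape = {2, 3, 4};
  std::vector<int64_t> strides = {12, 4, 1};
  restride(&shape, &strides, /*dims_before=*/1, /*dims_indexed=*/1, {5});
  for (auto v : shape) std::cout << v << " ";    // prints: 2 5 4
  std::cout << "\n";
  for (auto v : strides) std::cout << v << " ";  // prints: 12 0 1
  std::cout << "\n";
  return 0;
}

The stride-0 replacement dims mean that walking the indexed output dims never moves the source pointer on its own; presumably the index_elementwise kernels supply the source offset from the index tensors via indexed_strides instead.
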
@@ -493,6 +646,35 @@ static paddle::Tensor dealWithAdvancedIndex(
   return transed_tensor;
 }
 
+static std::vector<paddle::Tensor> PrepareIndices(
+    const paddle::Tensor& tensor,
+    const paddle::Tensor& bool_2_idx,
+    const paddle::Tensor& bool_index) {
+  std::vector<paddle::Tensor> indices;
+  for (int j = 0; j < bool_2_idx.shape()[1]; ++j) {
+    paddle::Tensor sliced_tensor =
+        slice_ad_func(bool_2_idx, {1}, {j}, {j + 1}, {1}, {});
+
+    // Calculate the required dimensionality
+    int64_t original_ndim =
+        tensor.shape().size() - bool_index.shape().size() + 1;
+    int64_t sliced_ndim = sliced_tensor.shape().size();
+    int64_t num_ones_to_add = original_ndim - sliced_ndim;
+
+    // Reshape the tensor by adding 1s if needed
+    if (num_ones_to_add > 0) {
+      std::vector<int64_t> new_shape = sliced_tensor.shape();
+      for (int64_t k = 0; k < num_ones_to_add; ++k) {
+        new_shape.push_back(1);
+      }
+      sliced_tensor = reshape_ad_func(sliced_tensor, new_shape);
+    }
+
+    indices.emplace_back(sliced_tensor);
+  }
+  return indices;
+}
+
 static paddle::Tensor getValueForBoolTensor(const paddle::Tensor& tensor,
                                             const paddle::Tensor& bool_index) {
   PADDLE_ENFORCE(bool_index.shape().size() <= tensor.shape().size(),
@@ -532,7 +714,20 @@ static paddle::Tensor getValueForBoolTensor(const paddle::Tensor& tensor,
   }
 
   auto bool_2_idx = nonzero_ad_func(bool_index);
+#ifdef PADDLE_WITH_CUDA
+  auto indices = PrepareIndices(tensor, bool_2_idx, bool_index);
+  AdvancedIndex ad = AdvancedIndex(tensor, indices, true);
+
+  return index_elementwise_get_ad_func(tensor,
+                                       ad.indices,
+                                       ad.src_sizes,
+                                       ad.src_strides,
+                                       ad.indexed_sizes,
+                                       ad.indexed_strides);
+#else
+
   return gather_nd_ad_func(tensor, bool_2_idx);
+#endif
 }
 
 static void ParseBoolAndBroadcastIndices(
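
For the CUDA path above, a brief standalone sketch (again illustrative, not the commit's code) of the shape bookkeeping in PrepareIndices: nonzero_ad_func turns a rank-R boolean mask over the leading dims of a rank-N tensor into a [K, R] coordinate matrix, and each of its R columns (which the slice appears to keep as shape [K, 1]) is padded with N - R trailing ones so it can broadcast against the tensor's unmasked dims. The helper name column_index_shape and the concrete sizes are assumptions for illustration.

// Standalone sketch of the per-column index shape produced by PrepareIndices.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> column_index_shape(int64_t num_true,      // K
                                        int64_t tensor_ndim,   // N
                                        int64_t mask_ndim) {   // R
  // slice_ad_func(bool_2_idx, {1}, {j}, {j+1}, {1}, {}) keeps shape [K, 1].
  std::vector<int64_t> shape = {num_true, 1};
  int64_t target_ndim = tensor_ndim - mask_ndim + 1;
  while (static_cast<int64_t>(shape.size()) < target_ndim) {
    shape.push_back(1);  // pad trailing 1s, as PrepareIndices does
  }
  return shape;
}

int main() {
  // Rank-4 tensor indexed by a rank-2 mask over its first two dims, with
  // K = 6 true elements: each coordinate column is reshaped to [6, 1, 1].
  for (auto v : column_index_shape(/*num_true=*/6, /*tensor_ndim=*/4,
                                   /*mask_ndim=*/2)) {
    std::cout << v << " ";
  }
  std::cout << "\n";
  return 0;
}

The deletion hunk that follows removes the previous definitions of infer_size_symdimvector, expand_outplace, AdvancedIndex, restride_src, and reshape_indexer from further down in the file; the commit moves them above getValueForBoolTensor, adding the bool_case handling, so the new CUDA path can use them.
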
@@ -672,150 +867,5 @@ static paddle::Tensor dealWithValues(const paddle::Tensor& tensor,
   return value_tensor;
 }
 
-static inline common::DDim infer_size_symdimvector(common::DDim a,
-                                                   common::DDim b) {
-  // Use ptrdiff_t to ensure signed comparison.
-  auto dimsA = a.size();
-  auto dimsB = b.size();
-  auto ndim = dimsA > dimsB ? dimsA : dimsB;
-  common::DDim expandedSizes = common::make_ddim(std::vector<int64_t>(ndim, 0));
-
-  for (int64_t i = ndim - 1; i >= 0; --i) {
-    int64_t offset = ndim - 1 - i;
-    int64_t dimA = dimsA - 1 - offset;
-    int64_t dimB = dimsB - 1 - offset;
-    auto sizeA = (dimA >= 0) ? a[dimA] : 1;
-    auto sizeB = (dimB >= 0) ? b[dimB] : 1;
-
-    PADDLE_ENFORCE_EQ(
-        sizeA == sizeB || sizeA == 1 || sizeB == 1,
-        true,
-        common::errors::Fatal("The size of tensor a (",
-                              sizeA,
-                              ") must match the size of tensor b (",
-                              sizeB,
-                              ") at non-singleton dimension ",
-                              i));
-
-    // 1s map to the other size (even 0).
-    expandedSizes[i] = sizeA == 1 ? sizeB : sizeA;
-  }
-
-  return expandedSizes;
-}
-
-static inline std::vector<paddle::Tensor> expand_outplace(
-    std::vector<paddle::Tensor> to_expand) {
-  // expands a list of Tensors; ignores undefined (null) tensors
-  bool first = true;
-  common::DDim sizes;
-  for (size_t i = 0; i < to_expand.size(); i++) {
-    if (!to_expand[i].initialized()) {
-      continue;
-    } else if (first) {
-      sizes = to_expand[i].dims();
-      first = false;
-    } else {
-      sizes = infer_size_symdimvector(sizes, to_expand[i].dims());
-    }
-  }
-
-  std::vector<paddle::Tensor> result(to_expand.size());
-  for (size_t i = 0; i < to_expand.size(); i++) {
-    if (!to_expand[i].initialized()) {
-      continue;
-    } else if (to_expand[i].dims() == sizes) {
-      result[i] = to_expand[i];
-    } else {
-      result[i] =
-          expand_ad_func(to_expand[i], common::vectorize<int64_t>(sizes));
-    }
-  }
-  return result;
-}
-
-struct AdvancedIndex {
-  AdvancedIndex(paddle::Tensor src, std::vector<paddle::Tensor> indices);
-
-  paddle::Tensor src;
-  std::vector<paddle::Tensor> indices;
-  std::vector<int64_t> indexed_sizes;
-  std::vector<int64_t> indexed_strides;
-  std::vector<int64_t> src_sizes;
-  std::vector<int64_t> src_strides;
-  int64_t dims_before;
-  int64_t dims_after;
-};
-
-inline static void restride_src(std::vector<int64_t>* shape,
-                                std::vector<int64_t>* strides,
-                                int64_t dims_before,
-                                int64_t dims_indexed,
-                                std::vector<int64_t> replacement_shape) {
-  int64_t end = dims_before + dims_indexed;
-  shape->erase(shape->begin() + dims_before, shape->begin() + end);
-  strides->erase(strides->begin() + dims_before, strides->begin() + end);
-  shape->insert(shape->begin() + dims_before,
-                replacement_shape.begin(),
-                replacement_shape.end());
-  strides->insert(strides->begin() + dims_before, replacement_shape.size(), 0);
-}
-
-// move to cuda kernel
-inline static paddle::Tensor reshape_indexer(paddle::Tensor* index,
-                                             int64_t dims_before,
-                                             int64_t dims_after) {
-  auto orig_shape = common::vectorize<int64_t>(index->dims());
-  auto shape = std::vector<int64_t>{};
-  shape.insert(shape.end(), dims_before, 1);
-  shape.insert(shape.end(), orig_shape.begin(), orig_shape.end());
-  shape.insert(shape.end(), dims_after, 1);
-  *index = reshape_ad_func(*index, shape);
-  return *index;
-}
-
-inline AdvancedIndex::AdvancedIndex(paddle::Tensor src,
-                                    std::vector<paddle::Tensor> indices_list) {
-  uint32_t element_size_bytes = phi::SizeOf(src.dtype());
-  int64_t dims_before = 0, dims_after = 0, dims_indexed = 0;
-  std::vector<int64_t> shape_vec = common::vectorize<int64_t>(src.dims());
-  std::vector<int64_t> stride_vec = common::vectorize<int64_t>(src.strides());
-  std::vector<int64_t> replacement_shape;
-  std::vector<int64_t> idx_shape_vec = {};
-  std::vector<int64_t> idx_stride_vec = {};
-
-  for (size_t dim = 0; dim < indices_list.size(); dim++) {
-    if (!indices_list[dim].defined() || indices_list[dim].dims().size() == 0) {
-      if (dims_indexed == 0) {
-        dims_before++;
-      } else {
-        dims_after++;
-      }
-    } else {
-      dims_indexed++;
-      replacement_shape = common::vectorize<int64_t>(indices_list[dim].dims());
-      idx_shape_vec.push_back(shape_vec[dim]);
-      idx_stride_vec.push_back(stride_vec[dim] * element_size_bytes);
-    }
-  }
-
-  this->dims_before = dims_before;
-  this->dims_after = dims_after;
-  restride_src(
-      &shape_vec, &stride_vec, dims_before, dims_indexed, replacement_shape);
-  this->src_sizes = shape_vec;
-  this->src_strides = stride_vec;
-
-  this->indexed_sizes = idx_shape_vec;
-  this->indexed_strides = idx_stride_vec;
-
-  // use dims_before and dims_after / move to cuda kernel
-  for (auto& index : indices_list) {
-    if (index.defined() && index.dims().size() > 0) {
-      this->indices.push_back(reshape_indexer(&index, dims_before, dims_after));
-    }
-  }
-}
-
 } // namespace pybind
 } // namespace paddle

paddle/phi/infermeta/backward.cc

Lines changed: 14 additions & 0 deletions
@@ -2091,4 +2091,18 @@ void FusedRMSNormGradInferMeta(const MetaTensor& x,
   scale_grad->set_dims(scale.dims());
   scale_grad->set_dtype(scale.dtype());
 }
+
+void IndexElementwiseGetGradInferMeta(
+    const MetaTensor& x,
+    const std::vector<const MetaTensor*>& index,
+    const MetaTensor& out_grad,
+    const std::vector<int64_t>& input_dims,
+    const std::vector<int64_t>& input_strides,
+    const std::vector<int64_t>& index_dims,
+    const std::vector<int64_t>& index_strides,
+    MetaTensor* x_grad) {
+  if (x_grad) {
+    x_grad->share_meta(x);
+  }
+}
 }  // namespace phi
