
Commit 108db2c

Add stride mechanism and unit tests
1 parent e12543a commit 108db2c

10 files changed: +1046 −226 lines changed


paddle/fluid/pybind/slice_utils.h

Lines changed: 181 additions & 22 deletions
@@ -39,6 +39,152 @@ namespace py = pybind11;
 
 namespace paddle {
 namespace pybind {
+
+static inline common::DDim infer_size_symdimvector(common::DDim a,
+                                                   common::DDim b) {
+  // Use ptrdiff_t to ensure signed comparison.
+  auto dimsA = a.size();
+  auto dimsB = b.size();
+  auto ndim = dimsA > dimsB ? dimsA : dimsB;
+  common::DDim expandedSizes = common::make_ddim(std::vector<int64_t>(ndim, 0));
+
+  for (int64_t i = ndim - 1; i >= 0; --i) {
+    int64_t offset = ndim - 1 - i;
+    int64_t dimA = dimsA - 1 - offset;
+    int64_t dimB = dimsB - 1 - offset;
+    auto sizeA = (dimA >= 0) ? a[dimA] : 1;
+    auto sizeB = (dimB >= 0) ? b[dimB] : 1;
+
+    PADDLE_ENFORCE(sizeA == sizeB || sizeA == 1 || sizeB == 1,
+                   common::errors::Fatal("The size of tensor a (",
+                                         sizeA,
+                                         ") must match the size of tensor b (",
+                                         sizeB,
+                                         ") at non-singleton dimension ",
+                                         i));
+
+    // 1s map to the other size (even 0).
+    expandedSizes[i] = sizeA == 1 ? sizeB : sizeA;
+  }
+
+  return expandedSizes;
+}
+
+static inline std::vector<paddle::Tensor> expand_outplace(
+    std::vector<paddle::Tensor> to_expand) {
+  // expands a list of Tensors; ignores undefined (null) tensors
+  bool first = true;
+  common::DDim sizes;
+  for (size_t i = 0; i < to_expand.size(); i++) {
+    if (!to_expand[i].initialized()) {
+      continue;
+    } else if (first) {
+      sizes = to_expand[i].dims();
+      first = false;
+    } else {
+      sizes = infer_size_symdimvector(sizes, to_expand[i].dims());
+    }
+  }
+
+  std::vector<paddle::Tensor> result(to_expand.size());
+  for (size_t i = 0; i < to_expand.size(); i++) {
+    if (!to_expand[i].initialized()) {
+      continue;
+    } else if (to_expand[i].dims() == sizes) {
+      result[i] = to_expand[i];
+    } else {
+      result[i] =
+          expand_ad_func(to_expand[i], common::vectorize<int64_t>(sizes));
+    }
+  }
+  return result;
+}
+
+struct AdvancedIndex {
+  AdvancedIndex(paddle::Tensor src, std::vector<paddle::Tensor> indices);
+
+  paddle::Tensor src;
+  std::vector<paddle::Tensor> indices;
+  std::vector<int64_t> indexed_sizes;
+  std::vector<int64_t> indexed_strides;
+  std::vector<int64_t> src_sizes;
+  std::vector<int64_t> src_strides;
+  int64_t dims_before;
+  int64_t dims_after;
+};
+
+inline static void restride_src(std::vector<int64_t>* shape,
+                                std::vector<int64_t>* strides,
+                                int64_t dims_before,
+                                int64_t dims_indexed,
+                                std::vector<int64_t> replacement_shape) {
+  int64_t end = dims_before + dims_indexed;
+  shape->erase(shape->begin() + dims_before, shape->begin() + end);
+  strides->erase(strides->begin() + dims_before, strides->begin() + end);
+  shape->insert(shape->begin() + dims_before,
+                replacement_shape.begin(),
+                replacement_shape.end());
+  strides->insert(strides->begin() + dims_before, replacement_shape.size(), 0);
+}
+
+// move to cuda kernel
+inline static paddle::Tensor reshape_indexer(paddle::Tensor* index,
+                                             int64_t dims_before,
+                                             int64_t dims_after) {
+  auto orig_shape = common::vectorize<int64_t>(index->dims());
+  auto shape = std::vector<int64_t>{};
+  shape.insert(shape.end(), dims_before, 1);
+  shape.insert(shape.end(), orig_shape.begin(), orig_shape.end());
+  shape.insert(shape.end(), dims_after, 1);
+  *index = reshape_ad_func(*index, shape);
+  return *index;
+}
+
+inline AdvancedIndex::AdvancedIndex(paddle::Tensor src,
+                                    std::vector<paddle::Tensor> indices_list) {
+  uint32_t element_size_bytes = phi::SizeOf(src.dtype());
+  int64_t dims_before = 0, dims_after = 0, dims_indexed = 0;
+  std::vector<int64_t> shape_vec = common::vectorize<int64_t>(src.dims());
+  std::vector<int64_t> stride_vec = common::vectorize<int64_t>(src.strides());
+  std::vector<int64_t> replacement_shape;
+  std::vector<int64_t> idx_shape_vec = {};
+  std::vector<int64_t> idx_stride_vec = {};
+
+  for (size_t dim = 0; dim < indices_list.size(); dim++) {
+    if (!indices_list[dim].defined() || indices_list[dim].dims().size() == 0) {
+      if (dims_indexed == 0) {
+        dims_before++;
+      } else {
+        dims_after++;
+      }
+    } else {
+      dims_indexed++;
+      replacement_shape = common::vectorize<int64_t>(indices_list[dim].dims());
+      if (!replacement_shape.empty() && replacement_shape.back() == 1) {
+        replacement_shape.pop_back();
+      }
+
+      idx_shape_vec.push_back(shape_vec[dim]);
+      idx_stride_vec.push_back(stride_vec[dim] * element_size_bytes);
+    }
+  }
+
+  this->dims_before = dims_before;
+  this->dims_after = dims_after;
+  restride_src(
+      &shape_vec, &stride_vec, dims_before, dims_indexed, replacement_shape);
+  this->src_sizes = shape_vec;
+  this->src_strides = stride_vec;
+
+  this->indexed_sizes = idx_shape_vec;
+  this->indexed_strides = idx_stride_vec;
+
+  // use dims_before and dims_after / move to cuda kernel
+  for (auto& index : indices_list) {
+    if (index.defined() && index.dims().size() > 0) {
+      this->indices.push_back(reshape_indexer(&index, dims_before, dims_after));
+    }
+  }
+}
 
 template <typename T>
 inline T GetDenseTensorValue(const phi::DenseTensor* x) {
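
Note: infer_size_symdimvector applies NumPy-style broadcasting to two shapes: shapes are right-aligned and a size of 1 stretches to the other operand's size. A minimal standalone sketch of the same rule on plain std::vector<int64_t> (illustrative only, not code from this commit):

// Illustrative re-implementation of the broadcast rule used above.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> InferBroadcastShape(const std::vector<int64_t>& a,
                                         const std::vector<int64_t>& b) {
  const size_t ndim = std::max(a.size(), b.size());
  std::vector<int64_t> out(ndim, 0);
  for (size_t offset = 0; offset < ndim; ++offset) {
    // Walk both shapes from the trailing dimension; missing dims count as 1.
    const int64_t sa = offset < a.size() ? a[a.size() - 1 - offset] : 1;
    const int64_t sb = offset < b.size() ? b[b.size() - 1 - offset] : 1;
    assert(sa == sb || sa == 1 || sb == 1);  // same check as PADDLE_ENFORCE
    out[ndim - 1 - offset] = (sa == 1) ? sb : sa;
  }
  return out;
}

// Example: InferBroadcastShape({4, 1, 3}, {5, 3}) yields {4, 5, 3}.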
@@ -493,18 +639,33 @@ static paddle::Tensor dealWithAdvancedIndex(
   return transed_tensor;
 }
 
-inline std::vector<int64_t> ComputeIndexStrides(const paddle::Tensor& input,
-                                                const size_t index_dims_size) {
-  const auto& input_strides = input.strides();
-  size_t element_size_bytes = phi::SizeOf(input.dtype());
-  std::vector<int64_t> strides(index_dims_size, 0);
-  const size_t min_size =
-      std::min(static_cast<size_t>(input_strides.size()), index_dims_size);
-  for (size_t i = 0; i < min_size; ++i) {
-    strides[i] = input_strides[i] * element_size_bytes;
-  }
+static std::vector<paddle::Tensor> PrepareIndices(
+    const paddle::Tensor& tensor,
+    const paddle::Tensor& bool_2_idx,
+    const paddle::Tensor& bool_index) {
+  std::vector<paddle::Tensor> indices;
+  for (int j = 0; j < bool_2_idx.shape()[1]; ++j) {
+    paddle::Tensor sliced_tensor =
+        slice_ad_func(bool_2_idx, {1}, {j}, {j + 1}, {1}, {});
 
-  return strides;
+    // Calculate the required dimensionality
+    int64_t original_ndim =
+        tensor.shape().size() - bool_index.shape().size() + 1;
+    int64_t sliced_ndim = sliced_tensor.shape().size();
+    int64_t num_ones_to_add = original_ndim - sliced_ndim;
+
+    // Reshape the tensor by adding 1s if needed
+    if (num_ones_to_add > 0) {
+      std::vector<int64_t> new_shape = sliced_tensor.shape();
+      for (int64_t k = 0; k < num_ones_to_add; ++k) {
+        new_shape.push_back(1);
+      }
+      sliced_tensor = reshape_ad_func(sliced_tensor, new_shape);
+    }
+
+    indices.emplace_back(sliced_tensor);
+  }
+  return indices;
 }
 
 static paddle::Tensor getValueForBoolTensor(const paddle::Tensor& tensor,
@@ -547,17 +708,15 @@ static paddle::Tensor getValueForBoolTensor(const paddle::Tensor& tensor,
 
   auto bool_2_idx = nonzero_ad_func(bool_index);
 #ifdef PADDLE_WITH_CUDA
-  std::vector<paddle::Tensor> indices;
-  for (int j = 0; j < bool_2_idx.shape()[1]; ++j) {
-    paddle::Tensor sliced_tensor =
-        slice_ad_func(bool_2_idx, {1}, {j}, {j + 1}, {1}, {});
-    indices.emplace_back(sliced_tensor);
-  }
-  auto index_dims_vec = common::vectorize<int64_t>(bool_index.dims());
-  auto index_stride = ComputeIndexStrides(tensor, index_dims_vec.size());
-
-  return index_elementwise_ad_func(
-      tensor, indices, index_dims_vec, index_stride);
+  auto indices = PrepareIndices(tensor, bool_2_idx, bool_index);
+  AdvancedIndex ad = AdvancedIndex(tensor, indices);
+
+  return index_elementwise_get_ad_func(tensor,
+                                       ad.indices,
+                                       ad.src_sizes,
+                                       ad.src_strides,
+                                       ad.indexed_sizes,
+                                       ad.indexed_strides);
 #else
 
   return gather_nd_ad_func(tensor, bool_2_idx);
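
Note: the restriding that AdvancedIndex performs on the source tensor can be seen on plain vectors. A minimal sketch of the same erase/insert sequence as restride_src, on assumed example shapes and strides that are not taken from the commit:

// Illustrative only: source of shape [2, 3, 4] with element strides
// [12, 4, 1], and one index of shape [5] applied at dim 1
// (dims_before = 1, dims_indexed = 1).
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> shape = {2, 3, 4};
  std::vector<int64_t> strides = {12, 4, 1};
  const int64_t dims_before = 1, dims_indexed = 1;
  const std::vector<int64_t> replacement_shape = {5};  // index shape

  // The indexed dim is cut out and replaced by the index's shape,
  // with stride 0 for the newly inserted dims.
  shape.erase(shape.begin() + dims_before,
              shape.begin() + dims_before + dims_indexed);
  strides.erase(strides.begin() + dims_before,
                strides.begin() + dims_before + dims_indexed);
  shape.insert(shape.begin() + dims_before,
               replacement_shape.begin(), replacement_shape.end());
  strides.insert(strides.begin() + dims_before, replacement_shape.size(), 0);

  // shape   is now {2, 5, 4}   -> src_sizes
  // strides is now {12, 0, 1}  -> src_strides
  // indexed_sizes would be {3}, indexed_strides {4 * element size in bytes}.
  return 0;
}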

paddle/phi/infermeta/binary.cc

Lines changed: 7 additions & 5 deletions
@@ -2146,11 +2146,13 @@ void GatherNdInferMeta(const MetaTensor& x,
   out->set_dtype(x.dtype());
 }
 
-void IndexElementwiseInferMeta(const MetaTensor& x,
-                               const std::vector<const MetaTensor*>& index,
-                               const std::vector<int64_t>& index_dims,
-                               const std::vector<int64_t>& index_stride,
-                               MetaTensor* out) {
+void IndexElementwiseGetInferMeta(const MetaTensor& x,
+                                  const std::vector<const MetaTensor*>& index,
+                                  const std::vector<int64_t>& input_dims,
+                                  const std::vector<int64_t>& input_strides,
+                                  const std::vector<int64_t>& index_dims,
+                                  const std::vector<int64_t>& index_stride,
+                                  MetaTensor* out) {
   const auto& x_dims = x.dims();
 
   PADDLE_ENFORCE_LE(

paddle/phi/infermeta/binary.h

Lines changed: 7 additions & 5 deletions
@@ -401,11 +401,13 @@ void GatherNdInferMeta(const MetaTensor& x,
                        const MetaTensor& index,
                        MetaTensor* out);
 
-void IndexElementwiseInferMeta(const MetaTensor& x,
-                               const std::vector<const MetaTensor*>& index,
-                               const std::vector<int64_t>& index_dims,
-                               const std::vector<int64_t>& index_stride,
-                               MetaTensor* out);
+void IndexElementwiseGetInferMeta(const MetaTensor& x,
+                                  const std::vector<const MetaTensor*>& index,
+                                  const std::vector<int64_t>& input_dims,
+                                  const std::vector<int64_t>& input_strides,
+                                  const std::vector<int64_t>& index_dims,
+                                  const std::vector<int64_t>& index_stride,
+                                  MetaTensor* out);
 
 void GatherTreeMeta(const MetaTensor& ids,
                     const MetaTensor& parents,

paddle/phi/kernels/funcs/index_elementwise.cu.h

Lines changed: 4 additions & 107 deletions
@@ -157,119 +157,16 @@ std::array<char*, DDim::kMaxRank> GetIndexDataPtrs(
 
 template <int N, bool signed_strides = false>
 static OffsetCalculator<N, uint32_t, signed_strides> make_offset_calculator(
-    const DenseTensor& output,
-    const DenseTensor& input,
-    const std::vector<const DenseTensor*> index) {
-  int ndim = output.dims().size();
-  const int64_t* shape = output.dims().Get();
-  std::vector<int64_t> shape_vec(shape, shape + ndim);
-  std::reverse(shape_vec.begin(), shape_vec.end());
-  const int64_t* desired_shape = shape_vec.data();
-
-  std::vector<std::vector<int64_t>> strides;
-  std::vector<const DenseTensor*> tensors = {&output, &input};
-
-  for (const auto& idx_tensor : index) {
-    tensors.push_back(idx_tensor);
-  }
-
-  for (const auto& tensor : tensors) {
-    std::vector<int64_t> stride_bytes(ndim, 0);
-    const auto& original_shape = tensor->dims();
-    const auto& original_strides = tensor->strides();
-    int64_t element_size_in_bytes = phi::SizeOf(tensor->dtype());
-    int offset = ndim - original_shape.size();
-
-    if (tensor == &input) {
-      stride_bytes[ndim - 1] = element_size_in_bytes;
-    } else {
-      if (offset > 0) {
-        stride_bytes.resize(ndim, 0);
-      } else {
-        stride_bytes.resize(ndim);
-      }
-
-      for (int i = 0; i < original_shape.size(); ++i) {
-        if (original_shape[i] == 1 && shape[offset + i] != 1) {
-          stride_bytes[offset + i] = 0;
-        } else {
-          stride_bytes[offset + i] =
-              original_strides[i] * element_size_in_bytes;
-        }
-      }
-    }
-    std::reverse(stride_bytes.begin(), stride_bytes.end());
-    strides.push_back(stride_bytes);
-  }
-
+    int ndim,
+    const int64_t* shape,
+    const std::vector<std::vector<int64_t>>& strides) {
   std::array<const int64_t*, N> strides_array;
   for (int i = 0; i < N; ++i) {
    strides_array[i] = strides[i].data();
  }
 
   return OffsetCalculator<N, uint32_t, signed_strides>(
-      ndim, desired_shape, strides_array.data());
-}
-
-template <typename T, typename IndexT = int>
-void IndexElementwiseKernel(const phi::GPUContext& ctx,
-                            const DenseTensor& input,
-                            const std::vector<const DenseTensor*> index,
-                            const std::vector<int64_t>& index_dims,
-                            const std::vector<int64_t>& index_stride,
-                            DenseTensor* output) {
-  auto num_indices = index_dims.size();
-
-  auto index_ptrs = GetIndexDataPtrs<IndexT>(index);
-
-  auto sizes = std::array<int64_t, DDim::kMaxRank>{};
-  auto strides = std::array<int64_t, DDim::kMaxRank>{};
-
-  for (unsigned i = 0; i < num_indices; i++) {
-    sizes[i] = index_dims[i];
-    strides[i] = index_stride[i];
-  }
-
-  auto offset_calc = make_offset_calculator<3>(*output, input, index);
-
-  const int64_t N = output->numel();
-  PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
-
-                 "Output numel be in the range [0, "
-                 "std::numeric_limits<int32_t>::max()]");
-  constexpr int nt = launch_size_nd;
-  constexpr int vt = launch_bound2;
-  const dim3 block(nt);
-  const dim3 grid((N + block.x * vt - 1) / (block.x * vt));
-  auto stream = ctx.stream();
-
-  using dtype = OpaqueType<sizeof(T)>;
-
-  const char* in_ptr = reinterpret_cast<const char*>(input.data<T>());
-  char* out_ptr = reinterpret_cast<char*>(output->data<T>());
-
-  index_elementwise_kernel<nt, vt>
-      <<<grid, block, 0, stream>>>(N, [=] __device__(int idx) {
-        const auto offsets = offset_calc.get(idx);
-        char* const out_data = out_ptr + offsets[0];
-        const char* const in_data = in_ptr + offsets[1];
-
-        int64_t offset = 0;
-#pragma unroll
-        for (int i = 0; i < num_indices; i++) {
-          int64_t index =
-              *reinterpret_cast<int64_t*>(index_ptrs[i] + offsets[2]);
-          PADDLE_ENFORCE(-sizes[i] <= index && index < sizes[i],
-                         "index out of bounds");
-          if (index < 0) {
-            index += sizes[i];
-          }
-          offset += index * strides[i];
-        }
-
-        *reinterpret_cast<dtype*>(out_data) =
-            *reinterpret_cast<const dtype*>(in_data + offset);
-      });
+      ndim, shape, strides_array.data());
 }
 
 }  // namespace funcs
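
Note: make_offset_calculator no longer inspects the output, input, and index tensors itself; the caller now supplies the rank, the (trailing-dim-first) output shape, and per-operand byte strides. A rough sketch of how one such stride vector could be assembled, mirroring the deleted logic for broadcastable operands (the function name and placement are assumptions; the new call sites are not part of this excerpt):

// Illustrative only: builds one reversed, byte-unit stride vector of the
// output's rank for an operand, padding broadcast dims with stride 0.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int64_t> StrideBytesForOperand(
    const std::vector<int64_t>& out_shape,        // output shape, ndim entries
    const std::vector<int64_t>& operand_shape,    // operand shape, rank <= ndim
    const std::vector<int64_t>& operand_strides,  // element-unit strides
    int64_t element_size_bytes) {
  const int ndim = static_cast<int>(out_shape.size());
  const int offset = ndim - static_cast<int>(operand_shape.size());
  std::vector<int64_t> stride_bytes(ndim, 0);
  for (int i = 0; i < static_cast<int>(operand_shape.size()); ++i) {
    // Dimensions broadcast against the output keep stride 0.
    if (!(operand_shape[i] == 1 && out_shape[offset + i] != 1)) {
      stride_bytes[offset + i] = operand_strides[i] * element_size_bytes;
    }
  }
  // The offset calculator expects trailing-dimension-first ordering.
  std::reverse(stride_bytes.begin(), stride_bytes.end());
  return stride_bytes;
}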
