Commit c17e9d7

[XPU] fix index's datatype, using int64 instead of int, part 2 (g-n)
1 parent 6f80ea0 commit c17e9d7

90 files changed (+576, -616 lines)
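
The theme running through these diffs: a 32-bit int cannot represent element counts or offsets for tensors with more than 2^31 - 1 elements, so shape vectors, loop counters, and index arguments move to int64_t. A minimal standalone sketch of the failure mode (illustrative only, not code from this commit):

    #include <cstdint>
    #include <iostream>

    int main() {
      // A plausible large activation shape: 32 x 256 x 1024 x 1024.
      int64_t n = 32, c = 256, h = 1024, w = 1024;

      int64_t numel = n * c * h * w;  // 8589934592 == 2^33, exact in 64 bits

      // Storing the element count (or an offset derived from it) in a
      // 32-bit int silently drops the high bits; here the count collapses
      // to 0 on typical two's-complement targets.
      int32_t narrowed = static_cast<int32_t>(numel);

      std::cout << numel << " -> " << narrowed << "\n";  // 8589934592 -> 0
      return 0;
    }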


paddle/phi/kernels/funcs/unfold_functor.h

Lines changed: 5 additions & 8 deletions

@@ -18,14 +18,11 @@ namespace phi {
 namespace funcs {
 
 //////// CalcOutputSize Functor ///////
-inline int CalcOutputSize(int input_size,
-                          int filter_size,
-                          int dilation,
-                          int padding1,
-                          int padding2,
-                          int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1;
+template <typename T = int>
+inline T CalcOutputSize(
+    T input_size, T filter_size, T dilation, T padding1, T padding2, T stride) {
+  const T dkernel = dilation * (filter_size - 1) + 1;
+  T output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1;
   return input_size == -1 ? -1 : output_size;
 }
 
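
The default template argument keeps existing call sites (which pass plain ints and get T = int) compiling unchanged, while XPU code can now instantiate the same helper with 64-bit extents. A quick standalone check of both instantiations (illustrative, not part of the commit):

    #include <cassert>
    #include <cstdint>

    template <typename T = int>
    inline T CalcOutputSize(
        T input_size, T filter_size, T dilation, T padding1, T padding2, T stride) {
      const T dkernel = dilation * (filter_size - 1) + 1;
      T output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1;
      return input_size == -1 ? -1 : output_size;
    }

    int main() {
      // All-int arguments deduce T = int, matching the old signature.
      assert(CalcOutputSize(224, 3, 1, 1, 1, 1) == 224);
      // Explicit int64_t instantiation survives extents beyond INT32_MAX.
      int64_t big = int64_t{1} << 33;
      assert(CalcOutputSize<int64_t>(big, 3, 1, 1, 1, 1) == big);
      return 0;
    }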

paddle/phi/kernels/impl/unfold_grad_kernel_impl.h

Lines changed: 2 additions & 2 deletions

@@ -39,13 +39,13 @@ void UnfoldGradKernel(const Context& ctx,
   const auto& x_dims = x_grad->dims();
   const int batch_size = static_cast<int>(x_dims[0]);
 
-  int out_height = phi::funcs::CalcOutputSize(x_dims[2],
+  int out_height = phi::funcs::CalcOutputSize(static_cast<int>(x_dims[2]),
                                               kernel_sizes[0],
                                               dilations[0],
                                               paddings[0],
                                               paddings[2],
                                               strides[0]);
-  int out_width = phi::funcs::CalcOutputSize(x_dims[3],
+  int out_width = phi::funcs::CalcOutputSize(static_cast<int>(x_dims[3]),
                                              kernel_sizes[1],
                                              dilations[1],
                                              paddings[1],
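
The static_cast<int> is what keeps these call sites compiling against the now-templated helper: x_dims[i] is an int64_t while kernel_sizes, dilations, paddings, and strides hold ints, and mixing the two gives the compiler conflicting deductions for T. A minimal reproduction of the error (hypothetical simplified signature):

    #include <cstdint>

    template <typename T = int>
    T CalcOutputSize(T input_size, T filter_size) {
      return input_size - filter_size + 1;  // body simplified for illustration
    }

    int main() {
      int64_t dim = 224;  // stands in for x_dims[2]
      int kernel = 3;     // stands in for kernel_sizes[0]
      // CalcOutputSize(dim, kernel);  // error: T deduced as both int64_t and int
      int out = CalcOutputSize(static_cast<int>(dim), kernel);  // ok: T = int
      return out == 222 ? 0 : 1;
    }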

paddle/phi/kernels/impl/unfold_kernel_impl.h

Lines changed: 2 additions & 2 deletions

@@ -37,13 +37,13 @@ void UnfoldKernel(const Context& ctx,
   phi::funcs::Im2ColFunctor<phi::funcs::ColFormat::kCFO, Context, T> im2col;
   const auto& x_dims = x.dims();
 
-  int out_height = phi::funcs::CalcOutputSize(x_dims[2],
+  int out_height = phi::funcs::CalcOutputSize(static_cast<int>(x_dims[2]),
                                               kernel_sizes[0],
                                               dilations[0],
                                               paddings[0],
                                               paddings[2],
                                               strides[0]);
-  int out_width = phi::funcs::CalcOutputSize(x_dims[3],
+  int out_width = phi::funcs::CalcOutputSize(static_cast<int>(x_dims[3]),
                                              kernel_sizes[1],
                                              dilations[1],
                                              paddings[1],

paddle/phi/kernels/xpu/beam_search_decode_kernel.cc

Lines changed: 2 additions & 2 deletions

@@ -86,15 +86,15 @@ void BeamSearchDecodeXPUKernel(const Context& dev_ctx,
       *sentenceIds, sentenceIds_temp, 1, ids->at(0).place());
   PADDLE_ENFORCE_EQ(
       r,
-      xpu::Error_t::SUCCESS,
+      0,
       common::errors::External(
           "Execute function CopyTensorByXPU failed by [%d]", r));
 
   r = phi::funcs::CopyTensorByType(
       *sentenceScores, sentenceScores_temp, 1, ids->at(0).place());
   PADDLE_ENFORCE_EQ(
       r,
-      xpu::Error_t::SUCCESS,
+      0,
       common::errors::External(
           "Execute function CopyTensorByType failed by [%d]", r));
   sentenceIds_temp->set_lod(sentenceIds->lod());

paddle/phi/kernels/xpu/distribute_fpn_proposals_kernel.cc

Lines changed: 1 addition & 1 deletion

@@ -106,7 +106,7 @@ void DistributeFpnProposalsKernel(
     rois_lod_vec[i] = static_cast<int>(fpn_rois_lod[i]);
   }
   xpu::VectorParam<int> rois_lod = {
-      rois_lod_vec.data(), static_cast<int>(rois_lod_vec.size()), nullptr};
+      rois_lod_vec.data(), static_cast<int64_t>(rois_lod_vec.size()), nullptr};
 
   int r = xpu::distribute_fpn_proposals_helper<XPUType, int>(
       dev_ctx.x_context(),
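
The cast here tracks a widening of the length field in the XPU runtime's VectorParam helper. A rough sketch of the pattern, assuming a layout along the lines of {host pointer, length, device pointer} (the real definition lives in the Baidu XPU headers and may differ):

    #include <cstdint>
    #include <vector>

    // Assumed shape of the XDNN helper struct, for illustration only.
    template <typename T>
    struct VectorParamSketch {
      const T* cpu;  // host-side data, or nullptr
      int64_t len;   // element count, widened from int in newer toolkits
      T* xpu;        // device-side data, or nullptr
    };

    int main() {
      std::vector<int> rois_lod_vec = {0, 4, 9};
      // With len as int64_t, size_t -> int64_t is the lossless conversion;
      // the old static_cast<int> could truncate very large sizes.
      VectorParamSketch<int> rois_lod = {
          rois_lod_vec.data(), static_cast<int64_t>(rois_lod_vec.size()), nullptr};
      return rois_lod.len == 3 ? 0 : 1;
    }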

paddle/phi/kernels/xpu/elementwise.h

Lines changed: 0 additions & 2 deletions

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#ifdef PADDLE_WITH_XPU
 #include <algorithm>
 #include <string>
 #include <tuple>
@@ -218,4 +217,3 @@ void XPUElementwiseGrad(const XPUContext& dev_ctx,
 }
 
 }  // namespace phi
-#endif

paddle/phi/kernels/xpu/flash_attn_utils.h

Lines changed: 1 addition & 8 deletions

@@ -13,9 +13,6 @@
 // limitations under the License.
 
 #pragma once
-
-#ifdef PADDLE_WITH_XPU
-
 #include <vector>
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/common/memory_utils.h"
@@ -24,7 +21,6 @@
 namespace xfa = baidu::xpu::xfa;
 namespace phi {
 
-#ifdef PADDLE_WITH_XPU_XRE5
 using XPUTypeFP16 = typename XPUTypeTrait<phi::dtype::float16>::Type;
 using XPUTypeBF16 = typename XPUTypeTrait<phi::dtype::bfloat16>::Type;
 
@@ -87,8 +83,5 @@ static void GenerateRNGState(
     seed_offset_data[1] = static_cast<int64_t>(seed_offset_pair.second);
   }
 }
-
-#endif
-
 }  // namespace phi
-#endif
+#

paddle/phi/kernels/xpu/flatten2_grad_kernel.cc

Lines changed: 0 additions & 4 deletions

@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifdef PADDLE_WITH_XPU
-
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/flatten2_kernel_impl.h"
@@ -32,5 +30,3 @@ PD_REGISTER_KERNEL(flatten2_grad,
                    int8_t,
                    uint8_t,
                    bool) {}
-
-#endif

paddle/phi/kernels/xpu/flatten2_kernel.cc

Lines changed: 0 additions & 4 deletions

@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifdef PADDLE_WITH_XPU
-
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/flatten2_kernel_impl.h"
@@ -32,5 +30,3 @@ PD_REGISTER_KERNEL(flatten2,
                    int8_t,
                    uint8_t,
                    bool) {}
-
-#endif

paddle/phi/kernels/xpu/gather_grad_kernel.cc

Lines changed: 9 additions & 14 deletions

@@ -26,9 +26,9 @@ void GatherGradKernel(const Context& dev_ctx,
                       const DenseTensor& out_grad,
                       const Scalar& axis,
                       DenseTensor* x_grad) {
-  auto axis_v = axis.to<int>();
+  auto axis_v = axis.to<int64_t>();
   if (axis_v < 0) {
-    axis_v += static_cast<int>(x.dims().size());
+    axis_v += static_cast<int64_t>(x.dims().size());
   }
 
   const auto& index_type = index.dtype();
@@ -53,7 +53,7 @@ void GatherGradKernel(const Context& dev_ctx,
         "The index should be 0D or 1D, when it is not 2D, but we get %d",
         index_dims.size()));
   }
-  std::vector<int> xshape(x_grad->dims().size());
+  std::vector<int64_t> xshape(x_grad->dims().size());
   for (int i = 0; i < x_grad->dims().size(); ++i) {
     xshape[i] = x_grad->dims()[i];
   }
@@ -72,24 +72,19 @@ void GatherGradKernel(const Context& dev_ctx,
         index.dims().size() == 0 ? 1 : index.dims()[0],
         axis_v,
         false);
-  } else {
-    xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
-    int* index_int_ptr_l3 = RAII_GUARD.alloc_l3_or_gm<int32_t>(index.numel());
-    r = xpu::cast<int64_t, int32_t>(dev_ctx.x_context(),
-                                    index.data<int64_t>(),
-                                    index_int_ptr_l3,
-                                    index.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
-
-    r = xpu::gather_grad<XPUType, int>(
+  } else if (index_type == DataType::INT64) {
+    r = xpu::gather_grad<XPUType, int64_t>(
        dev_ctx.x_context(),
        reinterpret_cast<const XPUType*>(out_grad.data<T>()),
-        index_int_ptr_l3,
+        index.data<int64_t>(),
        reinterpret_cast<XPUType*>(x_grad->data<T>()),
        xshape,
        index.dims().size() == 0 ? 1 : index.dims()[0],
        axis_v,
        false);
+  } else {
+    PADDLE_THROW(common::errors::InvalidArgument("Unsupported index type: %s",
+                                                 DataTypeToString(index_type)));
  }
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather_grad");
 }
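
The new branch shape (one explicitly-typed branch per supported index dtype, plus a hard failure for anything else) replaces the old fallback that cast int64 indices down to int32 through a scratch buffer, which silently truncated any index above INT32_MAX. A generic standalone sketch of this dispatch pattern (hypothetical names, not Paddle's API):

    #include <cstdint>
    #include <stdexcept>

    enum class DType { INT32, INT64, FLOAT32 };

    template <typename IndexT>
    int GatherGradImpl(const IndexT* /*index*/, int64_t /*count*/) {
      return 0;  // stand-in for the typed device kernel call
    }

    // Dispatch on the runtime dtype; unsupported types fail loudly instead
    // of being narrowed to a smaller index type.
    int GatherGrad(DType index_type, const void* index, int64_t count) {
      if (index_type == DType::INT32) {
        return GatherGradImpl(static_cast<const int32_t*>(index), count);
      } else if (index_type == DType::INT64) {
        return GatherGradImpl(static_cast<const int64_t*>(index), count);
      }
      throw std::invalid_argument("Unsupported index type");
    }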

paddle/phi/kernels/xpu/gather_kernel.cc

Lines changed: 4 additions & 1 deletion

@@ -67,7 +67,7 @@ void GatherKernel(const Context& dev_ctx,
        xshape,
        index.dims().size() == 0 ? 1 : index.dims()[0],
        axis_v);
-  } else {
+  } else if (index_type == DataType::INT64) {
    r = xpu::paddle_gather<XPUType, int64_t>(
        dev_ctx.x_context(),
        reinterpret_cast<const XPUType*>(x.data<T>()),
@@ -76,6 +76,9 @@ void GatherKernel(const Context& dev_ctx,
        xshape,
        index.dims().size() == 0 ? 1 : index.dims()[0],
        axis_v);
+  } else {
+    PADDLE_THROW(common::errors::InvalidArgument("Unsupported index type: %s",
+                                                 DataTypeToString(index_type)));
  }
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "paddle_gather");
 }

paddle/phi/kernels/xpu/gather_nd_grad_kernel.cc

Lines changed: 2 additions & 2 deletions

@@ -93,9 +93,9 @@ void GatherNdGradKernel(const Context &ctx,
     index_shape.insert(index_shape.begin(), 1);
   }
   xpu::VectorParam<int64_t> x_vec = {
-      x_shape.data(), static_cast<int>(x_shape.size()), nullptr};
+      x_shape.data(), static_cast<int64_t>(x_shape.size()), nullptr};
 
-  int index_size = static_cast<int>(index.numel());
+  int64_t index_size = index.numel();
   if (index_type == phi::DataType::INT32) {
     auto index_data = const_cast<int *>(index.data<int>());
     xpu::VectorParam<int> index_vec{nullptr, index_size, index_data};

paddle/phi/kernels/xpu/gather_nd_kernel.cc

Lines changed: 4 additions & 4 deletions

@@ -77,13 +77,13 @@ void GatherNdKernel(const Context &ctx,
                         DataType::INT32,
                         DataType::INT64));
 
-  auto x_shape = common::vectorize<int>(x.dims());
-  auto index_shape = common::vectorize<int>(index.dims());
+  auto x_shape = common::vectorize<int64_t>(x.dims());
+  auto index_shape = common::vectorize<int64_t>(index.dims());
   if (index_shape.size() == 1) {
     index_shape.insert(index_shape.begin(), 1);
   }
-  xpu::VectorParam<int> x_vec = {
-      x_shape.data(), static_cast<int>(x_shape.size()), nullptr};
+  xpu::VectorParam<int64_t> x_vec = {
+      x_shape.data(), static_cast<int64_t>(x_shape.size()), nullptr};
 
   int ret = 0;
 #ifndef PADDLE_WITH_XPU_PLUGIN

paddle/phi/kernels/xpu/generate_proposals_kernel.cc

Lines changed: 11 additions & 12 deletions

@@ -46,7 +46,7 @@ static void SortDescending(const XPUContext& dev_ctx,
   DenseTensor index_t;
   index_t.Resize({value.numel()});
   int* index = dev_ctx.template HostAlloc<int>(&index_t);
-  for (int i = 0; i < value.numel(); ++i) {
+  for (int64_t i = 0; i < value.numel(); ++i) {
     index[i] = i;
   }
 
@@ -104,26 +104,25 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
       scores_slice.data<T>(),
       index_sort.data<int>(),
       scores_sel.data<T>(),
-      {static_cast<int>(scores_slice.numel()), 1},
+      {scores_slice.numel(), 1},
       index_sort.numel(),
       0);
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "paddle_gather");
 
-  r = xpu::paddle_gather<T>(
-      dev_ctx.x_context(),
-      bbox_deltas_slice.data<T>(),
-      index_sort.data<int>(),
-      bbox_sel.data<T>(),
-      {static_cast<int>(bbox_deltas_slice.numel()) / 4, 4},
-      index_sort.numel(),
-      0);
+  r = xpu::paddle_gather<T>(dev_ctx.x_context(),
+                            bbox_deltas_slice.data<T>(),
+                            index_sort.data<int>(),
+                            bbox_sel.data<T>(),
+                            {bbox_deltas_slice.numel() / 4, 4},
+                            index_sort.numel(),
+                            0);
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "paddle_gather");
 
   r = xpu::paddle_gather<T>(dev_ctx.x_context(),
                             anchors.data<T>(),
                             index_sort.data<int>(),
                             anchor_sel.data<T>(),
-                            {static_cast<int>(anchors.numel()) / 4, 4},
+                            {anchors.numel() / 4, 4},
                             index_sort.numel(),
                             0);
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "paddle_gather");
@@ -132,7 +131,7 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
       variances.data<T>(),
       index_sort.data<int>(),
       var_sel.data<T>(),
-      {static_cast<int>(variances.numel()) / 4, 4},
+      {variances.numel() / 4, 4},
       index_sort.numel(),
       0);
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "paddle_gather");

paddle/phi/kernels/xpu/grid_sample_kernel.cc

Lines changed: 12 additions & 12 deletions

@@ -64,14 +64,14 @@ void GridSampleKernel(const Context& dev_ctx,
   const T* input_data = x.data<T>();
   const T* grid_data = grid.data<T>();
 
-  int n = x.dims()[0];
-  int c = x.dims()[1];
+  int64_t n = x.dims()[0];
+  int64_t c = x.dims()[1];
 
   if (x.dims().size() == 4) {  // 2D grid sample
-    int h = x.dims()[2];
-    int w = x.dims()[3];
-    int out_h = grid.dims()[1];
-    int out_w = grid.dims()[2];
+    int64_t h = x.dims()[2];
+    int64_t w = x.dims()[3];
+    int64_t out_h = grid.dims()[1];
+    int64_t out_w = grid.dims()[2];
 
     bool is_nchw_bool;
     if (data_format == "NCHW") {
@@ -104,12 +104,12 @@ void GridSampleKernel(const Context& dev_ctx,
         is_nchw_bool);
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "grid_sampler");
   } else {  // 3D grid sample
-    int d = x.dims()[2];
-    int h = x.dims()[3];
-    int w = x.dims()[4];
-    int out_d = grid.dims()[1];
-    int out_h = grid.dims()[2];
-    int out_w = grid.dims()[3];
+    int64_t d = x.dims()[2];
+    int64_t h = x.dims()[3];
+    int64_t w = x.dims()[4];
+    int64_t out_d = grid.dims()[1];
+    int64_t out_h = grid.dims()[2];
+    int64_t out_w = grid.dims()[3];
 
     out->Resize(common::make_ddim({n, c, out_d, out_h, out_w}));
     T* output_data = dev_ctx.template Alloc<T>(out);

paddle/phi/kernels/xpu/group_norm_grad_kernel.cc

Lines changed: 12 additions & 11 deletions

@@ -47,19 +47,20 @@ void GroupNormGradKernel(const Context& dev_ctx,
   const DataLayout data_layout = common::StringToDataLayout(data_layout_str);
   const auto scale_ptr = scale.get_ptr();
   const auto bias_ptr = bias.get_ptr();
-  const auto x_dims = common::vectorize<int>(x.dims());
-  const int N = x_dims[0];
+  const auto x_dims = common::vectorize<int64_t>(x.dims());
+  const int64_t N = x_dims[0];
   const bool channel_first =
       data_layout == DataLayout::kNCHW || data_layout == DataLayout::kNCDHW;
-  const int C = (channel_first ? x_dims[1] : x_dims[x_dims.size() - 1]);
-  const int L =
-      (channel_first
-           ? std::accumulate(
-                 x_dims.begin() + 2, x_dims.end(), 1, std::multiplies<int>())
-           : std::accumulate(x_dims.begin() + 1,
-                             x_dims.end() - 1,
-                             1,
-                             std::multiplies<int>()));
+  const int64_t C = (channel_first ? x_dims[1] : x_dims[x_dims.size() - 1]);
+  const int64_t L =
+      (channel_first ? std::accumulate(x_dims.begin() + 2,
+                                       x_dims.end(),
+                                       1,
+                                       std::multiplies<int64_t>())
+                     : std::accumulate(x_dims.begin() + 1,
+                                       x_dims.end() - 1,
+                                       1,
+                                       std::multiplies<int64_t>()));
 
   dev_ctx.template Alloc<T>(d_x);
   phi::funcs::SetConstant<XPUContext, T> set_zero;
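
One subtlety worth noting in this hunk: std::accumulate deduces its accumulator type from the init argument, not from the binary op, so with a plain int literal 1 the running product is still stored in an int between steps even though std::multiplies<int64_t> multiplies in 64 bits. Passing a 64-bit init value is the fully safe form. A small standalone demonstration (illustrative, not from the commit):

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <numeric>
    #include <vector>

    int main() {
      // Dims whose product (2^33) does not fit in a 32-bit int.
      std::vector<int64_t> dims = {1 << 17, 1 << 16};

      // Pitfall: the accumulator's type comes from the init argument, so a
      // plain `1` keeps the running product in int; each int64_t result is
      // narrowed back, typically collapsing 2^33 to 0.
      auto truncated = std::accumulate(
          dims.begin(), dims.end(), 1, std::multiplies<int64_t>());

      // Safe: a 64-bit init keeps the whole accumulation in int64_t.
      auto full = std::accumulate(
          dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>());

      std::cout << truncated << " vs " << full << "\n";  // 0 vs 8589934592
      return 0;
    }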
