[XPU] fix index's datatype, using int64 instead of int, part 1 (a-f) #72431

Merged: 1 commit merged on May 7, 2025
Changes from all commits
26 changes: 26 additions & 0 deletions paddle/phi/kernels/funcs/norm_utils.h
@@ -46,5 +46,31 @@ inline void ExtractNCWHD(const phi::DDim &dims,
: 1;
}
}

inline void ExtractNCWHD(const phi::DDim &dims,
[Review comment from @cqulilujia (Contributor, Author), Apr 24, 2025: add an int64_t overload of this function.]

const DataLayout &data_layout,
int64_t *N,
int64_t *C,
int64_t *H,
int64_t *W,
int64_t *D) {
*N = dims[0];
if (dims.size() == 2) {
*C = dims[1];
*H = 1;
*W = 1;
*D = 1;
} else {
*C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
*H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
*W = dims.size() > 3
? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
: 1;
*D = dims.size() > 4
? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
: 1;
}
}

} // namespace funcs
} // namespace phi
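To illustrate the new overload, here is a minimal sketch of a hypothetical caller; the shape and layout arguments are whatever the calling kernel already has, and only the int64_t output pointers are new:

```cpp
// Minimal sketch of a hypothetical caller of the new int64_t overload.
#include <cstdint>

#include "paddle/phi/kernels/funcs/norm_utils.h"

void ExampleUse(const phi::DDim& dims, const phi::DataLayout& layout) {
  int64_t N = 0, C = 0, H = 0, W = 0, D = 0;
  // Overload resolution picks the new int64_t version because the output
  // pointers are int64_t*; the original int* version is left untouched.
  phi::funcs::ExtractNCWHD(dims, layout, &N, &C, &H, &W, &D);
  // N * C * H * W * D now matches numel() without 32-bit truncation,
  // even for tensors with more than INT_MAX elements.
}
```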
8 changes: 4 additions & 4 deletions paddle/phi/kernels/xpu/activation_grad_kernel.cc
@@ -144,7 +144,7 @@ int xpu_activation_backward(const Context& dev_ctx,
const XPUType*,
const XPUType*,
XPUType*,
int)> func) {
int64_t)> func) {
/* TODO: relu tanh sigmoid are inplace */
const XPUType* x_data = nullptr;
const XPUType* y_data = nullptr;
@@ -446,9 +446,9 @@ void PowGradKernel(const Context& dev_ctx,
T* x_grad = dx->data<T>();

// check dims: all dims should equal
auto x_dims = common::vectorize<int>(x.dims());
auto dy_dims = common::vectorize<int>(dout.dims());
auto dx_dims = common::vectorize<int>(dx->dims());
auto x_dims = common::vectorize<int64_t>(x.dims());
auto dy_dims = common::vectorize<int64_t>(dout.dims());
auto dx_dims = common::vectorize<int64_t>(dx->dims());
PADDLE_ENFORCE_EQ(x_dims,
dy_dims,
errors::PreconditionNotMet("x_dims should match dy_dims."));
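For reference, a rough sketch of the widened callback type accepted by xpu_activation_backward; xpu::Context and XPUType are replaced with stand-ins here, so this is illustrative rather than the real XPU API:

```cpp
#include <cstdint>
#include <functional>

// Stand-ins for xpu::Context and XPUType, purely for illustration.
struct FakeContext;
using XPUType = float;

// Shape of the callback now accepted by xpu_activation_backward: the
// trailing element count is int64_t instead of int, so lengths above
// INT_MAX are passed through without narrowing.
using BackwardFn = std::function<int(FakeContext*,
                                     const XPUType*,   // x
                                     const XPUType*,   // y
                                     const XPUType*,   // dy
                                     XPUType*,         // dx
                                     int64_t)>;        // element count
```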
10 changes: 5 additions & 5 deletions paddle/phi/kernels/xpu/activation_kernel.cc
@@ -72,7 +72,7 @@ int xpu_activation_func(
const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out,
std::function<int(xpu::Context*, const XPUType*, XPUType*, int)> func) {
std::function<int(xpu::Context*, const XPUType*, XPUType*, int64_t)> func) {
int r = func(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
@@ -85,8 +85,8 @@ int xpu_activation_func_with_max_x_y(
const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out,
std::function<
int(xpu::Context*, const XPUType*, XPUType*, int, const float*, float*)>
std::function<int(
xpu::Context*, const XPUType*, XPUType*, int64_t, const float*, float*)>
func) {
// does not support "const float* max_x, float* max_y" now
int r = func(dev_ctx.x_context(),
@@ -106,7 +106,7 @@ int xpu_activation_1attr_func(const Context& dev_ctx,
std::function<int(xpu::Context*,
const XPUType*,
XPUType*,
int,
int64_t,
float,
const float*,
float*)> func) {
@@ -130,7 +130,7 @@ int xpu_activation_2attr_func(const Context& dev_ctx,
std::function<int(xpu::Context*,
const XPUType*,
XPUType*,
int,
int64_t,
float,
float,
const float*,
2 changes: 1 addition & 1 deletion paddle/phi/kernels/xpu/add_n_kernel.cc
@@ -147,7 +147,7 @@ void AddNArrayKernel(const Context& dev_ctx,
reinterpret_cast<const XPUType*>(out->at(j).data<T>()));

// int sum(Context* ctx, const std::vector<const T*>& x_list, T*
// y, int len);
// y, int64_t len);
int r = xpu::sum(dev_ctx.x_context(),
ptrs,
reinterpret_cast<XPUType*>(out->at(j).data<T>()),
7 changes: 4 additions & 3 deletions paddle/phi/kernels/xpu/affine_channel_grad_kernel.cc
@@ -43,9 +43,10 @@ void AffineChannelGradXPUKernel(const Context& dev_ctx,
const phi::DataLayout layout = common::StringToDataLayout(data_layout);

auto dims = x->dims();
int N = dims[0];
int C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
int HxW = x->numel() / N / C;
int64_t N = dims[0];
int64_t C =
(layout == phi::DataLayout::kNCHW) ? dims[1] : dims[dims.size() - 1];
int64_t HxW = x->numel() / N / C;

auto* dy_d = dy->data<T>();
auto* scale_d = scale->data<T>();
7 changes: 4 additions & 3 deletions paddle/phi/kernels/xpu/affine_channel_kernel.cc
@@ -39,9 +39,10 @@ void AffineChannelXPUKernel(const Context& dev_ctx,
const phi::DataLayout layout = common::StringToDataLayout(data_layout);

auto dims = x->dims();
int N = dims[0];
int C = layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
int HxW = x->numel() / N / C;
int64_t N = dims[0];
int64_t C =
layout == phi::DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
int64_t HxW = x->numel() / N / C;

auto* scale_d = scale->data<T>();
auto* bias_d = bias->data<T>();
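The reason for widening N, C, and HxW is easiest to see with a made-up shape; the numbers below are illustrative and not taken from the PR:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical NCHW tensor of shape [1, 4, 46341, 46341]:
  // numel() = 8'589'953'124, which does not fit in a 32-bit int.
  int64_t numel = 1LL * 4 * 46341 * 46341;
  int64_t N = 1, C = 4;
  int64_t HxW = numel / N / C;  // 2'147'488'281 > INT_MAX
  std::cout << "HxW = " << HxW << '\n';
  // With the old `int HxW`, the 64-bit quotient would be narrowed to
  // int on assignment and silently truncated.
  return 0;
}
```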
2 changes: 1 addition & 1 deletion paddle/phi/kernels/xpu/amp_kernel.cc
@@ -67,7 +67,7 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = dev_ctx.template Alloc<T>(out);
int num = out->numel();
int64_t num = out->numel();
if (cpu_found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --";
int r = 0;
8 changes: 4 additions & 4 deletions paddle/phi/kernels/xpu/arg_min_max_kernel.cc
@@ -49,7 +49,7 @@ void ArgMaxKernel(const Context& dev_ctx,
dtype));
// TODO(ZHUI): fix dtype of out
DDim x_dims;
int axis_val = axis.to<int>();
int64_t axis_val = axis.to<int64_t>();
if (flatten) {
x_dims = common::make_ddim({x.numel()});
// if flatten, the axis just as 0
@@ -58,7 +58,7 @@
x_dims = x.dims();
if (axis_val < 0) axis_val += x_dims.size();
}
auto xdims_vec = common::vectorize<int>(x_dims);
auto xdims_vec = common::vectorize<int64_t>(x_dims);
if (dtype != DataType::INT32) {
dev_ctx.template Alloc<int64_t>(out);
if (x.dims().size() == 0) {
@@ -130,7 +130,7 @@ void ArgMinKernel(const Context& dev_ctx,
dtype));

DDim x_dims;
int axis_val = axis.to<int>();
int64_t axis_val = axis.to<int64_t>();
if (flatten) {
x_dims = common::make_ddim({x.numel()});
// If flatten, the axis just as 0
@@ -139,7 +139,7 @@
x_dims = x.dims();
if (axis_val < 0) axis_val += x_dims.size();
}
auto xdims_vec = common::vectorize<int>(x_dims);
auto xdims_vec = common::vectorize<int64_t>(x_dims);
if (dtype != DataType::INT32) {
dev_ctx.template Alloc<int64_t>(out);
if (x.dims().size() == 0) {
16 changes: 8 additions & 8 deletions paddle/phi/kernels/xpu/argsort_grad_kernel.cc
@@ -51,15 +51,15 @@ void ArgsortGradKernel(const Context& dev_ctx,
if (axis == -1 || axis + 1 == in_dims.size()) {
is_need_transpose = false;
}
int len_before = common::product(common::slice_ddim(in_dims, 0, axis));
int len_after =
auto len_before = common::product(common::slice_ddim(in_dims, 0, axis));
auto len_after =
common::product(common::slice_ddim(in_dims, axis + 1, in_dims.size()));
int m = len_before * len_after;
int n = in_dims[axis];
int len = m * n;
std::vector<int> permute_vec{0, 2, 1};
std::vector<int> data_shape{len_before, n, len_after};
std::vector<int> data_shape_trans{len_before, len_after, n};
auto m = len_before * len_after;
auto n = in_dims[axis];
auto len = m * n;
std::vector<int64_t> permute_vec{0, 2, 1};
std::vector<int64_t> data_shape{len_before, n, len_after};
std::vector<int64_t> data_shape_trans{len_before, len_after, n};

const int64_t* indices_data = indices.data<int64_t>();
const T* out_grad_data = out_grad.data<T>();
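To show why the argsort shape bookkeeping needs 64-bit products, here is a small self-contained sketch; the input shape is invented and common::product is replaced by a plain loop:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Invented input shape and sort axis.
  std::vector<int64_t> in_dims{8, 70000, 40000};
  int axis = 1;

  // Products of the dims before and after `axis`, kept in 64-bit,
  // mirroring the widened len_before / len_after in the kernel.
  int64_t len_before = 1, len_after = 1;
  for (int i = 0; i < axis; ++i) len_before *= in_dims[i];
  for (size_t i = axis + 1; i < in_dims.size(); ++i) len_after *= in_dims[i];

  int64_t m = len_before * len_after;  // 8 * 40000 = 320000
  int64_t n = in_dims[axis];           // 70000
  int64_t len = m * n;                 // 22'400'000'000 > INT_MAX

  // The transpose metadata is widened the same way in the PR.
  std::vector<int64_t> data_shape{len_before, n, len_after};
  std::vector<int64_t> data_shape_trans{len_before, len_after, n};

  std::cout << "len = " << len << ", shape dims = " << data_shape.size()
            << " / " << data_shape_trans.size() << '\n';
  return 0;
}
```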