From 1980035aec2e0f046188aa063639156a27463e40 Mon Sep 17 00:00:00 2001
From: chen huangrun <1181749441@qq.com>
Date: Mon, 26 May 2025 13:16:15 +0800
Subject: [PATCH] fix unstack big tensor

Use int64_t instead of int for the shape and index arithmetic in the
unstack infermeta and kernel paths, so tensors with more than INT32_MAX
elements no longer overflow 32-bit indices.

---
 paddle/phi/infermeta/backward.cc                   |  4 ++--
 paddle/phi/infermeta/unary.cc                      |  2 +-
 paddle/phi/kernels/funcs/stack_and_unstack.h       | 11 +++++++----
 paddle/phi/kernels/impl/unstack_grad_kernel_impl.h | 12 ++++++------
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
index a7d368ea869b22..020a3c931c2d60 100644
--- a/paddle/phi/infermeta/backward.cc
+++ b/paddle/phi/infermeta/backward.cc
@@ -1747,8 +1747,8 @@ void UnStackGradInferMeta(const std::vector<const MetaTensor*>& out_grad,
                         rank));
   if (axis < 0) axis += (rank + 1);
 
-  auto vec = common::vectorize<int>(input_dims[0]);
-  vec.insert(vec.begin() + axis, static_cast<int>(input_dims.size()));
+  auto vec = common::vectorize<int64_t>(input_dims[0]);
+  vec.insert(vec.begin() + axis, static_cast<int64_t>(input_dims.size()));
   x_grad->set_dims(common::make_ddim(vec));
   x_grad->set_dtype(out_grad[0]->dtype());
 }
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index abf1823d67c86e..bbbed42e15f903 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -5962,7 +5962,7 @@ void UnStackInferMeta(const MetaTensor& x,
                           x_dim[axis],
                           num));
   }
-  auto vec = common::vectorize<int>(x_dim);
+  auto vec = common::vectorize<int64_t>(x_dim);
   vec.erase(vec.begin() + axis);
   for (size_t i = 0; i < output_count; i++) {
     outs[i]->set_dims(common::make_ddim(vec));
diff --git a/paddle/phi/kernels/funcs/stack_and_unstack.h b/paddle/phi/kernels/funcs/stack_and_unstack.h
index b0b7a983764acf..73d17c770a3ae7 100644
--- a/paddle/phi/kernels/funcs/stack_and_unstack.h
+++ b/paddle/phi/kernels/funcs/stack_and_unstack.h
@@ -210,7 +210,7 @@ void LaunchUnStackKernel(const Context& ctx,
   constexpr int kWarpSize = 32;
   constexpr int kMaxOut = 16;
 
-  int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
+  int64_t tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
   if (split_dim < kMaxOut) {
     tid_y = split_dim;
     tid_x =
   } else {
     tid_y = kMaxOut;
     tid_x = kWarpSize;
-    bid_y = backends::gpu::DivUp<int>(split_dim, kMaxOut);
+    bid_y = backends::gpu::DivUp<int64_t>(split_dim, kMaxOut);
   }
-  int tile_x_num = backends::gpu::DivUp<int>(out_row, tid_x);
-  bid_x = std::min(tile_x_num, backends::gpu::kMultiDimslimit);
+  int64_t tile_x_num = backends::gpu::DivUp<int64_t>(out_row, tid_x);
+  if (tile_x_num < static_cast<int64_t>(backends::gpu::kMultiDimslimit))
+    bid_x = tile_x_num;
+  else
+    bid_x = backends::gpu::kMultiDimslimit;
   dim3 blocks(tid_x, tid_y, 1);
   dim3 grids(bid_x, bid_y, 1);
diff --git a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
index 0576742e349a83..3546b91d66fc12 100644
--- a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
@@ -28,13 +28,13 @@ void UnStackGradKernel(const Context &dev_ctx,
                        DenseTensor *x_grad) {
   if (axis < 0) axis += (x[0]->dims().size() + 1);
 
-  int n = static_cast<int>(x.size());
+  int64_t n = static_cast<int64_t>(x.size());
   auto *x_grad_data = dev_ctx.template Alloc<T>(x_grad);
   std::vector<const T *> x_datas(n);
-  for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
+  for (int64_t i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
 
-  int pre = 1;
-  int post = 1;
+  int64_t pre = 1;
+  int64_t post = 1;
   auto &dim = x[0]->dims();
   for (auto i = 0; i < axis; ++i) pre *= dim[i];
   for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
@@ -56,8 +56,8 @@ void UnStackGradKernel(const Context &dev_ctx,
     size_t x_offset = 0;
     size_t y_offset = 0;
-    for (int i = 0; i < pre; i++) {
-      for (int j = 0; j < n; j++) {
+    for (int64_t i = 0; i < pre; i++) {
+      for (int64_t j = 0; j < n; j++) {
         std::memcpy(
             x_grad_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T));
         y_offset += post;
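
Reviewer note (not part of the patch): a minimal standalone sketch of the
overflow this change guards against; the shape below is illustrative, not
taken from the patch. Unstacking a tensor of shape [4, 1024, 1024, 1024]
along axis 0 gives n = 4, pre = 1, and post = 2^30 in UnStackGradKernel, so
the final flat copy offset n * pre * post = 2^32 does not fit in a 32-bit
int; the patch moves this arithmetic to int64_t.

#include <cstdint>
#include <iostream>

// Sketch of the offset arithmetic in UnStackGradKernel, using an
// illustrative shape of [4, 1024, 1024, 1024] unstacked along axis 0.
// The 32-bit product is computed in unsigned arithmetic to show the
// wrap-around without invoking signed-overflow UB.
int main() {
  const int64_t n = 4;                        // number of unstacked slices
  const int64_t pre = 1;                      // product of dims before axis
  const int64_t post = 1024LL * 1024 * 1024;  // product of dims from axis on

  const int64_t total64 = n * pre * post;     // exact: 4294967296 (2^32)
  const uint32_t total32 = static_cast<uint32_t>(n) *
                           static_cast<uint32_t>(pre) *
                           static_cast<uint32_t>(post);  // wraps to 0

  std::cout << "int64 offset: " << total64 << "\n";  // 4294967296
  std::cout << "int32 offset: " << total32 << "\n";  // 0
  return 0;
}

The same concern motivates the LaunchUnStackKernel change, where tile_x_num
is now computed in int64_t before being clamped to kMultiDimslimit for the
grid's x dimension.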