From 1980035aec2e0f046188aa063639156a27463e40 Mon Sep 17 00:00:00 2001
From: chen huangrun <1181749441@qq.com>
Date: Mon, 26 May 2025 13:16:15 +0800
Subject: [PATCH] fix unstack big tensor

Use int64_t instead of int for the shape and index arithmetic in the
unstack infermeta and kernel paths, so tensors with more than INT32_MAX
elements no longer overflow 32-bit indices.

---
 paddle/phi/infermeta/backward.cc                   |  4 ++--
 paddle/phi/infermeta/unary.cc                      |  2 +-
 paddle/phi/kernels/funcs/stack_and_unstack.h       | 11 +++++++----
 paddle/phi/kernels/impl/unstack_grad_kernel_impl.h | 12 ++++++------
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
index a7d368ea869b22..020a3c931c2d60 100644
--- a/paddle/phi/infermeta/backward.cc
+++ b/paddle/phi/infermeta/backward.cc
@@ -1747,8 +1747,8 @@ void UnStackGradInferMeta(const std::vector<const MetaTensor*>& out_grad,
                         rank));
   if (axis < 0) axis += (rank + 1);
 
-  auto vec = common::vectorize<int>(input_dims[0]);
-  vec.insert(vec.begin() + axis, static_cast<int>(input_dims.size()));
+  auto vec = common::vectorize<int64_t>(input_dims[0]);
+  vec.insert(vec.begin() + axis, static_cast<int64_t>(input_dims.size()));
   x_grad->set_dims(common::make_ddim(vec));
   x_grad->set_dtype(out_grad[0]->dtype());
 }
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index abf1823d67c86e..bbbed42e15f903 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -5962,7 +5962,7 @@ void UnStackInferMeta(const MetaTensor& x,
                           x_dim[axis],
                           num));
   }
-  auto vec = common::vectorize<int>(x_dim);
+  auto vec = common::vectorize<int64_t>(x_dim);
   vec.erase(vec.begin() + axis);
   for (size_t i = 0; i < output_count; i++) {
     outs[i]->set_dims(common::make_ddim(vec));
diff --git a/paddle/phi/kernels/funcs/stack_and_unstack.h b/paddle/phi/kernels/funcs/stack_and_unstack.h
index b0b7a983764acf..73d17c770a3ae7 100644
--- a/paddle/phi/kernels/funcs/stack_and_unstack.h
+++ b/paddle/phi/kernels/funcs/stack_and_unstack.h
@@ -210,7 +210,7 @@ void LaunchUnStackKernel(const Context& ctx,
   constexpr int kWarpSize = 32;
   constexpr int kMaxOut = 16;
 
-  int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
+  int64_t tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
   if (split_dim < kMaxOut) {
     tid_y = split_dim;
     tid_x =
   } else {
     tid_y = kMaxOut;
     tid_x = kWarpSize;
-    bid_y = backends::gpu::DivUp<int>(split_dim, kMaxOut);
+    bid_y = backends::gpu::DivUp<int64_t>(split_dim, kMaxOut);
   }
-  int tile_x_num = backends::gpu::DivUp<int>(out_row, tid_x);
-  bid_x = std::min(tile_x_num, backends::gpu::kMultiDimslimit);
+  int64_t tile_x_num = backends::gpu::DivUp<int64_t>(out_row, tid_x);
+  if (tile_x_num < static_cast<int64_t>(backends::gpu::kMultiDimslimit))
+    bid_x = tile_x_num;
+  else
+    bid_x = backends::gpu::kMultiDimslimit;
   dim3 blocks(tid_x, tid_y, 1);
   dim3 grids(bid_x, bid_y, 1);
diff --git a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
index 0576742e349a83..3546b91d66fc12 100644
--- a/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/unstack_grad_kernel_impl.h
@@ -28,13 +28,13 @@ void UnStackGradKernel(const Context &dev_ctx,
                        DenseTensor *x_grad) {
   if (axis < 0) axis += (x[0]->dims().size() + 1);
 
-  int n = static_cast<int>(x.size());
+  int64_t n = static_cast<int64_t>(x.size());
   auto *x_grad_data = dev_ctx.template Alloc<T>(x_grad);
   std::vector<const T *> x_datas(n);
-  for (int i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
+  for (int64_t i = 0; i < n; i++) x_datas[i] = x[i]->data<T>();
 
-  int pre = 1;
-  int post = 1;
+  int64_t pre = 1;
+  int64_t post = 1;
   auto &dim = x[0]->dims();
   for (auto i = 0; i < axis; ++i) pre *= dim[i];
   for (auto i = axis; i < dim.size(); ++i) post *= dim[i];
@@ -56,8 +56,8 @@ void UnStackGradKernel(const Context &dev_ctx,
     size_t x_offset = 0;
     size_t y_offset = 0;
-    for (int i = 0; i < pre; i++) {
-      for (int j = 0; j < n; j++) {
+    for (int64_t i = 0; i < pre; i++) {
+      for (int64_t j = 0; j < n; j++) {
         std::memcpy(
             x_grad_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T));
         y_offset += post;
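
Reviewer note (not part of the patch): a minimal standalone sketch of the
overflow this change guards against; the shape below is illustrative, not
taken from the patch. Unstacking a tensor of shape [4, 1024, 1024, 1024]
along axis 0 gives n = 4, pre = 1, and post = 2^30 in UnStackGradKernel, so
the final flat copy offset n * pre * post = 2^32 does not fit in a 32-bit
int; the patch moves this arithmetic to int64_t.

#include <cstdint>
#include <iostream>

// Sketch of the offset arithmetic in UnStackGradKernel, using an
// illustrative shape of [4, 1024, 1024, 1024] unstacked along axis 0.
// The 32-bit product is computed in unsigned arithmetic to show the
// wrap-around without invoking signed-overflow UB.
int main() {
  const int64_t n = 4;                        // number of unstacked slices
  const int64_t pre = 1;                      // product of dims before axis
  const int64_t post = 1024LL * 1024 * 1024;  // product of dims from axis on

  const int64_t total64 = n * pre * post;     // exact: 4294967296 (2^32)
  const uint32_t total32 = static_cast<uint32_t>(n) *
                           static_cast<uint32_t>(pre) *
                           static_cast<uint32_t>(post);  // wraps to 0

  std::cout << "int64 offset: " << total64 << "\n";  // 4294967296
  std::cout << "int32 offset: " << total32 << "\n";  // 0
  return 0;
}

The same concern motivates the LaunchUnStackKernel change, where tile_x_num
is now computed in int64_t before being clamped to kMultiDimslimit for the
grid's x dimension.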