
Commit 4a03621

[PHI] Fix bincount kernel for big tensor (#72706)
* fix bincount kernel for big tensor
* use HostAlloc to alloc memory
* add cpu test case
1 parent 0df410c commit 4a03621
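The failure mode behind the fix: once a tensor holds more than 2^31 - 1 elements, 32-bit thread indices and element counts overflow. A minimal standalone sketch of the int64 grid-stride pattern the patch adopts (illustrative names, not Paddle's actual kernel):

// Sketch only: with a plain int, blockIdx.x * blockDim.x + threadIdx.x
// wraps past 2^31 - 1, and a single launch cannot cover a huge tensor
// anyway; a 64-bit index plus a grid-stride loop avoids both problems.
__global__ void count_kernel(const int64_t* input,
                             int64_t numel,
                             unsigned long long* bins) {
  int64_t tid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (int64_t i = tid; i < numel; i += stride) {
    atomicAdd(&bins[input[i]], 1ULL);  // one increment per occurrence
  }
}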

3 files changed (+92 additions, -32 deletions)

paddle/phi/kernels/cpu/bincount_kernel.cc

Lines changed: 3 additions & 3 deletions

@@ -24,7 +24,7 @@ template <typename Context, typename T, typename InputT>
 void BincountInner(const Context& dev_ctx,
                    const DenseTensor& x,
                    const paddle::optional<DenseTensor>& weights,
-                   int minlength,
+                   int64_t minlength,
                    DenseTensor* out) {
   const DenseTensor* input = &x;
   DenseTensor* output = out;
@@ -48,7 +48,7 @@ void BincountInner(const Context& dev_ctx,
   int64_t output_size = static_cast<int64_t>(*std::max_element(
                             input_data, input_data + input_numel)) +
                         1L;
-  output_size = std::max(output_size, static_cast<int64_t>(minlength));
+  output_size = std::max(output_size, minlength);
 
   phi::DDim out_dim{output_size};
   output->Resize(out_dim);
@@ -89,7 +89,7 @@ void BincountKernel(const Context& dev_ctx,
                     const paddle::optional<DenseTensor>& weights,
                     const Scalar& minlength,
                     DenseTensor* out) {
-  int int_minlength = minlength.to<int>();
+  int64_t int_minlength = minlength.to<int64_t>();
   PADDLE_ENFORCE_GE(int_minlength,
                     0,
                     common::errors::InvalidArgument(
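The CPU change, seen in isolation: minlength now stays 64-bit from the Scalar conversion through the std::max, so a minlength above INT_MAX is honored rather than truncated. A hedged sketch of the sizing logic (BincountOutputSize is an illustrative name, not a Paddle function):

#include <algorithm>
#include <cstdint>

int64_t BincountOutputSize(const int64_t* data, int64_t numel,
                           int64_t minlength) {
  // The output covers [0, max_element] and at least minlength bins.
  int64_t size = *std::max_element(data, data + numel) + 1;
  return std::max(size, minlength);  // both int64_t: no narrowing cast
}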

paddle/phi/kernels/gpu/bincount_kernel.cu

Lines changed: 77 additions & 29 deletions

@@ -24,22 +24,64 @@ namespace phi {
 
 using phi::PADDLE_CUDA_NUM_THREADS;
 
-inline int GET_BLOCKS(const int N) {
+inline int64_t GET_BLOCKS(const int64_t N) {
   return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
 }
 
+template <typename T>
+__global__ void KernelReduceMinMax(const T* input,
+                                   int64_t numel,
+                                   T* min_out,
+                                   T* max_out) {
+  __shared__ T smin[PADDLE_CUDA_NUM_THREADS];
+  __shared__ T smax[PADDLE_CUDA_NUM_THREADS];
+  int tid = threadIdx.x;
+  int64_t global_thread_id =
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
+
+  T local_min = std::numeric_limits<T>::max();
+  T local_max = std::numeric_limits<T>::lowest();
+
+  for (int64_t i = global_thread_id; i < numel; i += stride) {
+    T val = input[i];
+    local_min = min(local_min, val);
+    local_max = max(local_max, val);
+  }
+
+  smin[tid] = local_min;
+  smax[tid] = local_max;
+  __syncthreads();
+
+  for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
+    if (tid < offset) {
+      smin[tid] = min(smin[tid], smin[tid + offset]);
+      smax[tid] = max(smax[tid], smax[tid + offset]);
+    }
+    __syncthreads();
+  }
+
+  if (tid == 0) {
+    phi::CudaAtomicMin(min_out, smin[0]);
+    phi::CudaAtomicMax(max_out, smax[0]);
+  }
+}
+
 template <typename T, typename InputT, typename OutT>
 __global__ void KernelBincount(const InputT* input,
-                               const int total_elements,
+                               const int64_t total_elements,
                                const bool has_weights,
                                const T* weights,
                                OutT* output) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < total_elements) {
+  int64_t global_tid =
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
+  for (int64_t i = global_tid; i < total_elements; i += stride) {
+    InputT index = input[i];
     if (!has_weights) {
-      phi::CudaAtomicAdd(&output[input[tid]], 1L);
+      phi::CudaAtomicAdd(&output[index], 1L);
     } else {
-      phi::CudaAtomicAdd(&output[input[tid]], static_cast<OutT>(weights[tid]));
+      phi::CudaAtomicAdd(&output[index], static_cast<OutT>(weights[i]));
     }
   }
 }
@@ -48,39 +90,45 @@ template <typename Context, typename T, typename InputT>
 void BincountCUDAInner(const Context& dev_ctx,
                        const DenseTensor& x,
                        const paddle::optional<DenseTensor>& weights,
-                       int minlength,
+                       int64_t minlength,
                        DenseTensor* out) {
   const DenseTensor* input = &x;
   DenseTensor* output = out;
   const InputT* input_data = input->data<InputT>();
 
-  const int input_numel = input->numel();
+  int64_t input_numel = static_cast<int64_t>(input->numel());
 
   if (input_data == nullptr) {
     phi::DDim out_dim{0};
     output->Resize(out_dim);
     dev_ctx.template Alloc<T>(output);
     return;
   }
-  auto input_x = EigenVector<InputT>::Flatten(*input);
-  DenseTensor input_min_t, input_max_t;
-  input_max_t.Resize({1});
-  auto* input_max_data = dev_ctx.template Alloc<InputT>(&input_max_t);
-  input_min_t.Resize({1});
-  auto* input_min_data = dev_ctx.template Alloc<InputT>(&input_min_t);
 
-  auto input_max_scala = EigenScalar<InputT>::From(input_max_t);
-  auto input_min_scala = EigenScalar<InputT>::From(input_min_t);
+  DenseTensor input_min_max_cpu;
+  input_min_max_cpu.Resize({2});
+  auto* input_min_max_cpu_data =
+      dev_ctx.template HostAlloc<InputT>(&input_min_max_cpu);
+  input_min_max_cpu.data<InputT>()[0] = std::numeric_limits<InputT>::max();
+  input_min_max_cpu.data<InputT>()[1] = std::numeric_limits<InputT>::lowest();
+
+  DenseTensor input_min_max_t;
+  input_min_max_t.Resize({2});
+  auto* input_min_max_data = dev_ctx.template Alloc<InputT>(&input_min_max_t);
+
+  phi::Copy(
+      dev_ctx, input_min_max_cpu, dev_ctx.GetPlace(), true, &input_min_max_t);
 
-  auto* place = dev_ctx.eigen_device();
-  input_max_scala.device(*place) = input_x.maximum();
-  input_min_scala.device(*place) = input_x.minimum();
+  int64_t max_grid_x = dev_ctx.GetCUDAMaxGridDimSize()[0];
+  int64_t num_blocks = std::min(GET_BLOCKS(input_numel), max_grid_x);
+  KernelReduceMinMax<InputT>
+      <<<num_blocks, PADDLE_CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>(
+          input_data, input_numel, input_min_max_data, input_min_max_data + 1);
 
-  DenseTensor input_min_cpu, input_max_cpu;
-  phi::Copy(dev_ctx, input_min_t, phi::CPUPlace(), true, &input_min_cpu);
-  phi::Copy(dev_ctx, input_max_t, phi::CPUPlace(), true, &input_max_cpu);
+  phi::Copy(
+      dev_ctx, input_min_max_t, phi::CPUPlace(), true, &input_min_max_cpu);
 
-  InputT input_min = input_min_cpu.data<InputT>()[0];
+  InputT input_min = input_min_max_cpu.data<InputT>()[0];
 
   PADDLE_ENFORCE_GE(
       input_min,
@@ -89,9 +137,9 @@ void BincountCUDAInner(const Context& dev_ctx,
           "The elements in input tensor must be non-negative ints"));
 
   int64_t output_size =
-      static_cast<int64_t>(input_max_cpu.data<InputT>()[0]) + 1L;
+      static_cast<int64_t>(input_min_max_cpu.data<InputT>()[1]) + 1L;
 
-  output_size = std::max(output_size, static_cast<int64_t>(minlength));
+  output_size = std::max(output_size, minlength);
   phi::DDim out_dim{output_size};
   output->Resize(out_dim);
 
@@ -106,7 +154,7 @@ void BincountCUDAInner(const Context& dev_ctx,
         dev_ctx, output, static_cast<int64_t>(0));
 
     KernelBincount<T, InputT, int64_t>
-        <<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        <<<num_blocks, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
            input_data, input_numel, has_weights, weights_data, output_data);
  } else {
    if (weights->dtype() == DataType::FLOAT32) {
@@ -115,14 +163,14 @@ void BincountCUDAInner(const Context& dev_ctx,
          dev_ctx, output, static_cast<float>(0));
 
      KernelBincount<T, InputT, float>
-          <<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+          <<<num_blocks, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
              input_data, input_numel, has_weights, weights_data, output_data);
    } else {
      double* output_data = dev_ctx.template Alloc<double>(output);
      phi::funcs::SetConstant<Context, double>()(
          dev_ctx, output, static_cast<double>(0));
      KernelBincount<T, InputT, double>
-          <<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+          <<<num_blocks, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
              input_data, input_numel, has_weights, weights_data, output_data);
    }
  }
@@ -134,7 +182,7 @@ void BincountKernel(const Context& dev_ctx,
                     const paddle::optional<DenseTensor>& weights,
                     const Scalar& minlength,
                     DenseTensor* out) {
-  int int_minlength = minlength.to<int>();
+  int64_t int_minlength = minlength.to<int64_t>();
   PADDLE_ENFORCE_GE(int_minlength,
                     0,
                     common::errors::InvalidArgument(
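A design note on the launches above: the block count is clamped to the device's maximum x-grid dimension, and the grid-stride loops inside both kernels cover whatever a capped launch cannot reach in one pass. A standalone sketch of that computation (kThreads stands in for PADDLE_CUDA_NUM_THREADS; the value 512 is an assumption):

#include <algorithm>
#include <cstdint>

constexpr int64_t kThreads = 512;  // assumed stand-in for PADDLE_CUDA_NUM_THREADS

int64_t NumBlocks(int64_t numel, int64_t max_grid_x) {
  int64_t blocks = (numel + kThreads - 1) / kThreads;  // ceiling division
  // Clamp to the hardware grid limit; the kernels' grid-stride loops then
  // iterate until every element has been processed.
  return std::min(blocks, max_grid_x);
}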

test/legacy_test/test_bincount_op.py

Lines changed: 12 additions & 0 deletions

@@ -71,6 +71,18 @@ def test_dygraph(self):
             msg='bincount output is wrong, out =' + str(actual.numpy()),
         )
 
+    def test_dygraph_cpu(self):
+        with base.dygraph.guard():
+            paddle.device.set_device('cpu')
+            inputs_np = np.array([0, 1, 1, 3, 2, 1, 7]).astype(np.int64)
+            inputs = paddle.to_tensor(inputs_np)
+            actual = paddle.bincount(inputs)
+            expected = np.bincount(inputs)
+            self.assertTrue(
+                (actual.numpy() == expected).all(),
+                msg='bincount output is wrong, out =' + str(actual.numpy()),
+            )
+
 
 class TestBincountOpError(unittest.TestCase):
     """Test bincount op error."""
