@@ -24,22 +24,64 @@ namespace phi {
 
 using phi::PADDLE_CUDA_NUM_THREADS;
 
-inline int GET_BLOCKS(const int N) {
+inline int64_t GET_BLOCKS(const int64_t N) {
   return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
 }
 
+template <typename T>
+__global__ void KernelReduceMinMax(const T* input,
+                                   int64_t numel,
+                                   T* min_out,
+                                   T* max_out) {
+  __shared__ T smin[PADDLE_CUDA_NUM_THREADS];
+  __shared__ T smax[PADDLE_CUDA_NUM_THREADS];
+  int tid = threadIdx.x;
+  int64_t global_thread_id =
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
+
+  T local_min = std::numeric_limits<T>::max();
+  T local_max = std::numeric_limits<T>::lowest();
+
+  for (int64_t i = global_thread_id; i < numel; i += stride) {
+    T val = input[i];
+    local_min = min(local_min, val);
+    local_max = max(local_max, val);
+  }
+
+  smin[tid] = local_min;
+  smax[tid] = local_max;
+  __syncthreads();
+
+  for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
+    if (tid < offset) {
+      smin[tid] = min(smin[tid], smin[tid + offset]);
+      smax[tid] = max(smax[tid], smax[tid + offset]);
+    }
+    __syncthreads();
+  }
+
+  if (tid == 0) {
+    phi::CudaAtomicMin(min_out, smin[0]);
+    phi::CudaAtomicMax(max_out, smax[0]);
+  }
+}
+
 template <typename T, typename InputT, typename OutT>
 __global__ void KernelBincount(const InputT* input,
-                               const int total_elements,
+                               const int64_t total_elements,
                                const bool has_weights,
                                const T* weights,
                                OutT* output) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < total_elements) {
+  int64_t global_tid =
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
+  for (int64_t i = global_tid; i < total_elements; i += stride) {
+    InputT index = input[i];
     if (!has_weights) {
-      phi::CudaAtomicAdd(&output[input[tid]], 1L);
+      phi::CudaAtomicAdd(&output[index], 1L);
     } else {
-      phi::CudaAtomicAdd(&output[input[tid]], static_cast<OutT>(weights[tid]));
+      phi::CudaAtomicAdd(&output[index], static_cast<OutT>(weights[i]));
     }
   }
 }
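For readers outside the phi codebase, the new KernelReduceMinMax follows a standard three-stage pattern: a grid-stride loop folds elements into a per-thread min/max, a shared-memory tree reduction combines the per-thread results within each block, and thread 0 of each block folds the block result into global memory with atomics. Below is a minimal standalone sketch of the same pattern, illustrative only: it is specialized to int so the built-in atomicMin/atomicMax can stand in for phi::CudaAtomicMin/CudaAtomicMax, and kBlockThreads is an assumed substitute for PADDLE_CUDA_NUM_THREADS.

#include <climits>

// Assumed block size standing in for PADDLE_CUDA_NUM_THREADS; the tree
// reduction below expects blockDim.x to be a power of two and <= this value.
constexpr int kBlockThreads = 256;

__global__ void ReduceMinMaxSketch(const int* input,
                                   long long numel,
                                   int* min_out,
                                   int* max_out) {
  __shared__ int smin[kBlockThreads];
  __shared__ int smax[kBlockThreads];
  const int tid = threadIdx.x;

  // Grid-stride loop: each thread folds every (gridDim.x * blockDim.x)-th
  // element, so the launch does not need one thread per element.
  long long i = static_cast<long long>(blockIdx.x) * blockDim.x + tid;
  const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
  int local_min = INT_MAX;
  int local_max = INT_MIN;
  for (; i < numel; i += stride) {
    local_min = min(local_min, input[i]);
    local_max = max(local_max, input[i]);
  }

  // Tree reduction in shared memory.
  smin[tid] = local_min;
  smax[tid] = local_max;
  __syncthreads();
  for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
    if (tid < offset) {
      smin[tid] = min(smin[tid], smin[tid + offset]);
      smax[tid] = max(smax[tid], smax[tid + offset]);
    }
    __syncthreads();
  }

  // One atomic per block; min_out/max_out must be pre-seeded with
  // INT_MAX/INT_MIN sentinels by the host, just as the host code below seeds
  // its two-element buffer with std::numeric_limits values.
  if (tid == 0) {
    atomicMin(min_out, smin[0]);
    atomicMax(max_out, smax[0]);
  }
}

A launch of this sketch would use kBlockThreads threads per block and any grid size up to the device limit; the host-side changes in the next hunk show how the PR caps that grid size.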
@@ -48,39 +90,45 @@ template <typename Context, typename T, typename InputT>
 void BincountCUDAInner(const Context& dev_ctx,
                        const DenseTensor& x,
                        const paddle::optional<DenseTensor>& weights,
-                       int minlength,
+                       int64_t minlength,
                        DenseTensor* out) {
   const DenseTensor* input = &x;
   DenseTensor* output = out;
   const InputT* input_data = input->data<InputT>();
 
-  const int input_numel = input->numel();
+  int64_t input_numel = static_cast<int64_t>(input->numel());
 
   if (input_data == nullptr) {
     phi::DDim out_dim{0};
     output->Resize(out_dim);
     dev_ctx.template Alloc<T>(output);
     return;
   }
-  auto input_x = EigenVector<InputT>::Flatten(*input);
-  DenseTensor input_min_t, input_max_t;
-  input_max_t.Resize({1});
-  auto* input_max_data = dev_ctx.template Alloc<InputT>(&input_max_t);
-  input_min_t.Resize({1});
-  auto* input_min_data = dev_ctx.template Alloc<InputT>(&input_min_t);
 
-  auto input_max_scala = EigenScalar<InputT>::From(input_max_t);
-  auto input_min_scala = EigenScalar<InputT>::From(input_min_t);
+  DenseTensor input_min_max_cpu;
+  input_min_max_cpu.Resize({2});
+  auto* input_min_max_cpu_data =
+      dev_ctx.template HostAlloc<InputT>(&input_min_max_cpu);
+  input_min_max_cpu.data<InputT>()[0] = std::numeric_limits<InputT>::max();
+  input_min_max_cpu.data<InputT>()[1] = std::numeric_limits<InputT>::lowest();
+
+  DenseTensor input_min_max_t;
+  input_min_max_t.Resize({2});
+  auto* input_min_max_data = dev_ctx.template Alloc<InputT>(&input_min_max_t);
+
+  phi::Copy(
+      dev_ctx, input_min_max_cpu, dev_ctx.GetPlace(), true, &input_min_max_t);
 
-  auto* place = dev_ctx.eigen_device();
-  input_max_scala.device(*place) = input_x.maximum();
-  input_min_scala.device(*place) = input_x.minimum();
+  int64_t max_grid_x = dev_ctx.GetCUDAMaxGridDimSize()[0];
+  int64_t num_blocks = std::min(GET_BLOCKS(input_numel), max_grid_x);
+  KernelReduceMinMax<InputT>
+      <<<num_blocks, PADDLE_CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>(
+          input_data, input_numel, input_min_max_data, input_min_max_data + 1);
 
-  DenseTensor input_min_cpu, input_max_cpu;
-  phi::Copy(dev_ctx, input_min_t, phi::CPUPlace(), true, &input_min_cpu);
-  phi::Copy(dev_ctx, input_max_t, phi::CPUPlace(), true, &input_max_cpu);
+  phi::Copy(
+      dev_ctx, input_min_max_t, phi::CPUPlace(), true, &input_min_max_cpu);
 
-  InputT input_min = input_min_cpu.data<InputT>()[0];
+  InputT input_min = input_min_max_cpu.data<InputT>()[0];
 
   PADDLE_ENFORCE_GE(
       input_min,
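The num_blocks computation above is the piece that makes the grid-stride loops pay off: for very large tensors, GET_BLOCKS(input_numel) can exceed the device's maximum 1-D grid dimension, so the grid is capped at dev_ctx.GetCUDAMaxGridDimSize()[0] and each thread simply iterates more times. A rough equivalent using only the CUDA runtime API is sketched below; ClampedNumBlocks is a hypothetical helper for illustration, not part of the PR or of phi.

#include <algorithm>
#include <cstdint>

#include <cuda_runtime.h>

// Ceil-divide like GET_BLOCKS, then clamp to the device's maximum grid size
// in x, which plays the role of dev_ctx.GetCUDAMaxGridDimSize()[0].
inline int64_t ClampedNumBlocks(int64_t numel, int threads_per_block) {
  int device = 0;
  cudaGetDevice(&device);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  int64_t wanted = (numel + threads_per_block - 1) / threads_per_block;
  return std::min<int64_t>(wanted, prop.maxGridSize[0]);
}

Because both kernels advance by gridDim.x * blockDim.x, capping the grid changes only how many elements each thread processes, not the result.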
@@ -89,9 +137,9 @@ void BincountCUDAInner(const Context& dev_ctx,
           "The elements in input tensor must be non-negative ints"));
 
   int64_t output_size =
-      static_cast<int64_t>(input_max_cpu.data<InputT>()[0]) + 1L;
+      static_cast<int64_t>(input_min_max_cpu.data<InputT>()[1]) + 1L;
 
-  output_size = std::max(output_size, static_cast<int64_t>(minlength));
+  output_size = std::max(output_size, minlength);
   phi::DDim out_dim{output_size};
   output->Resize(out_dim);
 
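As a quick worked example of the sizing rule above: if the largest input value is 7 and minlength is 5, output_size becomes std::max(7 + 1, 5) = 8 bins; if minlength were 12, the histogram would instead be padded to 12 bins.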
@@ -106,7 +154,7 @@ void BincountCUDAInner(const Context& dev_ctx,
         dev_ctx, output, static_cast<int64_t>(0));
 
     KernelBincount<T, InputT, int64_t>
-        <<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        <<<num_blocks, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
             input_data, input_numel, has_weights, weights_data, output_data);
   } else {
     if (weights->dtype() == DataType::FLOAT32) {
@@ -115,14 +163,14 @@ void BincountCUDAInner(const Context& dev_ctx,
           dev_ctx, output, static_cast<float>(0));
 
       KernelBincount<T, InputT, float>
-          <<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+          <<<num_blocks, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
              input_data, input_numel, has_weights, weights_data, output_data);
     } else {
       double* output_data = dev_ctx.template Alloc<double>(output);
       phi::funcs::SetConstant<Context, double>()(
           dev_ctx, output, static_cast<double>(0));
       KernelBincount<T, InputT, double>
-          <<<GET_BLOCKS(input_numel), PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+          <<<num_blocks, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
              input_data, input_numel, has_weights, weights_data, output_data);
     }
   }
@@ -134,7 +182,7 @@ void BincountKernel(const Context& dev_ctx,
                     const paddle::optional<DenseTensor>& weights,
                     const Scalar& minlength,
                     DenseTensor* out) {
-  int int_minlength = minlength.to<int>();
+  int64_t int_minlength = minlength.to<int64_t>();
   PADDLE_ENFORCE_GE(int_minlength,
                     0,
                     common::errors::InvalidArgument(
0 commit comments