@@ -67,12 +67,8 @@ __global__ void ReduceSumWithSubtract(
     const T* x, const T* y, T* out, int64_t N, Functor func) {
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   MT sum_val(0.0);
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    sum_val += func(x[i], y[i]);
-  }
+  CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) { sum_val += func(x[i], y[i]); }
 
-  __syncthreads();
   sum_val = phi::funcs::BlockReduceSum<MT>(sum_val, FULL_MASK);
   if (threadIdx.x == 0) {
     out[blockIdx.x] = static_cast<T>(sum_val);
@@ -86,12 +82,10 @@ __global__ void ReduceMaxWithSubtract(const T* x,
                                       int64_t N) {
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   MT max_val = std::numeric_limits<MT>::min();
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
     max_val = max(max_val, abs(static_cast<MT>(x[i]) - static_cast<MT>(y[i])));
   }
 
-  __syncthreads();
   max_val = phi::funcs::BlockReduceMax<MT>(max_val, FULL_MASK);
   if (threadIdx.x == 0) {
     out[blockIdx.x] = static_cast<T>(max_val);
@@ -105,12 +99,10 @@ __global__ void ReduceMinWithSubtract(const T* x,
                                       int64_t N) {
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   MT min_val = std::numeric_limits<MT>::max();
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
     min_val = min(min_val, abs(static_cast<MT>(x[i]) - static_cast<MT>(y[i])));
   }
 
-  __syncthreads();
   min_val = phi::funcs::BlockReduceMin<MT>(min_val, FULL_MASK);
   if (threadIdx.x == 0) {
     out[blockIdx.x] = static_cast<T>(min_val);
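
Reviewer note on the replacement: `CUDA_KERNEL_LOOP_TYPE(i, N, int64_t)` is Paddle's grid-stride-loop macro, so the rewrite is behavior-preserving while switching the loop index from `int` to `int64_t`; the old `int i` could overflow once `N` exceeds `INT_MAX`. Below is a minimal sketch of the pattern the macro expands to. The `GRID_STRIDE_LOOP` name and the kernel around it are hypothetical stand-ins for illustration; the real macro lives in Paddle's GPU headers and may differ in detail.

```cuda
#include <cstdint>

// Hypothetical stand-in for CUDA_KERNEL_LOOP_TYPE: a grid-stride loop
// whose induction variable uses the caller-chosen index type.
#define GRID_STRIDE_LOOP(i, n, index_type)                            \
  for (index_type i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += static_cast<index_type>(blockDim.x) * gridDim.x)

template <typename T, typename Functor>
__global__ void ReduceSumSketch(
    const T* x, const T* y, T* out, int64_t n, Functor func) {
  T sum_val(0);
  // Each thread strides over the whole array, accumulating privately.
  GRID_STRIDE_LOOP(i, n, int64_t) { sum_val += func(x[i], y[i]); }
  // A block-level reduction of sum_val would follow here, as in the
  // real kernels (phi::funcs::BlockReduceSum and friends).
}
```

Dropping the `__syncthreads()` calls also looks safe: each thread only accumulates into a private register variable before the block reduce, so there is no shared-memory write to synchronize on, and the `BlockReduce*` helpers presumably synchronize internally where they need to.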