@@ -31,18 +31,18 @@ __global__ void Padding(const paddle::platform::float16* d_out,
31
31
paddle::platform::float16* d_in) {
32
32
int64_t out_idx = threadIdx .x + blockDim .x * blockIdx .x ;
33
33
if (out_idx < n) {
34
+ int64_t out_idx_tmp = out_idx;
34
35
int coords[D] = {0 };
35
36
for (int i = D - 1 ; i >= 0 ; --i) {
36
- coords[i] = out_idx % out_dims[i];
37
- out_idx /= out_dims[i];
37
+ coords[i] = out_idx_tmp % out_dims[i];
38
+ out_idx_tmp /= out_dims[i];
38
39
coords[i] += offsets[i];
39
40
}
40
41
41
42
int64_t in_idx = 0 ;
42
- for (int i = 0 ; i < D - 1 ; ++i) {
43
- in_idx += coords[i] * in_dims[i + 1 ];
43
+ for (int i = 0 ; i < D; ++i) {
44
+ in_idx = in_idx * in_dims[i] + coords[i ];
44
45
}
45
- in_idx += coords[D - 1 ];
46
46
47
47
d_in[in_idx] = d_out[out_idx];
48
48
}
@@ -80,8 +80,8 @@ class SliceGradKernel<paddle::platform::CUDADeviceContext,
80
80
set_zero (dev_ctx, d_in, static_cast <paddle::platform::float16>(0 ));
81
81
82
82
int64_t numel = d_out->numel ();
83
- dim3 blocks ((numel - 1 ) / PADDLE_CUDA_NUM_THREADS + 1 , 1 , 1 );
84
- dim3 threads (PADDLE_CUDA_NUM_THREADS, 1 , 1 );
83
+ dim3 blocks ((numel - 1 ) / PADDLE_CUDA_NUM_THREADS + 1 );
84
+ dim3 threads (PADDLE_CUDA_NUM_THREADS);
85
85
auto stream = ctx.cuda_device_context ().stream ();
86
86
87
87
auto out_shape = framework::vectorize2int (out_dims);
0 commit comments