Commit e61d724

Author: Yibing Liu
Fix the bug in fp16 backward kernel (#16266)
test=release/1.3
1 parent c56d902 commit e61d724

File tree: 2 files changed (+33, -7 lines)


paddle/fluid/operators/slice_op.cu (+7, -7)
@@ -31,18 +31,18 @@ __global__ void Padding(const paddle::platform::float16* d_out,
                         paddle::platform::float16* d_in) {
   int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
   if (out_idx < n) {
+    int64_t out_idx_tmp = out_idx;
     int coords[D] = {0};
     for (int i = D - 1; i >= 0; --i) {
-      coords[i] = out_idx % out_dims[i];
-      out_idx /= out_dims[i];
+      coords[i] = out_idx_tmp % out_dims[i];
+      out_idx_tmp /= out_dims[i];
       coords[i] += offsets[i];
     }

     int64_t in_idx = 0;
-    for (int i = 0; i < D - 1; ++i) {
-      in_idx += coords[i] * in_dims[i + 1];
+    for (int i = 0; i < D; ++i) {
+      in_idx = in_idx * in_dims[i] + coords[i];
     }
-    in_idx += coords[D - 1];

     d_in[in_idx] = d_out[out_idx];
   }
@@ -80,8 +80,8 @@ class SliceGradKernel<paddle::platform::CUDADeviceContext,
     set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));

     int64_t numel = d_out->numel();
-    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1);
-    dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1);
+    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
+    dim3 threads(PADDLE_CUDA_NUM_THREADS);
     auto stream = ctx.cuda_device_context().stream();

     auto out_shape = framework::vectorize2int(out_dims);
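
The two hunks above fix two related problems in the Padding kernel. First, the old coordinate-decoding loop divided out_idx itself down to zero, so the final read d_out[out_idx] fetched d_out[0] for every thread; the fix decodes from a temporary out_idx_tmp instead. Second, the old linearization in_idx += coords[i] * in_dims[i + 1] uses a single dimension as the stride, which is only correct for rank-2 tensors; the fix rebuilds in_idx with the standard Horner-style row-major flattening. The Python sketch below replays that index arithmetic on the host for one element; it is an illustration only, not code from this commit, and the helper name index_math is made up.

import numpy as np

def index_math(out_idx, out_dims, in_dims, offsets):
    # Decode the flat output index into row-major coordinates, working on a
    # copy so out_idx itself stays valid for reading d_out (the first bug).
    out_idx_tmp = out_idx
    coords = [0] * len(out_dims)
    for i in reversed(range(len(out_dims))):
        coords[i] = out_idx_tmp % out_dims[i] + offsets[i]
        out_idx_tmp //= out_dims[i]

    # New linearization: Horner-style row-major flattening over in_dims.
    in_idx_new = 0
    for i in range(len(in_dims)):
        in_idx_new = in_idx_new * in_dims[i] + coords[i]

    # Old linearization: treats one dimension as the whole trailing stride,
    # so it disagrees with numpy once the rank exceeds 2 (the second bug).
    in_idx_old = sum(coords[i] * in_dims[i + 1] for i in range(len(in_dims) - 1))
    in_idx_old += coords[-1]
    return in_idx_new, in_idx_old

# Rank-3 case matching the new test: input (3, 4, 5) sliced to (3, 1, 5) on axis 1.
in_dims, out_dims, offsets = [3, 4, 5], [3, 1, 5], [0, 0, 0]
new, old = index_math(7, out_dims, in_dims, offsets)  # out_idx 7 decodes to (1, 0, 2)
print(new, int(np.ravel_multi_index((1, 0, 2), in_dims)), old)  # -> 22 22 6

For this rank-3 shape the old formula lands on index 6 instead of 22, which is why the fp16 slice gradient came out wrong for inputs with more than two dimensions.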

python/paddle/fluid/tests/unittests/test_slice_op.py (+26)
@@ -87,5 +87,31 @@ def test_check_grad_normal(self):
             place, ['Input'], 'Out', max_relative_error=0.006)


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestFP16_2(TestSliceOp):
+    def config(self):
+        self.dtype = "float16"
+        self.input = np.random.random([3, 4, 5]).astype(self.dtype)
+        self.starts = [0]
+        self.ends = [1]
+        self.axes = [1]
+        self.out = self.input[:, 0:1, :]
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place, atol=1e-5)
+
+    def test_check_grad_normal(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Out',
+                max_relative_error=0.006,
+                numeric_grad_delta=0.5)
+
+
 if __name__ == '__main__':
     unittest.main()
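
The new TestFP16_2 case slices the middle axis of a rank-3 fp16 tensor and checks both the output and the gradient on a CUDA place. Conceptually, the backward of slice zero-pads the upstream gradient back to the input's shape, which is what the Padding kernel computes on the GPU. The numpy sketch below shows that relationship for the same configuration as the test; it is an illustration only, not part of the commit.

import numpy as np

# Same configuration as TestFP16_2: input (3, 4, 5), slice axis 1 to [0:1).
x = np.random.random([3, 4, 5]).astype("float16")
out = x[:, 0:1, :]

# Backward of slice: scatter d_out into a zero-initialized tensor of x's shape,
# i.e. zero-pad the sliced gradient back to the input layout.
d_out = np.ones_like(out)
d_in = np.zeros_like(x)
d_in[:, 0:1, :] = d_out

assert d_in.shape == x.shape
assert d_in.sum() == d_out.size  # only the sliced region receives gradient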
