From d49890470813daa0ad2ea08cb165bc9940b4e73b Mon Sep 17 00:00:00 2001
From: Uper <41718895+Hyliu-BUAA@users.noreply.github.com>
Date: Sat, 23 Aug 2025 12:13:08 +0800
Subject: [PATCH 1/3] fix: add tail-case handling for elementwise_add_f16x8_pack_kernel to avoid out-of-bounds access

---
 kernels/elementwise/elementwise.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernels/elementwise/elementwise.cu b/kernels/elementwise/elementwise.cu
index 1cba70ce..4ddfe505 100644
--- a/kernels/elementwise/elementwise.cu
+++ b/kernels/elementwise/elementwise.cu
@@ -117,6 +117,10 @@ __global__ void elementwise_add_f16x8_pack_kernel(half *a, half *b, half *c,
   // reinterpret as float4 and store 128 bits in 1 memory issue.
   if ((idx + 7) < N) {
     LDST128BITS(c[idx]) = LDST128BITS(pack_c[0]);
+  } else {
+    for (int i=0; nx+i
[... text lost in extraction: the rest of this hunk and the "From <commit>" / "From: <author>" header lines of PATCH 2/3 ...]
Date: Sun, 24 Aug 2025 08:47:56 +0800
Subject: [PATCH 2/3] Rename nx to idx

---
 kernels/elementwise/elementwise.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernels/elementwise/elementwise.cu b/kernels/elementwise/elementwise.cu
index 4ddfe505..aa6503e4 100644
--- a/kernels/elementwise/elementwise.cu
+++ b/kernels/elementwise/elementwise.cu
@@ -119,7 +119,7 @@ __global__ void elementwise_add_f16x8_pack_kernel(half *a, half *b, half *c,
     LDST128BITS(c[idx]) = LDST128BITS(pack_c[0]);
   } else {
     for (int i=0; nx+i
[... text lost in extraction: the rest of this hunk and the "From <commit>" / "From: <author>" header lines of PATCH 3/3 ...]
Date: Sun, 24 Aug 2025 20:40:48 +0800
Subject: [PATCH 3/3] Fix bug: nx -> idx

---
 kernels/elementwise/elementwise.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernels/elementwise/elementwise.cu b/kernels/elementwise/elementwise.cu
index aa6503e4..80302a3e 100644
--- a/kernels/elementwise/elementwise.cu
+++ b/kernels/elementwise/elementwise.cu
@@ -118,7 +118,7 @@ __global__ void elementwise_add_f16x8_pack_kernel(half *a, half *b, half *c,
   if ((idx + 7) < N) {
     LDST128BITS(c[idx]) = LDST128BITS(pack_c[0]);
   } else {
-    for (int i=0; nx+i
[... text lost in extraction: the rest of this hunk ...]