We read every piece of feedback and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 0d307e2, commit 38708d4 (Copy full SHA for 38708d4)
paddle/phi/kernels/fusion/gpu/block_attn.h
@@ -892,7 +892,9 @@ __global__ __launch_bounds__(THREADS_PER_BLOCK) void gqa_block_attention_kernel(
892
float qk_maxs[GQA_SUB_PARTITION_SIZE];
893
#pragma unroll
894
for (int i = 0; i < GQA_SUB_PARTITION_SIZE; i++) {
895
- qk_maxs[i] = -FLT_MAX;
+ // qk_maxs[i] = -FLT_MAX;
896
+ // initialize qk_maxs!!!
897
+ qk_maxs[i] = qk_smem[act_time_step * GQA_SUB_PARTITION_SIZE + i];
898
}
899
900
// threads in one block can process 'K_PER_ITER' keys
0 commit comments