Bug fix: fix duplicate launch in POD (#1267)

Edenzzzz · web-flow · commit 5502828fa1ec · 2025-07-26T21:40:04.000-07:00
## 📌 Description A duplicate kernel launch was mistakenly added in the previous change (it was actually inserted by Cursor, but I should have checked more closely 😂); this PR removes it. It also passes `enable_pdl` to the `VariableLengthMergeStates` and `VariableLengthAttentionSum` calls in the same function. cc @yzh119 ## 🔍 Related Issues  ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [ ] I have installed the hooks with `pre-commit install`. - [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [ ] Tests have been added or updated as needed. - [ ] All tests are passing (`unittest`, etc.). ## Reviewer Notes
diff --git a/include/flashinfer/attention/pod.cuh b/include/flashinfer/attention/pod.cuh
@@ -438,8 +438,6 @@ cudaError_t PODWithKVCacheTensorDispatched(PrefillParams prefill_params,
               FLASHINFER_CUDA_CALL(
                   cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
             }
-            FLASHINFER_CUDA_CALL(
-                cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
 
             // Post-kernel stuff for split-kv prefill
             if (!(num_chunks <= 1 || tmp_p == nullptr)) {
@@ -457,11 +455,11 @@ cudaError_t PODWithKVCacheTensorDispatched(PrefillParams prefill_params,
                 FLASHINFER_CUDA_CALL(VariableLengthMergeStates(
                     tmp_v, tmp_s, decode_params.merge_indptr, o_d, lse_d,
                     decode_params.max_total_num_rows, decode_params.total_num_rows, num_qo_heads,
-                    HEAD_DIM_VO, stream));
+                    HEAD_DIM_VO, enable_pdl, stream));
               } else {
                 FLASHINFER_CUDA_CALL(VariableLengthAttentionSum(
                     tmp_v, decode_params.merge_indptr, o_d, decode_params.max_total_num_rows,
-                    decode_params.total_num_rows, num_qo_heads, HEAD_DIM_VO, stream));
+                    decode_params.total_num_rows, num_qo_heads, HEAD_DIM_VO, enable_pdl, stream));
               }
             }
           }

Original file line number	Diff line number	Diff line change
`@@ -438,8 +438,6 @@ cudaError_t PODWithKVCacheTensorDispatched(PrefillParams prefill_params,`
`438`	`438`	`FLASHINFER_CUDA_CALL(`
`439`	`439`	`cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));`
`440`	`440`	`}`
`441`		`- FLASHINFER_CUDA_CALL(`
`442`		`- cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));`
`443`	`441`
`444`	`442`	`// Post-kernel stuff for split-kv prefill`
`445`	`443`	`if (!(num_chunks <= 1 \|\| tmp_p == nullptr)) {`
`@@ -457,11 +455,11 @@ cudaError_t PODWithKVCacheTensorDispatched(PrefillParams prefill_params,`
`457`	`455`	`FLASHINFER_CUDA_CALL(VariableLengthMergeStates(`
`458`	`456`	`tmp_v, tmp_s, decode_params.merge_indptr, o_d, lse_d,`
`459`	`457`	`decode_params.max_total_num_rows, decode_params.total_num_rows, num_qo_heads,`
`460`		`- HEAD_DIM_VO, stream));`
	`458`	`+ HEAD_DIM_VO, enable_pdl, stream));`
`461`	`459`	`} else {`
`462`	`460`	`FLASHINFER_CUDA_CALL(VariableLengthAttentionSum(`
`463`	`461`	`tmp_v, decode_params.merge_indptr, o_d, decode_params.max_total_num_rows,`
`464`		`- decode_params.total_num_rows, num_qo_heads, HEAD_DIM_VO, stream));`
	`462`	`+ decode_params.total_num_rows, num_qo_heads, HEAD_DIM_VO, enable_pdl, stream));`
`465`	`463`	`}`
`466`	`464`	`}`
`467`	`465`	`}`