Skip to content

Commit fd38ab4

Browse files
Fix uncoalesced global memory access in decode attention bf16 kernel (#5109)
Summary: X-link: facebookresearch/FBGEMM#2114 Issue reported in ncu profile {F1983281351} Reviewed By: Aya-ZIbra Differential Revision: D85631783
1 parent a0c6f1a commit fd38ab4

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/sm100_fmha_load_cpasync_warpspecialized.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,12 @@ struct Sm100FmhaLoadCpAsyncWarpspecialized {
171171
auto tSgQ = thr_mma_qk.partition_A(gQ);
172172
auto tScQ = thr_mma_qk.partition_A(cQ);
173173

174-
auto atom_q_tv = Layout<Shape<Shape<_2, _32>, _16>, Stride<Stride<_16, _32>, _1>>{};
175-
auto atom_kv_tv = Layout<Shape<Shape<_2, _32>, _16>, Stride<Stride<_16, _32>, _1>>{};
174+
// Each cp.async copy atom is 16-bytes uint128_t. So we adjust the number of
175+
// elements in atom's TV layout accordingly to match Element dtype.
176+
// This avoids uncoalesced gmem access according to ncu.
177+
using ElemPerAtom = cute::Int<sizeof(uint128_t) / sizeof(Element)>;
178+
auto atom_q_tv = Layout<Shape<Shape<_2, _32>, ElemPerAtom>, Stride<Stride<ElemPerAtom, decltype(_2{} * ElemPerAtom{})>, _1>>{};
179+
auto atom_kv_tv = Layout<Shape<Shape<_2, _32>, ElemPerAtom>, Stride<Stride<ElemPerAtom, decltype(_2{} * ElemPerAtom{})>, _1>>{};
176180

177181
auto tiled_copy_q = make_cotiled_copy(
178182
Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<uint128_t>, Element>{},

0 commit comments

Comments
 (0)