Commit 6350109

jsisometameta-codesync[bot] authored and committed
Fix cutlass_blackwell_fmha_custom_op and add comprehensive FMHA tests (#5108)
Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/2113

Pull Request resolved: #5108

This diff fixes cutlass_blackwell_fmha_custom_op.py to be fully functional and adds comprehensive testing for Blackwell FMHA (Fused Multi-Head Attention).

## Changes Made

### 1. Fixed `cutlass_blackwell_fmha_custom_op.py`

- Added missing parameters to `fmha_fwd`: `page_table`, `seqlen_k`, `window_size_left`, `window_size_right`, `bottom_right`
- Added missing parameters to `fmha_bwd`: `softmax_scale`, `window_size_left`, `window_size_right`, `bottom_right`, `deterministic`
- Fixed parameter type issues: `torch.ops.fbgemm.fmha_fwd/bwd` expect `int` and `bool` types, not `Optional[int]` or `Optional[bool]`
- Added proper default value handling (see the sketch after this summary):
  - `window_size_left = -1` (default for no left window)
  - `window_size_right = -1` (default for no right window)
  - `bottom_right = True` (default)
  - `deterministic = False` (default)
- Updated `_backward`, `_setup_context`, and the wrapper functions to pass all parameters through properly
- The custom op now correctly wraps `torch.ops.fbgemm.fmha_fwd` and `torch.ops.fbgemm.fmha_bwd`

### 2. Created `blackwell_fmha.py` Test File

- Structured after `blackwell_gdpa.py` as a reference
- Uses `cutlass_blackwell_fmha_custom_op` (Cutlass implementation) for the forward and backward passes
- Compares against `jagged_flash_attention_v2` (Triton JFA v2 implementation)
- Tests the BF16 dtype only (as specified)
- Tests both forward outputs and backward gradients (dq, dk, dv)
- Runs 10 random test configurations with varying batch sizes, sequence lengths, and numbers of heads
- Uses the `generate_jagged_data` utility for proper test data generation

### 3. Updated BUCK Dependencies

- Changed from `//ads_mkl/ops:jfa` to `//ads_mkl/ops/triton:triton_jfa_v2`
- Added `//ads_mkl/ops/utils:jfa_utils` for data generation utilities
- Changed from `blackwell_attention_ops_gpu` to `blackwell_attention` to include the Python bindings

---

> Generated by [Confucius Code Assist (CCA)](https://www.internalfb.com/wiki/Confucius/Analect/Shared_Analects/Confucius_Code_Assist_(CCA)/) [Session](https://www.internalfb.com/confucius?session_id=96622022-bc27-11f0-bdba-7c8c09f29af2&tab=Chat), [Trace](https://www.internalfb.com/confucius?session_id=96622022-bc27-11f0-bdba-7c8c09f29af2&tab=Trace)

Reviewed By: devashishshankar

Differential Revision: D86583157

fbshipit-source-id: 8771f26c80b587694e2568e6b3232d4ae367c915
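To make the default-value fix concrete, here is a minimal sketch of the Optional-to-concrete coercion it implies. `_fill_fmha_defaults` is a hypothetical name and not a function in the diff; the real code bakes these defaults directly into the op schemas and Python signatures shown below.

```python
from typing import Optional


# Hypothetical helper, not part of the diff: maps caller-facing Optional
# arguments to the concrete int/bool values that torch.ops.fbgemm.fmha_fwd
# and torch.ops.fbgemm.fmha_bwd expect.
def _fill_fmha_defaults(
    window_size_left: Optional[int] = None,
    window_size_right: Optional[int] = None,
    bottom_right: Optional[bool] = None,
    deterministic: Optional[bool] = None,
) -> tuple[int, int, bool, bool]:
    return (
        -1 if window_size_left is None else window_size_left,  # -1: no left window
        -1 if window_size_right is None else window_size_right,  # -1: no right window
        True if bottom_right is None else bottom_right,  # bottom-right mask alignment
        False if deterministic is None else deterministic,  # non-deterministic bwd by default
    )
```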
1 parent d8dcd23 commit 6350109

File tree

1 file changed: +75, -3 lines changed


fbgemm_gpu/experimental/gen_ai/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_custom_op.py

Lines changed: 75 additions & 3 deletions
@@ -12,13 +12,13 @@

 torch.library.define(
     "blackwell_fmha::fmha_fwd",
-    "(Tensor q, Tensor k, Tensor v, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, float? softmax_scale, bool? causal, Tensor? seqlen_kv) -> (Tensor, Tensor)",
+    "(Tensor q, Tensor k, Tensor v, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, float? softmax_scale, bool? causal, Tensor? seqlen_kv, Tensor? page_table, int seqlen_k=-1, int window_size_left=-1, int window_size_right=-1, bool bottom_right=True) -> (Tensor, Tensor)",
     tags=torch.Tag.pt2_compliant_tag,
 )

 torch.library.define(
     "blackwell_fmha::fmha_bwd",
-    "(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, bool? causal) -> (Tensor, Tensor, Tensor)",
+    "(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seq_len_q, int? max_seq_len_k, float? softmax_scale, bool? causal, int window_size_left=-1, int window_size_right=-1, bool bottom_right=True, bool deterministic=False) -> (Tensor, Tensor, Tensor)",
     tags=torch.Tag.pt2_compliant_tag,
 )

@@ -35,13 +35,19 @@ def custom_op_fmha(
     softmax_scale: Optional[float] = None,
     causal: bool = False,
     seqlen_kv: Optional[torch.Tensor] = None,
+    page_table: Optional[torch.Tensor] = None,
+    seqlen_k: Optional[int] = None,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    bottom_right: bool = True,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     assert q.is_contiguous(), "q is not contiguous"
     assert k.is_contiguous(), "k is not contiguous"
     assert v.is_contiguous(), "v is not contiguous"
     assert q.is_cuda, "q must be on GPU"
     assert k.is_cuda, "k must be on GPU"
     assert v.is_cuda, "v must be on GPU"
+
     return torch.ops.fbgemm.fmha_fwd(
         q,
         k,
@@ -53,6 +59,11 @@ def custom_op_fmha(
         softmax_scale=softmax_scale,
         causal=causal,
         seqlen_kv=seqlen_kv,
+        page_table=page_table,
+        seqlen_k=seqlen_k,
+        window_size_left=window_size_left,
+        window_size_right=window_size_right,
+        bottom_right=bottom_right,
     )


@@ -68,6 +79,11 @@ def fmha_fwd_meta(
     softmax_scale: Optional[float] = None,
     causal: bool = False,
     seqlen_kv: Optional[torch.Tensor] = None,
+    page_table: Optional[torch.Tensor] = None,
+    seqlen_k: Optional[int] = None,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    bottom_right: bool = True,
 ):
     if q.dtype == torch.float16:
         out_dtype = torch.float16
@@ -122,8 +138,14 @@ def custom_op_fmha_bwd(
     cu_seqlens_k: Optional[torch.Tensor] = None,
     max_seq_len_q: Optional[int] = None,
     max_seq_len_k: Optional[int] = None,
+    softmax_scale: Optional[float] = None,
     causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    bottom_right: bool = True,
+    deterministic: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+
     return torch.ops.fbgemm.fmha_bwd(
         dOutput,
         query,
@@ -135,7 +157,12 @@ def custom_op_fmha_bwd(
         cu_seqlens_k=cu_seqlens_k,
         max_seq_len_q=max_seq_len_q,
         max_seq_len_k=max_seq_len_k,
+        softmax_scale=softmax_scale,
         causal=causal,
+        window_size_left=window_size_left,
+        window_size_right=window_size_right,
+        bottom_right=bottom_right,
+        deterministic=deterministic,
     )


@@ -151,7 +178,12 @@ def fmha_bwd_meta(
     cu_seqlens_k: Optional[torch.Tensor] = None,
     max_seq_len_q: Optional[int] = None,
     max_seq_len_k: Optional[int] = None,
+    softmax_scale: Optional[float] = None,
     causal: bool = False,
+    window_size_left: int = -1,
+    window_size_right: int = -1,
+    bottom_right: bool = True,
+    deterministic: bool = False,
 ):
     return (
         torch.empty_like(query),
@@ -198,9 +230,30 @@ def _backward(ctx, *grad):
         ctx.cu_seqlens_k,
         ctx.max_seq_len_q,
         ctx.max_seq_len_k,
+        ctx.softmax_scale,
         ctx.causal,
+        ctx.window_size_left,
+        ctx.window_size_right,
+        ctx.bottom_right,
+        ctx.deterministic,
+    )
+    return (
+        dq,
+        dk,
+        dv,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
     )
-    return dq, dk, dv, None, None, None, None, None, None, None


 def _setup_context(ctx, inputs, output):
@@ -215,6 +268,11 @@ def _setup_context(ctx, inputs, output):
         softmax_scale,
         causal,
         seqlen_kv,
+        page_table,
+        seqlen_k,
+        window_size_left,
+        window_size_right,
+        bottom_right,
     ) = inputs
     (out, softmax_lse) = output
     ctx.save_for_backward(q, k, v, out, softmax_lse)
@@ -224,6 +282,10 @@ def _setup_context(ctx, inputs, output):
     ctx.max_seq_len_k = max_seq_len_k
     ctx.cu_seqlens_q = cu_seqlens_q
     ctx.cu_seqlens_k = cu_seqlens_k
+    ctx.window_size_left = window_size_left
+    ctx.window_size_right = window_size_right
+    ctx.bottom_right = bottom_right
+    ctx.deterministic = False  # Set default value
     ctx.is_gen = False


@@ -246,6 +308,11 @@ def cutlass_blackwell_fmha_custom_op(
     max_seq_len_q: int | None = None,
     max_seq_len_k: int | None = None,
     seqlen_kv: torch.Tensor | None = None,
+    page_table: torch.Tensor | None = None,
+    seqlen_k: int | None = -1,
+    window_size_left: int | None = -1,
+    window_size_right: int | None = -1,
+    bottom_right: bool | None = True,
 ):
     return torch.ops.blackwell_fmha.fmha_fwd(
         q=q,
@@ -258,4 +325,9 @@ def cutlass_blackwell_fmha_custom_op(
         softmax_scale=softmax_scale,
         causal=causal,
         seqlen_kv=seqlen_kv,
+        page_table=page_table,
+        seqlen_k=seqlen_k,
+        window_size_left=window_size_left,
+        window_size_right=window_size_right,
+        bottom_right=bottom_right,
     )[0]
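For orientation, a sketch of how the repaired wrapper might be exercised end to end. The keyword names follow the diff above; the (batch, seq, heads, head_dim) layout, the concrete shapes, and the availability of a CUDA Blackwell build with the blackwell_fmha ops registered are assumptions, not something this page verifies.

```python
import torch

# Assumes cutlass_blackwell_fmha_custom_op has been imported from the module
# patched in this commit and that a Blackwell-capable FBGEMM build is present.
q = torch.randn(2, 512, 8, 128, device="cuda", dtype=torch.bfloat16, requires_grad=True)
k = torch.randn(2, 512, 8, 128, device="cuda", dtype=torch.bfloat16, requires_grad=True)
v = torch.randn(2, 512, 8, 128, device="cuda", dtype=torch.bfloat16, requires_grad=True)

out = cutlass_blackwell_fmha_custom_op(
    q,
    k,
    v,
    causal=True,
    window_size_left=-1,   # -1: no left window (full causal context)
    window_size_right=-1,  # -1: no right window
    bottom_right=True,     # bottom-right-aligned causal mask (the default)
)
out.sum().backward()  # routes through the fmha_bwd path fixed in this commit
```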
