 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

 import warnings
-from typing import Optional, Union, Tuple
+from typing import Optional, Tuple, Union

 import torch
 import triton
@@ -130,8 +130,8 @@ def parallel_nsa_kernel_topk(
     o_i = tl.zeros([BC], dtype=tl.int32)
     m_i = tl.arange(0, BC) < BC // 2

-    IC = (i_t + Q_OFFSET) // BS # Idx of the current query block
-    for i_c in range(0, IC + 1, BC): # +1, because the current block might be also included
+    IC = (i_t + Q_OFFSET) // BS  # Idx of the current query block
+    for i_c in range(0, IC + 1, BC):  # +1, because the current block might be also included
         o_c = i_c + tl.arange(0, BC)
         # Recall k: [B, TC, H, K], boc = i_b * TC
         # we first shift to k[i_b, 0, i_h], and read a block of transposed keys from k[i_b, i_c, i_h]
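For orientation, below is a minimal PyTorch reference of what the top-k selection kernel touched in this hunk computes: each query is scored against the compressed (one-per-block) keys, non-causal blocks are masked, and the `S` best block indices are kept per query and KV head. All names are illustrative; GQA group heads are pooled by a plain sum, and the real kernel's forced inclusion of the current/initial blocks and its tie handling are omitted.

import torch

def topk_blocks_ref(q, k_cmp, S, BS, q_offset=0, scale=None):
    # q:     [B, TQ, HQ, K]  query tokens (assumed to be the last TQ tokens of the sequence)
    # k_cmp: [B, TC, H, K]   one compressed key per KV block (TC = number of blocks)
    B, TQ, HQ, K = q.shape
    _, TC, H, _ = k_cmp.shape
    G = HQ // H                                        # GQA group size; selection is shared per group
    scale = scale if scale is not None else K ** -0.5
    # pool the query heads of each group, then score against every compressed block key
    q_grp = q.view(B, TQ, H, G, K).sum(3) * scale      # [B, TQ, H, K]
    s = torch.einsum('bqhk,bchk->bqhc', q_grp, k_cmp)  # [B, TQ, H, TC]
    # block-level causal mask: the query at absolute position t may only see blocks <= t // BS
    t = torch.arange(TQ, device=q.device) + q_offset   # absolute positions, cf. Q_OFFSET above
    c = torch.arange(TC, device=q.device)
    s = s.masked_fill(c.view(1, 1, 1, TC) > (t // BS).view(1, TQ, 1, 1), float('-inf'))
    # indices of the S highest-scoring blocks; picks falling in the masked (-inf) region are
    # non-causal and get dropped by the forward kernel's causal check anyway
    return s.topk(min(S, TC), dim=-1).indices          # [B, TQ, H, min(S, TC)]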
@@ -207,7 +207,7 @@ def parallel_nsa_fwd_kernel(
     IS_VARLEN: tl.constexpr,
     USE_BLOCK_COUNTS: tl.constexpr
 ):
-    i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) # i_t: token, i_v: value dim, i_bh: batch * kv head
+    i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)  # i_t: token, i_v: value dim, i_bh: batch * kv head
     i_b, i_h = i_bh // H, i_bh % H
     # k: [B, TK, H, K], v: [B, TK, H, V], q: [B, TQ, HQ, K]
     # block_indices: [B, TQ, H, S]
@@ -259,7 +259,7 @@ def parallel_nsa_fwd_kernel(
     # p_q then reads the BK dimensions at the last dimension
     # the Q block is kept in the shared memory throughout the whole kernel
     # [G, BK]
-    b_q = tl.load(p_q, boundary_check=(0, 1)) # note that BK >= K, but there is boundary check
+    b_q = tl.load(p_q, boundary_check=(0, 1))  # note that BK >= K, but there is boundary check
     b_q = (b_q * scale).to(b_q.dtype)

     p_o = tl.make_block_ptr(
@@ -275,10 +275,10 @@ def parallel_nsa_fwd_kernel(
     # [G, BV]
     b_o = tl.zeros([G, BV], dtype=tl.float32)

-    b_m = tl.full([G], float('-inf'), dtype=tl.float32) # running maximum
-    b_acc = tl.zeros([G], dtype=tl.float32) # sumexp
-    for i in range(NS): # number of blocks
-        i_s = tl.load(block_indices + i).to(tl.int32) * BS # i_s is the start token index of the current KV block
+    b_m = tl.full([G], float('-inf'), dtype=tl.float32)  # running maximum
+    b_acc = tl.zeros([G], dtype=tl.float32)  # sumexp
+    for i in range(NS):  # number of blocks
+        i_s = tl.load(block_indices + i).to(tl.int32) * BS  # i_s is the start token index of the current KV block
         # Here we assume that q tokens are last TQ tokens
         if i_s <= Q_OFFSET + i_t and i_s >= 0:
             # Recall: k ([B, T, H, K]) already shifted to the start of the current sequence at head i_h, i.e. k[i_b, 0, i_h]
@@ -306,11 +306,10 @@ def parallel_nsa_fwd_kernel(
             # [G, BS]
             b_p = exp(b_s - b_m[:, None])
             # [G]
-            b_acc = b_acc * b_r + tl.sum(b_p, 1) # summed over T dimension
+            b_acc = b_acc * b_r + tl.sum(b_p, 1)  # summed over T dimension
             # [G, BV]; note that b_p is fp32, while b_q may not
             b_o = b_o * b_r[:, None] + tl.dot(b_p.to(b_q.dtype), b_v)

-
     # o = o_n / a_n
     # lse = log( exp(m_n) * a_n )

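The rescaling above is the standard online-softmax recurrence. As a reading aid, here is a minimal PyTorch sketch for a single (batch, KV head, query token), with names mirroring the kernel's `b_m` (running maximum), `b_acc` (sum of exponentials) and `b_o` (output accumulator); it is not the kernel itself, just the math it streams over the selected KV blocks.

import torch

def online_softmax_attend(q, kv_blocks, scale):
    # q: [G, K]; kv_blocks: list of (k_blk [BS, K], v_blk [BS, V]) pairs, one per selected block
    G = q.shape[0]
    V = kv_blocks[0][1].shape[-1]
    m = torch.full((G,), float('-inf'))      # running maximum of scores          (b_m)
    a = torch.zeros(G)                       # running sum of exp(score - m)      (b_acc)
    o = torch.zeros(G, V)                    # unnormalized output accumulator    (b_o)
    for k_blk, v_blk in kv_blocks:
        s = (q * scale) @ k_blk.T            # [G, BS] scores for this block
        m_new = torch.maximum(m, s.max(-1).values)
        r = torch.exp(m - m_new)             # rescale factor for old accumulators (b_r)
        p = torch.exp(s - m_new[:, None])    # [G, BS]                             (b_p)
        a = a * r + p.sum(-1)
        o = o * r[:, None] + p @ v_blk
        m = m_new
    o = o / a[:, None]                       # o = o_n / a_n
    lse = m + torch.log(a)                   # lse = log(exp(m_n) * a_n)
    return o, lse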
@@ -319,6 +318,7 @@ def parallel_nsa_fwd_kernel(
     tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
     tl.store(p_lse, b_m.to(p_lse.dtype.element_ty))

+
 @triton.heuristics({
     'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor)
 })
@@ -548,6 +548,7 @@ def parallel_nsa_bwd_kernel_dkv(
     tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1))
     tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))

+
 @contiguous
 def parallel_nsa_topk(
     q: torch.Tensor,
@@ -557,7 +558,7 @@ def parallel_nsa_topk(
     block_counts: Union[torch.LongTensor, int],
     block_size: int = 64,
     scale: float = None,
-    cu_seqlens: Optional[torch.LongTensor] = None,
+    cu_seqlens: Union[None, torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]] = None,
 ) -> torch.LongTensor:
     B, TQ, HQ, K = q.shape
     _, TC, H, _ = k.shape
@@ -610,6 +611,7 @@ def parallel_nsa_topk(
     )
     return block_indices

+
 @contiguous
 def parallel_nsa_fwd(
     q: torch.Tensor,
@@ -655,7 +657,7 @@ def parallel_nsa_fwd(
         token_indices_q=token_indices_q,
         TQ=T_q,
         TK=T_kv,
-        H = H,
+        H=H,
         HQ=HQ,
         G=G,
         K=K,
@@ -855,6 +857,7 @@ def backward(ctx, do):
         )
         return dq.to(q), dk.to(k), dv.to(v), None, None, None, None, None, None, None, None

+
 @contiguous
 def parallel_nsa(
     q: torch.Tensor,
@@ -868,7 +871,7 @@ def parallel_nsa(
     block_size: int = 64,
     window_size: int = 0,
     scale: Optional[float] = None,
-    cu_seqlens: Union[None, torch.LongTensor, Tuple[torch.LongTensor]] = None,
+    cu_seqlens: Union[None, torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]] = None,
 ) -> torch.Tensor:
     r"""
     Args:
@@ -888,7 +891,7 @@ def parallel_nsa(
         block_indices (torch.LongTensor):
             Block indices of shape `[B, TQ, H, S]`.
             `S` is the number of selected blocks for each query token, which is set to 16 in the paper.
-            If `g_cmp` is provided, the passed `block_indices` will be ignored.
+            Will override the computed block indices from compression if provided.
         block_counts (Optional[Union[torch.LongTensor, int]]):
             Number of selected blocks for each query.
             If a tensor is provided, with shape `[B, TQ, H]`,
@@ -901,9 +904,10 @@ def parallel_nsa(
         scale (Optional[float]):
             Scale factor for attention scores.
             If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
-        cu_seqlens (torch.LongTensor):
+        cu_seqlens (torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor] or None):
             Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
             consistent with the FlashAttention API.
+            When a tuple is provided, it should contain two tensors: `(cu_seqlens_q, cu_seqlens_k)`.

     Returns:
         o (torch.Tensor):
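Because the docstring is truncated here, a hypothetical varlen call may help illustrate the two accepted `cu_seqlens` forms. It assumes the keyword names documented above, the FlashAttention-style packed layout (batch dimension of 1 with sequences concatenated along the token axis), an illustrative import path, and random `block_indices`; any gating or other required arguments not shown in this diff are omitted.

import torch
from fla.ops.nsa import parallel_nsa  # illustrative import path; adjust to this repo's layout

B, HQ, H, K, V, S, BS = 1, 16, 4, 64, 64, 16, 64
cu_seqlens = torch.tensor([0, 512, 1536], dtype=torch.long, device='cuda')  # two sequences: 512 and 1024 tokens
T = int(cu_seqlens[-1])

q = torch.randn(B, T, HQ, K, device='cuda', dtype=torch.bfloat16)
k = torch.randn(B, T, H, K, device='cuda', dtype=torch.bfloat16)
v = torch.randn(B, T, H, V, device='cuda', dtype=torch.bfloat16)
# normally produced by the compression + top-k step; random indices only for illustration
block_indices = torch.randint(0, T // BS, (B, T, H, S), device='cuda')

o = parallel_nsa(q, k, v, block_indices=block_indices, block_counts=S,
                 block_size=BS, cu_seqlens=cu_seqlens)

# When query and key sequences differ in length (e.g. the queries are only the suffix of a
# cached sequence), pass the new tuple form instead:
#   o = parallel_nsa(..., cu_seqlens=(cu_seqlens_q, cu_seqlens_k))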