fla/ops/linear_attn/fused_chunk.py — 2 additions, 1 deletion

@@ -276,6 +276,7 @@ def fused_chunk_linear_attn(
     v: torch.Tensor,
     scale: Optional[float] = None,
     initial_state: torch.Tensor = None,
+    cum_k: torch.Tensor = None,
     output_final_state: bool = False,
     normalize: bool = True,
     head_first: bool = True
@@ -312,7 +313,7 @@ def fused_chunk_linear_attn(
        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
    o, final_state = FusedChunkLinearAttentionFunction.apply(q, k, v, scale, initial_state, output_final_state)
    if normalize:
-        o = normalize_output(q * scale, k, o)
+        o = normalize_output(q * scale, k, o, cum_k)
    if not head_first:
        o = o.transpose(1, 2)
    return o, final_state
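
For readers of the diff, a minimal usage sketch (not part of the PR) of how the new cum_k argument could keep normalization consistent when a long sequence is processed in segments. The segment loop, shapes, and the caller-side update of cum_k are assumptions about intended usage, since the op itself does not return an updated key sum.

import torch
from fla.ops.linear_attn import fused_chunk_linear_attn

# Hypothetical example: split a sequence of length T into two segments and
# carry the running key sum so the normalization matches a single full call.
B, H, T, K, V = 2, 4, 128, 64, 64
q = torch.randn(B, H, T, K, device='cuda')
k = torch.randn(B, H, T, K, device='cuda')
v = torch.randn(B, H, T, V, device='cuda')

state, cum_k, outs = None, None, []
for q_seg, k_seg, v_seg in zip(q.chunk(2, 2), k.chunk(2, 2), v.chunk(2, 2)):
    o_seg, state = fused_chunk_linear_attn(
        q_seg, k_seg, v_seg,
        initial_state=state,
        cum_k=cum_k,                 # sum of keys from earlier segments
        output_final_state=True,
        normalize=True,
        head_first=True,
    )
    outs.append(o_seg)
    # Caller-side bookkeeping (assumed): accumulate this segment's keys.
    k_sum = k_seg.sum(-2, keepdim=True)
    cum_k = k_sum if cum_k is None else cum_k + k_sum
o = torch.cat(outs, dim=2)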
fla/ops/linear_attn/fused_recurrent.py — 2 additions, 1 deletion

@@ -235,6 +235,7 @@ def fused_recurrent_linear_attn(
     v: torch.Tensor,
     scale: Optional[float] = None,
     initial_state: torch.Tensor = None,
+    cum_k: torch.Tensor = None,

Review comment (Member): @yiyousong I think we could make initial_state a Tuple if normalize is True. What do you think?

     output_final_state: bool = False,
     normalize: bool = False,

Review comment (Member): @yiyousong Could you please add some docstrings BTW.

     head_first: bool = True
@@ -245,7 +246,7 @@ def fused_recurrent_linear_attn(
        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
    o, final_state = FusedRecurrentLinearAttentionFunction.apply(q, k, v, scale, initial_state, output_final_state)
    if normalize:
-        o = normalize_output(q * scale, k, o)
+        o = normalize_output(q * scale, k, o, cum_k)
    if not head_first:
        o = o.transpose(1, 2)
    return o, final_state
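
Similarly, a hedged sketch (not from the PR) of token-by-token decoding with fused_recurrent_linear_attn, where the caller threads both the recurrent state and cum_k through the loop; updating cum_k outside the kernel is an assumption about how the new argument is meant to be used.

import torch
from fla.ops.linear_attn import fused_recurrent_linear_attn

# Hypothetical decoding loop: one token per step; the caller maintains the
# recurrent state and the running key sum used for normalization.
B, H, K, V = 2, 4, 64, 64
state, cum_k = None, None
for _ in range(16):
    q_t = torch.randn(B, H, 1, K, device='cuda')
    k_t = torch.randn(B, H, 1, K, device='cuda')
    v_t = torch.randn(B, H, 1, V, device='cuda')
    o_t, state = fused_recurrent_linear_attn(
        q_t, k_t, v_t,
        initial_state=state,
        cum_k=cum_k,                  # keys seen before this step
        output_final_state=True,
        normalize=True,
        head_first=True,
    )
    cum_k = k_t if cum_k is None else cum_k + k_t  # update after the call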
fla/ops/linear_attn/utils.py — 5 additions, 2 deletions

@@ -1,10 +1,13 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang

 import torch


-@torch.jit.script
-def normalize_output(q, k, o):
+@torch.compile
+def normalize_output(q, k, o, cum_k=None):

Review comment (Member): @yiyousong Maybe we could pass initial_state as an arg with cum_k included for API consistency.

Reply (Author): I only use the ops directly, so I prefer passing them in separately. That said, it's your code and your choice; I believe it doesn't matter as long as you don't merge them into one tensor.

    k = k.cumsum(-2)
+    if cum_k is not None:
+        k = k + cum_k
    z = (q * k).sum(-1, keepdim=True)
    return o / (z + 1e-10)
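
To make the effect of the new argument concrete, here is a small self-contained check (an illustration, not part of the PR) using a plain-PyTorch copy of the patched helper: normalizing the second half of a sequence with cum_k set to the first half's summed keys reproduces the full-sequence normalization.

import torch

def normalize_output_ref(q, k, o, cum_k=None):
    # Plain-PyTorch copy of the patched helper, for the check below.
    k = k.cumsum(-2)
    if cum_k is not None:
        k = k + cum_k
    z = (q * k).sum(-1, keepdim=True)
    return o / (z + 1e-10)

B, H, T, K, V = 1, 2, 8, 4, 4
q = torch.rand(B, H, T, K)   # non-negative, as after a positive feature map
k = torch.rand(B, H, T, K)
o = torch.randn(B, H, T, V)

full = normalize_output_ref(q, k, o)

t = T // 2
first = normalize_output_ref(q[:, :, :t], k[:, :, :t], o[:, :, :t])
second = normalize_output_ref(
    q[:, :, t:], k[:, :, t:], o[:, :, t:],
    cum_k=k[:, :, :t].sum(-2, keepdim=True),
)
assert torch.allclose(torch.cat([first, second], dim=2), full, atol=1e-5)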