Skip to content

Commit 595a1a4

Browse files
committed
fixes
1 parent ee2693d commit 595a1a4

File tree

2 files changed: +4 −4 lines changed

src/transformers/models/mbart/modeling_mbart.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...cache_utils import Cache, EncoderDecoderCache
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import (
     AttentionMaskConverter,
@@ -1036,7 +1036,7 @@ def forward(
 
         # initialize `past_key_values`
         if use_cache and past_key_values is None:
-            past_key_values = EncoderDecoderCache()
+            past_key_values = EncoderDecoderCache() if encoder_hidden_states is not None else DynamicCache()
         return_legacy_cache = False
         if use_cache and isinstance(past_key_values, tuple):
             return_legacy_cache = True

src/transformers/models/xglm/modeling_xglm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
 from torch import nn
 
 from ...activations import ACT2FN
-from ...cache_utils import Cache, EncoderDecoderCache
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
 from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
 from ...modeling_layers import GradientCheckpointingLayer
@@ -492,7 +492,7 @@ def forward(
 
         # initialize `past_key_values`
         if use_cache and past_key_values is None:
-            past_key_values = EncoderDecoderCache()
+            past_key_values = EncoderDecoderCache() if encoder_hidden_states is not None else DynamicCache()
         return_legacy_cache = False
         if use_cache and isinstance(past_key_values, tuple):
             return_legacy_cache = True

0 commit comments

Comments
 (0)