@@ -2949,6 +2949,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
     def set_vocab(self):
         self._set_vocab_gpt2()

+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_diffusion_shift_logits(False)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self._try_set_pooling_type()
@@ -2974,14 +2977,6 @@ def set_gguf_parameters(self):
         feed_forward_length = self.hparams.get("mlp_hidden_size", 12288)
         self.gguf_writer.add_feed_forward_length(feed_forward_length)

-        # Set RoPE parameters
-        if "rope_theta" in self.hparams:
-            self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
-
-        # Set RMS norm epsilon
-        if "rms_norm_eps" in self.hparams:
-            self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-
         # LLaDA models use non-causal attention for diffusion, similar to Dream
         self.gguf_writer.add_causal_attention(False)
         # Handle RoPE scaling similar to LlamaModel and Dream
@@ -2992,10 +2987,6 @@ def set_gguf_parameters(self):
         if mask_token_id is not None:
             self.gguf_writer.add_mask_token_id(mask_token_id)

-        self.gguf_writer.add_add_bos_token(True)
-
-        logging.info("Adding diffusion shift logits to False")
-        self.gguf_writer.add_diffusion_shift_logits(False)

     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
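After conversion, the relocated metadata can be spot-checked with the `gguf` Python package. This is a minimal sketch, not part of the patch: it assumes an output file named `model.gguf` and that the writer calls above store their values under the `tokenizer.ggml.add_bos_token` and `diffusion.shift_logits` keys.

```python
# Minimal verification sketch (assumed output path "model.gguf" and key names).
from gguf import GGUFReader

reader = GGUFReader("model.gguf")
for key in ("tokenizer.ggml.add_bos_token", "diffusion.shift_logits"):
    field = reader.fields.get(key)
    if field is None:
        print(f"{key}: missing")
    else:
        # Scalar values live in the part indexed by field.data[0].
        value = field.parts[field.data[0]][0]
        print(f"{key}: {bool(value)}")
```

With this change applied, the two keys would be expected to read back as True and False respectively.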