
Commit 05f99c7

Remove model-specific sampling
1 parent 6bb0093 commit 05f99c7

File tree: 9 files changed, +360 −586 lines changed

common/arg.cpp

Lines changed: 10 additions & 9 deletions
```diff
@@ -3451,37 +3451,38 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
     add_opt(common_arg(
-        { "--diffusion--dream-eps" }, "F",
+        { "--diffusion-eps" }, "F",
         string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
         [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
-        { "--diffusion-dream-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+        { "--diffusion-algorithm" }, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                       params.diffusion.algorithm),
         [](common_params & params, int value) { params.diffusion.algorithm = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
-        { "--diffusion-dream-alg-temp" }, "F",
+        { "--diffusion-alg-temp" }, "F",
         string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
         [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
     add_opt(common_arg(
-        { "--diffusion-llada-block-length" }, "N",
+        { "--diffusion-block-length" }, "N",
         string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
         [](common_params & params, int value) { params.diffusion.block_length = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
-        { "--diffusion-llada-cfg-scale" }, "F",
+        { "--diffusion-cfg-scale" }, "F",
         string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
         [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
-        { "--diffusion-llada-algorithm" }, "N",
-        string_format("llada remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM (default: %d)", params.diffusion.remasking),
-        [](common_params & params, int value) { params.diffusion.remasking = value; }
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
+
     return ctx_arg;
 }
```
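
For context on the new `--diffusion-add-gumbel-noise` flag: adding Gumbel noise to temperature-scaled logits and taking the argmax is the standard trick for drawing a sample from the softmax distribution. The sketch below illustrates only that general technique; the function name `gumbel_argmax` is made up for this illustration, and the exact formulation used by the diffusion example may differ.

```cpp
#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

// Gumbel-max trick: argmax(logits/temp + g) with g ~ Gumbel(0, 1) is
// equivalent to sampling from softmax(logits/temp). Sketch only; assumes
// temp > 0, and is not the actual llama.cpp diffusion sampling code.
static size_t gumbel_argmax(const std::vector<float> & logits, float temp, std::mt19937 & rng) {
    std::uniform_real_distribution<float> uniform(1e-20f, 1.0f);
    size_t best     = 0;
    float  best_val = -INFINITY;
    for (size_t i = 0; i < logits.size(); ++i) {
        const float g   = -std::log(-std::log(uniform(rng))); // Gumbel(0, 1) noise
        const float val = logits[i] / temp + g;
        if (val > best_val) {
            best_val = val;
            best     = i;
        }
    }
    return best;
}
```

With the flag enabled and a sampling temperature above zero, token selection at each unmasking step becomes stochastic rather than greedy.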

common/common.h

Lines changed: 11 additions & 13 deletions
```diff
@@ -220,19 +220,17 @@ struct common_params_vocoder {
 };
 
 struct common_params_diffusion {
-    // Common parameters
-    int32_t steps        = 128;   // number of diffusion steps
-    bool    visual_mode  = false; // show progressive diffusion on screen
-
-    // Dream-specific parameters
-    float   eps          = 1e-3f; // epsilon for timesteps
-    int32_t algorithm    = 3;     // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
-    float   alg_temp     = 0.0f;  // algorithm temperature
-
-    // LLaDA-specific parameters
-    int32_t block_length = 32;    // block length for generation
-    float   cfg_scale    = 0.2f;  // classifier-free guidance scale
-    int32_t remasking    = 1;     // remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM
+    int32_t steps        = 128;
+    bool    visual_mode  = false;
+
+    float   eps          = 0;     // epsilon for timesteps
+    int32_t block_length = 32;    // block length for generation
+
+    int32_t algorithm    = 4;     // default algorithm: low-confidence
+    float   alg_temp     = 0.0f;  // algorithm temperature
+
+    float   cfg_scale    = 0;     // classifier-free guidance scale
+    bool    add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };
 
 enum common_reasoning_format {
```
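
The unified `algorithm` field replaces the separate Dream and LLaDA settings. As a rough illustration of how such strategies usually score a still-masked position (higher score = unmask earlier), under the common definitions of entropy-, margin-, random-, and low-confidence-based selection — the helper name `token_confidence` and its signature are hypothetical, not the actual llama.cpp code:

```cpp
#include <algorithm>
#include <cmath>
#include <functional>
#include <random>
#include <vector>

// Illustrative confidence scores for the unified strategies
// (1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE).
// Sketch of the usual definitions only, not the llama.cpp implementation.
static float token_confidence(const std::vector<float> & probs, int sampled_tok,
                              int algorithm, std::mt19937 & rng) {
    switch (algorithm) {
        case 1: { // ENTROPY_BASED: negative entropy of the token distribution
            float entropy = 0.0f;
            for (float p : probs) {
                if (p > 0.0f) {
                    entropy -= p * std::log(p);
                }
            }
            return -entropy;
        }
        case 2: { // MARGIN_BASED: gap between the top-1 and top-2 probabilities
            std::vector<float> sorted = probs;
            std::partial_sort(sorted.begin(), sorted.begin() + 2, sorted.end(), std::greater<float>());
            return sorted[0] - sorted[1];
        }
        case 3: { // RANDOM: uniform score, i.e. unmask in random order
            return std::uniform_real_distribution<float>(0.0f, 1.0f)(rng);
        }
        default: // LOW_CONFIDENCE (4): probability assigned to the sampled token
            return probs[sampled_tok];
    }
}
```

ORIGIN (0), which transfers probability mass based on timestep ratios, is omitted from the sketch.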

convert_hf_to_gguf.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -2988,9 +2988,15 @@ def set_gguf_parameters(self):
 
         # Add LLaDA-specific parameters
         mask_token_id = self.hparams.get("mask_token_id")
+
         if mask_token_id is not None:
             self.gguf_writer.add_mask_token_id(mask_token_id)
 
+        self.gguf_writer.add_add_bos_token(True)
+
+        logging.info("Adding diffusion shift logits to False")
+        self.gguf_writer.add_diffusion_shift_logits(False)
+
     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
         if n_head_kv is not None and n_head != n_head_kv:
```
examples/diffusion/README.md

Lines changed: 5 additions & 39 deletions
```diff
@@ -1,41 +1,7 @@
-# Diffusion Text Generation Examples
+# Diffusion Text Generation
 
-This directory contains implementations for diffusion-based text generation using two different model architectures: **Dream** and **LLaDA-8B**. Both models use iterative denoising processes to generate text, but employ different sampling strategies and algorithms.
+This directory contains implementations for Diffusion LLMs (DLLMs)
 
-## Supported Architechtures
-
-### 1. Dream
-
-Example models:
-- https://huggingface.co/Dream-org/Dream-v0-Base-7B
-- PR - https://github.com/ggml-org/llama.cpp/pull/14644
-
-The Dream model supports four different sampling algorithms controlled by the `--diffusion-dream-algorithm` parameter:
-
-1. **ORIGIN (0)** - Original diffusion algorithm
-   - Uses probability transfer based on timestep ratios
-
-2. **MASKGIT_PLUS (1)** - Enhanced MaskGIT sampling
-   - Improved version of the MaskGIT algorithm
-
-3. **TOPK_MARGIN (2)** - Top-K margin-based sampling
-   - Confidence calculated as the margin between top-1 and top-2 probabilities
-
-4. **ENTROPY (3)** - Entropy-based sampling (default, recommended)
-   - Uses entropy calculation for confidence estimation
-
-### 2. LLaDA
-
-Example models:
-- https://huggingface.co/GSAI-ML/LLaDA-8B-Instruct
-- PR: https://github.com/ggml-org/llama.cpp/pull/14771
-
-### LLaDA Model Remasking Strategies
-
-The LLaDA model uses two remasking approaches controlled by the `--diffusion-llada-algorithm` parameter:
-
-1. **REMASKING_LOW_CONFIDENCE (0)** - Default strategy
-   - Remasks tokens with lowest confidence scores
-   - Uses softmax probabilities to determine confidence
-
-2. **REMASKING_RANDOM (1)** - Random remasking
+More Info:
+- https://github.com/ggml-org/llama.cpp/pull/14644
+- https://github.com/ggml-org/llama.cpp/pull/14771
```
