
Commit 5a040c4

xinhe3 authored and xin3he committed

dump avg_bits

Signed-off-by: xinhe3 <xinhe3@habana.ai>
Signed-off-by: He, Xin3 <xin3.he@intel.com>
1 parent bd4bc2c commit 5a040c4

File tree: 2 files changed, +53 -0 lines changed


auto_round/compressors/base.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -95,6 +95,7 @@
     to_device,
     to_dtype,
     unsupport_meta_device,
+    get_avg_bits,
 )
 from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block

@@ -1690,6 +1691,13 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
         # because it may cause the gguf format to not be exported normally.
         self.model = _handle_moe_model(self.model, formats=formats)
         self.has_qlayer_outside_block = self._set_layerwise_config(self.layer_config)
+        average_bits = get_avg_bits(self.model)
+        average_bits_w_lm_head = get_avg_bits(self.model, with_lm_head=True)
+        if average_bits_w_lm_head != average_bits:
+            logger.info(f"The target average bits of blocks in the model (without lm_head): {average_bits:.3f} bits")
+            logger.info(f"The target average bits of the entire model (with lm_head): {average_bits_w_lm_head:.3f} bits")
+        else:
+            logger.info(f"The target average bits of the entire model: {average_bits:.3f} bits")
         if not hasattr(self, "formats"):
             logger.warning("this API is deprecated, please use `quantize_and_save` instead")
         else:
```
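
For intuition on the new log lines: the two values differ whenever the lm_head is kept at a different precision than the quantized blocks. Below is a small back-of-the-envelope sketch of the weighted average being reported; it is not library code, and the parameter counts and bit widths are made up for illustration.

```python
# Hypothetical split: transformer blocks quantized to INT4 with a 16-bit scale
# per group of 128 elements (4 + 16/128 = 4.125 bits/element), lm_head kept at 16-bit.
block_numel = 6_000_000_000    # weight elements inside the blocks (made up)
lm_head_numel = 500_000_000    # weight elements in lm_head (made up)

block_bits = 4.125 * block_numel
lm_head_bits = 16 * lm_head_numel

avg_without_lm_head = block_bits / block_numel
avg_with_lm_head = (block_bits + lm_head_bits) / (block_numel + lm_head_numel)

print(f"without lm_head: {avg_without_lm_head:.3f} bits")  # 4.125 bits
print(f"with lm_head:    {avg_with_lm_head:.3f} bits")     # ~5.038 bits
```

When every supported layer, including lm_head, shares the same config, the two numbers coincide and only the single "entire model" line is logged.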

auto_round/utils.py

Lines changed: 45 additions & 0 deletions

```diff
@@ -2730,3 +2730,48 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]):
             return True
 
     return False
+
+
+def get_avg_bits(module, with_lm_head=False):
+    """
+    Calculates the average number of bits per weight element for supported layers in a given module.
+
+    Iterates through all named modules in the module, accumulating the total number of weight elements
+    and the corresponding bit usage, including additional scale bits for specific data types.
+
+    Args:
+        module: A neural network module containing layers to be analyzed.
+
+    Returns:
+        float: The average number of bits per weight element across all supported layers.
+
+    Note:
+        - Only layers of types specified in SUPPORTED_LAYER_TYPES are considered.
+        - For certain data types ("fp4_v2", "nv_fp4", "mx_fp4", "mx_fp8"), scale bits are added.
+        - For "fp4_v2" and "nv_fp4", an additional 32 global scale bits are included.
+    """
+    all_numel = 0
+    all_bits = 0
+
+    lm_head_name = get_lm_head_name(module)
+    if lm_head_name is None:
+        with_lm_head = False
+    for n, m in module.named_modules():
+        if n == lm_head_name and not with_lm_head:
+            continue
+        if isinstance(m, SUPPORTED_LAYER_TYPES):
+            m_numel = m.weight.numel()
+            all_numel += m_numel
+            w_bits = m.bits * m_numel
+            all_bits += w_bits
+            if m.data_type in ("fp4_v2", "nv_fp", "mx_fp", "nv_fp4", "mx_fp4", "mx_fp8"):
+                scale_bits = 8 * (m_numel // m.group_size)
+                if m.data_type in ("fp4_v2", "nv_fp"):
+                    scale_bits += 32  # global scale bits
+                all_bits += scale_bits
+            else:  # woq
+                scale_bits = 16 * (m_numel // m.group_size)
+                all_bits += scale_bits
+
+    avg_bits = all_bits / all_numel if all_numel > 0 else 0
+    return round(avg_bits, 6)
```
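
To make the per-layer accounting concrete, here is a standalone sketch of the formula the function amortizes over each layer: weight bits per element, plus per-group scale bits divided by the group size, plus an optional one-off global scale. It only mirrors the arithmetic above; the helper name, example group sizes, and element count are illustrative, not part of the library.

```python
def per_element_bits(weight_bits, group_size, scale_bits_per_group,
                     global_scale_bits=0, numel=1_000_000):
    """Average bits/element for one layer: weight bits plus amortized scale bits."""
    groups = numel // group_size
    total_bits = weight_bits * numel + scale_bits_per_group * groups + global_scale_bits
    return total_bits / numel

# Weight-only INT4 with a 16-bit scale per group of 128: 4 + 16/128 = 4.125
print(per_element_bits(4, 128, 16))
# MX-FP4 style, 8-bit shared scale per group of 32: 4 + 8/32 = 4.25
print(per_element_bits(4, 32, 8))
# NV-FP4 / fp4_v2 style, 8-bit scale per group of 16 plus a 32-bit global scale: ~4.5
print(per_element_bits(4, 16, 8, global_scale_bits=32))
```

get_avg_bits then weights these per-layer totals by each layer's element count to produce the single model-level figure that quantize() logs.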
