From 2b1577b81c325bfd3abc3dc7b35167fe87a8decc Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Tue, 16 Sep 2025 14:50:04 +0800 Subject: [PATCH 01/10] enable dynamic quantization config saving Signed-off-by: Zhang, Weiwei1 --- auto_round/autoround.py | 6 +- .../export/export_to_autogptq/export.py | 31 +++++++- .../export/export_to_autoround/export.py | 12 ++++ .../export_to_llmcompressor/export_to_fp.py | 9 +-- .../export/export_to_llmcompressor/utils.py | 37 ++++++++++ auto_round/utils.py | 70 ++++++++++++++++++- 6 files changed, 155 insertions(+), 10 deletions(-) create mode 100644 auto_round/export/export_to_llmcompressor/utils.py diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 9d56cb8b..ff82191f 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -1916,6 +1916,7 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: """ # Get the names of layers in quantization blocks supported_types = self.supported_types + dynamic_config = {} layers_in_blocks = get_layer_names_in_block( self.model, supported_types, self.quant_block_list, self.inner_supported_types ) @@ -1944,6 +1945,7 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: matched_names.append(layer_name) if len(matched_names) > 0: val = layer_config[name] + dynamic_config[name] = val # keep regex config layer_config.pop(name) for match_name in matched_names: layer_config[match_name] = val @@ -2033,7 +2035,7 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: need_to_quantize_lm_head = self._check_need_to_quantize_lm_head_embedding() if need_to_quantize_lm_head: has_qlayer_outside_block = True - + self.dynamic_config = dynamic_config # Return whether there are quantized layers outside the blocks return has_qlayer_outside_block @@ -3125,6 +3127,7 @@ def save_quantized( "act_data_type", "super_bits", "super_group_size", + "dynamic_config", ] if isinstance(self.dataset, str): serialization_keys.append("dataset") @@ -3449,3 +3452,4 @@ def _step(self, scaler, optimizer, lr_schedule): lr_schedule.step() if is_optimum_habana_available(): htcore.mark_step() + diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index 6c755817..0c5461f6 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -45,7 +45,7 @@ import torch.nn as nn import transformers from tqdm import tqdm - +from typing import Any, Dict import auto_round.export.export_to_autogptq.qlinear_triton from auto_round.logger import logger from auto_round.utils import ( @@ -57,6 +57,7 @@ get_block_names, get_module, set_module, + to_standard_regex, ) BLOCK_PATTERNS = [ ## copy from transformers optimum @@ -66,6 +67,30 @@ "model.layers", ] +def convert_to_autogptq_dynamic( + dynamic_config: Dict[str, Dict[str, Any]] +) -> Dict[str, Dict[str, Any]]: + """ + Convert AutoRound-style dynamic_config into AutoGPTQ-style QuantizerConfig.dynamic. 
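+    For instance (illustrative layer names), an entry such as
+    {"mlp.gate": {"bits": 4, "group_size": 128, "sym": True}} is intended to become a
+    positive-match ("+:") rule keyed by the regex produced by to_standard_regex, while
+    {"lm_head": {"bits": 16}} becomes an empty negative-match ("-:") rule.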
+ + Rules: + - bits < 16 -> quantize -> positive match `+:regex` + - bits == 16 -> skip quantize -> negative match `-:regex` + """ + converted = {} + for name, cfg in dynamic_config.items(): + bits = cfg.get("bits") + regex = to_standard_regex(name) + + if bits is None: + continue # ignore invalid entries + elif bits < 16: + converted[f"r'+:{regex}'"] = {"bits": bits, **{k: v for k, v in cfg.items() if k != "bits"}} + else: + # skip quantization + converted[f"r'-:{regex}'"] = {} + return converted + def pack_layer(name, model, backend, device=None): if name == "lm_head": ##dese not support lm-head @@ -155,7 +180,8 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll logger.error("auto-gptq format may not support loading this quantized model") quantization_config["block_name_to_quantize"] = common_prefix quantization_config.pop("to_quant_block_names", None) - + dynamic_config = quantization_config.pop("dynamic_config") + quantization_config['dynamic'] = convert_to_autogptq_dynamic(dynamic_config) ## as layers maybe already packed, we need to check in layer_config layer_config = kwargs["layer_config"] for n, m in model.named_modules(): @@ -265,3 +291,4 @@ def save( copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) + diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 8877f4d4..da845fdc 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -41,6 +41,7 @@ is_nv_fp, is_standard_fp, set_module, + to_standard_regex, ) @@ -316,6 +317,10 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex extra_config[layer_name]["data_type"] = layer_config[layer_name]["data_type"] extra_config[layer_name]["group_size"] = layer_config[layer_name]["group_size"] extra_config[layer_name]["sym"] = layer_config[layer_name]["sym"] + extra_config[layer_name]["act_bits"] = layer_config[layer_name]["act_bits"] + extra_config[layer_name]["act_data_type"] = layer_config[layer_name]["act_data_type"] + extra_config[layer_name]["act_group_size"] = layer_config[layer_name]["act_group_size"] + extra_config[layer_name]["act_sym"] = layer_config[layer_name]["act_sym"] elif layer_config[layer_name]["in_blocks"] or ( block_name_to_quantize is not None and check_start_with_block_name(layer_name, block_name_to_quantize) ): @@ -327,6 +332,12 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex for key in neq_keys: if layer_config[layer_name][key] is not None: extra_config[layer_name][key] = layer_config[layer_name][key] + + dynamic_config = quantization_config.pop("dynamic_config") + if name in dynamic_config.keys(): + regex_name = to_standard_regex(name) + extra_config[regex_name] = dynamic_config[name] + if len(extra_config) > 0: quantization_config["extra_config"] = extra_config names = list(layer_config.keys()) @@ -417,3 +428,4 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) + diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py index 6581f0c6..6ee6f0bc 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_fp.py +++ 
b/auto_round/export/export_to_llmcompressor/export_to_fp.py @@ -38,6 +38,7 @@ is_nv_fp, set_amax_for_all_moe_layers, set_module, + generate_ignore_regex_list, ) from auto_round.wrapper import WrapperWALayer @@ -198,12 +199,7 @@ def wrapper(name): for _ in executor.map(wrapper, names): pass - # TODO fix the ignore re match issue, compile with fp8 & int8 config - ignore = ["lm_head"] - for layer_name in layer_config: - if layer_config[layer_name]["bits"] > 8: ## find ignore layers - ignore.append(layer_name) - ignore = list(set(ignore)) + # ignore = generate_ignore_regex_list() ## check # get llm-compressor format config check_compressed_tensors_supported() @@ -285,3 +281,4 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) + diff --git a/auto_round/export/export_to_llmcompressor/utils.py b/auto_round/export/export_to_llmcompressor/utils.py new file mode 100644 index 00000000..9dd7b4eb --- /dev/null +++ b/auto_round/export/export_to_llmcompressor/utils.py @@ -0,0 +1,37 @@ +from typing import Dict, List +from auto_round.utils import to_standard_regex, matches_any_regex + + +def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: + """ + Generate ignore regex list for llm_compressor based on dynamic_config and layer_config. + + Rules: + 1. Any layer in dynamic_config with bits >= 16 is ignored. + 2. Any layer in layer_config with bits >= 16 is ignored if not already included. + 3. Output regex patterns are normalized for llm_compressor ('re:...' style). + + Args: + dynamic_config (Dict[str, Dict]): dynamic quantization config + layer_config (Dict[str, Dict]): layer-wise quantization config + + Returns: + List[str]: List of regex patterns to ignore during quantization. + """ + prefix = "re:" + ignore_regex: List[str] = [] + + # Step 1: Add dynamic_config keys with bits >= 16 + for key, cfg in dynamic_config.items(): + bits = cfg.get("bits") + if bits > 8: + ignore_regex.append(prefix + to_standard_regex(key)) + + # Step 2: Add layer_config keys if bits >= 16 and not already included + for key, cfg in layer_config.items(): + bits = cfg.get("bits") + + if not matches_any_regex(key, ignore_regex, prefix): + ignore_regex.append(key) + + return ignore_regex diff --git a/auto_round/utils.py b/auto_round/utils.py index 47088ef9..076f678a 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -25,7 +25,7 @@ from enum import Enum from functools import lru_cache from pathlib import Path -from typing import Any, Callable, Tuple, Union +from typing import Any, Callable, Tuple, Union, List import cpuinfo import torch @@ -2687,3 +2687,71 @@ def copy_python_files_from_model_cache(model, save_path: str): if file.endswith(".py") and os.path.isfile(full_file_name): logger.debug(f"Transferring {full_file_name} to {save_path}") shutil.copy(full_file_name, save_path) + +def to_standard_regex(pattern: str) -> str: + """ + Convert a user-specified string into a standardized regex for layer matching. + + Rules: + - If the pattern already contains regex tokens ('.*', '^', '$', etc.), + keep them as-is. + - Otherwise, wrap the pattern with `.*` on both sides to allow substring matching. + - Always ensure the returned regex is valid (compilable by re). 
+ + Examples: + >>> to_standard_regex("model.embed_tokens") + '.*model\\.embed_tokens.*' + >>> to_standard_regex("mlp.gate") + '.*mlp\\.gate.*' + >>> to_standard_regex("mlp.gate$") + '.*mlp\\.gate$' + >>> to_standard_regex("mlp.*gate") + '.*mlp.*gate.*' + """ + # Heuristic: if pattern contains regex meta characters, assume partial regex + meta_chars = {".*", "^", "$", "|", "(", ")", "[", "]", "?", "+"} + has_regex = any(tok in pattern for tok in meta_chars) + if not has_regex: + # Escape literal dots, etc., and wrap with .* for substring matching + pattern = re.escape(pattern) + regex = f".*{pattern}.*" + else: + # Only escape bare dots that are not already part of regex constructs + # Avoid double escaping .* sequences + tmp = [] + i = 0 + while i < len(pattern): + if pattern[i] == ".": + if i + 1 < len(pattern) and pattern[i+1] == "*": + tmp.append(".*") # keep regex token + i += 2 + continue + else: + tmp.append("\\.") # escape bare dot + else: + tmp.append(pattern[i]) + i += 1 + regex = "".join(tmp) + # If no anchors are provided, allow substring matching + if not regex.startswith("^") and not regex.startswith(".*"): + regex = ".*" + regex + if not regex.endswith("$") and not regex.endswith(".*"): + regex = regex + ".*" + # Validate regex + try: + re.compile(regex) + except re.error as e: + raise ValueError(f"Invalid regex generated from pattern '{pattern}': {e}") + return regex + + +def matches_any_regex(layer_name: str, regex_list: List[str], prefix="re:") -> bool: + """ + Check if layer_name matches any regex pattern in regex_list. + """ + for pattern in regex_list: + # Remove 're:' prefix for matching + pat = pattern[len(prefix):] if pattern.startswith(prefix) else pattern + if re.fullmatch(pat, layer_name): + return True + return False From 56a2218bdb6de30d2ad9f5b01e796b9f31a13da4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Sep 2025 06:51:13 +0000 Subject: [PATCH 02/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/autoround.py | 3 +-- .../export/export_to_autogptq/export.py | 11 +++++----- .../export/export_to_autoround/export.py | 1 - .../export_to_llmcompressor/export_to_fp.py | 3 +-- .../export/export_to_llmcompressor/utils.py | 21 ++++++++++++++++--- auto_round/utils.py | 11 +++++----- 6 files changed, 31 insertions(+), 19 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index ff82191f..24871d61 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -1945,7 +1945,7 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: matched_names.append(layer_name) if len(matched_names) > 0: val = layer_config[name] - dynamic_config[name] = val # keep regex config + dynamic_config[name] = val # keep regex config layer_config.pop(name) for match_name in matched_names: layer_config[match_name] = val @@ -3452,4 +3452,3 @@ def _step(self, scaler, optimizer, lr_schedule): lr_schedule.step() if is_optimum_habana_available(): htcore.mark_step() - diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index 0c5461f6..6406f565 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -17,6 +17,7 @@ import json import os from concurrent.futures import ThreadPoolExecutor +from typing import Any, Dict import threadpoolctl as tctl @@ -45,7 +46,7 @@ import torch.nn as nn import transformers from 
tqdm import tqdm -from typing import Any, Dict + import auto_round.export.export_to_autogptq.qlinear_triton from auto_round.logger import logger from auto_round.utils import ( @@ -67,9 +68,8 @@ "model.layers", ] -def convert_to_autogptq_dynamic( - dynamic_config: Dict[str, Dict[str, Any]] -) -> Dict[str, Dict[str, Any]]: + +def convert_to_autogptq_dynamic(dynamic_config: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]: """ Convert AutoRound-style dynamic_config into AutoGPTQ-style QuantizerConfig.dynamic. @@ -181,7 +181,7 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll quantization_config["block_name_to_quantize"] = common_prefix quantization_config.pop("to_quant_block_names", None) dynamic_config = quantization_config.pop("dynamic_config") - quantization_config['dynamic'] = convert_to_autogptq_dynamic(dynamic_config) + quantization_config["dynamic"] = convert_to_autogptq_dynamic(dynamic_config) ## as layers maybe already packed, we need to check in layer_config layer_config = kwargs["layer_config"] for n, m in model.named_modules(): @@ -291,4 +291,3 @@ def save( copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) - diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index da845fdc..2dd6707c 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -428,4 +428,3 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) - diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py index 6ee6f0bc..5002a2fc 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_fp.py @@ -32,13 +32,13 @@ check_to_quantized, copy_python_files_from_model_cache, filter_quantization_config, + generate_ignore_regex_list, get_block_names, get_module, is_mx_fp, is_nv_fp, set_amax_for_all_moe_layers, set_module, - generate_ignore_regex_list, ) from auto_round.wrapper import WrapperWALayer @@ -281,4 +281,3 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) - diff --git a/auto_round/export/export_to_llmcompressor/utils.py b/auto_round/export/export_to_llmcompressor/utils.py index 9dd7b4eb..378b78a5 100644 --- a/auto_round/export/export_to_llmcompressor/utils.py +++ b/auto_round/export/export_to_llmcompressor/utils.py @@ -1,5 +1,20 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from typing import Dict, List -from auto_round.utils import to_standard_regex, matches_any_regex + +from auto_round.utils import matches_any_regex, to_standard_regex def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: @@ -10,7 +25,7 @@ def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Di 1. Any layer in dynamic_config with bits >= 16 is ignored. 2. Any layer in layer_config with bits >= 16 is ignored if not already included. 3. Output regex patterns are normalized for llm_compressor ('re:...' style). - + Args: dynamic_config (Dict[str, Dict]): dynamic quantization config layer_config (Dict[str, Dict]): layer-wise quantization config @@ -30,7 +45,7 @@ def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Di # Step 2: Add layer_config keys if bits >= 16 and not already included for key, cfg in layer_config.items(): bits = cfg.get("bits") - + if not matches_any_regex(key, ignore_regex, prefix): ignore_regex.append(key) diff --git a/auto_round/utils.py b/auto_round/utils.py index 076f678a..d4904616 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -25,7 +25,7 @@ from enum import Enum from functools import lru_cache from pathlib import Path -from typing import Any, Callable, Tuple, Union, List +from typing import Any, Callable, List, Tuple, Union import cpuinfo import torch @@ -2688,16 +2688,17 @@ def copy_python_files_from_model_cache(model, save_path: str): logger.debug(f"Transferring {full_file_name} to {save_path}") shutil.copy(full_file_name, save_path) + def to_standard_regex(pattern: str) -> str: """ Convert a user-specified string into a standardized regex for layer matching. - + Rules: - If the pattern already contains regex tokens ('.*', '^', '$', etc.), keep them as-is. - Otherwise, wrap the pattern with `.*` on both sides to allow substring matching. - Always ensure the returned regex is valid (compilable by re). 
- + Examples: >>> to_standard_regex("model.embed_tokens") '.*model\\.embed_tokens.*' @@ -2722,7 +2723,7 @@ def to_standard_regex(pattern: str) -> str: i = 0 while i < len(pattern): if pattern[i] == ".": - if i + 1 < len(pattern) and pattern[i+1] == "*": + if i + 1 < len(pattern) and pattern[i + 1] == "*": tmp.append(".*") # keep regex token i += 2 continue @@ -2751,7 +2752,7 @@ def matches_any_regex(layer_name: str, regex_list: List[str], prefix="re:") -> b """ for pattern in regex_list: # Remove 're:' prefix for matching - pat = pattern[len(prefix):] if pattern.startswith(prefix) else pattern + pat = pattern.removeprefix(prefix) if re.fullmatch(pat, layer_name): return True return False From db99785d22bdd9c9c1ebd5cd3e9bc28cfe1a742c Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Tue, 16 Sep 2025 16:02:14 +0800 Subject: [PATCH 03/10] fixtypo Signed-off-by: Zhang, Weiwei1 --- auto_round/autoround.py | 1 + .../export/export_to_autogptq/export.py | 14 +++++++++++-- .../export/export_to_autoround/export.py | 8 ++++--- .../export_to_llmcompressor/export_to_fp.py | 6 ++++-- .../export/export_to_llmcompressor/utils.py | 21 +++---------------- auto_round/utils.py | 8 +++++++ 6 files changed, 33 insertions(+), 25 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 24871d61..bc2d14b5 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -3452,3 +3452,4 @@ def _step(self, scaler, optimizer, lr_schedule): lr_schedule.step() if is_optimum_habana_available(): htcore.mark_step() + diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index 6406f565..325fa05e 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -48,6 +48,12 @@ from tqdm import tqdm import auto_round.export.export_to_autogptq.qlinear_triton + +GPTQ_REQUIRED_CONFIG_KEYS = ( + "bits", + "group_size", + "sym", +) from auto_round.logger import logger from auto_round.utils import ( SUPPORTED_LAYER_TYPES, @@ -57,6 +63,7 @@ get_autogptq_packing_qlinear, get_block_names, get_module, + json_serialize, set_module, to_standard_regex, ) @@ -85,7 +92,9 @@ def convert_to_autogptq_dynamic(dynamic_config: Dict[str, Dict[str, Any]]) -> Di if bits is None: continue # ignore invalid entries elif bits < 16: - converted[f"r'+:{regex}'"] = {"bits": bits, **{k: v for k, v in cfg.items() if k != "bits"}} + converted[f"r'+:{regex}'"] = {"bits": bits} + for key in GPTQ_REQUIRED_CONFIG_KEYS: # only save keys gptq + converted[f"r'+:{regex}'"][key] = dynamic_config[name][key] else: # skip quantization converted[f"r'-:{regex}'"] = {} @@ -285,9 +294,10 @@ def save( config_file = "quantize_config.json" if hasattr(model, "config") and hasattr(model.config, "quantization_config"): with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f: - json.dump(model.config.quantization_config, f, indent=2) + json.dump(model.config.quantization_config, f, indent=2, default=json_serialize) try: copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) + diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 2dd6707c..3f21de94 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -334,9 +334,10 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex 
extra_config[layer_name][key] = layer_config[layer_name][key] dynamic_config = quantization_config.pop("dynamic_config") - if name in dynamic_config.keys(): - regex_name = to_standard_regex(name) - extra_config[regex_name] = dynamic_config[name] + if dynamic_config is not None: + for name in dynamic_config.keys(): + regex_name = to_standard_regex(name) + extra_config[regex_name] = {**{k: dynamic_config[name][k] for k in REQUIRED_CONFIG_KEYS}} if len(extra_config) > 0: quantization_config["extra_config"] = extra_config @@ -428,3 +429,4 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) + diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py index 5002a2fc..42b75d20 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_fp.py @@ -25,6 +25,7 @@ from tqdm import tqdm from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear +from auto_round.export.export_to_llmcompressor.utils import generate_ignore_regex_list from auto_round.logger import logger from auto_round.utils import ( SUPPORTED_LAYER_TYPES, @@ -32,7 +33,6 @@ check_to_quantized, copy_python_files_from_model_cache, filter_quantization_config, - generate_ignore_regex_list, get_block_names, get_module, is_mx_fp, @@ -199,7 +199,8 @@ def wrapper(name): for _ in executor.map(wrapper, names): pass - # ignore = generate_ignore_regex_list() ## check + ignore = ["lm_head"] + # ignore = generate_ignore_regex_list() # get llm-compressor format config check_compressed_tensors_supported() @@ -281,3 +282,4 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) + diff --git a/auto_round/export/export_to_llmcompressor/utils.py b/auto_round/export/export_to_llmcompressor/utils.py index 378b78a5..9dd7b4eb 100644 --- a/auto_round/export/export_to_llmcompressor/utils.py +++ b/auto_round/export/export_to_llmcompressor/utils.py @@ -1,20 +1,5 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from typing import Dict, List - -from auto_round.utils import matches_any_regex, to_standard_regex +from auto_round.utils import to_standard_regex, matches_any_regex def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: @@ -25,7 +10,7 @@ def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Di 1. Any layer in dynamic_config with bits >= 16 is ignored. 2. Any layer in layer_config with bits >= 16 is ignored if not already included. 3. Output regex patterns are normalized for llm_compressor ('re:...' style). 
- + Args: dynamic_config (Dict[str, Dict]): dynamic quantization config layer_config (Dict[str, Dict]): layer-wise quantization config @@ -45,7 +30,7 @@ def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Di # Step 2: Add layer_config keys if bits >= 16 and not already included for key, cfg in layer_config.items(): bits = cfg.get("bits") - + if not matches_any_regex(key, ignore_regex, prefix): ignore_regex.append(key) diff --git a/auto_round/utils.py b/auto_round/utils.py index d4904616..c311d6c6 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2756,3 +2756,11 @@ def matches_any_regex(layer_name: str, regex_list: List[str], prefix="re:") -> b if re.fullmatch(pat, layer_name): return True return False + + +def json_serialize(obj: Any): + """Convert non-JSON-serializable objects into JSON-friendly formats.""" + if isinstance(obj, torch.dtype): + return str(obj).split(".")[-1] # e.g., torch.float16 -> "float16" + raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") + From 81e80860c127c61209a68a6016dc9040355e2c36 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Sep 2025 08:05:38 +0000 Subject: [PATCH 04/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/autoround.py | 1 - .../export/export_to_autogptq/export.py | 1 - .../export/export_to_autoround/export.py | 1 - .../export_to_llmcompressor/export_to_fp.py | 1 - .../export/export_to_llmcompressor/utils.py | 21 ++++++++++++++++--- auto_round/utils.py | 1 - 6 files changed, 18 insertions(+), 8 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index bc2d14b5..24871d61 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -3452,4 +3452,3 @@ def _step(self, scaler, optimizer, lr_schedule): lr_schedule.step() if is_optimum_habana_available(): htcore.mark_step() - diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index 325fa05e..4d22ceca 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -300,4 +300,3 @@ def save( copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) - diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 3f21de94..9e890996 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -429,4 +429,3 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) - diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py index 42b75d20..c0a35143 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_fp.py @@ -282,4 +282,3 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri copy_python_files_from_model_cache(model, save_dir) except Exception as e: logger.warning("Skipping source model Python file copy due to error: %s", e) - diff --git a/auto_round/export/export_to_llmcompressor/utils.py 
b/auto_round/export/export_to_llmcompressor/utils.py index 9dd7b4eb..378b78a5 100644 --- a/auto_round/export/export_to_llmcompressor/utils.py +++ b/auto_round/export/export_to_llmcompressor/utils.py @@ -1,5 +1,20 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Dict, List -from auto_round.utils import to_standard_regex, matches_any_regex + +from auto_round.utils import matches_any_regex, to_standard_regex def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: @@ -10,7 +25,7 @@ def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Di 1. Any layer in dynamic_config with bits >= 16 is ignored. 2. Any layer in layer_config with bits >= 16 is ignored if not already included. 3. Output regex patterns are normalized for llm_compressor ('re:...' style). - + Args: dynamic_config (Dict[str, Dict]): dynamic quantization config layer_config (Dict[str, Dict]): layer-wise quantization config @@ -30,7 +45,7 @@ def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Di # Step 2: Add layer_config keys if bits >= 16 and not already included for key, cfg in layer_config.items(): bits = cfg.get("bits") - + if not matches_any_regex(key, ignore_regex, prefix): ignore_regex.append(key) diff --git a/auto_round/utils.py b/auto_round/utils.py index c311d6c6..a8340789 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2763,4 +2763,3 @@ def json_serialize(obj: Any): if isinstance(obj, torch.dtype): return str(obj).split(".")[-1] # e.g., torch.float16 -> "float16" raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") - From 4e580905c0e3917ef2a08669210e3ac11d8514dc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Sep 2025 01:47:30 +0000 Subject: [PATCH 05/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/utils.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index 72748416..304a720b 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -1,5 +1,20 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from typing import Dict, List -from auto_round.utils import to_standard_regex, matches_any_regex + +from auto_round.utils import matches_any_regex, to_standard_regex def generate_ignore_regex_list(regex_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: @@ -10,7 +25,7 @@ def generate_ignore_regex_list(regex_config: Dict[str, Dict], layer_config: Dict 1. Any layer in regex_config with bits >= 16 is ignored. 2. Any layer in layer_config with bits >= 16 is ignored if not already included. 3. Output regex patterns are normalized for llm_compressor ('re:...' style). - + Args: regex_config (Dict[str, Dict]): dynamic quantization config layer_config (Dict[str, Dict]): layer-wise quantization config @@ -33,4 +48,4 @@ def generate_ignore_regex_list(regex_config: Dict[str, Dict], layer_config: Dict if bits > 8: ignore_regex.append(key) - return ignore_regex \ No newline at end of file + return ignore_regex From c75ebdc1f253b32e8e1b05cf604d047fd9f4bf4b Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Wed, 24 Sep 2025 10:05:06 +0800 Subject: [PATCH 06/10] rebase code, refine config saving Signed-off-by: Zhang, Weiwei1 --- auto_round/compressors/base.py | 5 + .../export/export_to_autoround/export.py | 9 +- .../export_to_nvfp_mxfp.py | 9 + .../export_to_llmcompressor/export_to_fp.py | 1 + .../export/export_to_llmcompressor/utils.py | 36 +- auto_round/utils.py | 2831 ++++++++++++++++- test/test_cpu/test_mix_bits.py | 110 + test/test_cuda/test_mix_bits.py | 190 ++ 8 files changed, 3136 insertions(+), 55 deletions(-) create mode 100644 test/test_cpu/test_mix_bits.py create mode 100644 test/test_cuda/test_mix_bits.py diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index a0df6b00..ab858d9f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -1951,6 +1951,7 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: """ # Get the names of layers in quantization blocks supported_types = self.supported_types + regex_config = {} layers_in_blocks = get_layer_names_in_block( self.model, supported_types, self.quant_block_list, self.inner_supported_types ) @@ -1979,6 +1980,7 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: matched_names.append(layer_name) if len(matched_names) > 0: val = layer_config[name] + regex_config[name] = val # keep regex config layer_config.pop(name) for match_name in matched_names: layer_config[match_name] = val @@ -2069,6 +2071,7 @@ def _set_layerwise_config(self, layer_config: dict) -> bool: if need_to_quantize_lm_head: has_qlayer_outside_block = True + self.regex_config = regex_config # Return whether there are quantized layers outside the blocks return has_qlayer_outside_block @@ -3162,6 +3165,7 @@ def save_quantized( "act_data_type", "super_bits", "super_group_size", + "regex_config", ] if isinstance(self.dataset, str): serialization_keys.append("dataset") @@ -3490,3 +3494,4 @@ def _step(self, scaler, optimizer, lr_schedule): lr_schedule.step() if is_hpex_available(): htcore.mark_step() + diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index aefd0ac1..5c58fe2d 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -349,11 +349,11 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex if layer_config[layer_name][key] is not None: extra_config[layer_name][key] = layer_config[layer_name][key] - dynamic_config = 
quantization_config.pop("dynamic_config") - if dynamic_config is not None: - for name in dynamic_config.keys(): + regex_config = quantization_config.pop("regex_config") + if regex_config is not None: + for name in regex_config.keys(): regex_name = to_standard_regex(name) - extra_config[regex_name] = {**{k: dynamic_config[name][k] for k in REQUIRED_CONFIG_KEYS}} + extra_config[regex_name] = {**{k: regex_config[name][k] for k in REQUIRED_CONFIG_KEYS}} if len(extra_config) > 0: quantization_config["extra_config"] = extra_config @@ -399,3 +399,4 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model + diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index e5653169..bff5c4f4 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -39,6 +39,7 @@ is_nv_fp, set_amax_for_all_moe_layers, set_module, + to_standard_regex, ) from auto_round.wrapper import WrapperWALayer @@ -215,6 +216,13 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs): for key in neq_keys: if layer_config[layer_name][key] is not None: extra_config[layer_name][key] = layer_config[layer_name][key] + + regex_config = quantization_config.pop("regex_config") + if regex_config is not None: + for name in regex_config.keys(): + regex_name = to_standard_regex(name) + extra_config[regex_name] = {**{k: regex_config[name][k] for k in REQUIRED_CONFIG_KEYS}} + if len(extra_config) > 0: quantization_config["extra_config"] = extra_config names = list(layer_config.keys()) @@ -254,3 +262,4 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model + diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py index f2fd0515..9fc138b6 100644 --- a/auto_round/export/export_to_llmcompressor/export_to_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_fp.py @@ -244,3 +244,4 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model + diff --git a/auto_round/export/export_to_llmcompressor/utils.py b/auto_round/export/export_to_llmcompressor/utils.py index 378b78a5..c2cb1ac4 100644 --- a/auto_round/export/export_to_llmcompressor/utils.py +++ b/auto_round/export/export_to_llmcompressor/utils.py @@ -1,33 +1,18 @@ -# Copyright (c) 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- from typing import Dict, List - -from auto_round.utils import matches_any_regex, to_standard_regex +from auto_round.utils import to_standard_regex, matches_any_regex -def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: +def generate_ignore_regex_list(regex_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: """ - Generate ignore regex list for llm_compressor based on dynamic_config and layer_config. + Generate ignore regex list for llm_compressor based on regex_config and layer_config. Rules: - 1. Any layer in dynamic_config with bits >= 16 is ignored. + 1. Any layer in regex_config with bits >= 16 is ignored. 2. Any layer in layer_config with bits >= 16 is ignored if not already included. 3. Output regex patterns are normalized for llm_compressor ('re:...' style). - + Args: - dynamic_config (Dict[str, Dict]): dynamic quantization config + regex_config (Dict[str, Dict]): dynamic quantization config layer_config (Dict[str, Dict]): layer-wise quantization config Returns: @@ -36,17 +21,16 @@ def generate_ignore_regex_list(dynamic_config: Dict[str, Dict], layer_config: Di prefix = "re:" ignore_regex: List[str] = [] - # Step 1: Add dynamic_config keys with bits >= 16 - for key, cfg in dynamic_config.items(): + # Step 1: Add regex_config keys with bits >= 16 + for key, cfg in regex_config.items(): bits = cfg.get("bits") if bits > 8: ignore_regex.append(prefix + to_standard_regex(key)) - # Step 2: Add layer_config keys if bits >= 16 and not already included + # Step 2: Add all full named layer from layer_config if bits >= 16 for key, cfg in layer_config.items(): bits = cfg.get("bits") - - if not matches_any_regex(key, ignore_regex, prefix): + if bits > 8: ignore_regex.append(key) return ignore_regex diff --git a/auto_round/utils.py b/auto_round/utils.py index 304a720b..6d95fc2b 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2025 Intel Corporation +# Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,40 +12,2821 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict, List +import collections.abc +import copy +import gc +import importlib +import json +import os +import re +import sys +from collections import UserDict +from enum import Enum +from functools import lru_cache +from pathlib import Path +from typing import Any, Callable, List, Tuple, Union -from auto_round.utils import matches_any_regex, to_standard_regex +import cpuinfo +import torch +import transformers +from packaging import version +from torch.amp import autocast +from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType +from auto_round.logger import logger +from auto_round.schemes import QuantizationScheme -def generate_ignore_regex_list(regex_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: +SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") + +deepspeed_exists = False +if importlib.util.find_spec("deepspeed"): # check if deepspeed is installed + deepspeed_exists = True + + +class SupportedFormats: + + def __init__(self): + self._support_format = ( + "auto_round", + "auto_gptq", + "auto_awq", + "auto_round:auto_gptq", + "auto_round:gptqmodel", + "auto_round:auto_awq", + "auto_round:llm_compressor", + "itrex", + "itrex_xpu", + "fake", + "llm_compressor", + ) + self._gguf_format = tuple(sorted(GGUF_CONFIG.keys())) + self._support_list = self._support_format + self._gguf_format + + def __contains__(self, key): + return True if key in self._support_list else False + + def __str__(self): + # Return "(%s)" % ', '.join(self._support_format + ("gguf:q*_0", "gguf:q*_1", "gguf:q*_k_s")) + return "(%s)" % ", ".join(self._support_list) + + def __getitem__(self, key): + return self._support_list[key] + + +SUPPORTED_DTYPES = ("int", "mx_fp", "fp", "nv_fp") +SUPPORTED_FORMATS = SupportedFormats() +SUPPORTED_LAYER_TYPES = (torch.nn.Linear, transformers.pytorch_utils.Conv1D) + +# Changed to str as it relies on triton or others lib to load this +INNER_SUPPORTED_LAYER_TYPES = ("FP8Linear",) +# INNER_SUPPORTED_LAYER_TYPES = (transformers.integrations.finegrained_fp8.FP8Linear,) + +if deepspeed_exists: + from deepspeed.module_inject import LinearAllreduce, LinearLayer + + SUPPORTED_LAYER_TYPES = SUPPORTED_LAYER_TYPES + (LinearLayer, LinearAllreduce) + + +def infer_bits_by_data_type(data_type: str): + """Infer bits by data_type + + Args: + data_type (str): data_type + + Returns: + int: bits inferred by data_type, None means cannot infer correct bits by data_type + """ + if data_type is None: + return 16 + for supported_dtype in SUPPORTED_DTYPES: + if data_type.startswith(supported_dtype) and len(data_type) > len(supported_dtype): + ##first check the following two bits + suc_2str = data_type[len(supported_dtype) : len(supported_dtype) + 2] + if str.isdigit(suc_2str): + return int(suc_2str) + if str.isdigit(data_type[len(supported_dtype)]): + return int(data_type[len(supported_dtype)]) + return None + + +class LazyImport(object): + """Lazy import python module till use.""" + + def __init__(self, module_name): + """Init LazyImport object. + + Args: + module_name (string): The name of module imported later + """ + self.module_name = module_name + self.module = None + + def __getattr__(self, name): + """Get the attributes of the module by name.""" + try: + self.module = importlib.import_module(self.module_name) + mod = getattr(self.module, name) + except: + spec = importlib.util.find_spec(str(self.module_name + "." 
+ name)) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + def __call__(self, *args, **kwargs): + """Call the function in that module.""" + function_name = self.module_name.split(".")[-1] + module_name = self.module_name.split(f".{function_name}")[0] + self.module = importlib.import_module(module_name) + function = getattr(self.module, function_name) + return function(*args, **kwargs) + + +auto_gptq = LazyImport("auto_gptq") +htcore = LazyImport("habana_frameworks.torch.core") + + +################ Check available sys.module to decide behavior ################# +def is_package_available(package_name: str) -> bool: + """Check if the package exists in the environment without importing. + + Args: + package_name (str): package name + """ + from importlib.util import find_spec + + package_spec = find_spec(package_name) + return package_spec is not None + + +## check hpex +if is_package_available("habana_frameworks"): + _hpex_available = True + import habana_frameworks.torch.hpex # pylint: disable=E0401 +else: + _hpex_available = False + + +@torch._dynamo.disable() +@lru_cache(None) +def is_hpex_available(): + return _hpex_available + + +def get_module(module, key): + """Get module from model by key name. + + Args: + module (torch.nn.Module): original model + key (str): module name to be replaced + """ + name_list = key.split(".") + for name in name_list: + module = getattr(module, name, None) + return module + + +def set_module(model, key, new_module): + """Set new module into model by key name. + + Args: + model (torch.nn.Module): original model + key (str): module name to be replaced + new_module (torch.nn.Module): new module to be inserted + """ + module = model + name_list = key.split(".") + for name in name_list[:-1]: + if hasattr(module, name): + module = getattr(module, name) + setattr(module, name_list[-1], new_module) + + +def get_scale_shape(weight, group_size): + """Computes the shape of the scale tensor for quantization based on the weight tensor and group size. + + Args: + weight (torch.Tensor): The weight tensor of the layer. + group_size (int): The size of the groups for quantization. + + Returns: + The shape of the scale tensor to be used for quantization. + """ + if group_size == 0: + return 1 + elif group_size == -1 or weight.shape[1] < group_size: + shape = weight.shape[0] + else: + shape = weight.shape[0] * ((weight.shape[1] + group_size - 1) // group_size) + + return shape + + +def unsupport_meta_device(model): + """Checks if the model is a valid model for auto_round. + + Args: + model: The model to be checked. + + Returns: + bool: True if the model is valid, False otherwise. + """ + target_device = None + for param in model.parameters(): + if target_device is None: + target_device = param.device + if param.device != target_device: + if param.device.type == "meta" or target_device.type == "meta": + return True + if target_device.type == "meta": + if hasattr(model, "path"): + return False + else: + return True + return False + + +def to_device(input, device=torch.device("cpu")): + """Moves input data to the specified device. + + Args: + input: The input data to be moved. + device: The target device. + + Returns: + The input data on the specified device. 
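+
+    Example:
+        >>> to_device({"input_ids": torch.tensor([1, 2])}, torch.device("cpu"))
+        {'input_ids': tensor([1, 2])}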
+ """ + if input is None: + return None + if isinstance(input, torch.Tensor): + return input.to(device) + if isinstance(input, dict) or isinstance(input, UserDict): + for inp in input.keys(): + input[inp] = to_device(input[inp], device) + + elif isinstance(input, list) or isinstance(input, tuple): + if len(input) == 0: + return input + input_res = [] + for inp in input: + input_res.append(to_device(inp, device)) + if isinstance(input, tuple): + input_res = tuple(input_res) + input = input_res + + return input + + +def mv_module_from_gpu(module, low_cpu_mem_usage=False): + """Moves module from gpu to cpu or meta if low_cpu_mem_usage is true. + + Args: + module: The module to be moved. + low_cpu_mem_usage: Whether to use low CPU memory. If true, move module to meta. + + Returns: + The module on the specified device. + """ + if hasattr(module, "device"): + target_device = "meta" if low_cpu_mem_usage else "cpu" + if module.device.type == target_device: + return module + else: + return module.to(target_device) + else: + if low_cpu_mem_usage: + return module.to("meta") + else: + return module.to("cpu") + + +def to_dtype(input, dtype=torch.float32): + """Moves input data to the specified data type. + + Args: + input: The input data to be moved. + dtype: The target data type. + + Returns: + The input data on the specified data type. + """ + if input is None: + return None + if isinstance(input, torch.Tensor): + return input.to(dtype) + if isinstance(input, dict) or isinstance(input, UserDict): + for inp in input.keys(): + input[inp] = to_dtype(input[inp], dtype) + + elif isinstance(input, list) or isinstance(input, tuple): + if len(input) == 0: + return input + input_res = [] + for inp in input: + input_res.append(to_dtype(inp, dtype)) + if isinstance(input, tuple): + input_res = tuple(input_res) + input = input_res + + return input + + +def check_is_cpu(device): + """Check if the device is a CPU. + + Args: + device: The device to be checked. + + Returns: + bool: True if the device is a CPU, False otherwise. + """ + return device == torch.device("cpu") or device == "cpu" + + +def get_common_prefix(paths): + # Split each path into components and find the common prefix + split_paths = [path.split(".") for path in paths] + common_prefix = split_paths[0] + for path in split_paths[1:]: + common_prefix = [comp for comp, other in zip(common_prefix, path) if comp == other] + return ".".join(common_prefix) + + +def extract_block_names_to_str(quant_block_list): + if not isinstance(quant_block_list, (list, tuple)): + return None + # Extract common prefix for each list + prefixes = [get_common_prefix(blocks) for blocks in quant_block_list] + # Join prefixes into a single string + return ",".join(prefixes) + + +def find_matching_blocks(model, all_blocks, to_quant_block_names): + """ + Find and return matching blocks in the model based on to_quant_block_names. + + Args: + model: The model (not used in this specific function but kept for completeness). + all_blocks: List of lists, where each inner list contains full block names in the model. + to_quant_block_names: Comma-separated string of target block names to match. + + Returns: + target_blocks: List of lists containing full paths of matching blocks in the model. 
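+
+    Example (illustrative block names):
+        >>> find_matching_blocks(None, [["model.layers.0", "model.layers.1"]], "model.layers")
+        [['model.layers.0', 'model.layers.1']]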
+ """ + if not to_quant_block_names: + return all_blocks + to_quant_block_list = to_quant_block_names + if isinstance(to_quant_block_names, list) or isinstance(to_quant_block_names, tuple): + return to_quant_block_names + if isinstance(to_quant_block_names, str): + to_quant_block_list = [name.strip() for name in to_quant_block_names.split(",")] + target_blocks = [] + for block_list in all_blocks: + matched_sublist = [] + for name in to_quant_block_list: + matches = [block for block in block_list if re.search(name, block)] + if matches: + matched_sublist.extend(matches) + if matched_sublist: + target_blocks.append(matched_sublist) + if not target_blocks: + raise ValueError( + "No block names matched. Please check the input for to_quant_block_name," + "or set to_quant_block_name to None to automatically match quantizable blocks." + ) + return target_blocks + + +def get_block_names(model, quant_vision=False): + """Get the block names for transformers-like networks. + + Args: + model: The model. + + Returns: + block_names: A list whose elements are list of block's layer names + """ + from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK + + def _get_llm_block_names(model): + block_names = [] + target_modules = [] + for n, m in model.named_modules(): + if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: + target_modules.append((n, m)) + break ## only find the first modulelist, may be not robust + for i, target_m in enumerate(target_modules): + block_names.append([]) + for n, m in target_m[1].named_children(): + block_names[i].append(target_m[0] + "." + n) + return block_names + + def _get_vlm_block_names(model, quant_vision=False): + if ( + hasattr(model, "config") + and hasattr(model.config, "model_type") + and model.config.model_type in SPECIAL_MULTIMODAL_BLOCK.keys() + ): + return SPECIAL_MULTIMODAL_BLOCK.get(model.config.model_type)(model, quant_vision=quant_vision) + block_names = [] + target_modules = [] + vision_blocks_tuple = ("vision", "visual", "image", "img") + last_block_name = "" + for n, m in model.named_modules(): + if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: + if quant_vision or all(key not in n.lower() for key in (vision_blocks_tuple)): + if last_block_name and last_block_name in n: + continue + target_modules.append((n, m)) + last_block_name = n + for i, target_m in enumerate(target_modules): + block_names.append([]) + for n, m in target_m[1].named_children(): + block_names[i].append(target_m[0] + "." + n) + return block_names + + if quant_vision or not is_pure_text_model(model): + return _get_vlm_block_names(model, quant_vision=quant_vision) + else: + return _get_llm_block_names(model) + + +def collect_best_params(block): + params = {} + for n, m in block.named_modules(): + if hasattr(m, "orig_layer"): + params[n] = {} + for key in m.params.keys(): + params[n][key] = copy.deepcopy(m.params[key].data) + return params + + +def block_forward(block, input_ids, input_others, amp=False, amp_dtype=torch.float16, device=torch.device("cpu")): + """Performs a forward pass through a block with the given inputs. + + Args: + block: The block to perform the forward pass on. + input_ids: The input IDs. + input_others: A dictionary containing other input data. + amp: A boolean indicating whether to use automatic mixed precision. + amp_dtype: The data type for automatic mixed precision. + device: The target device. + + Returns: + output: The output of the forward pass. 
+ """ + if input_ids.device != device: + input_ids = to_device(input_ids, device) + input_others = to_device(input_others, device) + input_tuple = input_others.pop("positional_inputs", None) + if "alibi" in input_others.keys() and input_others["alibi"] is not None: + alibi = input_others["alibi"] + input_others["alibi"] = alibi.reshape(-1, alibi.shape[2], alibi.shape[3]) + if amp: + with autocast(device_type=device.split(":")[0], dtype=amp_dtype): # pragma: no cover + output = block(input_ids, *input_tuple, **input_others) + else: + output = block(input_ids, *input_tuple, **input_others) + if isinstance(output, list) or isinstance(output, tuple): + output = output[0] + return output + + +def check_to_quantized(config): + """Checks if the configuration is valid for quantization. + + Args: + config (dict or object): The configuration to check. It can be either a + dictionary with a 'bits' key or an object with a 'bits' attribute. + + Returns: + bool: True if the configuration is valid for quantization (bits <= 8), + False otherwise. + """ + if isinstance(config, (dict, QuantizationScheme)): + bits = int(config.get("bits", 16)) + act_bits = int(config.get("act_bits", 16)) + elif hasattr(config, "orig_layer"): + bits = int(config.orig_layer.bits) if hasattr(config.orig_layer, "bits") else 16 + act_bits = int(config.orig_layer.act_bits) if hasattr(config.orig_layer, "act_bits") else 16 + else: + bits = int(config.bits) if hasattr(config, "bits") else 16 + act_bits = int(config.act_bits) if hasattr(config, "act_bits") else 16 + + return bits <= 8 or act_bits <= 8 + + +def detect_device_count(): + """Detects the number of available computation devices. + + This function checks if CUDA is available. If it is, it returns the count + of available CUDA devices. If not, it attempts to import the Habana + device framework to return the count of Habana devices. If the import + fails or no devices are found, it returns 0. + + Returns: + int: The number of available devices (CUDA or Habana). + """ + if torch.cuda.is_available(): + return torch.cuda.device_count() + else: + try: + import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401 + + return hthpu.device_count() + except ImportError: + return 0 + + +def detect_device(device: Union[str, int, torch.device] = None) -> str: + """Detects the appropriate computation device. + + This function determines the device to use for computations. It can take + a specific device index or default to 'auto'. The function checks for + available devices in the following order: CUDA, Habana, and finally CPU. + + Args: + device (str, int, or torch.device, optional): The desired device. + If 'auto' or None, the function will determine the best device + automatically. + + Returns: + str: The device to use for computations, formatted as a string. 
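+
+    Example (the result depends on the available hardware): on a CUDA machine,
+    detect_device(0) resolves to "cuda:0" and detect_device(None) to "cuda";
+    on a CPU-only machine both fall back to "cpu".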
+ """ + + def is_valid_digit(s): + try: + num = int(s) + return 0 <= num + except: + return False + + dev_idx = None + if is_valid_digit(device): + dev_idx = int(device) + device = "auto" + if isinstance(device, str) and "," in device: # device is "0,1,2" + device_list = [int(dev) for dev in device.split(",") if dev.isdigit()] + dev_idx = device_list[0] if device_list else None + device = "auto" + if device is None or device == "auto": + if torch.cuda.is_available(): + device = torch.device("cuda") + # logger.info("Using GPU device") + elif is_hpex_available(): # pragma: no cover + device = torch.device("hpu") + # logger.info("Using HPU device") + elif torch.xpu.is_available(): # pragma: no cover + device = torch.device("xpu") + # Use CPU as a fallback + else: + device = torch.device("cpu") + # logger.info("Using CPU device") + if dev_idx is not None and str(device) != "cpu": + device = str(device) + f":{dev_idx}" + return str(device) + elif isinstance(device, torch.device): + device = str(device) + elif isinstance(device, str): ## for cuda:0 + if device == "tp": # pragma: no cover + # should not specify card, e.g., cuda:0 + if torch.cuda.is_available(): + device = "cuda" + elif is_hpex_available(): + device = "hpu" + else: + device = "cpu" + else: + device = device + return device + + +class CpuInfo(object): + """Get CPU Info.""" + + def __init__(self): + """Get whether the cpu numerical format is bf16, the number of sockets, cores and cores per socket.""" + self._bf16 = False + info = cpuinfo.get_cpu_info() + if "arch" in info and "X86" in info["arch"]: + cpuid = cpuinfo.CPUID() + max_extension_support = cpuid.get_max_extension_support() + if max_extension_support >= 7: + eax = cpuid._run_asm( + b"\xb9\x01\x00\x00\x00", # mov ecx, 1 + b"\xb8\x07\x00\x00\x00" b"\x0f\xa2" b"\xc3", # mov eax, 7 # cpuid # ret + ) + self._bf16 = bool(eax & (1 << 5)) + + @property + def bf16(self): + """Get whether it is bf16.""" + return self._bf16 + + +def is_local_path(path): + """Checks if a given path exists locally. + + Args: + path (str): The path to check. + + Returns: + bool: True if the path exists locally, False otherwise. + """ + format_list = ( + "json", + "txt", + ) + flag = None + for x in format_list: + flag = True if x in path else flag + return flag and os.path.exists(path) + + +def convert_dtype_str2torch(str_dtype): + """Converts a string dtype to its corresponding PyTorch dtype. + + Args: + str_dtype (str): The string representation of the dtype. + + Returns: + torch.dtype: The PyTorch dtype. + + Raises: + ValueError: If the input str_dtype is unsupported. + """ + if isinstance(str_dtype, torch.dtype) or str_dtype is None: + return str_dtype + if str_dtype == "int8": + return torch.int8 + elif str_dtype == "fp32" or str_dtype == "float32" or str_dtype == "auto": + return torch.float + elif str_dtype == "fp16" or str_dtype == "float16": + return torch.float16 + elif str_dtype == "bf16" or str_dtype == "bfloat16": + return torch.bfloat16 + else: + raise ValueError(f"Unsupported string dtype '{str_dtype}' for conversion to torch dtype.") + + +def convert_dtype_torch2str(dtype): + """Converts a PyTorch dtype to its corresponding string representation. + + Args: + dtype: PyTorch dtype or str. The dtype to convert. + + Returns: + str: The string representation of the dtype. + + Raises: + ValueError: If the input dtype is unsupported. 
+ """ + if isinstance(dtype, str) or dtype is None: + return dtype + if dtype == torch.int8: + return "int8" + elif dtype == torch.float: + return "fp32" + elif dtype == torch.float16: + return "fp16" + elif dtype == torch.bfloat16: + return "bf16" + elif isinstance(dtype, str) and dtype in ["int8", "fp32", "fp16", "bf16"]: + return dtype + else: + raise ValueError(f"Unsupported PyTorch dtype '{dtype}' for conversion to string dtype.") + + +def convert_dtype_torch2str_hf(dtype): + """Converts a PyTorch dtype to its corresponding huggingface string dtype, e.g. torch.float32 -> 'float32'. + + Args: + dtype: PyTorch dtype or str. The dtype to convert. + + Returns: + str: The string representation of the dtype. + + Raises: + ValueError: If the input str_dtype is unsupported. + """ + if dtype is None: + return dtype + if isinstance(dtype, str): + if "float" not in dtype and "int" not in dtype: + dtype = convert_dtype_str2torch(dtype) + else: + return dtype + str_dtype = str(dtype) + if "." not in str_dtype: + raise ValueError(f"Unsupported pytorch dtype '{dtype}' for conversion to huggingface str dtype") + str_dtype = str_dtype.split(".")[1] + return str_dtype + + +def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): + """Checks the availability of memory on the specified device for processing inputs using a given weight tensor. + + Args: + device (str): The device type ('cuda' for GPU or 'hpu' for HPU). + inputs (torch.Tensor): Input tensor. + weight (torch.Tensor): Weight tensor. + org_seqlen (int): Original sequence length. + org_bs (int): Original batch size. + + Returns: + tuple: A tuple containing availability status (bool), modified sequence length (int), + and modified batch size (int). + """ + weight_memory = weight.numel() * weight.element_size() + if "cuda" in device: + current_gpu_index = torch.cuda.current_device() + total_memory = torch.cuda.get_device_properties(current_gpu_index).total_memory + used_memory = torch.cuda.memory_allocated(current_gpu_index) + free_space = total_memory - used_memory + elif "hpu" in device: # pragma: no cover + current_hpu_index = torch.hpu.current_device() + free_space = torch.hpu.memory_reserved(current_hpu_index) + else: + return True, org_seqlen, org_bs + + free_space = free_space - weight_memory * 10 # for min_max_scale & grad usage + seqlen = org_seqlen + bs = org_bs + in_feature = weight.shape[1] + out_feature = weight.shape[0] + while seqlen >= 128: + input_size = bs * seqlen * in_feature + output_size = bs * seqlen * out_feature + input_output_memory = 2 * (input_size * inputs.element_size() + output_size * inputs.element_size()) + if input_output_memory < free_space: + return True, seqlen, bs + seqlen = seqlen // 2 + bs = 1 + + return False, seqlen, bs + + +def get_layer_names_in_block( + model, supported_types=(torch.nn.Linear, transformers.pytorch_utils.Conv1D), quant_block_list=None, class_names=None +): + """Retrieves the names of layers within each block of the model. + + Returns: + list: A list of strings, where each string is the name of a layer + within a block of the model. 
+ """ + if class_names is None: + class_names = [] + for n, m in model.named_modules(): + if isinstance(m, supported_types) or (class_names is not None and m.__class__.__name__ in class_names): + m.tmp_name = n + layers_in_block = [] + if bool(quant_block_list): + all_blocks = quant_block_list + else: + all_blocks = get_block_names(model) + for block_names in all_blocks: + for block_name in block_names: + block = get_module(model, block_name) + for n, m in block.named_modules(): + if hasattr(m, "tmp_name"): + layers_in_block.append(m.tmp_name) + return layers_in_block + + +def is_autoround_exllamav2_available(): + """Checks if the AutoRound ExLlamaV2 kernels are available. + + Returns: + bool: + True if the AutoRound ExLlamaV2 kernels are available, False otherwise. + """ + res = True + try: + from autoround_exllamav2_kernels import gemm_half_q_half, make_q_matrix + except ImportError as e: + res = False + return res + + +def get_library_version(library_name): + from packaging.version import Version + + python_vesion = Version(sys.version.split()[0]) + if python_vesion < Version("3.8"): + import warnings + + warnings.filterwarnings("ignore", category=DeprecationWarning) + import pkg_resources # pylint: disable=E0401 + + try: + version = pkg_resources.get_distribution(library_name).version + return version + except pkg_resources.DistributionNotFound: + return f"{library_name} is not installed" + else: + import importlib.metadata # pylint: disable=E0401 + + try: + version = importlib.metadata.version(library_name) + return version + except importlib.metadata.PackageNotFoundError: + return f"{library_name} is not installed" + + +def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False): + """ + Configures and returns a QuantLinear class based on the specified backend and parameters. + + Args: + backend (str): The backend to be used for quantization. Supported values include "qigen", "triton", "marlin", + "exllama", and "cuda". + bits (int, optional): The number of bits for quantization. Default is 4. + group_size (int, optional): The group size for quantization. Default is 128. + sym (bool, optional): Flag indicating whether to use symmetric quantization. Default is False. + + Returns: + class: The dynamically imported QuantLinear class configured according to the specified parameters. 
+ """ + use_triton = True + if bits not in [2, 4, 8]: + use_triton = False + disable_exllamav2 = True + disable_exllamav1 = False + disable_marlin = True + use_qigen = False + if "qigen" in backend: + use_triton = False + use_qigen = True + elif "triton" in backend: + use_triton = True + elif "marlin" in backend and sym: + use_triton = False + disable_marlin = False + elif "exllama" in backend: ##need v1 code to export + use_triton = True ##same with triton + disable_marlin = True + elif "cuda" in backend: + use_triton = False + disable_marlin = True + disable_exllamav2 = True + disable_exllamav1 = True + if use_triton: + from auto_round.export.export_to_autogptq.qlinear_triton import QuantLinear + + return QuantLinear + try: + import auto_gptq # pylint: disable=E0401 + except: + logger.error(f"please install auto_gptq via 'pip install auto-gptq' to support exporting to {backend}") + exit() + + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear # pylint: disable=E0401 + + version = get_library_version("auto_gptq") + from packaging.version import Version + + if Version(version) < Version("0.7.2"): + QuantLinear = dynamically_import_QuantLinear( + use_triton=use_triton, + desc_act=False, + group_size=group_size, + bits=bits, + disable_exllama=disable_exllamav1, + disable_exllamav2=disable_exllamav2, + use_qigen=use_qigen, + disable_marlin=disable_marlin, + ) + else: + QuantLinear = dynamically_import_QuantLinear( # pylint: disable=E1123 + use_triton=use_triton, + desc_act=False, + group_size=group_size, + bits=bits, + disable_exllama=disable_exllamav1, + disable_exllamav2=disable_exllamav2, + use_qigen=use_qigen, + use_marlin=not disable_marlin, + ) + return QuantLinear + + +def _clear_memory_for_cpu_and_cuda(tensor=None): + if isinstance(tensor, list): + for i in range(len(tensor)): + tensor[i] = None + if tensor is not None: + del tensor + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + if torch.xpu.is_available(): + torch.xpu.empty_cache() + + +@torch._dynamo.disable() +def clear_memory(tensor=None): + if is_hpex_available(): + # hpu does not have empty_cache + return + else: + _clear_memory_for_cpu_and_cuda(tensor) + + +def compare_versions(v1, v2): + return version.parse(v1) >= version.parse(v2) + + +def torch_version_at_least(version_string): + return compare_versions(torch.__version__, version_string) + + +TORCH_VERSION_AT_LEAST_2_6_PRE_RELEASE = torch_version_at_least("2.5.99") +TORCH_VERSION_AT_LEAST_2_6 = torch_version_at_least("2.6.0") +TORCH_VERSION_AT_LEAST_2_5 = torch_version_at_least("2.5.0") +TORCH_VERSION_AT_LEAST_2_4 = torch_version_at_least("2.4.0") + + +# Note on HPU usage: +# There are two modes available for enabling auto-round on HPU: +# 1. Compile Mode +# 1) Use PyTorch version ≥ 2.4 (Intel® Gaudi® v1.18 or later) +# 2) Set `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1` +# The compile mode can speed up quantization process but still in experimental stage. +# 2. 
Lazy Mode (By default) + + +def is_hpu_lazy_mode(): + return os.getenv("PT_HPU_LAZY_MODE") != "0" + + +def _use_hpu_compile_mode(): + return TORCH_VERSION_AT_LEAST_2_4 and not is_hpu_lazy_mode() + + +def compile_func_on_hpu(func): + if _use_hpu_compile_mode(): + return torch.compile(func, backend="hpu_backend") + return func + + +def compile_func_on_cuda_or_cpu(func): + return torch.compile(func) + + +def compile_func( + fun: Union[torch.nn.Module, Callable], device: Union[torch.nn.Module, Callable] +) -> Union[torch.nn.Module, Callable]: + """Compile function on the specified device.""" + if "hpu" in str(device): + return compile_func_on_hpu(fun) ## use auto by default + else: + return compile_func_on_cuda_or_cpu(fun) + + +def is_numba_available(): # pragma: no cover + """Check if Numba is available.""" + try: + import numba + + return True + except ImportError: + return False + + +def _is_tbb_installed(): # pragma: no cover + import importlib.metadata + + try: + importlib.metadata.version("tbb") + return True + except importlib.metadata.PackageNotFoundError: + return False + + +def _is_tbb_configured(): # pragma: no cover + try: + from numba.np.ufunc.parallel import _check_tbb_version_compatible + + # check if TBB is present and compatible + _check_tbb_version_compatible() + + return True + except ImportError as e: + logger.warning_once(f"TBB not available: {e}") + return False + + +def is_tbb_available(): # pragma: no cover + """Check if TBB is available.""" + if not _is_tbb_installed(): + logger.warning_once("TBB is not installed, please install it with `pip install tbb`.") + return False + if not _is_tbb_configured(): + logger.warning_once( + ( + "TBB is installed but not configured correctly. \n" + "Please add the TBB library path to `LD_LIBRARY_PATH`, " + "for example: `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/`." + ) + ) + return False + return True + + +def can_pack_with_numba(): # pragma: no cover + """Check if Numba and TBB are available for packing. + + To pack tensor with Numba, both Numba and TBB are required, and TBB should be configured correctly. + """ + if not is_numba_available(): + logger.warning_once("Numba is not installed, please install it with `pip install numba`.") + return False + if not is_tbb_available(): + return False + return True + + +def get_fp_layer_names(model, fp_layers): + """Identifies and returns layers in the model to exclude from quantization. + + This function processes a comma-separated list of fully precision (FP) layers, + matches them to the names of layers in the model, and returns a list of such + layers to exclude from quantization. + + Args: + model (torch.nn.Module): The model whose layers will be inspected. + fp_layers (str): A comma-separated string of layer names to be excluded + from quantization. Whitespace is ignored in this string. + + Returns: + list: A list of layer names that match the specified FP layers or are + subcomponents of those layers. + """ + if not fp_layers: + return [] + fp_layers = fp_layers.replace(" ", "").split(",") + all_layer_names = [] + for n, m in model.named_modules(): + if isinstance(m, (torch.nn.Linear, transformers.pytorch_utils.Conv1D)): + all_layer_names.append(n) + not_to_quantized_layers = [] + + for fp_layer in fp_layers: + if fp_layer == "": + continue + if fp_layer in all_layer_names: + not_to_quantized_layers.append(fp_layer) + continue + if fp_layer[-1].isdigit(): + fp_layer = fp_layer + "." 
##tricky setting + for name in all_layer_names: + if fp_layer in name: + not_to_quantized_layers.append(name) + + return not_to_quantized_layers + + +def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=None): + """Checks if a model is compatible with the AutoAWQ GEMM kernel. + + Args: + model: The model object to evaluate, typically a PyTorch model. + bits (int): The number of bits for quantization (must be 4 for compatibility). + group_size (int): The group size for quantization. + sym (bool): Whether symmetric quantization is used (not utilized in the current function logic). + layer_configs (dict, optional): A dictionary mapping layer names to configurations, where each + configuration can specify a custom number of bits for the layer. + + Returns: + tuple: A tuple containing: + - bool: `True` if the model is compatible, `False` otherwise. + - str: An error message describing why the model is incompatible, or an empty string if compatible. + """ + if bits != 4: + return False, "AutoAWQ GEMM kernel only supports 4 bits" + for n, m in model.named_modules(): + if isinstance(m, transformers.pytorch_utils.Conv1D): + return False, "AutoAWQ GEMM kernel does not support conv1d" + + layer_names = get_layer_names_in_block(model) + for layer_name in layer_names: + if ( + layer_configs is not None + and layer_name in layer_configs.keys() + and layer_configs[layer_name].get("bits", bits) > 8 + ): + continue + + layer = get_module(model, layer_name) + if layer.in_features % group_size != 0: + return False, f"Layer {layer_name} in_features is not multiple of group_size {group_size}" + if layer.out_features % (32 // bits) != 0: + return False, f"Layer {layer_name} out_features is not multiple of 32 // bits" + + return True, "" + + +def get_device_and_parallelism(device: Union[str, torch.device, int]) -> Tuple[str, bool]: + if isinstance(device, str): + devices = device.replace(" ", "").split(",") + elif isinstance(device, int): + devices = [str(device)] + else: + devices = [device] + if all(s.isdigit() for s in devices) and len(devices) > 1 and torch.cuda.is_available(): + device = "cuda" + parallelism = True + elif all(s.isdigit() for s in devices) and len(devices) > 1 and torch.xpu.is_available(): + device = "xpu" + parallelism = False + # pragma: no cover + elif device == "auto": + device = detect_device(device) + parallelism = True + else: + device = detect_device(device) + parallelism = False + return device, parallelism + + +def set_cuda_visible_devices(device): + devices = device.replace(" ", "").split(",") + if all(s.isdigit() for s in devices): + if "CUDA_VISIBLE_DEVICES" in os.environ: + current_visible_devices = os.environ["CUDA_VISIBLE_DEVICES"] + current_visible_devices = current_visible_devices.split(",") + indices = [int(device) for device in devices] + try: + pick_device = [current_visible_devices[i] for i in indices] + except: + raise ValueError( + "Invalid '--device' value: It must be smaller than the number of available devices." + " For example, with CUDA_VISIBLE_DEVICES=4,5, " + "--device 0,1 is valid, but --device 4,5 is not supported." + ) + visible_devices = ",".join(pick_device) + os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices + else: + os.environ["CUDA_VISIBLE_DEVICES"] = device + + +def is_debug_mode(): + """Checks if the Python interpreter is running in debug mode. + + Returns: + bool: True if debugging is enabled, False otherwise. 
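+
+    Example (illustrative; a plain `python script.py` run has no tracer active):
+        >>> is_debug_mode()  # doctest: +SKIP
+        False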
+ """ + return sys.gettrace() is not None or sys.flags.debug == 1 + + +def get_layer_features(layer): + """Extracts input and output feature dimensions for supported layers.""" + if isinstance(layer, torch.nn.Linear): + return layer.in_features, layer.out_features + elif isinstance(layer, transformers.pytorch_utils.Conv1D): # TODO: Verify correctness + return layer.weight.shape[0], layer.weight.shape[1] + elif isinstance(layer, torch.nn.Embedding): + return layer.num_embeddings, layer.embedding_dim + elif deepspeed_exists and isinstance(layer, (LinearLayer, LinearAllreduce)): + return layer.weight.shape[1], layer.weight.shape[0] # (input_dim, output_dim) + return None, None # Unsupported layer type + + +def get_gguf_architecture(dir_model, model_type=ModelType.TEXT): + from auto_round.export.export_to_gguf.convert_hf_to_gguf import ( + ModelBase, + get_model_architecture, + ) + + is_mistral_format = False + if isinstance(dir_model, str): + dir_model = Path(dir_model) + + hparams = ModelBase.load_hparams(dir_model, is_mistral_format) + if isinstance(hparams, dict): + tmp_model_type = hparams["model_type"] + else: + tmp_model_type = hparams.model_type + if "mistral" == tmp_model_type: + is_mistral_format = True + hparams = ModelBase.load_hparams(dir_model, is_mistral_format) + if not is_mistral_format: + model_class = get_model_architecture(hparams, model_type) + elif model_type == ModelType.MMPROJ: + assert hparams.get("vision_encoder") is not None, "This model does not support multimodal" + model_class = "PixtralModel" + else: + model_class = "MistralModel" + return model_class + + +def _gguf_args_check(args_or_ar, formats: list[str] = None, model_type=ModelType.TEXT): + import argparse + + from auto_round.export.export_to_gguf.convert import download_convert_file + from auto_round.utils import logger + + formats = sorted(formats, key=lambda x: len(x)) + export_gguf = False + for f in formats: + if f.startswith("gguf"): + export_gguf = True + + if f.startswith("gguf") and f not in GGUF_CONFIG: + logger.error(f"{f} is not supported, please check.") + + redownload = False + if export_gguf: + try: + from auto_round.export.export_to_gguf.convert_hf_to_gguf import ( # pylint: disable=E0401 + ModelBase, + ModelType, + get_model_architecture, + ) + + if isinstance(args_or_ar.model, str): + model_path = args_or_ar.model + else: + model_path = args_or_ar.model.name_or_path + if not os.path.isdir(model_path): + model_path = download_hf_model(model_path) + model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) + if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: + logger.warning( + f"Current version of gguf export does not support for {model_architecture}," + " will re-download dependency file." + ) + redownload = True + except ModuleNotFoundError as e: + if "convert_hf_to_gguf" in str(e): + logger.warning("GGUF export dependency file is not found, download from github.") + redownload = True + except AttributeError as e: + raise ImportError( + "Please use the latest gguf-py, you can use the following command to install it:\n" + "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ." 
+ ) + download_convert_file(redownload) + + try: + from auto_round.export.export_to_gguf.convert_hf_to_gguf import ( # pylint: disable=E0401 + ModelBase, + ModelType, + ) + except ImportError as e: + raise ImportError( + "Please use the latest gguf-py, you can use the following command to install it:\n" + "git clone https://github.com/ggml-org/llama.cpp.git && cd llama.cpp/gguf-py && pip install ." + ) + if isinstance(args_or_ar.model, str): + model_path = args_or_ar.model + else: + model_path = args_or_ar.model.name_or_path + if not os.path.isdir(model_path): + model_path = download_hf_model(model_path) + model_architecture = get_gguf_architecture(model_path, model_type=ModelType.TEXT) + if model_architecture not in ModelBase._model_classes[ModelType.TEXT]: + logger.error(f"Model {model_architecture} is not supported to export gguf format.") + sys.exit(1) + + pattern = re.compile(r"q\d_k") + pre_dq_format = "" + unsupport_list, reset_list = [], [] + for format in GGUF_CONFIG: + if format in formats: + if format == "q6_k_s": + logger.warning("Please note that q6_k_s is q6_k.") + + if re.search(pattern, format): + if pre_dq_format and re.search(pattern, format).group() not in pre_dq_format: + logger.error(f"Cannot export {pre_dq_format} and {format} at the same time.") + sys.exit(-1) + else: + pre_dq_format = format + + unsupport_list, reset_list = [], [] + gguf_config = GGUF_CONFIG[format] + for k, v in gguf_config.items(): + if not hasattr(args_or_ar, k): + continue + if k == "data_type": + if re.search(r"q\d_1", format) and len(formats) > 1: + v = "int" + if k == "sym" and isinstance(args_or_ar, argparse.Namespace): + k = "asym" + v = not v + if getattr(args_or_ar, k) != v: + unsupport_list.append(f"{k}={getattr(args_or_ar, k)}") + reset_list.append(f"{k}={v}") + setattr(args_or_ar, k, v) + if len(unsupport_list) > 0: + logger.info( + f"format {format} does not support for {', '.join(unsupport_list)}," + f" reset to {', '.join(reset_list)}." + ) + # Removed obsolete commented-out block for improved readability and maintainability. 
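+    # Illustrative note (not part of the original patch): with formats=["gguf:q4_k_m"],
+    # for example, any user-supplied bits/group_size/sym that conflict with the
+    # q4_k_m preset in GGUF_CONFIG have been reset on args_or_ar above, and the
+    # adjusted object is returned below.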
+    return args_or_ar
+
+
+def _to_model_dtype(model, model_dtype):
+    if model_dtype is not None:
+        try:
+            if (model_dtype == "float16" or model_dtype == "fp16") and model.dtype != torch.float16:
+                model = model.to(torch.float16)
+            elif (
+                model_dtype == "bfloat16" or model_dtype == "bfp16" or model_dtype == "bf16"
+            ) and model.dtype != torch.bfloat16:
+                model = model.to(torch.bfloat16)
+            elif (model_dtype == "float32" or model_dtype == "fp32") and model.dtype != torch.float32:
+                model = model.to(torch.float32)
+        except Exception:
+            logger.error("please use more devices to fit the model or just use one device")
+            exit()
+    return model
+
+
+def set_fake_cuda_device_capability(func=None):
+    if func is not None:
+        torch.cuda.get_device_capability = func
+        return func
+
+    def fake_cuda():
+        return 100, 1
+
+    orig_func = torch.cuda.get_device_capability
+    torch.cuda.get_device_capability = fake_cuda
+    return orig_func
+
+
+def _is_fp8_model(model: torch.nn.Module) -> bool:
+    if not hasattr(model, "is_fp8"):
+        return False
+    else:
+        return model.is_fp8
+
+
+def _is_fp8_linear(module: torch.nn.Module) -> bool:
+    if hasattr(module, "is_fp8_linear"):
+        return module.is_fp8_linear
+    if not (isinstance(module, torch.nn.Linear) or module.__class__.__name__ == "FP8Linear"):
+        return False
+    if module.weight is None:
+        return False
+    if str(module.weight.dtype).startswith("torch.float8"):
+        return True
+    else:
+        return False
+
+
+def check_and_mark_fp8_model(model: torch.nn.Module) -> bool:
+    if _is_fp8_model(model):
+        return True
+    for n, m in model.named_modules():
+        if _is_fp8_linear(m):
+            m.is_fp8_linear = True
+            if not hasattr(model, "is_fp8"):
+                model.is_fp8 = True
+    if hasattr(model, "is_fp8"):
+        return True
+    return False
+
+
+def llm_load_model(
+    pretrained_model_name_or_path,
+    trust_remote_code=True,
+    model_dtype=None,
+    device="cpu",
+    low_cpu_mem_mode=0,
+    low_cpu_mem_tmp_dir=None,
+    **kwargs,
+):
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+
+    device_str, use_auto_mapping = get_device_and_parallelism(device)
+    torch_dtype = "auto"
+    if device_str is not None and "hpu" in device_str:
+        torch_dtype = torch.bfloat16
+
+    is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower()))
+    low_cpu_mem_usage = False
+
+    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
+
+    model_cls = AutoModel if is_glm else AutoModelForCausalLM
+    if "deepseek" in pretrained_model_name_or_path.lower() and trust_remote_code:
+        logger.warning("trust_remote_code is enabled by default, please ensure its correctness.")
+
+    if low_cpu_mem_tmp_dir is None:
+        low_cpu_mem_tmp_dir = "low_cpu_mem_tmp"
+    if low_cpu_mem_mode == 2:
+        from auto_round.low_cpu_mem.utils import load_model_with_hooks
+
+        model = load_model_with_hooks(
+            pretrained_model_name_or_path,
+            model_cls,
+            device=device,
+            clean_weight=True,
+            saved_path=low_cpu_mem_tmp_dir,
+            torch_dtype=torch_dtype,
+            trust_remote_code=trust_remote_code,
+        )
+    elif low_cpu_mem_mode == 1:
+        from auto_round.low_cpu_mem.utils import load_empty_model
+
+        low_cpu_mem_usage = True
+        model = load_empty_model(
+            pretrained_model_name_or_path,
+            model_cls,
+            device=device,
+            saved_path=low_cpu_mem_tmp_dir,
+            torch_dtype=torch_dtype,
+            trust_remote_code=trust_remote_code,
+        )
+    else:
+        if _use_hpu_compile_mode():
+            model = model_cls.from_pretrained(
+                pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                attn_implementation="eager",
+                trust_remote_code=trust_remote_code,
+
device_map="auto" if use_auto_mapping else None, + ) + else: + try: + model = model_cls.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + device_map="auto" if use_auto_mapping else None, + ) + except ValueError as e: + if "FP8 quantized" in str(e): + orig_func = set_fake_cuda_device_capability() + model = model_cls.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + device_map="auto" if use_auto_mapping else None, + ) + torch.cuda.get_device_capability = orig_func + logger.warning("the support for fp8 model as input is experimental, please use with caution.") + else: + raise + + except OSError as e: + logger.warning( + f"fail to load {pretrained_model_name_or_path}, set trust_remote_code to False and retry." + ) + model = model_cls.from_pretrained( + pretrained_model_name_or_path, + torch_dtype=torch_dtype, + trust_remote_code=False, + device_map="auto" if use_auto_mapping else None, + ) + + model = model.eval() + check_and_mark_fp8_model(model) + model = _to_model_dtype(model, model_dtype) + + return model, tokenizer, low_cpu_mem_usage + + +def mllm_load_model( + pretrained_model_name_or_path, + device="cpu", + torch_dtype="auto", + use_auto_mapping=True, + trust_remote_code=True, + model_dtype=None, + **kwargs, +): + import transformers + from huggingface_hub import HfApi, HfFileSystem, hf_hub_download + from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer + + device_str, use_auto_mapping = get_device_and_parallelism(device) + torch_dtype = "auto" + if device_str is not None and "hpu" in device_str: + torch_dtype = torch.bfloat16 + if os.path.isdir(pretrained_model_name_or_path): + config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json"))) + else: + from huggingface_hub import hf_hub_download, list_repo_files + + file_list = list_repo_files(pretrained_model_name_or_path) + if "config.json" in file_list: + # Load plain JSON + config_path = hf_hub_download(pretrained_model_name_or_path, "config.json") + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + elif "config.json.gz" in file_list: + # Load gzipped JSON + import gzip + + config_path = hf_hub_download(pretrained_model_name_or_path, "config.json.gz") + with gzip.open(config_path, "rt", encoding="utf-8") as f: + config = json.load(f) + else: + raise FileNotFoundError(f"No config.json or config.json.gz found for {pretrained_model_name_or_path}") + + if "model_type" in config: + model_type = config["model_type"] + else: + model_type = None + + processor, image_processor = None, None + if "deepseek_vl_v2" == model_type: + from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor # pylint: disable=E0401 + + processor = DeepseekVLV2Processor.from_pretrained(pretrained_model_name_or_path) + tokenizer = processor.tokenizer + model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + device_map="auto" if use_auto_mapping else None, + ) + else: + architectures = config["architectures"][0] + if architectures == "LlavaLlamaForCausalLM": + from llava.model.builder import load_pretrained_model # pylint: disable=E0401 + + tokenizer, model, image_processor, _ = load_pretrained_model( + pretrained_model_name_or_path, + model_base=None, + model_name=pretrained_model_name_or_path, + 
torch_dtype=torch_dtype, + ) + else: + if architectures.endswith("Model") and hasattr( + transformers, n := architectures.replace("Model", "ForConditionalGeneration") + ): + cls = getattr(transformers, n) + elif hasattr(transformers, architectures): + cls = getattr(transformers, architectures) + else: + cls = AutoModelForCausalLM + try: + model = cls.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + device_map="auto" if use_auto_mapping else None, + ) + except ValueError as e: + if "FP8 quantized" in str(e): + orig_func = set_fake_cuda_device_capability() + model = cls.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + torch_dtype=torch_dtype, + device_map="auto" if use_auto_mapping else None, + ) + torch.cuda.get_device_capability = orig_func + logger.warning("the support for fp8 model as input is experimental, please use with caution.") + + if "Mistral-Small-3.2" in pretrained_model_name_or_path: + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer # pylint: disable=E0401 + + if os.path.isdir(pretrained_model_name_or_path): + tokenizer = MistralTokenizer.from_file(os.path.join(pretrained_model_name_or_path, "tekken.json")) + else: + tokenizer = MistralTokenizer.from_hf_hub(pretrained_model_name_or_path) + else: + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code + ) + processor = AutoProcessor.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code + ) + try: + from transformers import AutoImageProcessor + + image_processor = AutoImageProcessor.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code + ) + except Exception as e: + pass + + model = model.eval() + check_and_mark_fp8_model(model) + model = _to_model_dtype(model, model_dtype) + + return model, processor, tokenizer, image_processor + + +def is_pure_text_model(model): + """verify on: phi-3.5, Mistral-Small-3.1, gemma-3, qwen2-vl,""" + if hasattr(model, "config") and hasattr(model.config, "vision_config"): + return False + if hasattr(model.__class__, "main_input_name") and model.__class__.main_input_name != "input_ids": + return False + for module in model.modules(): + if hasattr(module.__class__, "main_input_name") and module.__class__.main_input_name != "input_ids": + return False + if "vision" in str(module.__class__).lower(): + return False + if "image" in str(module.__class__).lower(): + return False + if "img" in str(module.__class__).lower(): + return False + return True + + +def reset_params(inputs): """ - Generate ignore regex list for llm_compressor based on regex_config and layer_config. + Resets specific input parameters to avoid saving the key-value cache during fine-tuning. - Rules: - 1. Any layer in regex_config with bits >= 16 is ignored. - 2. Any layer in layer_config with bits >= 16 is ignored if not already included. - 3. Output regex patterns are normalized for llm_compressor ('re:...' style). + Args: + inputs (dict): Dictionary of model inputs. + + Modifies: + inputs (dict): Sets "use_cache" to False if the key is present. + """ + if "use_cache" in inputs.keys(): # Not storing kv cache + inputs["use_cache"] = False + + +def check_skippable_keywords(key): + """ + Prints a reminder if a key is not stored during quantization fine-tuning. 
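+
+    Example (illustrative; only keys matching a skippable cache keyword return False):
+        >>> check_skippable_keywords("past_key_value.0")
+        False
+        >>> check_skippable_keywords("attention_mask")
+        True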
+ """ + skippable_cache_keys = ("past_key_value",) + for cache_key in skippable_cache_keys: + if cache_key not in key: + return True + return False + + +def init_cache(positional_inputs, inputs): + """ + Initializes special model inputs by adding positional inputs if missing. + + Args: + positional_inputs (list): List of positional inputs to add to inputs. + inputs (dict): Dictionary of model inputs. + + Modifies: + inputs (dict): Adds "positional_inputs" key if not present. + """ + if "positional_inputs" not in inputs: # for chatglm Series + inputs["positional_inputs"] = [] + for idx, item in enumerate(positional_inputs): + inputs["positional_inputs"] = to_device(positional_inputs) + + +def get_shared_keys(model): + """ + Retrieves shared keys from the model's state dictionary. + + Args: + model (torch.nn.Module): The model to retrieve shared keys from. + + Returns: + tuple: tuple of shared keys. + """ + from auto_round.special_model_handler import SPECIAL_SHARED_CACHE_KEYS + + shared_keys = SHARED_CACHE_KEYS + shared_keys += SPECIAL_SHARED_CACHE_KEYS.get(model.__class__.__name__, ()) + return shared_keys + + +def get_model_dtype(model_dtype, default="auto"): + if model_dtype is None or model_dtype == "auto": + model_dtype = default + elif model_dtype in ["bf16", "bfloat16"]: + model_dtype = "bfloat16" + elif model_dtype in ["f16", "float16", "fp16"]: + model_dtype = "float16" + elif model_dtype in ["f32", "float32", "fp32"]: + model_dtype = "float32" + else: + logger.warning(f"Unable to identify model_dtype {model_dtype}, reset to default model_dtype {default}") + model_dtype = default + return model_dtype + + +def str2bool(v): + import argparse + + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def filter_quantization_config(quantization_config): + default_dict = { + "amp": True, + "batch_size": 8, + "data_type": int, + "dataset": "NeelNanda/pile-10k", + "enable_minmax_tuning": True, + "enable_norm_bias_tuning": False, + "enable_quanted_input": True, + "gradient_accumulate_steps": 1, + "iters": 200, + "low_gpu_mem_usage": False, + "nsamples": 128, + "scale_dtype": "torch.float16", + "seqlen": 2048, + } + iters = quantization_config.get("iters", 200) + + default_dict["lr"] = 1.0 / iters if iters > 0 else 5e-3 + default_dict["minmax_lr"] = default_dict["lr"] + + for key in default_dict: + if key in quantization_config and default_dict[key] == quantization_config[key]: + quantization_config.pop(key) + for k in list(quantization_config.keys()): + if quantization_config[k] is None: + quantization_config.pop(k) + + if quantization_config.get("act_bits", 16) >= 16: + quantization_config.pop("act_bits", None) + quantization_config.pop("act_data_type", None) + quantization_config.pop("act_dynamic", None) + quantization_config.pop("act_sym", None) + quantization_config.pop("act_group_size", None) + + +def check_start_with_block_name(name: str, block_name_to_quantize: list): + """ + Checks if the given layer name starts with any of the block names to be quantized. + + Args: + name (str): The name of the layer. + block_name_to_quantize (list): A list of block names to check against. + + Returns: + bool: True if the layer name starts with any of the block names, False otherwise. 
+ """ + for block_name in block_name_to_quantize: + if name.startswith(block_name): + return True + return False + + +def check_seqlen_compatible(input_seqlen, tokenizer=None, model=None): + """ + Check whether the input sequence length is within the limits defined + by the tokenizer and the model configuration. + + Args: + input_seqlen (int): The length of the input sequence. + tokenizer: Optional, a HuggingFace tokenizer object. + model: Optional, a HuggingFace model object. + + Returns: + ValueError: if the input length is not valid, riase Error. + """ + if model is not None and hasattr(model, "config"): + model_config = model.config + if hasattr(model_config, "max_position_embeddings") and input_seqlen > model_config.max_position_embeddings: + raise ValueError( + f"seqlen({input_seqlen}) exceeds model.config.max_position_embeddings(" + f"{model_config.max_position_embeddings}). Please lowering '--seqlen'" + ) + if tokenizer is not None and hasattr(tokenizer, "model_max_length") and input_seqlen > tokenizer.model_max_length: + raise ValueError( + f"seqlen({input_seqlen}) exceeds tokenizer.model_max_length({tokenizer.model_max_length}). " + "Please oncider Consider lowering the '--seqlen' or increasing tokenizer.model_max_length." + ) + + +def _use_more_bits(i_layer: int, n_layer: int): + return (i_layer < n_layer // 8) or (i_layer >= 7 * n_layer // 8) or ((i_layer - n_layer // 8) % 3 == 2) + + +def _get_digital_in_layer_name(layer_name): + pattern = re.compile(r"([a-zA-Z]+\.){1,}(\d+)") + res = re.search(pattern, layer_name) + if res: + return int(res[2]) + else: + return None + + +def _search_gguf_type(gguf_type): + if gguf_type in GGUF_INNER_CONFIG: + return gguf_type + pattern = re.compile("gguf:q([0-9]{1,})_[01k]") + bits = re.search(pattern, gguf_type) + if not bits: + raise KeyError(f"{gguf_type} is not a correct gguf type, please check") + + for suffix in ["_k", "_0", "_1"]: + if gguf_type.endswith(suffix): + continue + if (tmp_type := re.sub("_[01k]", suffix, gguf_type)) in GGUF_INNER_CONFIG: + return tmp_type + return None + + +def _gguf_type_fallback(gguf_type): + if gguf_type in ("gguf:q2_k", "gguf:q3_k", "gguf:q4_k"): + gguf_type = "gguf:q5_0" + elif gguf_type == "gguf:q5_k": + gguf_type = "gguf:q5_0" + elif gguf_type == "gguf:q6_k": + gguf_type = "gguf:q8_0" + return gguf_type + + +##https://github.com/ggml-org/llama.cpp/blob/9e31bec4fd53634c9e5b04650488a09a055f5dab/src/llama-quant.cpp#L129 +def get_layer_config_by_gguf_format(layer_config, gguf_format, model, model_type=ModelType.TEXT): + # TODO: support for other format later + target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) + + import gguf # pylint: disable=E0401 + + # from auto_round.export.export_to_gguf.convert import ModelBase, get_model_architecture + convert_hf_to_gguf = LazyImport("auto_round.export.export_to_gguf.convert_hf_to_gguf") + + model_architecture = convert_hf_to_gguf.get_model_architecture( + hparams=model.config.to_dict(), model_type=model_type + ) + try: + model_class = convert_hf_to_gguf.ModelBase.from_model_architecture(model_architecture, model_type=model_type) + except NotImplementedError: + return layer_config, {} + + n_layer = None + for name in ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]: + sub_attr = "text_config" if model_type == ModelType.TEXT else "vision_config" + if hasattr(model.config, name): + n_layer = getattr(model.config, name) + break + if hasattr(model.config, sub_attr): + if hasattr(getattr(model.config, sub_attr), name): + n_layer = 
getattr(getattr(model.config, sub_attr), name) + break + if n_layer is None: + return layer_config, {} + + tensor_map = gguf.get_tensor_name_map(model_class.model_arch, n_layer) + + def _set_config(config, target_config): + for k, v in target_config.items(): + if isinstance(config, dict): + config[k] = v + else: + setattr(config, k, v) + return config + + gguf_format_config = {} + lm_head_name = get_lm_head_name(model) + inner_gguf_format = GGUF_CONFIG[target_gguf_format]["mostly"] + # ggml_type = getattr(gguf.GGMLQuantizationType,inner_gguf_format.split(":")[-1].upper()) + block_size = GGML_QUANT_SIZES[inner_gguf_format.split(":")[-1].lower()][0] + tie_word_embeddings = True + if hasattr(model, "config") and hasattr(model.config, "tie_word_embeddings"): + tie_word_embeddings = model.config.tie_word_embeddings + + n_gqa = 1 + if ( + hasattr(model, "config") + and hasattr(model.config, "num_attention_heads") + and hasattr(model.config, "num_key_value_heads") + ): + n_gqa = model.config.num_attention_heads // model.config.num_key_value_heads + n_expert = 0 + for name in ["num_experts", "num_local_experts", "n_routed_experts"]: + if hasattr(model.config, name): + n_expert = getattr(model.config, name) + + i_attention_wv = 0 + i_ffn_down = 0 + layer_config_copy = copy.deepcopy(layer_config) + target_bits = None + if inner_gguf_format.startswith("gguf:q") and len(inner_gguf_format) >= 7 and (inner_gguf_format[6]).isdigit(): + target_bits = int(inner_gguf_format[6]) + + for layer_name, config in layer_config_copy.items(): + if not check_to_quantized(config): + continue + new_type = GGUF_CONFIG[target_gguf_format]["mostly"] + layer = get_module(model, layer_name) + if isinstance(layer, transformers.pytorch_utils.Conv1D): + input_features = layer.weight.shape[0] + else: + input_features = layer.weight.shape[-1] + i_layer = _get_digital_in_layer_name(layer_name) + + if lm_head_name is not None and layer_name == lm_head_name: + target_bits = int(re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["lm_head"]).group(1)) + if isinstance(layer, torch.nn.Embedding): + target_bits = int( + re.search("gguf:q([0-9]{1,})_[01k]", GGUF_CONFIG[target_gguf_format]["embedding"]).group(1) + ) + + gguf_name = tensor_map.get_name(layer_name) + bits_index = 6 + if config.get("fixed_by_user", False): + if "bits" not in config: + logger.warning( + f"Setting layer_config requires providing bits, {layer_name} has not bits," + f" using bits={target_bits} instead." 
+                )
+                new_type = new_type[:bits_index] + str(target_bits) + new_type[bits_index + 1 :]
+            else:
+                new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :]
+                new_type = _search_gguf_type(new_type)
+                if new_type is None:
+                    raise ValueError(f"invalid bit setting for {layer_name}")
+        elif target_bits is not None and "bits" in config and config["bits"] != target_bits:
+            new_type = new_type[:bits_index] + str(config["bits"]) + new_type[bits_index + 1 :]
+            new_type = _search_gguf_type(new_type)
+            if new_type is None:
+                raise ValueError(f"invalid bit setting for {layer_name}")
+        elif lm_head_name is not None and layer_name == lm_head_name and not tie_word_embeddings:
+            if gguf.MODEL_ARCH.FALCON == model_class.model_arch or input_features % block_size != 0:
+                new_type = "gguf:q8_0"
+            elif "lm_head" in GGUF_CONFIG[target_gguf_format]:
+                new_type = GGUF_CONFIG[target_gguf_format]["lm_head"]
+            elif new_type != "gguf:q8_0":
+                new_type = "gguf:q6_k"
+        elif lm_head_name is not None and layer_name == lm_head_name and tie_word_embeddings:
+            pass
+        elif isinstance(layer, torch.nn.Embedding):
+            if "embedding" in GGUF_CONFIG[target_gguf_format]:
+                new_type = GGUF_CONFIG[target_gguf_format]["embedding"]
+        elif gguf_name is None:
+            pass
+        # attn_v
+        elif "attn_v" in gguf_name:
+            if target_gguf_format == "gguf:q2_k":
+                new_type = "gguf:q4_k" if n_gqa >= 4 else "gguf:q3_k"
+            elif target_gguf_format == "gguf:q2_k_s" and n_gqa >= 4:
+                new_type = "gguf:q4_k"
+            elif target_gguf_format == "gguf:q3_k_m":
+                new_type = "gguf:q5_k" if i_attention_wv < 2 else "gguf:q4_k"
+            elif target_gguf_format == "gguf:q3_k_l":
+                new_type = "gguf:q5_k"
+            elif (target_gguf_format == "gguf:q4_k_m" or target_gguf_format == "gguf:q5_k_m") and _use_more_bits(
+                i_layer, n_layer
+            ):
+                new_type = "gguf:q6_k"
+            elif target_gguf_format == "gguf:q4_k_s" and i_attention_wv < 4:
+                new_type = "gguf:q5_k"
+            ## TODO: check which models are grouped into LLM_TYPE_70B
+            # if (qs.model.type == LLM_TYPE_70B) {
+            #     // In the 70B model we have 8 heads sharing the same attn_v weights.
+ # As a result, the attn_v.weight tensor is + # // 8x smaller compared to attn_q.weight.Hence, we can get a nice boost in quantization accuracy with + # // nearly negligible increase in model size by quantizing this tensor with more bits: + # if + # (new_type == GGML_TYPE_Q3_K | | new_type == GGML_TYPE_Q4_K) + # new_type = GGML_TYPE_Q5_K; + # } + if n_expert == 8: + new_type = "gguf:q8_k" + i_attention_wv += 1 + + elif "attn_k" in gguf_name: + if n_expert == 8: + new_type = "gguf:q8_0" + # ffn_down + elif "ffn_down" in gguf_name: + if target_gguf_format == "gguf:q2_k": + new_type = "gguf:q3_k" + elif target_gguf_format == "gguf:q2_k_s": + if i_layer < n_layer / 8: + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q3_k_m": + if i_layer < n_layer / 16: + new_type = "gguf:q5_k" + elif gguf.MODEL_ARCH.FALCON == model_class.model_arch or _use_more_bits(i_layer, n_layer): + new_type = "gguf:q4_k" + else: + new_type = "gguf:q3_k" + elif target_gguf_format == "gguf:q3_k_l": + if gguf.MODEL_ARCH.FALCON == model_class.model_arch: + new_type = "gguf:q4_k" + else: + new_type = "gguf:q5_k" + elif target_gguf_format == "gguf:q4_k_m": + if gguf.MODEL_ARCH.FALCON == model_class.model_arch: + if i_layer < n_layer // 16: + new_type = "gguf:q6_k" + elif _use_more_bits(i_layer, n_layer): + new_type = "gguf:q5_k" + else: + new_type = "gguf:q4_k" + else: + if _use_more_bits(i_layer, n_layer): + new_type = "gguf:q6_k" + elif target_gguf_format == "gguf:q5_k_m" and _use_more_bits(i_layer, n_layer): + new_type = "gguf:q6_k" + elif ( + target_gguf_format == "gguf:q4_k_s" + and model_class.model_arch != gguf.MODEL_ARCH.FALCON + and i_layer < n_layer / 8 + ): + new_type = "gguf:q5_k" + elif (target_gguf_format == "gguf:q4_0" or target_gguf_format == "gguf:q5_0") and i_layer < n_layer / 8: + if target_gguf_format == "gguf:q4_0": + new_type = "gguf:q4_1" + else: + new_type = "gguf:q5_1" + i_ffn_down += 1 + + # attn_output + elif "attn_output" in gguf_name: + if gguf.MODEL_ARCH.FALCON != model_class.model_arch: + if n_expert == 8: + if target_gguf_format in ( + "gguf:q2_k", + "gguf:q3_k_s", + "gguf:q3_k_m", + "gguf:q4_k_s", + "gguf:q4_k_m", + "gguf:q5_k", + ): + new_type = "gguf:q5_k" + elif target_gguf_format == "gguf:q2_k": + new_type = "gguf:q3_k" + elif target_gguf_format == "gguf:q3_k_m": + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q3_k_l": + new_type = "gguf:q5_k" + else: + if target_gguf_format == "gguf:q3_k_l": + new_type = "gguf:q4_k" + # attn_qkv + elif "attn_qkv" in gguf_name: + if target_gguf_format in ("gguf:q3_k_m", "gguf:q3_k_l"): + new_type = "gguf:q4_k" + elif target_gguf_format == "gguf:q4_k_m": + new_type = "gguf:q5_k" + elif target_gguf_format == "gguf:q5_k_m": + new_type = "gguf:q5_k" + new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] + if input_features % new_block_size != 0: + new_type = _gguf_type_fallback(new_type) + new_block_size = GGML_QUANT_SIZES[new_type.split(":")[-1].lower()][0] + if input_features % new_block_size != 0: + new_type = "gguf:bf16" + logger.warning( + f"fallback {layer_name} to {new_type}, " + f"because input_features({input_features}) % block_size({block_size}) != 0" + ) + # for deepseek v2 + if layer_name.endswith("kv_b_proj") and new_type.endswith("_k") and "Deepseek" in model.config.architectures[0]: + fallback = False + + # calc if need fallback + qk_nope_head_dim = model.config.qk_nope_head_dim + kv_b_shape = get_module(model, layer_name).weight.shape + + if ( + qk_nope_head_dim < QK_K + or qk_nope_head_dim % QK_K != 0 + 
or kv_b_shape[-1] < QK_K
+                or kv_b_shape[-1] % QK_K != 0
+            ):
+                fallback = True
+            if fallback:
+                tmp_type = _gguf_type_fallback(new_type)
+                logger.warning_once(
+                    f"self_attn.kv_b_proj does not support the use of {new_type}, replace it with {tmp_type}"
+                )
+                new_type = tmp_type
+
+        target_config = GGUF_INNER_CONFIG[new_type]
+
+        _set_config(layer_config[layer_name], target_config)
+        _set_config(layer, target_config)
+        gguf_format_config[layer_name] = new_type
+
+    return layer_config, gguf_format_config
+
+
+def get_lm_head_name(model):
+    block_names = get_block_names(model, True)
+    last_name = None
+    for n, m in model.named_modules():
+        if any(m.children()):
+            continue
+        last_name = n
+    for l in block_names:
+        if last_name in l:
+            last_name = None
+            break
+    return last_name
+
+
+def get_gguf_qtype_by_layer_config(layer_config):
+    import gguf  # pylint: disable=E0401
+
+    if layer_config["bits"] >= 16:
+        return None
+    bits = layer_config["bits"]
+    super_bits = layer_config.get("super_bits", None)
+    sym = layer_config["sym"]
+    group_size = layer_config.get("group_size", None)
+    super_group_size = layer_config.get("super_group_size", None)
+    if bits == 2 and super_bits == 4 and not sym and group_size == 16 and super_group_size == 16:
+        return gguf.GGMLQuantizationType.Q2_K
+    if bits == 3 and super_bits == 6 and sym and group_size == 16 and super_group_size == 16:
+        return gguf.GGMLQuantizationType.Q3_K
+    if bits == 4:
+        if super_bits is not None and super_bits == 6 and not sym and group_size == 32 and super_group_size == 8:
+            return gguf.GGMLQuantizationType.Q4_K
+        if super_bits is None and sym and group_size == 32:
+            return gguf.GGMLQuantizationType.Q4_0
+        if super_bits is None and not sym and group_size == 32:
+            return gguf.GGMLQuantizationType.Q4_1
+    if bits == 5:
+        if super_bits == 6 and not sym and group_size == 32 and super_group_size == 8:
+            return gguf.GGMLQuantizationType.Q5_K
+        if super_bits is None and sym and group_size == 32:
+            return gguf.GGMLQuantizationType.Q5_0
+        if super_bits is None and not sym and group_size == 32:
+            return gguf.GGMLQuantizationType.Q5_1
+    if bits == 6 and super_bits == 8 and group_size == 16 and super_group_size == 16:
+        return gguf.GGMLQuantizationType.Q6_K
+    if bits == 8 and sym and group_size == 32:
+        return gguf.GGMLQuantizationType.Q8_0
+    raise ValueError("Unknown layer config")
+
+
+def flatten_list(nested_list):
+    flattened = []
+    for item in nested_list:
+        if isinstance(item, (list, tuple)):
+            flattened.extend(flatten_list(item))
+        else:
+            flattened.append(item)
+    return flattened
+
+
+def clean_module_parameter(submodule, parameter):
+    if submodule is None:
+        return
+    is_buffer = parameter in submodule._buffers
+    with torch.no_grad():
+        if is_buffer:
+            submodule._buffers[parameter] = None
+        else:
+            submodule._parameters[parameter] = None
+
+
+def get_reciprocal(tensor):
+    if tensor.dtype == torch.float16:
+        tensor = torch.sign(tensor) * torch.clamp(torch.abs(tensor), min=1e-5)
+    else:
+        tensor = torch.where(torch.abs(tensor) < 1e-30, 0, tensor)
+    return torch.where(tensor != 0, 1 / tensor, torch.zeros_like(tensor))
+
+
+def check_need_act_calibration(
+    is_act_dynamic: Union[bool, None], act_data_type: Union[str, None] = None, act_bits: int = 16
+) -> bool:
+    if act_bits > 8:
+        return False
+    # None means dynamic activation quantization
+    if is_act_dynamic is not None and not is_act_dynamic:
+        return True
+    if act_data_type is not None and "static" in act_data_type:
+        return True
+    return False
+
+
+def pad_weight(weight: torch.Tensor, block_size: list) -> Tuple[torch.Tensor,
int, int]: + """Pads a matrix to make its dimensions multiples of block_size.""" + M, N = weight.shape[-2:] + block_size_m, block_size_n = block_size + pad_M = (block_size_m - M % block_size_m) % block_size_m + pad_N = (block_size_n - N % block_size_n) % block_size_n + + if pad_M == 0 and pad_N == 0: + return weight, M, N # No padding needed + padded_weight = torch.nn.functional.pad(weight, (0, pad_N, 0, pad_M), mode="constant", value=0) + return padded_weight, M, N # Return original dimensions for unpadding + + +def unpad_weight(weight: torch.Tensor, original_M: int, original_N: int, keep_first_dim: bool = False) -> torch.Tensor: + """Removes padding from the matrix to restore its original shape.""" + if (weight.shape[-2] == original_M) and (weight.shape[-1] == original_N): + return weight + if keep_first_dim: + return weight[:, :original_M, :original_N] + else: + return weight[:original_M, :original_N] + + +def pad_block_fp8_weight_naive( + weight: torch.Tensor, weight_scale: torch.Tensor, block_size: list +) -> Tuple[torch.Tensor, int, int]: + assert len(block_size) == 2 + + block_size_m, block_size_n = block_size + weight_scale_m, weight_scale_n = weight_scale.shape[-2:] + + weight, orig_M, orig_N = pad_weight(weight, block_size) + M, N = weight.shape[-2:] + + assert weight_scale_m == M // block_size_m + assert weight_scale_n == N // block_size_n + + return weight, orig_M, orig_N + + +def dequant_block_fp8_weight(weight: torch.Tensor, weight_scale: torch.Tensor, block_size: list) -> torch.Tensor: + dtype = torch.bfloat16 + if weight_scale is None: + return weight + assert len(block_size) == 2 + + weight, orig_M, orig_N = pad_block_fp8_weight_naive(weight, weight_scale, block_size) + + weight_shape_len = len(weight.shape) + + block_size_m, block_size_n = block_size + + # mul scale + if weight_shape_len == 2: + weight_scale_m, weight_scale_n = weight_scale.shape + weight_scale = weight_scale.view(weight_scale_m, 1, weight_scale_n, 1) + weight = weight.view(weight_scale_m, block_size_m, weight_scale_n, block_size_n) + dequant_weight = weight.to(dtype) * weight_scale.to(dtype) + dequant_weight = dequant_weight.view(weight_scale_m * block_size_m, weight_scale_n * block_size_n) + keep_first_dim = False + elif weight_shape_len == 3: + fd, weight_scale_m, weight_scale_n = weight_scale.shape + weight_scale = weight_scale.view(fd, weight_scale_m, 1, weight_scale_n, 1) + weight = weight.view(fd, weight_scale_m, block_size_m, weight_scale_n, block_size_n) + dequant_weight = weight.to(dtype) * weight_scale.to(dtype) + dequant_weight = dequant_weight.view(fd, weight_scale_m * block_size_m, weight_scale_n * block_size_n) + keep_first_dim = True + else: + raise ValueError("Only support original weight shape is either 2 or 3") + + dequant_weight = unpad_weight(dequant_weight, orig_M, orig_N, keep_first_dim=keep_first_dim) + + return dequant_weight + + +def convert_fp8_layer_to_linear(layer, dtype=torch.bfloat16): + """ """ + new_layer = torch.nn.Linear(layer.in_features, layer.out_features, bias=layer.bias is not None, dtype=dtype) + if layer.bias is not None: + new_layer.bias.data.copy_(layer.bias.data.to(dtype=dtype)) + + keys = get_quant_keys() + ["tmp_name"] + for key in keys: + setattr(new_layer, key, getattr(layer, key, None)) + + if layer.__class__.__name__ == "CompressedLinear": + dq_weight = layer.compressor.decompress_module(layer) + else: + weight_scale = layer.weight_scale if hasattr(layer, "weight_scale") else layer.weight_scale_inv + dq_weight = dequant_block_fp8_weight(layer.weight, 
weight_scale, layer.block_size) + new_layer.weight.data.copy_(dq_weight.to(dtype=dtype)) + return new_layer + + +def convert_fp8_model_to_16b_model(model, dtype=torch.bfloat16): + """ + Convert a model with FP8 quantized layers to a model with 16-bit linear layers. + This is useful for compatibility with other frameworks or for further processing. + """ + for n, m in model.named_modules(): + if m.__class__.__name__ == "FP8Linear": + new_module = convert_fp8_layer_to_linear(m, dtype=dtype) + set_module(model, n, new_module) + return model + + +def get_quant_keys(): + keys = [ + "bits", + "group_size", + "sym", + "data_type", + "scale_dtype", + "act_bits", + "act_group_size", + "act_sym", + "act_dynamic", + "act_data_type", + "super_bits", + "super_group_size", + ] + return keys + + +def out_of_vram(error_msg): + error_msg = str(error_msg) + # CUDA + if "CUDA out of memory" in error_msg: + return True + # gaudi + if "MODULE:PT_DEVMEM" in error_msg: + return True + # XPU + if "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in error_msg: + return True + # ROCM + if "HIP out of memory. Tried to allocate" in error_msg: + return True + return False + + +def download_hf_model(repo_id, cache_dir=None, repo_type=None, revision=None): + """Download hugging face model from hf hub.""" + from huggingface_hub.constants import DEFAULT_REVISION, HUGGINGFACE_HUB_CACHE + from huggingface_hub.file_download import REGEX_COMMIT_HASH, repo_folder_name + from huggingface_hub.utils import EntryNotFoundError + + if cache_dir is None: + cache_dir = HUGGINGFACE_HUB_CACHE + if revision is None: + revision = DEFAULT_REVISION + if repo_type is None: + repo_type = "model" + storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) + commit_hash = None + if REGEX_COMMIT_HASH.match(revision): + commit_hash = revision + else: + ref_path = os.path.join(storage_folder, "refs", revision) + if os.path.exists(ref_path): + with open(ref_path) as f: + commit_hash = f.read() + if storage_folder and commit_hash: + pointer_path = os.path.join(storage_folder, "snapshots", commit_hash) + if os.path.isdir(pointer_path): + return pointer_path + else: # pragma: no cover + from huggingface_hub import snapshot_download + + model_path = snapshot_download(repo_id) + return model_path + + +def is_moe(module: torch.nn.Module) -> bool: + """Returns whether the module is an MOE layer.""" + return any( + key in type(module).__name__.lower() + for key in [ + "MixtralSparseMoeBlock".lower(), + "ArcticMoE".lower(), + "DbrxFFN".lower(), + "MoELayer".lower(), + "PhimoeSparseMoeBlock".lower(), + "DeepseekMoE".lower(), + "DeepseekV2MoE".lower(), + "DeepseekV3MoE".lower(), + "Qwen2MoeSparseMoeBlock".lower(), + "Qwen3MoeSparseMoeBlock".lower(), + ] + ) + + +# please refer to https://github.com/NVIDIA/TensorRT-Model-Optimizer +# /blob/4c611e47a60084a86e1de7e48690a692a1b8170c/modelopt/torch/export/layer_utils.py#L976 +def get_expert_linear_names(module: torch.nn.Module) -> list[str]: + """Get the list of linear names for the experts.""" + + def module_match_name_list(module, name_list): + """Check if the module name matches any of the names in the list. + + e.g. 
+        module_match_name_list(QuantQwen3MoeSparseMoeBlock, ['Qwen3MoeSparseMoeBlock']) -> True
+
+        """
+        return any(name.lower() in type(module).__name__.lower() for name in name_list)
+
+    if module_match_name_list(
+        module, ["Qwen2MoeSparseMoeBlock", "Qwen3MoeSparseMoeBlock", "DeepseekMoE", "DeepseekV2MoE", "DeepseekV3MoE"]
+    ):
+        return ["gate_proj", "down_proj", "up_proj"]
+    elif module_match_name_list(module, ["MixtralMoeSparseMoeBlock"]):
+        return ["linear_fc1", "linear_fc2"]
+    elif module_match_name_list(module, ["DBRXMoeSparseMoeBlock"]):
+        return ["w1_linear", "w2_linear", "v1_linear"]
+    else:
+        # assuming w1, w2, w3 by default
+        return ["w1", "w2", "w3"]
+
+
+def get_nested_attr(module, attr_name: str):
+    """Recursively get nested attribute (e.g., 'orig_layer.act_max')."""
+    attrs = attr_name.split(".")
+    for attr in attrs:
+        if not hasattr(module, attr):
+            return None
+        module = getattr(module, attr)
+    return module
+
+
+def set_nested_attr(module, attr_name: str, value):
+    """Recursively set nested attribute (e.g., 'orig_layer.act_max' = value)."""
+    attrs = attr_name.split(".")
+    for attr in attrs[:-1]:
+        if not hasattr(module, attr):
+            raise AttributeError(f"{module} has no attribute '{attr}'")
+        module = getattr(module, attr)
+    setattr(module, attrs[-1], value)
+
+
+def set_amax_for_uncalibrated_experts(
+    experts: list[torch.nn.Module], set_amax_value: float | None = None, attr_name="act_max"
+):
+    """Set amax of uncalibrated experts to a given value or the max of existing amax values from other experts.
+
+    Args:
+        experts: a list of experts
+        set_amax_value: set amax value to the given value.
+            If None, set amax value to the max of existing amax value from other experts.
+
+    Returns:
+        uncalibrated_experts: a list of uncalibrated experts
+    """
+    uncalibrated_experts = []
+    # get the max amax value from all experts
+    if set_amax_value is None:
+        amax_values = [
+            get_nested_attr(module, attr_name) for module in experts if get_nested_attr(module, attr_name) is not None
+        ]
+        if len(amax_values) == 0:
+            return uncalibrated_experts
+        # Flatten all tensors to 1D before concatenation
+        flat_values = [t.reshape(-1) for t in amax_values]
+        all_values = torch.cat(flat_values)
+        set_amax_value = torch.max(all_values)
+
+    for module in experts:
+        if get_nested_attr(module, attr_name) is None:
+            logger.warning_once(
+                "Missing amax value of expert layers. "
+                "This typically occurs in MoE models when certain experts are not activated during calibration. "
+                "Consider increasing your calibration dataset size to ensure all experts are exercised."
+            )
+            # Use float32 dtype explicitly to ensure we create a floating point tensor
+            if not isinstance(set_amax_value, torch.Tensor):
+                set_amax_value = torch.tensor(set_amax_value, dtype=torch.float32)
+            set_nested_attr(module, attr_name, set_amax_value)
+            uncalibrated_experts.append(module)
+    return uncalibrated_experts
+
+
+# Please refer to: https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/
+# 4c611e47a60084a86e1de7e48690a692a1b8170c/modelopt/torch/export/unified_export_hf.py#L195-L207
+def set_amax_for_all_moe_layers(model: torch.nn.Module, layer_name=None, attr_name="act_max"):
+    if layer_name is not None:
+        parts = layer_name.split(".")
+        if "experts" not in parts:
+            raise ValueError(f"Expected an expert layer name containing 'experts', got '{layer_name}'")
+        idx = parts.index("experts")
+        moe_name = ".".join(parts[:idx])
+        model = get_module(model, moe_name)
+    # Handle input quantizers of experts that are not calibrated
+    for name, sub_module in model.named_modules():
+        if not (is_moe(sub_module) and hasattr(sub_module, "experts")):
+            continue
+        expert_linear_names = get_expert_linear_names(sub_module)
+        for linear_name in expert_linear_names:
+            if isinstance(sub_module.experts, collections.abc.Iterable):
+                # MoE models (like Mixtral) expose experts as an iterable
+                try:
+                    set_amax_for_uncalibrated_experts(
+                        [getattr(expert, linear_name) for expert in sub_module.experts], attr_name=attr_name
+                    )
+                except AttributeError as e:
+                    # Provide more helpful debugging information
+                    expert_types = list(set(type(expert).__name__ for expert in sub_module.experts))
+                    raise AttributeError(
+                        f"Failed to access attribute '{linear_name}' on experts. "
+                        f"MoE module type: {type(sub_module).__name__}, "
+                        f"Expert types: {expert_types}, "
+                        f"Expected linear names: {expert_linear_names}. "
+                        f"This suggests the get_expert_linear_names function may need "
+                        f"to be updated for this model architecture. "
+                        f"Original error: {e}"
+                    ) from e
+            else:
+                # Unsupported MoE model structure
+                raise NotImplementedError(
+                    f"MoE model with experts type '{type(sub_module.experts).__name__}' is not supported in export. "
+                    f"Please file an issue or add support for this model architecture."
+                )
+
+
+class BackendDataType(str, Enum):
+    STANDARD_FP = "fp"
+    MX_FP = "mx_fp"
+    NV_FP = "nv_fp"
+
+
+def is_standard_fp(backend):
+    backend = backend.lower()
+    return BackendDataType.STANDARD_FP in backend and not is_mx_fp(backend) and not is_nv_fp(backend)
+
+
+def is_mx_fp(backend):
+    backend = backend.lower()
+    return BackendDataType.MX_FP in backend
+
+
+def is_nv_fp(backend):
+    backend = backend.lower()
+    return BackendDataType.NV_FP in backend
+
+
+def _is_weight_fp8_activation_static_fp8(
+    bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool
+) -> bool:
+    return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic
+
+
+def is_wfp8afp8(ar):
+    if (
+        ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8))
+        and ("fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8))
+        and is_standard_fp(ar.act_data_type)
+        and is_standard_fp(ar.data_type)
+    ):
+        return True
+    else:
+        return False
+
+
+def is_static_wfp8afp8(ar_or_format: Union[str, Callable]) -> bool:
+    if isinstance(ar_or_format, str):
+        return "fp8_static" in ar_or_format
+    if ar_or_format.act_dynamic:
+        return False
+    if is_wfp8afp8(ar_or_format):
+        return True
+    return False
+
+
+def bytes_to_gigabytes(bytes) -> float:
+    """
+    Converts bytes to gigabytes.
+
+    Args:
+        bytes (int): The number of bytes.
+
+    Returns:
+        float: The equivalent number of gigabytes.
+ """ + return bytes / 1024 / 1024 / 1024 + + +def get_device_memory(i: int = 0) -> int: + """ + Gets the available memory on the specified device. + + Args: + i (int, optional): Device index. Defaults to 0. + + Returns: + int: Available memory in gigabytes. + """ + if torch.cuda.is_available(): + total_memory = bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory) + elif torch.xpu.is_available(): + raise RuntimeError("XPU does not support device_map='auto' currently.") + else: + raise RuntimeError("No supported device found (CUDA or XPU).") + return total_memory + + +def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]: + """ + Calculates the memory consumption of a specific block in the model. + + Args: + block (torch.nn.Module): The block of the model to analyze. + input_ids (list[torch.Tensor]): A list of input tensors for the block. + + Returns: + tuple: A tuple containing the following: + - block_memory (float): The memory consumption (in GB) of the block's linear layers. + - input_output_memory (float): The memory consumption (in GB) for input and output + tensors of the block. + """ + # Calculate all block parameters memory + total_param_mem = 0 + for name, module in block.named_modules(): + if check_to_quantized(module): + param_size = module.weight.nbytes + total_param_mem += param_size + block_memory = total_param_mem / 1024**3 # Convert to GB + + # Assuming bfloat16 or float32, input and output + input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 + + return block_memory, input_output_memory + + +def get_max_vram(ratio: float = 0.9) -> dict: + max_memory = {} + if torch.cuda.is_available(): # NVIDIA CUDA + num_devices = torch.cuda.device_count() + for i in range(num_devices): + total_mem = torch.cuda.get_device_properties(i).total_memory + max_mem_gb = int(total_mem / 1024**3 * ratio) + max_memory[i] = f"{max_mem_gb}GiB" + elif torch.xpu.is_available(): # TODO need verification + num_devices = torch.xpu.device_count() + for i in range(num_devices): + total_mem = torch.xpu.get_device_properties(i).total_memory + max_mem_gb = int(total_mem / 1024**3 * ratio) + max_memory[i] = f"{max_mem_gb}GiB" + + else: + raise RuntimeError("No CUDA or XPU devices found.") + return max_memory + + +def _get_packing_device(device: str | torch.device | None = "auto") -> torch.device: + """ + Selects the packing device. + - "auto": choose best available (CUDA > XPU > CPU). + - str: parsed by torch.device (e.g., "cuda:2", "cpu"). + - torch.device: returned as-is. + - None: treated as "auto". Args: - regex_config (Dict[str, Dict]): dynamic quantization config - layer_config (Dict[str, Dict]): layer-wise quantization config + device: Target device spec ("auto", "cuda:0", "xpu:0", "cpu", or torch.device). Returns: - List[str]: List of regex patterns to ignore during quantization. + torch.device: The resolved device. 
+ """ + if device is None or (isinstance(device, str) and device.lower() == "auto"): + if torch.cuda.is_available(): + return torch.device("cuda:0") + if hasattr(torch, "xpu") and torch.xpu.is_available(): + return torch.device("xpu:0") + return torch.device("cpu") + + if isinstance(device, torch.device): + return device + + if isinstance(device, str): + try: + return torch.device(device) + except Exception as e: + raise ValueError(f"Invalid device string: {device}") from e + + raise TypeError(f"Unsupported device type: {type(device)} ({device})") + + +# Adapted from https://github.com/vllm-project/llm-compressor/blob/ +# 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144 +def copy_python_files_from_model_cache(model, save_path: str): + config = model.config + cache_path = None + if hasattr(config, "_name_or_path"): + import os + import shutil + + from huggingface_hub import hf_hub_download + from transformers import TRANSFORMERS_CACHE + from transformers.utils import http_user_agent + + cache_path = config._name_or_path + if not os.path.exists(cache_path): + user_agent = http_user_agent() + config_file_path = hf_hub_download( + repo_id=cache_path, + filename="config.json", + cache_dir=TRANSFORMERS_CACHE, + force_download=False, + user_agent=user_agent, + ) + cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1]) + + for file in os.listdir(cache_path): + full_file_name = os.path.join(cache_path, file) + if file.endswith(".py") and os.path.isfile(full_file_name): + logger.debug(f"Transferring {full_file_name} to {save_path}") + shutil.copy(full_file_name, save_path) + + +def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): + MM_KEYS = [ + "multi_modal_projector", + "vision_tower", + "multimodal_projector", + "thinker", + "visual", + "audio", + "talker", + "token2wav", + "vision_model", + "audio_tower", + "vision_encoder", + "vision_language_adapter", + "patch_merger", + "pre_mm_projector_norm", + "vision", + ] + + model_path = model_or_path if isinstance(model_or_path, str) else model_or_path.name_or_path + if not os.path.isdir(model_path): + model_path = download_hf_model(model_path) + + if isinstance(model_path, str): + if os.path.exists(os.path.join(model_path, "preprocessor_config.json")): + return True + if os.path.exists(os.path.join(model_path, "processor_config.json")): + return True + with open(os.path.join(model_path, "config.json")) as f: + config = json.load(f) + for key in config.keys(): + if any([k in key for k in MM_KEYS]): + return True + + if isinstance(model_or_path, torch.nn.Module): + for name, module in model_or_path.named_modules(): + if any([k in name for k in MM_KEYS]): + return True + + return False + + +def to_standard_regex(pattern: str) -> str: + """ + Convert a user-specified string into a standardized regex for layer matching. + + Rules: + - If the pattern already contains regex tokens ('.*', '^', '$', etc.), + keep them as-is. + - Otherwise, wrap the pattern with `.*` on both sides to allow substring matching. + - Always ensure the returned regex is valid (compilable by re). 
+ + Examples: + >>> to_standard_regex("model.embed_tokens") + '.*model\\.embed_tokens.*' + >>> to_standard_regex("mlp.gate") + '.*mlp\\.gate.*' + >>> to_standard_regex("mlp.gate$") + '.*mlp\\.gate$' + >>> to_standard_regex("mlp.*gate") + '.*mlp.*gate.*' + """ + # Heuristic: if pattern contains regex meta characters, assume partial regex + meta_chars = {".*", "^", "$", "|", "(", ")", "[", "]", "?", "+"} + has_regex = any(tok in pattern for tok in meta_chars) + if not has_regex: + # Escape literal dots, etc., and wrap with .* for substring matching + pattern = re.escape(pattern) + regex = f".*{pattern}.*" + else: + # Only escape bare dots that are not already part of regex constructs + # Avoid double escaping .* sequences + tmp = [] + i = 0 + while i < len(pattern): + if pattern[i] == ".": + if i + 1 < len(pattern) and pattern[i + 1] == "*": + tmp.append(".*") # keep regex token + i += 2 + continue + else: + tmp.append("\\.") # escape bare dot + else: + tmp.append(pattern[i]) + i += 1 + regex = "".join(tmp) + # If no anchors are provided, allow substring matching + if not regex.startswith("^") and not regex.startswith(".*"): + regex = ".*" + regex + if not regex.endswith("$") and not regex.endswith(".*"): + regex = regex + ".*" + # Validate regex + try: + re.compile(regex) + except re.error as e: + raise ValueError(f"Invalid regex generated from pattern '{pattern}': {e}") + return regex + + +def matches_any_regex(layer_name: str, regex_list: List[str], prefix="re:") -> bool: + """ + Check if layer_name matches any regex pattern in regex_list. """ - prefix = "re:" - ignore_regex: List[str] = [] + for pattern in regex_list: + # Remove 're:' prefix for matching + pat = pattern.removeprefix(prefix) + if re.fullmatch(pat, layer_name): + return True + return False - # Step 1: Add regex_config keys with bits >= 16 - for key, cfg in regex_config.items(): - bits = cfg.get("bits") - if bits > 8: - ignore_regex.append(prefix + to_standard_regex(key)) - # Step 2: Add all full named layer from layer_config if bits >= 16 - for key, cfg in layer_config.items(): - bits = cfg.get("bits") - if bits > 8: - ignore_regex.append(key) +def json_serialize(obj: Any): + """Convert non-JSON-serializable objects into JSON-friendly formats.""" + if isinstance(obj, torch.dtype): + return str(obj).split(".")[-1] # e.g., torch.float16 -> "float16" + raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") - return ignore_regex diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py new file mode 100644 index 00000000..9e12e1c8 --- /dev/null +++ b/test/test_cpu/test_mix_bits.py @@ -0,0 +1,110 @@ +import os +import shutil +import sys +import unittest + +from parameterized import parameterized + +sys.path.insert(0, "../..") +import torch +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer +from auto_round.testing_utils import require_gptqmodel + +from auto_round import AutoRound + +def _get_folder_size(path: str) -> float: + """Return folder size in GB.""" + total_size = 0 + for dirpath, _, filenames in os.walk(path): + for f in filenames: + fp = os.path.join(dirpath, f) + if os.path.isfile(fp): + total_size += os.path.getsize(fp) + return total_size / (1024**3) # convert to GB + + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + model_name = "facebook/opt-125m" + 
self.save_dir = "./saved" + self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + @require_gptqmodel + def test_mixed_gptqmodel(self): + bits, sym, group_size = 4, True, 128 + model_name = "facebook/opt-125m" + layer_config = { + "k_proj": {"bits": 8}, + "lm_head": {"bits": 16}, + "fc1": {"bits": 16}, + } + autoround = AutoRound( + model=model_name, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = "./saved" + autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + from gptqmodel import GPTQModel + model = GPTQModel.load(quantized_model_path) + assert (model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8) + assert (model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4) + result = model.generate("Uncovering deep insights begins with")[0] # tokens + assert("!!!" not in model.tokenizer.decode(result)) # string output + shutil.rmtree(quantized_model_path, ignore_errors=True) + + def test_mixed_autoround_format(self): + bits, sym, group_size = 4, True, 128 + model_name = "facebook/opt-125m" + layer_config = { + "k_proj": {"bits": 8}, + "q_proj": {"bits": 3}, + "lm_head": {"bits": 16}, + "fc1": {"bits": 16}, + } + autoround = AutoRound( + model=model_name, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = "./saved" + autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") + assert (model.model.decoder.layers[0].self_attn.k_proj.bits == 8) + assert (model.model.decoder.layers[0].self_attn.q_proj.bits == 3) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + shutil.rmtree(quantized_model_path, ignore_errors=True) + +if __name__ == "__main__": + unittest.main() + diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py new file mode 100644 index 00000000..81e0a471 --- /dev/null +++ b/test/test_cuda/test_mix_bits.py @@ -0,0 +1,190 @@ +import os +import shutil +import sys +import unittest + +from parameterized import parameterized + +sys.path.insert(0, "../..") +import torch +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer +from auto_round.testing_utils import require_gptqmodel + +from auto_round import AutoRound + +def _get_folder_size(path: str) -> float: + """Return folder size in GB.""" + total_size = 0 + for dirpath, _, filenames in os.walk(path): + for f in filenames: + fp = os.path.join(dirpath, f) + if os.path.isfile(fp): + total_size += os.path.getsize(fp) + return total_size / (1024**3) # convert to GB + + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + model_name = 
"facebook/opt-125m" + self.save_dir = "./saved" + self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + self.llm_dataloader = LLMDataLoader() + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + @require_gptqmodel + def test_mixed_gptqmodel(self): + bits, sym, group_size = 4, True, 128 + model_name = "facebook/opt-125m" + layer_config = { + "k_proj": {"bits": 8}, + "lm_head": {"bits": 16}, + "fc1": {"bits": 16}, + } + autoround = AutoRound( + model=model_name, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + layer_config=layer_config, + dataset=self.llm_dataloader, + ) + quantized_model_path = "./saved" + autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") + from gptqmodel import GPTQModel + model = GPTQModel.load(quantized_model_path) + assert (model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8) + assert (model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4) + result = model.generate("Uncovering deep insights begins with")[0] # tokens + assert("!!!" not in model.tokenizer.decode(result)) # string output + shutil.rmtree(quantized_model_path, ignore_errors=True) + + def test_mixed_autoround_format(self): + bits, sym, group_size = 4, True, 128 + model_name = "facebook/opt-125m" + layer_config = { + "k_proj": {"bits": 8}, + "q_proj": {"bits": 3}, + "lm_head": {"bits": 16}, + "fc1": {"bits": 16}, + } + autoround = AutoRound( + model=model_name, + bits=bits, + group_size=group_size, + sym=sym, + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = "./saved" + autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") + assert (model.model.decoder.layers[0].self_attn.k_proj.bits == 8) + assert (model.model.decoder.layers[0].self_attn.q_proj.bits == 3) + tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) + text = "There is a girl who likes adventure," + inputs = tokenizer(text, return_tensors="pt").to(model.device) + print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) + shutil.rmtree(quantized_model_path, ignore_errors=True) + + def test_mixed_autoround_format_vllm(self): + layer_config = { + "self_attn": {"bits": 8}, + "lm_head": {"bits": 16}, + } + autoround = AutoRound( + self.model, + self.tokenizer, + model_name, + scheme="W4A16", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + autoround.quantize() + quantized_model_path = self.save_dir + autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") + + from vllm import LLM, SamplingParams + # Sample prompts. + prompts = [ + "The capital of France is", + "The future of AI is", + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + # Create an LLM. + QUANTIZATION = "auto-round" #quantized_model_path + llm = LLM(model=quantized_model_path, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1) + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + # if "France" in prompt: + assert "!!!" 
not in generated_text + print(f"{prompt}: {generated_text}") + shutil.rmtree(quantized_model_path, ignore_errors=True) + + + def test_mixed_llmcompressor_format_vllm(self): + model_name = "facebook/opt-125m" + layer_config = { + "self_attn": {"bits": 16, "act_bits": 16, "data_type": "float"}, + "lm_head": {"bits": 16, "act_bits": 16, "data_type": "float"}, + "fc1": {"bits": 16, "act_bits": 16, "data_type": "float", }, + } + autoround = AutoRound( + model_name, + scheme="NVFP4", + iters=2, + seqlen=2, + dataset=self.llm_dataloader, + layer_config=layer_config, + ) + quantized_model_path = self.save_dir + compressed,_ = autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="llm_compressor") + from vllm import LLM, SamplingParams + # Sample prompts. + prompts = [ + "The capital of France is", + "The future of AI is", + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + # Create an LLM. + QUANTIZATION = "auto-round" #quantized_model_path + llm = LLM(model=quantized_model_path, trust_remote_code=True, tensor_parallel_size=1) + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"{prompt}: {generated_text}") + assert "!!!" not in generated_text + shutil.rmtree(quantized_model_path, ignore_errors=True) + + + +if __name__ == "__main__": + unittest.main() + From 21ff4b9a4713860fae2969c336321412e8cae709 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Sep 2025 02:05:53 +0000 Subject: [PATCH 07/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 1 - .../export/export_to_autoround/export.py | 1 - .../export_to_nvfp_mxfp.py | 1 - .../export_to_llmcompressor/export_to_fp.py | 1 - .../export/export_to_llmcompressor/utils.py | 19 +++++++++- auto_round/utils.py | 1 - test/test_cpu/test_mix_bits.py | 20 +++++----- test/test_cuda/test_mix_bits.py | 37 +++++++++++-------- 8 files changed, 50 insertions(+), 31 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 260a19b8..c8bba384 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -3492,4 +3492,3 @@ def _step(self, scaler, optimizer, lr_schedule): lr_schedule.step() if is_hpex_available(): htcore.mark_step() - diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 5c58fe2d..0de6a12b 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -399,4 +399,3 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model - diff --git a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py index bff5c4f4..349ff2b5 100644 --- a/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py +++ b/auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py @@ -262,4 +262,3 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model - diff --git a/auto_round/export/export_to_llmcompressor/export_to_fp.py b/auto_round/export/export_to_llmcompressor/export_to_fp.py index 9fc138b6..f2fd0515 100644 --- 
a/auto_round/export/export_to_llmcompressor/export_to_fp.py +++ b/auto_round/export/export_to_llmcompressor/export_to_fp.py @@ -244,4 +244,3 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model - diff --git a/auto_round/export/export_to_llmcompressor/utils.py b/auto_round/export/export_to_llmcompressor/utils.py index c2cb1ac4..304a720b 100644 --- a/auto_round/export/export_to_llmcompressor/utils.py +++ b/auto_round/export/export_to_llmcompressor/utils.py @@ -1,5 +1,20 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Dict, List -from auto_round.utils import to_standard_regex, matches_any_regex + +from auto_round.utils import matches_any_regex, to_standard_regex def generate_ignore_regex_list(regex_config: Dict[str, Dict], layer_config: Dict[str, Dict]) -> List[str]: @@ -10,7 +25,7 @@ def generate_ignore_regex_list(regex_config: Dict[str, Dict], layer_config: Dict 1. Any layer in regex_config with bits >= 16 is ignored. 2. Any layer in layer_config with bits >= 16 is ignored if not already included. 3. Output regex patterns are normalized for llm_compressor ('re:...' style). - + Args: regex_config (Dict[str, Dict]): dynamic quantization config layer_config (Dict[str, Dict]): layer-wise quantization config diff --git a/auto_round/utils.py b/auto_round/utils.py index 74e825b9..90f1f9f0 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2831,4 +2831,3 @@ def json_serialize(obj: Any): if isinstance(obj, torch.dtype): return str(obj).split(".")[-1] # e.g., torch.float16 -> "float16" raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") - diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 9e12e1c8..41aaa961 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -8,9 +8,10 @@ sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round.testing_utils import require_gptqmodel from auto_round import AutoRound +from auto_round.testing_utils import require_gptqmodel + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -45,7 +46,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - + @require_gptqmodel def test_mixed_gptqmodel(self): bits, sym, group_size = 4, True, 128 @@ -68,11 +69,12 @@ def test_mixed_gptqmodel(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") from gptqmodel import GPTQModel + model = GPTQModel.load(quantized_model_path) - assert (model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8) - assert (model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4) - result = model.generate("Uncovering deep insights begins with")[0] # tokens - assert("!!!" 
not in model.tokenizer.decode(result)) # string output + assert model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8 + assert model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4 + result = model.generate("Uncovering deep insights begins with")[0] # tokens + assert "!!!" not in model.tokenizer.decode(result) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) def test_mixed_autoround_format(self): @@ -97,14 +99,14 @@ def test_mixed_autoround_format(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") - assert (model.model.decoder.layers[0].self_attn.k_proj.bits == 8) - assert (model.model.decoder.layers[0].self_attn.q_proj.bits == 3) + assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 + assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) + if __name__ == "__main__": unittest.main() - diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 81e0a471..3f864941 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -8,9 +8,10 @@ sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round.testing_utils import require_gptqmodel from auto_round import AutoRound +from auto_round.testing_utils import require_gptqmodel + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -45,7 +46,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - + @require_gptqmodel def test_mixed_gptqmodel(self): bits, sym, group_size = 4, True, 128 @@ -68,11 +69,12 @@ def test_mixed_gptqmodel(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") from gptqmodel import GPTQModel + model = GPTQModel.load(quantized_model_path) - assert (model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8) - assert (model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4) - result = model.generate("Uncovering deep insights begins with")[0] # tokens - assert("!!!" not in model.tokenizer.decode(result)) # string output + assert model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8 + assert model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4 + result = model.generate("Uncovering deep insights begins with")[0] # tokens + assert "!!!" 
not in model.tokenizer.decode(result) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) def test_mixed_autoround_format(self): @@ -97,8 +99,8 @@ def test_mixed_autoround_format(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") - assert (model.model.decoder.layers[0].self_attn.k_proj.bits == 8) - assert (model.model.decoder.layers[0].self_attn.q_proj.bits == 3) + assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 + assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -125,6 +127,7 @@ def test_mixed_autoround_format_vllm(self): autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") from vllm import LLM, SamplingParams + # Sample prompts. prompts = [ "The capital of France is", @@ -133,7 +136,7 @@ def test_mixed_autoround_format_vllm(self): # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - QUANTIZATION = "auto-round" #quantized_model_path + QUANTIZATION = "auto-round" # quantized_model_path llm = LLM(model=quantized_model_path, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1) outputs = llm.generate(prompts, sampling_params) # Print the outputs. @@ -145,13 +148,16 @@ def test_mixed_autoround_format_vllm(self): print(f"{prompt}: {generated_text}") shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_llmcompressor_format_vllm(self): model_name = "facebook/opt-125m" layer_config = { "self_attn": {"bits": 16, "act_bits": 16, "data_type": "float"}, "lm_head": {"bits": 16, "act_bits": 16, "data_type": "float"}, - "fc1": {"bits": 16, "act_bits": 16, "data_type": "float", }, + "fc1": { + "bits": 16, + "act_bits": 16, + "data_type": "float", + }, } autoround = AutoRound( model_name, @@ -162,8 +168,11 @@ def test_mixed_llmcompressor_format_vllm(self): layer_config=layer_config, ) quantized_model_path = self.save_dir - compressed,_ = autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="llm_compressor") + compressed, _ = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="llm_compressor" + ) from vllm import LLM, SamplingParams + # Sample prompts. prompts = [ "The capital of France is", @@ -172,7 +181,7 @@ def test_mixed_llmcompressor_format_vllm(self): # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - QUANTIZATION = "auto-round" #quantized_model_path + QUANTIZATION = "auto-round" # quantized_model_path llm = LLM(model=quantized_model_path, trust_remote_code=True, tensor_parallel_size=1) outputs = llm.generate(prompts, sampling_params) # Print the outputs. 
@@ -184,7 +193,5 @@ def test_mixed_llmcompressor_format_vllm(self): shutil.rmtree(quantized_model_path, ignore_errors=True) - if __name__ == "__main__": unittest.main() - From b97f3fc91b2a3c293afe4b37fd64e375f9dab585 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Wed, 24 Sep 2025 10:08:29 +0800 Subject: [PATCH 08/10] refine ut Signed-off-by: Zhang, Weiwei1 --- test/test_cuda/test_mix_bits.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 81e0a471..a8780c8b 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -12,17 +12,6 @@ from auto_round import AutoRound -def _get_folder_size(path: str) -> float: - """Return folder size in GB.""" - total_size = 0 - for dirpath, _, filenames in os.walk(path): - for f in filenames: - fp = os.path.join(dirpath, f) - if os.path.isfile(fp): - total_size += os.path.getsize(fp) - return total_size / (1024**3) # convert to GB - - class LLMDataLoader: def __init__(self): self.batch_size = 1 @@ -113,7 +102,6 @@ def test_mixed_autoround_format_vllm(self): autoround = AutoRound( self.model, self.tokenizer, - model_name, scheme="W4A16", iters=2, seqlen=2, From b91bf2067140bf22dd7792239b94ddac67de65f4 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Wed, 24 Sep 2025 11:54:37 +0800 Subject: [PATCH 09/10] fix UT Signed-off-by: Zhang, Weiwei1 --- auto_round/export/export_to_awq/export.py | 2 + test/test_cpu/test_mix_bits.py | 26 ++++++------- test/test_cuda/test_mix_bits.py | 45 ++++++++++------------- 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py index 6da6bec3..2c58412d 100644 --- a/auto_round/export/export_to_awq/export.py +++ b/auto_round/export/export_to_awq/export.py @@ -137,6 +137,7 @@ def wrapper(name): return model quantization_config = kwargs["serialization_dict"] + quantization_config.pop("regex_config") #as awq do not support mixed bits config saving if output_dir is None: return compressed_model @@ -159,3 +160,4 @@ def wrapper(name): save_model(compressed_model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return compressed_model + diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 41aaa961..40884f3b 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -8,10 +8,9 @@ sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer - -from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from auto_round import AutoRound def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -36,7 +35,7 @@ def __iter__(self): class TestAutoRound(unittest.TestCase): @classmethod def setUpClass(self): - model_name = "facebook/opt-125m" + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -46,11 +45,11 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - + @require_gptqmodel def test_mixed_gptqmodel(self): bits, sym, group_size = 4, True, 128 - model_name = "facebook/opt-125m" + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" 
layer_config = { "k_proj": {"bits": 8}, "lm_head": {"bits": 16}, @@ -69,17 +68,16 @@ def test_mixed_gptqmodel(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") from gptqmodel import GPTQModel - model = GPTQModel.load(quantized_model_path) - assert model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8 - assert model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4 - result = model.generate("Uncovering deep insights begins with")[0] # tokens - assert "!!!" not in model.tokenizer.decode(result) # string output + assert (model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8) + assert (model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4) + result = model.generate("Uncovering deep insights begins with")[0] # tokens + assert("!!!" not in model.tokenizer.decode(result)) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) def test_mixed_autoround_format(self): bits, sym, group_size = 4, True, 128 - model_name = "facebook/opt-125m" + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -99,14 +97,14 @@ def test_mixed_autoround_format(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") - assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 - assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 + assert (model.model.decoder.layers[0].self_attn.k_proj.bits == 8) + assert (model.model.decoder.layers[0].self_attn.q_proj.bits == 3) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - if __name__ == "__main__": unittest.main() + diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index a7d7b464..a7e25c70 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -8,10 +8,9 @@ sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer - -from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from auto_round import AutoRound class LLMDataLoader: def __init__(self): @@ -25,7 +24,7 @@ def __iter__(self): class TestAutoRound(unittest.TestCase): @classmethod def setUpClass(self): - model_name = "facebook/opt-125m" + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -35,11 +34,11 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - + @require_gptqmodel def test_mixed_gptqmodel(self): bits, sym, group_size = 4, True, 128 - model_name = "facebook/opt-125m" + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = { "k_proj": {"bits": 8}, "lm_head": {"bits": 16}, @@ -58,17 +57,16 @@ def test_mixed_gptqmodel(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, 
format="auto_gptq") from gptqmodel import GPTQModel - model = GPTQModel.load(quantized_model_path) - assert model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8 - assert model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4 - result = model.generate("Uncovering deep insights begins with")[0] # tokens - assert "!!!" not in model.tokenizer.decode(result) # string output + assert (model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8) + assert (model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4) + result = model.generate("Uncovering deep insights begins with")[0] # tokens + assert("!!!" not in model.tokenizer.decode(result)) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) def test_mixed_autoround_format(self): bits, sym, group_size = 4, True, 128 - model_name = "facebook/opt-125m" + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -88,8 +86,8 @@ def test_mixed_autoround_format(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") - assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 - assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 + assert (model.model.decoder.layers[0].self_attn.k_proj.bits == 8) + assert (model.model.decoder.layers[0].self_attn.q_proj.bits == 3) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -115,7 +113,6 @@ def test_mixed_autoround_format_vllm(self): autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") from vllm import LLM, SamplingParams - # Sample prompts. prompts = [ "The capital of France is", @@ -124,7 +121,7 @@ def test_mixed_autoround_format_vllm(self): # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - QUANTIZATION = "auto-round" # quantized_model_path + QUANTIZATION = "auto-round" #quantized_model_path llm = LLM(model=quantized_model_path, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1) outputs = llm.generate(prompts, sampling_params) # Print the outputs. @@ -136,16 +133,13 @@ def test_mixed_autoround_format_vllm(self): print(f"{prompt}: {generated_text}") shutil.rmtree(quantized_model_path, ignore_errors=True) + def test_mixed_llmcompressor_format_vllm(self): - model_name = "facebook/opt-125m" + model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = { "self_attn": {"bits": 16, "act_bits": 16, "data_type": "float"}, "lm_head": {"bits": 16, "act_bits": 16, "data_type": "float"}, - "fc1": { - "bits": 16, - "act_bits": 16, - "data_type": "float", - }, + "fc1": {"bits": 16, "act_bits": 16, "data_type": "float", }, } autoround = AutoRound( model_name, @@ -156,11 +150,8 @@ def test_mixed_llmcompressor_format_vllm(self): layer_config=layer_config, ) quantized_model_path = self.save_dir - compressed, _ = autoround.quantize_and_save( - output_dir=quantized_model_path, inplace=False, format="llm_compressor" - ) + compressed,_ = autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="llm_compressor") from vllm import LLM, SamplingParams - # Sample prompts. 
prompts = [ "The capital of France is", @@ -169,7 +160,7 @@ def test_mixed_llmcompressor_format_vllm(self): # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - QUANTIZATION = "auto-round" # quantized_model_path + QUANTIZATION = "auto-round" #quantized_model_path llm = LLM(model=quantized_model_path, trust_remote_code=True, tensor_parallel_size=1) outputs = llm.generate(prompts, sampling_params) # Print the outputs. @@ -181,5 +172,7 @@ def test_mixed_llmcompressor_format_vllm(self): shutil.rmtree(quantized_model_path, ignore_errors=True) + if __name__ == "__main__": unittest.main() + From cd5c69300a443c0513d318b57f291c850038464c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Sep 2025 03:55:14 +0000 Subject: [PATCH 10/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_awq/export.py | 3 +- test/test_cpu/test_mix_bits.py | 20 ++++++------ test/test_cuda/test_mix_bits.py | 37 ++++++++++++++--------- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py index 2c58412d..46ba11b3 100644 --- a/auto_round/export/export_to_awq/export.py +++ b/auto_round/export/export_to_awq/export.py @@ -137,7 +137,7 @@ def wrapper(name): return model quantization_config = kwargs["serialization_dict"] - quantization_config.pop("regex_config") #as awq do not support mixed bits config saving + quantization_config.pop("regex_config") # as awq do not support mixed bits config saving if output_dir is None: return compressed_model @@ -160,4 +160,3 @@ def wrapper(name): save_model(compressed_model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return compressed_model - diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 40884f3b..d7a5c55d 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -8,9 +8,10 @@ sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round.testing_utils import require_gptqmodel from auto_round import AutoRound +from auto_round.testing_utils import require_gptqmodel + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -45,7 +46,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - + @require_gptqmodel def test_mixed_gptqmodel(self): bits, sym, group_size = 4, True, 128 @@ -68,11 +69,12 @@ def test_mixed_gptqmodel(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") from gptqmodel import GPTQModel + model = GPTQModel.load(quantized_model_path) - assert (model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8) - assert (model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4) - result = model.generate("Uncovering deep insights begins with")[0] # tokens - assert("!!!" not in model.tokenizer.decode(result)) # string output + assert model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8 + assert model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4 + result = model.generate("Uncovering deep insights begins with")[0] # tokens + assert "!!!" 
not in model.tokenizer.decode(result) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) def test_mixed_autoround_format(self): @@ -97,14 +99,14 @@ def test_mixed_autoround_format(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") - assert (model.model.decoder.layers[0].self_attn.k_proj.bits == 8) - assert (model.model.decoder.layers[0].self_attn.q_proj.bits == 3) + assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 + assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) + if __name__ == "__main__": unittest.main() - diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index a7e25c70..7353af6b 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -8,9 +8,10 @@ sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round.testing_utils import require_gptqmodel from auto_round import AutoRound +from auto_round.testing_utils import require_gptqmodel + class LLMDataLoader: def __init__(self): @@ -34,7 +35,7 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - + @require_gptqmodel def test_mixed_gptqmodel(self): bits, sym, group_size = 4, True, 128 @@ -57,11 +58,12 @@ def test_mixed_gptqmodel(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") from gptqmodel import GPTQModel + model = GPTQModel.load(quantized_model_path) - assert (model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8) - assert (model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4) - result = model.generate("Uncovering deep insights begins with")[0] # tokens - assert("!!!" not in model.tokenizer.decode(result)) # string output + assert model.model.model.decoder.layers[0].self_attn.k_proj.bits == 8 + assert model.model.model.decoder.layers[0].self_attn.q_proj.bits == 4 + result = model.generate("Uncovering deep insights begins with")[0] # tokens + assert "!!!" 
not in model.tokenizer.decode(result) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) def test_mixed_autoround_format(self): @@ -86,8 +88,8 @@ def test_mixed_autoround_format(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="cpu") - assert (model.model.decoder.layers[0].self_attn.k_proj.bits == 8) - assert (model.model.decoder.layers[0].self_attn.q_proj.bits == 3) + assert model.model.decoder.layers[0].self_attn.k_proj.bits == 8 + assert model.model.decoder.layers[0].self_attn.q_proj.bits == 3 tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) @@ -113,6 +115,7 @@ def test_mixed_autoround_format_vllm(self): autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") from vllm import LLM, SamplingParams + # Sample prompts. prompts = [ "The capital of France is", @@ -121,7 +124,7 @@ def test_mixed_autoround_format_vllm(self): # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - QUANTIZATION = "auto-round" #quantized_model_path + QUANTIZATION = "auto-round" # quantized_model_path llm = LLM(model=quantized_model_path, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1) outputs = llm.generate(prompts, sampling_params) # Print the outputs. @@ -133,13 +136,16 @@ def test_mixed_autoround_format_vllm(self): print(f"{prompt}: {generated_text}") shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_llmcompressor_format_vllm(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = { "self_attn": {"bits": 16, "act_bits": 16, "data_type": "float"}, "lm_head": {"bits": 16, "act_bits": 16, "data_type": "float"}, - "fc1": {"bits": 16, "act_bits": 16, "data_type": "float", }, + "fc1": { + "bits": 16, + "act_bits": 16, + "data_type": "float", + }, } autoround = AutoRound( model_name, @@ -150,8 +156,11 @@ def test_mixed_llmcompressor_format_vllm(self): layer_config=layer_config, ) quantized_model_path = self.save_dir - compressed,_ = autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="llm_compressor") + compressed, _ = autoround.quantize_and_save( + output_dir=quantized_model_path, inplace=False, format="llm_compressor" + ) from vllm import LLM, SamplingParams + # Sample prompts. prompts = [ "The capital of France is", @@ -160,7 +169,7 @@ def test_mixed_llmcompressor_format_vllm(self): # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - QUANTIZATION = "auto-round" #quantized_model_path + QUANTIZATION = "auto-round" # quantized_model_path llm = LLM(model=quantized_model_path, trust_remote_code=True, tensor_parallel_size=1) outputs = llm.generate(prompts, sampling_params) # Print the outputs. @@ -172,7 +181,5 @@ def test_mixed_llmcompressor_format_vllm(self): shutil.rmtree(quantized_model_path, ignore_errors=True) - if __name__ == "__main__": unittest.main() -
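
To make the intended behavior of the helpers introduced in this series easier to review, a few illustrative sketches follow. None of this code is part of the patches; each snippet is a minimal, self-contained example under stated assumptions. First, the block-wise FP8 dequantization path (pad_weight, pad_block_fp8_weight_naive, dequant_block_fp8_weight): plain float tensors stand in for real FP8 storage here, and the shapes are chosen so the padding branch is exercised.

import torch

from auto_round.utils import dequant_block_fp8_weight

# A 10 x 6 weight on a [4, 4] block grid: neither dimension is a multiple
# of 4, so pad_weight pads to 12 x 8 before the per-block scale multiply.
M, N, block_size = 10, 6, [4, 4]
weight = torch.randn(M, N)
# One scale per block of the padded weight: ceil(10/4) x ceil(6/4) = 3 x 2.
weight_scale = torch.full(((M + 3) // 4, (N + 3) // 4), 2.0)

dq = dequant_block_fp8_weight(weight, weight_scale, block_size)
assert dq.shape == (M, N)  # the padding is stripped again
assert torch.allclose(dq, weight.to(torch.bfloat16) * 2.0)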
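Next, how set_amax_for_uncalibrated_experts backfills experts that were never activated during calibration: a missing act_max inherits the maximum observed across the calibrated experts. The toy experts below are plain Linear modules with act_max attached by hand, standing in for wrapped expert layers.

import torch

from auto_round.utils import set_amax_for_uncalibrated_experts

experts = [torch.nn.Linear(4, 4) for _ in range(3)]
experts[0].act_max = torch.tensor([1.0, 3.0])  # calibrated
experts[1].act_max = torch.tensor([2.0])  # calibrated
# experts[2] was never hit during calibration, so it has no act_max.

set_amax_for_uncalibrated_experts(experts)  # warns once, then backfills
assert torch.equal(experts[2].act_max, torch.tensor(3.0))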
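The packing-device resolution is equally mechanical: "auto" (or None) prefers CUDA, then XPU, then CPU, while explicit strings are parsed by torch.device.

import torch

from auto_round.utils import _get_packing_device

assert isinstance(_get_packing_device(None), torch.device)  # None is treated as "auto"
assert _get_packing_device("cpu") == torch.device("cpu")  # strings are parsed as-is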
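The regex helpers behave as their docstring examples suggest: plain layer names become substring patterns, while inputs that already contain regex tokens keep their anchors. matches_any_regex strips the optional "re:" prefix and then full-matches against the layer name.

from auto_round.utils import matches_any_regex, to_standard_regex

assert to_standard_regex("k_proj") == ".*k_proj.*"  # plain name -> substring match
assert to_standard_regex("mlp.gate$") == ".*mlp\\.gate$"  # user-supplied anchors survive

patterns = ["re:" + to_standard_regex(p) for p in ("k_proj", "mlp.gate$")]
assert matches_any_regex("model.decoder.layers.0.self_attn.k_proj", patterns)
assert not matches_any_regex("model.decoder.layers.0.self_attn.v_proj", patterns)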
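Finally, the mixed-bits flow the new tests exercise, condensed to its essentials; the model and the per-layer overrides mirror test_mix_bits.py rather than introducing anything new. Substring keys in layer_config are resolved against the real layer names, and after reloading, the tests check that the per-layer bits survived the save/load round trip.

from auto_round import AutoRound

layer_config = {
    "k_proj": {"bits": 8},  # widen every attention k_proj to 8 bit
    "lm_head": {"bits": 16},  # keep the output head unquantized
}
autoround = AutoRound(
    "facebook/opt-125m",
    bits=4,
    group_size=128,
    sym=True,
    iters=2,  # tiny settings for a smoke run, as in the tests
    seqlen=2,
    layer_config=layer_config,
)
autoround.quantize_and_save(output_dir="./saved", format="auto_round")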