From 50d3dfdc4b53c5aec84874bb2db5536c471e34c5 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Sun, 3 Aug 2025 20:48:24 -0400 Subject: [PATCH 1/4] add smoothquant to autoround Signed-off-by: n1ck-guo --- auto_round/smooth_quant/__init__.py | 18 + auto_round/smooth_quant/absorb_utils.py | 440 +++++++++++++++ auto_round/smooth_quant/auto_alpha.py | 713 ++++++++++++++++++++++++ auto_round/smooth_quant/calibration.py | 109 ++++ auto_round/smooth_quant/sq.py | 492 ++++++++++++++++ auto_round/smooth_quant/utils.py | 245 ++++++++ 6 files changed, 2017 insertions(+) create mode 100644 auto_round/smooth_quant/__init__.py create mode 100644 auto_round/smooth_quant/absorb_utils.py create mode 100644 auto_round/smooth_quant/auto_alpha.py create mode 100644 auto_round/smooth_quant/calibration.py create mode 100644 auto_round/smooth_quant/sq.py create mode 100644 auto_round/smooth_quant/utils.py diff --git a/auto_round/smooth_quant/__init__.py b/auto_round/smooth_quant/__init__.py new file mode 100644 index 000000000..0abe6e53f --- /dev/null +++ b/auto_round/smooth_quant/__init__.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_round.smooth_quant.sq import SmoothQuant \ No newline at end of file diff --git a/auto_round/smooth_quant/absorb_utils.py b/auto_round/smooth_quant/absorb_utils.py new file mode 100644 index 000000000..c03627327 --- /dev/null +++ b/auto_round/smooth_quant/absorb_utils.py @@ -0,0 +1,440 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch + +from auto_round.utils import get_module + +SUPPORTED_TORCH_MODULE = [ + "Linear", + "Conv2d", + "ConvTranspose2d", + "LayerNorm", + "BatchNorm2d", + "GroupNorm", + "InstanceNorm2d", + "LlamaRMSNorm", + "T5LayerNorm", + "LPLayerNorm", + "RMSNorm", + "Qwen2RMSNorm", + "WrapperWALayer" +] + +GET_ABSORB_LAYERS = {} + +def register_get_func(name): + """Class decorator to register a get_absorb_layers func + """ + def register(func): + GET_ABSORB_LAYERS[name] = func + return func + return register + +def _check_valid_conv(module): + """Remove group conv except depthwise conv + :param module: + + :return: + """ + if not isinstance(module, torch.nn.Conv2d): + return True + if module.groups > 1: + if module.in_channels == module.out_channels and module.groups == module.in_channels: + return True + else: + return False + return True + +def remove_unsupported_layers(model, absorb_to_layer, no_absorb_layers): + res = {} + for key in absorb_to_layer.keys(): + absorb_layer = get_module(model, key) + layer_type = absorb_layer.__class__.__name__ + if layer_type not in SUPPORTED_TORCH_MODULE: + no_absorb_layers.extend(absorb_to_layer[key]) + continue + supported = True + for layer_name in absorb_to_layer[key]: + layer = get_module(model, layer_name) + layer_type = layer.__class__.__name__ + if (layer_type not in SUPPORTED_TORCH_MODULE) or not _check_valid_conv(layer): + supported = False + no_absorb_layers.extend(absorb_to_layer[key]) + break + if supported: + res[key] = absorb_to_layer[key] + return res + +@register_get_func("opt") +def get_opt_absorb_layers(model): + model_layer_name = "model.decoder.layers" + absorb_to_layer = {} + for idx in range(len(model.model.decoder.layers)): + # attention input + absorb_to_layer[f"{model_layer_name}.{idx}.self_attn_layer_norm"] = [ + f"{model_layer_name}.{idx}.self_attn.q_proj", + f"{model_layer_name}.{idx}.self_attn.k_proj", + f"{model_layer_name}.{idx}.self_attn.v_proj", + ] + + # attention out + # no_absorb_layers.append(f"{model_layer_name}.{idx}.self_attn.out_proj") + absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ + f"{model_layer_name}.{idx}.self_attn.out_proj", + ] + + # linear 1 + absorb_to_layer[f"{model_layer_name}.{idx}.final_layer_norm"] = [ + f"{model_layer_name}.{idx}.fc1", + ] + + # linear 2 + absorb_to_layer[f"{model_layer_name}.{idx}.fc1"] = [ + f"{model_layer_name}.{idx}.fc2", + ] + + # final layer + # absorb_to_layer["model.decoder.final_layer_norm"] = ['lm_head'] + + return absorb_to_layer + +@register_get_func('llama') +def get_llama_absorb_layers(model): + model_layer_name = "model.layers" + absorb_to_layer = {} + + for idx in range(len(model.model.layers)): + # attention input + absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ + f"{model_layer_name}.{idx}.self_attn.q_proj", + f"{model_layer_name}.{idx}.self_attn.k_proj", + f"{model_layer_name}.{idx}.self_attn.v_proj", + ] + + # attention out + module = model.model.layers[idx] + if hasattr(module.self_attn.v_proj, "orig_layer"): + v_proj_shape = module.self_attn.v_proj.orig_layer.weight.shape + o_proj_shape = module.self_attn.o_proj.orig_layer.weight.shape + else: + v_proj_shape = module.self_attn.v_proj.weight.shape + o_proj_shape = module.self_attn.o_proj.weight.shape + if v_proj_shape == o_proj_shape: + absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ + f"{model_layer_name}.{idx}.self_attn.o_proj", + ] + + # linear 1 + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ + f"{model_layer_name}.{idx}.mlp.gate_proj", + 
f"{model_layer_name}.{idx}.mlp.up_proj", + ] + + # linear 2 + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [ + f"{model_layer_name}.{idx}.mlp.down_proj", + ] + + # final layer + # absorb_to_layer["model.norm"] = ['lm_head'] + + return absorb_to_layer + +@register_get_func('mistral') +def get_mistral_absorb_layers(model): + model_layer_name = "model.layers" + absorb_to_layer = {} + for idx in range(len(model.model.layers)): + # attention input + absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ + f"{model_layer_name}.{idx}.self_attn.q_proj", + f"{model_layer_name}.{idx}.self_attn.k_proj", + f"{model_layer_name}.{idx}.self_attn.v_proj", + ] + + # attention out + module = model.model.layers[idx] + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: + absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ + f"{model_layer_name}.{idx}.self_attn.o_proj", + ] + + # linear 1 + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ + f"{model_layer_name}.{idx}.mlp.gate_proj", + f"{model_layer_name}.{idx}.mlp.up_proj", + ] + + # linear 2 + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [ + f"{model_layer_name}.{idx}.mlp.down_proj", + ] + + # final layer + # absorb_to_layer["model.norm"] = ['lm_head'] + + return absorb_to_layer + +@register_get_func('mixtral') +def get_mixtral_absorb_layers(model): + model_layer_name = "model.layers" + absorb_to_layer = {} + for idx in range(len(model.model.layers)): + # attention input + absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ + f"{model_layer_name}.{idx}.self_attn.q_proj", + f"{model_layer_name}.{idx}.self_attn.k_proj", + f"{model_layer_name}.{idx}.self_attn.v_proj", + ] + + # attention out + module = model.model.layers[idx] + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: + absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ + f"{model_layer_name}.{idx}.self_attn.o_proj", + ] + + # linear in + module = get_module(model, f"{model_layer_name}.{idx}.block_sparse_moe.experts") + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [] + for i in range(len(module)): + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"].extend( + [ + f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w1", + f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w3" + ] + ) + + + # linear out + for i in range(len(module)): + absorb_to_layer[f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w3"] = [ + f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w2" + ] + + # final layer + # absorb_to_layer["model.norm"] = ['lm_head'] + return absorb_to_layer + + +@register_get_func('bloom') +def get_bloom_absorb_layers(model): + model_layer_name = "transformer.h" + absorb_to_layer = {} + for idx in range(len(model.transformer.h)): + # attention input + absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ + f"{model_layer_name}.{idx}.self_attention.query_key_value", + ] + + # linear 1 + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ + f"{model_layer_name}.{idx}.mlp.dense_h_to_4h", + ] + + # linear 2 + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.gelu_impl"] = [ + f"{model_layer_name}.{idx}.mlp.dense_4h_to_h", + ] + + # final layer + # absorb_to_layer["transformer.ln_f"] = ['lm_head'] + + return absorb_to_layer + + +@register_get_func('gptj') +def get_gptj_absorb_layers(model): + model_layer_name = "transformer.h" + absorb_to_layer = {} + for idx in 
range(len(model.transformer.h)): + # attention input + linear 1 + absorb_to_layer[f"{model_layer_name}.{idx}.ln_1"] = [ + f"{model_layer_name}.{idx}.attn.q_proj", + f"{model_layer_name}.{idx}.attn.k_proj", + f"{model_layer_name}.{idx}.attn.v_proj", + f"{model_layer_name}.{idx}.mlp.fc_in", + ] + + # attention out + absorb_to_layer[f"{model_layer_name}.{idx}.attn.v_proj"] = [ + f"{model_layer_name}.{idx}.attn.out_proj", + ] + + # linear 2 + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.act"] = [ + f"{model_layer_name}.{idx}.mlp.fc_out", + ] + + # final layer + # absorb_to_layer["transformer.ln_f"] = ['lm_head'] + + return absorb_to_layer + +@register_get_func('phi3') +def get_phi3_absorb_layers(model): + model_layer_name = "model.layers" + absorb_to_layer = {} + for idx in range(len(model.model.layers)): + # attention input + absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ + f"{model_layer_name}.{idx}.self_attn.qkv_proj", + ] + + # attention out + absorb_to_layer[f"{model_layer_name}.{idx}.self_attn.qkv_proj"] = [ + f"{model_layer_name}.{idx}.self_attn.o_proj", + ] + + # linear 1 + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ + f"{model_layer_name}.{idx}.mlp.gate_up_proj", + ] + + # linear 2 + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.gate_up_proj"] = [ + f"{model_layer_name}.{idx}.mlp.down_proj", + ] + + # final layer + # absorb_to_layer["model.norm"] = ['lm_head'] + + return absorb_to_layer + + +@register_get_func('qwen') +def get_qwen_absorb_layers(model): + model_layer_name = "transformer.h" + absorb_to_layer = {} + for idx in range(len(model.transformer.h)): + # attention + absorb_to_layer[f"{model_layer_name}.{idx}.ln_1"] = [ + f"{model_layer_name}.{idx}.attn.c_attn" + ] + + # mlp + absorb_to_layer[f"{model_layer_name}.{idx}.ln_2"] = [ + f"{model_layer_name}.{idx}.mlp.w2", + f"{model_layer_name}.{idx}.mlp.w1", + ] + + # linear 2 + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.w1"] = [ + f"{model_layer_name}.{idx}.mlp.c_proj", + ] + + # final layer + # absorb_to_layer["transformer.ln_f"] = ['lm_head'] + + return absorb_to_layer + + +@register_get_func('qwen2') +@register_get_func('qwen3') +def get_qwen2_absorb_layers(model): + model_layer_name = "model.layers" + absorb_to_layer = {} + for idx in range(len(model.model.layers)): + # attention input + absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ + f"{model_layer_name}.{idx}.self_attn.q_proj", + f"{model_layer_name}.{idx}.self_attn.k_proj", + f"{model_layer_name}.{idx}.self_attn.v_proj", + ] + + # attention out + module = model.model.layers[idx] + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: + absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ + f"{model_layer_name}.{idx}.self_attn.o_proj", + ] + + # linear 1 + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ + f"{model_layer_name}.{idx}.mlp.gate_proj", + f"{model_layer_name}.{idx}.mlp.up_proj", + ] + + # linear 2 + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [ + f"{model_layer_name}.{idx}.mlp.down_proj", + ] + + # final layer + # absorb_to_layer["model.norm"] = ['lm_head'] + + return absorb_to_layer + + +@register_get_func('qwen3_moe') +def get_qwen3_moe_absorb_layers(model): + model_layer_name = "model.layers" + absorb_to_layer = {} + for idx in range(len(model.model.layers)): + # attention input + absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ + f"{model_layer_name}.{idx}.self_attn.q_proj", + 
f"{model_layer_name}.{idx}.self_attn.k_proj", + f"{model_layer_name}.{idx}.self_attn.v_proj", + ] + + # attention out + module = model.model.layers[idx] + if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape: + absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ + f"{model_layer_name}.{idx}.self_attn.o_proj", + ] + + if hasattr(module.mlp, "gate"): + # linear in + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ + f"{model_layer_name}.{idx}.mlp.experts.{i}.gate_proj" for i in range(len(module.mlp.experts)) + ] + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"].extend([ + f"{model_layer_name}.{idx}.mlp.experts.{i}.up_proj" for i in range(len(module.mlp.experts)) + ]) + breakpoint() + + # linear out + for i in range(len(module.mlp.experts)): + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.experts.{i}.up_proj"] = [ + f"{model_layer_name}.{idx}.mlp.experts.{i}.down_proj", + ] + else: + # linear 1 + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ + f"{model_layer_name}.{idx}.mlp.gate_proj",f"{model_layer_name}.{idx}.mlp.up_proj" + ] + + # linear 2 + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [ + f"{model_layer_name}.{idx}.mlp.down_proj" + ] + + # final layer + # absorb_to_layer["model.norm"] = ['lm_head'] + return absorb_to_layer + +def get_absorb_layers(model, skip_unsupported_layers=False): + model_type = model.config.model_type + assert model_type in GET_ABSORB_LAYERS, f"Unsupported model type: {model_type}" + absorb_to_layer = GET_ABSORB_LAYERS[model_type](model) + no_absorb_layers = [] + # if skip_unsupported_layers: + # absorb_to_layer = remove_unsupported_layers(model, absorb_to_layer, no_absorb_layers) + return absorb_to_layer, no_absorb_layers + \ No newline at end of file diff --git a/auto_round/smooth_quant/auto_alpha.py b/auto_round/smooth_quant/auto_alpha.py new file mode 100644 index 000000000..8c87dc7e1 --- /dev/null +++ b/auto_round/smooth_quant/auto_alpha.py @@ -0,0 +1,713 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import copy +import json + + +import torch +from .utils import logger + + +import numpy +from tqdm import tqdm + +from auto_round.smooth_quant.calibration import Calibration +from auto_round.utils import set_module, get_module +from auto_round.smooth_quant.utils import ( + WrapperLayer, enough_memo_store_scale, reshape_scale_as_input, reshape_scale_as_weight, reshape_in_channel_to_last, + cal_scale, forward_wrapper, mul_scale, quant_dequant) + +TUNERS = {} + + +def register_autotune(name): + """Class decorator to register a smoothquant auto-tune subclass. 
+ + :return: the class of register + """ + + def register(auto_tune): + TUNERS[name] = auto_tune + return auto_tune + + return register + + +@register_autotune("version1") +class AutoAlpha: + def __init__( + self, + model, + dataloader, + absorb_to_layer, + op_types, + device, + q_func, + example_inputs, + weight_clip=True, + alpha_min=0.3, + alpha_max=0.7, + alpha_step=0.1, + shared_criterion="mean", + init_alpha=0.5, + folding=False, + do_blockwise=False, + n_samples=32, + calib_iter=100, + group_size=-1 + ): + """Initialize the AutoAlpha tuner with necessary parameters and components.""" + + self.model = model.to("cpu") + self.model.eval() + self.dataloader = dataloader + self.alpha_min = alpha_min + self.alpha_max = alpha_max + self.alpha_step = alpha_step + self.shared_criterion = shared_criterion + self.init_alpha = init_alpha + self.loss_type = "blockwise" if do_blockwise else "model_wise" + self.calib_sample_num = n_samples if n_samples else 32 + self.op_types = op_types + self.absorb_to_layer = absorb_to_layer + self.weight_scale_dict = {} + self.q_func = q_func + self.folding = folding + self.example_inputs = example_inputs + self.max_value_info = {} # to record max values for alpha tune + self.weight_clip = weight_clip[0] if isinstance(weight_clip, tuple) else weight_clip + self.input_maxes = {} + self.input_mins = {} + self.input_maxes_abs = {} + self.device = device + self.calib_iter = calib_iter + self.group_size = group_size + + def tune(self): + """The main entry of auto_alpha + :return: Optimal alpha values and scales based on user-defined recipes.""" + calib = Calibration(self.model, self.dataloader, self.q_func, self.device, self.group_size) + self.input_mins, self.input_maxes = calib.calibrate(self.calib_iter, self.op_types) + for key in self.input_mins.keys(): + self.input_maxes_abs[key] = torch.max(torch.abs(self.input_mins[key]), torch.abs(self.input_maxes[key])) + + if not self.folding: + diff_modules = set(self.absorb_to_layer.keys()).difference(self.input_mins.keys()) + for d in diff_modules: + del self.absorb_to_layer[d] + + scale_memo_use = 0 + for key in self.absorb_to_layer: + layer_name = self.absorb_to_layer[key][0] + input_max = self.input_maxes_abs[layer_name] + scale_memo_use += 4 * input_max.shape[0] * len(self.absorb_to_layer[key]) + alpha_space_len = (self.alpha_max - self.alpha_min) / self.alpha_step + 1 + scale_memo_use *= alpha_space_len + self._save_scale = enough_memo_store_scale(self.device, scale_memo_use) + + if self.loss_type == "blockwise": + self.block_names = self.get_blocks() + logger.info("Blockwise auto-tuning will be performed") + module_names = self._get_sq_layer_names() + block_names, self.block_to_module = self.block_names, {} + for block in block_names: + self.block_to_module[block] = [] + for module in module_names: + checked = False + for block in block_names: + if block + "." 
in module: + self.block_to_module[block].append(module) + checked = True + if not checked: + self.block_to_module[module] = [module] + self.block_names = list(self.block_to_module.keys()) + logger.info(f"Blockwise auto-tuning: {len(self.block_names)} blocks found") + logger.debug(f"Blockwise auto-tuning blocks info: {self.block_to_module}") + return self._auto_tune_alpha_blockwise() + else: + return self._auto_tune_alpha() + + def get_blocks(self): + """Obtain a list of blocks in block-wise tuning mode.""" + block_names = [] + for n, m in self.model.named_modules(): + if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: + for nn, mm in m.named_children(): + block_name = n + "." + nn + block_names.append(block_name) + break + return block_names + + def _add_blockwise_observer(self, block_modules): + """ + :param block_modules: the block modules which the observer will insert to + :return: + """ + self.blockwise_hook_handles = [] + for key in block_modules.keys(): + hook_func = self._save_blockwise_hook(key) + hook_handle = block_modules[key].register_forward_hook(hook_func) + self.blockwise_hook_handles.append(hook_handle) + + def _save_blockwise_hook(self, name): + """A forward hook to save inputs/outputs of a block + :param name: the block name + :return: A hook function.""" + + def save_blockwise_hook(module, inputs, outputs): + self.block_inputs[name] = inputs[0] + self.block_outputs[name] = outputs[0] + + return save_blockwise_hook + + def _get_all_hook_module_names(self): + """Obtain all the modules that could be hooked based on given op_types.""" + module_names = [] + for n, module in self.model.named_modules(): + if isinstance(module, tuple(self.op_types)): + module_names.append(n) + return module_names + + # def _update_scales_for_auto(self, absorb_scales, weight_scales): + # """Apply activation and weight scales to the model.""" + # for key in self.absorb_to_layer.keys(): + # layer_names = self.absorb_to_layer[key] + # for layer_name in layer_names: + # layer = get_module(self.model, layer_name) + # input_scale = absorb_scales[key] + # weight_scale = weight_scales[layer_name] + # input_scale = reshape_scale_as_input(layer, input_scale) + # weight_scale = reshape_scale_as_weight(layer, weight_scale) + # #layer.update_scale(input_scale, weight_scale) ##FIXME + # layer.update_scale(None, weight_scale) ##FIXME + + def _update_scales_for_auto(self, absorb_scales, weight_scales): + """Apply activation and weight scales to the model.""" + for key in self.absorb_to_layer.keys(): + absorb_layer = get_module(self.model, key) + layer_names = self.absorb_to_layer[key] + if isinstance(absorb_layer, WrapperLayer): + absorb_scale = absorb_scales[key] + absorb_scale = absorb_scale.view(-1, 1) + absorb_layer.update_scale(None, None, absorb_scale) + + for layer_name in layer_names: + layer = get_module(self.model, layer_name) + weight_scale = weight_scales[layer_name] + weight_scale = reshape_scale_as_weight(layer, weight_scale) + layer.update_scale(None, weight_scale) ##FIXME + else: + for layer_name in layer_names: + layer = get_module(self.model, layer_name) + input_scale = absorb_scales[key] + weight_scale = weight_scales[layer_name] + input_scale = reshape_scale_as_input(layer, input_scale) + weight_scale = reshape_scale_as_weight(layer, weight_scale) + layer.update_scale(input_scale, weight_scale) ##FIXME + + def _change_qdq_for_auto(self, enable=True): + """Change the option for qdq.""" + module_names = self._get_all_hook_module_names() + for name in module_names: + name = 
name.split(".orig_layer")[0] + module = get_module(self.model, name) + if not hasattr(module, "orig_layer"): # skip module if it's not used in calibration + continue + if enable: + module.enable_quant() + else: + module.disable_quant() + + def _qdq_model_wrapper_for_auto(self, save_q_input=False): + """Wrapper all the module with qdq + :return:""" + module_names = self._get_all_hook_module_names() + self.to_unwrap_module_names = module_names + for name in module_names: + if name not in self.input_mins: # skip module if it's not used in calibration + continue + module = get_module(self.model, name) + new_module = WrapperLayer(module, self.input_mins[name], self.input_maxes[name], save_q_input=save_q_input, group_size=self.group_size) + set_module(self.model, name, new_module) + + def _qdq_model_unwrapper_for_auto(self): + """Unwrapper all the module with qdq + :return:""" + module_names = self.to_unwrap_module_names + for name in module_names: + module = get_module(self.model, name) + if not hasattr(module, "orig_layer"): # skip module if it's not used in calibration + continue + set_module(self.model, name, module.orig_layer) + + def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5): + """Cal the adjust scales + :param absorb_to_layer: A dict mapping absorb layer to smooth quantized layer + :param input_maxes: The channel-wise input max info for layers + :param alpha: Alpha value to balance the quantization difficulty of activation and weight, a float of a dict + :return:""" + absorb_to_input_maxes = {} + for key in absorb_to_layer.keys(): + layer_name = absorb_to_layer[key][0] + absorb_to_input_maxes[key] = input_maxes[layer_name] + + weight_scales_info = {} + absorb_scales_info = {} + for index, key in enumerate(absorb_to_layer.keys()): + alpha_tmp = alpha[key] if isinstance(alpha, dict) else alpha + if alpha_tmp < 0: + scale = torch.ones((1), device=self.device) + else: + input_max = absorb_to_input_maxes[key] + layer_names = absorb_to_layer[key] + weights = [] + for layer_name in layer_names: + weight = reshape_in_channel_to_last(layer_name, self.model) + weights.append(weight) + + weight_max_per_channel = torch.max(torch.abs(torch.cat(weights, dim=0)), dim=0)[0] + if self.weight_clip: + weight_max_per_channel = weight_max_per_channel.clamp(min=1e-5) + + if self._save_scale: + if key in self.weight_scale_dict and alpha_tmp in self.weight_scale_dict[key]: + scale = self.weight_scale_dict[key][alpha_tmp] + else: + scale = cal_scale(input_max, weights, alpha_tmp, group_size=self.group_size) + else: + scale = cal_scale(input_max, weights, alpha_tmp, group_size=self.group_size) + + absorb_scales_info[key] = 1.0 / scale + absorb_scales_info[key][scale == 0] = 0 + layer_names = absorb_to_layer[key] + if self._save_scale: + if key not in self.weight_scale_dict: + self.weight_scale_dict[key] = {} + self.weight_scale_dict[key][alpha_tmp] = scale + for layer_name in layer_names: + ##self._scale_layer_weight(layer_name, scale) + weight_scales_info[layer_name] = scale + return absorb_scales_info, weight_scales_info + + def _get_auto_loss(self, output, output_q, loss_type="abs", loss_alpha=1.0): + """Get the loss for auto tuning + :param output: Fp32 output for one layer + :param output_q: Quant output for one layer + :param loss_type: The type of loss + :param loss_alpha: Loss alpha i for mean scale error + :return: A tensor of the loss.""" + if len(output.shape) <= 2: + max_value = torch.max(torch.abs(output)) + else: + output = output.reshape(output.shape[0], -1) + output_q = 
output_q.reshape(output_q.shape[0], -1) + max_value = torch.max(torch.abs(output), dim=-1).values.unsqueeze(-1) + max_value = torch.clip(max_value, 1e-5) + + # return torch.sum(torch.nn.functional.cosine_similarity(output, output_q, dim=-1)) + output = output / max_value ##FIXME need copy not replace + output_q = output_q / max_value + if loss_type == "abs": + return torch.sum(torch.pow(torch.abs(output - output_q), 0.5)) + else: + return torch.sum((output - output_q) ** 2) + + def _get_sq_layer_names(self): + """Get all the layers that could be smooth quanted + :return: All the sq layer names.""" + ##TODO this may not fit for folding=False + module_names = [] + for key in self.absorb_to_layer: + module_names += self.absorb_to_layer[key] + return module_names + + def _get_best_alpha(self, absorb_to_layer, loss_alphas, shared_criterion): + """Obtain the optimal alpha values based on shared criterion and loss values recorded in auto-tuning step. + + :return: A dict of layerwise alpha values. + """ + + def dict_to_list(dic): + res = [] + for key in dic.keys(): + res.append((key, dic[key])) + return res + + best_alpha = {} + for ln_name in absorb_to_layer.keys(): + layer_names = absorb_to_layer[ln_name] + cur_shared_criterion = shared_criterion + if len(layer_names) == 1: + cur_shared_criterion = "min" + if cur_shared_criterion == "mean": + loss_tmp = {} + for alpha in loss_alphas[layer_names[0]].keys(): + if alpha not in loss_tmp.keys(): + loss_tmp[alpha] = 0 + for layer_name in layer_names: + loss_tmp[alpha] += loss_alphas[layer_name][alpha] + res = dict_to_list(loss_tmp) + res.sort(key=lambda x: x[1]) + + best_alpha[ln_name] = float(res[0][0]) + + elif cur_shared_criterion == "min" or cur_shared_criterion == "max": + tmp_best_alpha = [] + for layer_name in layer_names: + res = dict_to_list(loss_alphas[layer_name]) + res.sort(key=lambda x: x[1]) + tmp_best_alpha.append(float(res[0][0])) + if cur_shared_criterion == "min": + best_alpha[ln_name] = min(tmp_best_alpha) + else: + best_alpha[ln_name] = max(tmp_best_alpha) + + else: + raise NotImplementedError + return best_alpha + + def _get_one_batch_auto_loss(self, input, alpha_space, orig_best_alpha, input_maxes): + """Calculate the losses for all alpha values given an input. + + :return: A dict of op-wise loss values with respect to alpha values. 
+ """ + self._change_qdq_for_auto(enable=False) + module_names = self._get_sq_layer_names() + forward_wrapper(self.model, input, self.device) ##disable quant and get fp32 output + + fp32_output = {} + for name in module_names: + module = get_module(self.model, name) + fp32_output[name] = module.output + module.output = None + self._change_qdq_for_auto(enable=True) + absorb_input_scales, weight_scales = self._cal_scales(self.absorb_to_layer, input_maxes, orig_best_alpha) + self._update_scales_for_auto(absorb_input_scales, weight_scales) + forward_wrapper(self.model, input, self.device) ##save quant_input + for mod_name in module_names: # save fp32 values + mod = get_module(self.model, mod_name) + if mod_name in self.fp32_output_val: + self.fp32_output_val[mod_name].append(torch.norm(mod.output)) + else: + self.fp32_output_val[mod_name] = [torch.norm(mod.output)] + del mod + + loss_alphas = {} + for name in module_names: + module = get_module(self.model, name) + loss = self._get_auto_loss(fp32_output[name], module.output) + cur_alpha = orig_best_alpha + if isinstance(orig_best_alpha, dict): + cur_alpha = orig_best_alpha[name] + key_name = str(cur_alpha) + loss_alphas[name] = {key_name: loss} + # for name in module_names: + # loss_alphas[name]={} + for alpha in alpha_space: + absorb_input_scales, weight_scales = self._cal_scales(self.absorb_to_layer, input_maxes, alpha) + self._update_scales_for_auto(absorb_input_scales, weight_scales) + for name in module_names: + losses = loss_alphas[name] + if str(alpha) in losses.keys(): + continue + module = get_module(self.model, name) + output = module.q_dq_forward(module.q_input, module.input_scale, module.weight_scale, module.absorb_scale) + loss = self._get_auto_loss(fp32_output[name], output) + loss_alphas[name][str(alpha)] = loss + return loss_alphas + + def _get_one_batch_auto_loss_blockwise(self, input, alpha_space, orig_best_alpha, input_maxes): + """Calculate the losses for all alpha values given an input in blockwise tuning mode. + + :return: A dict of blockwise-wise loss values with respect to alpha values. 
+ """ + self._change_qdq_for_auto(enable=False) + module_names = self._get_sq_layer_names() + + block_modules = {} + for key in self.block_names: + block_modules[key] = get_module(self.model, key) + self._add_blockwise_observer(block_modules) + + forward_wrapper(self.model, input, self.device) ##disable quant and get fp32 output + + fp32_output = {} + for block_name in self.block_names: + fp32_output[block_name] = self.block_outputs[block_name] + self._change_qdq_for_auto(enable=True) + absorb_input_scales, weight_scales = self._cal_scales(self.absorb_to_layer, input_maxes, orig_best_alpha) + self._update_scales_for_auto(absorb_input_scales, weight_scales) + forward_wrapper(self.model, input, self.device) ##save quant_input + for mod_name in module_names: # save fp32 values + mod = get_module(self.model, mod_name) + if mod_name in self.fp32_output_val: + self.fp32_output_val[mod_name].append(torch.norm(mod.output)) + else: + self.fp32_output_val[mod_name] = [torch.norm(mod.output)] + del mod + + loss_alphas = {} + + for block_name in self.block_names: + block = get_module(self.model, block_name) + loss = self._get_auto_loss(fp32_output[block_name], self.block_outputs[block_name]) + cur_alpha = orig_best_alpha + if isinstance(orig_best_alpha, dict): + cur_alpha = orig_best_alpha[self.block_to_module[block_name][0]] + key_name = str(cur_alpha) + loss_alphas[block_name] = {key_name: loss} + # for name in module_names: + # loss_alphas[name]={} + for alpha in alpha_space: + absorb_input_scales, weight_scales = self._cal_scales(self.absorb_to_layer, input_maxes, alpha) + self._update_scales_for_auto(absorb_input_scales, weight_scales) + + for block_name in self.block_names: + losses = loss_alphas[block_name] + if str(alpha) in losses.keys(): + continue + block = get_module(self.model, block_name) + block_copy = copy.deepcopy(block) + for name in self.block_to_module[block_name]: + if name == block_name and len(self.block_to_module[block_name]) == 1: + module, module_copy = block, block_copy + else: + module = get_module(block, name) + module_copy = copy.deepcopy(module) + if module.weight_scale is not None: + # module_copy.orig_layer.weight *= module.weight_scale + module_copy.orig_layer.weight.data = mul_scale(module_copy.orig_layer.weight, module.weight_scale, self.group_size) + # q_dq_weight = quant_dequant_w_v1(module_copy.orig_layer) + q_dq_weight = quant_dequant(module_copy.orig_layer) + module_copy.orig_layer.weight.data.copy_(q_dq_weight) + module_copy.do_blockwise = True + if not (name == block_name and len(self.block_to_module[block_name]) == 1): + set_module(block_copy, name, module_copy) + try: + output = block_copy(self.block_inputs[block_name])[0] + except: # Llama model decoder_layer forward requires position_id + position_ids = torch.arange(self.block_inputs[block_name].size()[1]) + position_ids = position_ids.view(self.block_inputs[block_name].size()[0], -1) + position_ids = position_ids.to(self.device) + output = block_copy(self.block_inputs[block_name], position_ids=position_ids)[0] + loss = self._get_auto_loss(fp32_output[block_name], output) + loss_alphas[block_name][str(alpha)] = loss + del block_copy # release memory + return loss_alphas + + def opwise_rank(self, loss_alphas, best_alphas): + """Rank the final losses of ops based on their ratio with respect to op output norm. 
+ + :return: + """ + max_op, max_ratio, max_key = "", 0, "" + ratio_info = {} + for key in self.absorb_to_layer: + for op_name in self.absorb_to_layer[key]: + fp32_norm, loss_ = ( + torch.sum(torch.stack(self.fp32_output_val[op_name])), + loss_alphas[op_name][str(best_alphas[key])], + ) + ratio = loss_ / fp32_norm + max_op = op_name if ratio > max_ratio else max_op + max_key = key if ratio > max_ratio else max_key + max_ratio = max(ratio, max_ratio) + ratio_info[op_name] = ratio + logger.debug( + f"final loss: {op_name}: {loss_}; @alpha {best_alphas[key]}; \ + fp32_output norm: {fp32_norm}; ratio: {ratio}" + ) + import operator + + ratio_info = dict(sorted(ratio_info.items(), key=operator.itemgetter(1), reverse=True)) + for key in list(ratio_info.keys()): + logger.debug(f"sorted opname-ratio: {key}: {ratio_info[key]}") + if max_op != "": + logger.debug( + f"max loss: {max_op}: {loss_alphas[max_op][str(best_alphas[max_key])]} @alpha {best_alphas[max_key]}\ + fp32_output norm: {torch.sum(torch.stack(self.fp32_output_val[max_op]))}; ratio: {max_ratio}" + ) + return None + + def default_tune_setup(self): + """Setup default auto-tune settings. + + :return: A dict of op-wise loss values with respect to alpha values. + """ + round_num = max( # Initialize the alpha search space + len(str(self.alpha_min).split(".")[1]), + len(str(self.alpha_max).split(".")[1]), + len(str(self.alpha_step).split(".")[1]), + ) + self.alpha_space = numpy.round( + numpy.arange(self.alpha_min, self.alpha_max + self.alpha_step, self.alpha_step), round_num + ).tolist() + ##wrapper new module + self._qdq_model_wrapper_for_auto(save_q_input=True) + + absorb_input_scales, weight_scales = self._cal_scales( + self.absorb_to_layer, self.input_maxes_abs, self.init_alpha + ) + self._update_scales_for_auto(absorb_input_scales, weight_scales) + return absorb_input_scales, weight_scales + + def _auto_tune_alpha(self): + """Perform alpha-tuning to obtain layer-wise optimal alpha values and adjust parameters accordingly.""" + logger.info("Start alpha tuning") + + absorb_input_scales, weight_scales = self.default_tune_setup() + + total_cnt, tmp_cnt = 0, 0 + alpha_update_iter, tune_cnt = 0, 4 + # multiply_factor is used to combine samples to calib_sample_num // 4 before summarizing the best alpha + multiply_factor = ( + self.calib_sample_num // tune_cnt if self.calib_sample_num >= tune_cnt else self.calib_sample_num + ) + self.fp32_output_val = {} + best_alphas = self.init_alpha + + if not self.dataloader: + logger.info(f"Auto-tuning failed due to no dataloader, using {best_alphas} instead.") + self._qdq_model_unwrapper_for_auto() + return best_alphas + # bar = tqdm(self.dataloader, total=self.calib_sample_num, desc="auto tune alpha") + pbar = tqdm( + range(self.calib_sample_num // self.dataloader.batch_size), + desc="auto tune alpha") + for input in self.dataloader: + pbar.update(1) + if isinstance(input, tuple) or isinstance(input, list): + if len(input) == 2: + input, _ = input # Extract input when both input and label are yielded by dataloader. 
+ loss_alphas = {} + best_alphas_per_module = best_alphas + if isinstance(best_alphas, dict): + for key in self.absorb_to_layer.keys(): + layer_names = self.absorb_to_layer[key] + for layer_name in layer_names: + best_alphas_per_module[layer_name] = best_alphas_per_module[key] + loss_tmp = self._get_one_batch_auto_loss( + input, self.alpha_space, best_alphas_per_module, self.input_maxes_abs + ) + if loss_alphas == {}: + loss_alphas = loss_tmp + else: + for key in loss_alphas.keys(): + cur_loss = loss_alphas[key] + for alpha_key in cur_loss.keys(): + cur_loss[alpha_key] += loss_tmp[key][alpha_key] + total_cnt += self.dataloader.batch_size + tmp_cnt += self.dataloader.batch_size + if tmp_cnt // multiply_factor >= 1: + alpha_update_iter += 1 + tmp_cnt = 0 + best_alphas = self._get_best_alpha(self.absorb_to_layer, loss_alphas, self.shared_criterion) + for key in best_alphas.keys(): + logger.info(f"Auto alpha update iter: {alpha_update_iter}, {key}: {best_alphas[key]}") + absorb_input_scales, weight_scales = self._cal_scales( + self.absorb_to_layer, self.input_maxes_abs, best_alphas + ) + self._update_scales_for_auto(absorb_input_scales, weight_scales) + # does not need to reset the weight_scale_dict, because use the weight of ori_layer, no change + # self.weight_scale_dict = {} + + if total_cnt >= self.calib_sample_num: + break + + best_alphas = self._get_best_alpha(self.absorb_to_layer, loss_alphas, self.shared_criterion) + for key in best_alphas.keys(): + logger.info(f"Final alpha {key}:{best_alphas[key]}") + + self.opwise_rank(loss_alphas, best_alphas) + self._qdq_model_unwrapper_for_auto() + logger.info("auto tuning done") + + return best_alphas + + def _auto_tune_alpha_blockwise(self): + """Perform blockwise-alpha-tuning to obtain optimal alpha values and adjust parameters accordingly.""" + logger.info("Start block-wise alpha tuning") + self.block_inputs, self.block_outputs = {}, {} + + absorb_input_scales, weight_scales = self.default_tune_setup() + + total_cnt, tmp_cnt = 0, 0 + alpha_update_iter, tune_cnt = 0, 4 + # multiply_factor is used to combine samples to calib_sample_num // 4 before summarizing the best alpha + multiply_factor = ( + self.calib_sample_num // tune_cnt if self.calib_sample_num >= tune_cnt else self.calib_sample_num + ) + self.fp32_output_val = {} + best_alphas = self.init_alpha + + if not self.dataloader: + logger.info(f"Auto-tuning failed due to no dataloader, using {best_alphas} instead.") + self._qdq_model_unwrapper_for_auto() + return best_alphas + bar = tqdm(self.dataloader, total=self.calib_sample_num, desc="auto tune alpha") + for input in bar: + if isinstance(input, tuple): # Extract input when both input and label are yielded by dataloader. 
+ input = input[0] + loss_alphas = {} + best_alphas_per_module = best_alphas + if isinstance(best_alphas, dict): + for key in self.absorb_to_layer.keys(): + layer_names = self.absorb_to_layer[key] + for layer_name in layer_names: + best_alphas_per_module[layer_name] = best_alphas_per_module[key] + loss_tmp = self._get_one_batch_auto_loss_blockwise( + input, self.alpha_space, best_alphas_per_module, self.input_maxes_abs + ) + if loss_alphas == {}: + for block_name in self.block_names: + for key in self.block_to_module[block_name]: + loss_alphas[key] = loss_tmp[block_name] + else: + for block_name in self.block_names: + for key in self.block_to_module[block_name]: + cur_loss = loss_alphas[key] + for alpha_key in cur_loss.keys(): + cur_loss[alpha_key] += loss_tmp[block_name][alpha_key] + + total_cnt += self.dataloader.batch_size + tmp_cnt += self.dataloader.batch_size + if tmp_cnt // multiply_factor >= 1: + alpha_update_iter += 1 + tmp_cnt = 0 + best_alphas = self._get_best_alpha(self.absorb_to_layer, loss_alphas, self.shared_criterion) + for key in best_alphas.keys(): + logger.info(f"Auto alpha update iter: {alpha_update_iter}, {key}: {best_alphas[key]}") + absorb_input_scales, weight_scales = self._cal_scales( + self.absorb_to_layer, self.input_maxes_abs, best_alphas + ) + self._update_scales_for_auto(absorb_input_scales, weight_scales) + # does not need to reset the weight_scale_dict, because use the weight of ori_layer, no change + # self.weight_scale_dict = {} + if total_cnt >= self.calib_sample_num: + break + + best_alphas = self._get_best_alpha(self.absorb_to_layer, loss_alphas, self.shared_criterion) + for key in best_alphas.keys(): + logger.info(f"Final alpha {key}:{best_alphas[key]}") + + self.opwise_rank(loss_alphas, best_alphas) + self._qdq_model_unwrapper_for_auto() + logger.info("block-wise auto tuning done") + + return best_alphas diff --git a/auto_round/smooth_quant/calibration.py b/auto_round/smooth_quant/calibration.py new file mode 100644 index 000000000..bcc9ecaa6 --- /dev/null +++ b/auto_round/smooth_quant/calibration.py @@ -0,0 +1,109 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import copy
+import json
+
+import torch
+from .utils import *
+
+from auto_round.data_type.utils import reshape_pad_tensor_by_group_size
+
+
+class Calibration:
+    def __init__(self, model, dataloader=None, q_func=None, device="cpu", group_size=-1):
+        self.model = model
+        self.dataloader = dataloader
+        self.q_func = q_func
+        self.device = device
+        self.group_size = group_size
+
+    @torch.no_grad()
+    def _save_input_pc_hook(self, name):
+        """A forward hook to save the input max of a module
+        :param name: the module name
+        :return: A hook function."""
+
+        def save_input_hook(module, inputs, outputs):
+            if hasattr(module, "orig_layer"):
+                weight = module.orig_layer.weight
+            else:
+                weight = module.weight
+            input = inputs[0]
+            ##TODO check input channel is correct
+            if len(weight.shape) == 4:  ##conv3d or conv1d not supported now, need better way
+                input = input.permute(0, 2, 3, 1)
+            input, orig_shape, pad_len = reshape_pad_tensor_by_group_size(input, self.group_size)
+            max_tensor = torch.max(input, dim=0)[0]
+            min_tensor = torch.min(input, dim=0)[0]
+            if name not in self.input_maxes.keys():
+                self.input_mins[name], self.input_maxes[name] = min_tensor, max_tensor
+            else:
+                self.input_mins[name] = torch.min(self.input_mins[name], min_tensor)
+                self.input_maxes[name] = torch.max(self.input_maxes[name], max_tensor)
+
+        return save_input_hook
+
+    @torch.no_grad()
+    def _add_min_max_observer(self, modules):
+        """
+        :param modules: the modules which the observer will insert to
+        :return:
+        """
+        self.hook_handles = []
+        for key in modules.keys():
+            hook_func = self._save_input_pc_hook(key)
+            hook_handle = modules[key].register_forward_hook(hook_func)
+            self.hook_handles.append(hook_handle)
+
+    @torch.no_grad()
+    def _remove_observer(self):
+        """Remove the observer from the model
+        :return:"""
+        for hook_handle in self.hook_handles:
+            hook_handle.remove()
+
+    @torch.no_grad()
+    def _dump_min_max(self, calib_iter=100):
+        """Dump per-channel min/max information; the values are saved in the input_maxes attribute
+        :param calibration_method: only support min_max currently
+        :param calib_iter: Sample size for calibration
+        :return:"""
+        if self.q_func:
+            self.q_func(self.model)
+        else:
+            assert self.dataloader, "Please set dataloader for calibration."
+ model_forward(self.model, self.dataloader, calib_iter, self.device) + + @torch.no_grad() + def calibrate(self, calib_iter, op_types=[torch.nn.Conv2d, torch.nn.Linear]): ##TODO transformers.conv1d + """ + :param absorb_to_layer: A dict,key is the absorb layer, val is a list of the to be smoothed layer + :param calib_iter: Data size for calibration + :return: A dict that saved the layer name and the channel-wise max value info + """ + ##hook all the module + self.input_mins = {} + self.input_maxes = {} + + hook_modules = {} + for n, module in self.model.named_modules(): + if isinstance(module, tuple(op_types)): + hook_modules[n] = module + + self._add_min_max_observer(hook_modules) + + self._dump_min_max(calib_iter=calib_iter) + self._remove_observer() + return self.input_mins, self.input_maxes diff --git a/auto_round/smooth_quant/sq.py b/auto_round/smooth_quant/sq.py new file mode 100644 index 000000000..1b8af7bb0 --- /dev/null +++ b/auto_round/smooth_quant/sq.py @@ -0,0 +1,492 @@ +import torch +import numpy + +from auto_round.utils import logger, get_module, set_module +from auto_round.smooth_quant.calibration import Calibration +from auto_round.smooth_quant.utils import ( + model_forward_per_sample, reshape_in_channel_to_last, cal_scale, mul_scale, reshape_scale_as_weight) +from auto_round.smooth_quant.absorb_utils import get_absorb_layers + +class SmoothQuant: + def __init__(self, model, dataloader=None, device="cpu", dtype=torch.bfloat16, example_inputs=None, q_func=None, traced_model=None, group_size=-1): + """ + :param model: Torch model :param dataloader: Calibration dataloader :param traced_model: A specific model + shares the same architecture as the model and could be traced by torch.jit. If not supplied, we use model + instead. + """ + self.model = model + assert isinstance(self.model, torch.nn.Module) + self.model.eval() + self.device = device + self.dtype = dtype + self.dataloader = dataloader + self.example_inputs = example_inputs + self.q_func = q_func + self.input_maxes = {} + self.input_mins = {} + self.input_maxes_abs = {} + self.traced_model = traced_model + if self.traced_model is None: + self.traced_model = self.model + self.weight_scale_info = {} + self.absorb_scales_info = {} + self.insert_mul = False + self.allow_absorb = True + self.record_max_info = False + self.max_value_info = {} # to record max values for alpha tune + self.absorb_to_layer = {} + self.weight_max_lb = 1e-5 ##weight max low bound + self.sq_scale_info = {} + self.max_value_info = {} + self.need_calibration = False + self.group_size = group_size + + + @torch.no_grad() + def transform_model( + self, + alpha=0.5, + folding=True, + percentile=100, + op_types=[torch.nn.Linear, torch.nn.Conv2d], + scales_per_op=False, + calib_iter=100, + weight_clip=True, + auto_alpha_args={ + "init_alpha": 0.5, + "alpha_min": 0.0, + "alpha_max": 1.0, + "alpha_step": 0.1, + "shared_criterion": "mean", + "n_samples": 32, ##512 for cuda, 128 for cpu? + }, + ): + """The main entry of smooth quant + :param alpha: Alpha value to balance the quantization difficulty of activation and weight, please refer + to the paper for more details + :param folding: whether insert mul(False) or just allow foldable layers(True) for SmoothQuant + :param percentile: Not supported now + :param op_types: The op typed to be smooth quantized + :param scales_per_op: Not supported now + :param calib_iter: Data size for calibration + :param weight_clip: Whether to clip weight_max when calculating scales. 
+
+        :param auto_alpha_args: Hyperparameters used to set the alpha search space in SQ auto-tuning.
+            By default, the search space is 0.0-1.0 with step_size 0.1.
+            do_blockwise: Whether to do blockwise auto-tuning.
+        :param init_alpha: A hyperparameter that is used in SQ auto-tuning; by default it is 0.5.
+        :return: An FP32 model with the same architecture as the orig model but with different weights, which will
+            benefit quantization.
+        """
+        if not isinstance(self.model, torch.nn.Module):
+            logger.warning("smoothquant is ignored since the model is not a torch module")
+            return self.model
+
+        if isinstance(alpha, float) and (alpha < 0):
+            logger.warning("reset alpha to >=0")
+            alpha = numpy.clip(alpha, 0.0, None)
+
+        if folding:
+            self.insert_mul, self.allow_absorb = False, True
+        else:
+            self.insert_mul, self.allow_absorb = True, False
+        self.weight_clip = weight_clip
+
+        self.revert()
+        self.need_calibration = self._check_need_calibration(alpha, percentile, op_types, scales_per_op, calib_iter)
+        if self.need_calibration:
+            self.input_mins, self.input_maxes = {}, {}
+        self.absorb_to_layer = self._parse_absorb_to_layers(
+            op_types, folding
+        )  ##need to forward to check modules not used in forward
+        if len(self.input_mins) != 0:  ##this is from _parse_absorb_to_layers, ugly code to support q_func
+            input_maxes_abs = {}
+            for key in self.input_mins.keys():
+                input_maxes_abs[key] = torch.max(torch.abs(self.input_mins[key]), torch.abs(self.input_maxes[key]))
+            if self.q_func:
+                self.need_calibration = False  # Avoid double-calibration in fixed-value alpha SQ.
+
+        if self.absorb_to_layer is None:
+            logger.warning("empty absorb_to_layer, smoothquant is ignored")
+            return self.model
+        example_inputs = self._get_example_input()
+        if alpha == "auto":  ##TODO need to polish later
+            from auto_round.smooth_quant.auto_alpha import TUNERS
+
+            auto_alpha_version = "version1"
+            auto_alpha_tuner = TUNERS[auto_alpha_version](
+                self.model,
+                self.dataloader,
+                self.absorb_to_layer,
+                op_types=op_types,
+                device=self.device,
+                q_func=self.q_func,
+                folding=folding,
+                example_inputs=self.example_inputs,
+                group_size=self.group_size,
+                calib_iter=self.calib_iter,
+                **auto_alpha_args,
+            )
+            self.alpha = auto_alpha_tuner.tune()
+            input_maxes_abs = auto_alpha_tuner.input_maxes_abs
+            self.input_mins, self.input_maxes = auto_alpha_tuner.input_mins, auto_alpha_tuner.input_maxes
+            if auto_alpha_tuner.loss_type == "blockwise":
+                self.block_names = auto_alpha_tuner.block_names
+
+        elif self.need_calibration:
+            calib = Calibration(self.model, self.dataloader, self.q_func, self.device, self.group_size)
+            self.input_mins, self.input_maxes = calib.calibrate(calib_iter, op_types)
+            input_maxes_abs = {}
+            for key in self.input_mins.keys():
+                input_maxes_abs[key] = torch.max(torch.abs(self.input_mins[key]), torch.abs(self.input_maxes[key]))
+
+        if example_inputs is not None:
+            out_pre_sq = model_forward_per_sample(self.model, example_inputs, self.device)
+
+        if folding:
+            self._save_scale = False  ##TODO remove it later
+
+        if self.record_max_info:
+            self._export_sq_info(self.absorb_to_layer, input_maxes_abs, self.alpha)
+            # # max_info is recorded in self.max_value_info
+            # self._adjust_parameters(self.absorb_to_layer, input_maxes_abs, alpha)
+            self.model._smoothquant_optimized = False
+            return self.model
+
+        self.weight_scale_info, self.absorb_scales_info = self._adjust_parameters(
+            self.absorb_to_layer, input_maxes_abs, self.alpha
+        )
+        self.model._smoothquant_optimized = True
+
+        if example_inputs is not None:
+            # Check mathematical equivalency
+            out_post_sq = model_forward_per_sample(self.model, example_inputs, self.device)
+            if not self.output_is_equal(out_post_sq[0], out_pre_sq[0]):
+                logger.warning(
+                    "Mathematical equivalency of SmoothQuant is not preserved. "
+                    "Please kindly report this issue to https://github.com/intel/neural-compressor."
+                )
+        else:
+            logger.warning("Could not get example input, equivalency check is skipped")
+
+        return self.model
+
+    @torch.no_grad()
+    def revert(self):
+        """Revert the model weights
+        :return:"""
+        for key in self.weight_scale_info:
+            self._scale_layer_weight(key, 1.0 / self.weight_scale_info[key])
+        for key in self.absorb_scales_info:
+            self._absorb_scales(key, 1.0 / self.absorb_scales_info[key])
+        self.weight_scale_info = {}  ##clear the data
+        self.absorb_scales_info = {}
+
+    def output_is_equal(self, out1, out2, atol=1e-03):
+        try:
+            if isinstance(out1, tuple):
+                return all(torch.all(torch.isclose(out1[i], out2[i], atol=atol)) for i in range(len(out1)))
+            elif isinstance(out1, dict):
+                return all(torch.all(torch.isclose(out1[k], out2[k], atol=atol)) for k in out1.keys())
+            elif isinstance(out1, torch.Tensor):
+                return torch.all(torch.isclose(out1, out2, atol=atol))
+            return False
+        except:
+            logger.warning(
+                "Automatic check failed, please check equivalency manually "
+                "between out_pre_sq and out_post_sq if necessary."
+            )
+            return True
+
+
+    def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5):
+        """Calculate the adjustment scales
+        :param absorb_to_layer: A dict mapping absorb layer to smooth quantized layer
+        :param input_maxes: The channel-wise input max info for layers
+        :param alpha: Alpha value to balance the quantization difficulty of activation and weight, a float or a dict
+        :return:"""
+        absorb_to_input_maxes = {}
+        for key in absorb_to_layer.keys():
+            layer_name = absorb_to_layer[key][0]
+            absorb_to_input_maxes[key] = input_maxes[layer_name]
+
+        weight_scales_info = {}
+        absorb_scales_info = {}
+        for index, key in enumerate(absorb_to_layer.keys()):
+            alpha_tmp = alpha[key] if isinstance(alpha, dict) else alpha
+
+            input_max = absorb_to_input_maxes[key]
+            layer_names = absorb_to_layer[key]
+            weights = []
+            for layer_name in layer_names:
+                weight = reshape_in_channel_to_last(layer_name, self.model)
+                weights.append(weight)
+            scale = cal_scale(input_max, weights, alpha_tmp, group_size=self.group_size)
+            absorb_scales_info[key] = 1.0 / scale
+            absorb_scales_info[key][scale == 0] = 0
+            layer_names = absorb_to_layer[key]
+            for layer_name in layer_names:
+                ##self._scale_layer_weight(layer_name, scale)
+                weight_scales_info[layer_name] = scale
+        return absorb_scales_info, weight_scales_info
+
+    def _scale_layer_weight(self, layer_name, scale, alpha=0.5, input_minmax=None):  ##input channel
+        """Scale the layer weights at input channel, depthwise conv output channel
+        :param layer_name: The layer name
+        :param scale: The scale to be multiplied
+        :param alpha: alpha for SQLinearWrapper
+        :param input_minmax: input_minmax for SQLinearWrapper
+        :return:"""
+        layer = get_module(self.model, layer_name)
+        if self.insert_mul:
+            from .utils import SQLinearWrapper
+
+            layer = get_module(self.model, layer_name)
+            if isinstance(layer, SQLinearWrapper):
+                layer._recover_sq_linear()
+                set_module(self.model, layer_name, layer.sq_linear)  ##recover
+            else:
+                new_module = SQLinearWrapper(layer, 1.0 / scale, input_minmax, alpha)
+                set_module(self.model, layer_name, new_module)
+        elif self.allow_absorb:
+            scale = reshape_scale_as_weight(layer, scale)
+            layer.weight.data = mul_scale(layer.weight, scale)
+            # layer.weight = torch.nn.Parameter(layer.weight * scale)
+            layer.weight = torch.nn.Parameter(layer.weight)
+        return scale
+
+    def _absorb_scales(self, layer_name, scale):  ##output channel
+        """Absorb the scale to the layer at output channel
+        :param layer_name: The module name
+        :param scale: The scale to be absorbed
+        :param alpha_key: The alpha passed to SQLinearWrapper
+        :return:"""
+        if self.insert_mul or not self.allow_absorb:
+            return  # absorb is updated in SQLinearWrapper in def _scale_layer_weight
+
+        ##if self.allow_absorb
+        layer = get_module(self.model, layer_name)
+        if layer.__class__.__name__ == "WrapperLayer":
+            layer = layer.orig_layer
+        if (
+            isinstance(layer, torch.nn.BatchNorm2d)
+            or isinstance(layer, torch.nn.GroupNorm)
+            or isinstance(layer, torch.nn.InstanceNorm2d)
+        ):
+            if layer.affine:
+                layer.weight.data = mul_scale(layer.weight, scale)
+                layer.bias.data = mul_scale(layer.bias, scale)
+            else:
+                layer.affine = True
+                weight = torch.ones(layer.num_features, device=self.device, dtype=self.dtype) * scale
+                layer.weight = torch.nn.Parameter(weight, requires_grad=False)
+                bias = torch.zeros(layer.num_features, device=self.device, dtype=self.dtype)
+                layer.bias = torch.nn.Parameter(bias, requires_grad=False)
+        elif isinstance(layer, torch.nn.LayerNorm):
+            if layer.elementwise_affine:
+                layer.weight.data = mul_scale(layer.weight, scale)
+                layer.bias.data = mul_scale(layer.bias, scale)
+            else:
+                layer.elementwise_affine = True
+                weight = torch.ones(layer.num_features, device=self.device, dtype=self.dtype) * scale
+                layer.weight = torch.nn.Parameter(weight, requires_grad=False)
+                bias = torch.zeros(layer.num_features, device=self.device, dtype=self.dtype)
+                layer.bias = torch.nn.Parameter(bias, requires_grad=False)
+
+        elif isinstance(layer, torch.nn.Conv2d):
+            ##the order could not be changed
+            if hasattr(layer, "bias") and (layer.bias is not None):
+                # layer.bias *= scale
+                layer.bias.data = mul_scale(layer.bias, scale)
+            scale = scale.view(scale.shape[0], 1, 1, 1)
+            # layer.weight *= scale
+            layer.weight.data = mul_scale(layer.weight, scale)
+
+        elif isinstance(layer, torch.nn.Linear):
+            if hasattr(layer, "bias") and (layer.bias is not None):
+                # layer.bias *= scale
+                layer.bias.data = mul_scale(layer.bias, scale)
+            scale = scale.view(scale.shape[0], 1)
+            # layer.weight *= scale
+            layer.weight.data = mul_scale(layer.weight, scale)
+
+        elif layer.__class__.__name__ in ["Qwen2RMSNorm", "Qwen3RMSNorm", "LlamaRMSNorm", "T5LayerNorm"]:
+            # layer.weight *= scale
+            layer.weight.data = mul_scale(layer.weight, scale)
+
+        else:
+            logger.warning_once(
+                f"Found unsupported layer {type(layer)}, trying to multiply the scale into "
+                f"its weight and bias directly; this may introduce accuracy issues, please double check."
+            )
+            if hasattr(layer, "weight") and layer.weight is not None:
+                # layer.weight *= scale
+                layer.weight.data = mul_scale(layer.weight, scale)
+            if hasattr(layer, "bias") and layer.bias is not None:
+                # layer.bias *= scale
+                layer.bias.data = mul_scale(layer.bias, scale)
+
+    def _adjust_parameters(self, absorb_to_layer, input_maxes, alpha=0.5):
+        """Adjust the weights and biases
+        :param absorb_to_layer: A dict mapping absorb layer to smooth quantized layer
+        :param input_maxes: The channel-wise input max info for layers
+        :param alpha: Alpha value to balance the quantization difficulty of activation and weight, a float or a dict
+        :return:"""
+        absorb_scales_info, weight_scales_info = self._cal_scales(absorb_to_layer, input_maxes, alpha)
+        if not absorb_scales_info or not weight_scales_info:
+            return weight_scales_info, absorb_scales_info
+        for index, key in enumerate(absorb_to_layer.keys()):
+            # layer = get_module(self.model, key)
+            # if 'norm' not in layer.__class__.__name__.lower():
+            #     continue
+            if isinstance(alpha, float):
+                alpha_tmp = alpha
+            elif isinstance(alpha, dict):
+                alpha_tmp = alpha[key]
+            absorb_scale = absorb_scales_info[key]
+            self._absorb_scales(key, absorb_scale)
+            layer_names = absorb_to_layer[key]
+            for layer_name in layer_names:
+                input_minmax = [self.input_mins[layer_names[0]], self.input_maxes[layer_names[0]]]
+                self._scale_layer_weight(layer_name, weight_scales_info[layer_name], alpha_tmp, input_minmax)
+        return weight_scales_info, absorb_scales_info
+
+    def _check_need_calibration(self, alpha, percentile, op_types, scales_per_op, calib_iter):
+        """Check whether calibration is needed.
+        :param alpha: current alpha
+        :param percentile: current percentile
+        :param op_types: current op_types
+        :param scales_per_op: current scales_per_op
+        :param calib_iter: current calib_iter
+        :return:
+        """
+        need_calib = True
+        from peft import PeftModel
+
+        is_peft, is_auto = isinstance(self.model, PeftModel), alpha == "auto"
+        if len(self.input_maxes) == 0:  ## the first time
+            need_calib = True
+            self.alpha = alpha
+            self.percentile = percentile
+            self.op_types = op_types
+            self.scales_per_op = scales_per_op
+            self.calib_iter = calib_iter
+            return False if (is_auto and not is_peft) else need_calib
+
+        if (
+            self.percentile == percentile
+            and self.op_types == op_types
+            and self.scales_per_op == scales_per_op
+            and self.calib_iter == calib_iter
+        ):
+            if isinstance(alpha, float) or self.alpha == "auto":
+                need_calib = False
+
+        self.alpha, self.percentile, self.calib_iter = alpha, percentile, calib_iter
+        self.op_types, self.scales_per_op = op_types, scales_per_op
+        return need_calib
+
+    def _get_all_layer_names(self, op_types=[torch.nn.Linear]):
+        """Traverse the model to find the layers that can be smooth quantized.
+
+        :param op_types: The op types to be smooth quantized
+        :return:
+        self_absorb_layer: A dict, absorb layer name (itself): layers to be smooth quantized
+        """
+        self_absorb_layer = {}
+        op_types = [torch.nn.Linear]  # TODO: only support SQLinearWrapper
+        for name, module in self.model.named_modules():
+            if isinstance(module, tuple(op_types)):
+                self_absorb_layer[name] = [name]
+        return self_absorb_layer
+
+    def _get_example_input(self):
+        if self.dataloader is None and self.example_inputs is None:
+            return None
+        if self.example_inputs is None:
+            for idx, input in enumerate(self.dataloader):
+                self.example_inputs = input
+                break
+        return self.example_inputs
+
+    @torch.no_grad()
+    def _parse_absorb_to_layers(self, op_types, folding):
+        self_absorb_layers = {}
+        if self.insert_mul:
+            self_absorb_layers = self._get_all_layer_names(op_types)  # TODO: only support linear now.
+        # fetch modules with the same input
+        group_modules = self._trace(skip_unsupported_layers=False)
+        if group_modules is not None:
+            # use one input for qkv
+            for k, v in group_modules.items():
+                for i in v:
+                    if i in self_absorb_layers:
+                        self_absorb_layers.pop(i)
+                self_absorb_layers[v[0]] = v
+            logger.debug(f"self_absorb_layers:{self_absorb_layers}")
+        if self.allow_absorb:
+            self.absorb_to_layer, no_absorb_layers = self._trace()
+            if self.absorb_to_layer is None and no_absorb_layers is None:
+                return None
+
+        # remove entries from self_absorb_layers that already exist in self.absorb_to_layer
+        for k, v in self.absorb_to_layer.items():
+            for i in v:
+                if i in self_absorb_layers:
+                    self_absorb_layers.pop(i)
+        self.absorb_to_layer.update(self_absorb_layers)
+
+        if self.absorb_to_layer is None and no_absorb_layers is None:
+            logger.warning(
+                "Could not trace the model, smooth quant is ignored. "
+                "If you are using a huggingface model, "
+                "you could set torchscript to True. "
+            )
+            return None
+
+        # Check if input_maxes match self.absorb_to_layer
+        # (self._get_all_layer_names uses the layer tree instead of the forward path)
+        if not folding and self.need_calibration:
+            if len(self.input_mins) == 0:  ##there are some modules not used in forward
+                calib = Calibration(self.model, self.dataloader, self.q_func, self.device, group_size=self.group_size)
+                input_mins, input_maxes = calib.calibrate(
+                    1, op_types
+                )  ##TODO if using q_func for calibration, it will calibrate twice
+                # use q_func to calibrate, the input min could be used for the fixed alpha transformation
+                self.input_mins = input_mins
+                self.input_maxes = input_maxes
+            diff_modules = set(self.absorb_to_layer.keys()).difference(self.input_mins.keys())
+            for d in diff_modules:
+                del self.absorb_to_layer[d]
+        return self.absorb_to_layer
+
+    def _trace(self, skip_unsupported_layers=True):
+        """Trace the model to find the layers that can be smooth quantized.
+
+        :param skip_unsupported_layers: Whether to drop layers that cannot be absorbed
+        :return:
+        absorb_to_layer: A dict, absorb layer name: layers to be smooth quantized
+        no_absorb_layers: A list of the layers for which no absorb layer could be found
+        """
+        absorb_to_layer, no_absorb_layers = get_absorb_layers(self.traced_model, skip_unsupported_layers)
+        if not skip_unsupported_layers:
+            return absorb_to_layer
+        if absorb_to_layer is None and no_absorb_layers is None:
+            logger.warning(
+                "Could not trace the model, smooth quant is skipped. "
+ "If you are using huggingface model," + "you could set torchscript to True " + "when loading the model or set the return_dict to False" + ) + elif absorb_to_layer == {}: + logger.warning("could not find any layer to be absorbed") + else: + to_absorb_cnt = 0 + for key, item in absorb_to_layer.items(): + to_absorb_cnt += len(item) + logger.info( + f" {to_absorb_cnt} out of {to_absorb_cnt + len(no_absorb_layers)} " + f"layers could be absorbed in smooth quant" + ) + return absorb_to_layer, no_absorb_layers diff --git a/auto_round/smooth_quant/utils.py b/auto_round/smooth_quant/utils.py new file mode 100644 index 000000000..6cc83f792 --- /dev/null +++ b/auto_round/smooth_quant/utils.py @@ -0,0 +1,245 @@ +import copy +from collections import UserDict, defaultdict + +import torch +from tqdm import tqdm + +from auto_round.utils import logger, get_module +from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad + +def mul_scale(tensor, scale, group_size=-1): + ori_shape = tensor.shape + if len(scale.shape) == 2 and scale.shape[1] == 1: + tensor = tensor.reshape(scale.shape[0], -1) + else: + tensor = tensor.reshape(-1, scale.shape[-1]) + + tensor *= scale + return tensor.reshape(ori_shape) + +def reshape_scale_as_input(layer, scale): + """Reshape the scale for input feature in channel + :param layer: + + :param scale: + :return: + """ + if hasattr(layer, "orig_layer"): + layer = layer.orig_layer + if isinstance(layer, torch.nn.Conv2d): + scale = scale.view(1, scale.shape[0], 1, 1) + + elif isinstance(layer, torch.nn.Linear): + scale = scale.view(1, scale.shape[0]) + + return scale + + +def reshape_scale_as_weight(layer, scale): + """Reshape the scale for weight input channel, depthwise output channel + :param layer: torch module + :param scale: orig scale + :return: reshaped scale.""" + if hasattr(layer, "orig_layer"): + layer = layer.orig_layer + if isinstance(layer, torch.nn.Conv2d) and layer.groups > 1: ##only depthwise conv could hit here + scale = scale.view(scale.shape[0], 1, 1, 1) ##mount on output channel + + elif isinstance(layer, torch.nn.Conv2d): + scale = scale.view(1, scale.shape[0], 1, 1) + + elif isinstance(layer, torch.nn.Linear): + scale = scale.view(1, scale.shape[0]) + return scale + +def move_input_to_device(input, device=torch.device("cpu")): + if isinstance(input, dict) or isinstance(input, UserDict): + tmp_input = {} + for k, inp in input.items(): + tmp_input[k] = move_input_to_device(inp, device) + input = tmp_input + elif isinstance(input, list) or isinstance(input, tuple): + is_tuple = isinstance(input, tuple) + tmp_input = [] + for inp in input: + tmp_input.append(move_input_to_device(inp, device)) + input = tuple(tmp_input) if is_tuple else tmp_input + elif isinstance(input, torch.Tensor): + input = input.to(device) # pylint: disable=no-member + return input + +def forward_wrapper(model, input, device=torch.device("cpu")): + try: + model = model.to(device) + input = move_input_to_device(input, device) + except Exception as e: + logger.warning(e) + logger.warning("Please check the input device if the error raised.") + if isinstance(input, dict) or isinstance(input, UserDict): + output = model(**input) + elif isinstance(input, list) or isinstance(input, tuple): + try: + output = model(*input) + except: + output = model(input) + else: + output = model(input) + return output + + +def model_forward_per_sample(model, sample, device): + try: + output = forward_wrapper(model, sample, device) + return output + + except Exception as e: + 
output = forward_wrapper(model, sample[0], device) + return output + + +def model_forward(model, dataloader, iters, device): + cnt = 0 + pbar = tqdm(dataloader, total=iters) + pbar.set_description("SmoothQuant Calibrating") + for idx, input in enumerate(pbar): + output = forward_wrapper(model, input, device) + cnt += 1 + if iters != -1 and cnt > iters: + break + pbar.close() + +def cal_scale(input_max_abs, weights, alpha, weight_max_lb=1e-5, group_size=-1): + weights = torch.cat(weights, dim=0) + weights, _, _ = reshape_pad_tensor_by_group_size(weights, group_size) + weight_max = torch.max(torch.abs(weights), dim=0)[0] + weight_max = torch.clip(weight_max, weight_max_lb) + input_power = torch.pow(input_max_abs, alpha) + # logger.debug(f"{max(input_max_abs)}, {min(input_max_abs)}") + weight_power = torch.pow(weight_max, 1 - alpha) + weight_scale = torch.clip(input_power / weight_power, min=1e-5) + weight_scale[input_power == 0] = 1.0 + return weight_scale + + +def reshape_in_channel_to_last(layer_name, model): + """Move the input channel to the last dim + :param layer_name: Layer name + :return: The reshaped weight.""" + layer = get_module(model, layer_name) + if layer.__class__.__name__ == "WrapperLayer": + layer = layer.orig_layer + + weight = layer.weight ##TODO oc*ic, support transposed conv + if len(weight.shape) == 4: + weight = weight.permute(0, 2, 3, 1) + weight = weight.reshape(-1, weight.shape[-1]) + return weight + + +def enough_memo_store_scale(device, need_space): + if device == "cuda": # pragma: no cover + current_gpu_index = torch.cuda.current_device() + total_memory = torch.cuda.get_device_properties(current_gpu_index).total_memory + used_memory = torch.cuda.memory_allocated(current_gpu_index) + free_space = total_memory - used_memory + else: + import psutil + + free_space = psutil.virtual_memory().free + return free_space >= need_space + + +def quant_dequant(m, num_bits=4, group_size=32, data_type='mx_fp4', sym=True): + from auto_round.data_type.utils import get_quant_func + # data_type = 'int_asym' + data_type = 'mx_fp4' + tensor = m.weight if hasattr(m, "weight") else m + quant_func, data_type = get_quant_func(data_type, num_bits, sym) + # print(quant_func, num_bits) + data_new, scale, zp = quant_func(tensor, bits=num_bits, group_size=group_size, v=0, max_scale=1.0) + return data_new.to(tensor.dtype) + +class WrapperLayer(torch.nn.Module): + def __init__(self, layer, input_min, input_max, save_q_input=False, group_size=-1): + super(WrapperLayer, self).__init__() + if hasattr(layer, "orig_layer"): + layer = layer.orig_layer + self.add_module("orig_layer", layer) # set orig_layer in get/set_module + self.quant = False + self.q_input = None + self.fp32_output = None + self.input_max = input_max + self.input_min = input_min + self.weight_scale = None + self.input_scale = None + self.absorb_scale = None + self.save_q_input = save_q_input + self.do_blockwise = False + self.group_size = group_size + + def enable_quant(self): + self.quant = True + + def disable_quant(self): + self.quant = False + + def update_scale(self, input_scale, weight_scale, absorb_scale=None): + self.input_scale = input_scale + self.weight_scale = weight_scale + self.absorb_scale = absorb_scale + + ##TODO better tradeoff performance and memory, currently it's too slow + def q_dq_forward(self, x, input_scale, weight_scale, absorb_scale): + layer_copy = copy.deepcopy(self.orig_layer) + if absorb_scale is not None: + ori_shape = layer_copy.weight.shape + layer_copy.weight.data = mul_scale(layer_copy.weight, 
absorb_scale, group_size=self.group_size) + layer_copy.weight.data = layer_copy.weight.view(ori_shape) + if weight_scale is not None: + ori_shape = layer_copy.weight.shape + # layer_copy.weight *= weight_scale + layer_copy.weight.data = mul_scale(layer_copy.weight, weight_scale, group_size=self.group_size) + layer_copy.weight.data = layer_copy.weight.view(ori_shape) + # q_dq_weight = quant_dequant_w_v1(layer_copy) + q_dq_weight = quant_dequant(layer_copy) + layer_copy.weight.data.copy_(q_dq_weight) + if input_scale is None: + # x = quant_dequant_x_v1(x, self.input_min, self.input_max) + x = quant_dequant(x) + else: + ori_shape = x.shape + # x = input_scale * x + x = mul_scale(x, input_scale) + # x = quant_dequant_x_v1(x, self.input_min * input_scale, self.input_max * input_scale) ##FIXME + x = quant_dequant(x) ##FIXME + output = layer_copy(x) + return output + + def q_dq_forward_blockwise(self, x, input_scale): + layer_copy = copy.deepcopy(self.orig_layer) + if input_scale is None: + # x = quant_dequant_x_v1(x, self.input_min, self.input_max) + x = quant_dequant(x) + else: + x, orig_shape, pad_len = reshape_pad_tensor_by_group_size(x, self.group_size) + x = input_scale * x + x = revert_tensor_by_pad(x, orig_shape, pad_len) + # x = quant_dequant_x_v1(x, self.input_min * input_scale, self.input_max * input_scale) ##FIXME + x = quant_dequant(x) ##FIXME + output = layer_copy(x) + return output + + def forward(self, x): + if self.quant: + # self.q_input = x * scale ##save the q_input + if self.save_q_input: + self.q_input = x + if not self.do_blockwise: + output = self.q_dq_forward(x, self.input_scale, self.weight_scale, self.absorb_scale) + else: + output = self.q_dq_forward_blockwise(x, self.input_scale) + + else: + output = self.orig_layer(x) + self.output = output + return output From c5788471a6b8e6b32524ca105177ccdcc1d93093 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 11 Aug 2025 21:20:12 -0400 Subject: [PATCH 2/4] update Signed-off-by: n1ck-guo --- auto_round/script/llm.py | 27 ++++++ auto_round/smooth_quant/absorb_utils.py | 114 ++++++++++++------------ auto_round/smooth_quant/auto_alpha.py | 13 ++- auto_round/smooth_quant/sq.py | 3 +- auto_round/smooth_quant/utils.py | 52 ++++++++++- 5 files changed, 149 insertions(+), 60 deletions(-) diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index 52e97f557..41a944650 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -55,6 +55,8 @@ def __init__(self, *args, **kwargs): self.add_argument("--eval", action="store_true", help="whether to use eval only mode") + self.add_argument("--sq", action="store_true", help="whether to use smoothquant") + self.add_argument("--bits", default=4, type=int, help="number of weight bits") self.add_argument("--eval_bs", default=None, type=int, help="batch size in evaluation") @@ -491,6 +493,30 @@ def tune(args): enable_torch_compile = True if "--enable_torch_compile" in sys.argv else False + # sq + if args.sq: + from auto_round.calib_dataset import get_dataloader + dataloader = get_dataloader(tokenizer, args.seqlen, bs=8, nsamples=args.nsamples) + auto_alpha_args={ + "init_alpha": 0.5, + "alpha_min": 0.1, + "alpha_max": 1.0, + "alpha_step": 0.1, + "shared_criterion": "mean", + "n_samples": 512, ##512 for cuda, 128 for cpu? 
+ # "do_blockwise": True + } + from auto_round.smooth_quant import SmoothQuant + model = model.to(device_str) + sq = SmoothQuant(model, dataloader, device=model.device, group_size=-1) + model = sq.transform_model( + alpha=0.5, + # alpha="auto", + auto_alpha_args=auto_alpha_args, + folding=True, + op_types=[torch.nn.Linear, torch.nn.Conv2d], + calib_iter=100) + autoround = round( model, tokenizer, @@ -719,6 +745,7 @@ def eval(args): if file.endswith(".gguf"): is_gguf_file = True gguf_file = file + model = os.path.dirname(args.model) eval_model_dtype = get_model_dtype(args.eval_model_dtype) if is_gguf_file: import torch diff --git a/auto_round/smooth_quant/absorb_utils.py b/auto_round/smooth_quant/absorb_utils.py index c03627327..78b2c2cc6 100644 --- a/auto_round/smooth_quant/absorb_utils.py +++ b/auto_round/smooth_quant/absorb_utils.py @@ -1,7 +1,7 @@ # # -*- coding: utf-8 -*- # -# Copyright (c) 2023 Intel Corporation +# Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ # limitations under the License. import torch -from auto_round.utils import get_module +from auto_round.smooth_quant.utils import get_module SUPPORTED_TORCH_MODULE = [ "Linear", @@ -36,11 +36,14 @@ GET_ABSORB_LAYERS = {} -def register_get_func(name): - """Class decorator to register a get_absorb_layers func - """ +def register_absorb_func(model_type): def register(func): - GET_ABSORB_LAYERS[name] = func + if isinstance(model_type, list): + model_types = model_type + else: + model_types = [model_type] + for name in model_types: + GET_ABSORB_LAYERS[name] = func return func return register @@ -79,7 +82,7 @@ def remove_unsupported_layers(model, absorb_to_layer, no_absorb_layers): res[key] = absorb_to_layer[key] return res -@register_get_func("opt") +@register_absorb_func("opt") def get_opt_absorb_layers(model): model_layer_name = "model.decoder.layers" absorb_to_layer = {} @@ -112,49 +115,49 @@ def get_opt_absorb_layers(model): return absorb_to_layer -@register_get_func('llama') -def get_llama_absorb_layers(model): - model_layer_name = "model.layers" - absorb_to_layer = {} - - for idx in range(len(model.model.layers)): - # attention input - absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ - f"{model_layer_name}.{idx}.self_attn.q_proj", - f"{model_layer_name}.{idx}.self_attn.k_proj", - f"{model_layer_name}.{idx}.self_attn.v_proj", - ] - - # attention out - module = model.model.layers[idx] - if hasattr(module.self_attn.v_proj, "orig_layer"): - v_proj_shape = module.self_attn.v_proj.orig_layer.weight.shape - o_proj_shape = module.self_attn.o_proj.orig_layer.weight.shape - else: - v_proj_shape = module.self_attn.v_proj.weight.shape - o_proj_shape = module.self_attn.o_proj.weight.shape - if v_proj_shape == o_proj_shape: - absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ - f"{model_layer_name}.{idx}.self_attn.o_proj", - ] +# @register_absorb_func('llama') +# def get_llama_absorb_layers(model): +# model_layer_name = "model.layers" +# absorb_to_layer = {} + +# for idx in range(len(model.model.layers)): +# # attention input +# absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ +# f"{model_layer_name}.{idx}.self_attn.q_proj", +# f"{model_layer_name}.{idx}.self_attn.k_proj", +# f"{model_layer_name}.{idx}.self_attn.v_proj", +# ] + +# # attention out +# module = model.model.layers[idx] +# if hasattr(module.self_attn.v_proj, "orig_layer"): +# v_proj_shape = 
module.self_attn.v_proj.orig_layer.weight.shape +# o_proj_shape = module.self_attn.o_proj.orig_layer.weight.shape +# else: +# v_proj_shape = module.self_attn.v_proj.weight.shape +# o_proj_shape = module.self_attn.o_proj.weight.shape +# if v_proj_shape == o_proj_shape: +# absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ +# f"{model_layer_name}.{idx}.self_attn.o_proj", +# ] - # linear 1 - absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ - f"{model_layer_name}.{idx}.mlp.gate_proj", - f"{model_layer_name}.{idx}.mlp.up_proj", - ] - - # linear 2 - absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [ - f"{model_layer_name}.{idx}.mlp.down_proj", - ] +# # linear 1 +# absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ +# f"{model_layer_name}.{idx}.mlp.gate_proj", +# f"{model_layer_name}.{idx}.mlp.up_proj", +# ] + +# # linear 2 +# absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [ +# f"{model_layer_name}.{idx}.mlp.down_proj", +# ] - # final layer - # absorb_to_layer["model.norm"] = ['lm_head'] +# # final layer +# # absorb_to_layer["model.norm"] = ['lm_head'] - return absorb_to_layer +# return absorb_to_layer -@register_get_func('mistral') +@register_absorb_func('mistral') def get_mistral_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer = {} @@ -189,7 +192,7 @@ def get_mistral_absorb_layers(model): return absorb_to_layer -@register_get_func('mixtral') +@register_absorb_func('mixtral') def get_mixtral_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer = {} @@ -231,7 +234,7 @@ def get_mixtral_absorb_layers(model): return absorb_to_layer -@register_get_func('bloom') +@register_absorb_func('bloom') def get_bloom_absorb_layers(model): model_layer_name = "transformer.h" absorb_to_layer = {} @@ -257,7 +260,7 @@ def get_bloom_absorb_layers(model): return absorb_to_layer -@register_get_func('gptj') +@register_absorb_func('gptj') def get_gptj_absorb_layers(model): model_layer_name = "transformer.h" absorb_to_layer = {} @@ -285,7 +288,7 @@ def get_gptj_absorb_layers(model): return absorb_to_layer -@register_get_func('phi3') +@register_absorb_func('phi3') def get_phi3_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer = {} @@ -316,7 +319,7 @@ def get_phi3_absorb_layers(model): return absorb_to_layer -@register_get_func('qwen') +@register_absorb_func('qwen') def get_qwen_absorb_layers(model): model_layer_name = "transformer.h" absorb_to_layer = {} @@ -343,11 +346,12 @@ def get_qwen_absorb_layers(model): return absorb_to_layer -@register_get_func('qwen2') -@register_get_func('qwen3') -def get_qwen2_absorb_layers(model): +@register_absorb_func(["qwen2", "qwen3"]) +@register_absorb_func('llama') +def get_defualt_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer = {} + for idx in range(len(model.model.layers)): # attention input absorb_to_layer[f"{model_layer_name}.{idx}.input_layernorm"] = [ @@ -380,7 +384,7 @@ def get_qwen2_absorb_layers(model): return absorb_to_layer -@register_get_func('qwen3_moe') +@register_absorb_func('qwen3_moe') def get_qwen3_moe_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer = {} diff --git a/auto_round/smooth_quant/auto_alpha.py b/auto_round/smooth_quant/auto_alpha.py index 8c87dc7e1..d59c29550 100644 --- a/auto_round/smooth_quant/auto_alpha.py +++ b/auto_round/smooth_quant/auto_alpha.py @@ -28,7 +28,7 @@ from tqdm import tqdm from auto_round.smooth_quant.calibration import Calibration -from auto_round.utils 
import set_module, get_module +from auto_round.smooth_quant.utils import set_module, get_module from auto_round.smooth_quant.utils import ( WrapperLayer, enough_memo_store_scale, reshape_scale_as_input, reshape_scale_as_weight, reshape_in_channel_to_last, cal_scale, forward_wrapper, mul_scale, quant_dequant) @@ -505,7 +505,14 @@ def _get_one_batch_auto_loss_blockwise(self, input, alpha_space, orig_best_alpha position_ids = torch.arange(self.block_inputs[block_name].size()[1]) position_ids = position_ids.view(self.block_inputs[block_name].size()[0], -1) position_ids = position_ids.to(self.device) - output = block_copy(self.block_inputs[block_name], position_ids=position_ids)[0] + if hasattr(self.model, "rotary_emb"): + position_embeddings = self.model.rotary_emb(self.block_inputs[block_name], position_ids) + else: + position_embeddings = None + output = block_copy( + self.block_inputs[block_name], + position_ids=position_ids, + position_embeddings=position_embeddings)[0] loss = self._get_auto_loss(fp32_output[block_name], output) loss_alphas[block_name][str(alpha)] = loss del block_copy # release memory @@ -626,7 +633,7 @@ def _auto_tune_alpha(self): self._update_scales_for_auto(absorb_input_scales, weight_scales) # does not need to reset the weight_scale_dict, because use the weight of ori_layer, no change # self.weight_scale_dict = {} - + if total_cnt >= self.calib_sample_num: break diff --git a/auto_round/smooth_quant/sq.py b/auto_round/smooth_quant/sq.py index 1b8af7bb0..ef6d8ecb8 100644 --- a/auto_round/smooth_quant/sq.py +++ b/auto_round/smooth_quant/sq.py @@ -1,7 +1,8 @@ import torch import numpy -from auto_round.utils import logger, get_module, set_module +from auto_round.utils import logger +from auto_round.smooth_quant.utils import get_module, set_module from auto_round.smooth_quant.calibration import Calibration from auto_round.smooth_quant.utils import ( model_forward_per_sample, reshape_in_channel_to_last, cal_scale, mul_scale, reshape_scale_as_weight) diff --git a/auto_round/smooth_quant/utils.py b/auto_round/smooth_quant/utils.py index 6cc83f792..ed79ab276 100644 --- a/auto_round/smooth_quant/utils.py +++ b/auto_round/smooth_quant/utils.py @@ -4,9 +4,59 @@ import torch from tqdm import tqdm -from auto_round.utils import logger, get_module +from auto_round.utils import logger from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad +def get_module(model, key): + """Get module from model by key name. + + Args: + model (torch.nn.Module): original model + key (str): module name to be replaced + """ + module = model + name_list = key.split(".") + for name in name_list: + if hasattr(module, name): + module = getattr(module, name) + elif hasattr(module, "sq_linear"): # for peft models + module = getattr(module, "sq_linear") + module = getattr(module, name) + elif hasattr(module, "orig_layer"): # for peft models and auto alpha + module = getattr(module, "orig_layer") + module = getattr(module, name) + else: + module = module + return module + +def set_module(model, key, new_module): + """Set new module into model by key name. 
+ + Args: + model (torch.nn.Module): original model + key (str): module name to be replaced + new_module (torch.nn.Module): new module to be inserted + """ + module = model + name_list = key.split(".") + for name in name_list[:-1]: + if hasattr(module, name): + module = getattr(module, name) + elif hasattr(module, ("sq_linear")): # for peft models that Linears are contained in Linear + module = getattr(module, "sq_linear") + module = getattr(module, name) + elif hasattr(module, ("orig_layer")): # for peft models and auto alpha + module = getattr(module, "orig_layer") + module = getattr(module, name) + else: + module = module + + if hasattr(module, "sq_linear") and name_list[-1] != "sq_linear": # for peft models + module = getattr(module, "sq_linear") + if hasattr(module, "orig_layer") and name_list[-1] != "orig_layer": # for peft models and auto alpha + module = getattr(module, "orig_layer") + setattr(module, name_list[-1], new_module) + def mul_scale(tensor, scale, group_size=-1): ori_shape = tensor.shape if len(scale.shape) == 2 and scale.shape[1] == 1: From 041a66d70ef1c3d87073724bc2d4d199bb4b36e2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Aug 2025 07:58:29 +0000 Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/script/llm.py | 23 +++--- auto_round/smooth_quant/__init__.py | 2 +- auto_round/smooth_quant/absorb_utils.py | 103 +++++++++++++----------- auto_round/smooth_quant/auto_alpha.py | 48 +++++++---- auto_round/smooth_quant/calibration.py | 3 +- auto_round/smooth_quant/sq.py | 51 +++++++++--- auto_round/smooth_quant/utils.py | 31 ++++++- 7 files changed, 170 insertions(+), 91 deletions(-) diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index b03fcbd7c..b7cb53fd6 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -539,17 +539,19 @@ def tune(args): # sq if args.sq: from auto_round.calib_dataset import get_dataloader + dataloader = get_dataloader(tokenizer, args.seqlen, bs=8, nsamples=args.nsamples) - auto_alpha_args={ - "init_alpha": 0.5, - "alpha_min": 0.1, - "alpha_max": 1.0, - "alpha_step": 0.1, - "shared_criterion": "mean", - "n_samples": 512, ##512 for cuda, 128 for cpu? - # "do_blockwise": True - } + auto_alpha_args = { + "init_alpha": 0.5, + "alpha_min": 0.1, + "alpha_max": 1.0, + "alpha_step": 0.1, + "shared_criterion": "mean", + "n_samples": 512, ##512 for cuda, 128 for cpu? + # "do_blockwise": True + } from auto_round.smooth_quant import SmoothQuant + model = model.to(device_str) sq = SmoothQuant(model, dataloader, device=model.device, group_size=-1) model = sq.transform_model( @@ -558,7 +560,8 @@ def tune(args): auto_alpha_args=auto_alpha_args, folding=True, op_types=[torch.nn.Linear, torch.nn.Conv2d], - calib_iter=100) + calib_iter=100, + ) autoround = round( model=model, diff --git a/auto_round/smooth_quant/__init__.py b/auto_round/smooth_quant/__init__.py index 0abe6e53f..7af3654f7 100644 --- a/auto_round/smooth_quant/__init__.py +++ b/auto_round/smooth_quant/__init__.py @@ -15,4 +15,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from auto_round.smooth_quant.sq import SmoothQuant \ No newline at end of file +from auto_round.smooth_quant.sq import SmoothQuant diff --git a/auto_round/smooth_quant/absorb_utils.py b/auto_round/smooth_quant/absorb_utils.py index 78b2c2cc6..baf2f283b 100644 --- a/auto_round/smooth_quant/absorb_utils.py +++ b/auto_round/smooth_quant/absorb_utils.py @@ -31,11 +31,12 @@ "LPLayerNorm", "RMSNorm", "Qwen2RMSNorm", - "WrapperWALayer" + "WrapperWALayer", ] GET_ABSORB_LAYERS = {} + def register_absorb_func(model_type): def register(func): if isinstance(model_type, list): @@ -43,10 +44,12 @@ def register(func): else: model_types = [model_type] for name in model_types: - GET_ABSORB_LAYERS[name] = func + GET_ABSORB_LAYERS[name] = func return func + return register + def _check_valid_conv(module): """Remove group conv except depthwise conv :param module: @@ -62,6 +65,7 @@ def _check_valid_conv(module): return False return True + def remove_unsupported_layers(model, absorb_to_layer, no_absorb_layers): res = {} for key in absorb_to_layer.keys(): @@ -82,6 +86,7 @@ def remove_unsupported_layers(model, absorb_to_layer, no_absorb_layers): res[key] = absorb_to_layer[key] return res + @register_absorb_func("opt") def get_opt_absorb_layers(model): model_layer_name = "model.decoder.layers" @@ -93,13 +98,13 @@ def get_opt_absorb_layers(model): f"{model_layer_name}.{idx}.self_attn.k_proj", f"{model_layer_name}.{idx}.self_attn.v_proj", ] - + # attention out # no_absorb_layers.append(f"{model_layer_name}.{idx}.self_attn.out_proj") absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ f"{model_layer_name}.{idx}.self_attn.out_proj", ] - + # linear 1 absorb_to_layer[f"{model_layer_name}.{idx}.final_layer_norm"] = [ f"{model_layer_name}.{idx}.fc1", @@ -109,12 +114,13 @@ def get_opt_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.fc1"] = [ f"{model_layer_name}.{idx}.fc2", ] - + # final layer # absorb_to_layer["model.decoder.final_layer_norm"] = ['lm_head'] return absorb_to_layer + # @register_absorb_func('llama') # def get_llama_absorb_layers(model): # model_layer_name = "model.layers" @@ -140,7 +146,7 @@ def get_opt_absorb_layers(model): # absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ # f"{model_layer_name}.{idx}.self_attn.o_proj", # ] - + # # linear 1 # absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ # f"{model_layer_name}.{idx}.mlp.gate_proj", @@ -151,13 +157,14 @@ def get_opt_absorb_layers(model): # absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [ # f"{model_layer_name}.{idx}.mlp.down_proj", # ] - + # # final layer # # absorb_to_layer["model.norm"] = ['lm_head'] - + # return absorb_to_layer -@register_absorb_func('mistral') + +@register_absorb_func("mistral") def get_mistral_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer = {} @@ -175,7 +182,7 @@ def get_mistral_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ f"{model_layer_name}.{idx}.self_attn.o_proj", ] - + # linear 1 absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ f"{model_layer_name}.{idx}.mlp.gate_proj", @@ -186,13 +193,14 @@ def get_mistral_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [ f"{model_layer_name}.{idx}.mlp.down_proj", ] - + # final layer # absorb_to_layer["model.norm"] = ['lm_head'] - + return absorb_to_layer -@register_absorb_func('mixtral') + +@register_absorb_func("mixtral") def get_mixtral_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer = 
{} @@ -210,7 +218,7 @@ def get_mixtral_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ f"{model_layer_name}.{idx}.self_attn.o_proj", ] - + # linear in module = get_module(model, f"{model_layer_name}.{idx}.block_sparse_moe.experts") absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [] @@ -218,23 +226,22 @@ def get_mixtral_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"].extend( [ f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w1", - f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w3" + f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w3", ] ) - # linear out for i in range(len(module)): absorb_to_layer[f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w3"] = [ - f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w2" - ] - + f"{model_layer_name}.{idx}.block_sparse_moe.experts.{i}.w2" + ] + # final layer # absorb_to_layer["model.norm"] = ['lm_head'] return absorb_to_layer -@register_absorb_func('bloom') +@register_absorb_func("bloom") def get_bloom_absorb_layers(model): model_layer_name = "transformer.h" absorb_to_layer = {} @@ -253,14 +260,14 @@ def get_bloom_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.mlp.gelu_impl"] = [ f"{model_layer_name}.{idx}.mlp.dense_4h_to_h", ] - + # final layer # absorb_to_layer["transformer.ln_f"] = ['lm_head'] - + return absorb_to_layer -@register_absorb_func('gptj') +@register_absorb_func("gptj") def get_gptj_absorb_layers(model): model_layer_name = "transformer.h" absorb_to_layer = {} @@ -282,13 +289,14 @@ def get_gptj_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.mlp.act"] = [ f"{model_layer_name}.{idx}.mlp.fc_out", ] - + # final layer # absorb_to_layer["transformer.ln_f"] = ['lm_head'] - + return absorb_to_layer -@register_absorb_func('phi3') + +@register_absorb_func("phi3") def get_phi3_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer = {} @@ -302,7 +310,7 @@ def get_phi3_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.self_attn.qkv_proj"] = [ f"{model_layer_name}.{idx}.self_attn.o_proj", ] - + # linear 1 absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ f"{model_layer_name}.{idx}.mlp.gate_up_proj", @@ -312,29 +320,27 @@ def get_phi3_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.mlp.gate_up_proj"] = [ f"{model_layer_name}.{idx}.mlp.down_proj", ] - + # final layer # absorb_to_layer["model.norm"] = ['lm_head'] - + return absorb_to_layer -@register_absorb_func('qwen') +@register_absorb_func("qwen") def get_qwen_absorb_layers(model): model_layer_name = "transformer.h" absorb_to_layer = {} for idx in range(len(model.transformer.h)): # attention - absorb_to_layer[f"{model_layer_name}.{idx}.ln_1"] = [ - f"{model_layer_name}.{idx}.attn.c_attn" - ] + absorb_to_layer[f"{model_layer_name}.{idx}.ln_1"] = [f"{model_layer_name}.{idx}.attn.c_attn"] # mlp absorb_to_layer[f"{model_layer_name}.{idx}.ln_2"] = [ f"{model_layer_name}.{idx}.mlp.w2", f"{model_layer_name}.{idx}.mlp.w1", ] - + # linear 2 absorb_to_layer[f"{model_layer_name}.{idx}.mlp.w1"] = [ f"{model_layer_name}.{idx}.mlp.c_proj", @@ -342,12 +348,12 @@ def get_qwen_absorb_layers(model): # final layer # absorb_to_layer["transformer.ln_f"] = ['lm_head'] - + return absorb_to_layer @register_absorb_func(["qwen2", "qwen3"]) -@register_absorb_func('llama') +@register_absorb_func("llama") def get_defualt_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer 
= {} @@ -366,7 +372,7 @@ def get_defualt_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ f"{model_layer_name}.{idx}.self_attn.o_proj", ] - + # linear 1 absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ f"{model_layer_name}.{idx}.mlp.gate_proj", @@ -380,11 +386,11 @@ def get_defualt_absorb_layers(model): # final layer # absorb_to_layer["model.norm"] = ['lm_head'] - + return absorb_to_layer -@register_absorb_func('qwen3_moe') +@register_absorb_func("qwen3_moe") def get_qwen3_moe_absorb_layers(model): model_layer_name = "model.layers" absorb_to_layer = {} @@ -402,15 +408,15 @@ def get_qwen3_moe_absorb_layers(model): absorb_to_layer[f"{model_layer_name}.{idx}.v_proj"] = [ f"{model_layer_name}.{idx}.self_attn.o_proj", ] - + if hasattr(module.mlp, "gate"): # linear in absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ f"{model_layer_name}.{idx}.mlp.experts.{i}.gate_proj" for i in range(len(module.mlp.experts)) ] - absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"].extend([ - f"{model_layer_name}.{idx}.mlp.experts.{i}.up_proj" for i in range(len(module.mlp.experts)) - ]) + absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"].extend( + [f"{model_layer_name}.{idx}.mlp.experts.{i}.up_proj" for i in range(len(module.mlp.experts))] + ) breakpoint() # linear out @@ -421,18 +427,18 @@ def get_qwen3_moe_absorb_layers(model): else: # linear 1 absorb_to_layer[f"{model_layer_name}.{idx}.post_attention_layernorm"] = [ - f"{model_layer_name}.{idx}.mlp.gate_proj",f"{model_layer_name}.{idx}.mlp.up_proj" + f"{model_layer_name}.{idx}.mlp.gate_proj", + f"{model_layer_name}.{idx}.mlp.up_proj", ] # linear 2 - absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [ - f"{model_layer_name}.{idx}.mlp.down_proj" - ] - + absorb_to_layer[f"{model_layer_name}.{idx}.mlp.up_proj"] = [f"{model_layer_name}.{idx}.mlp.down_proj"] + # final layer # absorb_to_layer["model.norm"] = ['lm_head'] return absorb_to_layer + def get_absorb_layers(model, skip_unsupported_layers=False): model_type = model.config.model_type assert model_type in GET_ABSORB_LAYERS, f"Unsupported model type: {model_type}" @@ -441,4 +447,3 @@ def get_absorb_layers(model, skip_unsupported_layers=False): # if skip_unsupported_layers: # absorb_to_layer = remove_unsupported_layers(model, absorb_to_layer, no_absorb_layers) return absorb_to_layer, no_absorb_layers - \ No newline at end of file diff --git a/auto_round/smooth_quant/auto_alpha.py b/auto_round/smooth_quant/auto_alpha.py index d59c29550..6859cce44 100644 --- a/auto_round/smooth_quant/auto_alpha.py +++ b/auto_round/smooth_quant/auto_alpha.py @@ -19,19 +19,26 @@ import copy import json - -import torch -from .utils import logger - - import numpy +import torch from tqdm import tqdm from auto_round.smooth_quant.calibration import Calibration -from auto_round.smooth_quant.utils import set_module, get_module from auto_round.smooth_quant.utils import ( - WrapperLayer, enough_memo_store_scale, reshape_scale_as_input, reshape_scale_as_weight, reshape_in_channel_to_last, - cal_scale, forward_wrapper, mul_scale, quant_dequant) + WrapperLayer, + cal_scale, + enough_memo_store_scale, + forward_wrapper, + get_module, + mul_scale, + quant_dequant, + reshape_in_channel_to_last, + reshape_scale_as_input, + reshape_scale_as_weight, + set_module, +) + +from .utils import logger TUNERS = {} @@ -70,7 +77,7 @@ def __init__( do_blockwise=False, n_samples=32, calib_iter=100, - group_size=-1 + group_size=-1, ): 
"""Initialize the AutoAlpha tuner with necessary parameters and components.""" @@ -243,7 +250,13 @@ def _qdq_model_wrapper_for_auto(self, save_q_input=False): if name not in self.input_mins: # skip module if it's not used in calibration continue module = get_module(self.model, name) - new_module = WrapperLayer(module, self.input_mins[name], self.input_maxes[name], save_q_input=save_q_input, group_size=self.group_size) + new_module = WrapperLayer( + module, + self.input_mins[name], + self.input_maxes[name], + save_q_input=save_q_input, + group_size=self.group_size, + ) set_module(self.model, name, new_module) def _qdq_model_unwrapper_for_auto(self): @@ -427,7 +440,9 @@ def _get_one_batch_auto_loss(self, input, alpha_space, orig_best_alpha, input_ma if str(alpha) in losses.keys(): continue module = get_module(self.model, name) - output = module.q_dq_forward(module.q_input, module.input_scale, module.weight_scale, module.absorb_scale) + output = module.q_dq_forward( + module.q_input, module.input_scale, module.weight_scale, module.absorb_scale + ) loss = self._get_auto_loss(fp32_output[name], output) loss_alphas[name][str(alpha)] = loss return loss_alphas @@ -492,7 +507,9 @@ def _get_one_batch_auto_loss_blockwise(self, input, alpha_space, orig_best_alpha module_copy = copy.deepcopy(module) if module.weight_scale is not None: # module_copy.orig_layer.weight *= module.weight_scale - module_copy.orig_layer.weight.data = mul_scale(module_copy.orig_layer.weight, module.weight_scale, self.group_size) + module_copy.orig_layer.weight.data = mul_scale( + module_copy.orig_layer.weight, module.weight_scale, self.group_size + ) # q_dq_weight = quant_dequant_w_v1(module_copy.orig_layer) q_dq_weight = quant_dequant(module_copy.orig_layer) module_copy.orig_layer.weight.data.copy_(q_dq_weight) @@ -512,7 +529,8 @@ def _get_one_batch_auto_loss_blockwise(self, input, alpha_space, orig_best_alpha output = block_copy( self.block_inputs[block_name], position_ids=position_ids, - position_embeddings=position_embeddings)[0] + position_embeddings=position_embeddings, + )[0] loss = self._get_auto_loss(fp32_output[block_name], output) loss_alphas[block_name][str(alpha)] = loss del block_copy # release memory @@ -594,9 +612,7 @@ def _auto_tune_alpha(self): self._qdq_model_unwrapper_for_auto() return best_alphas # bar = tqdm(self.dataloader, total=self.calib_sample_num, desc="auto tune alpha") - pbar = tqdm( - range(self.calib_sample_num // self.dataloader.batch_size), - desc="auto tune alpha") + pbar = tqdm(range(self.calib_sample_num // self.dataloader.batch_size), desc="auto tune alpha") for input in self.dataloader: pbar.update(1) if isinstance(input, tuple) or isinstance(input, list): diff --git a/auto_round/smooth_quant/calibration.py b/auto_round/smooth_quant/calibration.py index bcc9ecaa6..38f12764e 100644 --- a/auto_round/smooth_quant/calibration.py +++ b/auto_round/smooth_quant/calibration.py @@ -16,10 +16,11 @@ import json import torch -from .utils import * from auto_round.data_type.utils import reshape_pad_tensor_by_group_size +from .utils import * + class Calibration: def __init__(self, model, dataloder=None, q_func=None, device="cpu", group_size=-1): diff --git a/auto_round/smooth_quant/sq.py b/auto_round/smooth_quant/sq.py index ef6d8ecb8..74a65a31d 100644 --- a/auto_round/smooth_quant/sq.py +++ b/auto_round/smooth_quant/sq.py @@ -1,15 +1,46 @@ -import torch +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import numpy +import torch -from auto_round.utils import logger -from auto_round.smooth_quant.utils import get_module, set_module +from auto_round.smooth_quant.absorb_utils import get_absorb_layers from auto_round.smooth_quant.calibration import Calibration from auto_round.smooth_quant.utils import ( - model_forward_per_sample, reshape_in_channel_to_last, cal_scale, mul_scale, reshape_scale_as_weight) -from auto_round.smooth_quant.absorb_utils import get_absorb_layers + cal_scale, + get_module, + model_forward_per_sample, + mul_scale, + reshape_in_channel_to_last, + reshape_scale_as_weight, + set_module, +) +from auto_round.utils import logger + class SmoothQuant: - def __init__(self, model, dataloader=None, device="cpu", dtype=torch.bfloat16, example_inputs=None, q_func=None, traced_model=None, group_size=-1): + def __init__( + self, + model, + dataloader=None, + device="cpu", + dtype=torch.bfloat16, + example_inputs=None, + q_func=None, + traced_model=None, + group_size=-1, + ): """ :param model: Torch model :param dataloader: Calibration dataloader :param traced_model: A specific model shares the same architecture as the model and could be traced by torch.jit. If not supplied, we use model @@ -42,7 +73,6 @@ def __init__(self, model, dataloader=None, device="cpu", dtype=torch.bfloat16, self.need_calibration = False self.group_size = group_size - @torch.no_grad() def transform_model( self, @@ -182,7 +212,7 @@ def revert(self): self._absorb_scales(key, 1.0 / self.absorb_scales_info[key]) self.weight_scale_info = {} ##clear the data self.absorb_scales_info = {} - + def output_is_equal(self, out1, out2, atol=1e-03): try: if isinstance(out1, tuple): @@ -199,7 +229,6 @@ def output_is_equal(self, out1, out2, atol=1e-03): ) return True - def _cal_scales(self, absorb_to_layer, input_maxes, alpha=0.5): """Cal the adjust scales :param absorb_to_layer: A dict mapping absorb layer to smooth quantized layer @@ -449,7 +478,9 @@ def _parse_absorb_to_layers(self, op_types, folding): # (due to self._get_all_layer_names use layer tree instead of forward_path) if not folding and self.need_calibration: if len(self.input_mins) == 0: ##there are some modules not used in forward - calib = Calibration(self.model, self.dataloader, self.q_func, self.device, group_size=self.group_size) ## + calib = Calibration( + self.model, self.dataloader, self.q_func, self.device, group_size=self.group_size + ) ## input_mins, input_maxes = calib.calibrate( 1, op_types ) ##TODO if using qfunc for calibration, it will calibrate twice diff --git a/auto_round/smooth_quant/utils.py b/auto_round/smooth_quant/utils.py index ed79ab276..93a5ac600 100644 --- a/auto_round/smooth_quant/utils.py +++ b/auto_round/smooth_quant/utils.py @@ -1,11 +1,26 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import copy from collections import UserDict, defaultdict -import torch +import torch from tqdm import tqdm -from auto_round.utils import logger from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad +from auto_round.utils import logger + def get_module(model, key): """Get module from model by key name. @@ -29,6 +44,7 @@ def get_module(model, key): module = module return module + def set_module(model, key, new_module): """Set new module into model by key name. @@ -57,6 +73,7 @@ def set_module(model, key, new_module): module = getattr(module, "orig_layer") setattr(module, name_list[-1], new_module) + def mul_scale(tensor, scale, group_size=-1): ori_shape = tensor.shape if len(scale.shape) == 2 and scale.shape[1] == 1: @@ -67,6 +84,7 @@ def mul_scale(tensor, scale, group_size=-1): tensor *= scale return tensor.reshape(ori_shape) + def reshape_scale_as_input(layer, scale): """Reshape the scale for input feature in channel :param layer: @@ -102,6 +120,7 @@ def reshape_scale_as_weight(layer, scale): scale = scale.view(1, scale.shape[0]) return scale + def move_input_to_device(input, device=torch.device("cpu")): if isinstance(input, dict) or isinstance(input, UserDict): tmp_input = {} @@ -118,6 +137,7 @@ def move_input_to_device(input, device=torch.device("cpu")): input = input.to(device) # pylint: disable=no-member return input + def forward_wrapper(model, input, device=torch.device("cpu")): try: model = model.to(device) @@ -158,6 +178,7 @@ def model_forward(model, dataloader, iters, device): break pbar.close() + def cal_scale(input_max_abs, weights, alpha, weight_max_lb=1e-5, group_size=-1): weights = torch.cat(weights, dim=0) weights, _, _ = reshape_pad_tensor_by_group_size(weights, group_size) @@ -199,16 +220,18 @@ def enough_memo_store_scale(device, need_space): return free_space >= need_space -def quant_dequant(m, num_bits=4, group_size=32, data_type='mx_fp4', sym=True): +def quant_dequant(m, num_bits=4, group_size=32, data_type="mx_fp4", sym=True): from auto_round.data_type.utils import get_quant_func + # data_type = 'int_asym' - data_type = 'mx_fp4' + data_type = "mx_fp4" tensor = m.weight if hasattr(m, "weight") else m quant_func, data_type = get_quant_func(data_type, num_bits, sym) # print(quant_func, num_bits) data_new, scale, zp = quant_func(tensor, bits=num_bits, group_size=group_size, v=0, max_scale=1.0) return data_new.to(tensor.dtype) + class WrapperLayer(torch.nn.Module): def __init__(self, layer, input_min, input_max, save_q_input=False, group_size=-1): super(WrapperLayer, self).__init__() From 9e9936c398da067bba249c33ee07113238be0dda Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Sep 2025 05:20:43 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/compressors/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 0dc3bc461..04456cc5e 100644 --- a/auto_round/compressors/base.py 
+++ b/auto_round/compressors/base.py @@ -385,10 +385,11 @@ def __init__( logger.info("habana_frameworks is available, import htcore explicitly.") import habana_frameworks.torch.core as htcore # pylint: disable=E0401 import habana_frameworks.torch.hpu as hthpu # pylint: disable=E0401] - + # sq, for test if sq: from auto_round.calib_dataset import get_dataloader + dataloader = get_dataloader(tokenizer, seqlen, bs=batch_size, nsamples=nsamples) auto_alpha_args = { "init_alpha": 0.5,