diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py
index 081274d9..aed2a357 100644
--- a/auto_round/auto_quantizer.py
+++ b/auto_round/auto_quantizer.py
@@ -545,6 +545,8 @@ def remove_device_str(s, device_str):
                     "via `pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`")
 
             QuantLinear = dynamic_import_inference_linear(layer_backend, bits, group_size, sym)
+            # from auto_round_extension.cuda.qlinear_exllamav2_gptq import QuantLinear
+
             layer_device = get_device(layer)
diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 818e135c..f229bb93 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -373,6 +373,8 @@ def dump_qinfo_to_layer_config(self):
         for n, m in self.model.named_modules():
             if n not in self.layer_config.keys():
                 continue
+            if hasattr(m,"orig_layer"):
+                m = m.orig_layer
             if hasattr(m, "scale"):
                 self.layer_config[n]["scale"] = m.scale
                 self.layer_config[n]["zp"] = m.zp
diff --git a/auto_round/data_type/fp8.py b/auto_round/data_type/fp8.py
index d51de462..15756e0d 100644
--- a/auto_round/data_type/fp8.py
+++ b/auto_round/data_type/fp8.py
@@ -287,4 +287,4 @@ def progressive_quant_fp8_int4(tensor, bits=4, group_size=-1, v=0, min_scale=1.0
                                                    q_scale_thresh=q_scale_thresh)
     qdq_tensor = qdq_int4_tensor * scale_bf16_to_fp8
 
-    return qdq_tensor, scale_fp8_to_int4 * scale_bf16_to_fp8, None,
+    return qdq_tensor, scale_fp8_to_int4 * scale_bf16_to_fp8, zp_fp8_to_int4
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 55290b85..f1e6d1b3 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -95,8 +95,9 @@ def pack_layer(name, model, layer_config, backend, pbar):
     bits = config["bits"]
     group_size = config["group_size"]
     sym = config["sym"]
-
     layer = get_module(model, name)
+    if hasattr(layer, "orig_layer"):
+        layer = layer.orig_layer
     device = layer.weight.device
@@ -123,7 +124,7 @@ def pack_layer(name, model, layer_config, backend, pbar):
     qlayer = new_layer
     scale = layer_config[name]["scale"]
     zero = layer_config[name]["zp"]
-    act_scale = layer_config[name]["act_scale"]
+    act_scale = layer.act_scale
     # so far can only pack layer on CPU
     qlayer.to("cpu")
     ##force to float32 to be compatible with torch 2.0
@@ -190,7 +191,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
         logger.info(f"AutoRound format does not support {backend}, try to pack each layer with AutoGPTQ")
         backend = backend.replace("auto_round", "auto_gptq")
 
-    model = kwargs["model"]
+    model = kwargs["model"].to(torch.float16) ##TODO change
     to_quant_block_names = kwargs["to_quant_block_names"]
     quant_block_list = kwargs.get("quant_block_list", None)
     safe_serialization = True if 'safe_serialization' not in kwargs.keys() else kwargs["safe_serialization"]
@@ -229,6 +230,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
+
     with ThreadPoolExecutor(max_workers=2) as executor:
         with tqdm(total=len(names), leave=True) as pbar:
             def wrapper(name):
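
The diff leans on two recurring patterns: wrapped layers keep their tuned tensors (`scale`, `zp`, `act_scale`) on an inner `orig_layer`, so callers unwrap before reading them, and layer packing is dispatched through a small `ThreadPoolExecutor`. Below is a minimal sketch of those patterns, not the project's actual implementation: `unwrap`, `pack_model`, and the stub `pack_layer` are illustrative names, and only the `orig_layer` attribute, the `ThreadPoolExecutor(max_workers=2)` block, and the `pack_layer(name, model, layer_config, backend, pbar)` signature come from the diff itself.

```python
# Sketch only: assumes a torch-style model with named_modules() and a
# layer_config dict keyed by module name, as used in export.py above.
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm


def unwrap(module):
    # Tuned quantization info lives on the wrapper's .orig_layer when present.
    return module.orig_layer if hasattr(module, "orig_layer") else module


def pack_layer(name, model, layer_config, backend, pbar):
    # Stand-in for the real pack_layer in export_to_autoround/export.py.
    layer = unwrap(dict(model.named_modules())[name])
    act_scale = getattr(layer, "act_scale", None)  # read from the layer, per the diff
    pbar.set_description(f"packing {name} (act_scale={act_scale is not None})")
    pbar.update(1)


def pack_model(model, layer_config, backend="auto_round:exllamav2"):
    names = list(layer_config.keys())
    # Mirror the ThreadPoolExecutor(max_workers=2) block added in export.py.
    with ThreadPoolExecutor(max_workers=2) as executor:
        with tqdm(total=len(names), leave=True) as pbar:
            def wrapper(name):
                pack_layer(name, model, layer_config, backend, pbar)

            # Consume the iterator so worker exceptions propagate to the caller.
            for _ in executor.map(wrapper, names):
                pass
```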