diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 149f2b02..5dd77b9f 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -1075,7 +1075,7 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
         total_loss = 0
 
         for i in range(self.iters):
-            # logger.warning(f"iter {i}")
+            logger.warning(f"iter {i}")
             total_loss = 0
             if self.sampler == "rand":
                 whole_indices = torch.randperm(nsamples)[:pick_samples]
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index f010e1fc..d99dfa2f 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -191,7 +191,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
         logger.info(f"AutoRound format does not support {backend}, try to pack each layer with AutoGPTQ")
         backend = backend.replace("auto_round", "auto_gptq")
 
-    model = kwargs["model"].to(torch.float16) ##TODO change
+    model = kwargs["model"].to(torch.bfloat16)
     to_quant_block_names = kwargs["to_quant_block_names"]
     quant_block_list = kwargs.get("quant_block_list", None)
     safe_serialization = True if 'safe_serialization' not in kwargs.keys() else kwargs["safe_serialization"]
diff --git a/auto_round/export/export_to_autoround/qlinear_triton_gptq.py b/auto_round/export/export_to_autoround/qlinear_triton_gptq.py
index 6cae8a74..e8caf173 100644
--- a/auto_round/export/export_to_autoround/qlinear_triton_gptq.py
+++ b/auto_round/export/export_to_autoround/qlinear_triton_gptq.py
@@ -158,9 +158,9 @@ def pack(self, linear, scales, zeros, act_scales, w_bf16_to_fp8_scale, g_idx=Non
         scales = scales.t().contiguous()
         zeros = zeros.t().contiguous()
         scale_zeros = zeros * scales
-        self.scales = scales.clone().half()
-        self.act_scales = act_scales.clone().contiguous()
-        self.w_bf16_to_fp8_scale = w_bf16_to_fp8_scale.clone().contiguous()
+        self.scales.data.copy_(scales.clone().contiguous())
+        self.act_scales.data.copy_(act_scales.squeeze().clone())
+        self.w_bf16_to_fp8_scale.data.copy_(w_bf16_to_fp8_scale.squeeze().clone())
 
         if linear.bias is not None:
             self.bias = linear.bias.clone().half()