diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 149f2b02..5dd77b9f 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -1075,7 +1075,7 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
         total_loss = 0
 
         for i in range(self.iters):
-            # logger.warning(f"iter {i}")
+            logger.warning(f"iter {i}")
             total_loss = 0
             if self.sampler == "rand":
                 whole_indices = torch.randperm(nsamples)[:pick_samples]
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index f010e1fc..d99dfa2f 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -191,7 +191,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
         logger.info(f"AutoRound format does not support {backend}, try to pack each layer with AutoGPTQ")
         backend = backend.replace("auto_round", "auto_gptq")
 
-    model = kwargs["model"].to(torch.float16) ##TODO change
+    model = kwargs["model"].to(torch.bfloat16)
     to_quant_block_names = kwargs["to_quant_block_names"]
     quant_block_list = kwargs.get("quant_block_list", None)
     safe_serialization = True if 'safe_serialization' not in kwargs.keys() else kwargs["safe_serialization"]
diff --git a/auto_round/export/export_to_autoround/qlinear_triton_gptq.py b/auto_round/export/export_to_autoround/qlinear_triton_gptq.py
index 6cae8a74..e8caf173 100644
--- a/auto_round/export/export_to_autoround/qlinear_triton_gptq.py
+++ b/auto_round/export/export_to_autoround/qlinear_triton_gptq.py
@@ -158,9 +158,9 @@ def pack(self, linear, scales, zeros, act_scales, w_bf16_to_fp8_scale, g_idx=Non
         scales = scales.t().contiguous()
         zeros = zeros.t().contiguous()
         scale_zeros = zeros * scales
-        self.scales = scales.clone().half()
-        self.act_scales = act_scales.clone().contiguous()
-        self.w_bf16_to_fp8_scale = w_bf16_to_fp8_scale.clone().contiguous()
+        self.scales.data.copy_(scales.clone().contiguous())
+        self.act_scales.data.copy_(act_scales.squeeze().clone())
+        self.w_bf16_to_fp8_scale.data.copy_(w_bf16_to_fp8_scale.squeeze().clone())
 
         if linear.bias is not None:
             self.bias = linear.bias.clone().half()