diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py
index 081274d9..aed2a357 100644
--- a/auto_round/auto_quantizer.py
+++ b/auto_round/auto_quantizer.py
@@ -545,6 +545,8 @@ def remove_device_str(s, device_str):
                     "via `pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`")
 
             QuantLinear = dynamic_import_inference_linear(layer_backend, bits, group_size, sym)
+            # from auto_round_extension.cuda.qlinear_exllamav2_gptq import QuantLinear
+
             layer_device = get_device(layer)
diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 818e135c..f229bb93 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -373,6 +373,8 @@ def dump_qinfo_to_layer_config(self):
         for n, m in self.model.named_modules():
             if n not in self.layer_config.keys():
                 continue
+            if hasattr(m,"orig_layer"):
+                m = m.orig_layer
             if hasattr(m, "scale"):
                 self.layer_config[n]["scale"] = m.scale
                 self.layer_config[n]["zp"] = m.zp
diff --git a/auto_round/data_type/fp8.py b/auto_round/data_type/fp8.py
index d51de462..15756e0d 100644
--- a/auto_round/data_type/fp8.py
+++ b/auto_round/data_type/fp8.py
@@ -287,4 +287,4 @@ def progressive_quant_fp8_int4(tensor, bits=4, group_size=-1, v=0, min_scale=1.0
                                                    q_scale_thresh=q_scale_thresh)
     qdq_tensor = qdq_int4_tensor * scale_bf16_to_fp8
 
-    return qdq_tensor, scale_fp8_to_int4 * scale_bf16_to_fp8, None,
+    return qdq_tensor, scale_fp8_to_int4 * scale_bf16_to_fp8, zp_fp8_to_int4
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 55290b85..f1e6d1b3 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -95,8 +95,9 @@ def pack_layer(name, model, layer_config, backend, pbar):
     bits = config["bits"]
     group_size = config["group_size"]
     sym = config["sym"]
-
     layer = get_module(model, name)
+    if hasattr(layer, "orig_layer"):
+        layer = layer.orig_layer
     device = layer.weight.device
@@ -123,7 +124,7 @@ def pack_layer(name, model, layer_config, backend, pbar):
     qlayer = new_layer
     scale = layer_config[name]["scale"]
     zero = layer_config[name]["zp"]
-    act_scale = layer_config[name]["act_scale"]
+    act_scale = layer.act_scale
     # so far can only pack layer on CPU
     qlayer.to("cpu")
     ##force to float32 to be compatible with torch 2.0
@@ -190,7 +191,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
         logger.info(f"AutoRound format does not support {backend}, try to pack each layer with AutoGPTQ")
         backend = backend.replace("auto_round", "auto_gptq")
 
-    model = kwargs["model"]
+    model = kwargs["model"].to(torch.float16) ##TODO change
     to_quant_block_names = kwargs["to_quant_block_names"]
     quant_block_list = kwargs.get("quant_block_list", None)
     safe_serialization = True if 'safe_serialization' not in kwargs.keys() else kwargs["safe_serialization"]
@@ -229,6 +230,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
+
     with ThreadPoolExecutor(max_workers=2) as executor:
         with tqdm(total=len(names), leave=True) as pbar:
             def wrapper(name):
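
The diff leans on two recurring patterns: wrapped layers keep their tuned tensors (`scale`, `zp`, `act_scale`) on an inner `orig_layer`, so callers unwrap before reading them, and layer packing is dispatched through a small `ThreadPoolExecutor`. Below is a minimal sketch of those patterns, not the project's actual implementation: `unwrap`, `pack_model`, and the stub `pack_layer` are illustrative names, and only the `orig_layer` attribute, the `ThreadPoolExecutor(max_workers=2)` block, and the `pack_layer(name, model, layer_config, backend, pbar)` signature come from the diff itself.

```python
# Sketch only: assumes a torch-style model with named_modules() and a
# layer_config dict keyed by module name, as used in export.py above.
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm


def unwrap(module):
    # Tuned quantization info lives on the wrapper's .orig_layer when present.
    return module.orig_layer if hasattr(module, "orig_layer") else module


def pack_layer(name, model, layer_config, backend, pbar):
    # Stand-in for the real pack_layer in export_to_autoround/export.py.
    layer = unwrap(dict(model.named_modules())[name])
    act_scale = getattr(layer, "act_scale", None)  # read from the layer, per the diff
    pbar.set_description(f"packing {name} (act_scale={act_scale is not None})")
    pbar.update(1)


def pack_model(model, layer_config, backend="auto_round:exllamav2"):
    names = list(layer_config.keys())
    # Mirror the ThreadPoolExecutor(max_workers=2) block added in export.py.
    with ThreadPoolExecutor(max_workers=2) as executor:
        with tqdm(total=len(names), leave=True) as pbar:
            def wrapper(name):
                pack_layer(name, model, layer_config, backend, pbar)

            # Consume the iterator so worker exceptions propagate to the caller.
            for _ in executor.map(wrapper, names):
                pass
```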