diff --git a/README.md b/README.md
index 8e069df9..428e623b 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ image presents an overview of AutoRound.
## What's New
-* [2024/06] AutoRound format supports mixed bits/group_size inference and fixed the asym kernel large drop issue.
+* [2024/06] AutoRound format supports mixed bit-widths and group sizes for inference, resolving the significant accuracy drop issue with the asymmetric kernel.
* [2024/05] Check out our updated paper on [arxiv](https://arxiv.org/pdf/2309.05516v4)
* [2024/05] AutoRound supports lm-head quantization, saving 0.7G for LLaMA3-8B at W4G128.
* [2024/05] AutoRound performs well
@@ -57,7 +57,7 @@ pip install auto-round
### Gaudi2/ CPU/ GPU
We found a significant accuracy discrepancy with the qdq model using the AutoGPTQ GPU backend with asymmetric
-quantization in some scenarios, especially at lower bits,like 2. Please save quantized model to auoround format to fix this issue.
+quantization in some scenarios, especially at lower bits, such as 2. Please save the quantized model in the AutoRound format to fix this issue.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
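As a point of reference for the README guidance above, here is a minimal end-to-end sketch of exporting in the AutoRound format instead of the AutoGPTQ one. The `AutoRound` constructor arguments, the `"auto_round"` format string, and the model name are assumptions inferred from the surrounding code in this diff (see `examples/language-modeling/main.py` below), not part of the hunk itself.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound  # assumed import path

model_name = "meta-llama/Meta-Llama-3-8B"  # hypothetical model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Quantize, then export in the AutoRound format so inference avoids the
# asymmetric-kernel accuracy issue seen with the AutoGPTQ GPU backend.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False)
autoround.quantize()
autoround.save_quantized("./llama3-8b-w4g128-autoround", format="auto_round")
```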
diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py
index e3efbb5e..84aaa5e7 100644
--- a/auto_round/auto_quantizer.py
+++ b/auto_round/auto_quantizer.py
@@ -37,7 +37,6 @@
from packaging import version
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import Conv1D
-import transformers
from transformers.quantizers import AutoQuantizationConfig, HfQuantizer
from transformers.quantizers.auto import AUTO_QUANTIZER_MAPPING
from transformers.utils.quantization_config import AwqConfig, GPTQConfig, QuantizationConfigMixin, QuantizationMethod
@@ -102,7 +101,7 @@ def is_autoround_exllamav2_available():
if is_auto_round_available():
- from auto_round.export.export_to_autoround.post_init import autoround_post_init
+ from auto_round_extension.cuda.post_init import autoround_post_init
#
@@ -214,14 +213,7 @@ def __init__(
group_size: int = 128,
sym: bool = False,
backend="autoround:exllamav2",
- iters: int = 200,
weight_config: dict = None,
- enable_quanted_input=True,
- enable_minmax_tuning=True,
- lr=None,
- minmax_lr=None,
- n_samples=512,
- seqlen=2048,
**kwargs,
):
self.bits = bits
@@ -230,14 +222,7 @@ def __init__(
self.group_size = group_size
self.sym = sym
self.backend = backend
- self.inters = iters
self.weight_config = weight_config
- self.enable_quanted_input = enable_quanted_input
- self.enable_minmax_tuning = enable_minmax_tuning
- self.lr = lr
- self.minmax_lr = minmax_lr
- self.n_samples = n_samples
- self.seqlen = seqlen
if kwargs is not None:
for key in kwargs.keys():
setattr(self, key, kwargs[key])
@@ -327,9 +312,9 @@ def convert_model(self, model: nn.Module):
def _dynamic_import_inference_linear(self, bits, backend):
if bits == 4 and self.exllama2_available and "exllama2" in backend:
- from auto_round.export.export_to_autoround.qliner_exllamav2 import QuantLinear
+ from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
else:
- from auto_round.export.export_to_autoround.qliner_triton import QuantLinear
+ from auto_round_extension.cuda.qliner_triton import QuantLinear
return QuantLinear
def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend):
@@ -431,4 +416,3 @@ def is_serializable(self):
transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer
transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer
-from transformers import AutoModelForCausalLM as AutoModelForCausalLM
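For context on the config slimming in this hunk, a rough sketch of what the trimmed constructor now accepts. The class name `AutoRoundConfig` and its standalone use are assumptions, since only the `__init__` body is visible here; the tuning hyperparameters (`iters`, `lr`, `minmax_lr`, `n_samples`, `seqlen`, the `enable_*` flags) are no longer part of the inference-time config.

```python
from auto_round.auto_quantizer import AutoRoundConfig  # assumed class/module names

# Only serialization/inference-relevant fields remain after this change.
quantization_config = AutoRoundConfig(
    bits=4,
    group_size=128,
    sym=False,
    backend="autoround:exllamav2",  # 4-bit uses the exllamav2 kernel when available, Triton otherwise
)
# Any extra keyword arguments are still attached as attributes via the **kwargs loop above.
```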
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 13667619..f89a03aa 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -89,7 +89,7 @@ def dynamic_QuantLienar_for_packing(backend, bits, group_size):
return QuantLinear
## Export always packs with Triton; inference uses exllamav2
elif "autoround" in backend or "auto-round" in backend or "auto_round" in backend:
- from .qliner_triton import QuantLinear
+ from auto_round_extension.cuda.qliner_triton import QuantLinear
return QuantLinear
else:
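To make the packing dispatch above concrete, a small usage sketch; the argument values are illustrative only.

```python
# For any backend string containing "autoround", "auto-round", or "auto_round",
# packing always uses the Triton QuantLinear (now located under auto_round_extension.cuda),
# while inference may later switch to the exllamav2 kernel in _dynamic_import_inference_linear.
QuantLinear = dynamic_QuantLienar_for_packing("autoround:exllamav2", bits=4, group_size=128)
```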
diff --git a/auto_round_extension/__init__.py b/auto_round_extension/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/auto_round_extension/cuda/__init__.py b/auto_round_extension/cuda/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/auto_round/export/export_to_autoround/post_init.py b/auto_round_extension/cuda/post_init.py
similarity index 98%
rename from auto_round/export/export_to_autoround/post_init.py
rename to auto_round_extension/cuda/post_init.py
index f536f5ad..3de7e5ab 100644
--- a/auto_round/export/export_to_autoround/post_init.py
+++ b/auto_round_extension/cuda/post_init.py
@@ -136,7 +136,7 @@ def autoround_post_init(model):
fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0))
if model_uses_exllamav2:
- from .qliner_exllamav2 import ExLlamaV2DeviceTensors
+ from auto_round_extension.cuda.qliner_exllamav2 import ExLlamaV2DeviceTensors
device_tensors = {}
for device, scratch_bytes in fixed_bytes.items():
diff --git a/auto_round/export/export_to_autoround/qliner_exllamav2.py b/auto_round_extension/cuda/qliner_exllamav2.py
similarity index 100%
rename from auto_round/export/export_to_autoround/qliner_exllamav2.py
rename to auto_round_extension/cuda/qliner_exllamav2.py
diff --git a/auto_round/export/export_to_autoround/qliner_triton.py b/auto_round_extension/cuda/qliner_triton.py
similarity index 98%
rename from auto_round/export/export_to_autoround/qliner_triton.py
rename to auto_round_extension/cuda/qliner_triton.py
index 7d02a27a..e307ace3 100644
--- a/auto_round/export/export_to_autoround/qliner_triton.py
+++ b/auto_round_extension/cuda/qliner_triton.py
@@ -41,13 +41,13 @@
import torch.nn as nn
import transformers
-from .triton_utils.mixin import TritonModuleMixin
+from auto_round_extension.cuda.triton_utils.mixin import TritonModuleMixin
logger = getLogger(__name__)
try:
- from .triton_utils.kernels import (
+    from auto_round_extension.cuda.triton_utils.kernels import (
QuantLinearFunction,
QuantLinearInferenceOnlyFunction,
quant_matmul_248,
diff --git a/auto_round/export/export_to_autoround/triton_utils/__init__.py b/auto_round_extension/cuda/triton_utils/__init__.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/__init__.py
rename to auto_round_extension/cuda/triton_utils/__init__.py
diff --git a/auto_round/export/export_to_autoround/triton_utils/custom_autotune.py b/auto_round_extension/cuda/triton_utils/custom_autotune.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/custom_autotune.py
rename to auto_round_extension/cuda/triton_utils/custom_autotune.py
diff --git a/auto_round/export/export_to_autoround/triton_utils/dequant.py b/auto_round_extension/cuda/triton_utils/dequant.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/dequant.py
rename to auto_round_extension/cuda/triton_utils/dequant.py
diff --git a/auto_round/export/export_to_autoround/triton_utils/kernels.py b/auto_round_extension/cuda/triton_utils/kernels.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/kernels.py
rename to auto_round_extension/cuda/triton_utils/kernels.py
diff --git a/auto_round/export/export_to_autoround/triton_utils/mixin.py b/auto_round_extension/cuda/triton_utils/mixin.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/mixin.py
rename to auto_round_extension/cuda/triton_utils/mixin.py
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index fe97a770..4cfb98a0 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -290,7 +290,7 @@ def get_library_version(library_name):
f"supported currently")
break
if args.quant_lm_head:
- weight_config[lm_head_layer_name] = {"data_type": "int"}
+ weight_config[lm_head_layer_name] = {"data_type": "int", "bits": 4, "group_size": 32}
transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
if transformers_version[0] == 4 and transformers_version[1] < 38:
error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
@@ -327,7 +327,7 @@ def get_library_version(library_name):
output_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}-qdq"
inplace = True if len(deployment_device) < 2 else False
- if 'gpu' in deployment_device:
+ if 'gpu' in deployment_device or "auto_round" in gpu_format or "auto-round" in gpu_format:
autoround.save_quantized(f'{export_dir}-gpu', format=gpu_format, use_triton=True, inplace=inplace)
if 'xpu' in deployment_device:
autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
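Finally, a condensed standalone sketch of the two behavioural changes in `examples/language-modeling/main.py`; the variable values are illustrative, and `lm_head_layer_name`, `deployment_device`, and `gpu_format` come from the surrounding script rather than this hunk.

```python
# 1) lm-head quantization now pins explicit bits and group_size instead of only the data type.
weight_config = {}
lm_head_layer_name = "lm_head"  # illustrative; the real script derives this from the model
weight_config[lm_head_layer_name] = {"data_type": "int", "bits": 4, "group_size": 32}

# 2) The GPU export path also fires when an AutoRound gpu_format is requested,
#    even if 'gpu' is not listed as a deployment device.
deployment_device = ["cpu"]
gpu_format = "auto_round"
export_gpu = 'gpu' in deployment_device or "auto_round" in gpu_format or "auto-round" in gpu_format
print(export_gpu)  # True
```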