diff --git a/README.md b/README.md
index 8e069df9..428e623b 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ image presents an overview of AutoRound.
 ## What's New
-* [2024/06] AutoRound format supports mixed bits/group_size inference and fixed the asym kernel large drop issue.
+* [2024/06] The AutoRound format now supports mixed bit-widths and group sizes for inference, resolving the significant accuracy drop previously seen with the asymmetric kernel.
 * [2024/05] Check out our updated paper on [arxiv](https://arxiv.org/pdf/2309.05516v4)
 * [2024/05] AutoRound supports lm-head quantization, saving 0.7G for LLaMA3-8B at W4G128.
 * [2024/05] AutoRound performs well
@@ -57,7 +57,7 @@ pip install auto-round
 ### Gaudi2/ CPU/ GPU
 We found a significant accuracy discrepancy with the qdq model using the AutoGPTQ GPU backend with asymmetric
-quantization in some scenarios, especially at lower bits,like 2. Please save quantized model to auoround format to fix this issue.
+quantization in some scenarios, especially at lower bits, like 2. Please save the quantized model in the AutoRound format to fix this issue.
 
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py
index e3efbb5e..84aaa5e7 100644
--- a/auto_round/auto_quantizer.py
+++ b/auto_round/auto_quantizer.py
@@ -37,7 +37,6 @@
 from packaging import version
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import Conv1D
-import transformers
 from transformers.quantizers import AutoQuantizationConfig, HfQuantizer
 from transformers.quantizers.auto import AUTO_QUANTIZER_MAPPING
 from transformers.utils.quantization_config import AwqConfig, GPTQConfig, QuantizationConfigMixin, QuantizationMethod
@@ -102,7 +101,7 @@ def is_autoround_exllamav2_available():
 
 if is_auto_round_available():
-    from auto_round.export.export_to_autoround.post_init import autoround_post_init
+    from auto_round_extension.cuda.post_init import autoround_post_init
 
 
 #
@@ -214,14 +213,7 @@ def __init__(
         group_size: int = 128,
         sym: bool = False,
         backend="autoround:exllamav2",
-        iters: int = 200,
         weight_config: dict = None,
-        enable_quanted_input=True,
-        enable_minmax_tuning=True,
-        lr=None,
-        minmax_lr=None,
-        n_samples=512,
-        seqlen=2048,
         **kwargs,
     ):
         self.bits = bits
@@ -230,14 +222,7 @@ def __init__(
         self.group_size = group_size
         self.sym = sym
         self.backend = backend
-        self.inters = iters
         self.weight_config = weight_config
-        self.enable_quanted_input = enable_quanted_input
-        self.enable_minmax_tuning = enable_minmax_tuning
-        self.lr = lr
-        self.minmax_lr = minmax_lr
-        self.n_samples = n_samples
-        self.seqlen = seqlen
         if kwargs is not None:
             for key in kwargs.keys():
                 setattr(self, key, kwargs[key])
@@ -327,9 +312,9 @@ def convert_model(self, model: nn.Module):
 
     def _dynamic_import_inference_linear(self, bits, backend):
         if bits == 4 and self.exllama2_available and "exllama2" in backend:
-            from auto_round.export.export_to_autoround.qliner_exllamav2 import QuantLinear
+            from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
         else:
-            from auto_round.export.export_to_autoround.qliner_triton import QuantLinear
+            from auto_round_extension.cuda.qliner_triton import QuantLinear
         return QuantLinear
 
     def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend):
@@ -431,4 +416,3 @@ def is_serializable(self):
 
 transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer
 transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer
-from transformers import AutoModelForCausalLM as AutoModelForCausalLM
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 13667619..f89a03aa 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -89,7 +89,7 @@ def dynamic_QuantLienar_for_packing(backend, bits, group_size):
         return QuantLinear
     ##export all use trition, inference use exllamav2
     elif "autoround" in backend or "auto-round" in backend or "auto_round" in backend:
-        from .qliner_triton import QuantLinear
+        from auto_round_extension.cuda.qliner_triton import QuantLinear
 
         return QuantLinear
     else:
diff --git a/auto_round_extension/__init__.py b/auto_round_extension/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/auto_round_extension/cuda/__init__.py b/auto_round_extension/cuda/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/auto_round/export/export_to_autoround/post_init.py b/auto_round_extension/cuda/post_init.py
similarity index 98%
rename from auto_round/export/export_to_autoround/post_init.py
rename to auto_round_extension/cuda/post_init.py
index f536f5ad..3de7e5ab 100644
--- a/auto_round/export/export_to_autoround/post_init.py
+++ b/auto_round_extension/cuda/post_init.py
@@ -136,7 +136,7 @@ def autoround_post_init(model):
             fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0))
 
     if model_uses_exllamav2:
-        from .qliner_exllamav2 import ExLlamaV2DeviceTensors
+        from auto_round_extension.cuda.qliner_exllamav2 import ExLlamaV2DeviceTensors
 
         device_tensors = {}
         for device, scratch_bytes in fixed_bytes.items():
diff --git a/auto_round/export/export_to_autoround/qliner_exllamav2.py b/auto_round_extension/cuda/qliner_exllamav2.py
similarity index 100%
rename from auto_round/export/export_to_autoround/qliner_exllamav2.py
rename to auto_round_extension/cuda/qliner_exllamav2.py
diff --git a/auto_round/export/export_to_autoround/qliner_triton.py b/auto_round_extension/cuda/qliner_triton.py
similarity index 98%
rename from auto_round/export/export_to_autoround/qliner_triton.py
rename to auto_round_extension/cuda/qliner_triton.py
index 7d02a27a..e307ace3 100644
--- a/auto_round/export/export_to_autoround/qliner_triton.py
+++ b/auto_round_extension/cuda/qliner_triton.py
@@ -41,13 +41,13 @@ import torch.nn as nn
 import transformers
 
-from .triton_utils.mixin import TritonModuleMixin
+from auto_round_extension.cuda.triton_utils.mixin import TritonModuleMixin
 
 logger = getLogger(__name__)
 
 try:
-    from .triton_utils.kernels import (
+    from auto_round_extension.cuda.triton_utils.kernels import (
         QuantLinearFunction,
         QuantLinearInferenceOnlyFunction,
         quant_matmul_248,
diff --git a/auto_round/export/export_to_autoround/triton_utils/__init__.py b/auto_round_extension/cuda/triton_utils/__init__.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/__init__.py
rename to auto_round_extension/cuda/triton_utils/__init__.py
diff --git a/auto_round/export/export_to_autoround/triton_utils/custom_autotune.py b/auto_round_extension/cuda/triton_utils/custom_autotune.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/custom_autotune.py
rename to auto_round_extension/cuda/triton_utils/custom_autotune.py
diff --git a/auto_round/export/export_to_autoround/triton_utils/dequant.py b/auto_round_extension/cuda/triton_utils/dequant.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/dequant.py
rename to auto_round_extension/cuda/triton_utils/dequant.py
diff --git a/auto_round/export/export_to_autoround/triton_utils/kernels.py b/auto_round_extension/cuda/triton_utils/kernels.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/kernels.py
rename to auto_round_extension/cuda/triton_utils/kernels.py
diff --git a/auto_round/export/export_to_autoround/triton_utils/mixin.py b/auto_round_extension/cuda/triton_utils/mixin.py
similarity index 100%
rename from auto_round/export/export_to_autoround/triton_utils/mixin.py
rename to auto_round_extension/cuda/triton_utils/mixin.py
diff --git a/examples/language-modeling/main.py b/examples/language-modeling/main.py
index fe97a770..4cfb98a0 100644
--- a/examples/language-modeling/main.py
+++ b/examples/language-modeling/main.py
@@ -290,7 +290,7 @@ def get_library_version(library_name):
                       f"supported currently")
            break
 if args.quant_lm_head:
-    weight_config[lm_head_layer_name] = {"data_type": "int"}
+    weight_config[lm_head_layer_name] = {"data_type": "int", "bits": 4, "group_size": 32}
 transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
 if transformers_version[0] == 4 and transformers_version[1] < 38:
     error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
@@ -327,7 +327,7 @@
 output_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}-qdq"
 
 inplace = True if len(deployment_device) < 2 else False
-if 'gpu' in deployment_device:
+if 'gpu' in deployment_device or "auto_round" in gpu_format or "auto-round" in gpu_format:
     autoround.save_quantized(f'{export_dir}-gpu', format=gpu_format, use_triton=True, inplace=inplace)
 if 'xpu' in deployment_device:
     autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
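The hunks above relocate the CUDA/Triton kernels into `auto_round_extension.cuda` and route the GPU export branch through the AutoRound format; loading an exported checkpoint is unchanged. Below is a minimal inference sketch, not part of this patch: the checkpoint directory is hypothetical, and importing `auto_round.auto_quantizer` is assumed to be enough to register `AutoHfQuantizer` with transformers (per the monkey-patch at the end of `auto_quantizer.py` above).

```python
# Sketch: load a checkpoint saved in the AutoRound format and run generation.
# Assumption: importing auto_round.auto_quantizer applies the AutoHfQuantizer patch.
from transformers import AutoModelForCausalLM, AutoTokenizer

import auto_round.auto_quantizer  # noqa: F401  # registers AutoHfQuantizer with transformers

quantized_model_dir = "./opt-125m-autoround-w4g128-gpu"  # hypothetical export path

model = AutoModelForCausalLM.from_pretrained(quantized_model_dir, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)

inputs = tokenizer("There is a girl who likes adventure,", return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```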