
reorg the code of cuda kernel
wenhuach21 committed Jun 3, 2024
1 parent ee65eda commit e1a7233
Showing 14 changed files with 11 additions and 27 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -25,7 +25,7 @@ image presents an overview of AutoRound.
<div align="left">

## What's New
* [2024/06] AutoRound format supports mixed bits/group_size inference and fixed the asym kernel large drop issue.
* [2024/06] AutoRound format supports mixed bit-widths and group sizes for inference, resolving the significant accuracy drop seen with the asymmetric kernel.
* [2024/05] Check out our updated paper on [arxiv](https://arxiv.org/pdf/2309.05516v4)
* [2024/05] AutoRound supports lm-head quantization, saving 0.7G for LLaMA3-8B at W4G128.
* [2024/05] AutoRound performs well
@@ -57,7 +57,7 @@ pip install auto-round
### Gaudi2/ CPU/ GPU

We found a significant accuracy discrepancy with the qdq model using the AutoGPTQ GPU backend with asymmetric
quantization in some scenarios, especially at lower bits,like 2. Please save quantized model to auoround format to fix this issue.
quantization in some scenarios, especially at lower bits, like 2. Please save the quantized model in the AutoRound format to fix this issue.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
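A minimal sketch of the workaround described above, assuming the top-level `AutoRound` API (`AutoRound(...)`, `quantize()`) and the `save_quantized(format=...)` call that appears later in this diff in `examples/language-modeling/main.py`; the model name and output directory are placeholders.

```python
# Hedged sketch: quantize with AutoRound and export in the AutoRound format,
# which avoids the AutoGPTQ asymmetric-kernel accuracy drop described above.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound  # assumes the package exposes AutoRound at the top level

model_name = "facebook/opt-125m"  # placeholder model
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False)
autoround.quantize()

# "auto_round" matches the backend strings accepted by the exporter changed in this diff.
autoround.save_quantized("./opt-125m-autoround-w4g128", format="auto_round", inplace=True)
```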
22 changes: 3 additions & 19 deletions auto_round/auto_quantizer.py
@@ -37,7 +37,6 @@
from packaging import version
from transformers.modeling_utils import PreTrainedModel
from transformers.pytorch_utils import Conv1D
import transformers
from transformers.quantizers import AutoQuantizationConfig, HfQuantizer
from transformers.quantizers.auto import AUTO_QUANTIZER_MAPPING
from transformers.utils.quantization_config import AwqConfig, GPTQConfig, QuantizationConfigMixin, QuantizationMethod
@@ -102,7 +101,7 @@ def is_autoround_exllamav2_available():


if is_auto_round_available():
from auto_round.export.export_to_autoround.post_init import autoround_post_init
from auto_round_extension.cuda.post_init import autoround_post_init


#
@@ -214,14 +213,7 @@ def __init__(
group_size: int = 128,
sym: bool = False,
backend="autoround:exllamav2",
iters: int = 200,
weight_config: dict = None,
enable_quanted_input=True,
enable_minmax_tuning=True,
lr=None,
minmax_lr=None,
n_samples=512,
seqlen=2048,
**kwargs,
):
self.bits = bits
@@ -230,14 +222,7 @@
self.group_size = group_size
self.sym = sym
self.backend = backend
self.inters = iters
self.weight_config = weight_config
self.enable_quanted_input = enable_quanted_input
self.enable_minmax_tuning = enable_minmax_tuning
self.lr = lr
self.minmax_lr = minmax_lr
self.n_samples = n_samples
self.seqlen = seqlen
if kwargs is not None:
for key in kwargs.keys():
setattr(self, key, kwargs[key])
@@ -327,9 +312,9 @@ def convert_model(self, model: nn.Module):

def _dynamic_import_inference_linear(self, bits, backend):
if bits == 4 and self.exllama2_available and "exllama2" in backend:
from auto_round.export.export_to_autoround.qliner_exllamav2 import QuantLinear
from auto_round_extension.cuda.qliner_exllamav2 import QuantLinear
else:
from auto_round.export.export_to_autoround.qliner_triton import QuantLinear
from auto_round_extension.cuda.qliner_triton import QuantLinear
return QuantLinear

def _replace_by_quant_layers(self, module: nn.Module, layer_configs, backend):
@@ -431,4 +416,3 @@ def is_serializable(self):

transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer
transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer
from transformers import AutoModelForCausalLM as AutoModelForCausalLM
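For orientation, a hedged usage sketch (not part of this commit): importing this module applies the `AutoHfQuantizer` patch from the last lines above, after which a checkpoint saved in the AutoRound format loads through the relocated `auto_round_extension.cuda` kernels chosen by `_dynamic_import_inference_linear` (exllamav2 for 4-bit when available, triton otherwise). The path below is a placeholder.

```python
# Hedged sketch: loading an AutoRound-format checkpoint for inference.
# Importing auto_round.auto_quantizer patches transformers' AutoHfQuantizer (see above),
# so from_pretrained can rebuild the quantized linears; bits/group_size/sym/backend
# are read from the quantization_config stored with the checkpoint.
import auto_round.auto_quantizer  # noqa: F401  (side-effect import: registers the quantizer)
from transformers import AutoModelForCausalLM, AutoTokenizer

quantized_dir = "./opt-125m-autoround-w4g128"  # placeholder path from the earlier sketch
model = AutoModelForCausalLM.from_pretrained(quantized_dir, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(quantized_dir)

prompt = "There is a girl who likes adventure,"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```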
2 changes: 1 addition & 1 deletion auto_round/export/export_to_autoround/export.py
@@ -89,7 +89,7 @@ def dynamic_QuantLienar_for_packing(backend, bits, group_size):
return QuantLinear
## export always packs with triton; inference uses exllamav2
elif "autoround" in backend or "auto-round" in backend or "auto_round" in backend:
from .qliner_triton import QuantLinear
from auto_round_extension.cuda.qliner_triton import QuantLinear
return QuantLinear

else:
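A small hedged illustration of the dispatch above (the function name keeps the repository's spelling): for an AutoRound-style backend string, packing resolves to the triton `QuantLinear` now imported from `auto_round_extension.cuda`.

```python
# Hedged sketch: resolving the layer class used for packing via the dispatcher above.
from auto_round.export.export_to_autoround.export import dynamic_QuantLienar_for_packing

QuantLinear = dynamic_QuantLienar_for_packing(backend="auto_round", bits=4, group_size=128)
print(QuantLinear.__module__)  # expected to point at auto_round_extension.cuda.qliner_triton
```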
Empty file.
Empty file.
@@ -136,7 +136,7 @@ def autoround_post_init(model):
fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0))

if model_uses_exllamav2:
from .qliner_exllamav2 import ExLlamaV2DeviceTensors
from auto_round_extension.cuda.qliner_exllamav2 import ExLlamaV2DeviceTensors

device_tensors = {}
for device, scratch_bytes in fixed_bytes.items():
@@ -41,13 +41,13 @@
import torch.nn as nn
import transformers

from .triton_utils.mixin import TritonModuleMixin
from auto_round_extension.cuda.triton_utils.mixin import TritonModuleMixin


logger = getLogger(__name__)

try:
from .triton_utils.kernels import (
from auto_round_extension.cuda.triton_utils import (
QuantLinearFunction,
QuantLinearInferenceOnlyFunction,
quant_matmul_248,
4 changes: 2 additions & 2 deletions examples/language-modeling/main.py
@@ -290,7 +290,7 @@ def get_library_version(library_name):
f"supported currently")
break
if args.quant_lm_head:
weight_config[lm_head_layer_name] = {"data_type": "int"}
weight_config[lm_head_layer_name] = {"data_type": "int", "bits": 4, "group_size": 32}
transformers_version = [int(item) for item in transformers.__version__.split('.')[:2]]
if transformers_version[0] == 4 and transformers_version[1] < 38:
error_message = "Please upgrade transformers>=4.38.0 to support lm-head quantization."
@@ -327,7 +327,7 @@ def get_library_version(library_name):
output_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}-qdq"

inplace = True if len(deployment_device) < 2 else False
if 'gpu' in deployment_device:
if 'gpu' in deployment_device or "auto_round" in gpu_format or "auto-round" in gpu_format:
autoround.save_quantized(f'{export_dir}-gpu', format=gpu_format, use_triton=True, inplace=inplace)
if 'xpu' in deployment_device:
autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
