
Commit

1 fix a bug in autoround format with the latest transformers 2 rename n_samples n_blocks to nsamples nblocks (intel#163)
wenhuach21 authored Jun 27, 2024
1 parent e559b91 commit ce5640b
Showing 8 changed files with 185 additions and 311 deletions.
28 changes: 11 additions & 17 deletions README.md
@@ -11,12 +11,9 @@ AutoRound
<div align="left">

AutoRound is an advanced weight-only quantization algorithm for low-bits LLM inference. It's tailored for a wide range
-of models and consistently delivers noticeable improvements, often significantly outperforming SignRound with the cost
-of more tuning time for quantization.
-
-Our method adopts sign gradient descent to fine-tune rounding values and minmax values of weights in just 200 steps,
+of models. Our method adopts sign gradient descent to fine-tune rounding values and minmax values of weights in just 200 steps,
which competes impressively against recent methods without introducing any additional inference overhead. The below
-image presents an overview of AutoRound.
+image presents an overview of AutoRound. Check out our updated paper on [arxiv](https://arxiv.org/pdf/2309.05516v4)

<div align="center">

@@ -26,7 +23,6 @@ image presents an overview of AutoRound.

## What's New
* [2024/06] AutoRound format supports mixed bit-widths and group sizes for inference, resolving the significant performance drop issue with the asymmetric kernel
-* [2024/05] Check out our updated paper on [arxiv](https://arxiv.org/pdf/2309.05516v4)
* [2024/05] AutoRound supports lm-head quantization, saving 0.7G for LLaMA3-8B at W4G128.
* [2024/05] AutoRound performs well
in [low_bit_open_llm_leaderboard](https://huggingface.co/spaces/Intel/low_bit_open_llm_leaderboard)
@@ -102,7 +98,7 @@ autoround.save_quantized(output_dir) ##save_quantized(output_dir,format=="auto_r

- `minmax_lr (float)`: The learning rate for min-max tuning (default is None, it will be set to lr automatically).

-- `n_samples (int)`: Number of samples for tuning (default is 512).
+- `nsamples (int)`: Number of samples for tuning (default is 512).

- `seqlen (int)`: Data length of the sequence for tuning (default is 2048).

@@ -113,7 +109,7 @@ autoround.save_quantized(output_dir) ##save_quantized(output_dir,format=="auto_r

- `amp (bool)`: Whether to use automatic mixed precision (default is True).

-- `n_blocks (int)`: Packing several blocks as one for tuning together (default is 1).
+- `nblocks (int)`: Packing several blocks as one for tuning together (default is 1).

- `gradient_accumulate_steps (int)`: Number of gradient accumulation steps (default is 1).
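
To see how the tuning parameters documented above fit together, here is a minimal usage sketch. Only `nsamples`, `seqlen`, `nblocks`, `gradient_accumulate_steps`, `amp`, `minmax_lr`, and `save_quantized` appear in this excerpt; the `AutoRound` import path, the constructor shape, the `bits`/`group_size` arguments, and the `quantize()` call are assumptions for illustration, not taken from this diff.

```python
# Hypothetical sketch: parameter names follow the nsamples/nblocks renaming
# introduced by this commit; the constructor shape is assumed.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound  # import path assumed

model_name = "facebook/opt-125m"  # placeholder model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

autoround = AutoRound(
    model,
    tokenizer,
    bits=4,                       # assumed: weight bit-width
    group_size=128,               # assumed: quantization group size
    nsamples=512,                 # calibration samples (renamed from n_samples)
    seqlen=2048,                  # calibration sequence length
    nblocks=1,                    # blocks packed together for tuning (renamed from n_blocks)
    gradient_accumulate_steps=1,  # gradient accumulation steps
    amp=True,                     # automatic mixed precision
    minmax_lr=None,               # None -> follows lr automatically
)
autoround.quantize()              # assumed tuning entry point
autoround.save_quantized("./tmp_autoround")
```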

@@ -137,13 +133,13 @@ Please run the quantization code first.
### CPU

```python
-##Install the latest https://github.com/intel/intel-extension-for-transformers from source first.
+##pip install intel-extension-for-transformers
from intel_extension_for_transformers.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

quantized_model_path = "./tmp_autoround"
-model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, use_fast=True)
+model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
@@ -156,8 +152,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
##from auto_round.auto_quantizer import AutoHfQuantizer ## uncomment it for models with auto_round format

quantized_model_path = "./tmp_autoround"
-model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, use_fast=True)
+model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
@@ -174,10 +170,10 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
| google/gemma-2b | [HF-int4-model](https://huggingface.co/Intel/gemma-2b-int4-inc), [accuracy](./docs/gemma-2b-acc.md), [recipe](./examples/language-modeling/scripts/gemma-2b.sh), [example](./examples/language-modeling/)
| tiiuae/falcon-7b | [HF-int4-model-G64](https://huggingface.co/Intel/falcon-7b-int4-inc), [accuracy](./docs/falcon-7b-acc.md), [recipe](./examples/language-modeling/scripts/falcon-7b.sh), [example](./examples/language-modeling/) |
| mistralai/Mistral-7B-Instruct-v0.2 | [HF-int4-model](https://huggingface.co/Intel/Mistral-7B-Instruct-v0.2-int4-inc) (under review), [accuracy](./docs/Mistral-7B-Instruct-v0.2-acc.md), [recipe](./examples/language-modeling/scripts/Mistral-7B-Instruct-v0.2.sh), [example](./examples/language-modeling/) |
| google/gemma-7b | [HF-int4-model](https://huggingface.co/Intel/gemma-7b-int4-inc) (under review), [accuracy](./docs/gemma-7b-acc.md), [recipe](./examples/language-modeling/scripts/gemma-7b.sh), [example](./examples/language-modeling/) |
| mistralai/Mixtral-8x7B-Instruct-v0.1 | [HF-int4-model](https://huggingface.co/Intel/Mixtral-8x7B-Instruct-v0.1-int4-inc) (under review), [accuracy](./docs/Mixtral-8x7B-Instruct-v0.1-acc.md), [recipe](./examples/language-modeling/scripts/Mixtral-8x7B-Instruct-v0.1.sh), [example](./examples/language-modeling/) |
| mistralai/Mixtral-8x7B-v0.1 | [HF-int4-model](https://huggingface.co/Intel/Mixtral-8x7B-v0.1-int4-inc) (under review), [accuracy](./docs/Mixtral-8x7B-v0.1-acc.md), [recipe](./examples/language-modeling/scripts/Mixtral-8x7B-v0.1.sh), [example](./examples/language-modeling/) |
| meta-llama/Meta-Llama-3-8B-Instruct | [accuracy](./docs/Meta-Llama-3-8B-Instruct-acc.md), [recipe](./examples/language-modeling/scripts/Meta-Llama-3-8B-Instruct.sh), [example](./examples/language-modeling/) |
| google/gemma-7b | [accuracy](./docs/gemma-7b-acc.md), [recipe](./examples/language-modeling/scripts/gemma-7b.sh), [example](./examples/language-modeling/) |
| meta-llama/Llama-2-7b-chat-hf | [accuracy](./docs/Llama-2-7b-chat-hf-acc.md), [recipe](./examples/language-modeling/scripts/Llama-2-7b-chat-hf.sh), [example](./examples/language-modeling/) |
| Qwen/Qwen1.5-7B-Chat | [accuracy](./docs/Qwen1.5-7B-Chat-acc.md), [sym recipe](./examples/language-modeling/scripts/Qwen1.5-7B-Chat-sym.sh), [asym recipe ](./examples/language-modeling/scripts/Qwen1.5-7B-Chat-asym.sh), [example](./examples/language-modeling/) |
| baichuan-inc/Baichuan2-7B-Chat | [accuracy](./docs/baichuan2-7b-chat-acc.md), [recipe](./examples/language-modeling/scripts/baichuan2-7b-chat.sh), [example](./examples/language-modeling/) |
@@ -205,9 +201,7 @@ average accuracies of 11 zero-shot tasks.

1 Consider increasing tuning steps to achieve better results, albeit with increased tuning time.

-2 Setting 'enable_quanted_input' to False has been observed to occasionally yield improved results.
-
-3 Setting 'minmax_lr' to 2.0/iters has been observed to occasionally yield improved results.
+2 Setting 'minmax_lr' to 2.0/iters has been observed to occasionally yield improved results.
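
As a concrete reading of note 2, a tiny sketch (reusing the hypothetical constructor from the earlier sketch; the `iters` argument name is inferred from the "2.0/iters" wording, not shown in this diff):

```python
# Hypothetical: tie minmax_lr to the number of tuning iterations.
iters = 200                 # tuning steps; 200 is the default mentioned in the intro
autoround = AutoRound(
    model,
    tokenizer,
    iters=iters,            # assumed parameter name
    minmax_lr=2.0 / iters,  # occasionally yields improved results per note 2
)
```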

## Reference

19 changes: 7 additions & 12 deletions auto_round/auto_quantizer.py
@@ -43,7 +43,7 @@

from auto_round.utils import get_module, set_module, dynamic_import_inference_linear
import auto_round_extension.qbits.qlinear_qbits as qlinear_qbits

+from enum import Enum
logger = getLogger(__name__)
import sys

@@ -194,6 +194,9 @@ def merge_quantization_configs(

return quantization_config

+class AutoRoundQuantizationMethod(str, Enum):
+    AutoRound = "intel/auto-round"


@dataclass
class AutoRoundConfig(QuantizationConfigMixin):
@@ -222,6 +225,7 @@ def __init__(
weight_config: dict = None,
**kwargs,
):

self.bits = bits
self.tokenizer = tokenizer
self.dataset = dataset
@@ -232,7 +236,7 @@ def __init__(
if kwargs is not None:
for key in kwargs.keys():
setattr(self, key, kwargs[key])

+self.quant_method = AutoRoundQuantizationMethod.AutoRound
self.post_init()

def get_loading_attributes(self):
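
Taken together, the two additions above (the `AutoRoundQuantizationMethod` enum and the `self.quant_method` assignment) give the config an enum-typed `quant_method` that still behaves like the plain string `"intel/auto-round"`. A minimal, generic illustration of the `(str, Enum)` pattern (standard-library behavior, not code from this repository):

```python
# Why a str-backed Enum is convenient here: members compare equal to their raw
# string value and serialize as plain strings, so config round-tripping keeps working.
import json
from enum import Enum

class AutoRoundQuantizationMethod(str, Enum):
    AutoRound = "intel/auto-round"

method = AutoRoundQuantizationMethod.AutoRound
print(method == "intel/auto-round")           # True: string comparison still works
print(json.dumps({"quant_method": method}))   # {"quant_method": "intel/auto-round"}
```

Presumably this is what the compatibility fix for the latest transformers relies on: `quant_method` can be consumed where an enum-like identifier is expected, while still serializing and comparing as a plain string.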
@@ -378,11 +382,6 @@ def post_init_model(self, model):
The input model
"""

-#
-# if self.bits == 4: if get_device(model) == torch.device("cpu") or ( hasattr(model, "hf_device_map") and
-# any(d in model.hf_device_map for d in ["cpu", "disk"]) ): raise ValueError( "Found modules on cpu/disk.
-# Using Exllamav2 backend requires all the modules to be on GPU." "You can deactivate exllama backend by
-# setting `disable_exllama=True` in the quantization config object" )

class StoreAttr(object):
pass
@@ -406,11 +405,7 @@ def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs
model = self.post_init_model(model)
else:
raise NotImplementedError
-# if self.quantization_config.tokenizer is None:
-# self.quantization_config.tokenizer = model.name_or_path
-#
-# self.optimum_quantizer.quantize_model(model, self.quantization_config.tokenizer)
-# model.config.quantization_config = GPTQConfig.from_dict(self.optimum_quantizer.to_dict())


@property
def is_trainable(self, model: Optional["PreTrainedModel"] = None):
(Diffs for the remaining changed files are not shown.)
