set auto_round format as default
wenhuach21 committed Jun 7, 2024
1 parent dbdc4a3 commit 49c0ac1
Showing 5 changed files with 11 additions and 32 deletions.
25 changes: 5 additions & 20 deletions README.md
@@ -57,9 +57,8 @@ pip install auto-round
## Model quantization

### Gaudi2/ CPU/ GPU
By default, we export to the AutoRound format, which supports both CUDA and CPU backends and preserves accuracy for asymmetric quantization. To export to a format compatible with Transformers, save the model in the auto_gptq format.

We found a significant accuracy discrepancy with the qdq model when using the AutoGPTQ GPU backend with asymmetric
quantization in some scenarios, especially at lower bits, such as 2. Please save the quantized model in the AutoRound format to avoid this issue.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -75,7 +74,7 @@ bits, group_size, sym = 4, 128, False
autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, device=None)
autoround.quantize()
output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir) ##save_quantized(output_dir,format=="auto_round")
autoround.save_quantized(output_dir)  ## save_quantized(output_dir, format="auto_gptq")
```
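
For reference, a minimal sketch of the Transformers-compatible export path mentioned above; the output directory name is illustrative, and `autoround` is the AutoRound instance from the snippet above:

```python
## export to the auto_gptq format for loading with stock Transformers/AutoGPTQ tooling
output_dir_gptq = "./tmp_autogptq"
autoround.save_quantized(output_dir_gptq, format="auto_gptq")
```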

<details>
@@ -134,26 +133,12 @@ autoround.save_quantized(output_dir) ##save_quantized(output_dir,format=="auto_r

Please run the quantization code first.

### CPU

```python
##Install the latest https://github.com/intel/intel-extension-for-transformers from source first.
from intel_extension_for_transformers.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

quantized_model_path = "./tmp_autoround"
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, use_fast=True)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
```

### GPU
### CPU/GPU

```python
## Install auto-round first; for the auto_gptq format, also install auto-gptq
from transformers import AutoModelForCausalLM, AutoTokenizer
##from auto_round.auto_quantizer import AutoHfQuantizer ## uncomment it for models with auto_round format
from auto_round.auto_quantizer import AutoHfQuantizer  ## comment this out for models in the auto_gptq format

quantized_model_path = "./tmp_autoround"
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
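## A minimal generation sketch, mirroring the CPU example above; the tokenizer
## loading and generate() calls are standard transformers API.
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, use_fast=True)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
```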
2 changes: 1 addition & 1 deletion auto_round/autoround.py
@@ -1019,7 +1019,7 @@ def quant_blocks(

torch.cuda.empty_cache()

def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kwargs):
def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs):
"""Save the quantized model to the specified output directory in the specified format.
Args:
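
With the new default, a bare save_quantized call exports the AutoRound format; below is a minimal sketch of both call paths, assuming an AutoRound instance on which quantize() has already been run (directory names are illustrative):

```python
autoround.save_quantized("./tmp_autoround")  ## format defaults to "auto_round"
## the language-modeling example passes inplace=False when it exports more than one format
autoround.save_quantized("./tmp_autoround_gptq", format="auto_gptq", inplace=False)
```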
2 changes: 1 addition & 1 deletion auto_round/export/export_to_autoround/export.py
@@ -180,7 +180,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
save(model, output_dir)


def save(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True):
"""Save model state dict and configs.
Args:
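
The smaller shard cap only affects how the checkpoint is split across files on disk. As a point of reference, a hedged sketch of the equivalent transformers-level behaviour, assuming `model` is any transformers PreTrainedModel (the directory name is illustrative):

```python
## illustrative only: save_pretrained accepts the same max_shard_size / safe_serialization
## parameters and splits the state dict into shards no larger than 5GB
model.save_pretrained("./my_output_dir", max_shard_size="5GB", safe_serialization=True)
```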
13 changes: 4 additions & 9 deletions examples/language-modeling/main.py
@@ -300,13 +300,6 @@ def get_library_version(library_name):
print(f"warning, disable_low_gpu_mem_usage is strongly recommended if the whole model could be loaded to "
f"gpu")
deployment_device = args.deployment_device.split(',')
gpu_format = "auto_gptq"
if 'gpu' in deployment_device:
if lm_head_layer_name in weight_config.keys() and weight_config[lm_head_layer_name]["data_type"] == "int":
gpu_format = "auto_round"

if "autoround" in deployment_device or "auto-round" in deployment_device or "auto_round" in deployment_device:
gpu_format = "auto_round"

autoround = round(model, tokenizer, args.bits, args.group_size, sym=args.sym, batch_size=args.train_bs,
dataset=args.dataset, seqlen=seqlen, n_blocks=args.n_blocks, iters=args.iters, lr=args.lr,
@@ -327,8 +320,10 @@ def get_library_version(library_name):
output_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}-qdq"

inplace = True if len(deployment_device) < 2 else False
if 'gpu' in deployment_device or "auto_round" in gpu_format or "auto-round" in gpu_format:
autoround.save_quantized(f'{export_dir}-gpu', format=gpu_format, use_triton=True, inplace=inplace)
if 'gpu' in deployment_device:
autoround.save_quantized(f'{export_dir}-gpu', format="auto_round", use_triton=True, inplace=inplace)
if "auto_gptq" in deployment_device:
autoround.save_quantized(f'{export_dir}-gptq', format="auto_gptq", use_triton=True, inplace=inplace)
if 'xpu' in deployment_device:
autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False,
1 change: 0 additions & 1 deletion examples/language-modeling/requirements.txt
@@ -13,7 +13,6 @@ einops
accelerate
datasets
protobuf
auto-gptq
openpyxl
wandb
py-cpuinfo
