set auto_round format as default
wenhuach21 committed Jun 7, 2024
1 parent dbdc4a3 commit 49c0ac1
Showing 5 changed files with 11 additions and 32 deletions.
25 changes: 5 additions & 20 deletions README.md
@@ -57,9 +57,8 @@ pip install auto-round
## Model quantization

### Gaudi2/ CPU/ GPU
By default, we export to the AutoRound format, which supports both CUDA and CPU backends and preserves accuracy for asymmetric quantization. To export to a format compatible with Transformers, save the model in the auto_gptq format.

We found a significant accuracy discrepancy with the qdq model when using the AutoGPTQ GPU backend with asymmetric
quantization in some scenarios, especially at lower bits, such as 2. Please save the quantized model in the AutoRound format to avoid this issue.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -75,7 +74,7 @@ bits, group_size, sym = 4, 128, False
autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, device=None)
autoround.quantize()
output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir) ##save_quantized(output_dir,format=="auto_round")
autoround.save_quantized(output_dir)  ## save_quantized(output_dir, format="auto_gptq")
```
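
For reference, a minimal sketch of the Transformers-compatible export path mentioned above; the output directory name is illustrative, and `autoround` is the AutoRound instance from the snippet above:

```python
## export to the auto_gptq format for loading with stock Transformers/AutoGPTQ tooling
output_dir_gptq = "./tmp_autogptq"
autoround.save_quantized(output_dir_gptq, format="auto_gptq")
```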

<details>
@@ -134,26 +133,12 @@ autoround.save_quantized(output_dir) ##save_quantized(output_dir,format=="auto_r

Please run the quantization code first.

### CPU

```python
##Install the latest https://github.com/intel/intel-extension-for-transformers from source first.
from intel_extension_for_transformers.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

quantized_model_path = "./tmp_autoround"
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, use_fast=True)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
```

### GPU
### CPU/GPU

```python
## Install auto-round first; for the auto_gptq format, also install auto-gptq
from transformers import AutoModelForCausalLM, AutoTokenizer
##from auto_round.auto_quantizer import AutoHfQuantizer ## uncomment it for models with auto_round format
from auto_round.auto_quantizer import AutoHfQuantizer  ## comment this out for models in the auto_gptq format

quantized_model_path = "./tmp_autoround"
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto", trust_remote_code=True)
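## A minimal generation sketch, mirroring the CPU example above; the tokenizer
## loading and generate() calls are standard transformers API.
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path, use_fast=True)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
```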
2 changes: 1 addition & 1 deletion auto_round/autoround.py
@@ -1019,7 +1019,7 @@ def quant_blocks(

torch.cuda.empty_cache()

def save_quantized(self, output_dir=None, format="auto_gptq", inplace=True, **kwargs):
def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs):
"""Save the quantized model to the specified output directory in the specified format.
Args:
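
With the new default, a bare save_quantized call exports the AutoRound format; below is a minimal sketch of both call paths, assuming an AutoRound instance on which quantize() has already been run (directory names are illustrative):

```python
autoround.save_quantized("./tmp_autoround")  ## format defaults to "auto_round"
## the language-modeling example passes inplace=False when it exports more than one format
autoround.save_quantized("./tmp_autoround_gptq", format="auto_gptq", inplace=False)
```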
2 changes: 1 addition & 1 deletion auto_round/export/export_to_autoround/export.py
@@ -180,7 +180,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="autoround:exl
save(model, output_dir)


def save(model: nn.Module, save_dir: str, max_shard_size: str = "10GB", safe_serialization: bool = True):
def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_serialization: bool = True):
"""Save model state dict and configs.
Args:
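
The smaller shard cap only affects how the checkpoint is split across files on disk. As a point of reference, a hedged sketch of the equivalent transformers-level behaviour, assuming `model` is any transformers PreTrainedModel (the directory name is illustrative):

```python
## illustrative only: save_pretrained accepts the same max_shard_size / safe_serialization
## parameters and splits the state dict into shards no larger than 5GB
model.save_pretrained("./my_output_dir", max_shard_size="5GB", safe_serialization=True)
```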
13 changes: 4 additions & 9 deletions examples/language-modeling/main.py
@@ -300,13 +300,6 @@ def get_library_version(library_name):
print(f"warning, disable_low_gpu_mem_usage is strongly recommended if the whole model could be loaded to "
f"gpu")
deployment_device = args.deployment_device.split(',')
gpu_format = "auto_gptq"
if 'gpu' in deployment_device:
if lm_head_layer_name in weight_config.keys() and weight_config[lm_head_layer_name]["data_type"] == "int":
gpu_format = "auto_round"

if "autoround" in deployment_device or "auto-round" in deployment_device or "auto_round" in deployment_device:
gpu_format = "auto_round"

autoround = round(model, tokenizer, args.bits, args.group_size, sym=args.sym, batch_size=args.train_bs,
dataset=args.dataset, seqlen=seqlen, n_blocks=args.n_blocks, iters=args.iters, lr=args.lr,
@@ -327,8 +320,10 @@ def get_library_version(library_name):
output_dir = args.output_dir + "/" + model_name.split('/')[-1] + f"-autoround-w{args.bits}g{args.group_size}-qdq"

inplace = True if len(deployment_device) < 2 else False
if 'gpu' in deployment_device or "auto_round" in gpu_format or "auto-round" in gpu_format:
autoround.save_quantized(f'{export_dir}-gpu', format=gpu_format, use_triton=True, inplace=inplace)
if 'gpu' in deployment_device:
autoround.save_quantized(f'{export_dir}-gpu', format="auto_round", use_triton=True, inplace=inplace)
if "auto_gptq" in deployment_device:
autoround.save_quantized(f'{export_dir}-gptq', format="auto_gptq", use_triton=True, inplace=inplace)
if 'xpu' in deployment_device:
autoround.save_quantized(f'{export_dir}-xpu', format="itrex_xpu", use_triton=True, inplace=inplace,
compression_dtype=torch.int8, compression_dim=0, use_optimum_format=False,
1 change: 0 additions & 1 deletion examples/language-modeling/requirements.txt
@@ -13,7 +13,6 @@ einops
accelerate
datasets
protobuf
auto-gptq
openpyxl
wandb
py-cpuinfo
