refine AutoRound format and support marlin repacking (#280)
---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
wenhuach21 and pre-commit-ci[bot] authored Oct 21, 2024
1 parent 7cfff96 commit 68138e8
Showing 12 changed files with 1,222 additions and 273 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -193,7 +193,7 @@ asymmetric kernel has issues** that can cause considerable accuracy drops, parti
 models.
 Additionally, symmetric quantization tends to perform poorly at 2-bit precision.
 
-**AutoAWQ Format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted
+**AutoAWQ Format**(>0.3.0): This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted
 within the community, only 4-bits quantization is supported. It features
 specialized layer fusion tailored for Llama models.
@@ -230,13 +230,13 @@ in [Gaudi Guide](https://docs.habana.ai/en/latest/).
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRoundConfig
-device = "auto" ##cpu, hpu, cuda
+backend = "auto" ##cpu, hpu, cuda, cuda:marlin('pip install -v gptqmodel --no-build-isolation')
 quantization_config = AutoRoundConfig(
-    backend=device
+    backend=backend
 )
 quantized_model_path = "./tmp_autoround"
 model = AutoModelForCausalLM.from_pretrained(quantized_model_path,
-                                             device_map=device, quantization_config=quantization_config)
+                                             device_map=backend.split(':')[0], quantization_config=quantization_config)
 tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
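
Below is a minimal end-to-end sketch of the renamed backend option from this hunk, assuming a model has already been quantized to ./tmp_autoround and that gptqmodel is installed for the Marlin kernel (pip install -v gptqmodel --no-build-isolation); the max_new_tokens value is illustrative.

# Minimal sketch: load an AutoRound-quantized model with the Marlin backend.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRoundConfig

backend = "cuda:marlin"  # alternatives per the diff: "cpu", "hpu", "cuda", "auto"
quantization_config = AutoRoundConfig(backend=backend)

quantized_model_path = "./tmp_autoround"
model = AutoModelForCausalLM.from_pretrained(
    quantized_model_path,
    device_map=backend.split(':')[0],  # strip the kernel suffix, leaving "cuda"
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
inputs = tokenizer("There is a girl who likes adventure,", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))  # max_new_tokens is illustrative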
6 changes: 4 additions & 2 deletions auto_round/__main__.py
@@ -92,7 +92,7 @@ def setup_parser():
 
     parser.add_argument("--format", default=None, type=str,
                         help="The format in which to save the model. "
-                             "The options are 'auto_round', 'auto_round:gptq','auto_round:marlin',"
+                             "The options are 'auto_round', 'auto_round:gptq','auto_round:awq',"
                              " 'auto_gptq', 'auto_awq', 'itrex', 'itrex_xpu' and 'fake'."
                              "default to 'auto_round."
                         )
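
For reference, a hypothetical invocation exercising the updated option list (the model path and other required arguments are omitted; comma-separated values export multiple formats, as handled in tune() below):

python -m auto_round --format "auto_round:gptq,auto_round:awq" ...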
@@ -316,7 +316,9 @@ def tune(args):
         format_list = args.format.replace(' ', '').split(',')
         inplace = False if len(format_list) > 1 else True
         for format_ in format_list:
-            eval_folder = f'{export_dir}-{format_}'
+            save_format_ = format_.replace(":", "-")
+            save_format_ = save_format_.replace("_", "-")
+            eval_folder = f'{export_dir}-{save_format_}'
             autoround.save_quantized(eval_folder, format=format_, inplace=inplace)
 
     lm_eval_version = get_library_version("lm-eval")
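
The two replace() calls added above normalize each format name into a filesystem-friendly folder suffix, so names containing ':' or '_' map cleanly onto directory names. A small sketch of the effect, using a hypothetical export_dir value:

export_dir = "./tmp_autoround"  # hypothetical value
for format_ in ["auto_round", "auto_round:gptq", "auto_round:awq"]:
    save_format_ = format_.replace(":", "-").replace("_", "-")
    print(f"{export_dir}-{save_format_}")
# ./tmp_autoround-auto-round
# ./tmp_autoround-auto-round-gptq
# ./tmp_autoround-auto-round-awq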
