add pre-quantization check of awq format
Signed-off-by: Zhang, Weiwei1 <[email protected]>
WeiweiZhang1 committed Dec 30, 2024
2 parents c2b992e + 01b779c commit dcf1e06
Showing 12 changed files with 397 additions and 159 deletions.
20 changes: 0 additions & 20 deletions README.md
@@ -113,26 +113,6 @@ auto-round-fast \

</details>

#### Formats

**AutoRound Format**: This format is well-suited for CPU and HPU devices, 2-bit quantization, and mixed-precision
inference. [2,4] bits are supported. It also benefits from the Marlin kernel, which can notably boost inference
performance. However, it has not yet gained widespread community adoption.

**AutoGPTQ Format**: This format is well-suited for symmetric quantization on CUDA devices and is widely adopted by the
community; [2,3,4,8] bits are supported. It also benefits from the Marlin kernel, which can notably boost inference
performance. However, **the asymmetric kernel has issues** that can cause considerable accuracy drops, particularly for
2-bit quantization and small models. Additionally, symmetric quantization tends to perform poorly at 2-bit precision.

**AutoAWQ Format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely adopted
within the community; only 4-bit quantization is supported. It features specialized layer fusion tailored for Llama
models.
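
For concreteness, a minimal sketch of exporting a quantized model to each of these formats. It assumes the `save_quantized(format=...)` API from the usage example below; the output paths and format strings are treated as assumptions here, not taken from this commit.

```python
# Sketch only: `autoround` is assumed to be an AutoRound object that has
# already run quantize() (see the API usage example below).
autoround.save_quantized("./out_autoround", format="auto_round")  # AutoRound format
autoround.save_quantized("./out_gptq", format="auto_gptq")        # AutoGPTQ format
autoround.save_quantized("./out_awq", format="auto_awq")          # AutoAWQ format, 4-bit only
```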

### API Usage (Gaudi2/CPU/GPU)

```python
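# The original README example is collapsed in this diff view. The following is
# a minimal sketch of typical AutoRound API usage; the model name, bit width,
# group size, and output directory are illustrative assumptions, not the
# collapsed content of this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

model_name = "facebook/opt-125m"  # placeholder model
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

bits, group_size, sym = 4, 128, True
autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym)
autoround.quantize()

# Export; "auto_round" can be swapped for "auto_gptq" or "auto_awq".
output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir, format="auto_round")
```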
6 changes: 3 additions & 3 deletions auto_round/autoround.py
@@ -965,7 +965,7 @@ def quant_layer(self, layer_name, inputs, q_inputs=None, device=torch.device("cp
unwrapper_layer(self.model, wrapper_linear, layer_name, best_params)
mv_module_from_gpu(layer, self.low_cpu_mem_usage)
dump_info = f"quantized {layer_name}, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}"
- logger.info(dump_info)
+ logger.debug(dump_info)

def register_act_max_hook(self, model):
def get_act_max_hook(module, input, output):
@@ -1045,7 +1045,7 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} "
f"layers in the block"
)
- logger.info(dump_info)
+ logger.debug(dump_info)
return output, output

if self.lr_scheduler is None:
@@ -1136,7 +1136,7 @@ def quant_block(self, block, input_ids, input_others, q_input=None, device=torch
f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} "
f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}"
)
- logger.info(dump_info)
+ logger.debug(dump_info)
if len(unquantized_layer_names) != 0:
logger.info(f"{unquantized_layer_names} have not been quantized")
with torch.no_grad():
2 changes: 1 addition & 1 deletion auto_round/backend.py
@@ -145,7 +145,7 @@ def check_auto_round_exllamav2_installed():
BackendInfos['awq:gemm'] = BackendInfo(device=["cuda"], sym=[True, False], ##actrally is gemm
packing_format="awq",
bits=[4], group_size=None,
- priority=4, feature_checks=[feature_num_greater_checker_1024],
+ priority=4,
alias=["auto_awq:gemm", "auto_round:awq:gemm", "auto_round:auto_awq:gemm", "awq",
"auto_awq", "auto_round:awq", "auto_round:auto_awq"],
requirements=["autoawq"]
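
For context, the registry entry above declares the capabilities of the awq GEMM kernel (CUDA only, 4-bit, sym or asym). Below is a minimal sketch of how a pre-quantization check, as the commit title describes, might consult such an entry before quantization starts; the helper name `check_awq_compat`, the attribute access on the registry entry, and the error messages are illustrative assumptions, not code from this commit.

```python
# Illustrative sketch only -- not the implementation added by this commit.
# Assumes BackendInfos is importable from auto_round.backend (the file shown
# above) and that its entries expose `bits` and `sym` as attributes.
from auto_round.backend import BackendInfos


def check_awq_compat(bits: int, sym: bool) -> None:
    """Reject configurations the awq packing format cannot represent."""
    info = BackendInfos["awq:gemm"]
    if bits not in info.bits:
        raise ValueError(f"awq format only supports bits={info.bits}, got bits={bits}")
    if sym not in info.sym:
        raise ValueError(f"awq format only supports sym in {info.sym}, got sym={sym}")


# Example: called before quantization begins, e.g. check_awq_compat(bits=4, sym=False)
```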
