From db3a935dead587ac90d739f74f8aea04aae97824 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Wed, 30 Oct 2024 11:39:26 +0100
Subject: [PATCH] [AWQ] Cast `fns.quantile()` result to float32 (#3044)

### Changes

Cast the `fns.quantile()` result to float32 inside the AWQ algorithm.

### Reason for changes

`fns.quantile()` for the numpy backend returns an `np.float64` value. In AWQ this value is used as the lower clip bound, so the clipped result becomes float64, and through a chain reaction the weights and activations end up being converted to float64 as well. As I understand, processing in float64 is not necessary, while it does increase the running time. Below are compression time measurements with AWQ enabled before and after the change.

| Model           | develop (sec.) | branch (sec.) |
|-----------------|----------------|---------------|
| tiny-llama-1.1b | 123            | 109 (-11%)    |
| phi3_mini-3.7b  | 487            | 419 (-14%)    |
| llama3-8b       | 1091           | 912 (-16%)    |
---
 nncf/quantization/algorithms/weight_compression/awq.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nncf/quantization/algorithms/weight_compression/awq.py b/nncf/quantization/algorithms/weight_compression/awq.py
index 1b43f5339c4..ee5237eb8e2 100644
--- a/nncf/quantization/algorithms/weight_compression/awq.py
+++ b/nncf/quantization/algorithms/weight_compression/awq.py
@@ -36,6 +36,7 @@
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization
 from nncf.quantization.passes import transform_to_inference_graph
+from nncf.tensor import TensorDataType
 from nncf.tensor import functions as fns
 
 TModel = TypeVar("TModel")
@@ -241,7 +242,7 @@ def apply(
                 offset = gi * group_size
                 gscale = s[offset : offset + group_size]
 
-                a_min = fns.quantile(gscale, 0.1)
+                a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32)
                 a_max = 1e2
                 gscale = fns.clip(gscale, a_min=a_min, a_max=a_max)
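
For illustration, below is a minimal NumPy-only sketch of the dtype propagation described above. It is not NNCF code: a plain `float32` ndarray stands in for the NNCF `Tensor`, and the quantile result is materialized as `np.float64` explicitly (mirroring what the description says `fns.quantile()` returns for the numpy backend). The exact promotion behavior of `np.clip` depends on the NumPy version; NumPy 2.x promotion rules (NEP 50) are assumed here.

```python
import numpy as np

# Stand-in for the per-group scale: a plain float32 ndarray instead of an NNCF Tensor.
gscale = np.random.rand(128).astype(np.float32)

# Per the PR description, fns.quantile() on the numpy backend yields np.float64;
# materialize that explicitly so the sketch does not depend on np.quantile's own
# output dtype.
a_min = np.float64(np.quantile(gscale, 0.1))

# With a float64 lower bound, np.clip promotes the result to float64 under
# NumPy 2.x promotion rules, and float64 then propagates to every downstream
# computation on the scales, weights, and activations.
clipped = np.clip(gscale, a_min, 1e2)
print(clipped.dtype)  # float64 (on NumPy 2.x)

# The fix mirrors the patch: cast the bound back to float32 before clipping,
# so the pipeline stays in float32.
clipped_fp32 = np.clip(gscale, np.float32(a_min), 1e2)
print(clipped_fp32.dtype)  # float32
```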