From db3a935dead587ac90d739f74f8aea04aae97824 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Wed, 30 Oct 2024 11:39:26 +0100
Subject: [PATCH] [AWQ] Cast `fns.quantile()` result to float32 (#3044)

### Changes

Cast the `fns.quantile()` result to float32 inside the AWQ algorithm.

### Reason for changes

`fns.quantile()` for the numpy backend returns an `np.float64` value. In AWQ this value is used as the lower clip bound, so the clipped result becomes float64, and through a chain reaction the weights and activations end up being converted to float64 as well. As I understand, processing in float64 is not necessary, while it does increase the running time. Below are compression time measurements with AWQ enabled before and after the change.

| Model           | develop (sec.) | branch (sec.) |
|-----------------|----------------|---------------|
| tiny-llama-1.1b | 123            | 109 (-11%)    |
| phi3_mini-3.7b  | 487            | 419 (-14%)    |
| llama3-8b       | 1091           | 912 (-16%)    |
---
 nncf/quantization/algorithms/weight_compression/awq.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nncf/quantization/algorithms/weight_compression/awq.py b/nncf/quantization/algorithms/weight_compression/awq.py
index 1b43f5339c4..ee5237eb8e2 100644
--- a/nncf/quantization/algorithms/weight_compression/awq.py
+++ b/nncf/quantization/algorithms/weight_compression/awq.py
@@ -36,6 +36,7 @@
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization
 from nncf.quantization.passes import transform_to_inference_graph
+from nncf.tensor import TensorDataType
 from nncf.tensor import functions as fns
 
 TModel = TypeVar("TModel")
@@ -241,7 +242,7 @@ def apply(
                 offset = gi * group_size
                 gscale = s[offset : offset + group_size]
 
-                a_min = fns.quantile(gscale, 0.1)
+                a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32)
                 a_max = 1e2
                 gscale = fns.clip(gscale, a_min=a_min, a_max=a_max)
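
For illustration, below is a minimal NumPy-only sketch of the dtype propagation described above. It is not NNCF code: a plain `float32` ndarray stands in for the NNCF `Tensor`, and the quantile result is materialized as `np.float64` explicitly (mirroring what the description says `fns.quantile()` returns for the numpy backend). The exact promotion behavior of `np.clip` depends on the NumPy version; NumPy 2.x promotion rules (NEP 50) are assumed here.

```python
import numpy as np

# Stand-in for the per-group scale: a plain float32 ndarray instead of an NNCF Tensor.
gscale = np.random.rand(128).astype(np.float32)

# Per the PR description, fns.quantile() on the numpy backend yields np.float64;
# materialize that explicitly so the sketch does not depend on np.quantile's own
# output dtype.
a_min = np.float64(np.quantile(gscale, 0.1))

# With a float64 lower bound, np.clip promotes the result to float64 under
# NumPy 2.x promotion rules, and float64 then propagates to every downstream
# computation on the scales, weights, and activations.
clipped = np.clip(gscale, a_min, 1e2)
print(clipped.dtype)  # float64 (on NumPy 2.x)

# The fix mirrors the patch: cast the bound back to float32 before clipping,
# so the pipeline stays in float32.
clipped_fp32 = np.clip(gscale, np.float32(a_min), 1e2)
print(clipped_fp32.dtype)  # float32
```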