Tests WIP
nikita-savelyevv committed Nov 1, 2024
1 parent d20e593 commit fe42f9a
Showing 8 changed files with 377 additions and 43 deletions.
3 changes: 3 additions & 0 deletions nncf/quantization/algorithms/weight_compression/config.py
@@ -54,6 +54,9 @@ def is_integer(self):
def __hash__(self):
return hash((self.mode.value, self.group_size))

def __str__(self):
return f"{self.mode.value}_{self.group_size}"


@dataclass
class WeightCompressionParameters:
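For context, the new __str__ turns a config into a readable identifier such as "int4_sym_128" (mode value plus group size), which is convenient as part of a cache key or a log message. A minimal self-contained sketch of the behaviour, using a reduced stand-in for CompressWeightsMode (illustration only, not the NNCF classes):

from dataclasses import dataclass
from enum import Enum

class CompressWeightsMode(Enum):
    # Illustrative subset of modes; the real enum lives elsewhere in NNCF
    INT4_SYM = "int4_sym"
    INT8_ASYM = "int8_asym"

@dataclass
class WeightCompressionConfig:
    mode: CompressWeightsMode = CompressWeightsMode.INT8_ASYM
    group_size: int = -1

    def __hash__(self):
        return hash((self.mode.value, self.group_size))

    def __str__(self):
        return f"{self.mode.value}_{self.group_size}"

print(WeightCompressionConfig(CompressWeightsMode.INT4_SYM, 128))  # -> int4_sym_128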
@@ -46,6 +46,7 @@
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE
from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight
from nncf.tensor import Tensor
from nncf.tensor.definitions import TensorBackend
@@ -127,6 +128,7 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.
def get_weight_dtype(
self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph
) -> TensorDataType:
# TODO: replace this local mapping with DTYPE_MAP from nncf.tensor.functions.ov
ov_type_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["dtype"]
dtype_map = {
"f16": TensorDataType.float16,
@@ -277,7 +279,6 @@ def _create_compression_subgraph(

if should_add_convert_node:
mul = opset.convert(mul, const_dtype, name=f"{const_node_name}/fq_weights_{weight_port_id}/convert")

return mul, compressed_weight

def transform_model(
@@ -344,6 +345,8 @@ def transform_model(
# reset name_to_node_mapping
self.name_to_node_mapping = None

OV_MODEL_CACHE.clear()

return model

@staticmethod
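The new OV_MODEL_CACHE.clear() above suggests that helper models compiled during compression are memoized in openvino_modeling.py and released once transform_model finishes, so repeated weights with the same shapes and config reuse a single compiled model. A purely hypothetical sketch of such a results cache (class name, decorator and behaviour are assumptions, not the actual implementation):

from typing import Any, Callable, Dict, Tuple

class ResultsCache:
    # Hypothetical memoizing cache keyed by the builder function and its arguments
    def __init__(self) -> None:
        self._cache: Dict[Tuple, Any] = {}

    def wrap(self, fn: Callable) -> Callable:
        def wrapper(*args):
            key = (fn.__qualname__, *args)
            if key not in self._cache:
                self._cache[key] = fn(*args)
            return self._cache[key]
        return wrapper

    def clear(self) -> None:
        self._cache.clear()

OV_MODEL_CACHE = ResultsCache()

@OV_MODEL_CACHE.wrap
def build_model(weight_shape: Tuple[int, ...], config_str: str) -> str:
    # Stand-in for compiling an OpenVINO helper model for the given shape/config
    return f"compiled:{config_str}:{weight_shape}"

print(build_model((16, 32), "int4_sym_128"))  # built once
print(build_model((16, 32), "int4_sym_128"))  # served from the cache
OV_MODEL_CACHE.clear()  # released after transform_model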
@@ -36,7 +36,7 @@
class OVModelParameters:
input_dtype: TensorDataType
output_dtype: Optional[TensorDataType] = None
dynamic_shapes: bool = False
dynamic_shapes: bool = True  # TODO: set back to False once ticket 156511 is resolved
recompile: bool = False
release_memory: bool = True
share_inputs: bool = True
@@ -124,7 +124,8 @@ def get_compress_decompress_weight_model(
) -> ModelCallable:
if ov_model_params.dynamic_shapes:
weight_shape = (-1,) * len(weight_shape)
scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
if scale_shape is not None:
scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
if zero_point_shape is not None:
zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)

@@ -223,19 +224,18 @@ def _build_compress_model(
else:
dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4

compressed_w = opset.round(compressed_w)
compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high)
compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights")

ov_results = [compressed_w]
if len(ov_parameters) != 3:
# Two cases:
# 1. weight -> compressed_weight, scale, (zero_point)
# 2. weight, scale -> compressed_weight, (zero_point)
if len(ov_parameters) == 1:
ov_results.append(scale)

if len(ov_parameters) == 1:
ov_results.append(scale)
if zero_point is not None:
ov_results.append(opset.convert(zero_point, compressed_w.get_element_type()))
zero_point_dtype = compressed_w.get_element_type() if ov_model_params.return_ov_tensors else ov.Type.i32
if zero_point.get_element_type() != zero_point_dtype:
zero_point = opset.convert(zero_point, zero_point_dtype)
ov_results.append(zero_point)

if return_nodes:
return ov_parameters, ov_results
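This helper assembles a small OpenVINO graph (divide, round, clamp, convert to a low-bit type), and the compiled model then acts as a compute kernel for weight compression. A rough standalone illustration of the same pattern, assuming an installed OpenVINO runtime; opset13, the INT8 range and the parameter shapes here are illustrative assumptions, not the NNCF graph:

import numpy as np
import openvino as ov
from openvino.runtime import opset13 as opset

# Toy symmetric INT8 compression graph: clamp(round(w / scale)) -> i8
w = opset.parameter([-1, -1], dtype=ov.Type.f32, name="weight")
scale = opset.parameter([-1, 1], dtype=ov.Type.f32, name="scale")
q = opset.clamp(opset.round(opset.divide(w, scale)), -128, 127)
q = opset.convert(q, ov.Type.i8, name="compressed_weights")
model = ov.Model([q], [w, scale], name="toy_compress")

compiled = ov.Core().compile_model(model, "CPU")
weight = np.random.randn(4, 8).astype(np.float32)
s = np.max(np.abs(weight), axis=1, keepdims=True) / 127
print(compiled([weight, s])[0])  # int8 weights with the same shape as `weight`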
@@ -264,18 +264,13 @@ def _build_compress_decompress_model(
if len(ov_parameters) == 1:
# weight -> compressed_weight, scale, zero_point
compressed_w, scale, zero_point = ov_results
elif len(ov_parameters) == 2:
# weight, scale -> compressed_weight, zero_point
compressed_w, zero_point = ov_results
scale = ov_parameters[1]
else:
# weight, scale, zero_point -> compressed_weight
compressed_w = ov_results[0]
scale, zero_point = ov_parameters[1:]

decompressed_w = scale * opset.convert(
opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32), ov.Type.f32
)
subtract_zero_point = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32)
decompressed_w = scale * opset.convert(subtract_zero_point, ov.Type.f32)
else:
if len(ov_parameters) == 1:
# weight -> compressed_weight, scale
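For reference, the decompression branch above computes scale * (compressed_w - zero_point) via int32 and float32 conversions. A self-contained NumPy toy of the full asymmetric INT4 round trip (group-wise quantize, then dequantize), independent of the OpenVINO graphs built in this file:

import numpy as np

def quantize_int4_asym(weight: np.ndarray, group_size: int = 4):
    # Toy per-group asymmetric INT4 quantization: returns compressed weight, scale, zero point
    level_low, level_high = 0, 15  # unsigned 4-bit levels
    w = weight.reshape(weight.shape[0], -1, group_size)
    w_min = np.minimum(w.min(axis=-1, keepdims=True), 0.0)
    w_max = np.maximum(w.max(axis=-1, keepdims=True), 0.0)
    scale = np.maximum((w_max - w_min) / (level_high - level_low), np.finfo(np.float32).eps)
    zero_point = np.clip(np.round(-w_min / scale), level_low, level_high)
    compressed = np.clip(np.round(w / scale + zero_point), level_low, level_high).astype(np.uint8)
    return compressed, scale, zero_point

w = np.random.rand(2, 8).astype(np.float32) - 0.5
cw, scale, zp = quantize_int4_asym(w)
decompressed = (scale * (cw.astype(np.float32) - zp)).reshape(w.shape)
print(np.max(np.abs(decompressed - w)))  # small per-group quantization error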
@@ -110,7 +110,7 @@ def apply(
graph: NNCFGraph,
statistic_points: Optional[StatisticPointsContainer] = None,
dataset: Optional[Dataset] = None,
) -> Dict[str, Tensor]:
) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]:
"""
Estimates better scale for the int4 nodes in the model.
Minimizes per-group difference between floating point MatMul and
@@ -122,10 +122,10 @@
:param graph: Model graph.
:param statistic_points: Statistic points with collected statistics values.
:param dataset: A representative dataset for the calibration process.
:return: Dict with pairs (weight name, estimated scale).
:return: Two dicts mapping each weight name to its estimated scale and zero point, respectively.
"""

scales = dict()
scales, zero_points = dict(), dict()

for wp in track(self._all_weight_params, description="Applying Scale Estimation"):
weight_name = wp.weight_name
@@ -145,7 +145,7 @@

weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph)

scales[weight_name], _ = self.calculate_quantization_params(
scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params(
self._backend_entity,
stats,
weight,
@@ -157,7 +157,7 @@
self._weight_penalty,
)

return scales
return scales, zero_points

@staticmethod
def calculate_quantization_params(
@@ -352,6 +352,8 @@ def calculate_quantization_params(

if config.group_size == -1:
result_scale = fns.squeeze(result_scale, axis=1)
if zp is not None and config.group_size == -1:
zp = fns.squeeze(zp, axis=1)

return result_scale, zp

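The docstring above summarizes the idea: per-group scales are adjusted so that the MatMul with compressed weights stays close to the floating point MatMul. A toy, self-contained version of that search (a plain grid over scale shrink factors, not the actual NNCF algorithm):

import numpy as np

def toy_scale_search(weight: np.ndarray, activations: np.ndarray, steps: int = 20):
    # Pick the scale shrink factor that minimizes the overall MatMul error
    level_high = 7  # symmetric signed 4-bit
    base_scale = np.max(np.abs(weight), axis=-1, keepdims=True) / level_high
    best_scale, best_err = base_scale, np.inf
    for step in range(1, steps + 1):
        scale = base_scale * step / steps
        q = np.clip(np.round(weight / scale), -level_high - 1, level_high)
        err = np.mean((activations @ weight.T - activations @ (q * scale).T) ** 2)
        if err < best_err:
            best_scale, best_err = scale, err
    return best_scale, best_err

weight = np.random.randn(16, 32).astype(np.float32)
activations = np.random.randn(8, 32).astype(np.float32)
scale, err = toy_scale_search(weight, activations)
print(err)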
34 changes: 22 additions & 12 deletions nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -142,7 +142,9 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val=
return scale


def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division=False) -> Tensor:
def calculate_signed_scale(
weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division: Optional[bool] = True
) -> Tensor:
"""
Calculates the signed scale for symmetric quantization.
@@ -255,7 +257,10 @@ def calculate_normalized_weight_and_fp4_scale(


def calculate_integer_quantization_params(
weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False
weight: Tensor,
reduction_axes: ReductionAxes,
config: WeightCompressionConfig,
invert_division: Optional[bool] = True,
) -> Tuple[Tensor, Tensor]:
"""
Calculates the scale and zero point for uniform quantization (INT4, INT8), when the range of values is divided into
@@ -291,7 +296,7 @@ def calculate_quantized_weight(
config: WeightCompressionConfig,
scale: Tensor,
zero_point: Optional[Tensor] = None,
invert_division=False,
invert_division: Optional[bool] = True,
) -> Tensor:
"""
Quantizes the weight tensor using the provided scale and zero point.
@@ -327,7 +332,10 @@ def calculate_quantized_weight(


def get_integer_quantization_error(
weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False
weight: Tensor,
reduction_axes: ReductionAxes,
config: WeightCompressionConfig,
invert_division: Optional[bool] = True,
) -> float:
"""
Calculates a quantity characterizing the difference between floating point weights and fake quantized
@@ -361,7 +369,7 @@ def compress_weight(
config: WeightCompressionConfig,
precomputed_scale: Tensor = None,
precomputed_zero_point: Tensor = None,
invert_division=False,
invert_division: Optional[bool] = True,
):
"""
Compress weight using compression configuration.
@@ -435,7 +443,7 @@ def do_int_quantization(
reduction_axes: Optional[ReductionAxes] = None,
precomputed_scale: Tensor = None,
precomputed_zero_point: Tensor = None,
invert_division: Optional[bool] = False,
invert_division: Optional[bool] = True,
ov_model_params: Optional = None,
):
"""
@@ -453,6 +461,11 @@
:return: A tuple containing the compressed weights, scale, and zero point tensors.
"""
assert config.is_integer, "The function supports integer quantization only"
if config.is_int_asym and (precomputed_scale is None) != (precomputed_zero_point is None):
raise ValueError(
"If precomputed quantization parameters are provided, both scale and zero point are required "
"for asymmetric quantization."
)

# import os
accelerate_through_ov = (
@@ -528,11 +541,8 @@ def do_int_quantization(
# Scale is always in fp32 so there is no need to store it in ov.Tensor
if scale.backend == TensorBackend.ov:
scale = scale.to_backend(TensorBackend.numpy)
elif precomputed_zero_point is None and config.is_int_asym:
# weight, scale -> compressed_weight, zero_point
compressed_weight, zero_point = model([weight, precomputed_scale])
scale = precomputed_scale
else:
# weight, scale, (zero_point) -> compressed_weight
inputs = (
[weight, precomputed_scale]
if precomputed_zero_point is None
@@ -550,7 +560,7 @@ def calculate_quantized_dequantized_weight(
reduction_axes: Optional[ReductionAxes] = None,
precomputed_scale: Optional[Tensor] = None,
precomputed_zero_point: Optional[Tensor] = None,
invert_division: Optional[bool] = False,
invert_division: Optional[bool] = True,
return_compressed_weight: Optional[bool] = False,
ov_model_params: Optional = None,
) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]:
@@ -606,7 +616,7 @@ def calculate_quantized_dequantized_weight(
if precomputed_zero_point is not None:
inputs.append(precomputed_zero_point)

compressed_weight, scale, zero_point = None, None, None
compressed_weight, scale, zero_point = None, precomputed_scale, precomputed_zero_point
results = model(inputs)
if len(results) == 1:
decompressed_weight = results[0]
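Several defaults in this file flip invert_division from False to True. Assuming the flag selects multiplication by the inverted scale, w * (1 / s), instead of direct division, w / s (which is what the name suggests), the two computations are not bit-identical in float32, so a consistent default matters for reproducibility between the NumPy and OpenVINO paths. A small NumPy check of that difference:

import numpy as np

rng = np.random.default_rng(0)
w = rng.random(1000, dtype=np.float32)
s = rng.random(1000, dtype=np.float32) + 0.5
direct = w / s
inverted = w * (np.float32(1.0) / s)
print(int(np.sum(direct != inverted)))  # typically non-zero: the two paths round differently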
2 changes: 1 addition & 1 deletion nncf/quantization/fake_quantize.py
@@ -344,7 +344,7 @@ def calculate_scale_zero_point(
level_low: int,
level_high: int,
narrow_range: bool,
invert_division: Optional[bool] = False,
invert_division: Optional[bool] = True,
) -> Tuple[Tensor, Tensor]:
"""
Calculates scale and zero_point values for the quantizer.
27 changes: 20 additions & 7 deletions nncf/tensor/functions/ov.py
@@ -17,7 +17,8 @@
from nncf.tensor.functions import numeric

from ..definitions import TensorBackend
from .numpy_numeric import DTYPE_MAP as NP_DTYPE_MAP
from .numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP
from .numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP

DTYPE_MAP = {
TensorDataType.float16: ov.Type.f16,
@@ -40,7 +41,6 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor:
from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model

a_dtype = DTYPE_MAP_REV[a.get_element_type()]
assert a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]

model = get_astype_model(
OVModelParameters(
@@ -65,11 +65,13 @@ def _(a: ov.Tensor) -> TensorBackend:

@numeric.astype.register(ov.Tensor)
def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor:
a_dtype = DTYPE_MAP_REV[a.get_element_type()]
if a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]:
if a.get_element_type() in [ov.Type.bf16, ov.Type.i4, ov.Type.u4] or dtype in [
TensorDataType.bfloat16,
TensorDataType.int4,
TensorDataType.uint4,
]:
return _ov_astype(a, dtype)

return ov.Tensor(a.data.astype(NP_DTYPE_MAP[dtype]))
return ov.Tensor(a.data.astype(DTYPE_MAP_NP[dtype]))


@numeric.dtype.register(ov.Tensor)
@@ -87,8 +89,19 @@ def _(a: ov.Tensor, shape: Union[int, Tuple[int, ...]]) -> ov.Tensor:
return ov.Tensor(a.data.reshape(shape), shape, a.get_element_type())


@numeric.to_backend.register(np.ndarray)
def _(a: np.ndarray, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]:
if b == TensorBackend.numpy:
return a
if b != TensorBackend.ov:
raise ValueError(f"Unsupported backend: {b}")
return ov.Tensor(a, a.shape, DTYPE_MAP[DTYPE_MAP_REV_NP[a.dtype]])


@numeric.to_backend.register(ov.Tensor)
def _(a: ov.Tensor, b: TensorBackend) -> np.ndarray:
def _(a: ov.Tensor, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]:
if b == TensorBackend.ov:
return a
if b != TensorBackend.numpy:
raise ValueError(f"Unsupported backend: {b}")

Expand Down
