openvinotoolkit · nikita-savelyevv · Jul 3, 2024 · Oct 22, 2024 · Oct 23, 2024 · Oct 24, 2024
@@ -145,6 +145,7 @@ def collect_api_entities() -> APIInfo:
     "nncf.tensor.functions.torch_linalg",
     "nncf.tensor.functions.torch_io",
     "nncf.tensor.functions.numpy_io",
+    "nncf.tensor.functions.ov",
 ]
 
 with mock(mock_modules):

@@ -12,6 +12,7 @@
 import logging
 import sys
 from contextlib import contextmanager
+from functools import lru_cache
 
 NNCF_LOGGER_NAME = "nncf"
 
@@ -86,3 +87,13 @@ def warn_bkc_version_mismatch(backend: str, bkc_version: str, current_version: s
         f"while current {backend} version is {current_version}. "
         f"If you encounter issues, consider switching to {backend}{bkc_version}"
     )
+
+
+@lru_cache(maxsize=None)
+def log_once(level: int, message: str) -> None:
+    """
+    Sends a message to the NNCF logger, at most once per distinct combination of arguments.
+
+    Repeated calls with the same arguments are answered from the unbounded `lru_cache` and never reach the logger.
+    :param level: Logging level, e.g. logging.WARNING.
+    :param message: The message to log.
+    """
+    nncf_logger.log(level, message)
@@ -9,6 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import inspect
 from importlib import import_module
 from typing import Any, Callable, Dict, List
 
@@ -51,3 +52,62 @@ def wrapped_f(*args: Any, **kwargs: Any):  # type: ignore
         return wrapped_f
 
     return wrap
+
+
+class ResultsCacheContainer:
+    """
+    Storage for the results of functions decorated with the @cache_results decorator.
+
+    Besides the cached values, the container tracks how many times each cached value was read.
+    """
+
+    def __init__(self) -> None:
+        # Number of reads per key; reset to zero whenever the key is (re)written
+        self._access_count: Dict[Any, int] = {}
+        # Cached values per key
+        self._cache: Dict[Any, Any] = {}
+
+    def clear(self) -> None:
+        """
+        Drops all cached values together with their access counters.
+        """
+        for storage in (self._cache, self._access_count):
+            storage.clear()
+
+    def is_empty(self) -> bool:
+        """
+        :return: True if no values are stored, False otherwise.
+        """
+        return not self._cache
+
+    def __getitem__(self, item: Any) -> Any:
+        value = self._cache[item]
+        self._access_count[item] += 1
+        return value
+
+    def __setitem__(self, key: Any, value: Any) -> None:
+        self._cache[key] = value
+        self._access_count[key] = 0
+
+    def __contains__(self, item: Any) -> bool:
+        return item in self._cache
+
+
+def cache_results(cache: ResultsCacheContainer) -> Callable:  # type: ignore
+    """
+    Decorator to cache the results of a function.
+
+    The decorated function additionally accepts a `disable_caching` keyword argument. If it is True, the cache is
+    bypassed completely: the result is recomputed even if a cached value exists, and it is not stored in the cache.
+
+    The cache key is built from the function name and the call arguments (positional arguments are matched to
+    parameter names), so all argument values must be hashable.
+    :param cache: A cache container where results will be stored.
+    :return: A decorator that adds caching to the function it is applied to.
+    """
+    # Imported locally so that this block does not depend on the module-level import list.
+    from functools import wraps
+
+    def decorator(func: Callable) -> Callable:  # type: ignore
+        # The parameter names never change, so resolve the signature once instead of on every call.
+        param_names = tuple(inspect.signature(func).parameters)
+
+        @wraps(func)
+        def wrapper(*args, disable_caching: bool = False, **kwargs) -> Any:  # type: ignore
+            if disable_caching:
+                return func(*args, **kwargs)
+            # Map positional arguments onto parameter names so that f(1) and f(x=1) share one cache entry.
+            call_args = dict(zip(param_names, args))
+            call_args.update(kwargs)
+            cache_key = (func.__name__, frozenset(call_args.items()))
+            if cache_key in cache:
+                return cache[cache_key]
+            result = func(*args, **kwargs)
+            cache[cache_key] = result
+            return result
+
+        return wrapper
+
+    return decorator
@@ -0,0 +1,36 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# `import importlib` alone does not guarantee that the `importlib.util` submodule is loaded.
+import importlib.util
+
+# Whether the openvino package can be located; reset to False below if its runtime fails to import.
+_openvino_available = importlib.util.find_spec("openvino") is not None
+_openvino_version = "N/A"
+if _openvino_available:
+    try:
+        from openvino.runtime import get_version
+
+        version = get_version()
+        # avoid invalid format
+        if "-" in version:
+            # Keep only "<part before the first dash>-<next dash-separated token>" and drop any further suffixes.
+            ov_major_version, dev_info = version.split("-", 1)
+            commit_id = dev_info.split("-")[0]
+            version = f"{ov_major_version}-{commit_id}"
+        _openvino_version = version
+    except ImportError:
+        # The package was found but could not be imported, so treat it as unavailable.
+        _openvino_available = False
+
+
+def is_openvino_available() -> bool:
+    """
+    Check if OpenVINO is available.
+
+    Reports the module-level `_openvino_available` flag; no detection is performed at call time.
+    :return: True if openvino package is installed, False otherwise.
+    """
+    return _openvino_available
@@ -14,6 +14,7 @@
 import numpy as np
 import openvino.runtime as ov
 import openvino.runtime.opset13 as opset
+from openvino._pyopenvino.op import Constant
 
 import nncf
 from nncf.common.graph.graph import NNCFGraph
@@ -41,6 +42,8 @@
 from nncf.openvino.graph.metatypes.openvino_metatypes import OVMatMulMetatype
 from nncf.openvino.graph.metatypes.openvino_metatypes import OVOpMetatype
 from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype
+from nncf.tensor import Tensor
+from nncf.tensor import TensorBackend
 
 InplaceInsertionFnType = Callable[[ov.Node, int, str], ov.Node]
 
@@ -107,16 +110,17 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int:
     return cnt_if_op(model, 0)
 
 
-def get_const_value(const_node: ov.Node) -> np.ndarray:
+def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: bool = True) -> np.ndarray:
     """
     Returns the constant tensor for the node.
     This method is applicable only for the floating-point constant data.
 
     :param const_node: OpenVINO node.
+    :param cast_bf16_to_fp32: Whether to cast bf16 node data to fp32 or not. If False and the node contains bf16 data,
+        the resulting bf16 value will be returned encoded inside a numpy.float16 array.
     :return: The constant value.
     """
-    if const_node.get_element_type() == ov.Type.bf16:
-        # Fixed FP32 data type as the result for BF16 constant
+    if const_node.get_element_type() == ov.Type.bf16 and cast_bf16_to_fp32:
+        # Request an fp32 copy of the bf16 data instead of returning the raw constant payload
         return const_node.get_data(dtype=np.float32)
     return const_node.data
 
@@ -631,3 +635,41 @@ def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: Tuple
         channel_axis = activations_layout.index(OVLayoutElem.C_IN)
 
     return channel_axis
+
+
+def convert_if_needed(node: ov.Node, target_dtype: ov.Type) -> ov.Node:
+    """
+    Makes sure the node has the requested element type, adding a convert operation only when it is required.
+
+    :param node: The node whose element type is checked.
+    :param target_dtype: The element type the returned node must have.
+    :return: The input node itself if its element type already equals the target one, otherwise the result of
+        `opset.convert` applied to the node.
+    """
+    already_in_target_dtype = node.get_element_type() == target_dtype
+    return node if already_in_target_dtype else opset.convert(node, target_dtype)
+
+
+def non_convertable_divide(a: ov.Node, b: ov.Node) -> ov.Node:
+    """
+    Builds the division a / b and marks it as "non-convertable", so that it won't be converted to a*(1/b).
+
+    :param a: Dividend node.
+    :param b: Divisor node.
+    :return: The divide node with the "nonconvertable_divide_0" runtime info entry set to True.
+    """
+    quotient = a / b
+    rt_info = quotient.get_rt_info()
+    rt_info["nonconvertable_divide_0"] = True
+    return quotient
+
+
+def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = None) -> Constant:
+    """
+    Builds an OpenVINO Constant node holding the data of the given tensor.
+
+    :param x: Data tensor. Supports NumPy and OV tensor backends. For the OV backend the constant is created directly
+        from the underlying OV tensor, whose element type is expected to already equal `dtype`.
+    :param dtype: Data type of the constant.
+    :param name: Optional name of the constant.
+    :return: OpenVINO Constant node.
+    """
+    if x.backend != TensorBackend.ov:
+        # For other backends the requested data type is passed on to the constant factory
+        return opset.constant(x.data, dtype=dtype, name=name)
+    # An OV tensor is wrapped as is, so its element type must already be the requested one
+    assert x.data.get_element_type() == dtype
+    return opset.constant(x.data, name=name)
@@ -31,8 +31,7 @@
 from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
 from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_scale
-from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization
-from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization
+from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization
 from nncf.quantization.passes import transform_to_inference_graph
@@ -262,10 +261,9 @@ def apply(
                         g_compressed_weighs = do_nf4_quantization(weights_to_fake_quantize, g_c_scale)
                         g_decompressed_weighs = do_nf4_dequantization(g_compressed_weighs, g_c_scale)
                     else:
-                        g_compressed_weighs, g_c_scale, g_c_zp = do_int_quantization(
-                            weights_to_fake_quantize, reduction_axis, awq_config
+                        g_decompressed_weighs = calculate_quantized_dequantized_weight(
+                            weights_to_fake_quantize, awq_config, reduction_axis
                         )
-                        g_decompressed_weighs = do_int_dequantization(g_compressed_weighs, g_c_scale, g_c_zp)
                     sacts = gacts / fns.unsqueeze(cur_scale, 1)
 
                     cur_out = fns.matmul(g_decompressed_weighs, sacts)

@@ -40,12 +40,23 @@ def num_bits(self):
         """
         return 8 if self.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM] else 4
 
+    @property
+    def is_int_asym(self):
+        """
+        :return: True if the compression mode is INT4_ASYM or INT8_ASYM, else False.
+        """
+        asymmetric_modes = (CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT8_ASYM)
+        return self.mode in asymmetric_modes
+
+    @property
     def is_integer(self):
         """
-        :return: True if compression type in integer, else False.
+        :return: True if compression type is integer, else False.
         """
         return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1]
 
+    def __hash__(self):
+        """
+        :return: Hash computed from the compression mode value and the group size.
+        """
+        identity = (self.mode.value, self.group_size)
+        return hash(identity)
+
+    def __str__(self):
+        """
+        :return: The compression mode value and the group size joined with an underscore.
+        """
+        return "{}_{}".format(self.mode.value, self.group_size)
+
 
 @dataclass
 class WeightCompressionParameters:

@@ -267,7 +267,6 @@ def _quantize_weights(
                             activations = [inp[..., (i1 + i) : (i1 + i + group_size)] for inp in inputs]
                             wc_statistics = ScaleEstimation.activations_to_wc_statistics(activations)
                             scale, zero_point = ScaleEstimation.calculate_quantization_params(
-                                self._backend_entity,
                                 wc_statistics,
                                 weight_tensor[:, (i1 + i) : (i1 + i + group_size)],
                                 reduction_axes,

@@ -24,6 +24,7 @@
 from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization
@@ -105,7 +106,7 @@ def is_applicable(self, wc_params: WeightCompressionParameters):
         return wc_params.compression_config.num_bits == 4
 
     def calculate_adapters(
-        self, weight: Tensor, compressed_weight: Tensor, wc_params: WeightCompressionParameters
+        self, weight: Tensor, compressed_weight: CompressedWeight, wc_params: WeightCompressionParameters
     ) -> Tuple[Tensor, Tensor, List[float]]:
         """
         Calculates low rank matrices for a given original and compressed weights.
@@ -134,7 +135,7 @@ def calculate_adapters(
     @staticmethod
     def calculate_low_rank_matrices(
         weight: Tensor,
-        compressed_weight: Tensor,
+        compressed_weight: CompressedWeight,
         compression_config: WeightCompressionConfig,
         reduction_axes: Tuple[int, ...],
         lora_correction_params: AdvancedLoraCorrectionParameters,

@@ -354,7 +354,7 @@ def _calc_weight_sensitivity(
         if weight.dtype != TensorDataType.float32:
             weight = weight.astype(TensorDataType.float32)
 
-        compressed_weights, scale, zero_point = do_int_quantization(weight, reduction_axes, backup_config)
+        compressed_weights, scale, zero_point = do_int_quantization(weight, backup_config, reduction_axes)
         decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point)
         decompressed_weight = decompressed_weight.reshape(orig_shape)
         return fns.linalg.norm(decompressed_weight - weight, ord="fro").item()