Initial draft. Rebased.

openvinotoolkit · Jul 11, 2024 · 6b98ddd · 6b98ddd
1 parent 40233c0
commit 6b98ddd
Show file tree

Hide file tree

Showing 7 changed files with 232 additions and 102 deletions.
diff --git a/nncf/common/logging/logger.py b/nncf/common/logging/logger.py
@@ -12,6 +12,7 @@
 import logging
 import sys
 from contextlib import contextmanager
+from functools import lru_cache
 
 NNCF_LOGGER_NAME = "nncf"
 
@@ -86,3 +87,8 @@ def warn_bkc_version_mismatch(backend: str, bkc_version: str, current_version: s
         f"while current {backend} version is {current_version}. "
         f"If you encounter issues, consider switching to {backend}{bkc_version}"
     )
+
+
+@lru_cache(maxsize=None)
+def log_once(level, message):
+    """
+    Logs `message` at `level` through `nncf_logger`, at most once per distinct (level, message) pair.
+
+    The unbounded `lru_cache` remembers every argument pair it has already seen, so a repeated call
+    with the same arguments returns the cached result and does not reach the logger again.
+
+    :param level: Logging level forwarded to `nncf_logger.log`. Must be hashable.
+    :param message: Message forwarded to `nncf_logger.log`. Must be hashable.
+    """
+    nncf_logger.log(level, message)
diff --git a/nncf/openvino/quantization/compression_primitives.py b/nncf/openvino/quantization/compression_primitives.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple
+
+import openvino as ov
+from openvino.runtime import opset13 as opset
+
+from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+
+
+class OVCompressionPrimitiveCache:
+    """
+    Stores callables backed by compiled OpenVINO models that compress (and optionally decompress) weights.
+
+    A model is built and compiled the first time a particular combination of compression config
+    and tensor shapes is requested; later requests with the same combination reuse the stored callable.
+    """
+
+    def __init__(self):
+        # Each dictionary maps a key tuple (assembled in the corresponding getter) to a callable.
+        self._compress_weight_model_cache = {}
+        self._compress_decompress_weight_model_cache = {}
+
+    def get_compress_weight_primitive(
+        self,
+        config: WeightCompressionConfig,
+        weight_shape: Tuple,
+        scale_shape: Tuple,
+        zero_point_shape: Optional[Tuple] = None,
+        invert_scale: Optional[bool] = False,
+    ):
+        """
+        Returns a callable that compresses a weight, building and caching it on the first request.
+
+        The cache key consists of the config mode, the number of bits, the weight and scale shapes,
+        the `invert_scale` flag and, when given, the zero point shape. All of them must be hashable.
+
+        :param config: Compression config; its `mode` and `num_bits` attributes are read here.
+        :param weight_shape: Shape of the weight input.
+        :param scale_shape: Shape of the scale input.
+        :param zero_point_shape: Shape of the zero point input, or None when there is no zero point.
+        :param invert_scale: Whether the weight is multiplied by the reciprocal of the scale instead of
+            being divided by the scale.
+        :return: Callable that takes the list of model inputs and returns the first model output.
+        """
+        cache = self._compress_weight_model_cache
+        cache_key = (config.mode, config.num_bits, weight_shape, scale_shape, invert_scale)
+        if zero_point_shape is not None:
+            cache_key = cache_key + (zero_point_shape,)
+
+        if cache_key in cache:
+            return cache[cache_key]
+
+        primitive = self._build_compress_model(config, weight_shape, scale_shape, zero_point_shape, invert_scale)
+        cache[cache_key] = primitive
+        return primitive
+
+    def get_compress_decompress_weight_primitive(
+        self,
+        config: WeightCompressionConfig,
+        weight_shape: Tuple,
+        scale_shape: Tuple,
+        zero_point_shape: Optional[Tuple] = None,
+    ):
+        """
+        Returns a callable that compresses and then decompresses a weight, building and caching it
+        on the first request.
+
+        The cache key consists of the config mode, the number of bits, the weight and scale shapes
+        and, when given, the zero point shape. All of them must be hashable.
+
+        :param config: Compression config; its `mode` and `num_bits` attributes are read here.
+        :param weight_shape: Shape of the weight input.
+        :param scale_shape: Shape of the scale input.
+        :param zero_point_shape: Shape of the zero point input, or None when there is no zero point.
+        :return: Callable that takes the list of model inputs and returns the first model output.
+        """
+        cache = self._compress_decompress_weight_model_cache
+        cache_key = (config.mode, config.num_bits, weight_shape, scale_shape)
+        if zero_point_shape is not None:
+            cache_key = cache_key + (zero_point_shape,)
+
+        if cache_key in cache:
+            return cache[cache_key]
+
+        primitive = self._build_compress_decompress_model(config, weight_shape, scale_shape, zero_point_shape)
+        cache[cache_key] = primitive
+        return primitive
+
+    @staticmethod
+    def _compile_to_callable(output_node, graph_inputs):
+        """
+        Wraps `output_node` into an OpenVINO model, compiles it for the CPU device and returns a callable.
+
+        :param output_node: Node used as the single model output.
+        :param graph_inputs: List of parameter nodes used as the model inputs.
+        :return: Callable that takes the list of model inputs and returns the first model output.
+        """
+        compiled_model = ov.compile_model(ov.Model([output_node], graph_inputs), device_name="CPU")
+
+        def run(parameters):
+            return compiled_model(parameters)[0]
+
+        return run
+
+    @staticmethod
+    def _build_compress_model(
+        config: WeightCompressionConfig,
+        weight_shape: Tuple,
+        scale_shape: Tuple,
+        zero_point_shape: Optional[Tuple] = None,
+        invert_scale: Optional[bool] = False,
+        return_nodes: bool = False,
+    ):
+        """
+        Builds the graph round(w / s [+ zp]) clamped to the quantization range.
+
+        With a zero point the range is [0, 2^num_bits - 1]; without it the range is
+        [-2^(num_bits - 1), 2^(num_bits - 1) - 1].
+
+        :param config: Compression config; only its `num_bits` attribute is read here.
+        :param weight_shape: Shape of the "w" parameter.
+        :param scale_shape: Shape of the "s" parameter.
+        :param zero_point_shape: Shape of the "zp" parameter, or None to build a graph without a zero point.
+        :param invert_scale: Whether to compute w * (1 / s) instead of w / s.
+        :param return_nodes: Whether to return the graph nodes instead of a compiled callable.
+        :return: The pair (list of parameter nodes, clamp node) if `return_nodes` is True, otherwise a
+            callable that takes the list of model inputs and returns the first model output.
+        """
+        weight = opset.parameter(weight_shape, name="w")
+        scale = opset.parameter(scale_shape, name="s")
+        graph_inputs = [weight, scale]
+
+        scaled_weight = weight * (1 / scale) if invert_scale else weight / scale
+
+        bit_width = config.num_bits
+        if zero_point_shape is None:
+            # No zero point: signed range centered around zero.
+            level_low, level_high = -(2 ** (bit_width - 1)), 2 ** (bit_width - 1) - 1
+        else:
+            # Zero point present: unsigned range, and the zero point becomes a third model input.
+            level_low, level_high = 0, 2**bit_width - 1
+            zero_point = opset.parameter(zero_point_shape, name="zp")
+            graph_inputs.append(zero_point)
+            scaled_weight += zero_point
+
+        quantized = opset.clamp(opset.round(scaled_weight), level_low, level_high, name="compressed_weights")
+
+        if return_nodes:
+            return graph_inputs, quantized
+
+        return OVCompressionPrimitiveCache._compile_to_callable(quantized, graph_inputs)
+
+    @staticmethod
+    def _build_compress_decompress_model(
+        config: WeightCompressionConfig,
+        weight_shape: Tuple,
+        scale_shape: Tuple,
+        zero_point_shape: Optional[Tuple] = None,
+    ):
+        """
+        Builds and compiles the graph that compresses a weight and immediately decompresses it back.
+
+        The compression part comes from `_build_compress_model` called without `invert_scale`, so the
+        weight is divided by the scale. The decompression part is (q - zp) * s with a zero point and
+        q * s without it.
+
+        :param config: Compression config forwarded to `_build_compress_model`.
+        :param weight_shape: Shape of the "w" parameter.
+        :param scale_shape: Shape of the "s" parameter.
+        :param zero_point_shape: Shape of the "zp" parameter, or None to build a graph without a zero point.
+        :return: Callable that takes the list of model inputs and returns the first model output.
+        """
+        graph_inputs, quantized = OVCompressionPrimitiveCache._build_compress_model(
+            config, weight_shape, scale_shape, zero_point_shape, return_nodes=True
+        )
+
+        scale = graph_inputs[1]
+        if len(graph_inputs) == 3:
+            # The third parameter is the zero point; it is subtracted before rescaling.
+            dequantized = (quantized - graph_inputs[2]) * scale
+        else:
+            dequantized = quantized * scale
+
+        return OVCompressionPrimitiveCache._compile_to_callable(dequantized, graph_inputs)
+
+
+# Module-level cache instance created at import time; code that imports this name reuses the
+# primitives stored in it instead of building and compiling them again.
+OV_COMPRESSION_PRIMITIVE_CACHE = OVCompressionPrimitiveCache()
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -33,7 +33,6 @@
 from nncf.quantization.algorithms.weight_compression.awq_patterns import get_awq_patterns
 from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend
 from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
-from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
 from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight
 from nncf.tensor import Tensor
@@ -227,55 +226,6 @@ def dump_parameters(
     ) -> None:
         dump_parameters(model, parameters, algo_name, path)
 
-    @staticmethod
-    def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None):
-        parameters, clamp = OVWeightCompressionAlgoBackend.get_compress_pipeline(
-            config, w_shape, s_shape, z_p_shape, True
-        )
-
-        if len(parameters) == 3:
-            _, s, zp = parameters
-            result = (clamp - zp) * s
-        else:
-            s = parameters[1]
-            result = clamp * s
-
-        model = ov.Model([result], parameters)
-
-        compiled_model = ov.compile_model(model, device_name="CPU")
-
-        return lambda parameters: compiled_model(parameters)[0]
-
-    @staticmethod
-    def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False):
-        mode = config.mode
-        assert mode in [CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM]
-        num_bits = config.num_bits
-
-        asym_quant = mode in [CompressWeightsMode.INT4_ASYM]
-        level_low = 0 if asym_quant else -(2 ** (num_bits - 1))
-        level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1
-
-        w = opset.parameter(w_shape, name="w")
-        s = opset.parameter(s_shape, name="s")
-        parameters = [w, s]
-        compressed_w = w / s
-        if z_p_shape is not None:
-            zp = opset.parameter(z_p_shape, name="zp")
-            parameters.append(zp)
-            compressed_w += zp
-
-        result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights")
-
-        if return_nodes:
-            return parameters, result
-
-        model = ov.Model([result], parameters)
-
-        compiled_model = ov.compile_model(model, device_name="CPU")
-
-        return lambda parameters: compiled_model(parameters)[0]
-
 
 class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend):
     @staticmethod

diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -20,6 +20,8 @@
 from nncf.common.utils.backend import BackendType
 from nncf.common.utils.backend import get_backend
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight
+from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_dequantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization
@@ -117,7 +119,6 @@ def apply(
         :return: Dict with pairs (weight name, estimated scale).
         """
 
-        compress_decompress_cache = {}
         res = dict()
 
         for wp in track(self._all_weight_params, description="Applying Scale Estimation"):
@@ -201,38 +202,14 @@ def apply(
             if self._weight_penalty > 0.0:
                 min_max_scale_diffs += self._weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1)
 
-            zp_shape = zp.shape if zp is not None else None
-            key = [(wp.compression_config.mode, wp.compression_config.num_bits) + q_weights.shape + scale.shape]
-            if zp is not None:
-                key += zp_shape
-            key = tuple(key)
-            if key in compress_decompress_cache:
-                compress_decompress_model = compress_decompress_cache[key]["compress_decompress_model"]
-                compress_model = compress_decompress_cache[key]["compress_model"]
-            else:
-                compress_decompress_model = self._backend_entity.get_compress_decompress_pipeline(
-                    wp.compression_config, q_weights.shape, scale.shape, zp_shape
-                )
-                compress_model = self._backend_entity.get_compress_pipeline(
-                    wp.compression_config, q_weights.shape, scale.shape, zp_shape
-                )
-                compress_decompress_cache[key] = {
-                    "compress_decompress_model": compress_decompress_model,
-                    "compress_model": compress_model,
-                }
-
             zero_scale = 0.001
             zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
 
-            input_tensors = [original_weight.data, None]
-            if zp is not None:
-                input_tensors.append(zp.data)
             # iterative rectification of initial scale
             for i in range(self._initial_steps):
                 near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
-                input_tensors[1] = near_to_ideal_scale.data
 
-                out = compress_decompress_model(input_tensors)
+                out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp)
                 q_weights_ = fns.zeros_like(original_weight) + out
                 q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X)
 
@@ -255,10 +232,9 @@ def apply(
                 else:
                     near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale
                 result_scale = near_to_ideal_scale
-                input_tensors[1] = near_to_ideal_scale.data
 
                 if i < self._initial_steps - 1:
-                    out = compress_model(input_tensors)
+                    out = calculate_quantized_weight(original_weight, config, near_to_ideal_scale, zp)
                     compressed_weights = fns.zeros_like(original_weight) + out
                     target, zero_mask = get_target_zero_mask(compressed_weights, zp)
                     zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
@@ -268,16 +244,14 @@ def apply(
                 factor = 1.0 - 0.05 * scale_steps
                 scaled_scale = factor * scale
 
-                input_tensors[1] = scaled_scale.data
-                out = compress_model(input_tensors)
+                out = calculate_quantized_weight(original_weight, config, scaled_scale, zp)
                 compressed_weights = fns.zeros_like(original_weight) + out
 
                 target, zero_mask = get_target_zero_mask(compressed_weights, zp)
                 zero_mask = zero_scale * zero_mask.astype(original_weight.dtype)
                 near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
 
-                input_tensors[1] = near_to_ideal_scale.data
-                out = compress_decompress_model(input_tensors)
+                out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp)
                 q_weights_ = fns.zeros_like(original_weight) + out
 
                 q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X)