From 10d1ddb0cfc0d1c1c6779cd9d4dfd489ea833c9a Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Wed, 3 Jul 2024 13:57:51 +0200
Subject: [PATCH 01/73] Initial draft.

Rebased.
Dynamic shapes WIP
BF16 support
End-to-end compression WIP
Add logic to compare numpy to ov computations
Added release_memory
Added a script to run multiple experiments sequentially
INT4 experiments
BF16 fix
INT4 performance gains WIP
weight_lowering dir created
weight_lowering_ dir changes
Renamed
Something works
File updates
Removed dispatching
---
 docs/api/source/conf.py                       |   1 +
 nncf/common/logging/logger.py                 |   6 +
 nncf/openvino/graph/node_utils.py             |   6 +-
 .../algorithms/weight_compression/config.py   |   3 +
 .../weight_compression/openvino_backend.py    |  94 ++---
 .../weight_compression/openvino_modeling.py   | 345 ++++++++++++++++
 .../weight_compression/scale_estimation.py    |  34 +-
 .../weight_compression/weight_lowering.py     | 183 +++++----
 nncf/quantization/fake_quantize.py            |  19 +-
 nncf/tensor/definitions.py                    |   1 +
 nncf/tensor/functions/__init__.py             |   3 +
 nncf/tensor/functions/ov.py                   |  41 ++
 nncf/utils.py                                 |  32 ++
 run_weight_compression.py                     | 373 ++++++++++++++++++
 .../quantization/test_weights_compression.py  |   6 +-
 weight_compression.py                         | 234 +++++++++++
 16 files changed, 1214 insertions(+), 167 deletions(-)
 create mode 100644 nncf/quantization/algorithms/weight_compression/openvino_modeling.py
 create mode 100644 nncf/tensor/functions/ov.py
 create mode 100644 nncf/utils.py
 create mode 100644 run_weight_compression.py
 create mode 100644 weight_compression.py

diff --git a/docs/api/source/conf.py b/docs/api/source/conf.py
index fe3afe0525c..ca5b7a11e0f 100644
--- a/docs/api/source/conf.py
+++ b/docs/api/source/conf.py
@@ -142,6 +142,7 @@ def collect_api_entities() -> APIInfo:
         "nncf.tensor.functions.numpy_linalg",
         "nncf.tensor.functions.torch_numeric",
         "nncf.tensor.functions.torch_linalg",
+        "nncf.tensor.functions.ov",
     ]
 
     with mock(mock_modules):
diff --git a/nncf/common/logging/logger.py b/nncf/common/logging/logger.py
index 5ba4b9a257c..e13fcaa8442 100644
--- a/nncf/common/logging/logger.py
+++ b/nncf/common/logging/logger.py
@@ -12,6 +12,7 @@
 import logging
 import sys
 from contextlib import contextmanager
+from functools import lru_cache
 
 NNCF_LOGGER_NAME = "nncf"
 
@@ -86,3 +87,8 @@ def warn_bkc_version_mismatch(backend: str, bkc_version: str, current_version: s
         f"while current {backend} version is {current_version}. "
         f"If you encounter issues, consider switching to {backend}{bkc_version}"
     )
+
+
+@lru_cache(None)
+def log_once(level, message):
+    nncf_logger.log(level, message)
diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py
index 7496187adb1..8fab3933945 100644
--- a/nncf/openvino/graph/node_utils.py
+++ b/nncf/openvino/graph/node_utils.py
@@ -8,7 +8,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import os
 from typing import Any, Callable, Dict, List, Optional, Tuple, Type
 
 import numpy as np
@@ -115,7 +115,9 @@ def get_const_value(const_node: ov.Node) -> np.ndarray:
 
     :param const_node: OpenVINO node.
     :return: The constant value.
""" - if const_node.get_element_type() == ov.Type.bf16: + INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32") + NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + if const_node.get_element_type() == ov.Type.bf16 and (INPUT_DTYPE != "bf16" or NUMPY_COMPRESSION): # Fixed FP32 data type as the result for BF16 constant return const_node.get_data(dtype=np.float32) return const_node.data diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 56dbc24f2e2..ce512331349 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -46,6 +46,9 @@ def is_integer(self): """ return self.mode not in [CompressWeightsMode.NF4, CompressWeightsMode.E2M1] + def __hash__(self): + return hash((self.mode.value, self.group_size)) + @dataclass class WeightCompressionParameters: diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 3d17d1a6af4..c00cb82a3f2 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -244,21 +244,38 @@ def _create_compression_subgraph( original_shape = weight.shape compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points) - compressed_const = opset.constant(compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name) + compressed_weight_data = compressed_weight.tensor.data + if isinstance(compressed_weight_data, ov.Tensor): + compressed_const = opset.constant(compressed_weight_data, name=const_node_name) + else: + compressed_const = opset.constant(compressed_weight_data, dtype=compression_dtype, name=const_node_name) + if compressed_const.get_element_type() != compression_dtype: + compressed_const = opset.convert(compressed_const, compression_dtype) converted_const = opset.convert(compressed_const, ov.Type.f16) - if compressed_weight.zero_point is not None and compressed_weight.tensor.dtype == TensorDataType.uint8: - zero_point_const = opset.constant( - compressed_weight.zero_point.data, - dtype=compression_dtype, - name=f"{const_node_name}/zero_point", - ) - converted_zero_point = opset.convert(zero_point_const, ov.Type.f16) + if compressed_weight.zero_point is not None: + zero_point_data = compressed_weight.zero_point.data + if isinstance(zero_point_data, ov.Tensor): + zero_point_const = opset.constant( + compressed_weight.zero_point.data, + name=f"{const_node_name}/zero_point", + ) + else: + zero_point_const = opset.constant( + compressed_weight.zero_point.data, + dtype=compression_dtype, + name=f"{const_node_name}/zero_point", + ) + zero_point_const = opset.convert(zero_point_const, ov.Type.f16) converted_const = opset.subtract( - converted_const, converted_zero_point, name=f"{const_node_name}/zero_point/subtract" + converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" ) - scale_const = opset.constant(compressed_weight.scale.data, dtype=scale_dtype, name=f"{const_node_name}/scale") - if scale_dtype != ov.Type.f16: + scale_data = compressed_weight.scale.data + if isinstance(scale_data, ov.Tensor): + scale_const = opset.constant(scale_data, name=f"{const_node_name}/scale") + else: + scale_const = opset.constant(scale_data, dtype=scale_dtype, name=f"{const_node_name}/scale") + if scale_const.get_element_type() 
!= ov.Type.f16: scale_const = opset.convert(scale_const, ov.Type.f16) mul = opset.multiply( @@ -302,6 +319,9 @@ def transform_model( layer_zero_points = ( None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name) ) + import os + + os.environ["CURRENT_NODE_NAME"] = wc_params.weight_name mul, compressed_weight = self._create_compression_subgraph( weight=weight, compression_config=wc_params.compression_config, @@ -333,58 +353,6 @@ def dump_parameters( ) -> None: dump_parameters(model, parameters, algo_name, path) - @staticmethod - def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - parameters, clamp = OVWeightCompressionAlgoBackend.get_compress_pipeline( - config, w_shape, s_shape, z_p_shape, True - ) - - if len(parameters) == 3: - _, s, zp = parameters - result = (clamp - zp) * s - else: - s = parameters[1] - result = clamp * s - - model = ov.Model([result], parameters) - - compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: Type.f32}) - - return lambda parameters: compiled_model(parameters)[0] - - @staticmethod - def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, return_nodes=False): - mode = config.mode - assert mode in [ - CompressWeightsMode.INT4_SYM, - CompressWeightsMode.INT4_ASYM, - ], f"Only int4 supported, but given={mode}" - num_bits = config.num_bits - - asym_quant = mode in [CompressWeightsMode.INT4_ASYM] - level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) - level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - - w = opset.parameter(w_shape, name="w") - s = opset.parameter(s_shape, name="s") - parameters = [w, s] - compressed_w = w / s - if z_p_shape is not None: - zp = opset.parameter(z_p_shape, name="zp") - parameters.append(zp) - compressed_w += zp - - result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights") - - if return_nodes: - return parameters, result - - model = ov.Model([result], parameters) - - compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: Type.f32}) - - return lambda parameters: compiled_model(parameters)[0] - class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): @staticmethod diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py new file mode 100644 index 00000000000..b4443970e30 --- /dev/null +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -0,0 +1,345 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import numpy as np +import openvino as ov +from openvino.runtime import opset13 as opset + +import nncf +from nncf import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig + + +@dataclass +class OVModelParameters: + dynamic: bool = False + recompile: bool = False + release_memory: bool = True + share_outputs: bool = True + input_dtype: str = "fp32" + + def __hash__(self): + return hash((self.dynamic, self.recompile, self.release_memory, self.share_outputs, self.input_dtype)) + + +class CompiledModelCache: + def __init__(self): + self._cache = {} + + def clear(self): + self._cache.clear() + + def is_empty(self): + return len(self._cache) == 0 + + +COMPILED_MODEL_CACHE = CompiledModelCache() + + +def clear_cache(): + COMPILED_MODEL_CACHE.clear() + + +def cache_results(func): + def wrapper(*args, **kwargs): + sig = inspect.signature(func) + new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} + new_kwargs.update(kwargs) + cache_key = (func.__name__, frozenset(new_kwargs.items())) + recompile = new_kwargs.get("ov_model_params", OVModelParameters()).recompile + cache = COMPILED_MODEL_CACHE._cache + if not recompile and cache_key in cache: + return cache[cache_key] + result = func(*args, **kwargs) + cache[cache_key] = result + return result + + return wrapper + + +@cache_results +def get_compress_weight_model( + config: WeightCompressionConfig, + weight_shape: Tuple, + scale_shape: Optional[Tuple] = None, + zero_point_shape: Optional[Tuple] = None, + reduction_axes: Optional[Tuple] = None, + ov_model_params: Optional[OVModelParameters] = None, +): + if scale_shape is None and zero_point_shape is not None: + raise Exception("Zero point shape can only be provided if scale shape is provided.") + # if (scale_shape is None) != (reduction_axes is not None): + # raise Exception("Either one of scale_shape or reduction_axes must be provided at the same time.") + + if ov_model_params.dynamic: + weight_shape = (-1,) * len(weight_shape) + if scale_shape is not None: + scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) + if zero_point_shape is not None: + zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + + return _build_compress_model( + config, + ov_model_params, + weight_shape, + scale_shape, + zero_point_shape, + reduction_axes, + return_nodes=False, + ) + + +@cache_results +def get_compress_decompress_weight_model( + config: WeightCompressionConfig, + weight_shape: Tuple, + scale_shape: Optional[Tuple], + zero_point_shape: Optional[Tuple] = None, + ov_model_params: Optional[OVModelParameters] = None, +): + if ov_model_params is None: + ov_model_params = OVModelParameters() + if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: + ov_model_params.dynamic = False + + if ov_model_params.dynamic: + weight_shape = (-1,) * len(weight_shape) + scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) + if zero_point_shape is not None: + zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + + 
return _build_compress_decompress_model( + config, + ov_model_params, + weight_shape, + scale_shape, + zero_point_shape, + ) + + +def _build_compress_decompress_model( + config: WeightCompressionConfig, + ov_model_params: OVModelParameters, + weight_shape: Tuple, + scale_shape: Tuple, + zero_point_shape: Optional[Tuple] = None, +): + ov_parameters, ov_results = _build_compress_model( + config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True + ) + return _get_compress_decompress_model( + config, + ov_model_params, + ov_parameters, + ov_results, + ) + + +def _build_compress_model( + config: WeightCompressionConfig, + ov_model_params: OVModelParameters, + weight_shape: Tuple, + scale_shape: Optional[Tuple] = None, + zero_point_shape: Optional[Tuple] = None, + reduction_axes: Optional[Tuple] = None, + return_nodes: bool = False, +): + if ov_model_params.input_dtype == "fp32": + input_dtype = ov.Type.f32 + elif ov_model_params.input_dtype == "fp16": + input_dtype = ov.Type.f16 + elif ov_model_params.input_dtype == "bf16": + input_dtype = ov.Type.bf16 + else: + raise Exception + weight = opset.parameter(weight_shape, name="w", dtype=input_dtype) + ov_parameters = [weight] + + if scale_shape is not None: + # Compute only the compressed weight + + scale = opset.parameter(scale_shape, name="s", dtype=ov.Type.f32) + ov_parameters.append(scale) + + zero_point = None + if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: + zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) + ov_parameters.append(zero_point) + else: + # Compute compressed weight, scale and, possibly, zero point + + group_size = config.group_size + if group_size != -1: + if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: + reduction_axes = reduction_axes[0] + if not isinstance(reduction_axes, int): + raise NotImplementedError( + f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." 
+                )
+            channel_size = weight.shape[reduction_axes]
+            if channel_size % group_size != 0:
+                raise nncf.ValidationError(
+                    f"Channel size {channel_size} should be divisible by size of group {group_size}"
+                )
+
+            num_groups_per_channel = channel_size // group_size
+            shape = list(weight.shape)  # [a1, r, a2] - "r" refers to number of channels along reduction axis
+            shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size)
+            weight = opset.reshape(weight, shape, special_zero=False)
+            reduction_axes += 1
+
+        mode = config.mode
+        num_bits = config.num_bits
+        eps = np.finfo(np.float32).eps
+        if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+            min_values = opset.reduce_min(
+                weight, reduction_axes=reduction_axes, keep_dims=True
+            )  # [a1, r, a2] -> [a1, 1, a2]
+            max_values = opset.reduce_max(
+                weight, reduction_axes=reduction_axes, keep_dims=True
+            )  # [a1, r, a2] -> [a1, 1, a2]
+            min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32)
+
+            level_low = 0
+            level_high = 2**num_bits - 1
+            levels = level_high - level_low + 1
+            scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32)
+            scale = opset.select(opset.abs(scale) < eps, eps, scale)
+
+            zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale)
+            zero_point = opset.clamp(zero_point, level_low, level_high)
+        else:
+            zero_point = None
+            level_high = opset.constant(2 ** (num_bits - 1), ov.Type.f32)
+
+            w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True))
+            w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True)
+            w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32)
+
+            scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.constant(0, ov.Type.f32) - w_max)
+            scale /= level_high
+            scale = opset.select(opset.abs(scale) < eps, eps, scale)
+
+    return _get_compress_model(
+        config,
+        ov_model_params,
+        ov_parameters,
+        weight,
+        scale,
+        zero_point,
+        return_nodes,
+    )
+
+
+def _get_compress_model(
+    config: WeightCompressionConfig,
+    ov_model_params: OVModelParameters,
+    ov_parameters: List[ov._pyopenvino.op.Parameter],
+    w: ov.runtime.Node,
+    s: ov.runtime.Node,
+    zp: Optional[ov.runtime.Node] = None,
+    return_nodes: Optional[bool] = False,
+):
+    if w.get_element_type() != ov.Type.f32:
+        w = opset.convert(w, ov.Type.f32)
+
+    compressed_w = w / s
+
+    num_bits = config.num_bits
+    if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+        # dtype = ov.Type.u8
+        dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
+        level_low = 0
+        level_high = 2**num_bits - 1
+        compressed_w += opset.convert(zp, ov.Type.f32)
+    elif config.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT4_SYM]:
+        # dtype = ov.Type.i8
+        dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4
+        level_low = -(2 ** (num_bits - 1))
+        level_high = 2 ** (num_bits - 1) - 1
+    else:
+        raise nncf.ValidationError(f"Unsupported compression mode: {config.mode}")
+
+    compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high)
+    compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights")
+
+    ov_results = [compressed_w]
+    if len(ov_parameters) == 1:
+        ov_results.append(s)
+        if zp is not None:
+            ov_results.append(opset.convert(zp, compressed_w.get_element_type()))
+
+    if return_nodes:
+        return ov_parameters, ov_results
+
+    model = ov.Model(ov_results, ov_parameters)
+    compiled_model = ov.compile_model(model, 
device_name="CPU") + + def infer(inputs): + infer_request = compiled_model.create_infer_request() + infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) + outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] + if ov_model_params.release_memory: + compiled_model.release_memory() + return outputs + + return infer + + +def _get_compress_decompress_model( + config: WeightCompressionConfig, + ov_model_params: OVModelParameters, + parameters: List[ov._pyopenvino.op.Parameter], + results: List[ov._pyopenvino.Node], +): + if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: + if len(results) == 1: + compressed_w = results[0] + s, zp = parameters[1], parameters[2] + else: + compressed_w, s, zp = results + decompressed_w = (compressed_w - zp) * s + else: + if len(results) == 1: + compressed_w = results[0] + s = parameters[1] + else: + compressed_w, s = results + decompressed_w = compressed_w * s + + model = ov.Model([decompressed_w], parameters) + compiled_model = ov.compile_model(model, device_name="CPU") + + def infer(inputs): + infer_request = compiled_model.create_infer_request() + infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) + outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] + if ov_model_params.release_memory: + compiled_model.release_memory() + return outputs + + return infer diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 0596e94d432..1b4827038c9 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -27,6 +27,8 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization @@ -44,8 +46,6 @@ class ScaleEstimation: Scale estimation algorithm implementation. 
""" - compress_decompress_cache = {} - def __init__( self, model: TModel, @@ -256,41 +256,20 @@ def calculate_quantization_params( if weight_penalty > 0.0: min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - zp_shape = zp.shape if zp is not None else None - key = (config.mode, config.num_bits) + q_weights.shape + scale.shape - if zp is not None: - key += zp_shape - if config.mode != CompressWeightsMode.NF4: - if key in ScaleEstimation.compress_decompress_cache: - compress_decompress_model = ScaleEstimation.compress_decompress_cache[key]["compress_decompress_model"] - compress_model = ScaleEstimation.compress_decompress_cache[key]["compress_model"] - else: - compress_decompress_model = backend_entity.get_compress_decompress_pipeline( - config, q_weights.shape, scale.shape, zp_shape - ) - compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) - ScaleEstimation.compress_decompress_cache[key] = { - "compress_decompress_model": compress_decompress_model, - "compress_model": compress_model, - } scale_sign = scale / fns.abs(scale) zero_scale = 0.001 zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - input_tensors = [original_weight.data, None] - if zp is not None: - input_tensors.append(zp.data) # iterative rectification of initial scale for i in range(initial_steps): near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data if config.mode == CompressWeightsMode.NF4: g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: - out = compress_decompress_model(input_tensors) + out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp) q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) @@ -313,13 +292,12 @@ def calculate_quantization_params( else: near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale result_scale = near_to_ideal_scale - input_tensors[1] = near_to_ideal_scale.data if i < initial_steps - 1: if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: - out = compress_model(input_tensors) + out = calculate_quantized_weight(original_weight, config, near_to_ideal_scale, zp) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) @@ -333,7 +311,7 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, scaled_scale) else: - out = compress_model(input_tensors) + out = calculate_quantized_weight(original_weight, config, scaled_scale, zp) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) @@ -346,7 +324,7 @@ def calculate_quantization_params( g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: - out = compress_decompress_model(input_tensors) + out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp) q_weights_ = fns.zeros_like(original_weight) + out q_outs = 
fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 342725c0237..08aff97d5cd 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -17,10 +17,13 @@ import nncf from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, \ + get_compress_decompress_weight_model, get_compress_weight_model from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns from nncf.tensor.definitions import TensorDataType +from nncf.utils import is_openvino_available ReductionAxes = Tuple[int, ...] @@ -139,7 +142,7 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val= return scale -def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4) -> Tensor: +def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division=False) -> Tensor: """ Calculates the signed scale for symmetric quantization. @@ -154,7 +157,10 @@ def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bi w_max = fns.max(weight, axis=reduction_axes, keepdims=True) scale = fns.where(w_abs_min >= w_max, w_abs_min, -w_max) - scale /= level_high + if invert_division: + scale *= 1.0 / level_high + else: + scale /= level_high eps = fns.finfo(scale).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) @@ -249,7 +255,7 @@ def calculate_normalized_weight_and_fp4_scale( def calculate_integer_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False ) -> Tuple[Tensor, Tensor]: """ Calculates the scale and zero point for uniform quantization (INT4, INT8), when the range of values is divided into @@ -273,7 +279,7 @@ def calculate_integer_quantization_params( min_values = fns.min(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] max_values = fns.max(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] scale, zero_point = calculate_scale_zero_point( - min_values, max_values, level_low, level_high, narrow_range=False + min_values, max_values, level_low, level_high, narrow_range=False, invert_division=invert_division ) return scale, zero_point @@ -286,7 +292,7 @@ def calculate_quantized_weight( config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None, - invert_scale=False, + invert_division=False, ) -> Tensor: """ Quantizes the weight tensor using the provided scale and zero point. @@ -295,7 +301,7 @@ def calculate_quantized_weight( :param config: Weight compression configuration. :param scale: Scale tensor used for quantization. :param zero_point: Zero point tensor used for quantization. - :param invert_scale: applies inversion for scale and then multiply by weights instead of division. + :param invert_division: applies inversion for scale and then multiply by weights instead of division. :return: Quantized weight tensor of uint8 or int8 type. 
""" if weight.dtype != TensorDataType.float32: @@ -309,9 +315,8 @@ def calculate_quantized_weight( level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - if invert_scale: - scale = fns.power(scale, -1) - compressed_weights = weight * scale + if invert_division: + compressed_weights = weight * (1.0 / scale) else: compressed_weights = weight / scale if zero_point is not None: @@ -322,63 +327,8 @@ def calculate_quantized_weight( return compressed_weights -def do_int_quantization( - weight: Tensor, - reduction_axes: ReductionAxes, - config: WeightCompressionConfig, - precomputed_scale: Tensor = None, - precomputed_zero_point: Tensor = None, - invert_scale=False, -) -> Tuple[Tensor, Tensor, Tensor]: - """ - The method quantizes the given weights to integer data type uniformly in accordance with the compression config. - The config defines a quantization mode: - INT8_SYM mode refers to signed int8 symmetric weight compression without zero point - - quantization to [-128, 127] range. - INT8_ASYM mode refers to unsigned int8 asymmetric weight compression with a typical non-fixed zero-point - - quantization to [0, 255] range. - INT4_ASYM mode refers to unsigned int4 asymmetric weight compression with a typical non-fixed zero-point - - quantization to [0, 15] range. - INT4_SYM mode refers to signed int4 symmetric weight compression without zero point - - quantization to [-8, 7] range. - NF4 or E2M1 mode requires a dedicated procedure and it is not supported in this method. - One of the parameter of compression config is a group size. Quantization is per-channel, if group size equals to -1, - otherwise it's per-group, i.e. group size number of weights in the channel dimension share quantization parameters - (scales). - - :param weight: Weight array to compress. - :param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max). - :param config: Information on how to compress (quantize) a specific weight. - :param precomputed_scale: Precomputed scale. - :param precomputed_zero_point: Precomputed zero point. - :param invert_scale: applies inversion for scale and then multiply by weights instead of division. - Need as reference implementation for OV. - :return: The compressed weights tensor of uint8 (asymmetric mode) or int8 (symmetric mode) type, - scale tensor of float32 type and zero point tensor of int32 type that was used for its quantization. 
- """ - assert config.is_integer(), "The function supports integer quantization only" - group_size = config.group_size - - if weight.dtype != TensorDataType.float32: - weight = weight.astype(TensorDataType.float32) - - if group_size != -1: - # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] - weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) - - if precomputed_zero_point is None or precomputed_zero_point is None: - scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config) - if precomputed_scale is not None: - scale = precomputed_scale - if precomputed_zero_point is not None: - zero_point = precomputed_zero_point - - compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_scale) - return compressed_weights, scale, zero_point - - def get_integer_quantization_error( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig + weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False ) -> float: """ Calculates a quantity characterizing the difference between floating point weights and fake quantized @@ -394,7 +344,9 @@ def get_integer_quantization_error( if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - compressed_weights, scale, zero_point = do_int_quantization(weight, reduction_axes, config) + compressed_weights, scale, zero_point = do_int_quantization( + weight, reduction_axes, config, invert_division=invert_division + ) decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) decompressed_weight = decompressed_weight.reshape(orig_shape) @@ -410,6 +362,7 @@ def compress_weight( config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, + invert_division=False, ): """ Compress weight using compression configuration. 
@@ -427,7 +380,7 @@ def compress_weight(
         )
         return CompressedWeight(compressed_weight, scale)
     compressed_weight, scale, zero_point = do_int_quantization(
-        weight, reduction_axes, config, precomputed_scale, precomputed_zero_point
+        weight, reduction_axes, config, precomputed_scale, precomputed_zero_point, invert_division=invert_division
     )
     return CompressedWeight(compressed_weight, scale, zero_point)
 
@@ -472,3 +425,99 @@ def do_int_dequantization(
         decompressed_weight = ungroup_weights(decompressed_weight, reduction_axis)
 
     return decompressed_weight
+
+
+def do_int_quantization(
+    weight: Tensor,
+    reduction_axes: Tuple[int, ...],
+    config: WeightCompressionConfig,
+    precomputed_scale: Tensor = None,
+    precomputed_zero_point: Tensor = None,
+    invert_division: Optional[bool] = False,
+    ov_model_params: Optional[OVModelParameters] = None,
+):
+    assert config.is_integer(), "The function supports integer quantization only"
+
+    accelerate_through_ov = is_openvino_available()
+
+    if not accelerate_through_ov:
+        group_size = config.group_size
+
+        if weight.dtype != TensorDataType.float32:
+            weight = weight.astype(TensorDataType.float32)
+
+        if group_size != -1:
+            # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2]
+            weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size)
+
+        scale, zero_point = None, None
+        if precomputed_scale is None or precomputed_zero_point is None:
+            scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config)
+        if precomputed_scale is not None:
+            scale = precomputed_scale
+        if precomputed_zero_point is not None:
+            zero_point = precomputed_zero_point
+
+        compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division)
+        return compressed_weights, scale, zero_point
+
+    weight_shape = weight.shape
+    scale_shape = None if precomputed_scale is None else precomputed_scale.shape
+    zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape
+
+    if ov_model_params is None:
+        ov_model_params = OVModelParameters()
+    # TODO: Try reshaping weight before inputting it to the model
+    if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]:
+        ov_model_params.dynamic = False
+
+    model = get_compress_weight_model(
+        config,
+        weight_shape,
+        scale_shape,
+        zero_point_shape,
+        reduction_axes,
+        ov_model_params,
+    )
+
+    if precomputed_scale is None:
+        results = model(weight.data)
+        compressed_weight, scale, zero_point = [Tensor(it) for it in results]
+    else:
+        inputs = [weight.data, precomputed_scale.data]
+        if precomputed_zero_point is not None:
+            inputs += [precomputed_zero_point.data]
+        compressed_weight = Tensor(model(inputs)[0])
+        scale, zero_point = precomputed_scale, precomputed_zero_point
+
+    return compressed_weight, scale, zero_point
+
+
+def calculate_quantized_dequantized_weight(
+    weight: Tensor, config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None,
+    invert_division: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None,
+) -> Tensor:
+    accelerate_through_ov = is_openvino_available()
+
+    if not accelerate_through_ov:
+        compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division)
+        decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point)
+        return decompressed_weight
+
+    weight_shape = weight.shape
+    scale_shape = scale.shape
+    zero_point_shape = None if zero_point is None else zero_point.shape
+
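+    # OV path: build (or fetch from the compiled-model cache) a compress-decompress model for these shapes
+    # and run it on the weight to obtain the fake-quantized (dequantized) result.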
+ if ov_model_params is None: + ov_model_params = OVModelParameters() + if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: + ov_model_params.dynamic = False + + model = get_compress_decompress_weight_model(config, weight_shape, scale_shape, zero_point_shape, ov_model_params) + + inputs = [weight.data, scale.data] + if zero_point is not None: + inputs.append(zero_point.data) + results = model(inputs) + decompressed_weight = [Tensor(it) for it in results][0] + return decompressed_weight diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index d5a3e96ae64..a225f53853a 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -11,7 +11,7 @@ import warnings from dataclasses import dataclass -from typing import Tuple +from typing import Optional, Tuple import nncf from nncf.common.quantization.quantizers import calculate_asymmetric_level_ranges @@ -339,7 +339,12 @@ def _calculate_scaled_parameters( def calculate_scale_zero_point( - input_low: Tensor, input_high: Tensor, level_low: int, level_high: int, narrow_range: bool + input_low: Tensor, + input_high: Tensor, + level_low: int, + level_high: int, + narrow_range: bool, + invert_division: Optional[bool] = False, ) -> Tuple[Tensor, Tensor]: """ Calculates scale and zero_point values for the quantizer. @@ -355,11 +360,17 @@ def calculate_scale_zero_point( :return: Scale and Zero point values. """ levels = level_high - level_low if narrow_range else level_high - level_low + 1 - scale = ((input_high - input_low) / (levels - 1)).astype(TensorDataType.float32) + if invert_division: + scale = ((input_high - input_low) * (1.0 / (levels - 1))).astype(TensorDataType.float32) + else: + scale = ((input_high - input_low) / (levels - 1)).astype(TensorDataType.float32) eps = fns.finfo(scale).eps # NOTE: adding machine epsilon to avoid division by zero scale = fns.where(fns.abs(scale) < eps, eps, scale) expected_level_low = level_low + 1 if narrow_range else level_low - zero_point = expected_level_low - fns.round(input_low / scale) + if invert_division: + zero_point = expected_level_low - fns.round(input_low * (1.0 / scale)) + else: + zero_point = expected_level_low - fns.round(input_low / scale) zero_point = fns.clip(zero_point.astype(TensorDataType.int32), level_low, level_high) return scale, zero_point diff --git a/nncf/tensor/definitions.py b/nncf/tensor/definitions.py index 5d2df4ac035..a4849e558e3 100644 --- a/nncf/tensor/definitions.py +++ b/nncf/tensor/definitions.py @@ -60,6 +60,7 @@ class TensorBackend(Enum): numpy = auto() torch = auto() + ov = auto() @dataclass diff --git a/nncf/tensor/functions/__init__.py b/nncf/tensor/functions/__init__.py index 5a286a6fc13..9affab79c90 100644 --- a/nncf/tensor/functions/__init__.py +++ b/nncf/tensor/functions/__init__.py @@ -75,5 +75,8 @@ def _initialize_backends(): import nncf.tensor.functions.torch_linalg import nncf.tensor.functions.torch_numeric # noqa: F401 + with contextlib.suppress(ImportError): + import nncf.tensor.functions.ov # noqa: F401 + _initialize_backends() diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py new file mode 100644 index 00000000000..32bc615d30b --- /dev/null +++ b/nncf/tensor/functions/ov.py @@ -0,0 +1,41 @@ +import numpy as np +import openvino as ov + +from nncf.tensor import TensorDataType +from nncf.tensor.functions import numeric + +from ..definitions import TensorBackend +from .numpy_numeric import DTYPE_MAP as NP_DTYPE_MAP + +DTYPE_MAP = { + 
TensorDataType.float16: ov.Type.f16, + TensorDataType.bfloat16: ov.Type.bf16, + TensorDataType.float32: ov.Type.f32, + TensorDataType.float64: ov.Type.f64, + TensorDataType.int8: ov.Type.i8, + TensorDataType.int32: ov.Type.i32, + TensorDataType.int64: ov.Type.i64, + TensorDataType.uint8: ov.Type.u8, +} + +DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} + + +@numeric.backend.register(ov.Tensor) +def _(a: ov.Tensor) -> TensorBackend: + return TensorBackend.ov + + +@numeric.astype.register(ov.Tensor) +def _(a: ov.Tensor, dtype: TensorDataType) -> np.ndarray: + return a.data.astype(NP_DTYPE_MAP[dtype]) + + +@numeric.dtype.register(ov.Tensor) +def _(a: ov.Tensor) -> TensorDataType: + return DTYPE_MAP_REV[a.get_element_type()] + + +@numeric.size.register(ov.Tensor) +def _(a: ov.Tensor) -> int: + return a.size diff --git a/nncf/utils.py b/nncf/utils.py new file mode 100644 index 00000000000..50a315e4048 --- /dev/null +++ b/nncf/utils.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib + +_openvino_available = importlib.util.find_spec("openvino") is not None +_openvino_version = "N/A" +if _openvino_available: + try: + from openvino.runtime import get_version + + version = get_version() + # avoid invalid format + if "-" in version: + ov_major_version, dev_info = version.split("-", 1) + commit_id = dev_info.split("-")[0] + version = f"{ov_major_version}-{commit_id}" + _openvino_version = version + except ImportError: + _openvino_available = False + + +def is_openvino_available(): + return _openvino_available diff --git a/run_weight_compression.py b/run_weight_compression.py new file mode 100644 index 00000000000..0413034449d --- /dev/null +++ b/run_weight_compression.py @@ -0,0 +1,373 @@ +import os +import shutil +import subprocess +import threading +import time +from pathlib import Path + + +def stream_handler(stream, target_file): + for line in iter(stream.readline, ''): + print(line, end='') + target_file.write(line) + + +parent_model_dir = Path("/home/nsavel/workspace/openvino.genai/llm_bench/python/models") +parent_log_dir = Path("compression_logs") + +experiment_params = [ + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile 
--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), + # (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory 
--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", 
"--end-to-end --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / 
"recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), + # + # + # + # + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + 
(parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + + 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir 
/ "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", 
"--compression-mode int4_asym --numpy"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym 
--end-to-end"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), + (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # + # + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", 
"--save-model --end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), + # + # + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), +] + +for model_dir, log_dir, params in experiment_params: + model_path = model_dir / "openvino_model.xml" + cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" + + log_dir.mkdir(parents=True, exist_ok=True) + with 
open(log_dir / "log.txt", "a") as log_file: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + universal_newlines=True, + preexec_fn=os.setsid, + ) + + stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) + stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) + + stdout_thread.start() + stderr_thread.start() + + stdout_thread.join() + stderr_thread.join() + + process.wait() + time.sleep(10) + +evaluated_paths = set() +for _, log_dir, _ in experiment_params: + for model_path in log_dir.rglob("**/*"): + model_path: Path + if model_path.suffix != ".xml": + continue + if model_path.absolute() in evaluated_paths: + continue + evaluated_paths.add(model_path.absolute()) + + model_dir = model_path.parent.absolute() + cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" + process = subprocess.Popen(cmd, shell=True) + process.wait() diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index edc50652710..5d89c75e542 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -27,6 +27,7 @@ from nncf.data.dataset import Dataset from nncf.experimental.common.tensor_statistics.collectors import AggregatorBase from nncf.openvino.graph.node_utils import get_const_value +from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE from nncf.parameters import BackupMode from nncf.quantization import compress_weights from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams @@ -35,7 +36,6 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA -from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error @@ -1038,8 +1038,8 @@ def test_np_ov_compression_decompression(mode): decompressed_weighs = decompressed_weighs.data zp_shape = zp.shape if zp is not None else None - compress = OVWeightCompressionAlgoBackend.get_compress_pipeline(config, w.shape, scale.shape, zp_shape) - compress_decompress = OVWeightCompressionAlgoBackend.get_compress_decompress_pipeline( + compress = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(config, w.shape, scale.shape, zp_shape) + compress_decompress = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_decompress_weight_primitive( config, w.shape, scale.shape, zp_shape ) diff --git a/weight_compression.py b/weight_compression.py new file mode 100644 index 00000000000..bb6921e3558 --- /dev/null +++ b/weight_compression.py @@ -0,0 +1,234 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import gc
+import os
+import shutil
+import time
+from functools import partial
+from pathlib import Path
+
+import openvino as ov
+
+import nncf
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import COMPILED_MODEL_CACHE
+from tools.memory_monitor import MemoryMonitor
+from tools.memory_monitor import MemoryType
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored")
+
+    parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved")
+
+    parser.add_argument("--compression-mode", default="int8_asym", type=str, choices=["int8_asym", "int8_sym", "int4_asym", "int4_sym",], help="Weight compression mode")
+
+    parser.add_argument("--numpy", action="store_true", help="Enable numpy compression")
+
+    parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models")
+
+    parser.add_argument("--end-to-end", action="store_true", help="Enable end-to-end OV compression")
+
+    parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype")
+
+    parser.add_argument("--fp32-output", action="store_true", help="Output in fp32 instead of (u)int8")
+
+    parser.add_argument("--recompile", action="store_true", help="Recompile model every time")
+
+    parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs")
+
+    parser.add_argument("--save-model", action="store_true", help="Save compressed model")
+
+    parser.add_argument("--compare-with-numpy", action="store_true", help="Compare compressed weight with the one computed with NumPy")
+
+    parser.add_argument("--invert-numpy-division", action="store_true", help="Invert division when compressing with NumPy")
+
+    parser.add_argument("--release-memory", action="store_true", help="Release memory")
+
+    return parser.parse_args()
+
+
+def log(mm, fz, log_dir):
+    mm.save_memory_logs(
+        *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else ""
+    )
+
+
+def count_node_dtypes(model):
+    # Get the main dtype of weight constants
+    node_count_per_dtype = dict(f32=0, f16=0, bf16=0)
+    for node in model.get_ordered_ops():
+        friendly_name = node.get_friendly_name()
+        if node.get_type_name() != "Constant" or ".weight" not in friendly_name:
+            continue
+        const_dtype = node.get_element_type().get_type_name()
+        if const_dtype in node_count_per_dtype:
+            node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1
+    return node_count_per_dtype
+
+
+def main(args):
+    model_path = Path(args.model_path)
+    log_dir = Path(args.log_dir)
+
+    numpy_compression = args.numpy
+    dynamic_compression = args.dynamic
+    end_to_end_compression = args.end_to_end
+    input_dtype = args.input_dtype
+    fp32_output = args.fp32_output
+    recompile = args.recompile
+    share_outputs = args.share_outputs
+    save_model = args.save_model
+    compare_with_numpy = args.compare_with_numpy
+    invert_numpy_division = args.invert_numpy_division or compare_with_numpy
+    release_memory = args.release_memory
+
+    log_dir_suffix = f"{model_path.parent.name}_"
+    if numpy_compression:
+        log_dir_suffix = f"{log_dir_suffix}numpy"
+        if invert_numpy_division:
+            log_dir_suffix += "_inverted"
+    else:
+        log_dir_suffix = f"{log_dir_suffix}{'end-to-end_' if end_to_end_compression else ''}"
+        log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if dynamic_compression else 'ov-static'}"
+        log_dir_suffix = f"{log_dir_suffix}_{'output-fp32' if fp32_output else 'output-i8'}"
+        if input_dtype is not None:
+            log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}"
+        if recompile:
+            log_dir_suffix = f"{log_dir_suffix}_recompile"
+        if release_memory:
+            log_dir_suffix = f"{log_dir_suffix}_release-memory"
+        if share_outputs:
+            log_dir_suffix = f"{log_dir_suffix}_share-outputs"
+    print(f"Log dir suffix: {log_dir_suffix}")
+
+    memory_monitors = []
+    for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]:
+        memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=bool(0))
+        memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix))
+        memory_monitors.append(memory_monitor)
+
+    core = ov.Core()
+    # core.set_property({"ENABLE_MMAP": "NO"})
+    model = core.read_model(model_path)
+
+    node_count_per_dtype = count_node_dtypes(model)
+    assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type"
+    node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True)
+    model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]]
+
+    # Update input dtype based on model
+    input_dtype = input_dtype or model_dtype
+
+    os.environ["MODEL_PATH"] = str(model_path)
+    os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}"
+    os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}"
+    os.environ["END_TO_END_COMPRESSION"] = f"{int(end_to_end_compression)}"
+    os.environ["INPUT_DTYPE"] = input_dtype
+    os.environ["FP32_OUTPUT"] = f"{int(fp32_output)}"
+    os.environ["RECOMPILE"] = f"{int(recompile)}"
+    os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}"
+    os.environ["COMPARE_WITH_NUMPY"] = f"{int(compare_with_numpy)}"
+    os.environ["INVERT_NUMPY_DIVISION"] = f"{int(invert_numpy_division)}"
+    os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}"
+
+    start_time = time.perf_counter()
+    if args.compression_mode == "int8_asym":
+        compression_mode = nncf.CompressWeightsMode.INT8_ASYM
+    elif args.compression_mode == "int8_sym":
+        compression_mode = nncf.CompressWeightsMode.INT8_SYM
+    elif args.compression_mode == "int4_asym":
+        compression_mode = nncf.CompressWeightsMode.INT4_ASYM
+    elif args.compression_mode == "int4_sym":
+        compression_mode = nncf.CompressWeightsMode.INT4_SYM
+    else:
+        raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}")
+    compressed_model = nncf.compress_weights(model, mode=compression_mode)
+    compression_time = time.perf_counter() - start_time
+    print(f"Compression Time: {compression_time:.2f} sec.")
+
+    if save_model:
+        ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml")
+        for filepath in model_path.parent.glob("*.json"):
+            shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name))
+
+    del core
+    del model
+    del compressed_model
+    gc.collect()
+    time.sleep(0.5)
+
+    before_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
+    if not COMPILED_MODEL_CACHE.is_empty():
+        COMPILED_MODEL_CACHE.clear()
+        gc.collect()
+        time.sleep(memory_monitors[0].interval * 10)
+        after_cache_deletion = memory_monitors[2].get_data(True)[1][-1]
+    else:
+        after_cache_deletion = before_cache_deletion
+    cache_size = before_cache_deletion - after_cache_deletion
+    print(f"Cache size: {cache_size:.2f} MiB")
+
+    time.sleep(memory_monitors[0].interval * 10)
+
+    leftover_memory = memory_monitors[2].get_data(True)[1][-1]
+    peak_memory = max(memory_monitors[2].get_data(True)[1])
+    print(f"Peak memory: {peak_memory:.2f} MiB")
+    print(f"Leftover memory: {leftover_memory:.2f} MiB")
+    print("Done")
+
+    csv_path = log_dir / "results.csv"
+    csv_exists = csv_path.exists()
+    csv_path.parent.mkdir(exist_ok=True, parents=True)
+    with open(csv_path, "a") as f:
+        if not csv_exists:
+            f.write(
+                "Model Path,"
+                "Model dtype,"
+                "Backend,"
+                "End to end,"
+                "Recompile,"
+                "Release memory,"
+                "Share outputs,"
+                "Input Shapes,"
+                "Input,"
+                "Output,"
+                "Compression Time,"
+                "Peak Memory,"
+                "Cache Size,"
+                "Leftover Memory"
+                "\n"
+            )
+        f.write(
+            f"{model_path},"
+            f"{model_dtype.upper()},"
+            f"{'NumPy' if numpy_compression else 'OV'},"
+            f"{'-' if numpy_compression else end_to_end_compression},"
+            f"{'-' if numpy_compression else recompile},"
+            f"{'-' if numpy_compression else release_memory},"
+            f"{'-' if numpy_compression else share_outputs},"
+            f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'},"
+            f"{'-' if numpy_compression else input_dtype.upper()},"
+            f"{'-' if numpy_compression else 'FP32' if fp32_output else 'INT8'},"
+            f"{compression_time:.2f},"
+            f"{peak_memory:.2f},"
+            f"{cache_size:.2f},"
+            f"{leftover_memory:.2f}"
+            f"\n"
        )
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    main(args)

From bd2629b181f7b5ab7b4642a58a3301edd3e9b57e Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 22 Oct 2024 16:26:23 +0200
Subject: [PATCH 02/73] Unstage helper scripts

---
 .../weight_compression/openvino_modeling.py   |  12 -
 .../weight_compression/scale_estimation.py    |   2 -
 .../weight_compression/weight_lowering.py     |  13 +-
 nncf/tensor/functions/ov.py                   |  11 +
 run_weight_compression.py                     | 373 ------------------
 weight_compression.py                         | 234 -----------
 6 files changed, 20 insertions(+), 625 deletions(-)
 delete mode 100644 run_weight_compression.py
 delete mode 100644 weight_compression.py

diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
index b4443970e30..afd31b8215c 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
@@ -9,18 +9,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-# Copyright (c) 2024 Intel Corporation
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import inspect
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index 1b4827038c9..e294c6e0f5d 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -307,7 +307,6 @@ def calculate_quantization_params(
 
             factor = 1.0 - 0.05 * scale_step
             scaled_scale = factor * scale
-            input_tensors[1] = scaled_scale.data
             if config.mode == CompressWeightsMode.NF4:
                 out = do_nf4_quantization(original_weight, scaled_scale)
             else:
@@ -319,7 +318,6 @@ def calculate_quantization_params(
 
             near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance)
            near_to_ideal_scale = near_to_ideal_scale * scale_sign
-            input_tensors[1] = near_to_ideal_scale.data
             if config.mode == CompressWeightsMode.NF4:
                 g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale)
                 out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale)
diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index 08aff97d5cd..87fe07d569e 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -17,8 +17,9 @@
 import nncf
 from nncf.parameters import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
-from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, \
-    get_compress_decompress_weight_model, get_compress_weight_model
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model
 from nncf.quantization.fake_quantize import calculate_scale_zero_point
 from nncf.tensor import Tensor
 from nncf.tensor import functions as fns
@@ -494,8 +495,12 @@ def do_int_quantization(
 
 
 def calculate_quantized_dequantized_weight(
-    weight: Tensor, config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None,
-    invert_division: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None,
+    weight: Tensor,
+    config: WeightCompressionConfig,
+    scale: Tensor,
+    zero_point: Optional[Tensor] = None,
+    invert_division: Optional[bool] = False,
+    ov_model_params: Optional[OVModelParameters] = None,
 ) -> Tensor:
 
     accelerate_through_ov = is_openvino_available()
diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py
index 32bc615d30b..cd094e7a0e0 100644
--- a/nncf/tensor/functions/ov.py
+++ b/nncf/tensor/functions/ov.py
@@ -1,3 +1,14 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + import numpy as np import openvino as ov diff --git a/run_weight_compression.py b/run_weight_compression.py deleted file mode 100644 index 0413034449d..00000000000 --- a/run_weight_compression.py +++ /dev/null @@ -1,373 +0,0 @@ -import os -import shutil -import subprocess -import threading -import time -from pathlib import Path - - -def stream_handler(stream, target_file): - for line in iter(stream.readline, ''): - print(line, end='') - target_file.write(line) - - -parent_model_dir = Path("/home/nsavel/workspace/openvino.genai/llm_bench/python/models") -parent_log_dir = Path("compression_logs") - -experiment_params = [ - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), - # (parent_model_dir / 
"tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/tiny-llama", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), - # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), - # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/phi3", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile 
--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / 
"recompile_vs_release-memory/int8/llama3-8b", "--end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int8/llama3-8b", "--end-to-end --dynamic --release-memory --share-outputs"), - # - # - # - # - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode 
int4_asym --end-to-end --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym "), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym "), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --share-outputs"), - 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode 
int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym "), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym 
--recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --recompile --share-outputs"), - (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # - # - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --end-to-end --release-memory"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--end-to-end --release-memory"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", 
"--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--end-to-end --release-memory"), - # - # - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --end-to-end --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "recompile_vs_release-memory/int4/tiny-llama", "--save-model --compression-mode int4_asym --end-to-end"), -] - -for model_dir, log_dir, params in experiment_params: - model_path = model_dir / "openvino_model.xml" - cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" - - log_dir.mkdir(parents=True, exist_ok=True) - with open(log_dir / "log.txt", "a") as log_file: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - universal_newlines=True, - preexec_fn=os.setsid, - ) - - stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) - stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) - - stdout_thread.start() - stderr_thread.start() - - stdout_thread.join() - stderr_thread.join() - - process.wait() - time.sleep(10) - -evaluated_paths = set() -for _, log_dir, _ in experiment_params: - for model_path in log_dir.rglob("**/*"): - model_path: Path - if model_path.suffix != ".xml": - continue - if model_path.absolute() in evaluated_paths: - continue - evaluated_paths.add(model_path.absolute()) - - model_dir = model_path.parent.absolute() - cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" - process = subprocess.Popen(cmd, shell=True) - process.wait() diff --git a/weight_compression.py b/weight_compression.py deleted file mode 100644 index bb6921e3558..00000000000 --- a/weight_compression.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import gc -import os -import shutil -import time -from functools import partial -from pathlib import Path - -import openvino as ov - -import nncf -from nncf.quantization.algorithms.weight_compression.openvino_modeling import COMPILED_MODEL_CACHE -from tools.memory_monitor import MemoryMonitor -from tools.memory_monitor import MemoryType - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored") - - parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved") - - parser.add_argument("--compression-mode", default="int8_asym", type=str, choices=["int8_asym", "int8_sym", "int4_asym", "int4_sym",], help="Weight compression mode") - - parser.add_argument("--numpy", action="store_true", help="Enable numpy compression") - - parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models") - - parser.add_argument("--end-to-end", action="store_true", help="Enable end-to-end OV compression") - - parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype") - - parser.add_argument("--fp32-output", action="store_true", help="Output in fp32 instead of (u)int8") - - parser.add_argument("--recompile", action="store_true", help="Recompile model every time") - - parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs") - - parser.add_argument("--save-model", action="store_true", help="Save compressed model") - - parser.add_argument("--compare-with-numpy", action="store_true", help="Compare compressed weight with the one computed with NumPy") - - parser.add_argument("--invert-numpy-division", action="store_true", help="Invert division when compressing with NumPy") - - parser.add_argument("--release-memory", action="store_true", help="Release memory") - - return parser.parse_args() - - -def log(mm, fz, log_dir): - mm.save_memory_logs( - *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else "" - ) - - -def count_node_dtypes(model): - # Get the main dtype of weight constants - node_count_per_dtype = dict(f32=0, f16=0, bf16=0) - for node in model.get_ordered_ops(): - friendly_name = node.get_friendly_name() - if node.get_type_name() != "Constant" or ".weight" not in friendly_name: - continue - const_dtype = node.get_element_type().get_type_name() - if const_dtype in node_count_per_dtype: - node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1 - return node_count_per_dtype - - -def main(args): - model_path = Path(args.model_path) - log_dir = Path(args.log_dir) - - numpy_compression = args.numpy - dynamic_compression = args.dynamic - end_to_end_compression = args.end_to_end - input_dtype = args.input_dtype - fp32_output = args.fp32_output - recompile = args.recompile - share_outputs = args.share_outputs - save_model = args.save_model - compare_with_numpy = args.compare_with_numpy - invert_numpy_division = args.invert_numpy_division or compare_with_numpy - release_memory = args.release_memory - - log_dir_suffix = f"{model_path.parent.name}_" - if numpy_compression: - log_dir_suffix = f"{log_dir_suffix}numpy" - if invert_numpy_division: - log_dir_suffix += "_inverted" - else: - log_dir_suffix = f"{log_dir_suffix}{'end-to-end_' if end_to_end_compression else ''}" - log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if 
dynamic_compression else 'ov-static'}" - log_dir_suffix = f"{log_dir_suffix}_{'output-fp32' if fp32_output else 'output-i8'}" - if input_dtype is not None: - log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}" - if recompile: - log_dir_suffix = f"{log_dir_suffix}_recompile" - if release_memory: - log_dir_suffix = f"{log_dir_suffix}_release-memory" - if share_outputs: - log_dir_suffix = f"{log_dir_suffix}_share-outputs" - print(f"Log dir suffix: {log_dir_suffix}") - - memory_monitors = [] - for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]: - memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=bool(0)) - memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix)) - memory_monitors.append(memory_monitor) - - core = ov.Core() - # core.set_property({"ENABLE_MMAP": "NO"}) - model = core.read_model(model_path) - - node_count_per_dtype = count_node_dtypes(model) - assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type" - node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True) - model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]] - - # Update input dtype based on model - input_dtype = input_dtype or model_dtype - - os.environ["MODEL_PATH"] = str(model_path) - os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}" - os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}" - os.environ["END_TO_END_COMPRESSION"] = f"{int(end_to_end_compression)}" - os.environ["INPUT_DTYPE"] = input_dtype - os.environ["FP32_OUTPUT"] = f"{int(fp32_output)}" - os.environ["RECOMPILE"] = f"{int(recompile)}" - os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}" - os.environ["COMPARE_WITH_NUMPY"] = f"{int(compare_with_numpy)}" - os.environ["INVERT_NUMPY_DIVISION"] = f"{int(invert_numpy_division)}" - os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}" - - start_time = time.perf_counter() - if args.compression_mode == "int8_asym": - compression_mode = nncf.CompressWeightsMode.INT8_ASYM - elif args.compression_mode == "int8_sym": - compression_mode = nncf.CompressWeightsMode.INT8_SYM - elif args.compression_mode == "int4_asym": - compression_mode = nncf.CompressWeightsMode.INT4_ASYM - elif args.compression_mode == "int4_sym": - compression_mode = nncf.CompressWeightsMode.INT4_SYM - else: - raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}") - compressed_model = nncf.compress_weights(model, mode=compression_mode) - compression_time = time.perf_counter() - start_time - print(f"Compression Time: {compression_time:.2f} sec.") - - if save_model: - ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml") - for filepath in model_path.parent.glob("*.json"): - shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name)) - - del core - del model - del compressed_model - gc.collect() - time.sleep(0.5) - - before_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - if not COMPILED_MODEL_CACHE.is_empty(): - COMPILED_MODEL_CACHE.clear() - gc.collect() - time.sleep(memory_monitors[0].interval * 10) - after_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - else: - after_cache_deletion = before_cache_deletion - cache_size = before_cache_deletion - after_cache_deletion - print(f"Cache size: {cache_size:.2f} MiB") - - 
time.sleep(memory_monitors[0].interval * 10) - - leftover_memory = memory_monitors[2].get_data(True)[1][-1] - peak_memory = max(memory_monitors[2].get_data(True)[1]) - print(f"Peak memory: {peak_memory:.2f} MiB") - print(f"Leftover memory: {leftover_memory:.2f} MiB") - print("Done") - - csv_path = log_dir / "results.csv" - csv_exists = csv_path.exists() - csv_path.parent.mkdir(exist_ok=True, parents=True) - with open(csv_path, "a") as f: - if not csv_exists: - f.write( - "Model Path," - "Model dtype," - "Backend," - "End to end," - "Recompile," - "Release memory," - "Share outputs," - "Input Shapes," - "Input," - "Output," - "Compression Time," - "Peak Memory," - "Cache Size," - "Leftover Memory" - "\n" - ) - f.write( - f"{model_path}," - f"{model_dtype.upper()}," - f"{'NumPy' if numpy_compression else 'OV'}," - f"{'-' if numpy_compression else end_to_end_compression}," - f"{'-' if numpy_compression else recompile}," - f"{'-' if numpy_compression else release_memory}," - f"{'-' if numpy_compression else share_outputs}," - f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'}," - f"{'-' if numpy_compression else input_dtype.upper()}," - f"{'-' if numpy_compression else 'FP32' if fp32_output else 'INT8'}," - f"{compression_time:.2f}," - f"{peak_memory:.2f}," - f"{cache_size:.2f}," - f"{leftover_memory:.2f}" - f"\n" - ) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) From 3e6925240ac5c6f04eedcb7203b6844e29979bc6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 23 Oct 2024 17:14:06 +0200 Subject: [PATCH 03/73] WIP --- nncf/openvino/graph/node_utils.py | 6 +- .../weight_compression/openvino_backend.py | 44 +-- .../weight_compression/openvino_modeling.py | 107 +++--- .../weight_compression/weight_lowering.py | 51 ++- nncf/tensor/functions/numeric.py | 21 ++ run_weight_compression.py | 318 ++++++++++++++++++ weight_compression.py | 209 ++++++++++++ 7 files changed, 662 insertions(+), 94 deletions(-) create mode 100644 run_weight_compression.py create mode 100644 weight_compression.py diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 8fab3933945..17213204268 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int: return cnt_if_op(model, 0) -def get_const_value(const_node: ov.Node) -> np.ndarray: +def get_const_value(const_node: ov.Node, bf16_to_fp32: Optional[bool] = True) -> np.ndarray: """ Returns the constant tensor for the node. This method is applicable only for the floating-point constant data. @@ -115,9 +115,7 @@ def get_const_value(const_node: ov.Node) -> np.ndarray: :param const_node: OpenVINO node. :return: The constant value. 
""" - INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32") - NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) - if const_node.get_element_type() == ov.Type.bf16 and (INPUT_DTYPE != "bf16" or NUMPY_COMPRESSION): + if const_node.get_element_type() == ov.Type.bf16 and bf16_to_fp32: # Fixed FP32 data type as the result for BF16 constant return const_node.get_data(dtype=np.float32) return const_node.data diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index c00cb82a3f2..3caaaa1b4f9 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -14,6 +14,7 @@ from openvino import Type from openvino.properties.hint import inference_precision from openvino.runtime import opset13 as opset +from openvino.runtime.op import Constant import nncf from nncf.common.graph import NNCFGraph @@ -49,7 +50,7 @@ from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor -from nncf.tensor.definitions import TensorDataType +from nncf.tensor.definitions import TensorDataType, TensorBackend class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): @@ -244,37 +245,19 @@ def _create_compression_subgraph( original_shape = weight.shape compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points) - compressed_weight_data = compressed_weight.tensor.data - if isinstance(compressed_weight_data, ov.Tensor): - compressed_const = opset.constant(compressed_weight_data, name=const_node_name) - else: - compressed_const = opset.constant(compressed_weight_data, dtype=compression_dtype, name=const_node_name) + compressed_const = self._create_ov_const_from_tensor(compressed_weight.tensor, compression_dtype, name=const_node_name) if compressed_const.get_element_type() != compression_dtype: compressed_const = opset.convert(compressed_const, compression_dtype) converted_const = opset.convert(compressed_const, ov.Type.f16) + if compressed_weight.zero_point is not None: - zero_point_data = compressed_weight.zero_point.data - if isinstance(zero_point_data, ov.Tensor): - zero_point_const = opset.constant( - compressed_weight.zero_point.data, - name=f"{const_node_name}/zero_point", - ) - else: - zero_point_const = opset.constant( - compressed_weight.zero_point.data, - dtype=compression_dtype, - name=f"{const_node_name}/zero_point", - ) + zero_point_const = self._create_ov_const_from_tensor(compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point") zero_point_const = opset.convert(zero_point_const, ov.Type.f16) converted_const = opset.subtract( converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" ) - scale_data = compressed_weight.scale.data - if isinstance(scale_data, ov.Tensor): - scale_const = opset.constant(scale_data, name=f"{const_node_name}/scale") - else: - scale_const = opset.constant(scale_data, dtype=scale_dtype, name=f"{const_node_name}/scale") + scale_const = self._create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale") if scale_const.get_element_type() != ov.Type.f16: scale_const = opset.convert(scale_const, ov.Type.f16) @@ -289,6 +272,8 @@ def _create_compression_subgraph( if 
should_add_convert_node: mul = opset.convert(mul, const_dtype, name=f"{const_node_name}/fq_weights_{weight_port_id}/convert") + + # TODO: convert tensors inside compressed_weight to numpy backend if they are in ov backend return mul, compressed_weight def transform_model( @@ -307,6 +292,10 @@ def transform_model( const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() weight = Tensor(get_const_value(const_node)) + # TODO: try to support bf16 by creating a Tensor with OV backend + # weight = Tensor(get_const_value(const_node, bf16_to_fp32=False)) + # if const_dtype == ov.Type.bf16: + # weight._is_bf16 = True should_add_convert_node = False if const_dtype != ov.Type.f16: @@ -319,9 +308,6 @@ def transform_model( layer_zero_points = ( None if precomputed_zero_points is None else precomputed_zero_points.get(wc_params.weight_name) ) - import os - - os.environ["CURRENT_NODE_NAME"] = wc_params.weight_name mul, compressed_weight = self._create_compression_subgraph( weight=weight, compression_config=wc_params.compression_config, @@ -353,6 +339,12 @@ def dump_parameters( ) -> None: dump_parameters(model, parameters, algo_name, path) + @staticmethod + def _create_ov_const_from_tensor(x: Tensor, dtype: Optional[ov.Type] = None, name: Optional[str] = None) -> Constant: + if x.backend == TensorBackend.ov: + return opset.constant(x.data, name=name) + const = opset.constant(x.data, dtype=dtype, name=name) + return const class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): @staticmethod diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index afd31b8215c..52f7c43a167 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -10,7 +10,9 @@ # limitations under the License. 
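
The subgraph assembled in _create_compression_subgraph above (compressed constant, convert to f16, subtract zero point, multiply by scale) encodes the usual asymmetric integer scheme. Below is a small self-contained NumPy sketch of that arithmetic with made-up values and a simplified scale/zero-point formula; NNCF's own helpers may differ in rounding and clipping details, so treat it as illustrative only.

import numpy as np

# Toy weight row, 4-bit asymmetric quantization (levels 0..15).
w = np.array([-0.42, -0.10, 0.03, 0.25, 0.61], dtype=np.float32)
level_low, level_high = 0, 2**4 - 1

w_min, w_max = w.min(), w.max()
scale = (w_max - w_min) / (level_high - level_low)
zero_point = np.clip(np.round(-w_min / scale), level_low, level_high)

q = np.clip(np.round(w / scale) + zero_point, level_low, level_high).astype(np.uint8)

# Decompression mirrors the OV subgraph: (q - zero_point) * scale.
w_hat = (q.astype(np.float32) - zero_point) * scale
print(q, np.abs(w - w_hat).max())  # the reconstruction error stays around scale / 2

The convert/subtract/multiply nodes above reproduce exactly this subtract-then-multiply shape inside the IR, with the subtraction branch skipped when compressed_weight.zero_point is None.
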
import inspect +import os from dataclasses import dataclass +from functools import partial from typing import List, Optional, Tuple import numpy as np @@ -20,18 +22,19 @@ import nncf from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.tensor import TensorDataType, Tensor @dataclass class OVModelParameters: + input_dtype: TensorDataType dynamic: bool = False recompile: bool = False release_memory: bool = True share_outputs: bool = True - input_dtype: str = "fp32" def __hash__(self): - return hash((self.dynamic, self.recompile, self.release_memory, self.share_outputs, self.input_dtype)) + return hash((self.input_dtype, self.dynamic, self.recompile, self.release_memory, self.share_outputs)) class CompiledModelCache: @@ -58,25 +61,44 @@ def wrapper(*args, **kwargs): new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} new_kwargs.update(kwargs) cache_key = (func.__name__, frozenset(new_kwargs.items())) - recompile = new_kwargs.get("ov_model_params", OVModelParameters()).recompile cache = COMPILED_MODEL_CACHE._cache - if not recompile and cache_key in cache: + if cache_key in cache: return cache[cache_key] result = func(*args, **kwargs) - cache[cache_key] = result + recompile = new_kwargs["ov_model_params"].recompile + if not recompile: + cache[cache_key] = result return result return wrapper -@cache_results +def run_model(ov_model_params, compiled_model, inputs): + # Returns results as numpy tensors + outputs = compiled_model(inputs, share_outputs=ov_model_params.share_outputs) + outputs = [Tensor(outputs[i]) for i in range(len(outputs))] + if ov_model_params.release_memory: + compiled_model.release_memory() + return outputs + + +def run_model_via_infer_request(ov_model_params, compiled_model, inputs): + # Returns results as ov tensors + infer_request = compiled_model.create_infer_request() + infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) + outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] + if ov_model_params.release_memory: + compiled_model.release_memory() + return outputs + + def get_compress_weight_model( + ov_model_params: OVModelParameters, config: WeightCompressionConfig, weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, - ov_model_params: Optional[OVModelParameters] = None, ): if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") @@ -101,16 +123,13 @@ def get_compress_weight_model( ) -@cache_results def get_compress_decompress_weight_model( + ov_model_params: OVModelParameters, config: WeightCompressionConfig, weight_shape: Tuple, scale_shape: Optional[Tuple], zero_point_shape: Optional[Tuple] = None, - ov_model_params: Optional[OVModelParameters] = None, ): - if ov_model_params is None: - ov_model_params = OVModelParameters() if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: ov_model_params.dynamic = False @@ -129,24 +148,7 @@ def get_compress_decompress_weight_model( ) -def _build_compress_decompress_model( - config: WeightCompressionConfig, - ov_model_params: OVModelParameters, - weight_shape: Tuple, - scale_shape: Tuple, - zero_point_shape: Optional[Tuple] = None, -): - ov_parameters, ov_results = _build_compress_model( - config, ov_model_params, weight_shape, scale_shape, zero_point_shape, 
reduction_axes=None, return_nodes=True - ) - return _get_compress_decompress_model( - config, - ov_model_params, - ov_parameters, - ov_results, - ) - - +@cache_results def _build_compress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, @@ -156,11 +158,11 @@ def _build_compress_model( reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, ): - if ov_model_params.input_dtype == "fp32": + if ov_model_params.input_dtype == TensorDataType.float32: input_dtype = ov.Type.f32 - elif ov_model_params.input_dtype == "fp16": + elif ov_model_params.input_dtype == TensorDataType.float16: input_dtype = ov.Type.f16 - elif ov_model_params.input_dtype == "bf16": + elif ov_model_params.input_dtype == TensorDataType.bfloat16: input_dtype = ov.Type.bf16 else: raise Exception @@ -243,6 +245,25 @@ def _build_compress_model( ) +@cache_results +def _build_compress_decompress_model( + config: WeightCompressionConfig, + ov_model_params: OVModelParameters, + weight_shape: Tuple, + scale_shape: Tuple, + zero_point_shape: Optional[Tuple] = None, +): + ov_parameters, ov_results = _build_compress_model( + config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True + ) + return _get_compress_decompress_model( + config, + ov_model_params, + ov_parameters, + ov_results, + ) + + def _get_compress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, @@ -287,15 +308,8 @@ def _get_compress_model( model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - def infer(inputs): - infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) - outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] - if ov_model_params.release_memory: - compiled_model.release_memory() - return outputs - - return infer + run_fn = run_model_via_infer_request if config.num_bits == 4 else run_model + return partial(run_fn, ov_model_params, compiled_model) def _get_compress_decompress_model( @@ -322,12 +336,5 @@ def _get_compress_decompress_model( model = ov.Model([decompressed_w], parameters) compiled_model = ov.compile_model(model, device_name="CPU") - def infer(inputs): - infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) - outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] - if ov_model_params.release_memory: - compiled_model.release_memory() - return outputs - - return infer + run_fn = run_model_via_infer_request if config.num_bits == 4 else run_model + return partial(run_fn, ov_model_params, compiled_model) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 87fe07d569e..304d554b051 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -8,13 +8,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
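[Editor's note] The builders above are memoized by cache_results, which keys the compiled helper on the builder's name plus a frozenset of its keyword-normalized arguments; this is why OVModelParameters defines __hash__ and why recompile simply skips storing the result. Below is a simplified, standalone sketch of that pattern, not part of the patch, with a hypothetical build_model standing in for the real OV model builders.

import inspect
from functools import wraps

_CACHE = {}

def cache_results(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Normalize positional arguments to keyword form so equivalent calls share a key.
        sig = inspect.signature(func)
        kw = {name: arg for name, arg in zip(sig.parameters, args)}
        kw.update(kwargs)
        key = (func.__name__, frozenset(kw.items()))  # every argument must be hashable
        if key in _CACHE:
            return _CACHE[key]
        result = func(*args, **kwargs)
        _CACHE[key] = result  # the real decorator also checks ov_model_params.recompile here
        return result
    return wrapper

@cache_results
def build_model(weight_shape, group_size):  # hypothetical stand-in for the OV builders
    print("building", weight_shape, group_size)
    return object()

m1 = build_model((1024, 1024), -1)
m2 = build_model(weight_shape=(1024, 1024), group_size=-1)  # cache hit, no rebuild
assert m1 is m2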
- +import logging +import os from dataclasses import dataclass from typing import Optional, Tuple import numpy as np import nncf +from nncf.common.logging.logger import log_once from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters @@ -23,7 +25,7 @@ from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns -from nncf.tensor.definitions import TensorDataType +from nncf.tensor.definitions import TensorDataType, TensorBackend from nncf.utils import is_openvino_available ReductionAxes = Tuple[int, ...] @@ -430,7 +432,7 @@ def do_int_dequantization( def do_int_quantization( weight: Tensor, - reduction_axes: Tuple[int, ...], + reduction_axes: ReductionAxes, config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, @@ -439,9 +441,12 @@ def do_int_quantization( ): assert config.is_integer(), "The function supports integer quantization only" - accelerate_through_ov = is_openvino_available() + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + if not is_openvino_available() and weight.backend != TensorBackend.torch: + log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") if not accelerate_through_ov: + # Reference implementation group_size = config.group_size if weight.dtype != TensorDataType.float32: @@ -462,30 +467,40 @@ def do_int_quantization( compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) return compressed_weights, scale, zero_point + import openvino as ov + weight_shape = weight.shape scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape + is_bf16 = getattr(weight, "_is_bf16", False) + input_dtype = TensorDataType.bfloat16 if is_bf16 else weight.dtype if ov_model_params is None: - ov_model_params = OVModelParameters() + # ov_model_params = OVModelParameters(input_dtype) + ov_model_params = OVModelParameters( + input_dtype, + dynamic=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), + recompile=bool(int(os.environ.get("RECOMPILE", "0"))), + release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), + share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), + ) # TODO: Try reshaping weight before inputing it to the model if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: ov_model_params.dynamic = False - model = get_compress_weight_model( + ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, - ov_model_params, ) + weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16) if is_bf16 else weight.data if precomputed_scale is None: - results = model(weight.data) - compressed_weight, scale, zero_point = [Tensor(it) for it in results] + compressed_weight, scale, zero_point = model(weight_data) else: - inputs = [weight.data, precomputed_scale.data] + inputs = [weight_data, precomputed_scale.data] if precomputed_zero_point is not None: inputs += [precomputed_zero_point.data] compressed_weight = Tensor(model(inputs)[0]) @@ -502,25 +517,33 @@ def calculate_quantized_dequantized_weight( invert_division: 
Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, ) -> Tensor: - accelerate_through_ov = is_openvino_available() + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + if not is_openvino_available() and weight.backend != TensorBackend.torch: + log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") if not accelerate_through_ov: + # Reference implementation compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) return decompressed_weight + import openvino as ov + weight_shape = weight.shape scale_shape = scale.shape zero_point_shape = None if zero_point is None else zero_point.shape + is_bf16 = getattr(weight, "_is_bf16", False) + input_dtype = TensorDataType.bfloat16 if is_bf16 else weight.dtype if ov_model_params is None: - ov_model_params = OVModelParameters() + ov_model_params = OVModelParameters(input_dtype) if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: ov_model_params.dynamic = False - model = get_compress_decompress_weight_model(config, weight_shape, scale_shape, zero_point_shape, ov_model_params) + model = get_compress_decompress_weight_model(ov_model_params, config, weight_shape, scale_shape, zero_point_shape) - inputs = [weight.data, scale.data] + weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16) if is_bf16 else weight.data + inputs = [weight_data, scale.data] if zero_point is not None: inputs.append(zero_point.data) results = model(inputs) diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index 061d1ee6e66..715c963bf89 100644 --- a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -130,6 +130,27 @@ def astype(a: Tensor, data_type: TensorDataType) -> Tensor: :return: Copy of the tensor in specified type. 
""" + # is_bf16 = getattr(a, "_is_bf16", False) + # if is_bf16: + # def bf16_to_fp32_v2(x): + # # Step 1: Interpret the float16 data as uint16 to access the raw bits + # custom16_bits = x.view(np.uint16) # Keep as uint16 + # + # # Step 2: Allocate uint32 to hold the result (in-place modification in original variable) + # custom16_bits = custom16_bits.astype(np.uint32) # Cast to uint32 for safe shifting + # + # # Step 3: Extract and shift sign, exponent, and fraction directly into custom16_bits + # custom16_bits = (((custom16_bits & 0x8000) << 16) | # Extract and move sign bit to bit 31 + # ((custom16_bits & 0x7F80) << 16) | # Extract and move exponent to bits 30-23 + # ((custom16_bits & 0x007F) << 16)) # Extract and move fraction to bits 22-0 + # + # # Step 4: Interpret the resulting 32-bit integers as float32 + # float32_array = custom16_bits.view(np.float32) + # + # return float32_array + # + # fp32_data = bf16_to_fp32_v2(a.data) + # Tensor(astype(fp32_data, data_type)) return Tensor(astype(a.data, data_type)) diff --git a/run_weight_compression.py b/run_weight_compression.py new file mode 100644 index 00000000000..2d7211effc4 --- /dev/null +++ b/run_weight_compression.py @@ -0,0 +1,318 @@ +import os +import shutil +import subprocess +import threading +import time +from pathlib import Path + + +def stream_handler(stream, target_file): + for line in iter(stream.readline, ''): + print(line, end='') + target_file.write(line) + + +parent_model_dir = Path("/home/nsavel/workspace/models/hf") +parent_log_dir = Path("compression_logs") + +experiment_params = [ + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", 
parent_log_dir / "reimpl_23102024/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", ""), + # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", 
parent_log_dir / "reimpl_23102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + + + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", 
"--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + + + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), + # (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), + # + # + # + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), +] + +for model_dir, log_dir, params in experiment_params: + model_path = model_dir / "openvino_model.xml" + cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" + + log_dir.mkdir(parents=True, exist_ok=True) + with open(log_dir / "log.txt", "a") as log_file: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + universal_newlines=True, + preexec_fn=os.setsid, + ) + + stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) + stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) + + stdout_thread.start() + stderr_thread.start() + + stdout_thread.join() + stderr_thread.join() + + process.wait() + time.sleep(10) + +evaluated_paths = set() +for _, log_dir, _ in experiment_params: + for model_path in log_dir.rglob("**/*"): + 
model_path: Path + if model_path.suffix != ".xml": + continue + if model_path.absolute() in evaluated_paths: + continue + evaluated_paths.add(model_path.absolute()) + + model_dir = model_path.parent.absolute() + cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" + process = subprocess.Popen(cmd, shell=True) + process.wait() diff --git a/weight_compression.py b/weight_compression.py new file mode 100644 index 00000000000..54ce0690238 --- /dev/null +++ b/weight_compression.py @@ -0,0 +1,209 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import gc +import os +import shutil +import time +from functools import partial +from pathlib import Path + +import openvino as ov + +import nncf +from nncf.quantization.algorithms.weight_compression.openvino_modeling import COMPILED_MODEL_CACHE +from tools.memory_monitor import MemoryMonitor +from tools.memory_monitor import MemoryType + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored") + + parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved") + + parser.add_argument("--compression-mode", default="int8_asym", type=str, choices=["int8_asym", "int8_sym", "int4_asym", "int4_sym",], help="Weight compression mode") + + parser.add_argument("--numpy", action="store_true", help="Enable numpy compression") + + parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models") + + parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype") + + parser.add_argument("--recompile", action="store_true", help="Recompile model every time") + + parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs") + + parser.add_argument("--save-model", action="store_true", help="Save compressed model") + + parser.add_argument("--release-memory", action="store_true", help="Release memory") + + return parser.parse_args() + + +def log(mm, fz, log_dir): + mm.save_memory_logs( + *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else "" + ) + + +def count_node_dtypes(model): + # Get the main dtype of weight constants + node_count_per_dtype = dict(f32=0, f16=0, bf16=0) + for node in model.get_ordered_ops(): + friendly_name = node.get_friendly_name() + if node.get_type_name() != "Constant" or ".weight" not in friendly_name: + continue + const_dtype = node.get_element_type().get_type_name() + if const_dtype in node_count_per_dtype: + node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1 + return node_count_per_dtype + + +def main(args): + model_path = Path(args.model_path) + log_dir = Path(args.log_dir) + + numpy_compression = args.numpy + 
dynamic_compression = args.dynamic + input_dtype = args.input_dtype + recompile = args.recompile + share_outputs = args.share_outputs + save_model = args.save_model + release_memory = args.release_memory + + log_dir_suffix = f"{model_path.parent.name}_" + if numpy_compression: + log_dir_suffix = f"{log_dir_suffix}numpy" + else: + log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if dynamic_compression else 'ov-static'}" + if input_dtype is not None: + log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}" + if recompile: + log_dir_suffix = f"{log_dir_suffix}_recompile" + if release_memory: + log_dir_suffix = f"{log_dir_suffix}_release-memory" + if share_outputs: + log_dir_suffix = f"{log_dir_suffix}_share-outputs" + print(f"Log dir suffix: {log_dir_suffix}") + + memory_monitors = [] + for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]: + memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=bool(0)) + memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix)) + memory_monitors.append(memory_monitor) + + core = ov.Core() + # core.set_property({"ENABLE_MMAP": "NO"}) + model = core.read_model(model_path) + + node_count_per_dtype = count_node_dtypes(model) + assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type" + node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True) + model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]] + + # Update input dtype based on model + input_dtype = input_dtype or model_dtype + + os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}" + os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}" + os.environ["INPUT_DTYPE"] = input_dtype + os.environ["RECOMPILE"] = f"{int(recompile)}" + os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}" + os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}" + + start_time = time.perf_counter() + if args.compression_mode == "int8_asym": + compression_mode = nncf.CompressWeightsMode.INT8_ASYM + elif args.compression_mode == "int8_sym": + compression_mode = nncf.CompressWeightsMode.INT8_SYM + elif args.compression_mode == "int4_asym": + compression_mode = nncf.CompressWeightsMode.INT4_ASYM + elif args.compression_mode == "int4_sym": + compression_mode = nncf.CompressWeightsMode.INT4_SYM + else: + raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}") + compressed_model = nncf.compress_weights(model, mode=compression_mode) + compression_time = time.perf_counter() - start_time + print(f"Compression Time: {compression_time:.2f} sec.") + + if save_model: + ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml") + for filepath in model_path.parent.glob("*.json"): + shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name)) + + del core + del model + del compressed_model + gc.collect() + time.sleep(0.5) + + before_cache_deletion = memory_monitors[2].get_data(True)[1][-1] + if not COMPILED_MODEL_CACHE.is_empty(): + COMPILED_MODEL_CACHE.clear() + gc.collect() + time.sleep(memory_monitors[0].interval * 10) + after_cache_deletion = memory_monitors[2].get_data(True)[1][-1] + else: + after_cache_deletion = before_cache_deletion + cache_size = before_cache_deletion - after_cache_deletion + print(f"Cache size: {cache_size:.2f} MiB") + + time.sleep(memory_monitors[0].interval * 
10) + + leftover_memory = memory_monitors[2].get_data(True)[1][-1] + peak_memory = max(memory_monitors[2].get_data(True)[1]) + print(f"Peak memory: {peak_memory:.2f} MiB") + print(f"Leftover memory: {leftover_memory:.2f} MiB") + print("Done") + + csv_path = log_dir / "results.csv" + csv_exists = csv_path.exists() + csv_path.parent.mkdir(exist_ok=True, parents=True) + with open(csv_path, "a") as f: + if not csv_exists: + f.write( + "Model Path," + "Model dtype," + "Backend," + "Recompile," + "Release memory," + "Share outputs," + "Input Shapes," + "Input," + "Compression Time," + "Peak Memory," + "Cache Size," + "Leftover Memory" + "\n" + ) + f.write( + f"{model_path}," + f"{model_dtype.upper()}," + f"{'NumPy' if numpy_compression else 'OV'}," + f"{'-' if numpy_compression else recompile}," + f"{'-' if numpy_compression else release_memory}," + f"{'-' if numpy_compression else share_outputs}," + f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'}," + f"{'-' if numpy_compression else input_dtype.upper()}," + f"{compression_time:.2f}," + f"{peak_memory:.2f}," + f"{cache_size:.2f}," + f"{leftover_memory:.2f}" + f"\n" + ) + + +if __name__ == "__main__": + args = parse_arguments() + main(args) From 166dd04c8670f05fab61822b3b4543d1744596e9 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 24 Oct 2024 13:41:47 +0200 Subject: [PATCH 04/73] Reshape weights beforehand --- .../weight_compression/openvino_modeling.py | 23 - .../weight_compression/weight_lowering.py | 13 +- run_weight_compression.py | 422 +++++++++--------- weight_compression.py | 1 + 4 files changed, 226 insertions(+), 233 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 52f7c43a167..064c842ef6f 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -130,9 +130,6 @@ def get_compress_decompress_weight_model( scale_shape: Optional[Tuple], zero_point_shape: Optional[Tuple] = None, ): - if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: - ov_model_params.dynamic = False - if ov_model_params.dynamic: weight_shape = (-1,) * len(weight_shape) scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) @@ -182,26 +179,6 @@ def _build_compress_model( else: # Compute compressed weight, scale and, possibly, zero point - group_size = config.group_size - if group_size != -1: - if isinstance(reduction_axes, tuple) and len(reduction_axes) == 1: - reduction_axes = reduction_axes[0] - if not isinstance(reduction_axes, int): - raise NotImplementedError( - f"Group-wise quantization expects a single reduction axis, but given: {reduction_axes}." 
- ) - channel_size = weight.shape[reduction_axes] - if channel_size % group_size != 0: - raise nncf.ValidationError( - f"Channel size {channel_size} should be divisible by size of group {group_size}" - ) - - num_groups_per_channel = channel_size // group_size - shape = list(weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis - shape[reduction_axes : reduction_axes + 1] = (num_groups_per_channel, group_size) - weight = opset.reshape(weight, shape, special_zero=False) - reduction_axes += 1 - mode = config.mode num_bits = config.num_bits eps = np.finfo(np.float32).eps diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 304d554b051..88b3a7358b7 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -445,17 +445,16 @@ def do_int_quantization( if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") + if config.group_size != -1: + # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] + weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) + if not accelerate_through_ov: # Reference implementation - group_size = config.group_size if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - if group_size != -1: - # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] - weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) - scale, zero_point = None, None if precomputed_zero_point is None or precomputed_zero_point is None: scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config) @@ -484,9 +483,7 @@ def do_int_quantization( release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), ) - # TODO: Try reshaping weight before inputing it to the model - if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: - ov_model_params.dynamic = False + model = get_compress_weight_model( ov_model_params, config, diff --git a/run_weight_compression.py b/run_weight_compression.py index 2d7211effc4..d7eefec79ab 100644 --- a/run_weight_compression.py +++ b/run_weight_compression.py @@ -16,209 +16,227 @@ def stream_handler(stream, target_file): parent_log_dir = Path("compression_logs") experiment_params = [ - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--release-memory --share-outputs"), - 
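[Editor's note] With this commit the group-wise reshape happens once in weight_lowering, before the NumPy/OV dispatch, instead of being rebuilt inside every OV graph. The following NumPy-only sketch shows the layout change described by the "[a1, r, a2] to [a1, r//gs, gs, a2]" comment above; the toy shapes and the max-abs reduction are assumptions made for illustration, not the exact NNCF formula.

import numpy as np

# Toy 2D weight: 4 output channels x 16 input channels, group size 8 along axis 1.
weight = np.arange(64, dtype=np.float32).reshape(4, 16)
reduction_axis, group_size = 1, 8

# [a1, r, a2] -> [a1, r // gs, gs, a2]: split the reduction axis into groups.
channel_size = weight.shape[reduction_axis]
assert channel_size % group_size == 0
grouped = weight.reshape(4, channel_size // group_size, group_size)
reduction_axis += 1  # per-group statistics now reduce over the innermost dimension

scale = np.max(np.abs(grouped), axis=reduction_axis, keepdims=True)  # example per-group statistic
print(grouped.shape, scale.shape)  # (4, 2, 8) (4, 2, 1)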
# (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024_acc/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--release-memory --share-outputs"), + # 
(parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", ""), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", 
parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", 
"--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", 
parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - - - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # 
(parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / 
"reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + + + + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / 
"tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), + + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", 
"--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode 
int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym "), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", 
"--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym "), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", 
"--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), @@ -269,10 +287,10 @@ def stream_handler(stream, target_file): # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_23102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), ] for model_dir, log_dir, params in experiment_params: diff --git a/weight_compression.py b/weight_compression.py index 54ce0690238..5bfc3bd24d7 100644 --- a/weight_compression.py +++ b/weight_compression.py @@ -133,6 +133,7 @@ def main(args): compression_mode = nncf.CompressWeightsMode.INT4_SYM else: raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}") + # TODO: Consider all_layers=True? 
     compressed_model = nncf.compress_weights(model, mode=compression_mode)
     compression_time = time.perf_counter() - start_time
     print(f"Compression Time: {compression_time:.2f} sec.")

From edbe913558be883f67523dc57f49d65f47367315 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Fri, 25 Oct 2024 13:38:28 +0200
Subject: [PATCH 05/73] BF16 support

---
 nncf/openvino/graph/node_utils.py             |  5 +-
 .../weight_compression/openvino_backend.py    | 34 ++++++++++----
 .../weight_compression/openvino_modeling.py   |  3 +-
 .../weight_compression/weight_lowering.py     | 47 +++++++++++--------
 nncf/tensor/functions/numeric.py              | 27 +++--------
 nncf/tensor/functions/ov.py                   | 38 ++++++++++++++-
 nncf/tensor/tensor.py                         |  3 ++
 7 files changed, 101 insertions(+), 56 deletions(-)

diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py
index 17213204268..39056d65af5 100644
--- a/nncf/openvino/graph/node_utils.py
+++ b/nncf/openvino/graph/node_utils.py
@@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int:
     return cnt_if_op(model, 0)


-def get_const_value(const_node: ov.Node, bf16_to_fp32: Optional[bool] = True) -> np.ndarray:
+def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = True) -> np.ndarray:
     """
     Returns the constant tensor for the node.
     This method is applicable only for the floating-point constant data.
@@ -115,8 +115,7 @@ def get_const_value(const_node: ov.Node, bf16_to_fp32: Optional[bool] = True) ->
     :param const_node: OpenVINO node.
     :return: The constant value.
     """
-    if const_node.get_element_type() == ov.Type.bf16 and bf16_to_fp32:
-        # Fixed FP32 data type as the result for BF16 constant
+    if const_node.get_element_type() == ov.Type.bf16 and cast_bf16_to_fp32:
         return const_node.get_data(dtype=np.float32)
     return const_node.data

diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
index 3caaaa1b4f9..49924ead6f5 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py
@@ -50,7 +50,8 @@
 from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
 from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight
 from nncf.tensor import Tensor
-from nncf.tensor.definitions import TensorDataType, TensorBackend
+from nncf.tensor.definitions import TensorBackend
+from nncf.tensor.definitions import TensorDataType


 class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend):
@@ -245,19 +246,25 @@ def _create_compression_subgraph(
         original_shape = weight.shape
         compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points)

-        compressed_const = self._create_ov_const_from_tensor(compressed_weight.tensor, compression_dtype, name=const_node_name)
+        compressed_const = self._create_ov_const_from_tensor(
+            compressed_weight.tensor, compression_dtype, name=const_node_name
+        )
         if compressed_const.get_element_type() != compression_dtype:
             compressed_const = opset.convert(compressed_const, compression_dtype)
         converted_const = opset.convert(compressed_const, ov.Type.f16)

         if compressed_weight.zero_point is not None:
-            zero_point_const = self._create_ov_const_from_tensor(compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point")
+            zero_point_const = self._create_ov_const_from_tensor(
+                compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point"
+            )
             zero_point_const = opset.convert(zero_point_const, ov.Type.f16)
             converted_const = opset.subtract(
                 converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract"
             )

-        scale_const = self._create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale")
+        scale_const = self._create_ov_const_from_tensor(
+            compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale"
+        )
         if scale_const.get_element_type() != ov.Type.f16:
             scale_const = opset.convert(scale_const, ov.Type.f16)

@@ -291,11 +298,10 @@ def transform_model(
             const_node = self.name_to_node_mapping[const_node_name]
             const_node_output = const_node.output(0)
             const_dtype = const_node_output.get_element_type()
-            weight = Tensor(get_const_value(const_node))
-            # TODO: try to support bf16 by creating a Tensor with OV backend
-            # weight = Tensor(get_const_value(const_node, bf16_to_fp32=False))
-            # if const_dtype == ov.Type.bf16:
-            #     weight._is_bf16 = True
+            weight = get_const_value(const_node, cast_bf16_to_fp32=False)
+            if const_dtype == ov.Type.bf16:
+                weight = ov.Tensor(weight, weight.shape, ov.Type.bf16)
+            weight = Tensor(weight)

             should_add_convert_node = False
             if const_dtype != ov.Type.f16:
@@ -325,6 +331,11 @@ def transform_model(
                 target_input.replace_source_output(mul_output)

             if lora_correction_algo is not None and lora_correction_algo.is_applicable(wc_params):
+                if weight.backend == TensorBackend.ov:
+                    if weight.dtype == TensorDataType.bfloat16:
+                        weight = weight.astype(TensorDataType.float32)
+                    weight = weight.to_backend(TensorBackend.numpy)
+                # TODO: cast int4 ov tensor too?
                 adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params)
                 self.insert_adapters(wc_params, *adapters, int8_lora=lora_correction_algo.use_int8_adapters)

@@ -340,12 +351,15 @@ def dump_parameters(
         dump_parameters(model, parameters, algo_name, path)

     @staticmethod
-    def _create_ov_const_from_tensor(x: Tensor, dtype: Optional[ov.Type] = None, name: Optional[str] = None) -> Constant:
+    def _create_ov_const_from_tensor(
+        x: Tensor, dtype: Optional[ov.Type] = None, name: Optional[str] = None
+    ) -> Constant:
         if x.backend == TensorBackend.ov:
             return opset.constant(x.data, name=name)
         const = opset.constant(x.data, dtype=dtype, name=name)
         return const

+
 class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend):
     @staticmethod
     def get_awq_patterns():
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
index 064c842ef6f..cd7fb0d3ede 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
@@ -22,7 +22,8 @@
 import nncf
 from nncf import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
-from nncf.tensor import TensorDataType, Tensor
+from nncf.tensor import Tensor
+from nncf.tensor import TensorDataType


 @dataclass
diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index 88b3a7358b7..765ada30041 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -25,7 +25,8 @@
 from nncf.quantization.fake_quantize import calculate_scale_zero_point
 from nncf.tensor import Tensor
 from nncf.tensor import
functions as fns -from nncf.tensor.definitions import TensorDataType, TensorBackend +from nncf.tensor.definitions import TensorBackend +from nncf.tensor.definitions import TensorDataType from nncf.utils import is_openvino_available ReductionAxes = Tuple[int, ...] @@ -378,6 +379,11 @@ def compress_weight( :return: The compressed weight and decompression parameters as instance of CompressedWeight """ if not config.is_integer(): + if weight.backend == TensorBackend.ov: + if weight.dtype == TensorDataType.bfloat16: + weight = weight.astype(TensorDataType.float32) + weight = weight.to_backend(TensorBackend.numpy) + compressed_weight, scale = calculate_normalized_weight_and_fp4_scale( weight, reduction_axes, config.group_size, precomputed_scale, config.mode ) @@ -441,7 +447,11 @@ def do_int_quantization( ): assert config.is_integer(), "The function supports integer quantization only" - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + accelerate_through_ov = ( + is_openvino_available() + and weight.backend != TensorBackend.torch + and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + ) if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") @@ -452,6 +462,11 @@ def do_int_quantization( if not accelerate_through_ov: # Reference implementation + if weight.backend == TensorBackend.ov: + if weight.dtype == TensorDataType.bfloat16: + weight = weight.astype(TensorDataType.float32) + weight = weight.to_backend(TensorBackend.numpy) + if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) @@ -466,18 +481,14 @@ def do_int_quantization( compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) return compressed_weights, scale, zero_point - import openvino as ov - weight_shape = weight.shape scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape - is_bf16 = getattr(weight, "_is_bf16", False) - input_dtype = TensorDataType.bfloat16 if is_bf16 else weight.dtype if ov_model_params is None: - # ov_model_params = OVModelParameters(input_dtype) + # ov_model_params = OVModelParameters(weight.dtype) ov_model_params = OVModelParameters( - input_dtype, + weight.dtype, dynamic=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), recompile=bool(int(os.environ.get("RECOMPILE", "0"))), release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), @@ -493,11 +504,10 @@ def do_int_quantization( reduction_axes, ) - weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16) if is_bf16 else weight.data if precomputed_scale is None: - compressed_weight, scale, zero_point = model(weight_data) + compressed_weight, scale, zero_point = model(weight.data) else: - inputs = [weight_data, precomputed_scale.data] + inputs = [weight.data, precomputed_scale.data] if precomputed_zero_point is not None: inputs += [precomputed_zero_point.data] compressed_weight = Tensor(model(inputs)[0]) @@ -514,7 +524,11 @@ def calculate_quantized_dequantized_weight( invert_division: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, ) -> Tensor: - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + accelerate_through_ov 
= ( + is_openvino_available() + and weight.backend != TensorBackend.torch + and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + ) if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") @@ -524,23 +538,18 @@ def calculate_quantized_dequantized_weight( decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) return decompressed_weight - import openvino as ov - weight_shape = weight.shape scale_shape = scale.shape zero_point_shape = None if zero_point is None else zero_point.shape - is_bf16 = getattr(weight, "_is_bf16", False) - input_dtype = TensorDataType.bfloat16 if is_bf16 else weight.dtype if ov_model_params is None: - ov_model_params = OVModelParameters(input_dtype) + ov_model_params = OVModelParameters(weight.dtype) if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: ov_model_params.dynamic = False model = get_compress_decompress_weight_model(ov_model_params, config, weight_shape, scale_shape, zero_point_shape) - weight_data = ov.Tensor(weight.data, weight.data.shape, ov.Type.bf16) if is_bf16 else weight.data - inputs = [weight_data, scale.data] + inputs = [weight.data, scale.data] if zero_point is not None: inputs.append(zero_point.data) results = model(inputs) diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index 715c963bf89..cdec5788bf6 100644 --- a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -130,27 +130,6 @@ def astype(a: Tensor, data_type: TensorDataType) -> Tensor: :return: Copy of the tensor in specified type. """ - # is_bf16 = getattr(a, "_is_bf16", False) - # if is_bf16: - # def bf16_to_fp32_v2(x): - # # Step 1: Interpret the float16 data as uint16 to access the raw bits - # custom16_bits = x.view(np.uint16) # Keep as uint16 - # - # # Step 2: Allocate uint32 to hold the result (in-place modification in original variable) - # custom16_bits = custom16_bits.astype(np.uint32) # Cast to uint32 for safe shifting - # - # # Step 3: Extract and shift sign, exponent, and fraction directly into custom16_bits - # custom16_bits = (((custom16_bits & 0x8000) << 16) | # Extract and move sign bit to bit 31 - # ((custom16_bits & 0x7F80) << 16) | # Extract and move exponent to bits 30-23 - # ((custom16_bits & 0x007F) << 16)) # Extract and move fraction to bits 22-0 - # - # # Step 4: Interpret the resulting 32-bit integers as float32 - # float32_array = custom16_bits.view(np.float32) - # - # return float32_array - # - # fp32_data = bf16_to_fp32_v2(a.data) - # Tensor(astype(fp32_data, data_type)) return Tensor(astype(a.data, data_type)) @@ -926,3 +905,9 @@ def ceil(a: Tensor) -> Tensor: :return: An array of the same type as a, containing the ceiling values. """ return Tensor(ceil(a.data)) + + +@functools.singledispatch +@tensor_guard +def to_backend(a: Tensor, b: TensorBackend) -> Tensor: + return Tensor(to_backend(a.data, b)) diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index cd094e7a0e0..fbc28418fb9 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -8,6 +8,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Tuple, Union import numpy as np import openvino as ov @@ -32,14 +33,35 @@ DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} +def _bf16_to_fp32(a: ov.Tensor) -> ov.Tensor: + assert a.get_element_type() == ov.Type.bf16 and a.data.dtype == np.float16 + + a = a.data.view(np.uint16) + + res = a.astype(np.uint32) + res = ( + ((res & 0x8000) << 16) # Move sign bit to bit 31 + | ((res & 0x7F80) << 16) # Move exponent to bits 30-23 + | ((res & 0x007F) << 16) + ) # Move fraction to bits 22-0 + res = res.view(np.float32) + + res = ov.Tensor(res) + return res + + @numeric.backend.register(ov.Tensor) def _(a: ov.Tensor) -> TensorBackend: return TensorBackend.ov @numeric.astype.register(ov.Tensor) -def _(a: ov.Tensor, dtype: TensorDataType) -> np.ndarray: - return a.data.astype(NP_DTYPE_MAP[dtype]) +def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: + if dtype == TensorDataType.bfloat16: + raise ValueError("Not supported conversion") + if a.get_element_type() == ov.Type.bf16: + a = _bf16_to_fp32(a) + return ov.Tensor(a.data.astype(NP_DTYPE_MAP[dtype])) @numeric.dtype.register(ov.Tensor) @@ -50,3 +72,15 @@ def _(a: ov.Tensor) -> TensorDataType: @numeric.size.register(ov.Tensor) def _(a: ov.Tensor) -> int: return a.size + + +@numeric.reshape.register(ov.Tensor) +def _(a: ov.Tensor, shape: Union[int, Tuple[int, ...]]) -> ov.Tensor: + return ov.Tensor(a.data.reshape(shape), shape, a.get_element_type()) + + +@numeric.to_backend.register(ov.Tensor) +def _(a: ov.Tensor, b: TensorBackend) -> np.ndarray: + if b != TensorBackend.numpy: + raise ValueError("Not supported backend") + return a.data diff --git a/nncf/tensor/tensor.py b/nncf/tensor/tensor.py index 52966be1ad1..1f776e19ad6 100644 --- a/nncf/tensor/tensor.py +++ b/nncf/tensor/tensor.py @@ -194,6 +194,9 @@ def item(self) -> float: def clone(self) -> float: return _call_function("clone", self) + def to_backend(self, backend: TensorBackend) -> Tensor: + return _call_function("to_backend", self, backend) + def _call_function(func_name: str, *args): """ From b636c667a73b3560eb964812bcff9142c7145277 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 13:43:39 +0200 Subject: [PATCH 06/73] Tweak lora type hint --- .../algorithms/weight_compression/lora_correction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/lora_correction.py b/nncf/quantization/algorithms/weight_compression/lora_correction.py index 0c9bb3409ba..212eb5e79fb 100644 --- a/nncf/quantization/algorithms/weight_compression/lora_correction.py +++ b/nncf/quantization/algorithms/weight_compression/lora_correction.py @@ -24,7 +24,7 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization, CompressedWeight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization from nncf.tensor import Tensor @@ -105,7 +105,7 @@ def is_applicable(self, wc_params: WeightCompressionParameters): return wc_params.compression_config.num_bits == 4 def 
calculate_adapters( - self, weight: Tensor, compressed_weight: Tensor, wc_params: WeightCompressionParameters + self, weight: Tensor, compressed_weight: CompressedWeight, wc_params: WeightCompressionParameters ) -> Tuple[Tensor, Tensor, List[float]]: """ Calculates low rank matrices for a given original and compressed weights. @@ -134,7 +134,7 @@ def calculate_adapters( @staticmethod def calculate_low_rank_matrices( weight: Tensor, - compressed_weight: Tensor, + compressed_weight: CompressedWeight, compression_config: WeightCompressionConfig, reduction_axes: Tuple[int, ...], lora_correction_params: AdvancedLoraCorrectionParameters, From f0129efecd8e1bf7e43a330e1ab610e4acddd50e Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 14:03:19 +0200 Subject: [PATCH 07/73] Tweaks --- .../algorithms/weight_compression/openvino_modeling.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index cd7fb0d3ede..16d16b7314c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -258,13 +258,11 @@ def _get_compress_model( num_bits = config.num_bits if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - # dtype = ov.Type.u8 dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 level_low = 0 level_high = 2**num_bits - 1 - compressed_w += opset.convert(zp, ov.Type.f32) + compressed_w += zp if zp.get_element_type() == ov.Type.f32 else opset.convert(zp, ov.Type.f32) elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]: - # dtype = ov.Type.i8 dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4 level_low = -(2 ** (num_bits - 1)) level_high = 2 ** (num_bits - 1) - 1 From e887e70b5da19ceaa7bc29f27e7d4a06baccb9cd Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 16:08:34 +0200 Subject: [PATCH 08/73] Added share_inputs --- .../weight_compression/openvino_backend.py | 9 +++++--- .../weight_compression/openvino_modeling.py | 23 ++++++++++--------- .../weight_compression/weight_lowering.py | 5 +++- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 49924ead6f5..840170f4f8b 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -330,6 +330,10 @@ def transform_model( for target_input in const_node.output(0).get_target_inputs(): target_input.replace_source_output(mul_output) + # if compressed_weight.tensor.backend == TensorBackend.ov: + # if compressed_weight.tensor.dtype == TensorDataType.uint4: + # compressed_weight.tensor = compressed_weight.tensor.astype(TensorDataType.uint8) + # compressed_weight.tensor = compressed_weight.tensor.to_backend(TensorBackend.numpy) if lora_correction_algo is not None and lora_correction_algo.is_applicable(wc_params): if weight.backend == TensorBackend.ov: if weight.dtype == TensorDataType.bfloat16: @@ -351,10 +355,9 @@ def dump_parameters( dump_parameters(model, parameters, algo_name, path) @staticmethod - def _create_ov_const_from_tensor( - x: Tensor, dtype: Optional[ov.Type] = None, name: Optional[str] = None - ) -> Constant: + def 
_create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = None) -> Constant: if x.backend == TensorBackend.ov: + assert x.data.get_element_type() == dtype return opset.constant(x.data, name=name) const = opset.constant(x.data, dtype=dtype, name=name) return const diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 16d16b7314c..5bc302c50b8 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -10,7 +10,6 @@ # limitations under the License. import inspect -import os from dataclasses import dataclass from functools import partial from typing import List, Optional, Tuple @@ -19,7 +18,6 @@ import openvino as ov from openvino.runtime import opset13 as opset -import nncf from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor @@ -29,13 +27,14 @@ @dataclass class OVModelParameters: input_dtype: TensorDataType - dynamic: bool = False + dynamic_shapes: bool = False recompile: bool = False release_memory: bool = True + share_inputs: bool = True share_outputs: bool = True def __hash__(self): - return hash((self.input_dtype, self.dynamic, self.recompile, self.release_memory, self.share_outputs)) + return hash((self.input_dtype, self.dynamic_shapes, self.recompile, self.release_memory, self.share_inputs, self.share_outputs)) class CompiledModelCache: @@ -57,7 +56,7 @@ def clear_cache(): def cache_results(func): - def wrapper(*args, **kwargs): + def wrapper(*args, disable_caching=False, **kwargs): sig = inspect.signature(func) new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} new_kwargs.update(kwargs) @@ -66,8 +65,7 @@ def wrapper(*args, **kwargs): if cache_key in cache: return cache[cache_key] result = func(*args, **kwargs) - recompile = new_kwargs["ov_model_params"].recompile - if not recompile: + if not disable_caching: cache[cache_key] = result return result @@ -76,7 +74,7 @@ def wrapper(*args, **kwargs): def run_model(ov_model_params, compiled_model, inputs): # Returns results as numpy tensors - outputs = compiled_model(inputs, share_outputs=ov_model_params.share_outputs) + outputs = compiled_model(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(outputs[i]) for i in range(len(outputs))] if ov_model_params.release_memory: compiled_model.release_memory() @@ -86,7 +84,8 @@ def run_model(ov_model_params, compiled_model, inputs): def run_model_via_infer_request(ov_model_params, compiled_model, inputs): # Returns results as ov tensors infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_outputs=ov_model_params.share_outputs) + # TODO: try share_inputs=True + infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] if ov_model_params.release_memory: compiled_model.release_memory() @@ -106,7 +105,7 @@ def get_compress_weight_model( # if (scale_shape is None) != (reduction_axes is not None): # raise Exception("Either one of scale_shape or reduction_axes must be provided at the same time.") - if ov_model_params.dynamic: + if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) if 
scale_shape is not None: scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) @@ -121,6 +120,7 @@ def get_compress_weight_model( zero_point_shape, reduction_axes, return_nodes=False, + disable_caching=ov_model_params.recompile, ) @@ -131,7 +131,7 @@ def get_compress_decompress_weight_model( scale_shape: Optional[Tuple], zero_point_shape: Optional[Tuple] = None, ): - if ov_model_params.dynamic: + if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) if zero_point_shape is not None: @@ -143,6 +143,7 @@ def get_compress_decompress_weight_model( weight_shape, scale_shape, zero_point_shape, + disable_caching=ov_model_params.recompile, ) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 765ada30041..dfc369b0c9a 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -489,7 +489,7 @@ def do_int_quantization( # ov_model_params = OVModelParameters(weight.dtype) ov_model_params = OVModelParameters( weight.dtype, - dynamic=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), + dynamic_shapes=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), recompile=bool(int(os.environ.get("RECOMPILE", "0"))), release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), @@ -506,6 +506,9 @@ def do_int_quantization( if precomputed_scale is None: compressed_weight, scale, zero_point = model(weight.data) + # Scale is always in fp32 so there is no need to store it in ov.Tensor + if scale.backend == TensorBackend.ov: + scale = scale.to_backend(TensorBackend.numpy) else: inputs = [weight.data, precomputed_scale.data] if precomputed_zero_point is not None: From 9141a8a6ec7ee595fbadef488a11f1b64cbd483d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 16:50:26 +0200 Subject: [PATCH 09/73] Modeling tweaks --- .../weight_compression/openvino_modeling.py | 119 +++++++----------- .../weight_compression/weight_lowering.py | 15 ++- 2 files changed, 53 insertions(+), 81 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 5bc302c50b8..f419995959d 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -12,7 +12,7 @@ import inspect from dataclasses import dataclass from functools import partial -from typing import List, Optional, Tuple +from typing import Optional, Tuple, Callable, List import numpy as np import openvino as ov @@ -23,6 +23,9 @@ from nncf.tensor import Tensor from nncf.tensor import TensorDataType +TensorList = List[Tensor] +ModelCallable = Callable[[TensorList], TensorList] + @dataclass class OVModelParameters: @@ -32,10 +35,10 @@ class OVModelParameters: release_memory: bool = True share_inputs: bool = True share_outputs: bool = True + return_ov_tensors: bool = False def __hash__(self): - return hash((self.input_dtype, self.dynamic_shapes, self.recompile, self.release_memory, self.share_inputs, self.share_outputs)) - + return hash((self.input_dtype, self.dynamic_shapes, self.recompile, self.release_memory, self.share_inputs, self.share_outputs, self.return_ov_tensors)) class CompiledModelCache: def __init__(self): @@ -72,8 +75,9 @@ 
def wrapper(*args, disable_caching=False, **kwargs): return wrapper -def run_model(ov_model_params, compiled_model, inputs): +def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: # Returns results as numpy tensors + inputs = [inp.data for inp in inputs] outputs = compiled_model(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(outputs[i]) for i in range(len(outputs))] if ov_model_params.release_memory: @@ -81,10 +85,10 @@ def run_model(ov_model_params, compiled_model, inputs): return outputs -def run_model_via_infer_request(ov_model_params, compiled_model, inputs): +def run_model_via_infer_request(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: # Returns results as ov tensors + inputs = [inp.data for inp in inputs] infer_request = compiled_model.create_infer_request() - # TODO: try share_inputs=True infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] if ov_model_params.release_memory: @@ -99,7 +103,7 @@ def get_compress_weight_model( scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, -): +) -> ModelCallable: if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") # if (scale_shape is None) != (reduction_axes is not None): @@ -112,6 +116,8 @@ def get_compress_weight_model( if zero_point_shape is not None: zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + ov_model_params.return_ov_tensors = config.num_bits == 4 + return _build_compress_model( config, ov_model_params, @@ -130,7 +136,7 @@ def get_compress_decompress_weight_model( weight_shape: Tuple, scale_shape: Optional[Tuple], zero_point_shape: Optional[Tuple] = None, -): +) -> ModelCallable: if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) @@ -156,7 +162,7 @@ def _build_compress_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, -): +) -> ModelCallable: if ov_model_params.input_dtype == TensorDataType.float32: input_dtype = ov.Type.f32 elif ov_model_params.input_dtype == TensorDataType.float16: @@ -178,6 +184,7 @@ def _build_compress_model( if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) ov_parameters.append(zero_point) + zero_point = opset.convert(zero_point, ov.Type.f32) else: # Compute compressed weight, scale and, possibly, zero point @@ -213,56 +220,16 @@ def _build_compress_model( scale /= level_high scale = opset.select(opset.abs(scale) < eps, eps, scale) - return _get_compress_model( - config, - ov_model_params, - ov_parameters, - weight, - scale, - zero_point, - return_nodes, - ) - - -@cache_results -def _build_compress_decompress_model( - config: WeightCompressionConfig, - ov_model_params: OVModelParameters, - weight_shape: Tuple, - scale_shape: Tuple, - zero_point_shape: Optional[Tuple] = None, -): - ov_parameters, ov_results = _build_compress_model( - config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True - ) - return 
_get_compress_decompress_model( - config, - ov_model_params, - ov_parameters, - ov_results, - ) - - -def _get_compress_model( - config: WeightCompressionConfig, - ov_model_params: OVModelParameters, - ov_parameters: List[ov._pyopenvino.op.Parameter], - w: ov.runtime.Node, - s: ov.runtime.Node, - zp: Optional[ov.runtime.Node] = None, - return_nodes: Optional[bool] = False, -): - if w.get_element_type() != ov.Type.f32: - w = opset.convert(w, ov.Type.f32) - - compressed_w = w / s + if weight.get_element_type() != ov.Type.f32: + weight = opset.convert(weight, ov.Type.f32) + compressed_w = weight / scale num_bits = config.num_bits if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 level_low = 0 - level_high = 2**num_bits - 1 - compressed_w += zp if zp.get_element_type() == ov.Type.f32 else opset.convert(zp, ov.Type.f32) + level_high = 2 ** num_bits - 1 + compressed_w += zero_point elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]: dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4 level_low = -(2 ** (num_bits - 1)) @@ -275,9 +242,9 @@ def _get_compress_model( ov_results = [compressed_w] if len(ov_parameters) == 1: - ov_results.append(s) - if zp is not None: - ov_results.append(opset.convert(zp, compressed_w.get_element_type())) + ov_results.append(scale) + if zero_point is not None: + ov_results.append(opset.convert(zero_point, compressed_w.get_element_type())) if return_nodes: return ov_parameters, ov_results @@ -285,33 +252,39 @@ def _get_compress_model( model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if config.num_bits == 4 else run_model + run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model return partial(run_fn, ov_model_params, compiled_model) -def _get_compress_decompress_model( +@cache_results +def _build_compress_decompress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, - parameters: List[ov._pyopenvino.op.Parameter], - results: List[ov._pyopenvino.Node], -): - if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - if len(results) == 1: - compressed_w = results[0] - s, zp = parameters[1], parameters[2] + weight_shape: Tuple, + scale_shape: Tuple, + zero_point_shape: Optional[Tuple] = None, +) -> ModelCallable: + ov_parameters, ov_results = _build_compress_model( + config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True + ) + + if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + if len(ov_results) == 1: + compressed_w = ov_results[0] + s, zp = ov_parameters[1], ov_parameters[2] else: - compressed_w, s, zp = results + compressed_w, s, zp = ov_results decompressed_w = (compressed_w - zp) * s else: - if len(results) == 1: - compressed_w = results[0] - s = parameters[1] + if len(ov_results) == 1: + compressed_w = ov_results[0] + s = ov_parameters[1] else: - compressed_w, s = results + compressed_w, s = ov_results decompressed_w = compressed_w * s - model = ov.Model([decompressed_w], parameters) + model = ov.Model([decompressed_w], ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if config.num_bits == 4 else run_model + run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model 
return partial(run_fn, ov_model_params, compiled_model) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index dfc369b0c9a..3cc476f7c97 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -505,15 +505,15 @@ def do_int_quantization( ) if precomputed_scale is None: - compressed_weight, scale, zero_point = model(weight.data) + compressed_weight, scale, zero_point = model([weight]) # Scale is always in fp32 so there is no need to store it in ov.Tensor if scale.backend == TensorBackend.ov: scale = scale.to_backend(TensorBackend.numpy) else: - inputs = [weight.data, precomputed_scale.data] + inputs = [weight, precomputed_scale] if precomputed_zero_point is not None: - inputs += [precomputed_zero_point.data] - compressed_weight = Tensor(model(inputs)[0]) + inputs += [precomputed_zero_point] + compressed_weight = model(inputs)[0] scale, zero_point = precomputed_scale, precomputed_zero_point return compressed_weight, scale, zero_point @@ -552,9 +552,8 @@ def calculate_quantized_dequantized_weight( model = get_compress_decompress_weight_model(ov_model_params, config, weight_shape, scale_shape, zero_point_shape) - inputs = [weight.data, scale.data] + inputs = [weight, scale] if zero_point is not None: - inputs.append(zero_point.data) - results = model(inputs) - decompressed_weight = [Tensor(it) for it in results][0] + inputs.append(zero_point) + decompressed_weight = model(inputs)[0] return decompressed_weight From a43c5142d897c0842afd2b74ac26fcdff4b40212 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 16:59:00 +0200 Subject: [PATCH 10/73] Move results_cache into separate file --- .../weight_compression/lora_correction.py | 3 +- .../weight_compression/openvino_modeling.py | 66 +++++++------------ nncf/results_caching.py | 39 +++++++++++ weight_compression.py | 6 +- 4 files changed, 69 insertions(+), 45 deletions(-) create mode 100644 nncf/results_caching.py diff --git a/nncf/quantization/algorithms/weight_compression/lora_correction.py b/nncf/quantization/algorithms/weight_compression/lora_correction.py index 212eb5e79fb..18167b9704e 100644 --- a/nncf/quantization/algorithms/weight_compression/lora_correction.py +++ b/nncf/quantization/algorithms/weight_compression/lora_correction.py @@ -24,7 +24,8 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization, CompressedWeight +from nncf.quantization.algorithms.weight_compression.weight_lowering import CompressedWeight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization from nncf.tensor import Tensor diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index f419995959d..1aa5ccf65f5 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ 
b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -9,10 +9,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import inspect from dataclasses import dataclass from functools import partial -from typing import Optional, Tuple, Callable, List +from typing import Callable, List, Optional, Tuple import numpy as np import openvino as ov @@ -20,6 +19,8 @@ from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.results_caching import ResultsCacheContainer +from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -27,6 +28,9 @@ ModelCallable = Callable[[TensorList], TensorList] +OV_MODEL_CACHE = ResultsCacheContainer() + + @dataclass class OVModelParameters: input_dtype: TensorDataType @@ -38,54 +42,34 @@ class OVModelParameters: return_ov_tensors: bool = False def __hash__(self): - return hash((self.input_dtype, self.dynamic_shapes, self.recompile, self.release_memory, self.share_inputs, self.share_outputs, self.return_ov_tensors)) - -class CompiledModelCache: - def __init__(self): - self._cache = {} - - def clear(self): - self._cache.clear() - - def is_empty(self): - return len(self._cache) == 0 - - -COMPILED_MODEL_CACHE = CompiledModelCache() - - -def clear_cache(): - COMPILED_MODEL_CACHE.clear() - - -def cache_results(func): - def wrapper(*args, disable_caching=False, **kwargs): - sig = inspect.signature(func) - new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} - new_kwargs.update(kwargs) - cache_key = (func.__name__, frozenset(new_kwargs.items())) - cache = COMPILED_MODEL_CACHE._cache - if cache_key in cache: - return cache[cache_key] - result = func(*args, **kwargs) - if not disable_caching: - cache[cache_key] = result - return result - - return wrapper + return hash( + ( + self.input_dtype, + self.dynamic_shapes, + self.recompile, + self.release_memory, + self.share_inputs, + self.share_outputs, + self.return_ov_tensors, + ) + ) def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: # Returns results as numpy tensors inputs = [inp.data for inp in inputs] - outputs = compiled_model(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) + outputs = compiled_model( + inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs + ) outputs = [Tensor(outputs[i]) for i in range(len(outputs))] if ov_model_params.release_memory: compiled_model.release_memory() return outputs -def run_model_via_infer_request(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: +def run_model_via_infer_request( + ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList +) -> TensorList: # Returns results as ov tensors inputs = [inp.data for inp in inputs] infer_request = compiled_model.create_infer_request() @@ -153,7 +137,7 @@ def get_compress_decompress_weight_model( ) -@cache_results +@cache_results(OV_MODEL_CACHE) def _build_compress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, @@ -256,7 +240,7 @@ def _build_compress_model( return partial(run_fn, ov_model_params, compiled_model) -@cache_results +@cache_results(OV_MODEL_CACHE) def _build_compress_decompress_model( config: WeightCompressionConfig, ov_model_params: 
OVModelParameters, diff --git a/nncf/results_caching.py b/nncf/results_caching.py new file mode 100644 index 00000000000..447ed3966dd --- /dev/null +++ b/nncf/results_caching.py @@ -0,0 +1,39 @@ +import inspect + + +class ResultsCacheContainer: + def __init__(self): + self._cache = {} + + def clear(self): + self._cache.clear() + + def is_empty(self): + return len(self._cache) == 0 + + def __getitem__(self, item): + return self._cache[item] + + def __setitem__(self, key, value): + self._cache[key] = value + + def __contains__(self, item): + return item in self._cache + + +def cache_results(cache: ResultsCacheContainer): + def decorator(func): + def wrapper(*args, disable_caching=False, **kwargs): + sig = inspect.signature(func) + new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} + new_kwargs.update(kwargs) + cache_key = (func.__name__, frozenset(new_kwargs.items())) + if cache_key in cache: + return cache[cache_key] + result = func(*args, **kwargs) + if not disable_caching: + cache[cache_key] = result + return result + + return wrapper + return decorator diff --git a/weight_compression.py b/weight_compression.py index 5bfc3bd24d7..245016e8035 100644 --- a/weight_compression.py +++ b/weight_compression.py @@ -20,7 +20,7 @@ import openvino as ov import nncf -from nncf.quantization.algorithms.weight_compression.openvino_modeling import COMPILED_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from tools.memory_monitor import MemoryMonitor from tools.memory_monitor import MemoryType @@ -150,8 +150,8 @@ def main(args): time.sleep(0.5) before_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - if not COMPILED_MODEL_CACHE.is_empty(): - COMPILED_MODEL_CACHE.clear() + if not OV_MODEL_CACHE.is_empty(): + OV_MODEL_CACHE.clear() gc.collect() time.sleep(memory_monitors[0].interval * 10) after_cache_deletion = memory_monitors[2].get_data(True)[1][-1] From 1216f65573c2cd9f71b83ca8cf2612e84e430b8b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 25 Oct 2024 17:43:43 +0200 Subject: [PATCH 11/73] Implement astype for ov backend for bf16, u4, i4 --- .../weight_compression/openvino_backend.py | 12 ++--- .../weight_compression/openvino_modeling.py | 39 +++++++++----- .../weight_compression/weight_lowering.py | 4 -- nncf/results_caching.py | 1 + nncf/tensor/definitions.py | 2 + nncf/tensor/functions/ov.py | 53 +++++++++++++------ 6 files changed, 69 insertions(+), 42 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 840170f4f8b..5c328e372b0 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -329,17 +329,13 @@ def transform_model( mul_output = mul.output(0) for target_input in const_node.output(0).get_target_inputs(): target_input.replace_source_output(mul_output) - - # if compressed_weight.tensor.backend == TensorBackend.ov: - # if compressed_weight.tensor.dtype == TensorDataType.uint4: - # compressed_weight.tensor = compressed_weight.tensor.astype(TensorDataType.uint8) - # compressed_weight.tensor = compressed_weight.tensor.to_backend(TensorBackend.numpy) if lora_correction_algo is not None and lora_correction_algo.is_applicable(wc_params): if weight.backend == TensorBackend.ov: - if weight.dtype == TensorDataType.bfloat16: - weight = weight.astype(TensorDataType.float32) weight = 
weight.to_backend(TensorBackend.numpy) - # TODO: cast int4 ov tensor too? + if compressed_weight.tensor.backend == TensorBackend.ov: + compressed_weight.tensor = compressed_weight.tensor.to_backend(TensorBackend.numpy) + if compressed_weight.zero_point.backend == TensorBackend.ov: + compressed_weight.zero_point = compressed_weight.zero_point.to_backend(TensorBackend.numpy) adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params) self.insert_adapters(wc_params, *adapters, int8_lora=lora_correction_algo.use_int8_adapters) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 1aa5ccf65f5..1008e872ba7 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -23,6 +23,7 @@ from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType +from nncf.tensor.functions.ov import DTYPE_MAP as OV_DTYPE_MAP TensorList = List[Tensor] ModelCallable = Callable[[TensorList], TensorList] @@ -57,7 +58,8 @@ def __hash__(self): def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: # Returns results as numpy tensors - inputs = [inp.data for inp in inputs] + if any(isinstance(it, Tensor) for it in inputs): + inputs = [inp.data for inp in inputs] outputs = compiled_model( inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs ) @@ -71,7 +73,8 @@ def run_model_via_infer_request( ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList ) -> TensorList: # Returns results as ov tensors - inputs = [inp.data for inp in inputs] + if any(isinstance(it, Tensor) for it in inputs): + inputs = [inp.data for inp in inputs] infer_request = compiled_model.create_infer_request() infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] @@ -100,7 +103,8 @@ def get_compress_weight_model( if zero_point_shape is not None: zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) - ov_model_params.return_ov_tensors = config.num_bits == 4 + if config.num_bits == 4: + ov_model_params.return_ov_tensors = True return _build_compress_model( config, @@ -147,15 +151,7 @@ def _build_compress_model( reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, ) -> ModelCallable: - if ov_model_params.input_dtype == TensorDataType.float32: - input_dtype = ov.Type.f32 - elif ov_model_params.input_dtype == TensorDataType.float16: - input_dtype = ov.Type.f16 - elif ov_model_params.input_dtype == TensorDataType.bfloat16: - input_dtype = ov.Type.bf16 - else: - raise Exception - weight = opset.parameter(weight_shape, name="w", dtype=input_dtype) + weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) ov_parameters = [weight] if scale_shape is not None: @@ -212,7 +208,7 @@ def _build_compress_model( if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 level_low = 0 - level_high = 2 ** num_bits - 1 + level_high = 2**num_bits - 1 compressed_w += zero_point elif config.mode in [CompressWeightsMode.INT8_SYM, 
config.mode.INT4_SYM]: dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4 @@ -272,3 +268,20 @@ def _build_compress_decompress_model( run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model return partial(run_fn, ov_model_params, compiled_model) + + +def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: + if ov_model_params.dynamic_shapes: + arg_shape = (-1,) * len(arg_shape) + return _build_astype_model(ov_model_params, arg_shape, dtype) + + +@cache_results(OV_MODEL_CACHE) +def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: + arg = opset.parameter(arg_shape, dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) + res = opset.convert(arg, OV_DTYPE_MAP[dtype]) + model = ov.Model([res], [arg]) + compiled_model = ov.compile_model(model, device_name="CPU") + + run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model + return partial(run_fn, ov_model_params, compiled_model) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 3cc476f7c97..3af76eed391 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -380,8 +380,6 @@ def compress_weight( """ if not config.is_integer(): if weight.backend == TensorBackend.ov: - if weight.dtype == TensorDataType.bfloat16: - weight = weight.astype(TensorDataType.float32) weight = weight.to_backend(TensorBackend.numpy) compressed_weight, scale = calculate_normalized_weight_and_fp4_scale( @@ -463,8 +461,6 @@ def do_int_quantization( # Reference implementation if weight.backend == TensorBackend.ov: - if weight.dtype == TensorDataType.bfloat16: - weight = weight.astype(TensorDataType.float32) weight = weight.to_backend(TensorBackend.numpy) if weight.dtype != TensorDataType.float32: diff --git a/nncf/results_caching.py b/nncf/results_caching.py index 447ed3966dd..4a991a36be7 100644 --- a/nncf/results_caching.py +++ b/nncf/results_caching.py @@ -36,4 +36,5 @@ def wrapper(*args, disable_caching=False, **kwargs): return result return wrapper + return decorator diff --git a/nncf/tensor/definitions.py b/nncf/tensor/definitions.py index a4849e558e3..67b3bf7ed5e 100644 --- a/nncf/tensor/definitions.py +++ b/nncf/tensor/definitions.py @@ -36,6 +36,8 @@ class TensorDataType(Enum): int32 = auto() int64 = auto() uint8 = auto() + uint4 = auto() + int4 = auto() def is_float(self): """ diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index fbc28418fb9..f8cd0431f83 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -28,26 +28,34 @@ TensorDataType.int32: ov.Type.i32, TensorDataType.int64: ov.Type.i64, TensorDataType.uint8: ov.Type.u8, + TensorDataType.uint4: ov.Type.u4, + TensorDataType.int4: ov.Type.i4, } DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} -def _bf16_to_fp32(a: ov.Tensor) -> ov.Tensor: - assert a.get_element_type() == ov.Type.bf16 and a.data.dtype == np.float16 +def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model - a = a.data.view(np.uint16) + a_dtype = DTYPE_MAP_REV[a.get_element_type()] + 
assert a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4] - res = a.astype(np.uint32) - res = ( - ((res & 0x8000) << 16) # Move sign bit to bit 31 - | ((res & 0x7F80) << 16) # Move exponent to bits 30-23 - | ((res & 0x007F) << 16) - ) # Move fraction to bits 22-0 - res = res.view(np.float32) - - res = ov.Tensor(res) - return res + model = get_astype_model( + OVModelParameters( + input_dtype=a_dtype, + dynamic_shapes=True, + recompile=False, + release_memory=True, + share_inputs=True, + share_outputs=True, + return_ov_tensors=True, + ), + a.shape, + dtype, + ) + return model([a])[0].data @numeric.backend.register(ov.Tensor) @@ -57,10 +65,10 @@ def _(a: ov.Tensor) -> TensorBackend: @numeric.astype.register(ov.Tensor) def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: - if dtype == TensorDataType.bfloat16: - raise ValueError("Not supported conversion") - if a.get_element_type() == ov.Type.bf16: - a = _bf16_to_fp32(a) + a_dtype = DTYPE_MAP_REV[a.get_element_type()] + if a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + return _ov_astype(a, dtype) + return ov.Tensor(a.data.astype(NP_DTYPE_MAP[dtype])) @@ -83,4 +91,15 @@ def _(a: ov.Tensor, shape: Union[int, Tuple[int, ...]]) -> ov.Tensor: def _(a: ov.Tensor, b: TensorBackend) -> np.ndarray: if b != TensorBackend.numpy: raise ValueError("Not supported backend") + + # Cannot convert bfloat16, uint4, int4 to numpy directly + a_dtype = DTYPE_MAP_REV[a.get_element_type()] + if a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + dtype = TensorDataType.float32 + if a_dtype == TensorDataType.uint4: + dtype = TensorDataType.uint8 + elif a_dtype == TensorDataType.int4: + dtype = TensorDataType.int8 + a = _ov_astype(a, dtype) + return a.data From 8611b75dc5340e273cfa63a3ce696925aec2e877 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Sat, 26 Oct 2024 15:38:22 +0200 Subject: [PATCH 12/73] Experiments --- run_weight_compression.py | 497 ++++++++++++++++++++------------------ weight_compression.py | 2 +- 2 files changed, 268 insertions(+), 231 deletions(-) diff --git a/run_weight_compression.py b/run_weight_compression.py index d7eefec79ab..74d752ef4de 100644 --- a/run_weight_compression.py +++ b/run_weight_compression.py @@ -16,227 +16,264 @@ def stream_handler(stream, target_file): parent_log_dir = Path("compression_logs") experiment_params = [ - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / 
"reimpl_24102024_acc/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024_acc/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", ""), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic 
--recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/tiny-llama", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / 
"reimpl_24102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/phi3", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", 
"--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - - - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym 
--release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", 
parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym "), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / 
"Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym "), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym "), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), + (parent_model_dir / 
"tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), + 
(parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), + # # + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), + # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + # # + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), + # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), + + + + + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", 
"--save-model --compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / 
"reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), + (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", 
"--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), + # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), + + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode 
int4_asym --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), @@ -264,21 +301,21 @@ def stream_handler(stream, target_file): # # # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), # # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym "), + # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym"), # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), # (parent_model_dir / 
"phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), # # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym "), + # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym"), # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), @@ -286,11 +323,11 @@ def stream_handler(stream, target_file): # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102024/int4/tiny-llama", "--save-model --compression-mode int4_asym "), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), + # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), ] for model_dir, log_dir, params in experiment_params: @@ -322,7 +359,7 @@ def stream_handler(stream, target_file): 
evaluated_paths = set() for _, log_dir, _ in experiment_params: - for model_path in log_dir.rglob("**/*"): + for model_path in sorted(log_dir.rglob("**/*")): model_path: Path if model_path.suffix != ".xml": continue diff --git a/weight_compression.py b/weight_compression.py index 245016e8035..bae1948145c 100644 --- a/weight_compression.py +++ b/weight_compression.py @@ -191,7 +191,7 @@ def main(args): f.write( f"{model_path}," f"{model_dtype.upper()}," - f"{'NumPy' if numpy_compression else 'OV'}," + f"{'-' if numpy_compression else 'OV'}," f"{'-' if numpy_compression else recompile}," f"{'-' if numpy_compression else release_memory}," f"{'-' if numpy_compression else share_outputs}," From 071866834cb6f605fcca3a0557698071c88f8e83 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Sat, 26 Oct 2024 15:40:31 +0200 Subject: [PATCH 13/73] Support case of (weight, scale) -> (c_weight, zp) --- nncf/openvino/graph/node_utils.py | 4 +- .../weight_compression/openvino_backend.py | 6 +- .../weight_compression/openvino_modeling.py | 167 ++++++++++-------- .../weight_compression/weight_lowering.py | 34 +++- nncf/results_caching.py | 11 ++ 5 files changed, 136 insertions(+), 86 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 39056d65af5..33d67140d16 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -8,7 +8,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os + from typing import Any, Callable, Dict, List, Optional, Tuple, Type import numpy as np @@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int: return cnt_if_op(model, 0) -def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = True) -> np.ndarray: +def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = False) -> np.ndarray: """ Returns the constant tensor for the node. This method is applicable only for the floating-point constant data. diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5c328e372b0..8fbd0e2935a 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -299,8 +299,10 @@ def transform_model( const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() weight = get_const_value(const_node, cast_bf16_to_fp32=False) - if const_dtype == ov.Type.bf16: - weight = ov.Tensor(weight, weight.shape, ov.Type.bf16) + # Creation of ov.Tensor is required for two reasons: + # 1. To be able to process BF16 weight properly + # 2. 
To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed + weight = ov.Tensor(weight, weight.shape, const_dtype) weight = Tensor(weight) should_add_convert_node = False diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 1008e872ba7..2f223d71d06 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -11,7 +11,7 @@ from dataclasses import dataclass from functools import partial -from typing import Callable, List, Optional, Tuple +from typing import Callable, List, Optional, Tuple, Union import numpy as np import openvino as ov @@ -35,6 +35,7 @@ @dataclass class OVModelParameters: input_dtype: TensorDataType + output_dtype: Optional[TensorDataType] = None dynamic_shapes: bool = False recompile: bool = False release_memory: bool = True @@ -56,30 +57,28 @@ def __hash__(self): ) -def run_model(ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList) -> TensorList: - # Returns results as numpy tensors +def run_model( + ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, return_ov_tensors: bool, inputs: TensorList +) -> TensorList: if any(isinstance(it, Tensor) for it in inputs): inputs = [inp.data for inp in inputs] - outputs = compiled_model( - inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs - ) - outputs = [Tensor(outputs[i]) for i in range(len(outputs))] - if ov_model_params.release_memory: - compiled_model.release_memory() - return outputs + if return_ov_tensors: + infer_request = compiled_model.create_infer_request() + infer_request.infer( + inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs + ) + outputs = [infer_request.get_output_tensor(i) for i in range(len(infer_request.results))] + else: + outputs = compiled_model( + inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs + ) + outputs = [outputs[i] for i in range(len(outputs))] + outputs = [Tensor(it) for it in outputs] -def run_model_via_infer_request( - ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList -) -> TensorList: - # Returns results as ov tensors - if any(isinstance(it, Tensor) for it in inputs): - inputs = [inp.data for inp in inputs] - infer_request = compiled_model.create_infer_request() - infer_request.infer(inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs) - outputs = [Tensor(infer_request.get_output_tensor(i)) for i in range(len(infer_request.results))] if ov_model_params.release_memory: compiled_model.release_memory() + return outputs @@ -93,8 +92,6 @@ def get_compress_weight_model( ) -> ModelCallable: if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") - # if (scale_shape is None) != (reduction_axes is not None): - # raise Exception("Either one of scale_shape or reduction_axes must be provided at the same time.") if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) @@ -103,9 +100,6 @@ def get_compress_weight_model( if zero_point_shape is not None: zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) - if config.num_bits == 4: - ov_model_params.return_ov_tensors = True - return 
_build_compress_model( config, ov_model_params, @@ -150,28 +144,29 @@ def _build_compress_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, -) -> ModelCallable: +) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) ov_parameters = [weight] - if scale_shape is not None: - # Compute only the compressed weight + mode = config.mode + asym_mode = mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] + num_bits = config.num_bits + eps = np.finfo(np.float32).eps + if asym_mode: + level_low = 0 + level_high = 2**num_bits - 1 + else: + level_low = -(2 ** (num_bits - 1)) + level_high = 2 ** (num_bits - 1) - 1 + min_values = None + if scale_shape is not None: + # Scale is given as an input scale = opset.parameter(scale_shape, name="s", dtype=ov.Type.f32) ov_parameters.append(scale) - - zero_point = None - if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) - ov_parameters.append(zero_point) - zero_point = opset.convert(zero_point, ov.Type.f32) else: - # Compute compressed weight, scale and, possibly, zero point - - mode = config.mode - num_bits = config.num_bits - eps = np.finfo(np.float32).eps - if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + # Compute scale + if asym_mode: min_values = opset.reduce_min( weight, reduction_axes=reduction_axes, keep_dims=True ) # [a1, r, a2] -> [a1, 1, a2] @@ -180,49 +175,64 @@ def _build_compress_model( ) # [a1, r, a2] -> [a1, 1, a2] min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) - level_low = 0 - level_high = 2**num_bits - 1 levels = level_high - level_low + 1 scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32) scale = opset.select(opset.abs(scale) < eps, eps, scale) - - zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) - zero_point = opset.clamp(zero_point, level_low, level_high) else: - zero_point = None - level_high = opset.constant(2 ** (num_bits - 1), ov.Type.f32) - w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True)) w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True) w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.constant(0, ov.Type.f32) - w_max) - scale /= level_high + scale /= opset.constant(level_high, ov.Type.f32) scale = opset.select(opset.abs(scale) < eps, eps, scale) + zero_point = None + if zero_point_shape is not None: + # Zero point is given as an input + zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) + ov_parameters.append(zero_point) + zero_point = opset.convert(zero_point, ov.Type.f32) + elif mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + # Compute zero point + if min_values is None: + min_values = opset.reduce_min( + weight, reduction_axes=reduction_axes, keep_dims=True + ) # [a1, r, a2] -> [a1, 1, a2] + min_values = opset.convert(min_values, ov.Type.f32) + + level_low = 0 + level_high = 2**num_bits - 1 + zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) + zero_point = opset.clamp(zero_point, level_low, level_high) + if 
weight.get_element_type() != ov.Type.f32: weight = opset.convert(weight, ov.Type.f32) compressed_w = weight / scale - num_bits = config.num_bits - if config.mode in [CompressWeightsMode.INT8_ASYM, config.mode.INT4_ASYM]: - dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 - level_low = 0 - level_high = 2**num_bits - 1 + if asym_mode: + if ov_model_params.output_dtype is not None: + dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] + else: + dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 compressed_w += zero_point - elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]: - dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.u4 - level_low = -(2 ** (num_bits - 1)) - level_high = 2 ** (num_bits - 1) - 1 else: - raise Exception + if ov_model_params.output_dtype is not None: + dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] + else: + dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4 compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high) compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights") ov_results = [compressed_w] - if len(ov_parameters) == 1: - ov_results.append(scale) + if len(ov_parameters) != 3: + # Two cases: + # 1. weight -> compressed_weight, scale, (zero_point) + # 2. weight, scale -> compressed_weight, (zero_point) + if len(ov_parameters) == 1: + ov_results.append(scale) + if zero_point is not None: ov_results.append(opset.convert(zero_point, compressed_w.get_element_type())) @@ -232,8 +242,7 @@ def _build_compress_model( model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model - return partial(run_fn, ov_model_params, compiled_model) + return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) @cache_results(OV_MODEL_CACHE) @@ -249,25 +258,32 @@ def _build_compress_decompress_model( ) if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: - if len(ov_results) == 1: - compressed_w = ov_results[0] - s, zp = ov_parameters[1], ov_parameters[2] + if len(ov_parameters) == 1: + # weight -> compressed_weight, scale, zero_point + compressed_w, scale, zero_point = ov_results + elif len(ov_parameters) == 2: + # weight, scale -> compressed_weight, zero_point + compressed_w, zero_point = ov_results + scale = ov_parameters[1] else: - compressed_w, s, zp = ov_results - decompressed_w = (compressed_w - zp) * s - else: - if len(ov_results) == 1: + # weight, scale, zero_point -> compressed_weight compressed_w = ov_results[0] - s = ov_parameters[1] + scale, zero_point = ov_parameters[1:] + decompressed_w = opset.convert(opset.convert(compressed_w, ov.Type.i32) - zero_point, ov.Type.f32) * scale + else: + if len(ov_parameters) == 1: + # weight -> compressed_weight, scale + compressed_w, scale = ov_results else: - compressed_w, s = ov_results - decompressed_w = compressed_w * s + # weight, scale -> compressed_weight + compressed_w = ov_results[0] + scale = ov_parameters[1] + decompressed_w = opset.convert(compressed_w, ov.Type.f32) * scale model = ov.Model([decompressed_w], ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model - return partial(run_fn, ov_model_params, compiled_model) + return 
partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: @@ -283,5 +299,4 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dt model = ov.Model([res], [arg]) compiled_model = ov.compile_model(model, device_name="CPU") - run_fn = run_model_via_infer_request if ov_model_params.return_ov_tensors else run_model - return partial(run_fn, ov_model_params, compiled_model) + return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 3af76eed391..8e0c4cd403d 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -481,14 +481,23 @@ def do_int_quantization( scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape + asym_mode = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] if ov_model_params is None: - # ov_model_params = OVModelParameters(weight.dtype) + output_dtype = None + return_ov_tensors = False + if config.num_bits == 4: + if weight.backend == TensorBackend.ov: + return_ov_tensors = weight.backend == TensorBackend.ov + else: + output_dtype = TensorDataType.uint8 if asym_mode else TensorDataType.int8 ov_model_params = OVModelParameters( - weight.dtype, + input_dtype=weight.dtype, + output_dtype=output_dtype, dynamic_shapes=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), recompile=bool(int(os.environ.get("RECOMPILE", "0"))), release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), + return_ov_tensors=return_ov_tensors, ) model = get_compress_weight_model( @@ -501,14 +510,27 @@ def do_int_quantization( ) if precomputed_scale is None: - compressed_weight, scale, zero_point = model([weight]) + # weight -> compressed_weight, scale, (zero_point) + results = model([weight]) + if asym_mode: + compressed_weight, scale, zero_point = results + else: + compressed_weight, scale = results + zero_point = None + # Scale is always in fp32 so there is no need to store it in ov.Tensor if scale.backend == TensorBackend.ov: scale = scale.to_backend(TensorBackend.numpy) + elif precomputed_zero_point is None and asym_mode: + # weight, scale -> compressed_weight, zero_point + compressed_weight, zero_point = model([weight, precomputed_scale]) + scale = precomputed_scale else: - inputs = [weight, precomputed_scale] - if precomputed_zero_point is not None: - inputs += [precomputed_zero_point] + inputs = ( + [weight, precomputed_scale] + if precomputed_zero_point is None + else [weight, precomputed_scale, precomputed_zero_point] + ) compressed_weight = model(inputs)[0] scale, zero_point = precomputed_scale, precomputed_zero_point diff --git a/nncf/results_caching.py b/nncf/results_caching.py index 4a991a36be7..d1d16ea775b 100644 --- a/nncf/results_caching.py +++ b/nncf/results_caching.py @@ -1,3 +1,14 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import inspect From 283a821805e97d16ede321105a4ce97fb77ef7f9 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 28 Oct 2024 09:51:34 +0100 Subject: [PATCH 14/73] SE improvements --- .../algorithms/weight_compression/awq.py | 3 +- .../algorithms/weight_compression/config.py | 5 ++ .../weight_compression/mixed_precision.py | 2 +- .../weight_compression/openvino_modeling.py | 12 ++--- .../weight_compression/scale_estimation.py | 10 ++-- .../weight_compression/weight_lowering.py | 48 +++++++++++++------ .../quantization/test_weights_compression.py | 4 +- 7 files changed, 54 insertions(+), 30 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/awq.py b/nncf/quantization/algorithms/weight_compression/awq.py index 1b43f5339c4..4d78f8f8f4f 100644 --- a/nncf/quantization/algorithms/weight_compression/awq.py +++ b/nncf/quantization/algorithms/weight_compression/awq.py @@ -261,8 +261,9 @@ def apply( g_compressed_weighs = do_nf4_quantization(weights_to_fake_quantize, g_c_scale) g_decompressed_weighs = do_nf4_dequantization(g_compressed_weighs, g_c_scale) else: + # TODO: Improve by replacing with quantize_dequantize g_compressed_weighs, g_c_scale, g_c_zp = do_int_quantization( - weights_to_fake_quantize, reduction_axis, awq_config + weights_to_fake_quantize, awq_config, reduction_axis ) g_decompressed_weighs = do_int_dequantization(g_compressed_weighs, g_c_scale, g_c_zp) sacts = gacts / fns.unsqueeze(cur_scale, 1) diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index ce512331349..03590fc5ff3 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -40,6 +40,11 @@ def num_bits(self): """ return 8 if self.mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM] else 4 + @property + def is_int_asym(self): + return self.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT8_ASYM] + + @property def is_integer(self): """ :return: True if compression type in integer, else False. 
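Usage note for the new is_int_asym property and the is_integer method-to-property change introduced by this commit (a minimal illustrative sketch, not part of the patch; the chosen mode and group_size values are arbitrary):

    from nncf import CompressWeightsMode
    from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig

    config = WeightCompressionConfig(mode=CompressWeightsMode.INT4_ASYM, group_size=128)

    # Replaces the repeated check:
    #   config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]
    assert config.is_int_asym
    # is_integer is now a property rather than a method, so call sites change from
    # config.is_integer() to config.is_integer; it remains True for every mode except NF4/E2M1.
    assert config.is_integer
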
diff --git a/nncf/quantization/algorithms/weight_compression/mixed_precision.py b/nncf/quantization/algorithms/weight_compression/mixed_precision.py index 53d44c97748..247f8daf6cf 100644 --- a/nncf/quantization/algorithms/weight_compression/mixed_precision.py +++ b/nncf/quantization/algorithms/weight_compression/mixed_precision.py @@ -329,7 +329,7 @@ def _calc_weight_sensitivity( if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - compressed_weights, scale, zero_point = do_int_quantization(weight, reduction_axes, backup_config) + compressed_weights, scale, zero_point = do_int_quantization(weight, backup_config, reduction_axes) decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) decompressed_weight = decompressed_weight.reshape(orig_shape) return fns.linalg.norm(decompressed_weight - weight, ord="fro").item() diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 2f223d71d06..06e1f2ddd70 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -148,11 +148,9 @@ def _build_compress_model( weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) ov_parameters = [weight] - mode = config.mode - asym_mode = mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] num_bits = config.num_bits eps = np.finfo(np.float32).eps - if asym_mode: + if config.is_int_asym: level_low = 0 level_high = 2**num_bits - 1 else: @@ -166,7 +164,7 @@ def _build_compress_model( ov_parameters.append(scale) else: # Compute scale - if asym_mode: + if config.is_int_asym: min_values = opset.reduce_min( weight, reduction_axes=reduction_axes, keep_dims=True ) # [a1, r, a2] -> [a1, 1, a2] @@ -193,7 +191,7 @@ def _build_compress_model( zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) ov_parameters.append(zero_point) zero_point = opset.convert(zero_point, ov.Type.f32) - elif mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + elif config.is_int_asym: # Compute zero point if min_values is None: min_values = opset.reduce_min( @@ -210,7 +208,7 @@ def _build_compress_model( weight = opset.convert(weight, ov.Type.f32) compressed_w = weight / scale - if asym_mode: + if config.is_int_asym: if ov_model_params.output_dtype is not None: dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] else: @@ -257,7 +255,7 @@ def _build_compress_decompress_model( config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True ) - if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + if config.is_int_asym: if len(ov_parameters) == 1: # weight -> compressed_weight, scale, zero_point compressed_w, scale, zero_point = ov_results diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index e294c6e0f5d..68e161b5a8e 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -28,7 +28,6 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale from 
nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization @@ -221,7 +220,8 @@ def calculate_quantization_params( q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) zp = None else: - compressed_weights, scale, zp = do_int_quantization(original_weight, reduction_axis, cur_config) + # TODO: Improve by replacing with quantize_dequantize with additional outputs + compressed_weights, scale, zp = do_int_quantization(original_weight, cur_config, reduction_axis) if zp is not None: zp = zp.astype(scale.dtype) q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) @@ -297,7 +297,8 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: - out = calculate_quantized_weight(original_weight, config, near_to_ideal_scale, zp) + out, _, _ = do_int_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale, + precomputed_zero_point=zp) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) @@ -310,7 +311,8 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, scaled_scale) else: - out = calculate_quantized_weight(original_weight, config, scaled_scale, zp) + out, _, _ = do_int_quantization(original_weight, config, precomputed_scale=scaled_scale, + precomputed_zero_point=zp) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 8e0c4cd403d..d9db96e7e77 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -270,14 +270,13 @@ def calculate_integer_quantization_params( :param config: Weight compression configuration. :return: Scale and zero point tensors. 
""" - mode = config.mode - assert config.is_integer(), "The function supports integer quantization only" + assert config.is_integer, "The function supports integer quantization only" num_bits = config.num_bits if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + if config.is_int_asym: level_low = 0 level_high = 2**num_bits - 1 min_values = fns.min(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] @@ -314,7 +313,7 @@ def calculate_quantized_weight( scale = scale.astype(TensorDataType.float32) num_bits = config.num_bits - asym_quant = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] + asym_quant = config.is_int_asym dtype = TensorDataType.uint8 if asym_quant else TensorDataType.int8 level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 @@ -349,7 +348,7 @@ def get_integer_quantization_error( weight = weight.astype(TensorDataType.float32) compressed_weights, scale, zero_point = do_int_quantization( - weight, reduction_axes, config, invert_division=invert_division + weight, config, reduction_axes, invert_division=invert_division ) decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) @@ -378,7 +377,7 @@ def compress_weight( :param precomputed_zero_point: Precomputed zero point. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ - if not config.is_integer(): + if not config.is_integer: if weight.backend == TensorBackend.ov: weight = weight.to_backend(TensorBackend.numpy) @@ -387,7 +386,7 @@ def compress_weight( ) return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_int_quantization( - weight, reduction_axes, config, precomputed_scale, precomputed_zero_point, invert_division=invert_division + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, invert_division=invert_division ) return CompressedWeight(compressed_weight, scale, zero_point) @@ -436,14 +435,28 @@ def do_int_dequantization( def do_int_quantization( weight: Tensor, - reduction_axes: ReductionAxes, config: WeightCompressionConfig, + reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, invert_division: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, ): - assert config.is_integer(), "The function supports integer quantization only" + """ + Performs integer quantization on the given weight tensor. + + :param weight: The weight tensor to quantize. + :param config: The weight compression configuration. + :param reduction_axes: Axes along which to reduce (collect) statistics (e.g., min, max). Not required if + precomputed scale (and zero point) are provided. + :param precomputed_scale: Optional precomputed scale tensor. + :param precomputed_zero_point: Optional precomputed zero point tensor. + :param invert_division: Whether to apply inversion for scale and then multiply by weights instead of division. + Defaults to False. + :param ov_model_params: OpenVINO model parameters for acceleration. + :return: A tuple containing the compressed weights, scale, and zero point tensors. 
+ """ + assert config.is_integer, "The function supports integer quantization only" accelerate_through_ov = ( is_openvino_available() @@ -453,7 +466,8 @@ def do_int_quantization( if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") - if config.group_size != -1: + # When reduction axes are not provided, assuming that the weights are already reshaped + if config.group_size != -1 and reduction_axes is not None: # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) @@ -467,7 +481,7 @@ def do_int_quantization( weight = weight.astype(TensorDataType.float32) scale, zero_point = None, None - if precomputed_zero_point is None or precomputed_zero_point is None: + if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config) if precomputed_scale is not None: scale = precomputed_scale @@ -481,7 +495,6 @@ def do_int_quantization( scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape - asym_mode = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] if ov_model_params is None: output_dtype = None return_ov_tensors = False @@ -489,7 +502,12 @@ def do_int_quantization( if weight.backend == TensorBackend.ov: return_ov_tensors = weight.backend == TensorBackend.ov else: - output_dtype = TensorDataType.uint8 if asym_mode else TensorDataType.int8 + output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 + # ov_model_params = OVModelParameters( + # input_dtype=weight.dtype, + # output_dtype=output_dtype, + # return_ov_tensors=return_ov_tensors, + # ) ov_model_params = OVModelParameters( input_dtype=weight.dtype, output_dtype=output_dtype, @@ -512,7 +530,7 @@ def do_int_quantization( if precomputed_scale is None: # weight -> compressed_weight, scale, (zero_point) results = model([weight]) - if asym_mode: + if config.is_int_asym: compressed_weight, scale, zero_point = results else: compressed_weight, scale = results @@ -521,7 +539,7 @@ def do_int_quantization( # Scale is always in fp32 so there is no need to store it in ov.Tensor if scale.backend == TensorBackend.ov: scale = scale.to_backend(TensorBackend.numpy) - elif precomputed_zero_point is None and asym_mode: + elif precomputed_zero_point is None and config.is_int_asym: # weight, scale -> compressed_weight, zero_point compressed_weight, zero_point = model([weight, precomputed_scale]) scale = precomputed_scale diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 5d89c75e542..ccf539aeb86 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1031,7 +1031,7 @@ def test_np_ov_compression_decompression(mode): config = WeightCompressionConfig(mode) - compressed_weighs, scale, zp = do_int_quantization(w, -1, config, invert_scale=True) + compressed_weighs, scale, zp = do_int_quantization(w, config, -1, invert_division=True) decompressed_weighs = do_int_dequantization(compressed_weighs, scale, zp) compressed_weighs = compressed_weighs.data @@ -1067,7 +1067,7 @@ def 
test_compressed_weighs_range(mode, data): w = Tensor(data) config = WeightCompressionConfig(mode=mode) - compressed_weighs, _, _ = do_int_quantization(w, -1, config) + compressed_weighs, _, _ = do_int_quantization(w, config, -1) assert np.allclose(np.abs(compressed_weighs.data), np.abs(w.data)) From 69648449a7cc0d4f642b14ab8e12f8a69912e08d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 28 Oct 2024 13:34:23 +0100 Subject: [PATCH 15/73] Accelerate AWQ --- .../algorithms/weight_compression/awq.py | 10 +++--- .../weight_compression/openvino_modeling.py | 14 +++++--- .../weight_compression/scale_estimation.py | 10 +++--- .../weight_compression/weight_lowering.py | 32 ++++++++++++------- 4 files changed, 41 insertions(+), 25 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/awq.py b/nncf/quantization/algorithms/weight_compression/awq.py index 4d78f8f8f4f..ea59258b940 100644 --- a/nncf/quantization/algorithms/weight_compression/awq.py +++ b/nncf/quantization/algorithms/weight_compression/awq.py @@ -31,11 +31,11 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_nf4_scale -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization from nncf.quantization.passes import transform_to_inference_graph +from nncf.tensor import TensorDataType from nncf.tensor import functions as fns TModel = TypeVar("TModel") @@ -241,7 +241,7 @@ def apply( offset = gi * group_size gscale = s[offset : offset + group_size] - a_min = fns.quantile(gscale, 0.1) + a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32) a_max = 1e2 gscale = fns.clip(gscale, a_min=a_min, a_max=a_max) @@ -261,11 +261,9 @@ def apply( g_compressed_weighs = do_nf4_quantization(weights_to_fake_quantize, g_c_scale) g_decompressed_weighs = do_nf4_dequantization(g_compressed_weighs, g_c_scale) else: - # TODO: Improve by replacing with quantize_dequantize - g_compressed_weighs, g_c_scale, g_c_zp = do_int_quantization( + g_decompressed_weighs = calculate_quantized_dequantized_weight( weights_to_fake_quantize, awq_config, reduction_axis ) - g_decompressed_weighs = do_int_dequantization(g_compressed_weighs, g_c_scale, g_c_zp) sacts = gacts / fns.unsqueeze(cur_scale, 1) cur_out = fns.matmul(g_decompressed_weighs, sacts) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 06e1f2ddd70..c26c095a70c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -116,8 +116,9 @@ def get_compress_decompress_weight_model( ov_model_params: OVModelParameters, config: WeightCompressionConfig, weight_shape: Tuple, - scale_shape: Optional[Tuple], + scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, + reduction_axes: Optional[Tuple] = 
None, ) -> ModelCallable: if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) @@ -131,6 +132,7 @@ def get_compress_decompress_weight_model( weight_shape, scale_shape, zero_point_shape, + reduction_axes, disable_caching=ov_model_params.recompile, ) @@ -248,11 +250,12 @@ def _build_compress_decompress_model( config: WeightCompressionConfig, ov_model_params: OVModelParameters, weight_shape: Tuple, - scale_shape: Tuple, + scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, + reduction_axes: Optional[Tuple] = None, ) -> ModelCallable: ov_parameters, ov_results = _build_compress_model( - config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes=None, return_nodes=True + config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) if config.is_int_asym: @@ -267,7 +270,10 @@ def _build_compress_decompress_model( # weight, scale, zero_point -> compressed_weight compressed_w = ov_results[0] scale, zero_point = ov_parameters[1:] - decompressed_w = opset.convert(opset.convert(compressed_w, ov.Type.i32) - zero_point, ov.Type.f32) * scale + + decompressed_w = scale * opset.convert( + opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32), ov.Type.f32 + ) else: if len(ov_parameters) == 1: # weight -> compressed_weight, scale diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 68e161b5a8e..bb814724df3 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -297,8 +297,9 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: - out, _, _ = do_int_quantization(original_weight, config, precomputed_scale=near_to_ideal_scale, - precomputed_zero_point=zp) + out, _, _ = do_int_quantization( + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + ) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) @@ -311,8 +312,9 @@ def calculate_quantization_params( if config.mode == CompressWeightsMode.NF4: out = do_nf4_quantization(original_weight, scaled_scale) else: - out, _, _ = do_int_quantization(original_weight, config, precomputed_scale=scaled_scale, - precomputed_zero_point=zp) + out, _, _ = do_int_quantization( + original_weight, config, precomputed_scale=scaled_scale, precomputed_zero_point=zp + ) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index d9db96e7e77..2f70e7e47a9 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -558,8 +558,9 @@ def do_int_quantization( def calculate_quantized_dequantized_weight( weight: Tensor, config: WeightCompressionConfig, - scale: Tensor, - zero_point: Optional[Tensor] = None, + reduction_axes: Optional[ReductionAxes] = None, + precomputed_scale: Optional[Tensor] = None, + precomputed_zero_point: Optional[Tensor] = None, invert_division: 
Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, ) -> Tensor: @@ -573,23 +574,32 @@ def calculate_quantized_dequantized_weight( if not accelerate_through_ov: # Reference implementation - compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) + if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): + compressed_weight, scale, zero_point = do_int_quantization( + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, invert_division + ) + else: + scale = precomputed_scale if precomputed_scale is not None else None + zero_point = precomputed_zero_point if precomputed_zero_point is not None else None + compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) return decompressed_weight weight_shape = weight.shape - scale_shape = scale.shape - zero_point_shape = None if zero_point is None else zero_point.shape + scale_shape = precomputed_scale.shape if precomputed_scale is not None else None + zero_point_shape = precomputed_zero_point.shape if precomputed_zero_point is not None else None if ov_model_params is None: ov_model_params = OVModelParameters(weight.dtype) - if config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: - ov_model_params.dynamic = False - model = get_compress_decompress_weight_model(ov_model_params, config, weight_shape, scale_shape, zero_point_shape) + model = get_compress_decompress_weight_model( + ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes + ) - inputs = [weight, scale] - if zero_point is not None: - inputs.append(zero_point) + inputs = [weight] + if precomputed_scale is not None: + inputs.append(precomputed_scale) + if precomputed_zero_point is not None: + inputs.append(precomputed_zero_point) decompressed_weight = model(inputs)[0] return decompressed_weight From 80e2c928171e0fa5340c1df5f5cba02ef14eb4a6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 29 Oct 2024 10:47:20 +0100 Subject: [PATCH 16/73] SE changes --- .../weight_compression/openvino_modeling.py | 7 +- .../weight_compression/scale_estimation.py | 16 +++-- .../weight_compression/weight_lowering.py | 67 +++++++++++-------- 3 files changed, 55 insertions(+), 35 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index c26c095a70c..2840d32e8b2 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -47,6 +47,7 @@ def __hash__(self): return hash( ( self.input_dtype, + self.output_dtype, self.dynamic_shapes, self.recompile, self.release_memory, @@ -119,6 +120,7 @@ def get_compress_decompress_weight_model( scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, + return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) @@ -133,6 +135,7 @@ def get_compress_decompress_weight_model( scale_shape, zero_point_shape, reduction_axes, + return_compressed_weight, disable_caching=ov_model_params.recompile, ) @@ -253,6 +256,7 @@ def _build_compress_decompress_model( scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, 
reduction_axes: Optional[Tuple] = None, + return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: ov_parameters, ov_results = _build_compress_model( config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True @@ -284,7 +288,8 @@ def _build_compress_decompress_model( scale = ov_parameters[1] decompressed_w = opset.convert(compressed_w, ov.Type.f32) * scale - model = ov.Model([decompressed_w], ov_parameters) + ov_results = [decompressed_w] + ov_results if return_compressed_weight else [decompressed_w] + model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index bb814724df3..b35188d05ae 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -28,7 +28,6 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization @@ -220,11 +219,11 @@ def calculate_quantization_params( q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) zp = None else: - # TODO: Improve by replacing with quantize_dequantize with additional outputs - compressed_weights, scale, zp = do_int_quantization(original_weight, cur_config, reduction_axis) + q_weights, compressed_weights, scale, zp = calculate_quantized_dequantized_weight( + original_weight, cur_config, reduction_axis, return_compressed_weight=True + ) if zp is not None: zp = zp.astype(scale.dtype) - q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) s = fns.unsqueeze(s, 0) s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) @@ -243,7 +242,6 @@ def calculate_quantization_params( importance = importance / (denum + eps) X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) - q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) best_diffs = None result_scale = None @@ -269,7 +267,9 @@ def calculate_quantization_params( g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: - out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp) + out = calculate_quantized_dequantized_weight( + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + ) q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) @@ -326,7 +326,9 @@ def calculate_quantization_params( g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) 
out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: - out = calculate_quantized_dequantized_weight(original_weight, config, near_to_ideal_scale, zp) + out = calculate_quantized_dequantized_weight( + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + ) q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 2f70e7e47a9..265b624c872 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -11,7 +11,7 @@ import logging import os from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import numpy as np @@ -496,27 +496,16 @@ def do_int_quantization( zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape if ov_model_params is None: - output_dtype = None - return_ov_tensors = False - if config.num_bits == 4: - if weight.backend == TensorBackend.ov: - return_ov_tensors = weight.backend == TensorBackend.ov - else: - output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 - # ov_model_params = OVModelParameters( - # input_dtype=weight.dtype, - # output_dtype=output_dtype, - # return_ov_tensors=return_ov_tensors, - # ) - ov_model_params = OVModelParameters( - input_dtype=weight.dtype, - output_dtype=output_dtype, - dynamic_shapes=bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))), - recompile=bool(int(os.environ.get("RECOMPILE", "0"))), - release_memory=bool(int(os.environ.get("RELEASE_MEMORY", "0"))), - share_outputs=bool(int(os.environ.get("SHARE_OUTPUTS", "0"))), - return_ov_tensors=return_ov_tensors, - ) + ov_model_params = OVModelParameters(weight.dtype) + if config.num_bits == 4: + if weight.backend == TensorBackend.ov: + ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov + else: + ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 + ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) + ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) + ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) + ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) model = get_compress_weight_model( ov_model_params, @@ -562,8 +551,9 @@ def calculate_quantized_dequantized_weight( precomputed_scale: Optional[Tensor] = None, precomputed_zero_point: Optional[Tensor] = None, invert_division: Optional[bool] = False, + return_compressed_weight: Optional[bool] = False, ov_model_params: Optional[OVModelParameters] = None, -) -> Tensor: +) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: accelerate_through_ov = ( is_openvino_available() and weight.backend != TensorBackend.torch @@ -583,7 +573,15 @@ def calculate_quantized_dequantized_weight( zero_point = precomputed_zero_point if precomputed_zero_point is not None else None compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) - return decompressed_weight + if return_compressed_weight: + return decompressed_weight, compressed_weight, scale, zero_point + else: + return 
decompressed_weight + + # When reduction axes are not provided, assuming that the weights are already reshaped + if config.group_size != -1 and reduction_axes is not None: + # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] + weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) weight_shape = weight.shape scale_shape = precomputed_scale.shape if precomputed_scale is not None else None @@ -591,9 +589,11 @@ def calculate_quantized_dequantized_weight( if ov_model_params is None: ov_model_params = OVModelParameters(weight.dtype) + if return_compressed_weight and config.num_bits == 4: + ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 model = get_compress_decompress_weight_model( - ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes + ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_compressed_weight ) inputs = [weight] @@ -601,5 +601,18 @@ def calculate_quantized_dequantized_weight( inputs.append(precomputed_scale) if precomputed_zero_point is not None: inputs.append(precomputed_zero_point) - decompressed_weight = model(inputs)[0] - return decompressed_weight + + compressed_weight, scale, zero_point = None, None, None + results = model(inputs) + if len(results) == 1: + decompressed_weight = results[0] + elif len(results) == 2: + decompressed_weight, compressed_weight = results + elif len(results) == 3: + decompressed_weight, compressed_weight, scale = results + else: + decompressed_weight, compressed_weight, scale, zero_point = results + if return_compressed_weight: + return decompressed_weight, compressed_weight, scale, zero_point + else: + return decompressed_weight From fc828664d0718eea5ae23ab3b85d5ad2cbddd9f8 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 29 Oct 2024 15:52:57 +0100 Subject: [PATCH 17/73] Add access counts to caching decorator --- compare_inference_time.py | 125 ++++++++++++++++++++++++++++++++++++++ nncf/results_caching.py | 5 ++ 2 files changed, 130 insertions(+) create mode 100644 compare_inference_time.py diff --git a/compare_inference_time.py b/compare_inference_time.py new file mode 100644 index 00000000000..f11884dbd0f --- /dev/null +++ b/compare_inference_time.py @@ -0,0 +1,125 @@ +import gc +import time + +import numpy as np +from unittest.mock import patch +from tqdm import tqdm + +from nncf import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization, calculate_quantized_dequantized_weight +from nncf.tensor import Tensor +import nncf.utils + + +def get_random_weights(size, amount, n_unique_shapes, dtype, is_sorted=True): + n_channels = set() + while len(n_channels) < n_unique_shapes: + n_channels.add(int(np.random.normal(np.sqrt(size), n_unique_shapes))) + n_channels = list(n_channels) + + unique_weights = [] + for d in n_channels: + shape = (size // d, d) + unique_weights.append(Tensor(np.random.random(shape).astype(dtype))) + + result = [] + for _ in range(amount): + result.append(np.random.choice(unique_weights)) + + if is_sorted: + result = sorted(result, key=lambda x: x.shape[0] * x.shape[1], reverse=True) + return result + + +def measure_compression_time(weights, config, is_ov, verbose=True): 
+    orig_value = nncf.utils._openvino_available
+    nncf.utils._openvino_available = is_ov
+
+    start_time = time.perf_counter()
+    for w in tqdm(weights, disable=not verbose):
+        do_int_quantization(
+        # calculate_quantized_dequantized_weight(
+            w,
+            config,
+            reduction_axes=(1,),
+            ov_model_params=OVModelParameters(
+                input_dtype=w.dtype,
+                output_dtype=None,
+                dynamic_shapes=bool(0),
+                recompile=bool(0),
+                release_memory=bool(1),
+                share_inputs=bool(1),
+                share_outputs=bool(1),
+                return_ov_tensors=bool(0),
+            ),
+            # return_compressed_weight=bool(1)
+        )
+    end_time = time.perf_counter()
+    total_time = end_time - start_time
+    avg_time = total_time / len(weights)
+    if verbose:
+        print("OV" if is_ov else "NP", f"avg. time: {avg_time:.1e} sec.")
+
+    nncf.utils._openvino_available = orig_value
+    OV_MODEL_CACHE.clear()
+    gc.collect()
+    return avg_time
+
+
+def bin_search(l, r, config, n, dtype):
+    while r / l > 1.05:
+        m = np.sqrt(l * r)
+        weights = get_random_weights(
+            size=int(m),
+            amount=n,
+            # n_unique_shapes=int(np.sqrt(n)),
+            n_unique_shapes=1,
+            dtype=dtype
+        )
+        t_np = measure_compression_time(
+            weights,
+            config,
+            is_ov=False,
+            verbose=False,
+        )
+        t_ov = measure_compression_time(
+            weights,
+            config,
+            is_ov=True,
+            verbose=False,
+        )
+        print(f"S: {m:.1e}. NP time: {t_np:.1e} sec. OV time: {t_ov:.1e} sec.")
+        if t_np < t_ov:
+            l = m
+        else:
+            r = m
+
+
+N = int(1e5)
+S = int(5e5) # 5e5 for compression/decompression,
+K = int(np.sqrt(N))
+DTYPE = np.float32
+
+bin_search(
+    l=int(1e2),
+    r=int(1e5),
+    config=WeightCompressionConfig(
+        CompressWeightsMode.INT4_ASYM,
+        group_size=-1
+    ),
+    n=N,
+    dtype=DTYPE,
+)
+
+# weights = get_random_weights(size=S, amount=N, n_unique_shapes=K, dtype=np.float32)
+# for is_ov in [False, True]:
+#     measure_compression_time(
+#         weights,
+#         WeightCompressionConfig(
+#             CompressWeightsMode.INT4_ASYM,
+#             group_size=-1
+#         ),
+#         is_ov=is_ov,
+#     )
diff --git a/nncf/results_caching.py b/nncf/results_caching.py
index d1d16ea775b..5d8b7fa99c9 100644
--- a/nncf/results_caching.py
+++ b/nncf/results_caching.py
@@ -10,22 +10,27 @@
 # limitations under the License.
 
 import inspect
+from collections import defaultdict
 
 
 class ResultsCacheContainer:
     def __init__(self):
         self._cache = {}
+        self._access_count = {}
 
     def clear(self):
         self._cache.clear()
+        self._access_count.clear()
 
     def is_empty(self):
         return len(self._cache) == 0
 
     def __getitem__(self, item):
+        self._access_count[item] += 1
         return self._cache[item]
 
     def __setitem__(self, key, value):
+        self._access_count[key] = 0
         self._cache[key] = value
 
     def __contains__(self, item):

From f3891cda50a3bf75cd57dcfaeae4eb13e4231bf1 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 29 Oct 2024 16:18:32 +0100
Subject: [PATCH 18/73] Comment out env vars

---
 .../algorithms/weight_compression/weight_lowering.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index 265b624c872..3eaa024f4c2 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -461,7 +461,7 @@ def do_int_quantization(
     accelerate_through_ov = (
         is_openvino_available()
         and weight.backend != TensorBackend.torch
-        and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
+        # and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
     )
     if not is_openvino_available() and weight.backend != TensorBackend.torch:
         log_once(logging.INFO, "Compression time may be improved after installing OpenVINO")
@@ -502,10 +502,10 @@ def do_int_quantization(
         ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov
     else:
         ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8
-        ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
-        ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0")))
-        ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0")))
-        ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
+        # ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
+        # ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0")))
+        # ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0")))
+        # ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
 
     model = get_compress_weight_model(
         ov_model_params,
@@ -557,7 +557,7 @@ def calculate_quantized_dequantized_weight(
     accelerate_through_ov = (
         is_openvino_available()
         and weight.backend != TensorBackend.torch
-        and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
+        # and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
     )
     if not is_openvino_available() and weight.backend != TensorBackend.torch:
         log_once(logging.INFO, "Compression time may be improved after installing OpenVINO")

From 353aac14524eb64aba7dd20661fe3333c248fefe Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Tue, 29 Oct 2024 16:43:30 +0100
Subject: [PATCH 19/73] Fix existing tests

---
 .github/workflows/precommit.yml             |  2 ++
 compare_inference_time.py                   | 21 +++++++------
 nncf/openvino/graph/node_utils.py           |  2 +-
 .../weight_compression/openvino_backend.py  |  7 +++--
 .../weight_compression/openvino_modeling.py |  6 ++--
 .../weight_compression/scale_estimation.py  |  1 +
 .../weight_compression/weight_lowering.py   | 18 ++++++-----
 nncf/results_caching.py                     |  1 -
 .../template_test_nncf_tensor.py            | 20 +++++++++++--
 tests/openvino/native/models.py             |  4 +--
.../quantization/test_weights_compression.py | 30 ------------------- 11 files changed, 51 insertions(+), 61 deletions(-) diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index cbed985e3de..1772e2619ad 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -64,6 +64,8 @@ jobs: cache: pip - name: Install NNCF and test requirements run: make install-openvino-test + - name: Install OpenVINO Nightly + run: pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Print installed modules run: pip list - name: Run OV precommit test scope diff --git a/compare_inference_time.py b/compare_inference_time.py index f11884dbd0f..452e3e57afd 100644 --- a/compare_inference_time.py +++ b/compare_inference_time.py @@ -1,16 +1,18 @@ import gc import time +from unittest.mock import patch import numpy as np -from unittest.mock import patch from tqdm import tqdm +import nncf.utils from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, OV_MODEL_CACHE -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization, calculate_quantized_dequantized_weight +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.tensor import Tensor -import nncf.utils def get_random_weights(size, amount, n_unique_shapes, dtype, is_sorted=True): @@ -40,7 +42,7 @@ def measure_compression_time(weights, config, is_ov, verbose=True): start_time = time.perf_counter() for w in tqdm(weights, disable=not verbose): do_int_quantization( - # calculate_quantized_dequantized_weight( + # calculate_quantized_dequantized_weight( w, config, reduction_axes=(1,), @@ -76,7 +78,7 @@ def bin_search(l, r, config, n, dtype): amount=n, # n_unique_shapes=int(np.sqrt(n)), n_unique_shapes=1, - dtype=dtype + dtype=dtype, ) t_np = measure_compression_time( weights, @@ -98,17 +100,14 @@ def bin_search(l, r, config, n, dtype): N = int(1e5) -S = int(5e5) # 5e5 for compression/decompression, +S = int(5e5) # 5e5 for compression/decompression, K = int(np.sqrt(N)) DTYPE = np.float32 bin_search( l=int(1e2), r=int(1e5), - config=WeightCompressionConfig( - CompressWeightsMode.INT4_ASYM, - group_size=-1 - ), + config=WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, group_size=-1), n=N, dtype=DTYPE, ) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 33d67140d16..05e759f1b16 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -107,7 +107,7 @@ def cnt_if_op(model: ov.Model, cnt: int) -> int: return cnt_if_op(model, 0) -def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = False) -> np.ndarray: +def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = True) -> np.ndarray: """ Returns the constant tensor for the node. This method is applicable only for the floating-point constant data. 
diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 8fbd0e2935a..3e14be11561 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -11,8 +11,6 @@ from typing import Dict, Iterable, List, Optional, Tuple import openvino as ov -from openvino import Type -from openvino.properties.hint import inference_precision from openvino.runtime import opset13 as opset from openvino.runtime.op import Constant @@ -336,7 +334,10 @@ def transform_model( weight = weight.to_backend(TensorBackend.numpy) if compressed_weight.tensor.backend == TensorBackend.ov: compressed_weight.tensor = compressed_weight.tensor.to_backend(TensorBackend.numpy) - if compressed_weight.zero_point.backend == TensorBackend.ov: + if ( + compressed_weight.zero_point is not None + and compressed_weight.zero_point.backend == TensorBackend.ov + ): compressed_weight.zero_point = compressed_weight.zero_point.to_backend(TensorBackend.numpy) adapters = lora_correction_algo.calculate_adapters(weight, compressed_weight, wc_params) self.insert_adapters(wc_params, *adapters, int8_lora=lora_correction_algo.use_int8_adapters) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 2840d32e8b2..a1c99241b4c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -186,8 +186,8 @@ def _build_compress_model( w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True) w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) - scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.constant(0, ov.Type.f32) - w_max) - scale /= opset.constant(level_high, ov.Type.f32) + scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.negative(w_max)) + scale /= opset.constant(-level_low, ov.Type.f32) scale = opset.select(opset.abs(scale) < eps, eps, scale) zero_point = None @@ -204,8 +204,6 @@ def _build_compress_model( ) # [a1, r, a2] -> [a1, 1, a2] min_values = opset.convert(min_values, ov.Type.f32) - level_low = 0 - level_high = 2**num_bits - 1 zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) zero_point = opset.clamp(zero_point, level_low, level_high) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index b35188d05ae..9e5fbb3d678 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -217,6 +217,7 @@ def calculate_quantization_params( ) compressed_weights = do_nf4_quantization(norm_weight, scale, is_normalized_weight=True) q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) + q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) zp = None else: q_weights, compressed_weights, scale, zp = calculate_quantized_dequantized_weight( diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 3eaa024f4c2..14df740d2c9 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ 
b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -9,7 +9,6 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import os from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -19,9 +18,6 @@ from nncf.common.logging.logger import log_once from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters -from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model -from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model from nncf.quantization.fake_quantize import calculate_scale_zero_point from nncf.tensor import Tensor from nncf.tensor import functions as fns @@ -440,7 +436,7 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, invert_division: Optional[bool] = False, - ov_model_params: Optional[OVModelParameters] = None, + ov_model_params: Optional["OVModelParameters"] = None, ): """ Performs integer quantization on the given weight tensor. @@ -458,6 +454,7 @@ def do_int_quantization( """ assert config.is_integer, "The function supports integer quantization only" + # import os accelerate_through_ov = ( is_openvino_available() and weight.backend != TensorBackend.torch @@ -471,7 +468,7 @@ def do_int_quantization( # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) - if not accelerate_through_ov: + if not accelerate_through_ov or True: # Reference implementation if weight.backend == TensorBackend.ov: @@ -491,6 +488,9 @@ def do_int_quantization( compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) return compressed_weights, scale, zero_point + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model + weight_shape = weight.shape scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape @@ -552,8 +552,9 @@ def calculate_quantized_dequantized_weight( precomputed_zero_point: Optional[Tensor] = None, invert_division: Optional[bool] = False, return_compressed_weight: Optional[bool] = False, - ov_model_params: Optional[OVModelParameters] = None, + ov_model_params: Optional["OVModelParameters"] = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: + # import os accelerate_through_ov = ( is_openvino_available() and weight.backend != TensorBackend.torch @@ -578,6 +579,9 @@ def calculate_quantized_dequantized_weight( else: return decompressed_weight + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model + # When reduction axes are not provided, assuming that the weights are already reshaped if config.group_size != -1 and reduction_axes is not None: # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] diff --git a/nncf/results_caching.py b/nncf/results_caching.py index 
5d8b7fa99c9..9b314863108 100644 --- a/nncf/results_caching.py +++ b/nncf/results_caching.py @@ -10,7 +10,6 @@ # limitations under the License. import inspect -from collections import defaultdict class ResultsCacheContainer: diff --git a/tests/cross_fw/test_templates/template_test_nncf_tensor.py b/tests/cross_fw/test_templates/template_test_nncf_tensor.py index 13f2d6bc976..97b59342d58 100644 --- a/tests/cross_fw/test_templates/template_test_nncf_tensor.py +++ b/tests/cross_fw/test_templates/template_test_nncf_tensor.py @@ -1504,7 +1504,15 @@ def test_expand_dims_error(self, x, axis, match): def test_fn_zeros(self): shape = (2, 2) for dtype in TensorDataType: - if dtype == TensorDataType.bfloat16 and self.backend() == TensorBackend.numpy: + if ( + self.backend() == TensorBackend.numpy + and dtype == TensorDataType.bfloat16 + or dtype + in [ + TensorDataType.int4, + TensorDataType.uint4, + ] + ): continue tensor_a = fns.zeros(shape, backend=self.backend(), dtype=dtype, device=self.device()) assert isinstance(tensor_a, Tensor) @@ -1525,7 +1533,15 @@ def test_fn_zeros(self): ) def test_fn_eye(self, n, m, ref): for dtype in TensorDataType: - if dtype == TensorDataType.bfloat16 and self.backend() == TensorBackend.numpy: + if ( + self.backend() == TensorBackend.numpy + and dtype == TensorDataType.bfloat16 + or dtype + in [ + TensorDataType.int4, + TensorDataType.uint4, + ] + ): continue tensor_a = fns.eye(n, m, backend=self.backend(), dtype=dtype, device=self.device()) assert isinstance(tensor_a, Tensor) diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 5f779bd96e9..48d0807e07f 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -290,11 +290,11 @@ def __init__(self, const_dtype: ov.Type = ov.Type.f32, input_dtype: ov.Type = ov def _create_ov_model(self): input_shape = [1, 3, 4, 2] input_1 = opset.parameter(input_shape, name="Input", dtype=self.input_dtype) - data = opset.constant(value=self._rng.random((1, 3, 4, 5)), dtype=self.const_dtype, name="MatMul_const") + data = opset.constant(self._rng.random((1, 3, 4, 5)), dtype=self.const_dtype, name="MatMul_const") if self.const_dtype != self.input_dtype: data = opset.convert(data, self.input_dtype.to_string()) matmul = opset.matmul(input_1, data, transpose_a=True, transpose_b=False, name="MatMul") - bias = opset.constant(value=self._rng.random((1, 3, 1, 1)), dtype=self.const_dtype, name="MatMul_bias") + bias = opset.constant(self._rng.random((1, 3, 1, 1)), dtype=self.const_dtype, name="MatMul_bias") if self.const_dtype != self.input_dtype: bias = opset.convert(bias, self.input_dtype.to_string()) add = opset.add(matmul, bias, name="Add") diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index ccf539aeb86..a9623a2ccf4 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -27,7 +27,6 @@ from nncf.data.dataset import Dataset from nncf.experimental.common.tensor_statistics.collectors import AggregatorBase from nncf.openvino.graph.node_utils import get_const_value -from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE from nncf.parameters import BackupMode from nncf.quantization import compress_weights from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams @@ -36,7 +35,6 @@ from 
nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization @@ -1023,34 +1021,6 @@ def test_mixed_precision_e2m1(mode, all_layers, ratio, ref_ids): assert ref_e8m0_nodes == names_e8m0 -@pytest.mark.parametrize("mode", (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM)) -def test_np_ov_compression_decompression(mode): - sz = 60 - w = np.arange(-sz, sz).reshape(2, sz).astype(np.float32) / 9.0 - w = Tensor(w) - - config = WeightCompressionConfig(mode) - - compressed_weighs, scale, zp = do_int_quantization(w, config, -1, invert_division=True) - decompressed_weighs = do_int_dequantization(compressed_weighs, scale, zp) - - compressed_weighs = compressed_weighs.data - decompressed_weighs = decompressed_weighs.data - zp_shape = zp.shape if zp is not None else None - - compress = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(config, w.shape, scale.shape, zp_shape) - compress_decompress = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_decompress_weight_primitive( - config, w.shape, scale.shape, zp_shape - ) - - params = [w.data, scale.data, zp.data] if zp is not None else [w.data, scale.data] - compressed_weighs_ov = compress(params) - decompressed_weighs_ov = compress_decompress(params) - - assert np.allclose(compressed_weighs, compressed_weighs_ov) - assert np.allclose(decompressed_weighs, decompressed_weighs_ov) - - @pytest.mark.parametrize( ("mode", "data"), ( From d20e593cbd10e3904b05e0d14778b9ff3b7dd9c1 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 30 Oct 2024 09:42:19 +0100 Subject: [PATCH 20/73] Unstage helper scripts --- .github/workflows/precommit.yml | 2 +- compare_inference_time.py | 124 ------ .../weight_compression/openvino_backend.py | 1 - .../weight_compression/weight_lowering.py | 6 +- run_weight_compression.py | 373 ------------------ weight_compression.py | 210 ---------- 6 files changed, 4 insertions(+), 712 deletions(-) delete mode 100644 compare_inference_time.py delete mode 100644 run_weight_compression.py delete mode 100644 weight_compression.py diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index 1772e2619ad..218d9c32fd1 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -65,7 +65,7 @@ jobs: - name: Install NNCF and test requirements run: make install-openvino-test - name: Install OpenVINO Nightly - run: pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + run: pip install -U --pre openvino==2024.5.0.dev20241015 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Print installed modules run: pip list - name: Run OV precommit test scope diff --git a/compare_inference_time.py b/compare_inference_time.py deleted file mode 100644 index 452e3e57afd..00000000000 --- a/compare_inference_time.py +++ /dev/null @@ -1,124 +0,0 @@ -import gc -import time 
-from unittest.mock import patch - -import numpy as np -from tqdm import tqdm - -import nncf.utils -from nncf import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization -from nncf.tensor import Tensor - - -def get_random_weights(size, amount, n_unique_shapes, dtype, is_sorted=True): - n_channels = set() - while len(n_channels) < n_unique_shapes: - n_channels.add(int(np.random.normal(np.sqrt(size), n_unique_shapes))) - n_channels = list(n_channels) - - unique_weights = [] - for d in n_channels: - shape = (size // d, d) - unique_weights.append(Tensor(np.random.random(shape).astype(dtype))) - - result = [] - for _ in range(amount): - result.append(np.random.choice(unique_weights)) - - if is_sorted: - result = sorted(result, key=lambda x: x.shape[0] * x.shape[1], reverse=True) - return result - - -def measure_compression_time(weights, config, is_ov, verbose=True): - orig_value = nncf.utils._openvino_available - nncf.utils._openvino_available = is_ov - - start_time = time.perf_counter() - for w in tqdm(weights, disable=not verbose): - do_int_quantization( - # calculate_quantized_dequantized_weight( - w, - config, - reduction_axes=(1,), - ov_model_params=OVModelParameters( - input_dtype=w.dtype, - output_dtype=None, - dynamic_shapes=bool(0), - recompile=bool(0), - release_memory=bool(1), - share_inputs=bool(1), - share_outputs=bool(1), - return_ov_tensors=bool(0), - ), - # return_compressed_weight=bool(1) - ) - end_time = time.perf_counter() - total_time = end_time - start_time - avg_time = total_time / len(weights) - if verbose: - print("OV" if is_ov else "NP", f"avg. time: {avg_time:.1e} sec.") - - nncf.utils._openvino_available = orig_value - OV_MODEL_CACHE.clear() - gc.collect() - return avg_time - - -def bin_search(l, r, config, n, dtype): - while r / l > 1.05: - m = np.sqrt(l * r) - weights = get_random_weights( - size=int(m), - amount=n, - # n_unique_shapes=int(np.sqrt(n)), - n_unique_shapes=1, - dtype=dtype, - ) - t_np = measure_compression_time( - weights, - config, - is_ov=False, - verbose=False, - ) - t_ov = measure_compression_time( - weights, - config, - is_ov=True, - verbose=False, - ) - print(f"S: {m:.1e}. NP time: {t_np:.1e} sec. 
OV time: {t_ov:.1e} sec.") - if t_np < t_ov: - l = m - else: - r = m - - -N = int(1e5) -S = int(5e5) # 5e5 for compression/decompression, -K = int(np.sqrt(N)) -DTYPE = np.float32 - -bin_search( - l=int(1e2), - r=int(1e5), - config=WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, group_size=-1), - n=N, - dtype=DTYPE, -) - -# weights = get_random_weights(size=S, amount=N, n_unique_shapes=K, dtype=np.float32) -# for is_ov in [False, True]: -# measure_compression_time( -# weights, -# WeightCompressionConfig( -# CompressWeightsMode.INT4_ASYM, -# group_size=-1 -# ), -# is_ov=is_ov, -# ) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 3e14be11561..24364c592d9 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -278,7 +278,6 @@ def _create_compression_subgraph( if should_add_convert_node: mul = opset.convert(mul, const_dtype, name=f"{const_node_name}/fq_weights_{weight_port_id}/convert") - # TODO: convert tensors inside compressed_weight to numpy backend if they are in ov backend return mul, compressed_weight def transform_model( diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 14df740d2c9..f65049e6dff 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -436,7 +436,7 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, invert_division: Optional[bool] = False, - ov_model_params: Optional["OVModelParameters"] = None, + ov_model_params: Optional = None, ): """ Performs integer quantization on the given weight tensor. 
@@ -468,7 +468,7 @@ def do_int_quantization( # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size) - if not accelerate_through_ov or True: + if not accelerate_through_ov: # Reference implementation if weight.backend == TensorBackend.ov: @@ -552,7 +552,7 @@ def calculate_quantized_dequantized_weight( precomputed_zero_point: Optional[Tensor] = None, invert_division: Optional[bool] = False, return_compressed_weight: Optional[bool] = False, - ov_model_params: Optional["OVModelParameters"] = None, + ov_model_params: Optional = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: # import os accelerate_through_ov = ( diff --git a/run_weight_compression.py b/run_weight_compression.py deleted file mode 100644 index 74d752ef4de..00000000000 --- a/run_weight_compression.py +++ /dev/null @@ -1,373 +0,0 @@ -import os -import shutil -import subprocess -import threading -import time -from pathlib import Path - - -def stream_handler(stream, target_file): - for line in iter(stream.readline, ''): - print(line, end='') - target_file.write(line) - - -parent_model_dir = Path("/home/nsavel/workspace/models/hf") -parent_log_dir = Path("compression_logs") - -experiment_params = [ - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", 
parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/tiny-llama", "--save-model --dynamic --release-memory --share-outputs"), - - - 
# (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), - # # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), - # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", ""), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/phi3", "--dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), - # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - # # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", ""), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory"), - # 
(parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int8/llama3-8b", "--dynamic --release-memory --share-outputs"), - - - - - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", 
"--save-model --compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / 
"reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --recompile --share-outputs"), - (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # 
(parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/phi3", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", 
parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP32", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / 
"reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --recompile --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/llama3-8b", "--compression-mode int4_asym --dynamic --release-memory --share-outputs"), - - - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / 
"optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/tiny-llama", "--save-model --release-memory"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int8/phi3", "--release-memory"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int8/llama3-8b", "--release-memory"), - # - # - # - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/tiny-llama", "--save-model --compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP32", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / 
"optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/FP16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "phi3-mini-4k-instruct/pytorch/dldt/BF16", parent_log_dir / "optimal_configurations/int4/phi3", "--compression-mode int4_asym --release-memory --share-outputs"), - # - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp32", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-fp16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --numpy"), - # (parent_model_dir / "Meta-Llama-3-8B/pytorch/dldt/optimum-bf16", parent_log_dir / "optimal_configurations/int4/llama3-8b", "--compression-mode int4_asym --release-memory --share-outputs"), - - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP32", parent_log_dir / "int4_acc/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/FP16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym --numpy"), - # (parent_model_dir / "tiny-llama/pytorch/dldt/BF16", parent_log_dir / "reimpl_24102025_acc/int4/tiny-llama", "--save-model --compression-mode int4_asym"), -] - -for model_dir, log_dir, params in experiment_params: - model_path = model_dir / "openvino_model.xml" - cmd = f"/home/nsavel/venvs/nncf/bin/python weight_compression.py --model-path {model_path} --log-dir {log_dir} {params}" - - log_dir.mkdir(parents=True, exist_ok=True) - with open(log_dir / "log.txt", "a") as log_file: - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - universal_newlines=True, - preexec_fn=os.setsid, - ) - - stdout_thread = threading.Thread(target=stream_handler, args=(process.stdout, log_file)) - stderr_thread = threading.Thread(target=stream_handler, args=(process.stderr, log_file)) - - stdout_thread.start() - stderr_thread.start() - - stdout_thread.join() - stderr_thread.join() - - process.wait() - time.sleep(10) - 
-evaluated_paths = set() -for _, log_dir, _ in experiment_params: - for model_path in sorted(log_dir.rglob("**/*")): - model_path: Path - if model_path.suffix != ".xml": - continue - if model_path.absolute() in evaluated_paths: - continue - evaluated_paths.add(model_path.absolute()) - - model_dir = model_path.parent.absolute() - cmd = f"/home/nsavel/venvs/lm-evaluation-harness/bin/lm_eval --model openvino --model_args pretrained={model_dir},trust_remote_code=True --tasks wikitext --output_path {model_dir}" - process = subprocess.Popen(cmd, shell=True) - process.wait() diff --git a/weight_compression.py b/weight_compression.py deleted file mode 100644 index bae1948145c..00000000000 --- a/weight_compression.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import gc -import os -import shutil -import time -from functools import partial -from pathlib import Path - -import openvino as ov - -import nncf -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE -from tools.memory_monitor import MemoryMonitor -from tools.memory_monitor import MemoryType - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument("--model-path", type=str, required=True, help="Path where the model is stored") - - parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved") - - parser.add_argument("--compression-mode", default="int8_asym", type=str, choices=["int8_asym", "int8_sym", "int4_asym", "int4_sym",], help="Weight compression mode") - - parser.add_argument("--numpy", action="store_true", help="Enable numpy compression") - - parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models") - - parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default=None, help="OV model input dtype") - - parser.add_argument("--recompile", action="store_true", help="Recompile model every time") - - parser.add_argument("--share-outputs", action="store_true", help="Share OV model outputs") - - parser.add_argument("--save-model", action="store_true", help="Save compressed model") - - parser.add_argument("--release-memory", action="store_true", help="Release memory") - - return parser.parse_args() - - -def log(mm, fz, log_dir): - mm.save_memory_logs( - *mm.get_data(memory_from_zero=fz), save_dir=Path(log_dir), filename_suffix="_from-zero" if fz else "" - ) - - -def count_node_dtypes(model): - # Get the main dtype of weight constants - node_count_per_dtype = dict(f32=0, f16=0, bf16=0) - for node in model.get_ordered_ops(): - friendly_name = node.get_friendly_name() - if node.get_type_name() != "Constant" or ".weight" not in friendly_name: - continue - const_dtype = node.get_element_type().get_type_name() - if const_dtype in node_count_per_dtype: - node_count_per_dtype[const_dtype] = node_count_per_dtype[const_dtype] + 1 - return node_count_per_dtype - - -def 
main(args): - model_path = Path(args.model_path) - log_dir = Path(args.log_dir) - - numpy_compression = args.numpy - dynamic_compression = args.dynamic - input_dtype = args.input_dtype - recompile = args.recompile - share_outputs = args.share_outputs - save_model = args.save_model - release_memory = args.release_memory - - log_dir_suffix = f"{model_path.parent.name}_" - if numpy_compression: - log_dir_suffix = f"{log_dir_suffix}numpy" - else: - log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if dynamic_compression else 'ov-static'}" - if input_dtype is not None: - log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}" - if recompile: - log_dir_suffix = f"{log_dir_suffix}_recompile" - if release_memory: - log_dir_suffix = f"{log_dir_suffix}_release-memory" - if share_outputs: - log_dir_suffix = f"{log_dir_suffix}_share-outputs" - print(f"Log dir suffix: {log_dir_suffix}") - - memory_monitors = [] - for memory_type, mem_from_zero in [(MemoryType.RSS, False), (MemoryType.SYSTEM, False), (MemoryType.SYSTEM, True)]: - memory_monitor = MemoryMonitor(interval=1e-2, memory_type=memory_type, include_child_processes=bool(0)) - memory_monitor.start(at_exit_fn=partial(log, memory_monitor, mem_from_zero, log_dir / log_dir_suffix)) - memory_monitors.append(memory_monitor) - - core = ov.Core() - # core.set_property({"ENABLE_MMAP": "NO"}) - model = core.read_model(model_path) - - node_count_per_dtype = count_node_dtypes(model) - assert max(node_count_per_dtype.values()) == sum(node_count_per_dtype.values()), "Not all consts have the same type" - node_count_per_dtype = sorted([(v, k) for k, v in node_count_per_dtype.items()], reverse=True) - model_dtype = dict(f32="fp32", f16="fp16", bf16="bf16")[node_count_per_dtype[0][1]] - - # Update input dtype based on model - input_dtype = input_dtype or model_dtype - - os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}" - os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}" - os.environ["INPUT_DTYPE"] = input_dtype - os.environ["RECOMPILE"] = f"{int(recompile)}" - os.environ["SHARE_OUTPUTS"] = f"{int(share_outputs)}" - os.environ["RELEASE_MEMORY"] = f"{int(release_memory)}" - - start_time = time.perf_counter() - if args.compression_mode == "int8_asym": - compression_mode = nncf.CompressWeightsMode.INT8_ASYM - elif args.compression_mode == "int8_sym": - compression_mode = nncf.CompressWeightsMode.INT8_SYM - elif args.compression_mode == "int4_asym": - compression_mode = nncf.CompressWeightsMode.INT4_ASYM - elif args.compression_mode == "int4_sym": - compression_mode = nncf.CompressWeightsMode.INT4_SYM - else: - raise ValueError(f"Unknown weight compression mode argument: {args.compression_mode}") - # TODO: Consider all_layers=True? 
- compressed_model = nncf.compress_weights(model, mode=compression_mode) - compression_time = time.perf_counter() - start_time - print(f"Compression Time: {compression_time:.2f} sec.") - - if save_model: - ov.save_model(compressed_model, log_dir / log_dir_suffix / "openvino_model.xml") - for filepath in model_path.parent.glob("*.json"): - shutil.copy(str(filepath), str(log_dir / log_dir_suffix / filepath.name)) - - del core - del model - del compressed_model - gc.collect() - time.sleep(0.5) - - before_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - if not OV_MODEL_CACHE.is_empty(): - OV_MODEL_CACHE.clear() - gc.collect() - time.sleep(memory_monitors[0].interval * 10) - after_cache_deletion = memory_monitors[2].get_data(True)[1][-1] - else: - after_cache_deletion = before_cache_deletion - cache_size = before_cache_deletion - after_cache_deletion - print(f"Cache size: {cache_size:.2f} MiB") - - time.sleep(memory_monitors[0].interval * 10) - - leftover_memory = memory_monitors[2].get_data(True)[1][-1] - peak_memory = max(memory_monitors[2].get_data(True)[1]) - print(f"Peak memory: {peak_memory:.2f} MiB") - print(f"Leftover memory: {leftover_memory:.2f} MiB") - print("Done") - - csv_path = log_dir / "results.csv" - csv_exists = csv_path.exists() - csv_path.parent.mkdir(exist_ok=True, parents=True) - with open(csv_path, "a") as f: - if not csv_exists: - f.write( - "Model Path," - "Model dtype," - "Backend," - "Recompile," - "Release memory," - "Share outputs," - "Input Shapes," - "Input," - "Compression Time," - "Peak Memory," - "Cache Size," - "Leftover Memory" - "\n" - ) - f.write( - f"{model_path}," - f"{model_dtype.upper()}," - f"{'-' if numpy_compression else 'OV'}," - f"{'-' if numpy_compression else recompile}," - f"{'-' if numpy_compression else release_memory}," - f"{'-' if numpy_compression else share_outputs}," - f"{'-' if numpy_compression else 'Dynamic' if dynamic_compression else 'Static'}," - f"{'-' if numpy_compression else input_dtype.upper()}," - f"{compression_time:.2f}," - f"{peak_memory:.2f}," - f"{cache_size:.2f}," - f"{leftover_memory:.2f}" - f"\n" - ) - - -if __name__ == "__main__": - args = parse_arguments() - main(args) From dc30d8d94981b205437917242897b5e92e59bd8a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 31 Oct 2024 18:54:23 +0100 Subject: [PATCH 21/73] Tests WIP --- .../algorithms/weight_compression/config.py | 3 + .../weight_compression/openvino_backend.py | 5 +- .../weight_compression/openvino_modeling.py | 29 +- .../weight_compression/scale_estimation.py | 12 +- .../weight_compression/weight_lowering.py | 34 +- nncf/quantization/fake_quantize.py | 2 +- nncf/tensor/functions/ov.py | 27 +- .../quantization/test_openvino_modeling.py | 307 ++++++++++++++++++ 8 files changed, 376 insertions(+), 43 deletions(-) create mode 100644 tests/openvino/native/quantization/test_openvino_modeling.py diff --git a/nncf/quantization/algorithms/weight_compression/config.py b/nncf/quantization/algorithms/weight_compression/config.py index 03590fc5ff3..85179df2fe4 100644 --- a/nncf/quantization/algorithms/weight_compression/config.py +++ b/nncf/quantization/algorithms/weight_compression/config.py @@ -54,6 +54,9 @@ def is_integer(self): def __hash__(self): return hash((self.mode.value, self.group_size)) + def __str__(self): + return f"{self.mode.value}_{self.group_size}" + @dataclass class WeightCompressionParameters: diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py 
b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 24364c592d9..3a262bd5d12 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -46,6 +46,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorBackend @@ -127,6 +128,7 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov. def get_weight_dtype( self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph ) -> TensorDataType: + # TODO: use from nncf.tensor.functions.ov import DTYPE_MAP ov_type_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["dtype"] dtype_map = { "f16": TensorDataType.float16, @@ -277,7 +279,6 @@ def _create_compression_subgraph( if should_add_convert_node: mul = opset.convert(mul, const_dtype, name=f"{const_node_name}/fq_weights_{weight_port_id}/convert") - return mul, compressed_weight def transform_model( @@ -344,6 +345,8 @@ def transform_model( # reset name_to_node_mapping self.name_to_node_mapping = None + OV_MODEL_CACHE.clear() + return model @staticmethod diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index a1c99241b4c..9f2fed9e03e 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -36,7 +36,7 @@ class OVModelParameters: input_dtype: TensorDataType output_dtype: Optional[TensorDataType] = None - dynamic_shapes: bool = False + dynamic_shapes: bool = True # TODO: set to False once 156511 is resolved recompile: bool = False release_memory: bool = True share_inputs: bool = True @@ -124,7 +124,8 @@ def get_compress_decompress_weight_model( ) -> ModelCallable: if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) - scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) + if scale_shape is not None: + scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) if zero_point_shape is not None: zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) @@ -223,19 +224,18 @@ def _build_compress_model( else: dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4 + compressed_w = opset.round(compressed_w) compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high) compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights") ov_results = [compressed_w] - if len(ov_parameters) != 3: - # Two cases: - # 1. weight -> compressed_weight, scale, (zero_point) - # 2. 
weight, scale -> compressed_weight, (zero_point) - if len(ov_parameters) == 1: - ov_results.append(scale) - + if len(ov_parameters) == 1: + ov_results.append(scale) if zero_point is not None: - ov_results.append(opset.convert(zero_point, compressed_w.get_element_type())) + zero_point_dtype = compressed_w.get_element_type() if ov_model_params.return_ov_tensors else ov.Type.i32 + if zero_point.get_element_type() != zero_point_dtype: + zero_point = opset.convert(zero_point, zero_point_dtype) + ov_results.append(zero_point) if return_nodes: return ov_parameters, ov_results @@ -264,18 +264,13 @@ def _build_compress_decompress_model( if len(ov_parameters) == 1: # weight -> compressed_weight, scale, zero_point compressed_w, scale, zero_point = ov_results - elif len(ov_parameters) == 2: - # weight, scale -> compressed_weight, zero_point - compressed_w, zero_point = ov_results - scale = ov_parameters[1] else: # weight, scale, zero_point -> compressed_weight compressed_w = ov_results[0] scale, zero_point = ov_parameters[1:] - decompressed_w = scale * opset.convert( - opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32), ov.Type.f32 - ) + subtract_zero_point = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32) + decompressed_w = scale * opset.convert(subtract_zero_point, ov.Type.f32) else: if len(ov_parameters) == 1: # weight -> compressed_weight, scale diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 9e5fbb3d678..2e4c695b7f5 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -110,7 +110,7 @@ def apply( graph: NNCFGraph, statistic_points: Optional[StatisticPointsContainer] = None, dataset: Optional[Dataset] = None, - ) -> Dict[str, Tensor]: + ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: """ Estimates better scale for the int4 nodes in the model. Minimizes per-group difference between floating point MatMul and @@ -122,10 +122,10 @@ def apply( :param graph: Model graph. :param statistic_points: Statistic points with collected statistics values. :param dataset: A representative dataset for the calibration process. - :return: Dict with pairs (weight name, estimated scale). + :return: Two dictionaries for estimated scales and zero points for each weight name. 
""" - scales = dict() + scales, zero_points = dict(), dict() for wp in track(self._all_weight_params, description="Applying Scale Estimation"): weight_name = wp.weight_name @@ -145,7 +145,7 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - scales[weight_name], _ = self.calculate_quantization_params( + scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( self._backend_entity, stats, weight, @@ -157,7 +157,7 @@ def apply( self._weight_penalty, ) - return scales + return scales, zero_points @staticmethod def calculate_quantization_params( @@ -352,6 +352,8 @@ def calculate_quantization_params( if config.group_size == -1: result_scale = fns.squeeze(result_scale, axis=1) + if zp is not None and config.group_size == -1: + zp = fns.squeeze(zp, axis=1) return result_scale, zp diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index f65049e6dff..1aad39d5c5d 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -142,7 +142,9 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val= return scale -def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division=False) -> Tensor: +def calculate_signed_scale( + weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division: Optional[bool] = True +) -> Tensor: """ Calculates the signed scale for symmetric quantization. @@ -255,7 +257,10 @@ def calculate_normalized_weight_and_fp4_scale( def calculate_integer_quantization_params( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False + weight: Tensor, + reduction_axes: ReductionAxes, + config: WeightCompressionConfig, + invert_division: Optional[bool] = True, ) -> Tuple[Tensor, Tensor]: """ Calculates the scale and zero point for uniform quantization (INT4, INT8), when the range of values is divided into @@ -291,7 +296,7 @@ def calculate_quantized_weight( config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None, - invert_division=False, + invert_division: Optional[bool] = True, ) -> Tensor: """ Quantizes the weight tensor using the provided scale and zero point. @@ -327,7 +332,10 @@ def calculate_quantized_weight( def get_integer_quantization_error( - weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, invert_division=False + weight: Tensor, + reduction_axes: ReductionAxes, + config: WeightCompressionConfig, + invert_division: Optional[bool] = True, ) -> float: """ Calculates a quantity characterizing the difference between floating point weights and fake quantized @@ -361,7 +369,7 @@ def compress_weight( config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, - invert_division=False, + invert_division: Optional[bool] = True, ): """ Compress weight using compression configuration. @@ -435,7 +443,7 @@ def do_int_quantization( reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, - invert_division: Optional[bool] = False, + invert_division: Optional[bool] = True, ov_model_params: Optional = None, ): """ @@ -453,6 +461,11 @@ def do_int_quantization( :return: A tuple containing the compressed weights, scale, and zero point tensors. 
""" assert config.is_integer, "The function supports integer quantization only" + if config.is_int_asym and (precomputed_scale is None) != (precomputed_zero_point is None): + raise ValueError( + "If precomputed quantization parameters are provided, both scale and zero point are required " + "for asymmetric quantization." + ) # import os accelerate_through_ov = ( @@ -528,11 +541,8 @@ def do_int_quantization( # Scale is always in fp32 so there is no need to store it in ov.Tensor if scale.backend == TensorBackend.ov: scale = scale.to_backend(TensorBackend.numpy) - elif precomputed_zero_point is None and config.is_int_asym: - # weight, scale -> compressed_weight, zero_point - compressed_weight, zero_point = model([weight, precomputed_scale]) - scale = precomputed_scale else: + # weight, scale, (zero_point) -> compressed_weight inputs = ( [weight, precomputed_scale] if precomputed_zero_point is None @@ -550,7 +560,7 @@ def calculate_quantized_dequantized_weight( reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, precomputed_zero_point: Optional[Tensor] = None, - invert_division: Optional[bool] = False, + invert_division: Optional[bool] = True, return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: @@ -606,7 +616,7 @@ def calculate_quantized_dequantized_weight( if precomputed_zero_point is not None: inputs.append(precomputed_zero_point) - compressed_weight, scale, zero_point = None, None, None + compressed_weight, scale, zero_point = None, precomputed_scale, precomputed_zero_point results = model(inputs) if len(results) == 1: decompressed_weight = results[0] diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index a225f53853a..3e7cee04bc1 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -344,7 +344,7 @@ def calculate_scale_zero_point( level_low: int, level_high: int, narrow_range: bool, - invert_division: Optional[bool] = False, + invert_division: Optional[bool] = True, ) -> Tuple[Tensor, Tensor]: """ Calculates scale and zero_point values for the quantizer. 
diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index f8cd0431f83..b5083a4b284 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -17,7 +17,8 @@ from nncf.tensor.functions import numeric from ..definitions import TensorBackend -from .numpy_numeric import DTYPE_MAP as NP_DTYPE_MAP +from .numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP +from .numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP DTYPE_MAP = { TensorDataType.float16: ov.Type.f16, @@ -40,7 +41,6 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model a_dtype = DTYPE_MAP_REV[a.get_element_type()] - assert a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4] model = get_astype_model( OVModelParameters( @@ -65,11 +65,13 @@ def _(a: ov.Tensor) -> TensorBackend: @numeric.astype.register(ov.Tensor) def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: - a_dtype = DTYPE_MAP_REV[a.get_element_type()] - if a_dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + if a.get_element_type() in [ov.Type.bf16, ov.Type.i4, ov.Type.u4] or dtype in [ + TensorDataType.bfloat16, + TensorDataType.int4, + TensorDataType.uint4, + ]: return _ov_astype(a, dtype) - - return ov.Tensor(a.data.astype(NP_DTYPE_MAP[dtype])) + return ov.Tensor(a.data.astype(DTYPE_MAP_NP[dtype])) @numeric.dtype.register(ov.Tensor) @@ -87,8 +89,19 @@ def _(a: ov.Tensor, shape: Union[int, Tuple[int, ...]]) -> ov.Tensor: return ov.Tensor(a.data.reshape(shape), shape, a.get_element_type()) +@numeric.to_backend.register(np.ndarray) +def _(a: np.ndarray, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]: + if b == TensorBackend.numpy: + return a + if b != TensorBackend.ov: + raise ValueError("Not supported backend") + return ov.Tensor(a, a.shape, DTYPE_MAP[DTYPE_MAP_REV_NP[a.dtype]]) + + @numeric.to_backend.register(ov.Tensor) -def _(a: ov.Tensor, b: TensorBackend) -> np.ndarray: +def _(a: ov.Tensor, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]: + if b == TensorBackend.ov: + return a if b != TensorBackend.numpy: raise ValueError("Not supported backend") diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py new file mode 100644 index 00000000000..872173c0990 --- /dev/null +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -0,0 +1,307 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import inspect +from collections import defaultdict +from contextlib import contextmanager +from enum import Enum +from unittest.mock import patch + +import numpy as np +import openvino as ov +import pytest + +from nncf import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.results_caching import ResultsCacheContainer +from nncf.results_caching import cache_results +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType +from nncf.tensor.definitions import TensorBackend +from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP +from nncf.tensor.functions.numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP +from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV + + +class ComputationBackend(Enum): + NumPy = "numpy" + OV = "ov" + + +class QuantizationTask(Enum): + Q = "quantize" + Q_DQ = "quantize_dequantize" + Q_DQ_RQ = "quantize_dequantize_return_quantized" + + +COMPRESSION_CONFIGS = [ + WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + WeightCompressionConfig(CompressWeightsMode.INT8_SYM), + WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), + WeightCompressionConfig(CompressWeightsMode.INT4_SYM), + WeightCompressionConfig(CompressWeightsMode.INT4_ASYM, group_size=2), + WeightCompressionConfig(CompressWeightsMode.INT4_SYM, group_size=2), +] + + +DATA_TYPES = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] + +WEIGHT_SHAPE = (1000, 4) + +TENSOR_BACKENDS = [TensorBackend.numpy, TensorBackend.ov] + +reduction_axes = (1,) + + +RANDOM_TENSOR_CACHE_CONTAINER = ResultsCacheContainer() + + +@cache_results(RANDOM_TENSOR_CACHE_CONTAINER) +def get_random_float_tensor(shape, dtype, backend, seed=0): + np.random.seed(seed) + data = np.random.normal(size=shape) + data = data.astype(np.float16 if dtype == TensorDataType.float16 else np.float32) + + if backend == TensorBackend.ov or dtype == TensorDataType.bfloat16: + data = Tensor(ov.Tensor(data, shape, DTYPE_MAP_OV[DTYPE_MAP_REV_NP[data.dtype]])) + if dtype == TensorDataType.bfloat16: + data = data.astype(TensorDataType.bfloat16) + if backend == TensorBackend.numpy: + data = data.to_backend(TensorBackend.numpy) if dtype == TensorDataType.bfloat16 else Tensor(data) + return Tensor(data) + + +@cache_results(RANDOM_TENSOR_CACHE_CONTAINER) +def get_random_integer_tensor(shape, low, high, dtype, backend, seed=0): + np.random.seed(seed) + data = np.random.randint(low, high, size=shape).astype(DTYPE_MAP_NP[dtype]) + if backend == TensorBackend.ov: + data = ov.Tensor(data, shape, DTYPE_MAP_OV[dtype]) + return Tensor(data) + + +@contextmanager +def openvino_available(available: bool): + import nncf.utils + + original_value = nncf.utils._openvino_available + nncf.utils._openvino_available = available + yield + nncf.utils._openvino_available = original_value + + +@pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) +# 
@pytest.mark.parametrize("config", [WeightCompressionConfig(CompressWeightsMode.INT8_ASYM)]) +@pytest.mark.parametrize( + ("quantization_task", "tensor_backend"), + [ + (QuantizationTask.Q, TensorBackend.numpy), + (QuantizationTask.Q, "auto"), + (QuantizationTask.Q, TensorBackend.ov), + (QuantizationTask.Q_DQ, TensorBackend.numpy), + (QuantizationTask.Q_DQ, "auto"), + (QuantizationTask.Q_DQ_RQ, TensorBackend.numpy), + (QuantizationTask.Q_DQ_RQ, "auto"), + ], +) +@pytest.mark.parametrize("dtype", DATA_TYPES) +@pytest.mark.parametrize("precompute", [False, True]) +def test_quantization_alignment(config, quantization_task, tensor_backend, dtype, precompute): + d1, d2 = WEIGHT_SHAPE + group_size = config.group_size + zero_point_shape = scale_shape = (d1, 1) if group_size == -1 else (d1, d2 // group_size, 1) + level_low, level_high = 0, 2**config.num_bits - 1 + + results = defaultdict(dict) + # Iterate over two implementations + for cb in [ComputationBackend.NumPy, ComputationBackend.OV]: + # A context manager to enable/disable ov implementation + with openvino_available(cb == ComputationBackend.OV): + # OV tensor backend for weight is only supported for quantization task + if quantization_task == QuantizationTask.Q and ( + tensor_backend == TensorBackend.ov or cb == ComputationBackend.OV and tensor_backend == "auto" + ): + weight_tensor_backend = TensorBackend.ov + else: + weight_tensor_backend = TensorBackend.numpy + + # Generate input tensors + weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, weight_tensor_backend) + precomputed_scale, precomputed_zero_point = None, None + if precompute: + # For precomputed mode, the weight is assumed to be already reshaped + if group_size != -1: + weight, _ = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) + + precomputed_scale = get_random_float_tensor(scale_shape, TensorDataType.float32, TensorBackend.numpy) + if config.is_int_asym: + precomputed_zero_point = get_random_integer_tensor( + zero_point_shape, level_low, level_high, TensorDataType.int32, TensorBackend.numpy + ) + + if quantization_task == QuantizationTask.Q: + fn_to_call = do_int_quantization + fn_to_patch = get_compress_weight_model + else: + fn_to_call = calculate_quantized_dequantized_weight + fn_to_patch = get_compress_decompress_weight_model + patch_path = f"{inspect.getmodule(fn_to_patch).__name__}.{fn_to_patch.__name__}" + with patch(patch_path, side_effect=fn_to_patch) as mock: + # For precomputed mode, all inputs are assumed to be already reshaped + r_axes = None if precompute else reduction_axes + kwargs = {"return_compressed_weight": True} if quantization_task == QuantizationTask.Q_DQ_RQ else {} + outputs = fn_to_call(weight, config, r_axes, precomputed_scale, precomputed_zero_point, **kwargs) + + decompressed_weight, compressed_weight, scale, zero_point = (None,) * 4 + if quantization_task == QuantizationTask.Q: + compressed_weight, scale, zero_point = outputs + elif quantization_task == QuantizationTask.Q_DQ: + decompressed_weight = outputs[0] + else: + decompressed_weight, compressed_weight, scale, zero_point = outputs + + if cb == ComputationBackend.NumPy: + mock.assert_not_called() + else: + mock.assert_called_once() + + if quantization_task != QuantizationTask.Q_DQ: + # Scale should always be float32 and numpy backend + assert scale.dtype == TensorDataType.float32 + assert scale.backend == TensorBackend.numpy + if precompute: + # In case of precomputed scale or zero point, the returned scale and z.p. 
should equal the given ones + np.testing.assert_allclose(precomputed_scale.data, scale.data) + if config.is_int_asym: + np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data) + + if ( + quantization_task == QuantizationTask.Q + and cb == ComputationBackend.OV + and weight_tensor_backend == TensorBackend.ov + and config.num_bits == 4 + ): + # For 4 bit compression in case of ov implementation and ov backend the compressed weight and the computed + # zero point must be in ov backend and have (u)int4 dtype in order to be able to insert them into OV model + # without re-packing + assert compressed_weight.backend == TensorBackend.ov + assert compressed_weight.dtype == (TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4) + if config.is_int_asym and not precompute: + assert zero_point.backend == TensorBackend.ov + assert zero_point.dtype == TensorDataType.uint4 + else: + if quantization_task != QuantizationTask.Q_DQ: + # Otherwise compressed weight and zero point must be returned in numpy backend, compressed weight must + # be of (u)int8 data type, zero point -- in int32 + assert compressed_weight.backend == TensorBackend.numpy + assert compressed_weight.dtype == (TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8) + if config.is_int_asym and not precompute: + assert zero_point.backend == TensorBackend.numpy + assert zero_point.dtype == TensorDataType.int32 + if quantization_task != QuantizationTask.Q: + assert decompressed_weight.backend == TensorBackend.numpy + assert decompressed_weight.dtype == TensorDataType.float32 + + # Save results for comparison between implementations + if quantization_task != QuantizationTask.Q: + results[cb]["decompressed_weight"] = decompressed_weight + if quantization_task != QuantizationTask.Q_DQ: + results[cb]["compressed_weight"] = compressed_weight.to_backend(TensorBackend.numpy) + results[cb]["scale"] = scale + if config.is_int_asym: + results[cb]["zero_point"] = zero_point.to_backend(TensorBackend.numpy) + + keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) + # Check that the computed tensors are equal between implementations + for key in keys: + numpy_result = results[ComputationBackend.NumPy][key].data + ov_result = results[ComputationBackend.OV][key].data + np.testing.assert_allclose(numpy_result, ov_result, err_msg=f"Results do not align for {key}.") + + +# @pytest.mark.parametrize("mode", COMPRESSION_MODES) +# @pytest.mark.parametrize("group_size", [2]) +# def test_grouped_quantization(mode, group_size): +# if mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM]: +# pytest.skip("Group size is not applicable for INT8 modes") +# +# # Generate random weight tensor +# weight_shape = (128, 4) +# weight = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy) +# +# # Create WeightCompressionConfig +# config = WeightCompressionConfig(mode, group_size=group_size) +# +# # Patch is_openvino_available to control the implementation +# with patch("nncf.utils.is_openvino_available", return_value=False): +# # Reference implementation +# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, reduction_axes) +# +# with patch("nncf.utils.is_openvino_available", return_value=True): +# # OpenVINO implementation +# ov_model_params = OVModelParameters(weight.dtype) +# decompressed_weight_ov = calculate_quantized_dequantized_weight( +# weight, config, reduction_axes, ov_model_params=ov_model_params +# ) +# +# # Compare 
decompressed weights +# np.testing.assert_allclose(decompressed_weight_ref.data, decompressed_weight_ov.data, atol=1e-5, rtol=1e-4) +# +# +# def test_weight_dtypes(): +# # Test different weight data types +# weight_shape = (128, 4) +# for dtype in DATA_TYPES: +# weight = get_random_float_tensor(weight_shape, dtype, TensorBackend.numpy) +# config = WeightCompressionConfig(CompressWeightsMode.INT8_SYM) +# +# # Reference implementation +# with patch("nncf.utils.is_openvino_available", return_value=False): +# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, reduction_axes) +# +# # OpenVINO implementation +# with patch("nncf.utils.is_openvino_available", return_value=True): +# ov_model_params = OVModelParameters(weight.dtype) +# decompressed_weight_ov = calculate_quantized_dequantized_weight( +# weight, config, reduction_axes, ov_model_params=ov_model_params +# ) +# +# # Compare decompressed weights +# np.testing.assert_allclose(decompressed_weight_ref.data, decompressed_weight_ov.data, atol=1e-5, rtol=1e-4) +# +# +# def test_tensor_backends(): +# # Test different tensor backends for do_int_quantization +# weight_shape = (128, 4) +# weight_numpy = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy) +# weight_ov = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.ov) +# config = WeightCompressionConfig(CompressWeightsMode.INT8_SYM) +# +# # Reference implementation with numpy backend +# with patch("nncf.utils.is_openvino_available", return_value=False): +# compressed_weight_ref, scale_ref = do_int_quantization(weight_numpy, config, reduction_axes) +# +# # OpenVINO implementation with OV backend +# with patch("nncf.utils.is_openvino_available", return_value=True): +# ov_model_params = OVModelParameters(weight_ov.dtype) +# compressed_weight_ov, scale_ov = do_int_quantization( +# weight_ov, config, reduction_axes, ov_model_params=ov_model_params +# ) +# +# # Compare compressed weights +# np.testing.assert_allclose(compressed_weight_ref.data, compressed_weight_ov.data, atol=1e-5, rtol=1e-4) +# +# # Compare scales +# np.testing.assert_allclose(scale_ref.data, scale_ov.data, atol=1e-5, rtol=1e-4) From c5606cec5ea03f8bcfbb4ff8a395cdcc205bfc8a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 1 Nov 2024 17:32:28 +0100 Subject: [PATCH 22/73] Invert Tensor division --- nncf/tensor/tensor.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nncf/tensor/tensor.py b/nncf/tensor/tensor.py index 1f776e19ad6..5db49985d4e 100644 --- a/nncf/tensor/tensor.py +++ b/nncf/tensor/tensor.py @@ -116,6 +116,12 @@ def __ipow__(self, other: Union[Tensor, float]) -> Tensor: self._data **= unwrap_tensor_data(other) return self + # def __truediv__(self, other: Union[Tensor, float]) -> Tensor: + # return self * _call_function("_binary_op_nowarn", 1.0, other, operator.truediv) + # + # def __rtruediv__(self, other: Union[Tensor, float]) -> Tensor: + # return other * _call_function("_binary_reverse_op_nowarn", self, 1.0, operator.truediv) + def __truediv__(self, other: Union[Tensor, float]) -> Tensor: return _call_function("_binary_op_nowarn", self, other, operator.truediv) From e6a9d56e2f6e0f9ef9d3d0764114f307747c42ca Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 4 Nov 2024 20:21:28 +0100 Subject: [PATCH 23/73] Add fns.divide --- .../weight_compression/weight_lowering.py | 39 +++++-------------- nncf/quantization/fake_quantize.py | 13 ++----- nncf/tensor/functions/__init__.py | 2 + nncf/tensor/functions/numeric.py | 
35 +++++++++++++++++ 4 files changed, 50 insertions(+), 39 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 1aad39d5c5d..20c4b3e539a 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -142,9 +142,7 @@ def calculate_e2m1_scale(weight: Tensor, reduction_axes: ReductionAxes, max_val= return scale -def calculate_signed_scale( - weight: Tensor, reduction_axes: ReductionAxes, num_bits=4, invert_division: Optional[bool] = True -) -> Tensor: +def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bits=4) -> Tensor: """ Calculates the signed scale for symmetric quantization. @@ -159,10 +157,7 @@ def calculate_signed_scale( w_max = fns.max(weight, axis=reduction_axes, keepdims=True) scale = fns.where(w_abs_min >= w_max, w_abs_min, -w_max) - if invert_division: - scale *= 1.0 / level_high - else: - scale /= level_high + fns.inplace_divide(scale, level_high) eps = fns.finfo(scale).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) @@ -183,7 +178,7 @@ def calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: if scale.dtype != TensorDataType.float32: scale = scale.astype(TensorDataType.float32) - return weight / scale + return fns.divide(weight, scale) def do_nf4_quantization(weight: Tensor, scale: Tensor, is_normalized_weight: bool = False) -> Tensor: @@ -260,7 +255,6 @@ def calculate_integer_quantization_params( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - invert_division: Optional[bool] = True, ) -> Tuple[Tensor, Tensor]: """ Calculates the scale and zero point for uniform quantization (INT4, INT8), when the range of values is divided into @@ -283,7 +277,7 @@ def calculate_integer_quantization_params( min_values = fns.min(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] max_values = fns.max(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] scale, zero_point = calculate_scale_zero_point( - min_values, max_values, level_low, level_high, narrow_range=False, invert_division=invert_division + min_values, max_values, level_low, level_high, narrow_range=False ) return scale, zero_point @@ -296,7 +290,6 @@ def calculate_quantized_weight( config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None, - invert_division: Optional[bool] = True, ) -> Tensor: """ Quantizes the weight tensor using the provided scale and zero point. @@ -305,7 +298,6 @@ def calculate_quantized_weight( :param config: Weight compression configuration. :param scale: Scale tensor used for quantization. :param zero_point: Zero point tensor used for quantization. - :param invert_division: applies inversion for scale and then multiply by weights instead of division. :return: Quantized weight tensor of uint8 or int8 type. 
""" if weight.dtype != TensorDataType.float32: @@ -319,10 +311,7 @@ def calculate_quantized_weight( level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - if invert_division: - compressed_weights = weight * (1.0 / scale) - else: - compressed_weights = weight / scale + compressed_weights = fns.divide(weight, scale) if zero_point is not None: compressed_weights += zero_point.astype(weight.dtype) compressed_weights = fns.round(compressed_weights) @@ -335,7 +324,6 @@ def get_integer_quantization_error( weight: Tensor, reduction_axes: ReductionAxes, config: WeightCompressionConfig, - invert_division: Optional[bool] = True, ) -> float: """ Calculates a quantity characterizing the difference between floating point weights and fake quantized @@ -351,9 +339,7 @@ def get_integer_quantization_error( if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - compressed_weights, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, invert_division=invert_division - ) + compressed_weights, scale, zero_point = do_int_quantization(weight, config, reduction_axes) decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) decompressed_weight = decompressed_weight.reshape(orig_shape) @@ -369,7 +355,6 @@ def compress_weight( config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, - invert_division: Optional[bool] = True, ): """ Compress weight using compression configuration. @@ -390,7 +375,7 @@ def compress_weight( ) return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, invert_division=invert_division + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point ) return CompressedWeight(compressed_weight, scale, zero_point) @@ -443,7 +428,6 @@ def do_int_quantization( reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, - invert_division: Optional[bool] = True, ov_model_params: Optional = None, ): """ @@ -455,8 +439,6 @@ def do_int_quantization( precomputed scale (and zero point) are provided. :param precomputed_scale: Optional precomputed scale tensor. :param precomputed_zero_point: Optional precomputed zero point tensor. - :param invert_division: Whether to apply inversion for scale and then multiply by weights instead of division. - Defaults to False. :param ov_model_params: OpenVINO model parameters for acceleration. :return: A tuple containing the compressed weights, scale, and zero point tensors. 
""" @@ -498,7 +480,7 @@ def do_int_quantization( if precomputed_zero_point is not None: zero_point = precomputed_zero_point - compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) + compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point) return compressed_weights, scale, zero_point from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters @@ -560,7 +542,6 @@ def calculate_quantized_dequantized_weight( reduction_axes: Optional[ReductionAxes] = None, precomputed_scale: Optional[Tensor] = None, precomputed_zero_point: Optional[Tensor] = None, - invert_division: Optional[bool] = True, return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: @@ -577,12 +558,12 @@ def calculate_quantized_dequantized_weight( # Reference implementation if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): compressed_weight, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, invert_division + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point ) else: scale = precomputed_scale if precomputed_scale is not None else None zero_point = precomputed_zero_point if precomputed_zero_point is not None else None - compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point, invert_division) + compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) if return_compressed_weight: return decompressed_weight, compressed_weight, scale, zero_point diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index 3e7cee04bc1..385cef9ca2e 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -11,7 +11,7 @@ import warnings from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Tuple import nncf from nncf.common.quantization.quantizers import calculate_asymmetric_level_ranges @@ -344,7 +344,6 @@ def calculate_scale_zero_point( level_low: int, level_high: int, narrow_range: bool, - invert_division: Optional[bool] = True, ) -> Tuple[Tensor, Tensor]: """ Calculates scale and zero_point values for the quantizer. @@ -360,17 +359,11 @@ def calculate_scale_zero_point( :return: Scale and Zero point values. 
""" levels = level_high - level_low if narrow_range else level_high - level_low + 1 - if invert_division: - scale = ((input_high - input_low) * (1.0 / (levels - 1))).astype(TensorDataType.float32) - else: - scale = ((input_high - input_low) / (levels - 1)).astype(TensorDataType.float32) + scale = fns.divide((input_high - input_low), (levels - 1)).astype(TensorDataType.float32) eps = fns.finfo(scale).eps # NOTE: adding machine epsilon to avoid division by zero scale = fns.where(fns.abs(scale) < eps, eps, scale) expected_level_low = level_low + 1 if narrow_range else level_low - if invert_division: - zero_point = expected_level_low - fns.round(input_low * (1.0 / scale)) - else: - zero_point = expected_level_low - fns.round(input_low / scale) + zero_point = expected_level_low - fns.round(fns.divide(input_low, scale)) zero_point = fns.clip(zero_point.astype(TensorDataType.int32), level_low, level_high) return scale, zero_point diff --git a/nncf/tensor/functions/__init__.py b/nncf/tensor/functions/__init__.py index 9affab79c90..52bc666dfa3 100644 --- a/nncf/tensor/functions/__init__.py +++ b/nncf/tensor/functions/__init__.py @@ -24,12 +24,14 @@ from nncf.tensor.functions.numeric import count_nonzero as count_nonzero from nncf.tensor.functions.numeric import device as device from nncf.tensor.functions.numeric import diag as diag +from nncf.tensor.functions.numeric import divide as divide from nncf.tensor.functions.numeric import dtype as dtype from nncf.tensor.functions.numeric import expand_dims as expand_dims from nncf.tensor.functions.numeric import eye as eye from nncf.tensor.functions.numeric import finfo as finfo from nncf.tensor.functions.numeric import flatten as flatten from nncf.tensor.functions.numeric import from_numpy as from_numpy +from nncf.tensor.functions.numeric import inplace_divide as inplace_divide from nncf.tensor.functions.numeric import isclose as isclose from nncf.tensor.functions.numeric import isempty as isempty from nncf.tensor.functions.numeric import item as item diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index cdec5788bf6..c6276a5e22f 100644 --- a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -911,3 +911,38 @@ def ceil(a: Tensor) -> Tensor: @tensor_guard def to_backend(a: Tensor, b: TensorBackend) -> Tensor: return Tensor(to_backend(a.data, b)) + + +@functools.singledispatch +@tensor_guard +def divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> Tensor: + """ + Divide two tensors or a tensor and a float. + This function divides `a` by `b`. If `invert` is True, it performs the division as `a * (1.0 / b)`. + Otherwise, it performs the division as `a / b`. + :param a: The first input tensor or float. + :param b: The second input tensor or float. + :param invert: If True, the division is performed as `a * (1.0 / b)`. If False, it is performed as `a / b`. + Defaults to True. + :return: A new tensor resulting from the division. + """ + return Tensor(a * (1.0 / b) if invert else a / b) + + +@functools.singledispatch +@tensor_guard +def inplace_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> None: + """ + In-place division of two tensors or a tensor and a float. + This function divides `a` by `b` in place. If `invert` is True, it performs the division as `a *= (1.0 / b)`. + Otherwise, it performs the division as `a /= b`. + :param a: The first input tensor or float. + :param b: The second input tensor or float. 
+    :param invert: If True, the division is performed as `a *= (1.0 / b)`. If False, it is performed as `a /= b`.
+        Defaults to True.
+    :return: None. The operation is performed in place.
+    """
+    if invert:
+        a *= 1.0 / b
+    else:
+        a /= b

From ab90a089738ed06f4687fd05d58db0dbe1ff1798 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Wed, 6 Nov 2024 18:48:46 +0100
Subject: [PATCH 24/73] Adapt misalignment test to check the degree of misalignment

---
 .../weight_compression/openvino_modeling.py   |   6 +-
 nncf/tensor/functions/ov.py                   |   2 +-
 .../quantization/test_openvino_modeling.py    | 124 ++++++++++++++----
 3 files changed, 102 insertions(+), 30 deletions(-)

diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
index 9f2fed9e03e..d98fbde9324 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
@@ -36,7 +36,7 @@ class OVModelParameters:
     input_dtype: TensorDataType
     output_dtype: Optional[TensorDataType] = None
-    dynamic_shapes: bool = True  # TODO: set to False once 156511 is resolved
+    dynamic_shapes: bool = False
     recompile: bool = False
     release_memory: bool = True
     share_inputs: bool = True
@@ -269,8 +269,8 @@ def _build_compress_decompress_model(
             compressed_w = ov_results[0]
             scale, zero_point = ov_parameters[1:]
 
-        subtrac_zero_point = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32)
-        decompressed_w = scale * opset.convert(subtrac_zero_point, ov.Type.f32)
+        compressed_w_ = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32)
+        decompressed_w = scale * opset.convert(compressed_w_, ov.Type.f32)
     else:
         if len(ov_parameters) == 1:
             # weight -> compressed_weight, scale
diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py
index b5083a4b284..483aac9bf6b 100644
--- a/nncf/tensor/functions/ov.py
+++ b/nncf/tensor/functions/ov.py
@@ -52,7 +52,7 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor:
             share_outputs=True,
             return_ov_tensors=True,
         ),
-        a.shape,
+        tuple(a.shape),
         dtype,
     )
     return model([a])[0].data
diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py
index 872173c0990..b9720def2ac 100644
--- a/tests/openvino/native/quantization/test_openvino_modeling.py
+++ b/tests/openvino/native/quantization/test_openvino_modeling.py
@@ -20,6 +20,7 @@
 from nncf import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters
 from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model
 from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model
 from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight
@@ -29,6 +30,7 @@
 from nncf.results_caching import cache_results
 from nncf.tensor import Tensor
 from nncf.tensor import TensorDataType
+from nncf.tensor import functions as fns
 from nncf.tensor.definitions import TensorBackend
 from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP
 from nncf.tensor.functions.numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP
@@ -58,11 +60,21 @@ class QuantizationTask(Enum):
 
 DATA_TYPES = 
[TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] -WEIGHT_SHAPE = (1000, 4) +WEIGHT_SHAPE = (10000, 4) + +MAX_MISALIGNMENT_FREQUENCY = { + TensorDataType.float32: 1e-2, # tends to < 5e-6 + TensorDataType.float16: 1e-2, # tends to < 5e-5 + TensorDataType.bfloat16: 1e-2, # tends to < 5e-4 +} + +MAX_MISALIGNMENT_MAGNITUDE = 1 TENSOR_BACKENDS = [TensorBackend.numpy, TensorBackend.ov] -reduction_axes = (1,) +EPS = np.finfo(np.float32).eps + +REDUCTION_AXES = (1,) RANDOM_TENSOR_CACHE_CONTAINER = ResultsCacheContainer() @@ -103,12 +115,12 @@ def openvino_available(available: bool): @pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) -# @pytest.mark.parametrize("config", [WeightCompressionConfig(CompressWeightsMode.INT8_ASYM)]) @pytest.mark.parametrize( ("quantization_task", "tensor_backend"), [ (QuantizationTask.Q, TensorBackend.numpy), (QuantizationTask.Q, "auto"), + # Only for quantization task NumPy backend should support OV tensors as inputs (QuantizationTask.Q, TensorBackend.ov), (QuantizationTask.Q_DQ, TensorBackend.numpy), (QuantizationTask.Q_DQ, "auto"), @@ -117,8 +129,9 @@ def openvino_available(available: bool): ], ) @pytest.mark.parametrize("dtype", DATA_TYPES) -@pytest.mark.parametrize("precompute", [False, True]) -def test_quantization_alignment(config, quantization_task, tensor_backend, dtype, precompute): +@pytest.mark.parametrize("precompute_s_zp", [False, True], ids=["no-precompute", "precompute"]) +@pytest.mark.parametrize("static_shapes", [False, True], ids=["dynamic-shapes", "static-shapes"]) +def test_quantization_alignment(config, quantization_task, tensor_backend, dtype, precompute_s_zp, static_shapes): d1, d2 = WEIGHT_SHAPE group_size = config.group_size zero_point_shape = scale_shape = (d1, 1) if group_size == -1 else (d1, d2 // group_size, 1) @@ -140,10 +153,10 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # Generate input tensors weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, weight_tensor_backend) precomputed_scale, precomputed_zero_point = None, None - if precompute: - # For precomputed mode, the weight is assumed to be already reshaped + if precompute_s_zp: + # When scale (and z.p) are precomputed, all inputs are assumed to be reshaped beforehand if group_size != -1: - weight, _ = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) + weight, _ = reshape_weight_for_grouped_quantization(weight, REDUCTION_AXES, group_size) precomputed_scale = get_random_float_tensor(scale_shape, TensorDataType.float32, TensorBackend.numpy) if config.is_int_asym: @@ -159,16 +172,26 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype fn_to_patch = get_compress_decompress_weight_model patch_path = f"{inspect.getmodule(fn_to_patch).__name__}.{fn_to_patch.__name__}" with patch(patch_path, side_effect=fn_to_patch) as mock: - # For precomputed mode, all inputs are assumed to be already reshaped - r_axes = None if precompute else reduction_axes - kwargs = {"return_compressed_weight": True} if quantization_task == QuantizationTask.Q_DQ_RQ else {} - outputs = fn_to_call(weight, config, r_axes, precomputed_scale, precomputed_zero_point, **kwargs) + # When scale (and z.p) are precomputed, all inputs are assumed to be already reshaped and reduction + # axes are not needed + reduction_axes = None if precompute_s_zp else REDUCTION_AXES + + kwargs = {} + if cb == ComputationBackend.OV: + ov_model_params = 
OVModelParameters(weight.dtype, dynamic_shapes=not static_shapes) + kwargs["ov_model_params"] = ov_model_params + if quantization_task == QuantizationTask.Q_DQ_RQ: + kwargs["return_compressed_weight"] = True + + outputs = fn_to_call( + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, **kwargs + ) decompressed_weight, compressed_weight, scale, zero_point = (None,) * 4 if quantization_task == QuantizationTask.Q: compressed_weight, scale, zero_point = outputs elif quantization_task == QuantizationTask.Q_DQ: - decompressed_weight = outputs[0] + decompressed_weight = outputs else: decompressed_weight, compressed_weight, scale, zero_point = outputs @@ -181,7 +204,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # Scale should always be float32 and numpy backend assert scale.dtype == TensorDataType.float32 assert scale.backend == TensorBackend.numpy - if precompute: + if precompute_s_zp: # In case of precomputed scale or zero point, the returned scale and z.p. should equal the given ones np.testing.assert_allclose(precomputed_scale.data, scale.data) if config.is_int_asym: @@ -198,7 +221,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # without re-packing assert compressed_weight.backend == TensorBackend.ov assert compressed_weight.dtype == (TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4) - if config.is_int_asym and not precompute: + if config.is_int_asym and not precompute_s_zp: assert zero_point.backend == TensorBackend.ov assert zero_point.dtype == TensorDataType.uint4 else: @@ -207,7 +230,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # be of (u)int8 data type, zero point -- in int32 assert compressed_weight.backend == TensorBackend.numpy assert compressed_weight.dtype == (TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8) - if config.is_int_asym and not precompute: + if config.is_int_asym and not precompute_s_zp: assert zero_point.backend == TensorBackend.numpy assert zero_point.dtype == TensorDataType.int32 if quantization_task != QuantizationTask.Q: @@ -224,11 +247,60 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype results[cb]["zero_point"] = zero_point.to_backend(TensorBackend.numpy) keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) - # Check that the computed tensors are equal between implementations for key in keys: - numpy_result = results[ComputationBackend.NumPy][key].data - ov_result = results[ComputationBackend.OV][key].data - np.testing.assert_allclose(numpy_result, ov_result, err_msg=f"Results do not align for {key}.") + numpy_result = results[ComputationBackend.NumPy][key] + ov_result = results[ComputationBackend.OV][key] + + atol = 0 + scale = None + # For static-shaped OV models doing asymmetric compression there maybe misalignments between OV and NumPy + # For more details see 156511 + if static_shapes and config.is_int_asym: + if key == "compressed_weight": + atol = MAX_MISALIGNMENT_MAGNITUDE + elif key == "decompressed_weight": + if "scale" in results[ComputationBackend.NumPy]: + scale = results[ComputationBackend.NumPy]["scale"] + else: + if precompute_s_zp: + scale = precomputed_scale + else: + weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, TensorBackend.numpy) + with openvino_available(False): + _, _, scale, _ = calculate_quantized_dequantized_weight( + weight, config, REDUCTION_AXES, return_compressed_weight=True + 
) + # For decompressed weight the misalignment magnitude depends on the scale + atol = MAX_MISALIGNMENT_MAGNITUDE * fns.abs(scale).max().item() + EPS + max_misalignment_frequency = MAX_MISALIGNMENT_FREQUENCY[dtype] + else: + max_misalignment_frequency = None + + # Check that the computed tensors are equal between implementations + np.testing.assert_allclose( + numpy_result.data, ov_result.data, atol=atol, err_msg=f"Results do not align for {key}." + ) + + if max_misalignment_frequency is not None: + if key == "compressed_weight": + diff = fns.abs(numpy_result.astype(TensorDataType.int32) - ov_result.astype(TensorDataType.int32)) + else: + diff = fns.abs(numpy_result - ov_result) + + if diff.max() > 0: + # Check that the proportion of misaligned values is small + n_not_equal = fns.sum(diff > 0) + assert n_not_equal / numpy_result.size < max_misalignment_frequency + + # Check that the magnitude of misalignment is as small as expected + if key == "decompressed_weight": + # Reshape scale to match the shape of decompressed weight + scale = np.repeat(scale.data, diff.shape[-1], axis=-1) + np.testing.assert_array_less( + diff.data, + MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, + err_msg=f"Too large misalignment for {key}.", + ) # @pytest.mark.parametrize("mode", COMPRESSION_MODES) @@ -247,13 +319,13 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # # Patch is_openvino_available to control the implementation # with patch("nncf.utils.is_openvino_available", return_value=False): # # Reference implementation -# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, reduction_axes) +# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, REDUCTION_AXES) # # with patch("nncf.utils.is_openvino_available", return_value=True): # # OpenVINO implementation # ov_model_params = OVModelParameters(weight.dtype) # decompressed_weight_ov = calculate_quantized_dequantized_weight( -# weight, config, reduction_axes, ov_model_params=ov_model_params +# weight, config, REDUCTION_AXES, ov_model_params=ov_model_params # ) # # # Compare decompressed weights @@ -269,13 +341,13 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # # # Reference implementation # with patch("nncf.utils.is_openvino_available", return_value=False): -# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, reduction_axes) +# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, REDUCTION_AXES) # # # OpenVINO implementation # with patch("nncf.utils.is_openvino_available", return_value=True): # ov_model_params = OVModelParameters(weight.dtype) # decompressed_weight_ov = calculate_quantized_dequantized_weight( -# weight, config, reduction_axes, ov_model_params=ov_model_params +# weight, config, REDUCTION_AXES, ov_model_params=ov_model_params # ) # # # Compare decompressed weights @@ -291,13 +363,13 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype # # # Reference implementation with numpy backend # with patch("nncf.utils.is_openvino_available", return_value=False): -# compressed_weight_ref, scale_ref = do_int_quantization(weight_numpy, config, reduction_axes) +# compressed_weight_ref, scale_ref = do_int_quantization(weight_numpy, config, REDUCTION_AXES) # # # OpenVINO implementation with OV backend # with patch("nncf.utils.is_openvino_available", return_value=True): # ov_model_params = OVModelParameters(weight_ov.dtype) # 
compressed_weight_ov, scale_ov = do_int_quantization( -# weight_ov, config, reduction_axes, ov_model_params=ov_model_params +# weight_ov, config, REDUCTION_AXES, ov_model_params=ov_model_params # ) # # # Compare compressed weights From 6289c5cb6bd172a3b47b487be9a7be1840e619c6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 7 Nov 2024 14:56:07 +0100 Subject: [PATCH 25/73] Merge-related fixes --- .../native/quantization/test_openvino_modeling.py | 2 +- .../native/quantization/test_weights_compression.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index b9720def2ac..6d82885dd0f 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -120,7 +120,7 @@ def openvino_available(available: bool): [ (QuantizationTask.Q, TensorBackend.numpy), (QuantizationTask.Q, "auto"), - # Only for quantization task NumPy backend should support OV tensors as inputs + # NumPy backend should support OV tensors as inputs only for quantization task (QuantizationTask.Q, TensorBackend.ov), (QuantizationTask.Q_DQ, TensorBackend.numpy), (QuantizationTask.Q_DQ, "auto"), diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 239a94eaf63..f187b34961f 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1062,8 +1062,6 @@ def test_compressed_weighs_range(mode, data): ], ) def test_int_quantization_with_precomputed_parameters(config, precompute_scale, precompute_zero_point, raises): - is_asym = config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT8_ASYM] - precomputed_scale, precomputed_zero_point = None, None weight = Tensor(((np.arange(11) - 5) / 10).astype(np.float32)[:, None]) if precompute_scale: @@ -1073,18 +1071,18 @@ def test_int_quantization_with_precomputed_parameters(config, precompute_scale, if raises: with pytest.raises(ValueError) as exc_info: - _, scale, zero_point = do_int_quantization(weight, -1, config, precomputed_scale, precomputed_zero_point) + _, scale, zero_point = do_int_quantization(weight, config, -1, precomputed_scale, precomputed_zero_point) assert exc_info.value == ( "If precomputed quantization parameters are provided, both scale and zero point " "are required for asymmetric quantization." 
) return else: - _, scale, zero_point = do_int_quantization(weight, -1, config, precomputed_scale, precomputed_zero_point) + _, scale, zero_point = do_int_quantization(weight, config, -1, precomputed_scale, precomputed_zero_point) if precompute_scale: assert np.allclose(scale.data, precomputed_scale.data) - if is_asym: + if config.is_int_asym: if precompute_zero_point: assert np.allclose(zero_point.data, precomputed_zero_point.data) else: From f60fd177e56ccce6f67fae1db2b1708bdce1ca52 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 7 Nov 2024 16:33:18 +0100 Subject: [PATCH 26/73] Tweaks --- .../weight_compression/openvino_modeling.py | 16 ++-- nncf/tensor/functions/ov.py | 3 +- .../quantization/test_openvino_modeling.py | 89 ++----------------- 3 files changed, 16 insertions(+), 92 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index d98fbde9324..a2092604b1a 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -23,7 +23,7 @@ from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType -from nncf.tensor.functions.ov import DTYPE_MAP as OV_DTYPE_MAP +from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV TensorList = List[Tensor] ModelCallable = Callable[[TensorList], TensorList] @@ -61,9 +61,7 @@ def __hash__(self): def run_model( ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, return_ov_tensors: bool, inputs: TensorList ) -> TensorList: - if any(isinstance(it, Tensor) for it in inputs): - inputs = [inp.data for inp in inputs] - + inputs = [inp.data for inp in inputs] if return_ov_tensors: infer_request = compiled_model.create_infer_request() infer_request.infer( @@ -151,7 +149,7 @@ def _build_compress_model( reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, ) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: - weight = opset.parameter(weight_shape, name="w", dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) + weight = opset.parameter(weight_shape, name="w", dtype=DTYPE_MAP_OV[ov_model_params.input_dtype]) ov_parameters = [weight] num_bits = config.num_bits @@ -214,13 +212,13 @@ def _build_compress_model( if config.is_int_asym: if ov_model_params.output_dtype is not None: - dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] + dtype = DTYPE_MAP_OV[ov_model_params.output_dtype] else: dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 compressed_w += zero_point else: if ov_model_params.output_dtype is not None: - dtype = OV_DTYPE_MAP[ov_model_params.output_dtype] + dtype = DTYPE_MAP_OV[ov_model_params.output_dtype] else: dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4 @@ -296,8 +294,8 @@ def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype @cache_results(OV_MODEL_CACHE) def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: - arg = opset.parameter(arg_shape, dtype=OV_DTYPE_MAP[ov_model_params.input_dtype]) - res = opset.convert(arg, OV_DTYPE_MAP[dtype]) + arg = opset.parameter(arg_shape, dtype=DTYPE_MAP_OV[ov_model_params.input_dtype]) + res = opset.convert(arg, DTYPE_MAP_OV[dtype]) model = ov.Model([res], [arg]) compiled_model = ov.compile_model(model, device_name="CPU") 
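For context on how the helpers touched above fit together at this point in the series, a minimal usage sketch of the weight-compression model builder (illustrative only, not part of the patch; the INT8_ASYM config, the shapes, and the random input are arbitrary choices, and the call mirrors what do_int_quantization does internally with the patch-26 state of OVModelParameters):

    import numpy as np

    from nncf import CompressWeightsMode
    from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
    from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters
    from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model
    from nncf.tensor import Tensor
    from nncf.tensor import TensorDataType

    # Build (and cache) a CPU-compiled model that maps a weight to compressed weight, scale and zero point.
    ov_model_params = OVModelParameters(input_dtype=TensorDataType.float32)
    model = get_compress_weight_model(
        ov_model_params,
        WeightCompressionConfig(CompressWeightsMode.INT8_ASYM),
        weight_shape=(10, 4),
        reduction_axes=(1,),
    )

    # The returned callable wraps an ov.CompiledModel; inputs and outputs are nncf Tensors.
    weight = Tensor(np.random.rand(10, 4).astype(np.float32))
    compressed_weight, scale, zero_point = model([weight])

Calling get_compress_weight_model again with the same OVModelParameters, config and shapes reuses the compiled model from OV_MODEL_CACHE instead of recompiling it.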
diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index 483aac9bf6b..55b0b854499 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -13,6 +13,7 @@ import numpy as np import openvino as ov +from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.functions import numeric @@ -55,7 +56,7 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: tuple(a.shape), dtype, ) - return model([a])[0].data + return model([Tensor(a)])[0].data @numeric.backend.register(ov.Tensor) diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index 6d82885dd0f..e1e45ef0391 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -60,8 +60,6 @@ class QuantizationTask(Enum): DATA_TYPES = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] -WEIGHT_SHAPE = (10000, 4) - MAX_MISALIGNMENT_FREQUENCY = { TensorDataType.float32: 1e-2, # tends to < 5e-6 TensorDataType.float16: 1e-2, # tends to < 5e-5 @@ -114,6 +112,7 @@ def openvino_available(available: bool): nncf.utils._openvino_available = original_value +@pytest.mark.parametrize("weight_shape", [(10000, 4)], ids=[""]) @pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) @pytest.mark.parametrize( ("quantization_task", "tensor_backend"), @@ -131,8 +130,10 @@ def openvino_available(available: bool): @pytest.mark.parametrize("dtype", DATA_TYPES) @pytest.mark.parametrize("precompute_s_zp", [False, True], ids=["no-precompute", "precompute"]) @pytest.mark.parametrize("static_shapes", [False, True], ids=["dynamic-shapes", "static-shapes"]) -def test_quantization_alignment(config, quantization_task, tensor_backend, dtype, precompute_s_zp, static_shapes): - d1, d2 = WEIGHT_SHAPE +def test_quantization_alignment( + weight_shape, config, quantization_task, tensor_backend, dtype, precompute_s_zp, static_shapes +): + d1, d2 = weight_shape group_size = config.group_size zero_point_shape = scale_shape = (d1, 1) if group_size == -1 else (d1, d2 // group_size, 1) level_low, level_high = 0, 2**config.num_bits - 1 @@ -151,7 +152,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype weight_tensor_backend = TensorBackend.numpy # Generate input tensors - weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, weight_tensor_backend) + weight = get_random_float_tensor(weight_shape, dtype, weight_tensor_backend) precomputed_scale, precomputed_zero_point = None, None if precompute_s_zp: # When scale (and z.p) are precomputed, all inputs are assumed to be reshaped beforehand @@ -265,7 +266,7 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype if precompute_s_zp: scale = precomputed_scale else: - weight = get_random_float_tensor(WEIGHT_SHAPE, dtype, TensorBackend.numpy) + weight = get_random_float_tensor(weight_shape, dtype, TensorBackend.numpy) with openvino_available(False): _, _, scale, _ = calculate_quantized_dequantized_weight( weight, config, REDUCTION_AXES, return_compressed_weight=True @@ -301,79 +302,3 @@ def test_quantization_alignment(config, quantization_task, tensor_backend, dtype MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, err_msg=f"Too large misalignment for {key}.", ) - - -# @pytest.mark.parametrize("mode", COMPRESSION_MODES) -# @pytest.mark.parametrize("group_size", [2]) -# def 
test_grouped_quantization(mode, group_size): -# if mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM]: -# pytest.skip("Group size is not applicable for INT8 modes") -# -# # Generate random weight tensor -# weight_shape = (128, 4) -# weight = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy) -# -# # Create WeightCompressionConfig -# config = WeightCompressionConfig(mode, group_size=group_size) -# -# # Patch is_openvino_available to control the implementation -# with patch("nncf.utils.is_openvino_available", return_value=False): -# # Reference implementation -# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, REDUCTION_AXES) -# -# with patch("nncf.utils.is_openvino_available", return_value=True): -# # OpenVINO implementation -# ov_model_params = OVModelParameters(weight.dtype) -# decompressed_weight_ov = calculate_quantized_dequantized_weight( -# weight, config, REDUCTION_AXES, ov_model_params=ov_model_params -# ) -# -# # Compare decompressed weights -# np.testing.assert_allclose(decompressed_weight_ref.data, decompressed_weight_ov.data, atol=1e-5, rtol=1e-4) -# -# -# def test_weight_dtypes(): -# # Test different weight data types -# weight_shape = (128, 4) -# for dtype in DATA_TYPES: -# weight = get_random_float_tensor(weight_shape, dtype, TensorBackend.numpy) -# config = WeightCompressionConfig(CompressWeightsMode.INT8_SYM) -# -# # Reference implementation -# with patch("nncf.utils.is_openvino_available", return_value=False): -# decompressed_weight_ref = calculate_quantized_dequantized_weight(weight, config, REDUCTION_AXES) -# -# # OpenVINO implementation -# with patch("nncf.utils.is_openvino_available", return_value=True): -# ov_model_params = OVModelParameters(weight.dtype) -# decompressed_weight_ov = calculate_quantized_dequantized_weight( -# weight, config, REDUCTION_AXES, ov_model_params=ov_model_params -# ) -# -# # Compare decompressed weights -# np.testing.assert_allclose(decompressed_weight_ref.data, decompressed_weight_ov.data, atol=1e-5, rtol=1e-4) -# -# -# def test_tensor_backends(): -# # Test different tensor backends for do_int_quantization -# weight_shape = (128, 4) -# weight_numpy = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy) -# weight_ov = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.ov) -# config = WeightCompressionConfig(CompressWeightsMode.INT8_SYM) -# -# # Reference implementation with numpy backend -# with patch("nncf.utils.is_openvino_available", return_value=False): -# compressed_weight_ref, scale_ref = do_int_quantization(weight_numpy, config, REDUCTION_AXES) -# -# # OpenVINO implementation with OV backend -# with patch("nncf.utils.is_openvino_available", return_value=True): -# ov_model_params = OVModelParameters(weight_ov.dtype) -# compressed_weight_ov, scale_ov = do_int_quantization( -# weight_ov, config, REDUCTION_AXES, ov_model_params=ov_model_params -# ) -# -# # Compare compressed weights -# np.testing.assert_allclose(compressed_weight_ref.data, compressed_weight_ov.data, atol=1e-5, rtol=1e-4) -# -# # Compare scales -# np.testing.assert_allclose(scale_ref.data, scale_ov.data, atol=1e-5, rtol=1e-4) From 57a0931fb0e60675e2e9970d145b96b6e86a72e0 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 11 Nov 2024 16:32:39 +0100 Subject: [PATCH 27/73] Strict input/output data types --- .../weight_compression/openvino_modeling.py | 159 +++++++++++++----- .../weight_compression/weight_lowering.py | 43 +++-- 
 nncf/tensor/functions/ov.py                   |   4 +-
 .../quantization/test_openvino_modeling.py    |   4 +-
 4 files changed, 149 insertions(+), 61 deletions(-)

diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
index a2092604b1a..cec16ce8bb7 100644
--- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
+++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py
@@ -11,13 +11,12 @@
 
 from dataclasses import dataclass
 from functools import partial
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import openvino as ov
 from openvino.runtime import opset13 as opset
 
-from nncf import CompressWeightsMode
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.results_caching import ResultsCacheContainer
 from nncf.results_caching import cache_results
@@ -34,8 +33,8 @@
 
 @dataclass
 class OVModelParameters:
-    input_dtype: TensorDataType
-    output_dtype: Optional[TensorDataType] = None
+    input_dtypes: Optional[Dict[str, TensorDataType]] = None
+    output_dtypes: Optional[Dict[str, TensorDataType]] = None
     dynamic_shapes: bool = False
     recompile: bool = False
     release_memory: bool = True
@@ -46,8 +45,8 @@ class OVModelParameters:
     def __hash__(self):
         return hash(
             (
-                self.input_dtype,
-                self.output_dtype,
+                None if self.input_dtypes is None else frozenset(self.input_dtypes.items()),
+                None if self.output_dtypes is None else frozenset(self.output_dtypes.items()),
                 self.dynamic_shapes,
                 self.recompile,
                 self.release_memory,
@@ -61,6 +60,15 @@ def __hash__(self):
 def run_model(
     ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, return_ov_tensors: bool, inputs: TensorList
 ) -> TensorList:
+    # Check that input dtypes match the expected dtypes
+    for i, inp in enumerate(compiled_model.inputs):
+        input_name = inp.any_name
+        actual_dtype = inputs[i].dtype
+        expected_dtype = ov_model_params.input_dtypes[input_name]
+        if actual_dtype != expected_dtype:
+            raise ValueError(f"Expected input '{input_name}' to be {expected_dtype}. But found: {actual_dtype}.")
+
+    # Infer the model
     inputs = [inp.data for inp in inputs]
     if return_ov_tensors:
         infer_request = compiled_model.create_infer_request()
         infer_request.infer(
@@ -149,12 +157,60 @@ def _build_compress_model(
     reduction_axes: Optional[Tuple] = None,
     return_nodes: bool = False,
 ) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]:
-    weight = opset.parameter(weight_shape, name="w", dtype=DTYPE_MAP_OV[ov_model_params.input_dtype])
+    input_dtypes = ov_model_params.input_dtypes
+    if input_dtypes is None:
+        raise ValueError("Input dtypes must be provided.")
+    output_dtypes = ov_model_params.output_dtypes
+    if output_dtypes is None:
+        raise ValueError("Output dtypes must be provided.")
+
+    weight_dtype = input_dtypes.get("weight")
+    input_scale_dtype = input_dtypes.get("scale", None)
+    input_zero_point_dtype = input_dtypes.get("zero_point", None)
+    compressed_weight_dtype = output_dtypes.get("compressed_weight")
+    output_scale_dtype = output_dtypes.get("scale", None)
+    output_zero_point_dtype = output_dtypes.get("zero_point", None)
+
+    # Validate input dtypes
+    valid_weight_dtypes = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]
+    if weight_dtype not in valid_weight_dtypes:
+        raise ValueError(
+            f"Weight must be one of the following data types: {valid_weight_dtypes}. 
But found: {weight_dtype}." + ) + if scale_shape is not None and input_scale_dtype != TensorDataType.float32: + raise ValueError(f"Input scale must be of float32 data type. But found: {input_scale_dtype}.") + if zero_point_shape is not None and input_zero_point_dtype != TensorDataType.int32: + raise ValueError(f"Input zero point must be of int32 data type. But found: {input_zero_point_dtype}.") + + # Validate output dtypes + valid_compressed_weight_dtypes = [ + TensorDataType.int32, + TensorDataType.int8, + TensorDataType.uint8, + TensorDataType.int4, + TensorDataType.uint4, + ] + if compressed_weight_dtype not in valid_compressed_weight_dtypes: + raise ValueError( + f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. " + f"But found: {compressed_weight_dtype}." + ) + if scale_shape is None and output_scale_dtype != TensorDataType.float32: + raise ValueError(f"Output scale must be of float32 data type. But found: {output_scale_dtype}.") + is_int_asym = config.is_int_asym + if is_int_asym and zero_point_shape is None and output_zero_point_dtype not in valid_compressed_weight_dtypes: + raise ValueError( + f"Output zero point must be of one of the following data types: {valid_compressed_weight_dtypes}. " + f"But found: {output_zero_point_dtype}." + ) + + # Build OV model + weight = opset.parameter(weight_shape, name="weight", dtype=DTYPE_MAP_OV[weight_dtype]) ov_parameters = [weight] num_bits = config.num_bits eps = np.finfo(np.float32).eps - if config.is_int_asym: + if is_int_asym: level_low = 0 level_high = 2**num_bits - 1 else: @@ -164,11 +220,11 @@ def _build_compress_model( min_values = None if scale_shape is not None: # Scale is given as an input - scale = opset.parameter(scale_shape, name="s", dtype=ov.Type.f32) + scale = opset.parameter(scale_shape, name="scale", dtype=ov.Type.f32) ov_parameters.append(scale) else: # Compute scale - if config.is_int_asym: + if is_int_asym: min_values = opset.reduce_min( weight, reduction_axes=reduction_axes, keep_dims=True ) # [a1, r, a2] -> [a1, 1, a2] @@ -192,47 +248,36 @@ def _build_compress_model( zero_point = None if zero_point_shape is not None: # Zero point is given as an input - zero_point = opset.parameter(zero_point_shape, name="zp", dtype=ov.Type.i32) + zero_point = opset.parameter(zero_point_shape, name="zero_point", dtype=ov.Type.i32) ov_parameters.append(zero_point) + # Cast to float32 for an addition later zero_point = opset.convert(zero_point, ov.Type.f32) - elif config.is_int_asym: + elif is_int_asym: # Compute zero point if min_values is None: min_values = opset.reduce_min( weight, reduction_axes=reduction_axes, keep_dims=True ) # [a1, r, a2] -> [a1, 1, a2] min_values = opset.convert(min_values, ov.Type.f32) - zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) zero_point = opset.clamp(zero_point, level_low, level_high) if weight.get_element_type() != ov.Type.f32: weight = opset.convert(weight, ov.Type.f32) - compressed_w = weight / scale + compressed_weight = weight / scale - if config.is_int_asym: - if ov_model_params.output_dtype is not None: - dtype = DTYPE_MAP_OV[ov_model_params.output_dtype] - else: - dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4 - compressed_w += zero_point - else: - if ov_model_params.output_dtype is not None: - dtype = DTYPE_MAP_OV[ov_model_params.output_dtype] - else: - dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4 + if is_int_asym: + compressed_weight += 
zero_point - compressed_w = opset.round(compressed_w) - compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high) - compressed_w = opset.convert(compressed_w, dtype, name="compressed_weights") + compressed_weight = opset.round(compressed_weight) + compressed_weight = opset.clamp(opset.round(compressed_weight), level_low, level_high) + compressed_weight = opset.convert(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) - ov_results = [compressed_w] + ov_results = [compressed_weight] if len(ov_parameters) == 1: ov_results.append(scale) if zero_point is not None: - zero_point_dtype = compressed_w.get_element_type() if ov_model_params.return_ov_tensors else ov.Type.i32 - if zero_point.get_element_type() != zero_point_dtype: - zero_point = opset.convert(zero_point, zero_point_dtype) + zero_point = opset.convert(zero_point, DTYPE_MAP_OV[output_zero_point_dtype]) ov_results.append(zero_point) if return_nodes: @@ -254,6 +299,17 @@ def _build_compress_decompress_model( reduction_axes: Optional[Tuple] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: + input_dtypes = ov_model_params.input_dtypes + if input_dtypes is None: + raise ValueError("Input dtypes must be provided.") + output_dtypes = ov_model_params.output_dtypes + if output_dtypes is None: + raise ValueError("Output dtypes must be provided.") + + decompressed_weight_dtype = output_dtypes.get("decompressed_weight") + if decompressed_weight_dtype != TensorDataType.float32: + raise ValueError(f"Decompressed weight must be of float32 data type. But found: {decompressed_weight_dtype}.") + ov_parameters, ov_results = _build_compress_model( config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) @@ -261,41 +317,54 @@ def _build_compress_decompress_model( if config.is_int_asym: if len(ov_parameters) == 1: # weight -> compressed_weight, scale, zero_point - compressed_w, scale, zero_point = ov_results + compressed_weight, scale, zero_point = ov_results else: # weight, scale, zero_point -> compressed_weight - compressed_w = ov_results[0] + compressed_weight = ov_results[0] scale, zero_point = ov_parameters[1:] - compressed_w_ = opset.convert(compressed_w, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32) - decompressed_w = scale * opset.convert(compressed_w_, ov.Type.f32) + compressed_weight = opset.convert(compressed_weight, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32) else: if len(ov_parameters) == 1: # weight -> compressed_weight, scale - compressed_w, scale = ov_results + compressed_weight, scale = ov_results else: # weight, scale -> compressed_weight - compressed_w = ov_results[0] + compressed_weight = ov_results[0] scale = ov_parameters[1] - decompressed_w = opset.convert(compressed_w, ov.Type.f32) * scale - ov_results = [decompressed_w] + ov_results if return_compressed_weight else [decompressed_w] + if compressed_weight.get_element_type() != ov.Type.f32: + compressed_weight = opset.convert(compressed_weight, ov.Type.f32) + decompressed_weight = opset.multiply(scale, compressed_weight) + + ov_results = [decompressed_weight] + ov_results if return_compressed_weight else [decompressed_weight] model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) -def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: +def 
get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> ModelCallable: if ov_model_params.dynamic_shapes: arg_shape = (-1,) * len(arg_shape) - return _build_astype_model(ov_model_params, arg_shape, dtype) + return _build_astype_model(ov_model_params, arg_shape) @cache_results(OV_MODEL_CACHE) -def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple, dtype: TensorDataType) -> ModelCallable: - arg = opset.parameter(arg_shape, dtype=DTYPE_MAP_OV[ov_model_params.input_dtype]) - res = opset.convert(arg, DTYPE_MAP_OV[dtype]) +def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> ModelCallable: + input_dtypes = ov_model_params.input_dtypes + if input_dtypes is None: + raise ValueError("Input dtypes must be provided.") + output_dtypes = ov_model_params.output_dtypes + if output_dtypes is None: + raise ValueError("Output dtypes must be provided.") + if "input" not in input_dtypes: + raise ValueError("Input dtype is required.") + if "output" not in output_dtypes: + raise ValueError("Output dtype is required.") + + arg = opset.parameter(arg_shape, dtype=DTYPE_MAP_OV[input_dtypes["input"]], name="input") + res = opset.convert(arg, DTYPE_MAP_OV[output_dtypes["output"]]) model = ov.Model([res], [arg]) compiled_model = ov.compile_model(model, device_name="CPU") diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 20c4b3e539a..4e1cabc3790 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -8,6 +8,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import logging from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -490,13 +491,25 @@ def do_int_quantization( scale_shape = None if precomputed_scale is None else precomputed_scale.shape zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape - if ov_model_params is None: - ov_model_params = OVModelParameters(weight.dtype) - if config.num_bits == 4: - if weight.backend == TensorBackend.ov: - ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov - else: - ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 + ov_model_params = OVModelParameters() if ov_model_params is None else copy.deepcopy(ov_model_params) + ov_model_params.input_dtypes = ov_model_params.input_dtypes or { + "weight": weight.dtype, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + ov_model_params.output_dtypes = ov_model_params.output_dtypes or { + "compressed_weight": TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + if config.num_bits == 4 and weight.backend == TensorBackend.ov: + # Return ov tensors in target precision to seamlessly insert them into openvino model later + ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov + compressed_weight_dtype = TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4 + ov_model_params.output_dtypes.update( + {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} + ) + # ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) # ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) # ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) @@ -582,10 +595,18 @@ def calculate_quantized_dequantized_weight( scale_shape = precomputed_scale.shape if precomputed_scale is not None else None zero_point_shape = precomputed_zero_point.shape if precomputed_zero_point is not None else None - if ov_model_params is None: - ov_model_params = OVModelParameters(weight.dtype) - if return_compressed_weight and config.num_bits == 4: - ov_model_params.output_dtype = TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8 + ov_model_params = OVModelParameters() if ov_model_params is None else copy.deepcopy(ov_model_params) + ov_model_params.input_dtypes = ov_model_params.input_dtypes or { + "weight": weight.dtype, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + ov_model_params.output_dtypes = ov_model_params.output_dtypes or { + "decompressed_weight": TensorDataType.float32, + "compressed_weight": TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } model = get_compress_decompress_weight_model( ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_compressed_weight diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index 55b0b854499..a868d310190 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -45,7 +45,8 @@ def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: model = get_astype_model( OVModelParameters( - input_dtype=a_dtype, + input_dtypes={"input": a_dtype}, + output_dtypes={"output": dtype}, dynamic_shapes=True, recompile=False, release_memory=True, @@ -54,7 +55,6 @@ def 
_ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: return_ov_tensors=True, ), tuple(a.shape), - dtype, ) return model([Tensor(a)])[0].data diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index e1e45ef0391..12de6121ac6 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -57,7 +57,6 @@ class QuantizationTask(Enum): WeightCompressionConfig(CompressWeightsMode.INT4_SYM, group_size=2), ] - DATA_TYPES = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] MAX_MISALIGNMENT_FREQUENCY = { @@ -74,7 +73,6 @@ class QuantizationTask(Enum): REDUCTION_AXES = (1,) - RANDOM_TENSOR_CACHE_CONTAINER = ResultsCacheContainer() @@ -179,7 +177,7 @@ def test_quantization_alignment( kwargs = {} if cb == ComputationBackend.OV: - ov_model_params = OVModelParameters(weight.dtype, dynamic_shapes=not static_shapes) + ov_model_params = OVModelParameters(dynamic_shapes=not static_shapes) kwargs["ov_model_params"] = ov_model_params if quantization_task == QuantizationTask.Q_DQ_RQ: kwargs["return_compressed_weight"] = True From 1010fcf56bc62140440383d03df2a68853d041f5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 11 Nov 2024 17:58:55 +0100 Subject: [PATCH 28/73] Add dynamic shapes test --- .../quantization/test_openvino_modeling.py | 89 ++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index 12de6121ac6..9321087939e 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -20,7 +20,8 @@ from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight @@ -300,3 +301,89 @@ def test_quantization_alignment( MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, err_msg=f"Too large misalignment for {key}.", ) + + +@pytest.mark.parametrize("get_ov_model_fn,input_shapes,ref_cache_size", [ + ( + lambda dynamic_shapes, input_shapes: get_compress_weight_model( + OVModelParameters( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32 + }, + output_dtypes={ + "compressed_weight": TensorDataType.uint8 + }, + dynamic_shapes=dynamic_shapes, + ), + WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + *input_shapes, + reduction_axes=REDUCTION_AXES, + ), + [ + [(10, 4), (10, 1), (10, 1)], + [(20, 6), (20, 1), (20, 1)], + [(20, 8), (20, 1), (20, 1)], + [(10, 4, 4), (10, 4, 1), (10, 4, 1),], + [(10, 8, 4), (10, 8, 1), (10, 8, 1),], + ], + {False: 5, True: 2} + ), + ( + lambda dynamic_shapes, input_shapes: 
get_compress_decompress_weight_model( + OVModelParameters( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32 + }, + output_dtypes={ + "compressed_weight": TensorDataType.int32, + "decompressed_weight": TensorDataType.float32, + }, + dynamic_shapes=dynamic_shapes, + ), + WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + *input_shapes, + reduction_axes=REDUCTION_AXES, + ), + [ + [(10, 4), (10, 1), (10, 1)], + [(20, 6), (20, 1), (20, 1)], + [(20, 8), (20, 1), (20, 1)], + [(10, 4, 4), (10, 4, 1), (10, 4, 1),], + [(10, 8, 4), (10, 8, 1), (10, 8, 1),], + ], + {False: 10, True: 4} + ), + ( + lambda dynamic_shapes, input_shape: get_astype_model( + OVModelParameters( + input_dtypes={ + "input": TensorDataType.float32, + }, + output_dtypes={ + "output": TensorDataType.int32, + }, + dynamic_shapes=dynamic_shapes, + ), + input_shape, + ), + [ + (10, 4), + (20, 6), + (20, 8), + (10, 4, 4), + (10, 8, 4), + ], + {False: 5, True: 2} + ), +]) +@pytest.mark.parametrize("dynamic_shapes", [False, True]) +def test_dynamic_shapes(get_ov_model_fn, input_shapes, ref_cache_size, dynamic_shapes): + # Check that model cache contains fewer elements with dynamic shapes included + OV_MODEL_CACHE.clear() + for shape in input_shapes: + get_ov_model_fn(dynamic_shapes, shape) + assert len(OV_MODEL_CACHE._cache) == ref_cache_size[dynamic_shapes] From 6e54fba431ee2989e218d0458eb50953c2e5e47b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 14:44:52 +0100 Subject: [PATCH 29/73] ov modeling tests --- .../weight_compression/openvino_modeling.py | 31 +- .../quantization/test_openvino_modeling.py | 267 +++++++++++++----- 2 files changed, 212 insertions(+), 86 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index cec16ce8bb7..1cab401ee01 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy from dataclasses import dataclass from functools import partial from typing import Callable, Dict, List, Optional, Tuple, Union @@ -96,6 +97,7 @@ def get_compress_weight_model( scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, + return_nodes: Optional[bool] = False, ) -> ModelCallable: if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") @@ -114,7 +116,7 @@ def get_compress_weight_model( scale_shape, zero_point_shape, reduction_axes, - return_nodes=False, + return_nodes=return_nodes, disable_caching=ov_model_params.recompile, ) @@ -165,11 +167,11 @@ def _build_compress_model( raise ValueError("Output dtypes must be provided.") weight_dtype = input_dtypes.get("weight") - input_scale_dtype = input_dtypes.get("scale", None) - input_zero_point_dtype = input_dtypes.get("zero_point", None) + input_scale_dtype = input_dtypes.get("scale", TensorDataType.float32) + input_zero_point_dtype = input_dtypes.get("zero_point", TensorDataType.int32) compressed_weight_dtype = output_dtypes.get("compressed_weight") - output_scale_dtype = output_dtypes.get("scale", None) - output_zero_point_dtype = output_dtypes.get("zero_point", None) + output_scale_dtype = output_dtypes.get("scale", TensorDataType.float32) + output_zero_point_dtype = output_dtypes.get("zero_point", TensorDataType.int32) # Validate input dtypes valid_weight_dtypes = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] @@ -190,7 +192,7 @@ def _build_compress_model( TensorDataType.int4, TensorDataType.uint4, ] - if compressed_weight_dtype not in valid_compressed_weight_dtypes: + if compressed_weight_dtype not in valid_compressed_weight_dtypes + [TensorDataType.float32]: raise ValueError( f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. " f"But found: {compressed_weight_dtype}." @@ -271,7 +273,8 @@ def _build_compress_model( compressed_weight = opset.round(compressed_weight) compressed_weight = opset.clamp(opset.round(compressed_weight), level_low, level_high) - compressed_weight = opset.convert(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) + if compressed_weight_dtype != TensorDataType.float32: + compressed_weight = opset.convert(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) ov_results = [compressed_weight] if len(ov_parameters) == 1: @@ -310,8 +313,12 @@ def _build_compress_decompress_model( if decompressed_weight_dtype != TensorDataType.float32: raise ValueError(f"Decompressed weight must be of float32 data type. 
But found: {decompressed_weight_dtype}.") - ov_parameters, ov_results = _build_compress_model( - config, ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True + if "compressed_weight" not in output_dtypes: + ov_model_params = copy.deepcopy(ov_model_params) + ov_model_params.output_dtypes["compressed_weight"] = TensorDataType.float32 + + ov_parameters, ov_results = get_compress_weight_model( + ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) if config.is_int_asym: @@ -344,10 +351,10 @@ def _build_compress_decompress_model( return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) -def get_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> ModelCallable: +def get_astype_model(ov_model_params: OVModelParameters, input_shape: Tuple) -> ModelCallable: if ov_model_params.dynamic_shapes: - arg_shape = (-1,) * len(arg_shape) - return _build_astype_model(ov_model_params, arg_shape) + input_shape = (-1,) * len(input_shape) + return _build_astype_model(ov_model_params, input_shape, disable_caching=ov_model_params.recompile) @cache_results(OV_MODEL_CACHE) diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index 9321087939e..d7d562cff6c 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -8,6 +8,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import inspect from collections import defaultdict from contextlib import contextmanager @@ -20,7 +21,8 @@ from nncf import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters, OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model @@ -303,87 +305,204 @@ def test_quantization_alignment( ) -@pytest.mark.parametrize("get_ov_model_fn,input_shapes,ref_cache_size", [ - ( - lambda dynamic_shapes, input_shapes: get_compress_weight_model( - OVModelParameters( - input_dtypes={ - "weight": TensorDataType.float32, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32 - }, - output_dtypes={ - "compressed_weight": TensorDataType.uint8 - }, - dynamic_shapes=dynamic_shapes, - ), - WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - *input_shapes, +class ModelGetter: + def __init__(self, get_model_fn, ov_model_params_kwargs, get_model_kwargs): + self._get_model_fn = get_model_fn + self._ov_model_params_kwargs = ov_model_params_kwargs + self._get_model_kwargs = get_model_kwargs + + def get(self, ov_model_params_kwargs=None, get_model_kwargs=None): + ov_model_params_kwargs = ov_model_params_kwargs or {} + get_model_kwargs = get_model_kwargs or {} + return self._get_model_fn( + 
OVModelParameters(**{**self._ov_model_params_kwargs, **ov_model_params_kwargs}), + **{**self._get_model_kwargs, **get_model_kwargs}, + ) + + +MODEL_GETTERS = [ + ModelGetter( + get_model_fn=get_compress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + output_dtypes={"compressed_weight": TensorDataType.uint8}, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + scale_shape=(10, 1), + zero_point_shape=(10, 1), + ), + ), + ModelGetter( + get_model_fn=get_compress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={"weight": TensorDataType.float32}, + output_dtypes={ + "compressed_weight": TensorDataType.uint8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), reduction_axes=REDUCTION_AXES, ), - [ - [(10, 4), (10, 1), (10, 1)], - [(20, 6), (20, 1), (20, 1)], - [(20, 8), (20, 1), (20, 1)], - [(10, 4, 4), (10, 4, 1), (10, 4, 1),], - [(10, 8, 4), (10, 8, 1), (10, 8, 1),], - ], - {False: 5, True: 2} ), - ( - lambda dynamic_shapes, input_shapes: get_compress_decompress_weight_model( - OVModelParameters( - input_dtypes={ - "weight": TensorDataType.float32, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32 - }, - output_dtypes={ - "compressed_weight": TensorDataType.int32, - "decompressed_weight": TensorDataType.float32, - }, - dynamic_shapes=dynamic_shapes, - ), - WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - *input_shapes, + ModelGetter( + get_model_fn=get_compress_decompress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + output_dtypes={ + "decompressed_weight": TensorDataType.float32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + scale_shape=(10, 1), + zero_point_shape=(10, 1), + ), + ), + ModelGetter( + get_model_fn=get_compress_decompress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + }, + output_dtypes={ + "decompressed_weight": TensorDataType.float32, + "compressed_weight": TensorDataType.int32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), reduction_axes=REDUCTION_AXES, + return_compressed_weight=True, ), - [ - [(10, 4), (10, 1), (10, 1)], - [(20, 6), (20, 1), (20, 1)], - [(20, 8), (20, 1), (20, 1)], - [(10, 4, 4), (10, 4, 1), (10, 4, 1),], - [(10, 8, 4), (10, 8, 1), (10, 8, 1),], - ], - {False: 10, True: 4} ), - ( - lambda dynamic_shapes, input_shape: get_astype_model( - OVModelParameters( - input_dtypes={ - "input": TensorDataType.float32, - }, - output_dtypes={ - "output": TensorDataType.int32, - }, - dynamic_shapes=dynamic_shapes, - ), - input_shape, + ModelGetter( + get_model_fn=get_astype_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "input": TensorDataType.float32, + }, + output_dtypes={ + "output": TensorDataType.bfloat16, + }, + ), + get_model_kwargs=dict( + input_shape=(10, 4), ), - [ - (10, 4), - (20, 6), - (20, 8), - (10, 4, 4), - (10, 8, 4), - ], - {False: 5, True: 2} ), -]) +] + + 
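# Illustrative aside (not part of the patch): the ModelGetter helper above works purely
# through dict unpacking -- OVModelParameters(**{**defaults, **overrides}) -- so keys
# passed at the call site replace the stored defaults while everything else is kept.
# A standalone sketch of that merge semantics (names below are made up):
class GetterSketch:
    def __init__(self, **defaults):
        self._defaults = defaults

    def get(self, **overrides):
        # Later keys win: overrides shadow defaults, untouched defaults pass through.
        return {**self._defaults, **overrides}

sketch = GetterSketch(dynamic_shapes=False, recompile=False)
assert sketch.get(dynamic_shapes=True) == {"dynamic_shapes": True, "recompile": False}
# End of aside.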
+@pytest.mark.parametrize( + "model_getter,input_shapes,ref_cache_size", + [ + ( + MODEL_GETTERS[0], + [ + dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), + dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), + dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), + ], + {False: 5, True: 2}, + ), + ( + MODEL_GETTERS[1], + [ + dict(weight_shape=(10, 4)), + dict(weight_shape=(20, 6)), + dict(weight_shape=(20, 8)), + dict(weight_shape=(10, 4, 4)), + dict(weight_shape=(10, 8, 4)), + ], + {False: 5, True: 2}, + ), + ( + MODEL_GETTERS[2], + [ + dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), + dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), + dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), + ], + {False: 10, True: 4}, + ), + ( + MODEL_GETTERS[3], + [ + dict(weight_shape=(10, 4)), + dict(weight_shape=(20, 6)), + dict(weight_shape=(20, 8)), + dict(weight_shape=(10, 4, 4)), + dict(weight_shape=(10, 8, 4)), + ], + {False: 10, True: 4}, + ), + ( + MODEL_GETTERS[4], + [ + dict(input_shape=(10, 1)), + dict(input_shape=(10, 2)), + dict(input_shape=(20, 3)), + dict(input_shape=(10, 4, 4)), + dict(input_shape=(10, 8, 4)), + ], + {False: 5, True: 2}, + ), + ], +) @pytest.mark.parametrize("dynamic_shapes", [False, True]) -def test_dynamic_shapes(get_ov_model_fn, input_shapes, ref_cache_size, dynamic_shapes): - # Check that model cache contains fewer elements with dynamic shapes included +def test_dynamic_shapes(model_getter, input_shapes, ref_cache_size, dynamic_shapes): + # Check that model cache contains fewer elements with dynamic shapes enabled OV_MODEL_CACHE.clear() - for shape in input_shapes: - get_ov_model_fn(dynamic_shapes, shape) + for shape_kwargs in input_shapes: + model_getter.get(ov_model_params_kwargs=dict(dynamic_shapes=dynamic_shapes), get_model_kwargs=shape_kwargs) assert len(OV_MODEL_CACHE._cache) == ref_cache_size[dynamic_shapes] + + +@pytest.mark.parametrize("model_getter", MODEL_GETTERS) +@pytest.mark.parametrize("recompile", [True, False]) +def test_recompile(model_getter, recompile): + OV_MODEL_CACHE.clear() + model_getter.get(ov_model_params_kwargs=dict(recompile=recompile)) + ref_size = 0 if recompile else (2 if model_getter._get_model_fn == get_compress_decompress_weight_model else 1) + assert len(OV_MODEL_CACHE._cache) == ref_size + + +@pytest.mark.parametrize("model_getter", MODEL_GETTERS) +@pytest.mark.parametrize("return_ov_tensors", [True, False]) +def test_return_ov_tensors(model_getter, return_ov_tensors): + OV_MODEL_CACHE.clear() + inputs = [] + for input_name, input_dtype in model_getter._ov_model_params_kwargs["input_dtypes"].items(): + input_shape = model_getter._get_model_kwargs.get(f"{input_name}_shape") + if input_dtype in [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]: + inp = get_random_float_tensor(input_shape, input_dtype, TensorBackend.numpy) + else: + inp = get_random_integer_tensor(input_shape, 0, 16, input_dtype, TensorBackend.numpy) + inputs.append(inp) + + model_run_fn = 
model_getter.get(ov_model_params_kwargs=dict(return_ov_tensors=return_ov_tensors)) + outputs = model_run_fn(inputs) + + all_outputs_are_ov_tensors = all([out.backend == TensorBackend.ov for out in outputs]) + assert all_outputs_are_ov_tensors == return_ov_tensors From 8ac0fe2ef0fa195ebf4e2ab4930b0a41f4da5e86 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 14:47:19 +0100 Subject: [PATCH 30/73] Move cache_results decorator --- nncf/common/utils/decorators.py | 44 +++++++++++++++ .../weight_compression/openvino_modeling.py | 3 +- nncf/results_caching.py | 55 ------------------- .../quantization/test_openvino_modeling.py | 3 +- 4 files changed, 46 insertions(+), 59 deletions(-) delete mode 100644 nncf/results_caching.py diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py index d47c78c473a..0542c91e578 100644 --- a/nncf/common/utils/decorators.py +++ b/nncf/common/utils/decorators.py @@ -10,6 +10,7 @@ # limitations under the License. from importlib import import_module +import inspect from typing import Any, Callable, Dict, List from nncf.common.logging import nncf_logger @@ -51,3 +52,46 @@ def wrapped_f(*args: Any, **kwargs: Any): # type: ignore return wrapped_f return wrap + + +class ResultsCacheContainer: + def __init__(self): + self._cache = {} + self._access_count = {} + + def clear(self): + self._cache.clear() + self._access_count.clear() + + def is_empty(self): + return len(self._cache) == 0 + + def __getitem__(self, item): + self._access_count[item] += 1 + return self._cache[item] + + def __setitem__(self, key, value): + self._access_count[key] = 0 + self._cache[key] = value + + def __contains__(self, item): + return item in self._cache + + +def cache_results(cache: ResultsCacheContainer): + def decorator(func): + def wrapper(*args, disable_caching=False, **kwargs): + sig = inspect.signature(func) + new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} + new_kwargs.update(kwargs) + cache_key = (func.__name__, frozenset(new_kwargs.items())) + if cache_key in cache: + return cache[cache_key] + result = func(*args, **kwargs) + if not disable_caching: + cache[cache_key] = result + return result + + return wrapper + + return decorator diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 1cab401ee01..2acd9733c82 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -18,9 +18,8 @@ import openvino as ov from openvino.runtime import opset13 as opset +from nncf.common.utils.decorators import ResultsCacheContainer, cache_results from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.results_caching import ResultsCacheContainer -from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV diff --git a/nncf/results_caching.py b/nncf/results_caching.py deleted file mode 100644 index 9b314863108..00000000000 --- a/nncf/results_caching.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect - - -class ResultsCacheContainer: - def __init__(self): - self._cache = {} - self._access_count = {} - - def clear(self): - self._cache.clear() - self._access_count.clear() - - def is_empty(self): - return len(self._cache) == 0 - - def __getitem__(self, item): - self._access_count[item] += 1 - return self._cache[item] - - def __setitem__(self, key, value): - self._access_count[key] = 0 - self._cache[key] = value - - def __contains__(self, item): - return item in self._cache - - -def cache_results(cache: ResultsCacheContainer): - def decorator(func): - def wrapper(*args, disable_caching=False, **kwargs): - sig = inspect.signature(func) - new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} - new_kwargs.update(kwargs) - cache_key = (func.__name__, frozenset(new_kwargs.items())) - if cache_key in cache: - return cache[cache_key] - result = func(*args, **kwargs) - if not disable_caching: - cache[cache_key] = result - return result - - return wrapper - - return decorator diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_openvino_modeling.py index d7d562cff6c..3fd270132a6 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_openvino_modeling.py @@ -20,6 +20,7 @@ import pytest from nncf import CompressWeightsMode +from nncf.common.utils.decorators import cache_results, ResultsCacheContainer from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters @@ -29,8 +30,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization -from nncf.results_caching import ResultsCacheContainer -from nncf.results_caching import cache_results from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor import functions as fns From ded66f3447676a3b98af3b4dffdc8bf6e6b6f9ac Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 14:53:07 +0100 Subject: [PATCH 31/73] Tests reorgantization --- ...ing.py => test_ov_modeling_compression.py} | 205 ---------------- .../openvino/native/test_openvino_modeling.py | 224 ++++++++++++++++++ 2 files changed, 224 insertions(+), 205 deletions(-) rename tests/openvino/native/quantization/{test_openvino_modeling.py => test_ov_modeling_compression.py} (64%) create mode 100644 tests/openvino/native/test_openvino_modeling.py diff --git a/tests/openvino/native/quantization/test_openvino_modeling.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py similarity index 64% rename from tests/openvino/native/quantization/test_openvino_modeling.py rename to 
tests/openvino/native/quantization/test_ov_modeling_compression.py index 3fd270132a6..682ff604901 100644 --- a/tests/openvino/native/quantization/test_openvino_modeling.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -22,9 +22,7 @@ from nncf import CompressWeightsMode from nncf.common.utils.decorators import cache_results, ResultsCacheContainer from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters -from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight @@ -302,206 +300,3 @@ def test_quantization_alignment( MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, err_msg=f"Too large misalignment for {key}.", ) - - -class ModelGetter: - def __init__(self, get_model_fn, ov_model_params_kwargs, get_model_kwargs): - self._get_model_fn = get_model_fn - self._ov_model_params_kwargs = ov_model_params_kwargs - self._get_model_kwargs = get_model_kwargs - - def get(self, ov_model_params_kwargs=None, get_model_kwargs=None): - ov_model_params_kwargs = ov_model_params_kwargs or {} - get_model_kwargs = get_model_kwargs or {} - return self._get_model_fn( - OVModelParameters(**{**self._ov_model_params_kwargs, **ov_model_params_kwargs}), - **{**self._get_model_kwargs, **get_model_kwargs}, - ) - - -MODEL_GETTERS = [ - ModelGetter( - get_model_fn=get_compress_weight_model, - ov_model_params_kwargs=dict( - input_dtypes={ - "weight": TensorDataType.float32, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - }, - output_dtypes={"compressed_weight": TensorDataType.uint8}, - ), - get_model_kwargs=dict( - config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - weight_shape=(10, 4), - scale_shape=(10, 1), - zero_point_shape=(10, 1), - ), - ), - ModelGetter( - get_model_fn=get_compress_weight_model, - ov_model_params_kwargs=dict( - input_dtypes={"weight": TensorDataType.float32}, - output_dtypes={ - "compressed_weight": TensorDataType.uint8, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - }, - ), - get_model_kwargs=dict( - config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - weight_shape=(10, 4), - reduction_axes=REDUCTION_AXES, - ), - ), - ModelGetter( - get_model_fn=get_compress_decompress_weight_model, - ov_model_params_kwargs=dict( - input_dtypes={ - "weight": TensorDataType.float32, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - }, - output_dtypes={ - "decompressed_weight": TensorDataType.float32, - }, - ), - get_model_kwargs=dict( - config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - weight_shape=(10, 4), - scale_shape=(10, 1), - zero_point_shape=(10, 1), - ), - ), - ModelGetter( - get_model_fn=get_compress_decompress_weight_model, - ov_model_params_kwargs=dict( - input_dtypes={ - "weight": TensorDataType.float32, - }, - output_dtypes={ - "decompressed_weight": TensorDataType.float32, - "compressed_weight": TensorDataType.int32, - "scale": TensorDataType.float32, - "zero_point": 
TensorDataType.int32, - }, - ), - get_model_kwargs=dict( - config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), - weight_shape=(10, 4), - reduction_axes=REDUCTION_AXES, - return_compressed_weight=True, - ), - ), - ModelGetter( - get_model_fn=get_astype_model, - ov_model_params_kwargs=dict( - input_dtypes={ - "input": TensorDataType.float32, - }, - output_dtypes={ - "output": TensorDataType.bfloat16, - }, - ), - get_model_kwargs=dict( - input_shape=(10, 4), - ), - ), -] - - -@pytest.mark.parametrize( - "model_getter,input_shapes,ref_cache_size", - [ - ( - MODEL_GETTERS[0], - [ - dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), - dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), - dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), - dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), - dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), - ], - {False: 5, True: 2}, - ), - ( - MODEL_GETTERS[1], - [ - dict(weight_shape=(10, 4)), - dict(weight_shape=(20, 6)), - dict(weight_shape=(20, 8)), - dict(weight_shape=(10, 4, 4)), - dict(weight_shape=(10, 8, 4)), - ], - {False: 5, True: 2}, - ), - ( - MODEL_GETTERS[2], - [ - dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), - dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), - dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), - dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), - dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), - ], - {False: 10, True: 4}, - ), - ( - MODEL_GETTERS[3], - [ - dict(weight_shape=(10, 4)), - dict(weight_shape=(20, 6)), - dict(weight_shape=(20, 8)), - dict(weight_shape=(10, 4, 4)), - dict(weight_shape=(10, 8, 4)), - ], - {False: 10, True: 4}, - ), - ( - MODEL_GETTERS[4], - [ - dict(input_shape=(10, 1)), - dict(input_shape=(10, 2)), - dict(input_shape=(20, 3)), - dict(input_shape=(10, 4, 4)), - dict(input_shape=(10, 8, 4)), - ], - {False: 5, True: 2}, - ), - ], -) -@pytest.mark.parametrize("dynamic_shapes", [False, True]) -def test_dynamic_shapes(model_getter, input_shapes, ref_cache_size, dynamic_shapes): - # Check that model cache contains fewer elements with dynamic shapes enabled - OV_MODEL_CACHE.clear() - for shape_kwargs in input_shapes: - model_getter.get(ov_model_params_kwargs=dict(dynamic_shapes=dynamic_shapes), get_model_kwargs=shape_kwargs) - assert len(OV_MODEL_CACHE._cache) == ref_cache_size[dynamic_shapes] - - -@pytest.mark.parametrize("model_getter", MODEL_GETTERS) -@pytest.mark.parametrize("recompile", [True, False]) -def test_recompile(model_getter, recompile): - OV_MODEL_CACHE.clear() - model_getter.get(ov_model_params_kwargs=dict(recompile=recompile)) - ref_size = 0 if recompile else (2 if model_getter._get_model_fn == get_compress_decompress_weight_model else 1) - assert len(OV_MODEL_CACHE._cache) == ref_size - - -@pytest.mark.parametrize("model_getter", MODEL_GETTERS) -@pytest.mark.parametrize("return_ov_tensors", [True, False]) -def test_return_ov_tensors(model_getter, return_ov_tensors): - OV_MODEL_CACHE.clear() - inputs = [] - for input_name, input_dtype in model_getter._ov_model_params_kwargs["input_dtypes"].items(): - input_shape = model_getter._get_model_kwargs.get(f"{input_name}_shape") - if input_dtype in [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]: - inp = 
get_random_float_tensor(input_shape, input_dtype, TensorBackend.numpy) - else: - inp = get_random_integer_tensor(input_shape, 0, 16, input_dtype, TensorBackend.numpy) - inputs.append(inp) - - model_run_fn = model_getter.get(ov_model_params_kwargs=dict(return_ov_tensors=return_ov_tensors)) - outputs = model_run_fn(inputs) - - all_outputs_are_ov_tensors = all([out.backend == TensorBackend.ov for out in outputs]) - assert all_outputs_are_ov_tensors == return_ov_tensors diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py new file mode 100644 index 00000000000..71f9cd316fb --- /dev/null +++ b/tests/openvino/native/test_openvino_modeling.py @@ -0,0 +1,224 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import pytest + +from nncf import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model +from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model +from nncf.tensor import TensorDataType, Tensor +from nncf.tensor.definitions import TensorBackend + +from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP + + +class ModelGetter: + def __init__(self, get_model_fn, ov_model_params_kwargs, get_model_kwargs): + self._get_model_fn = get_model_fn + self._ov_model_params_kwargs = ov_model_params_kwargs + self._get_model_kwargs = get_model_kwargs + + def get(self, ov_model_params_kwargs=None, get_model_kwargs=None): + ov_model_params_kwargs = ov_model_params_kwargs or {} + get_model_kwargs = get_model_kwargs or {} + return self._get_model_fn( + OVModelParameters(**{**self._ov_model_params_kwargs, **ov_model_params_kwargs}), + **{**self._get_model_kwargs, **get_model_kwargs}, + ) + + +MODEL_GETTERS = [ + ModelGetter( + get_model_fn=get_compress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + output_dtypes={"compressed_weight": TensorDataType.uint8}, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + scale_shape=(10, 1), + zero_point_shape=(10, 1), + ), + ), + ModelGetter( + get_model_fn=get_compress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={"weight": TensorDataType.float32}, + output_dtypes={ + "compressed_weight": TensorDataType.uint8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + ), + get_model_kwargs=dict( + 
config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + reduction_axes=(1,), + ), + ), + ModelGetter( + get_model_fn=get_compress_decompress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + output_dtypes={ + "decompressed_weight": TensorDataType.float32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + scale_shape=(10, 1), + zero_point_shape=(10, 1), + ), + ), + ModelGetter( + get_model_fn=get_compress_decompress_weight_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "weight": TensorDataType.float32, + }, + output_dtypes={ + "decompressed_weight": TensorDataType.float32, + "compressed_weight": TensorDataType.int32, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + }, + ), + get_model_kwargs=dict( + config=WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), + weight_shape=(10, 4), + reduction_axes=(1,), + return_compressed_weight=True, + ), + ), + ModelGetter( + get_model_fn=get_astype_model, + ov_model_params_kwargs=dict( + input_dtypes={ + "input": TensorDataType.float32, + }, + output_dtypes={ + "output": TensorDataType.bfloat16, + }, + ), + get_model_kwargs=dict( + input_shape=(10, 4), + ), + ), +] + + +@pytest.mark.parametrize( + "model_getter,input_shapes,ref_cache_size", + [ + ( + MODEL_GETTERS[0], + [ + dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), + dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), + dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), + ], + {False: 5, True: 2}, + ), + ( + MODEL_GETTERS[1], + [ + dict(weight_shape=(10, 4)), + dict(weight_shape=(20, 6)), + dict(weight_shape=(20, 8)), + dict(weight_shape=(10, 4, 4)), + dict(weight_shape=(10, 8, 4)), + ], + {False: 5, True: 2}, + ), + ( + MODEL_GETTERS[2], + [ + dict(weight_shape=(10, 4), scale_shape=(10, 1), zero_point_shape=(10, 1)), + dict(weight_shape=(20, 6), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(20, 8), scale_shape=(20, 1), zero_point_shape=(20, 1)), + dict(weight_shape=(10, 4, 4), scale_shape=(10, 4, 1), zero_point_shape=(10, 4, 1)), + dict(weight_shape=(10, 8, 4), scale_shape=(10, 8, 1), zero_point_shape=(10, 8, 1)), + ], + {False: 10, True: 4}, + ), + ( + MODEL_GETTERS[3], + [ + dict(weight_shape=(10, 4)), + dict(weight_shape=(20, 6)), + dict(weight_shape=(20, 8)), + dict(weight_shape=(10, 4, 4)), + dict(weight_shape=(10, 8, 4)), + ], + {False: 10, True: 4}, + ), + ( + MODEL_GETTERS[4], + [ + dict(input_shape=(10, 1)), + dict(input_shape=(10, 2)), + dict(input_shape=(20, 3)), + dict(input_shape=(10, 4, 4)), + dict(input_shape=(10, 8, 4)), + ], + {False: 5, True: 2}, + ), + ], +) +@pytest.mark.parametrize("dynamic_shapes", [False, True]) +def test_dynamic_shapes(model_getter, input_shapes, ref_cache_size, dynamic_shapes): + # Check that model cache contains fewer elements with dynamic shapes enabled + OV_MODEL_CACHE.clear() + for shape_kwargs in input_shapes: + model_getter.get(ov_model_params_kwargs=dict(dynamic_shapes=dynamic_shapes), get_model_kwargs=shape_kwargs) + assert len(OV_MODEL_CACHE._cache) == ref_cache_size[dynamic_shapes] + + 
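# Illustrative aside (not part of the patch): test_dynamic_shapes above expects 5 cached
# models for static shapes but only 2 for dynamic ones because the model builders replace
# every dimension with -1, so all shapes of the same rank collapse onto one cache key
# (the compress-decompress getters cache two models per key, hence the 10/4 cases).
# A standalone sketch of that keying behaviour:
def cache_key(shape, dynamic_shapes):
    return (-1,) * len(shape) if dynamic_shapes else tuple(shape)

shapes = [(10, 4), (20, 6), (20, 8), (10, 4, 4), (10, 8, 4)]
assert len({cache_key(s, dynamic_shapes=False) for s in shapes}) == 5
assert len({cache_key(s, dynamic_shapes=True) for s in shapes}) == 2  # one key per rank
# End of aside.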
+@pytest.mark.parametrize("model_getter", MODEL_GETTERS) +@pytest.mark.parametrize("recompile", [True, False]) +def test_recompile(model_getter, recompile): + # Check that with recompilation ov models are not cached + OV_MODEL_CACHE.clear() + model_getter.get(ov_model_params_kwargs=dict(recompile=recompile)) + ref_size = 0 if recompile else (2 if model_getter._get_model_fn == get_compress_decompress_weight_model else 1) + assert len(OV_MODEL_CACHE._cache) == ref_size + + +@pytest.mark.parametrize("model_getter", MODEL_GETTERS) +@pytest.mark.parametrize("return_ov_tensors", [True, False]) +def test_return_ov_tensors(model_getter, return_ov_tensors): + # Check that ov tensors are returned + OV_MODEL_CACHE.clear() + inputs = [] + for input_name, input_dtype in model_getter._ov_model_params_kwargs["input_dtypes"].items(): + input_shape = model_getter._get_model_kwargs.get(f"{input_name}_shape") + inputs.append(Tensor(np.zeros(input_shape, dtype=DTYPE_MAP_NP[input_dtype]))) + + model_run_fn = model_getter.get(ov_model_params_kwargs=dict(return_ov_tensors=return_ov_tensors)) + outputs = model_run_fn(inputs) + + assert all([out.backend == (TensorBackend.ov if return_ov_tensors else TensorBackend.numpy) for out in outputs]) From 69ae5fa871453caec9c31f05f6bb27a5500cb16e Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 16:04:44 +0100 Subject: [PATCH 32/73] cache_results decorator test --- nncf/common/utils/decorators.py | 7 +- .../weight_compression/openvino_modeling.py | 3 +- .../utils/test_cache_results_decorator.py | 133 ++++++++++++++++++ .../test_ov_modeling_compression.py | 3 +- .../openvino/native/test_openvino_modeling.py | 4 +- 5 files changed, 143 insertions(+), 7 deletions(-) create mode 100644 tests/common/utils/test_cache_results_decorator.py diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py index 0542c91e578..b3fd2a0e3ad 100644 --- a/nncf/common/utils/decorators.py +++ b/nncf/common/utils/decorators.py @@ -9,8 +9,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
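# Illustrative aside (not part of the patch), assuming this patch series is applied:
# the hunk below moves the disable_caching check to the top of cache_results' wrapper,
# so a disabled call neither computes a cache key nor reads or populates the cache.
# Typical usage of the decorator stays the same:
from nncf.common.utils.decorators import ResultsCacheContainer
from nncf.common.utils.decorators import cache_results

_CACHE = ResultsCacheContainer()

@cache_results(_CACHE)
def add(a, b):
    return a + b

add(1, 2)                        # computed, stored under ("add", frozenset({("a", 1), ("b", 2)}))
add(1, 2)                        # served from the cache; its access count becomes 1
add(1, 2, disable_caching=True)  # recomputed, cache left untouched
assert len(_CACHE._cache) == 1
# End of aside.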
-from importlib import import_module import inspect +from importlib import import_module from typing import Any, Callable, Dict, List from nncf.common.logging import nncf_logger @@ -81,6 +81,8 @@ def __contains__(self, item): def cache_results(cache: ResultsCacheContainer): def decorator(func): def wrapper(*args, disable_caching=False, **kwargs): + if disable_caching: + return func(*args, **kwargs) sig = inspect.signature(func) new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} new_kwargs.update(kwargs) @@ -88,8 +90,7 @@ def wrapper(*args, disable_caching=False, **kwargs): if cache_key in cache: return cache[cache_key] result = func(*args, **kwargs) - if not disable_caching: - cache[cache_key] = result + cache[cache_key] = result return result return wrapper diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 2acd9733c82..eb61c6ea5bd 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -18,7 +18,8 @@ import openvino as ov from openvino.runtime import opset13 as opset -from nncf.common.utils.decorators import ResultsCacheContainer, cache_results +from nncf.common.utils.decorators import ResultsCacheContainer +from nncf.common.utils.decorators import cache_results from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor from nncf.tensor import TensorDataType diff --git a/tests/common/utils/test_cache_results_decorator.py b/tests/common/utils/test_cache_results_decorator.py new file mode 100644 index 00000000000..599e41a421d --- /dev/null +++ b/tests/common/utils/test_cache_results_decorator.py @@ -0,0 +1,133 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
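# Illustrative aside (not part of the patch): the reference caches in the test below use
# keys of the form ("cached_addition", frozenset({("a", 1), ("b", 2)})). cache_results
# builds them by binding positional arguments to parameter names via inspect.signature,
# so f(1, 2) and f(a=1, b=2) hit the same entry; it also means every argument must be
# hashable, which is why OVModelParameters and WeightCompressionConfig define __hash__.
# A standalone sketch of the key construction:
import inspect

def make_cache_key(func, args, kwargs):
    bound = {name: arg for name, arg in zip(inspect.signature(func).parameters, args)}
    bound.update(kwargs)
    return (func.__name__, frozenset(bound.items()))

def cached_addition(a, b):
    return a + b

assert make_cache_key(cached_addition, (1, 2), {}) == ("cached_addition", frozenset({("a", 1), ("b", 2)}))
assert make_cache_key(cached_addition, (), {"a": 1, "b": 2}) == make_cache_key(cached_addition, (1, 2), {})
# End of aside.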
+import pytest + +from nncf.common.utils.decorators import ResultsCacheContainer +from nncf.common.utils.decorators import cache_results + +TEST_CACHE_CONTAINER = ResultsCacheContainer() + + +@cache_results(TEST_CACHE_CONTAINER) +def cached_addition(a, b): + return a + b + + +@pytest.mark.parametrize( + "inputs,disable_caching,output,clear_cache,cache_size,ref_cache,ref_access_count", + [ + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, + ), + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, + ), + ( + (2, 3), + True, + 5, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, + ), + ( + (3, 4), + False, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 1, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, + }, + ), + ( + (1, 2), + False, + 3, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, + }, + ), + ( + (3, 4), + False, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, + }, + ), + ( + (3, 4), + True, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, + }, + ), + ((3, 4), True, 7, True, 0, {}, {}), + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, + ), + ], +) +def test_caching_results(inputs, disable_caching, output, clear_cache, cache_size, ref_cache, ref_access_count): + if clear_cache: + TEST_CACHE_CONTAINER.clear() + kwargs = {"disable_caching": True} if disable_caching else {} + assert cached_addition(*inputs, **kwargs) == output + assert len(TEST_CACHE_CONTAINER._cache) == cache_size + assert TEST_CACHE_CONTAINER._cache == ref_cache + assert TEST_CACHE_CONTAINER._access_count == ref_access_count diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 682ff604901..d8c6bfa7ffa 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -20,7 +20,8 @@ import pytest from nncf import CompressWeightsMode -from nncf.common.utils.decorators import cache_results, ResultsCacheContainer +from nncf.common.utils.decorators import ResultsCacheContainer +from nncf.common.utils.decorators import cache_results from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters 
from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py index 71f9cd316fb..14ec9f740ab 100644 --- a/tests/openvino/native/test_openvino_modeling.py +++ b/tests/openvino/native/test_openvino_modeling.py @@ -18,9 +18,9 @@ from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model -from nncf.tensor import TensorDataType, Tensor +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend - from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP From d0f49aeca9804c421a19f59bbab7b6ccdae395b1 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 16:22:04 +0100 Subject: [PATCH 33/73] get_const_value test --- tests/openvino/native/test_node_utils.py | 63 ++++++++++++++++++------ 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/tests/openvino/native/test_node_utils.py b/tests/openvino/native/test_node_utils.py index fd962d6938b..241b9e6f156 100644 --- a/tests/openvino/native/test_node_utils.py +++ b/tests/openvino/native/test_node_utils.py @@ -29,30 +29,63 @@ @pytest.mark.parametrize( - "precisions", + "precisions,cast_bf16_to_fp32", [ # base FP32 precision - { - "type_for_const": ov.Type.f32, - "ref_type": np.float32, - }, + ( + { + "type_for_const": ov.Type.f32, + "ref_type": np.float32, + }, + True, + ), # base FP16 precision - { - "type_for_const": ov.Type.f16, - "ref_type": np.float16, - }, + ( + { + "type_for_const": ov.Type.f16, + "ref_type": np.float16, + }, + True, + ), # base BF16 precision should be casted to FP32 - { - "type_for_const": ov.Type.bf16, - "ref_type": np.float32, - }, + ( + { + "type_for_const": ov.Type.bf16, + "ref_type": np.float32, + }, + True, + ), + # base FP32 precision, cast_bf16_to_fp32=False has no effect + ( + { + "type_for_const": ov.Type.f32, + "ref_type": np.float32, + }, + False, + ), + # base FP16 precision, cast_bf16_to_fp32=False has no effect + ( + { + "type_for_const": ov.Type.f16, + "ref_type": np.float16, + }, + False, + ), + # with cast_bf16_to_fp32=False BF16 constant is retrieved as FP16 + ( + { + "type_for_const": ov.Type.bf16, + "ref_type": np.float16, + }, + False, + ), ], ) -def test_get_const_value(precisions): +def test_get_const_value(precisions, cast_bf16_to_fp32): const_data = np.ones((1, 2, 3), dtype=np.float32) weight_const = opset.constant(const_data, dtype=precisions["type_for_const"]) - const_value = get_const_value(weight_const) + const_value = get_const_value(weight_const, cast_bf16_to_fp32=cast_bf16_to_fp32) assert const_value.dtype == precisions["ref_type"] From a282976b3952f8c340069a6448fd2dd807425303 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 13 Nov 2024 16:54:57 +0100 Subject: [PATCH 34/73] OVModelParameters minor refactor --- nncf/openvino/graph/node_utils.py | 13 ++ .../weight_compression/openvino_modeling.py | 144 +++++++++++------- .../weight_compression/weight_lowering.py | 31 ++-- nncf/results_caching.py | 55 +++++++ 4 files changed, 168 insertions(+), 75 deletions(-) create mode 100644 nncf/results_caching.py diff --git a/nncf/openvino/graph/node_utils.py 
b/nncf/openvino/graph/node_utils.py index 05e759f1b16..24677d52968 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -630,3 +630,16 @@ def get_activation_channel_axis(node: NNCFNode, port_id: int, input_shape: Tuple channel_axis = activations_layout.index(OVLayoutElem.C_IN) return channel_axis + + +def convert_if_needed(node: ov.Node, target_dtype: ov.Type) -> ov.Node: + """ + Converts the input node to the target data type if it is not already in the target data type. + + :param node: The input node to convert. + :param target_dtype: The target data type to convert the input node to. + :return: The converted node. + """ + if node.get_element_type() == target_dtype: + return node + return opset.convert(node, target_dtype) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index eb61c6ea5bd..d11679a9081 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -20,6 +20,7 @@ from nncf.common.utils.decorators import ResultsCacheContainer from nncf.common.utils.decorators import cache_results +from nncf.openvino.graph.node_utils import convert_if_needed from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -32,22 +33,57 @@ OV_MODEL_CACHE = ResultsCacheContainer() -@dataclass +@dataclass(init=False) class OVModelParameters: - input_dtypes: Optional[Dict[str, TensorDataType]] = None - output_dtypes: Optional[Dict[str, TensorDataType]] = None - dynamic_shapes: bool = False - recompile: bool = False - release_memory: bool = True - share_inputs: bool = True - share_outputs: bool = True - return_ov_tensors: bool = False + def __init__( + self, + input_dtypes: Optional[Dict[str, TensorDataType]] = None, + output_dtypes: Optional[Dict[str, TensorDataType]] = None, + dynamic_shapes: bool = False, + recompile: bool = False, + release_memory: bool = True, + share_inputs: bool = True, + share_outputs: bool = True, + return_ov_tensors: bool = False, + ): + self.input_dtypes = input_dtypes or {} + self.output_dtypes = output_dtypes or {} + self.dynamic_shapes = dynamic_shapes + self.recompile = recompile + self.release_memory = release_memory + self.share_inputs = share_inputs + self.share_outputs = share_outputs + self.return_ov_tensors = return_ov_tensors + + def __copy__(self): + return OVModelParameters( + input_dtypes=self.input_dtypes.copy(), + output_dtypes=self.output_dtypes.copy(), + dynamic_shapes=self.dynamic_shapes, + recompile=self.recompile, + release_memory=self.release_memory, + share_inputs=self.share_inputs, + share_outputs=self.share_outputs, + return_ov_tensors=self.return_ov_tensors, + ) + + def __deepcopy__(self, memo): + return OVModelParameters( + input_dtypes=copy.deepcopy(self.input_dtypes, memo), + output_dtypes=copy.deepcopy(self.output_dtypes, memo), + dynamic_shapes=self.dynamic_shapes, + recompile=self.recompile, + release_memory=self.release_memory, + share_inputs=self.share_inputs, + share_outputs=self.share_outputs, + return_ov_tensors=self.return_ov_tensors, + ) def __hash__(self): return hash( ( - None if self.output_dtypes is None else frozenset(self.input_dtypes.items()), - None if self.output_dtypes is None else frozenset(self.output_dtypes.items()), + frozenset(self.input_dtypes.items()), + 
frozenset(self.output_dtypes.items()), self.dynamic_shapes, self.recompile, self.release_memory, @@ -158,20 +194,27 @@ def _build_compress_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, -) -> Union[ModelCallable, Tuple[List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: - input_dtypes = ov_model_params.input_dtypes - if input_dtypes is None: - raise ValueError("Input dtypes must be provided.") - output_dtypes = ov_model_params.output_dtypes - if output_dtypes is None: - raise ValueError("Output dtypes must be provided.") - - weight_dtype = input_dtypes.get("weight") - input_scale_dtype = input_dtypes.get("scale", TensorDataType.float32) - input_zero_point_dtype = input_dtypes.get("zero_point", TensorDataType.int32) - compressed_weight_dtype = output_dtypes.get("compressed_weight") - output_scale_dtype = output_dtypes.get("scale", TensorDataType.float32) - output_zero_point_dtype = output_dtypes.get("zero_point", TensorDataType.int32) +) -> Union[ModelCallable, Tuple[OVModelParameters, List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: + is_int_asym = config.is_int_asym + default_input_dtypes = { + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + default_output_dtypes = { + "compressed_weight": TensorDataType.uint8 if is_int_asym else TensorDataType.int8, + "scale": TensorDataType.float32, + "zero_point": TensorDataType.int32, + } + ov_model_params = copy.deepcopy(ov_model_params) + ov_model_params.input_dtypes = {**default_input_dtypes, **ov_model_params.input_dtypes} + ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes} + + weight_dtype = ov_model_params.input_dtypes["weight"] + input_scale_dtype = ov_model_params.input_dtypes["scale"] + input_zero_point_dtype = ov_model_params.input_dtypes["zero_point"] + compressed_weight_dtype = ov_model_params.output_dtypes["compressed_weight"] + output_scale_dtype = ov_model_params.output_dtypes["scale"] + output_zero_point_dtype = ov_model_params.output_dtypes["zero_point"] # Validate input dtypes valid_weight_dtypes = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] @@ -181,25 +224,25 @@ def _build_compress_model( ) if scale_shape is not None and input_scale_dtype != TensorDataType.float32: raise ValueError(f"Input scale must be of float32 data type. But found: {input_scale_dtype}.") - if zero_point_shape is not None and input_zero_point_dtype != TensorDataType.int32: - raise ValueError(f"Input zero point must be of int32 data type. But found: {input_zero_point_dtype}.") + if zero_point_shape is not None and input_zero_point_dtype not in [TensorDataType.int32, TensorDataType.float32]: + raise ValueError(f"Input zero point must be of int32/float32 data type. But found: {input_zero_point_dtype}.") # Validate output dtypes valid_compressed_weight_dtypes = [ + TensorDataType.float32, TensorDataType.int32, TensorDataType.int8, TensorDataType.uint8, TensorDataType.int4, TensorDataType.uint4, ] - if compressed_weight_dtype not in valid_compressed_weight_dtypes + [TensorDataType.float32]: + if compressed_weight_dtype not in valid_compressed_weight_dtypes: raise ValueError( f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. " f"But found: {compressed_weight_dtype}." ) if scale_shape is None and output_scale_dtype != TensorDataType.float32: raise ValueError(f"Output scale must be of float32 data type. 
But found: {output_scale_dtype}.") - is_int_asym = config.is_int_asym if is_int_asym and zero_point_shape is None and output_zero_point_dtype not in valid_compressed_weight_dtypes: raise ValueError( f"Output zero point must be of one of the following data types: {valid_compressed_weight_dtypes}. " @@ -222,7 +265,7 @@ def _build_compress_model( min_values = None if scale_shape is not None: # Scale is given as an input - scale = opset.parameter(scale_shape, name="scale", dtype=ov.Type.f32) + scale = opset.parameter(scale_shape, name="scale", dtype=DTYPE_MAP_OV[input_scale_dtype]) ov_parameters.append(scale) else: # Compute scale @@ -250,10 +293,10 @@ def _build_compress_model( zero_point = None if zero_point_shape is not None: # Zero point is given as an input - zero_point = opset.parameter(zero_point_shape, name="zero_point", dtype=ov.Type.i32) + zero_point = opset.parameter(zero_point_shape, name="zero_point", dtype=DTYPE_MAP_OV[input_zero_point_dtype]) ov_parameters.append(zero_point) # Cast to float32 for an addition later - zero_point = opset.convert(zero_point, ov.Type.f32) + zero_point = convert_if_needed(zero_point, ov.Type.f32) elif is_int_asym: # Compute zero point if min_values is None: @@ -264,8 +307,7 @@ def _build_compress_model( zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) zero_point = opset.clamp(zero_point, level_low, level_high) - if weight.get_element_type() != ov.Type.f32: - weight = opset.convert(weight, ov.Type.f32) + weight = convert_if_needed(weight, ov.Type.f32) compressed_weight = weight / scale if is_int_asym: @@ -273,18 +315,17 @@ def _build_compress_model( compressed_weight = opset.round(compressed_weight) compressed_weight = opset.clamp(opset.round(compressed_weight), level_low, level_high) - if compressed_weight_dtype != TensorDataType.float32: - compressed_weight = opset.convert(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) + compressed_weight = convert_if_needed(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype]) ov_results = [compressed_weight] if len(ov_parameters) == 1: ov_results.append(scale) if zero_point is not None: - zero_point = opset.convert(zero_point, DTYPE_MAP_OV[output_zero_point_dtype]) + zero_point = convert_if_needed(zero_point, DTYPE_MAP_OV[output_zero_point_dtype]) ov_results.append(zero_point) if return_nodes: - return ov_parameters, ov_results + return ov_model_params, ov_parameters, ov_results model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") @@ -302,22 +343,17 @@ def _build_compress_decompress_model( reduction_axes: Optional[Tuple] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: - input_dtypes = ov_model_params.input_dtypes - if input_dtypes is None: - raise ValueError("Input dtypes must be provided.") - output_dtypes = ov_model_params.output_dtypes - if output_dtypes is None: - raise ValueError("Output dtypes must be provided.") + default_output_dtypes = {"decompressed_weight": TensorDataType.float32} + if not return_compressed_weight: + default_output_dtypes["compressed_weight"] = TensorDataType.float32 + ov_model_params = copy.deepcopy(ov_model_params) + ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes} - decompressed_weight_dtype = output_dtypes.get("decompressed_weight") + decompressed_weight_dtype = ov_model_params.output_dtypes["decompressed_weight"] if decompressed_weight_dtype != TensorDataType.float32: raise ValueError(f"Decompressed weight must 
be of float32 data type. But found: {decompressed_weight_dtype}.") - if "compressed_weight" not in output_dtypes: - ov_model_params = copy.deepcopy(ov_model_params) - ov_model_params.output_dtypes["compressed_weight"] = TensorDataType.float32 - - ov_parameters, ov_results = get_compress_weight_model( + ov_model_params, ov_parameters, ov_results = get_compress_weight_model( ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) @@ -330,7 +366,9 @@ def _build_compress_decompress_model( compressed_weight = ov_results[0] scale, zero_point = ov_parameters[1:] - compressed_weight = opset.convert(compressed_weight, ov.Type.i32) - opset.convert(zero_point, ov.Type.i32) + compressed_weight = convert_if_needed(compressed_weight, ov.Type.i32) - convert_if_needed( + zero_point, ov.Type.i32 + ) else: if len(ov_parameters) == 1: # weight -> compressed_weight, scale @@ -340,9 +378,7 @@ def _build_compress_decompress_model( compressed_weight = ov_results[0] scale = ov_parameters[1] - if compressed_weight.get_element_type() != ov.Type.f32: - compressed_weight = opset.convert(compressed_weight, ov.Type.f32) - decompressed_weight = opset.multiply(scale, compressed_weight) + decompressed_weight = opset.multiply(scale, convert_if_needed(compressed_weight, ov.Type.f32)) ov_results = [decompressed_weight] + ov_results if return_compressed_weight else [decompressed_weight] model = ov.Model(ov_results, ov_parameters) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 4e1cabc3790..6aa3bdf9867 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -492,16 +492,11 @@ def do_int_quantization( zero_point_shape = None if precomputed_zero_point is None else precomputed_zero_point.shape ov_model_params = OVModelParameters() if ov_model_params is None else copy.deepcopy(ov_model_params) - ov_model_params.input_dtypes = ov_model_params.input_dtypes or { - "weight": weight.dtype, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - } - ov_model_params.output_dtypes = ov_model_params.output_dtypes or { - "compressed_weight": TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - } + ov_model_params.input_dtypes["weight"] = weight.dtype + if precomputed_scale is not None: + ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype + if precomputed_zero_point is not None: + ov_model_params.input_dtypes["zero_point"] = precomputed_zero_point.dtype if config.num_bits == 4 and weight.backend == TensorBackend.ov: # Return ov tensors in target precision to seamlessly insert them into openvino model later ov_model_params.return_ov_tensors = weight.backend == TensorBackend.ov @@ -596,17 +591,11 @@ def calculate_quantized_dequantized_weight( zero_point_shape = precomputed_zero_point.shape if precomputed_zero_point is not None else None ov_model_params = OVModelParameters() if ov_model_params is None else copy.deepcopy(ov_model_params) - ov_model_params.input_dtypes = ov_model_params.input_dtypes or { - "weight": weight.dtype, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - } - ov_model_params.output_dtypes = ov_model_params.output_dtypes or { - "decompressed_weight": TensorDataType.float32, - "compressed_weight": TensorDataType.uint8 if 
config.is_int_asym else TensorDataType.int8, - "scale": TensorDataType.float32, - "zero_point": TensorDataType.int32, - } + ov_model_params.input_dtypes["weight"] = weight.dtype + if precomputed_scale is not None: + ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype + if precomputed_zero_point is not None: + ov_model_params.input_dtypes["zero_point"] = precomputed_zero_point.dtype model = get_compress_decompress_weight_model( ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_compressed_weight diff --git a/nncf/results_caching.py b/nncf/results_caching.py new file mode 100644 index 00000000000..9b314863108 --- /dev/null +++ b/nncf/results_caching.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect + + +class ResultsCacheContainer: + def __init__(self): + self._cache = {} + self._access_count = {} + + def clear(self): + self._cache.clear() + self._access_count.clear() + + def is_empty(self): + return len(self._cache) == 0 + + def __getitem__(self, item): + self._access_count[item] += 1 + return self._cache[item] + + def __setitem__(self, key, value): + self._access_count[key] = 0 + self._cache[key] = value + + def __contains__(self, item): + return item in self._cache + + +def cache_results(cache: ResultsCacheContainer): + def decorator(func): + def wrapper(*args, disable_caching=False, **kwargs): + sig = inspect.signature(func) + new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} + new_kwargs.update(kwargs) + cache_key = (func.__name__, frozenset(new_kwargs.items())) + if cache_key in cache: + return cache[cache_key] + result = func(*args, **kwargs) + if not disable_caching: + cache[cache_key] = result + return result + + return wrapper + + return decorator From b13f1865b9deab0b684bb074f840b3285db70f81 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 14 Nov 2024 16:25:06 +0100 Subject: [PATCH 35/73] Added OV tensor tests --- nncf/tensor/functions/ov.py | 50 ++++++++------- tests/openvino/native/test_tensor.py | 94 ++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 22 deletions(-) create mode 100644 tests/openvino/native/test_tensor.py diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index a868d310190..96bb441e45f 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -18,6 +18,7 @@ from nncf.tensor.functions import numeric from ..definitions import TensorBackend +from ..definitions import TensorDeviceType from .numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP from .numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP @@ -37,26 +38,9 @@ DTYPE_MAP_REV = {v: k for k, v in DTYPE_MAP.items()} -def _ov_astype(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: - from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters - from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model - - a_dtype = 
DTYPE_MAP_REV[a.get_element_type()] - - model = get_astype_model( - OVModelParameters( - input_dtypes={"input": a_dtype}, - output_dtypes={"output": dtype}, - dynamic_shapes=True, - recompile=False, - release_memory=True, - share_inputs=True, - share_outputs=True, - return_ov_tensors=True, - ), - tuple(a.shape), - ) - return model([Tensor(a)])[0].data +@numeric.device.register(ov.Tensor) +def _(a: ov.Tensor) -> TensorDeviceType: + return TensorDeviceType.CPU @numeric.backend.register(ov.Tensor) @@ -71,7 +55,7 @@ def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: TensorDataType.int4, TensorDataType.uint4, ]: - return _ov_astype(a, dtype) + return _astype_ov(a, dtype) return ov.Tensor(a.data.astype(DTYPE_MAP_NP[dtype])) @@ -114,6 +98,28 @@ def _(a: ov.Tensor, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]: dtype = TensorDataType.uint8 elif a_dtype == TensorDataType.int4: dtype = TensorDataType.int8 - a = _ov_astype(a, dtype) + a = _astype_ov(a, dtype) return a.data + + +def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model + + a_dtype = DTYPE_MAP_REV[a.get_element_type()] + + model = get_astype_model( + OVModelParameters( + input_dtypes={"input": a_dtype}, + output_dtypes={"output": dtype}, + dynamic_shapes=False, + recompile=True, + release_memory=True, + share_inputs=True, + share_outputs=True, + return_ov_tensors=True, + ), + tuple(a.shape), + ) + return model([Tensor(a)])[0].data diff --git a/tests/openvino/native/test_tensor.py b/tests/openvino/native/test_tensor.py new file mode 100644 index 00000000000..e9b1a136a4b --- /dev/null +++ b/tests/openvino/native/test_tensor.py @@ -0,0 +1,94 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import openvino as ov +import pytest + +import openvino.runtime.opset13 as opset +from nncf.tensor import TensorDataType, Tensor +from nncf.tensor.definitions import TensorBackend +from nncf.tensor.definitions import TensorDeviceType +import nncf.tensor.functions as fns +from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP +from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV + + +class TestOVNNCFTensorOperators: + @staticmethod + def to_tensor(x, backend=TensorBackend.ov, dtype=TensorDataType.float32): + if backend == TensorBackend.ov: + if dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + ov_const = opset.constant(x, dtype=DTYPE_MAP_OV[dtype]) + return ov.Tensor(ov_const.data, ov_const.data.shape, DTYPE_MAP_OV[dtype]) + else: + return ov.Tensor(np.array(x, dtype=DTYPE_MAP_NP[dtype])) + elif backend == TensorBackend.numpy: + if dtype in [TensorDataType.bfloat16, TensorDataType.uint4, TensorDataType.int4]: + raise ValueError(f"Can't create NumPY tensor in dtype {dtype}") + return np.array(x, dtype=DTYPE_MAP_NP[dtype]) + else: + raise ValueError("Unsupported backend") + + @staticmethod + def backend() -> TensorBackend: + return TensorBackend.ov + + def test_property_backend(self): + tensor_a = Tensor(self.to_tensor([1, 2])) + assert tensor_a.backend == self.backend() + + def test_device(self): + tensor = Tensor(self.to_tensor([1])) + assert tensor.device == TensorDeviceType.CPU + + def test_size(self): + tensor = Tensor(self.to_tensor([1, 1])) + res = tensor.size + assert res == 2 + + def test_astype(self): + tensor = Tensor(self.to_tensor([1])) + res = tensor.astype(TensorDataType.int8) + assert isinstance(res, Tensor) + assert res.dtype == TensorDataType.int8 + assert res.device == tensor.device + + def test_fn_astype(self): + tensor = Tensor(self.to_tensor([1])) + res = fns.astype(tensor, TensorDataType.int8) + assert isinstance(res, Tensor) + assert res.dtype == TensorDataType.int8 + + def test_reshape(self): + tensor = Tensor(self.to_tensor([1, 1])) + res = tensor.reshape((1, 2)) + assert tensor.shape == (2,) + assert res.shape == (1, 2) + assert res.device == tensor.device + + def test_fn_reshape(self): + tensor = Tensor(self.to_tensor([1, 1])) + res = fns.reshape(tensor, (1, 2)) + assert tensor.shape == (2,) + assert res.shape == (1, 2) + assert res.device == tensor.device + + @pytest.mark.parametrize("from_backend", [TensorBackend.numpy, TensorBackend.ov]) + @pytest.mark.parametrize("to_backend", [TensorBackend.numpy, TensorBackend.ov]) + def test_to_backend(self, from_backend, to_backend): + tensor1 = Tensor(self.to_tensor([1], backend=from_backend)) + assert tensor1.backend == from_backend + tensor2 = tensor1.to_backend(to_backend) + assert tensor2.backend == to_backend + assert tensor1.dtype == tensor2.dtype + assert tensor1.shape == tensor2.shape + assert tensor1.device == tensor2.device From 9e90d5abec32f554c66884ff8c84328dee6d6559 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 14 Nov 2024 16:29:06 +0100 Subject: [PATCH 36/73] Minor file reorg --- nncf/{utils.py => import_utils.py} | 0 nncf/results_caching.py | 55 ------------------------------ 2 files changed, 55 deletions(-) rename nncf/{utils.py => import_utils.py} (100%) delete mode 100644 nncf/results_caching.py diff --git a/nncf/utils.py b/nncf/import_utils.py similarity index 100% rename from nncf/utils.py rename to nncf/import_utils.py diff --git a/nncf/results_caching.py b/nncf/results_caching.py deleted file mode 100644 
index 9b314863108..00000000000 --- a/nncf/results_caching.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect - - -class ResultsCacheContainer: - def __init__(self): - self._cache = {} - self._access_count = {} - - def clear(self): - self._cache.clear() - self._access_count.clear() - - def is_empty(self): - return len(self._cache) == 0 - - def __getitem__(self, item): - self._access_count[item] += 1 - return self._cache[item] - - def __setitem__(self, key, value): - self._access_count[key] = 0 - self._cache[key] = value - - def __contains__(self, item): - return item in self._cache - - -def cache_results(cache: ResultsCacheContainer): - def decorator(func): - def wrapper(*args, disable_caching=False, **kwargs): - sig = inspect.signature(func) - new_kwargs = {name: arg for name, arg in zip(sig.parameters, args)} - new_kwargs.update(kwargs) - cache_key = (func.__name__, frozenset(new_kwargs.items())) - if cache_key in cache: - return cache[cache_key] - result = func(*args, **kwargs) - if not disable_caching: - cache[cache_key] = result - return result - - return wrapper - - return decorator From 5f46593aaacb4152494f12be1dabde9bc95ff959 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 14 Nov 2024 16:42:29 +0100 Subject: [PATCH 37/73] Tweaks --- .../algorithms/weight_compression/weight_lowering.py | 2 +- nncf/tensor/tensor.py | 6 ------ .../native/quantization/test_ov_modeling_compression.py | 6 +----- tests/openvino/native/test_tensor.py | 7 ++++--- 4 files changed, 6 insertions(+), 15 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 6aa3bdf9867..459af440696 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -17,6 +17,7 @@ import nncf from nncf.common.logging.logger import log_once +from nncf.import_utils import is_openvino_available from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.fake_quantize import calculate_scale_zero_point @@ -24,7 +25,6 @@ from nncf.tensor import functions as fns from nncf.tensor.definitions import TensorBackend from nncf.tensor.definitions import TensorDataType -from nncf.utils import is_openvino_available ReductionAxes = Tuple[int, ...] 
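For context, a minimal sketch of how the caching utilities consolidated into nncf/common/utils/decorators.py by the commits above are intended to be used. The build_model function, its arguments and the EXAMPLE_CACHE name are hypothetical placeholders, not part of the patches; only the decorator behavior shown in the diffs is assumed.

from nncf.common.utils.decorators import ResultsCacheContainer
from nncf.common.utils.decorators import cache_results

EXAMPLE_CACHE = ResultsCacheContainer()


@cache_results(EXAMPLE_CACHE)
def build_model(shape, dtype):
    # Hypothetical expensive construction step. All arguments become part of the cache key
    # (function name plus a frozenset of the keyword items), so they must be hashable.
    ...


first = build_model((8, 16), "f32")                        # computed and stored in the cache
second = build_model((8, 16), "f32")                       # served from the cache
third = build_model((8, 16), "f32", disable_caching=True)  # recomputed, result not cached
EXAMPLE_CACHE.clear()                                      # drop all cached results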
diff --git a/nncf/tensor/tensor.py b/nncf/tensor/tensor.py index a17758c2ab1..19cba0482a9 100644 --- a/nncf/tensor/tensor.py +++ b/nncf/tensor/tensor.py @@ -116,12 +116,6 @@ def __ipow__(self, other: Union[Tensor, float]) -> Tensor: self._data **= unwrap_tensor_data(other) return self - # def __truediv__(self, other: Union[Tensor, float]) -> Tensor: - # return self * _call_function("_binary_op_nowarn", 1.0, other, operator.truediv) - # - # def __rtruediv__(self, other: Union[Tensor, float]) -> Tensor: - # return other * _call_function("_binary_reverse_op_nowarn", self, 1.0, operator.truediv) - def __truediv__(self, other: Union[Tensor, float]) -> Tensor: return _call_function("_binary_op_nowarn", self, other, operator.truediv) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index d8c6bfa7ffa..068795b485b 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -58,8 +58,6 @@ class QuantizationTask(Enum): WeightCompressionConfig(CompressWeightsMode.INT4_SYM, group_size=2), ] -DATA_TYPES = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16] - MAX_MISALIGNMENT_FREQUENCY = { TensorDataType.float32: 1e-2, # tends to < 5e-6 TensorDataType.float16: 1e-2, # tends to < 5e-5 @@ -68,8 +66,6 @@ class QuantizationTask(Enum): MAX_MISALIGNMENT_MAGNITUDE = 1 -TENSOR_BACKENDS = [TensorBackend.numpy, TensorBackend.ov] - EPS = np.finfo(np.float32).eps REDUCTION_AXES = (1,) @@ -126,7 +122,7 @@ def openvino_available(available: bool): (QuantizationTask.Q_DQ_RQ, "auto"), ], ) -@pytest.mark.parametrize("dtype", DATA_TYPES) +@pytest.mark.parametrize("dtype", [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]) @pytest.mark.parametrize("precompute_s_zp", [False, True], ids=["no-precompute", "precompute"]) @pytest.mark.parametrize("static_shapes", [False, True], ids=["dynamic-shapes", "static-shapes"]) def test_quantization_alignment( diff --git a/tests/openvino/native/test_tensor.py b/tests/openvino/native/test_tensor.py index e9b1a136a4b..1adb98c5d66 100644 --- a/tests/openvino/native/test_tensor.py +++ b/tests/openvino/native/test_tensor.py @@ -11,13 +11,14 @@ import numpy as np import openvino as ov +import openvino.runtime.opset13 as opset import pytest -import openvino.runtime.opset13 as opset -from nncf.tensor import TensorDataType, Tensor +import nncf.tensor.functions as fns +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend from nncf.tensor.definitions import TensorDeviceType -import nncf.tensor.functions as fns from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP from nncf.tensor.functions.ov import DTYPE_MAP as DTYPE_MAP_OV From e7617f1816a11de673a6e549c18e1969074136fd Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 14 Nov 2024 16:47:50 +0100 Subject: [PATCH 38/73] Tweaks --- .../native/quantization/test_ov_modeling_compression.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 068795b485b..3e09714cae0 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -99,12 +99,12 @@ def 
get_random_integer_tensor(shape, low, high, dtype, backend, seed=0): @contextmanager def openvino_available(available: bool): - import nncf.utils + import nncf.import_utils - original_value = nncf.utils._openvino_available - nncf.utils._openvino_available = available + original_value = nncf.import_utils._openvino_available + nncf.import_utils._openvino_available = available yield - nncf.utils._openvino_available = original_value + nncf.import_utils._openvino_available = original_value @pytest.mark.parametrize("weight_shape", [(10000, 4)], ids=[""]) From 925f830dd9e40f673edb243bd2d71c235c493e2f Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 10:43:50 +0100 Subject: [PATCH 39/73] Switch to OV 2024.5 rc2 --- .github/workflows/precommit.yml | 4 ++-- tests/openvino/native/models.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index 218d9c32fd1..822c360349e 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -64,8 +64,8 @@ jobs: cache: pip - name: Install NNCF and test requirements run: make install-openvino-test - - name: Install OpenVINO Nightly - run: pip install -U --pre openvino==2024.5.0.dev20241015 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - name: Install OpenVINO 2024.5 + run: pip install -U --pre openvino==2024.5.0rc2 openvino-tokenizers==2024.5.0rc2 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Print installed modules run: pip list - name: Run OV precommit test scope diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index 7bca0e5b04f..95f079a8800 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -290,11 +290,11 @@ def __init__(self, const_dtype: ov.Type = ov.Type.f32, input_dtype: ov.Type = ov def _create_ov_model(self): input_shape = [1, 3, 4, 2] input_1 = opset.parameter(input_shape, name="Input", dtype=self.input_dtype) - data = opset.constant(self._rng.random((1, 3, 4, 5)), dtype=self.const_dtype, name="MatMul_const") + data = opset.constant(value=self._rng.random((1, 3, 4, 5)), dtype=self.const_dtype, name="MatMul_const") if self.const_dtype != self.input_dtype: data = opset.convert(data, self.input_dtype.to_string()) matmul = opset.matmul(input_1, data, transpose_a=True, transpose_b=False, name="MatMul") - bias = opset.constant(self._rng.random((1, 3, 1, 1)), dtype=self.const_dtype, name="MatMul_bias") + bias = opset.constant(value=self._rng.random((1, 3, 1, 1)), dtype=self.const_dtype, name="MatMul_bias") if self.const_dtype != self.input_dtype: bias = opset.convert(bias, self.input_dtype.to_string()) add = opset.add(matmul, bias, name="Add") From 5831fcda2d855226f79a37550a6f92584efe0315 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 11:34:17 +0100 Subject: [PATCH 40/73] Additional tests for ov_modeling --- .../openvino/native/test_openvino_modeling.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py index 14ec9f740ab..4a6f11654c2 100644 --- a/tests/openvino/native/test_openvino_modeling.py +++ b/tests/openvino/native/test_openvino_modeling.py @@ -18,6 +18,7 @@ from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import 
get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model +from nncf.quantization.algorithms.weight_compression.openvino_modeling import run_model from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend @@ -222,3 +223,75 @@ def test_return_ov_tensors(model_getter, return_ov_tensors): outputs = model_run_fn(inputs) assert all([out.backend == (TensorBackend.ov if return_ov_tensors else TensorBackend.numpy) for out in outputs]) + + +@pytest.mark.parametrize("release_memory", [True, False]) +def test_release_memory(mocker, release_memory): + compiled_model = mocker.Mock() + compiled_model.release_memory = mocker.Mock() + + input_mock = mocker.Mock() + input_mock.any_name = "input" + compiled_model.inputs = [input_mock] + + output_mock = mocker.Mock() + compiled_model.return_value = [output_mock] + + ov_model_params = OVModelParameters(input_dtypes={"input": TensorDataType.float32}, release_memory=release_memory) + input_tensor = mocker.Mock() + input_tensor.dtype = TensorDataType.float32 + input_tensor.data = [1, 2, 3] + inputs = [input_tensor] + + run_model(ov_model_params, compiled_model, return_ov_tensors=False, inputs=inputs) + if release_memory: + compiled_model.release_memory.assert_called_once() + else: + compiled_model.release_memory.assert_not_called() + + +@pytest.mark.parametrize("share_inputs", [True, False]) +@pytest.mark.parametrize("share_outputs", [True, False]) +@pytest.mark.parametrize("return_ov_tensors", [True, False]) +def test_share_inputs_outputs(mocker, share_inputs, share_outputs, return_ov_tensors): + compiled_model = mocker.Mock() + + input_mock = mocker.Mock() + input_mock.any_name = "input" + compiled_model.inputs = [input_mock] + + output_mock = mocker.Mock() + + if return_ov_tensors: + infer_request = mocker.Mock() + compiled_model.create_infer_request.return_value = infer_request + + infer_request.infer = mocker.Mock() + infer_request.results = [output_mock] + + infer_request.get_output_tensor.return_value = output_mock + else: + compiled_model.return_value = [output_mock] + + ov_model_params = OVModelParameters( + input_dtypes={"input": TensorDataType.float32}, + return_ov_tensors=return_ov_tensors, + share_inputs=share_inputs, + share_outputs=share_outputs, + ) + + input_tensor = mocker.Mock() + input_tensor.dtype = TensorDataType.float32 + input_tensor.data = [1, 2, 3] + inputs = [input_tensor] + + run_model(ov_model_params, compiled_model, return_ov_tensors=return_ov_tensors, inputs=inputs) + + if return_ov_tensors: + infer_request.infer.assert_called_once_with( + [input_tensor.data], share_inputs=share_inputs, share_outputs=share_outputs + ) + else: + compiled_model.assert_called_once_with( + [input_tensor.data], share_inputs=share_inputs, share_outputs=share_outputs + ) From 9160de3dec73725cd0ba1d287bf23b89e9133883 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 11:44:28 +0100 Subject: [PATCH 41/73] Type hints --- nncf/common/utils/decorators.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py index b3fd2a0e3ad..4d5b2247ba6 100644 --- a/nncf/common/utils/decorators.py +++ b/nncf/common/utils/decorators.py @@ -55,32 +55,32 @@ def wrapped_f(*args: Any, **kwargs: Any): # type: ignore class ResultsCacheContainer: - def __init__(self): - self._cache = {} - self._access_count = {} + def 
__init__(self) -> None: + self._cache: Dict[Any, Any] = {} + self._access_count: Dict[Any, int] = {} - def clear(self): + def clear(self) -> None: self._cache.clear() self._access_count.clear() - def is_empty(self): + def is_empty(self) -> bool: return len(self._cache) == 0 - def __getitem__(self, item): + def __getitem__(self, item: Any) -> Any: self._access_count[item] += 1 return self._cache[item] - def __setitem__(self, key, value): + def __setitem__(self, key: Any, value: Any) -> None: self._access_count[key] = 0 self._cache[key] = value - def __contains__(self, item): + def __contains__(self, item: Any) -> bool: return item in self._cache -def cache_results(cache: ResultsCacheContainer): - def decorator(func): - def wrapper(*args, disable_caching=False, **kwargs): +def cache_results(cache: ResultsCacheContainer) -> Callable: + def decorator(func: Callable) -> Callable: + def wrapper(*args, disable_caching: bool = False, **kwargs) -> Any: if disable_caching: return func(*args, **kwargs) sig = inspect.signature(func) From c7c63eb34b10b77e0b48229d7be92a94186c75a1 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 11:50:43 +0100 Subject: [PATCH 42/73] Ignore mypy --- nncf/common/utils/decorators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py index 4d5b2247ba6..c2ef9a4fe92 100644 --- a/nncf/common/utils/decorators.py +++ b/nncf/common/utils/decorators.py @@ -78,9 +78,9 @@ def __contains__(self, item: Any) -> bool: return item in self._cache -def cache_results(cache: ResultsCacheContainer) -> Callable: - def decorator(func: Callable) -> Callable: - def wrapper(*args, disable_caching: bool = False, **kwargs) -> Any: +def cache_results(cache: ResultsCacheContainer) -> Callable: # type: ignore + def decorator(func: Callable) -> Callable: # type: ignore + def wrapper(*args, disable_caching: bool = False, **kwargs) -> Any: # type: ignore if disable_caching: return func(*args, **kwargs) sig = inspect.signature(func) From 764f7222caa44e65c25ec9139d6a65b4682b44b7 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 15 Nov 2024 11:56:04 +0100 Subject: [PATCH 43/73] Reuse DTYPE_MAP_REV --- .../weight_compression/openvino_backend.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index b0d0ae79c96..ffec97d080e 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -53,6 +53,7 @@ from nncf.tensor import Tensor from nncf.tensor.definitions import TensorBackend from nncf.tensor.definitions import TensorDataType +from nncf.tensor.functions.ov import DTYPE_MAP_REV class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend): @@ -130,19 +131,9 @@ def get_weight(self, node_with_weight: NNCFNode, weight_port_id: int, model: ov. 
    def get_weight_dtype(
        self, node_with_weight: NNCFNode, weight_port_id: int, model: ov.Model, graph: NNCFGraph
    ) -> TensorDataType:
-        # TODO: use from nncf.tensor.functions.ov import DTYPE_MAP
        ov_type_name = node_with_weight.layer_attributes.constant_attributes[weight_port_id]["dtype"]
-        dtype_map = {
-            "f16": TensorDataType.float16,
-            "bf16": TensorDataType.bfloat16,
-            "f32": TensorDataType.float32,
-            "f64": TensorDataType.float64,
-            "i8": TensorDataType.int8,
-            "i32": TensorDataType.int32,
-            "i64": TensorDataType.int64,
-            "u8": TensorDataType.uint8,
-        }
-        return dtype_map.get(ov_type_name)
+        ov_type = getattr(ov.Type, ov_type_name)
+        return DTYPE_MAP_REV[ov_type]

    @staticmethod
    def get_weight_shape(node_with_weight: NNCFNode, weight_port_id: int, graph: NNCFGraph) -> Tuple:

From 4a448e1c46ae89a24c602663053f857279163677 Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Mon, 18 Nov 2024 16:22:13 +0100
Subject: [PATCH 44/73] Added docstrings

---
 nncf/common/logging/logger.py                 |   7 +-
 nncf/common/utils/decorators.py               |  15 +++
 nncf/import_utils.py                          |   4 +
 nncf/openvino/graph/node_utils.py             |   2 +
 .../weight_compression/openvino_backend.py    |  16 ++-
 .../weight_compression/openvino_modeling.py   | 126 ++++++++++++++----
 .../weight_compression/weight_lowering.py     |  58 ++++----
 nncf/quantization/fake_quantize.py            |   4 +-
 nncf/tensor/functions/__init__.py             |   4 +-
 nncf/tensor/functions/numeric.py              |  10 +-
 nncf/tensor/functions/ov.py                   |   7 +
 .../openvino/native/test_openvino_modeling.py |   6 +-
 12 files changed, 188 insertions(+), 71 deletions(-)

diff --git a/nncf/common/logging/logger.py b/nncf/common/logging/logger.py
index e13fcaa8442..5b02bbb77f1 100644
--- a/nncf/common/logging/logger.py
+++ b/nncf/common/logging/logger.py
@@ -90,5 +90,10 @@ def warn_bkc_version_mismatch(backend: str, bkc_version: str, current_version: s


 @lru_cache(None)
-def log_once(level, message):
+def log_once(level: int, message: str) -> None:
+    """
+    Logs a message only once.
+    :param level: Logging level, e.g. logging.WARNING.
+    :param message: The message to log.
+    """
     nncf_logger.log(level, message)

diff --git a/nncf/common/utils/decorators.py b/nncf/common/utils/decorators.py
index c2ef9a4fe92..5f9f14eaf4c 100644
--- a/nncf/common/utils/decorators.py
+++ b/nncf/common/utils/decorators.py
@@ -55,8 +55,14 @@ def wrapped_f(*args: Any, **kwargs: Any):  # type: ignore


 class ResultsCacheContainer:
+    """
+    A container for results of functions decorated with the @cache_results decorator.
+    """
+
     def __init__(self) -> None:
+        # Stores the results of the decorated function
         self._cache: Dict[Any, Any] = {}
+        # Stores the number of times the cached result was accessed
         self._access_count: Dict[Any, int] = {}

     def clear(self) -> None:
@@ -79,6 +85,15 @@ def __contains__(self, item: Any) -> bool:
         return item in self._cache


-def cache_results(cache: ResultsCacheContainer) -> Callable:  # type: ignore
+def cache_results(cache: ResultsCacheContainer) -> Callable:  # type: ignore
+    """
+    Decorator to cache the results of a function.
+
+    The decorated function additionally accepts a `disable_caching` argument to disable caching if needed. If it is
+    True, the result will not be saved to the cache. Also, if there is a corresponding result in the cache, it will
+    be recomputed instead of being taken from the cache.
+    :param cache: A cache container where results will be stored.
+ """ + def decorator(func: Callable) -> Callable: # type: ignore def wrapper(*args, disable_caching: bool = False, **kwargs) -> Any: # type: ignore if disable_caching: diff --git a/nncf/import_utils.py b/nncf/import_utils.py index 50a315e4048..3608deeae20 100644 --- a/nncf/import_utils.py +++ b/nncf/import_utils.py @@ -29,4 +29,8 @@ def is_openvino_available(): + """ + Check if OpenVINO is available. + :return: True if openvino package is installed, False otherwise. + """ return _openvino_available diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 24677d52968..67bf9143cd4 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -113,6 +113,8 @@ def get_const_value(const_node: ov.Node, cast_bf16_to_fp32: Optional[bool] = Tru This method is applicable only for the floating-point constant data. :param const_node: OpenVINO node. + :param cast_bf16_to_fp32: Whether to cast bf16 node data to fp32 or not. If False and the node contains bf16 data, + the resulting bf16 value will be returned encoded inside a numpy.float16 array. :return: The constant value. """ if const_node.get_element_type() == ov.Type.bf16 and cast_bf16_to_fp32: diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index ffec97d080e..49e842f72d5 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -30,6 +30,7 @@ from nncf.openvino.graph.metatypes import openvino_metatypes as om from nncf.openvino.graph.metatypes.groups import ATOMIC_ACTIVATIONS_OPERATIONS from nncf.openvino.graph.model_transformer import OVModelTransformer +from nncf.openvino.graph.node_utils import convert_if_needed from nncf.openvino.graph.node_utils import get_const_value from nncf.openvino.graph.node_utils import get_weight_channel_axes from nncf.openvino.graph.transformations.command_creation import OVCommandCreator @@ -242,8 +243,7 @@ def _create_compression_subgraph( compressed_const = self._create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name ) - if compressed_const.get_element_type() != compression_dtype: - compressed_const = opset.convert(compressed_const, compression_dtype) + compressed_const = convert_if_needed(compressed_const, compression_dtype) converted_const = opset.convert(compressed_const, ov.Type.f16) if compressed_weight.zero_point is not None: @@ -258,8 +258,7 @@ def _create_compression_subgraph( scale_const = self._create_ov_const_from_tensor( compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale" ) - if scale_const.get_element_type() != ov.Type.f16: - scale_const = opset.convert(scale_const, ov.Type.f16) + scale_const = convert_if_needed(scale_const, ov.Type.f16) mul = opset.multiply( converted_const, @@ -338,6 +337,7 @@ def transform_model( # reset name_to_node_mapping self.name_to_node_mapping = None + # clear openvino model cache OV_MODEL_CACHE.clear() return model @@ -350,6 +350,14 @@ def dump_parameters( @staticmethod def _create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = None) -> Constant: + """ + Create an OpenVINO Constant node from the given tensor. + :param x: Data tensor. Supports NumPy and OV tensor backends. If x backend is OV, the constant node is created + directly from underlying OV tensor. + :param dtype: Data type of the constant. 
+ :param name: Optional name of the constant. + :return: OpenVINO Constant node. + """ if x.backend == TensorBackend.ov: assert x.data.get_element_type() == dtype return opset.constant(x.data, name=name) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index d11679a9081..69abd5309b1 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -16,6 +16,8 @@ import numpy as np import openvino as ov +from openvino._pyopenvino.op import Parameter +from openvino.runtime import Node from openvino.runtime import opset13 as opset from nncf.common.utils.decorators import ResultsCacheContainer @@ -35,6 +37,10 @@ @dataclass(init=False) class OVModelParameters: + """ + A class to hold parameters for building and inferring an OpenVINO model. + """ + def __init__( self, input_dtypes: Optional[Dict[str, TensorDataType]] = None, @@ -46,6 +52,18 @@ def __init__( share_outputs: bool = True, return_ov_tensors: bool = False, ): + """ + :param input_dtypes: Optional dictionary mapping input names to their data types. + :param output_dtypes: Optional dictionary mapping output names to their data types. + :param dynamic_shapes: Whether to use dynamic shapes for the model. When dynamic shapes are used and + recompile is False, it allows to save on the number of models stored in a model cache. + :param recompile: Whether to recompile the model before every inference. Otherwise, compiled models are cached. + :param release_memory: Whether to release memory after every inference. If memory is released, it will be + reallocated during every inference, reducing performance to some extent. + :param share_inputs: Whether to share input tensors. Avoids cloning inputs for inference. + :param share_outputs: Whether to share output tensors. Avoids cloning outputs after the inference. + :param return_ov_tensors: Whether to return results as OpenVINO tensors or NumPy arrays. + """ self.input_dtypes = input_dtypes or {} self.output_dtypes = output_dtypes or {} self.dynamic_shapes = dynamic_shapes @@ -94,9 +112,19 @@ def __hash__(self): ) -def run_model( - ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, return_ov_tensors: bool, inputs: TensorList +ModelAsNodes = Tuple[List[Parameter], List[Node], OVModelParameters] + + +def _infer_ov_model( + ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList ) -> TensorList: + """ + Run compiled OpenVINO model inference on the given inputs. + :param ov_model_params: OV model related parameters. + :param compiled_model: Compiled OpenVINO model. + :param inputs: Input tensors. + :return: List of output tensors. Tensor backend is OV if return_ov_tensors is True, else NumPy. 
+ """ # Check that input dtypes match the expected dtypes for i, inp in enumerate(compiled_model.inputs): input_name = inp.any_name @@ -107,7 +135,7 @@ def run_model( # Infer the model inputs = [inp.data for inp in inputs] - if return_ov_tensors: + if ov_model_params.return_ov_tensors: infer_request = compiled_model.create_infer_request() infer_request.infer( inputs, share_inputs=ov_model_params.share_inputs, share_outputs=ov_model_params.share_outputs @@ -134,10 +162,28 @@ def get_compress_weight_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: Optional[bool] = False, -) -> ModelCallable: +) -> Union[ModelCallable, ModelAsNodes]: + """ + Get a model that compresses weights using the given configuration. + :param ov_model_params: OV model parameters. + :param config: Compression configuration. + :param weight_shape: Shape of the weight to compress. Weight is assumed to be already reshaped as needed. + :param scale_shape: Optional shape of the scale. If not provided, scale will be computed by the OV model. + Otherwise, it is expected that the scale tensor is given as an input to the model. + :param zero_point_shape: Optional shape of the zero point tensor. If not provided and the mode is asymmetric, + zero point will be computed by the OV model. Otherwise, it is expected that the zero point tensor is provided + as an input. + :param reduction_axes: Optional axes to reduce the weight tensor. Not needed if scale (and z.p.) are provided as + inputs. + :param return_nodes: Whether to return the OV model inputs parameters and results nodes instead of the model + callable. + :return: A model callable that compresses weights using the given configuration. Or a model as nodes, if + `return_nodes` is True. + """ if scale_shape is None and zero_point_shape is not None: raise Exception("Zero point shape can only be provided if scale shape is provided.") + # Set dynamic shapes if needed if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) if scale_shape is not None: @@ -166,6 +212,25 @@ def get_compress_decompress_weight_model( reduction_axes: Optional[Tuple] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: + """ + Get a model that performs compression and decompression of the given weight. + :param ov_model_params: OV model parameters. + :param config: Compression configuration. + :param weight_shape: Shape of the weight. Weight is assumed to be already reshaped as needed. + :param scale_shape: Optional shape of the scale. If not provided, scale will be computed by the OV model. + Otherwise, it is expected that the scale tensor is given as an input to the model. + :param zero_point_shape: Optional shape of the zero point tensor. If not provided and the mode is asymmetric, + zero point will be computed by the OV model. Otherwise, it is expected that the zero point is provided as an + input. + :param reduction_axes: Optional axes to reduce the weight tensor. Not needed if scale (and z.p.) are provided as + inputs. + :param return_compressed_weight: Whether to also return compressed weight, scale, (and zero point) besides the + decompressed weight. + :return: A model callable that returns a decompressed weight, and optionally compressed weight, scale, + (and zero point) if `return_compressed_weight` is True. 
+ """ + + # Set dynamic shapes if needed if ov_model_params.dynamic_shapes: weight_shape = (-1,) * len(weight_shape) if scale_shape is not None: @@ -194,8 +259,9 @@ def _build_compress_model( zero_point_shape: Optional[Tuple] = None, reduction_axes: Optional[Tuple] = None, return_nodes: bool = False, -) -> Union[ModelCallable, Tuple[OVModelParameters, List[ov._pyopenvino.Node], List[ov._pyopenvino.Node]]]: +) -> Union[ModelCallable, ModelAsNodes]: is_int_asym = config.is_int_asym + default_input_dtypes = { "scale": TensorDataType.float32, "zero_point": TensorDataType.int32, @@ -205,10 +271,15 @@ def _build_compress_model( "scale": TensorDataType.float32, "zero_point": TensorDataType.int32, } + + # Update input and output dtypes with the default values ov_model_params = copy.deepcopy(ov_model_params) ov_model_params.input_dtypes = {**default_input_dtypes, **ov_model_params.input_dtypes} ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes} + if "weight" not in ov_model_params.input_dtypes: + raise ValueError("Input weight dtype is required!") + weight_dtype = ov_model_params.input_dtypes["weight"] input_scale_dtype = ov_model_params.input_dtypes["scale"] input_zero_point_dtype = ov_model_params.input_dtypes["zero_point"] @@ -255,12 +326,8 @@ def _build_compress_model( num_bits = config.num_bits eps = np.finfo(np.float32).eps - if is_int_asym: - level_low = 0 - level_high = 2**num_bits - 1 - else: - level_low = -(2 ** (num_bits - 1)) - level_high = 2 ** (num_bits - 1) - 1 + level_low = 0 if is_int_asym else -(2 ** (num_bits - 1)) + level_high = 2**num_bits - 1 if is_int_asym else 2 ** (num_bits - 1) - 1 min_values = None if scale_shape is not None: @@ -270,12 +337,9 @@ def _build_compress_model( else: # Compute scale if is_int_asym: - min_values = opset.reduce_min( - weight, reduction_axes=reduction_axes, keep_dims=True - ) # [a1, r, a2] -> [a1, 1, a2] - max_values = opset.reduce_max( - weight, reduction_axes=reduction_axes, keep_dims=True - ) # [a1, r, a2] -> [a1, 1, a2] + # [a1, r, a2] -> [a1, 1, a2] + min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) + max_values = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True) min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) levels = level_high - level_low + 1 @@ -300,9 +364,8 @@ def _build_compress_model( elif is_int_asym: # Compute zero point if min_values is None: - min_values = opset.reduce_min( - weight, reduction_axes=reduction_axes, keep_dims=True - ) # [a1, r, a2] -> [a1, 1, a2] + # [a1, r, a2] -> [a1, 1, a2] + min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) min_values = opset.convert(min_values, ov.Type.f32) zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) zero_point = opset.clamp(zero_point, level_low, level_high) @@ -325,12 +388,12 @@ def _build_compress_model( ov_results.append(zero_point) if return_nodes: - return ov_model_params, ov_parameters, ov_results + return ov_parameters, ov_results, ov_model_params model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) + return partial(_infer_ov_model, ov_model_params, compiled_model) @cache_results(OV_MODEL_CACHE) @@ -345,6 +408,7 @@ def _build_compress_decompress_model( ) -> ModelCallable: default_output_dtypes = 
{"decompressed_weight": TensorDataType.float32} if not return_compressed_weight: + # If compressed weight is not returned to a user, we can keep it in float32 to avoid additional conversion default_output_dtypes["compressed_weight"] = TensorDataType.float32 ov_model_params = copy.deepcopy(ov_model_params) ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes} @@ -353,7 +417,8 @@ def _build_compress_decompress_model( if decompressed_weight_dtype != TensorDataType.float32: raise ValueError(f"Decompressed weight must be of float32 data type. But found: {decompressed_weight_dtype}.") - ov_model_params, ov_parameters, ov_results = get_compress_weight_model( + # Get compression model as input/result nodes and potentially modified ov model parameters + ov_parameters, ov_results, ov_model_params = get_compress_weight_model( ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_nodes=True ) @@ -384,10 +449,21 @@ def _build_compress_decompress_model( model = ov.Model(ov_results, ov_parameters) compiled_model = ov.compile_model(model, device_name="CPU") - return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) + return partial(_infer_ov_model, ov_model_params, compiled_model) def get_astype_model(ov_model_params: OVModelParameters, input_shape: Tuple) -> ModelCallable: + """ + Return a model that cast the input of the given shape to the given data type. Especially useful for + casting from/to data types not supported by NumPy such as bfloat16, uint4 and int4. + These data types are represented as the following data types in numpy: + - bfloat16 -> np.float16, + - uint4 -> uint8, + - int4 -> int8. + :param ov_model_params: OV model related parameters. + :param input_shape: Shape of the tensor to cast. + :return: A model callable that casts the input tensor to the given data type. 
+ """ if ov_model_params.dynamic_shapes: input_shape = (-1,) * len(input_shape) return _build_astype_model(ov_model_params, input_shape, disable_caching=ov_model_params.recompile) @@ -411,4 +487,4 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> model = ov.Model([res], [arg]) compiled_model = ov.compile_model(model, device_name="CPU") - return partial(run_model, ov_model_params, compiled_model, ov_model_params.return_ov_tensors) + return partial(_infer_ov_model, ov_model_params, compiled_model) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 459af440696..339154ffa52 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -158,7 +158,7 @@ def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bi w_max = fns.max(weight, axis=reduction_axes, keepdims=True) scale = fns.where(w_abs_min >= w_max, w_abs_min, -w_max) - fns.inplace_divide(scale, level_high) + fns.inplace_inverted_divide(scale, level_high) eps = fns.finfo(scale).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) @@ -179,7 +179,7 @@ def calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: if scale.dtype != TensorDataType.float32: scale = scale.astype(TensorDataType.float32) - return fns.divide(weight, scale) + return fns.inverted_divide(weight, scale) def do_nf4_quantization(weight: Tensor, scale: Tensor, is_normalized_weight: bool = False) -> Tensor: @@ -312,7 +312,7 @@ def calculate_quantized_weight( level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - compressed_weights = fns.divide(weight, scale) + compressed_weights = fns.inverted_divide(weight, scale) if zero_point is not None: compressed_weights += zero_point.astype(weight.dtype) compressed_weights = fns.round(compressed_weights) @@ -430,7 +430,7 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, ov_model_params: Optional = None, -): +) -> Tuple[Tensor, Tensor, Tensor]: """ Performs integer quantization on the given weight tensor. @@ -450,14 +450,9 @@ def do_int_quantization( "for asymmetric quantization." 
) - # import os - accelerate_through_ov = ( - is_openvino_available() - and weight.backend != TensorBackend.torch - # and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) - ) + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: - log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") + log_once(logging.INFO, "Running time may be improved after installing OpenVINO") # When reduction axes are not provided, assuming that the weights are already reshaped if config.group_size != -1 and reduction_axes is not None: @@ -466,15 +461,15 @@ def do_int_quantization( if not accelerate_through_ov: # Reference implementation - if weight.backend == TensorBackend.ov: weight = weight.to_backend(TensorBackend.numpy) - if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) scale, zero_point = None, None if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): + if reduction_axes is None: + raise ValueError("Reduction axes are required for computing the scale and (zero point) vectors.") scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config) if precomputed_scale is not None: scale = precomputed_scale @@ -505,11 +500,6 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) - # ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) - # ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) - # ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) - # ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) - model = get_compress_weight_model( ov_model_params, config, @@ -553,25 +543,29 @@ def calculate_quantized_dequantized_weight( return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: - # import os - accelerate_through_ov = ( - is_openvino_available() - and weight.backend != TensorBackend.torch - # and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) - ) + """ + First quantizes the given weight tensor and then dequantizes it back to obtain float32 values. + :param weight: The weight tensor to quantize-dequantize. + :param config: Compression configuration. + :param reduction_axes: Axes along which to reduce (collect) statistics (e.g., min, max). Not required if + precomputed scale (and zero point) are provided. + :param precomputed_scale: Optional precomputed scale tensor. + :param precomputed_zero_point: Optional precomputed zero point tensor. + :param return_compressed_weight: If True, besides decompressed weight will also return compressed weight, scale, + (and zero point). + :param ov_model_params: OpenVINO model parameters for acceleration. + :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight, scale, + (and zero point). 
+ """ + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") if not accelerate_through_ov: # Reference implementation - if precomputed_scale is None or (config.is_int_asym and precomputed_zero_point is None): - compressed_weight, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, precomputed_scale, precomputed_zero_point - ) - else: - scale = precomputed_scale if precomputed_scale is not None else None - zero_point = precomputed_zero_point if precomputed_zero_point is not None else None - compressed_weight = calculate_quantized_weight(weight, config, scale, zero_point) + compressed_weight, scale, zero_point = do_int_quantization( + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point + ) decompressed_weight = do_int_dequantization(compressed_weight, scale, zero_point) if return_compressed_weight: return decompressed_weight, compressed_weight, scale, zero_point diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index 385cef9ca2e..9b258e40d56 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -359,11 +359,11 @@ def calculate_scale_zero_point( :return: Scale and Zero point values. """ levels = level_high - level_low if narrow_range else level_high - level_low + 1 - scale = fns.divide((input_high - input_low), (levels - 1)).astype(TensorDataType.float32) + scale = fns.inverted_divide((input_high - input_low), (levels - 1)).astype(TensorDataType.float32) eps = fns.finfo(scale).eps # NOTE: adding machine epsilon to avoid division by zero scale = fns.where(fns.abs(scale) < eps, eps, scale) expected_level_low = level_low + 1 if narrow_range else level_low - zero_point = expected_level_low - fns.round(fns.divide(input_low, scale)) + zero_point = expected_level_low - fns.round(fns.inverted_divide(input_low, scale)) zero_point = fns.clip(zero_point.astype(TensorDataType.int32), level_low, level_high) return scale, zero_point diff --git a/nncf/tensor/functions/__init__.py b/nncf/tensor/functions/__init__.py index 52bc666dfa3..bacd09ee2bf 100644 --- a/nncf/tensor/functions/__init__.py +++ b/nncf/tensor/functions/__init__.py @@ -24,14 +24,14 @@ from nncf.tensor.functions.numeric import count_nonzero as count_nonzero from nncf.tensor.functions.numeric import device as device from nncf.tensor.functions.numeric import diag as diag -from nncf.tensor.functions.numeric import divide as divide from nncf.tensor.functions.numeric import dtype as dtype from nncf.tensor.functions.numeric import expand_dims as expand_dims from nncf.tensor.functions.numeric import eye as eye from nncf.tensor.functions.numeric import finfo as finfo from nncf.tensor.functions.numeric import flatten as flatten from nncf.tensor.functions.numeric import from_numpy as from_numpy -from nncf.tensor.functions.numeric import inplace_divide as inplace_divide +from nncf.tensor.functions.numeric import inplace_inverted_divide as inplace_inverted_divide +from nncf.tensor.functions.numeric import inverted_divide as inverted_divide from nncf.tensor.functions.numeric import isclose as isclose from nncf.tensor.functions.numeric import isempty as isempty from nncf.tensor.functions.numeric import item as item diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index c6276a5e22f..9ce0876f191 100644 --- 
a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -910,12 +910,18 @@ def ceil(a: Tensor) -> Tensor: @functools.singledispatch @tensor_guard def to_backend(a: Tensor, b: TensorBackend) -> Tensor: + """ + Change backend of the tensor to the given one. + :param a: Tensor to change backend for. + :param b: Target backend to change to. + :return: Tensor in the target backend. + """ return Tensor(to_backend(a.data, b)) @functools.singledispatch @tensor_guard -def divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> Tensor: +def inverted_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> Tensor: """ Divide two tensors or a tensor and a float. This function divides `a` by `b`. If `invert` is True, it performs the division as `a * (1.0 / b)`. @@ -931,7 +937,7 @@ def divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bo @functools.singledispatch @tensor_guard -def inplace_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> None: +def inplace_inverted_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> None: """ In-place division of two tensors or a tensor and a float. This function divides `a` by `b` in place. If `invert` is True, it performs the division as `a *= (1.0 / b)`. diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index 96bb441e45f..a316d76ac43 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -55,6 +55,7 @@ def _(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: TensorDataType.int4, TensorDataType.uint4, ]: + # Cannot cast to/from bfloat16, uint4, int4 directly return _astype_ov(a, dtype) return ov.Tensor(a.data.astype(DTYPE_MAP_NP[dtype])) @@ -104,6 +105,12 @@ def _(a: ov.Tensor, b: TensorBackend) -> Union[np.ndarray, ov.Tensor]: def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: + """ + Cast to a different data type using an OpenVINO model. + :param a: Tensor to cast. + :param dtype: Data type to cast to. + :return: Casted openvino tensor. 
+ """ from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py index 4a6f11654c2..b4bb991d592 100644 --- a/tests/openvino/native/test_openvino_modeling.py +++ b/tests/openvino/native/test_openvino_modeling.py @@ -15,10 +15,10 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.openvino_modeling import OV_MODEL_CACHE from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.openvino_modeling import _infer_ov_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_astype_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_decompress_weight_model from nncf.quantization.algorithms.weight_compression.openvino_modeling import get_compress_weight_model -from nncf.quantization.algorithms.weight_compression.openvino_modeling import run_model from nncf.tensor import Tensor from nncf.tensor import TensorDataType from nncf.tensor.definitions import TensorBackend @@ -243,7 +243,7 @@ def test_release_memory(mocker, release_memory): input_tensor.data = [1, 2, 3] inputs = [input_tensor] - run_model(ov_model_params, compiled_model, return_ov_tensors=False, inputs=inputs) + _infer_ov_model(ov_model_params, compiled_model, inputs=inputs) if release_memory: compiled_model.release_memory.assert_called_once() else: @@ -285,7 +285,7 @@ def test_share_inputs_outputs(mocker, share_inputs, share_outputs, return_ov_ten input_tensor.data = [1, 2, 3] inputs = [input_tensor] - run_model(ov_model_params, compiled_model, return_ov_tensors=return_ov_tensors, inputs=inputs) + _infer_ov_model(ov_model_params, compiled_model, inputs=inputs) if return_ov_tensors: infer_request.infer.assert_called_once_with( From 73f61fca98e69017803a79394e1a267bd17499b0 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 14:46:42 +0100 Subject: [PATCH 45/73] Remove inverted NP division. Add non-convertable OV division. 
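The rationale, roughly: in float32 arithmetic a / b and a * (1.0 / b) round differently for some inputs, so computing the division through a reciprocal (as the removed NNCF helper did explicitly, and as a Divide node may be rewritten unless marked non-convertable) can introduce small mismatches against the reference computation. A tiny illustrative check, not part of this patch and with arbitrary test data:

import numpy as np

rng = np.random.default_rng(0)
a = rng.random(100_000, dtype=np.float32)
b = rng.random(100_000, dtype=np.float32) + np.float32(0.5)  # keep divisors away from zero

direct = a / b                                     # one rounding step
via_reciprocal = a * (np.float32(1.0) / b)         # reciprocal is rounded first, then the product
print(np.count_nonzero(direct != via_reciprocal))  # typically non-zero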
--- .../weight_compression/openvino_modeling.py | 18 +++++++--- .../weight_compression/weight_lowering.py | 6 ++-- nncf/quantization/fake_quantize.py | 4 +-- nncf/tensor/functions/__init__.py | 2 -- nncf/tensor/functions/numeric.py | 35 ------------------- 5 files changed, 19 insertions(+), 46 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 69abd5309b1..d4ed33e6f73 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -343,7 +343,7 @@ def _build_compress_model( min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) levels = level_high - level_low + 1 - scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32) + scale = _non_convertable_divide(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) else: w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True)) @@ -351,7 +351,7 @@ def _build_compress_model( w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.negative(w_max)) - scale /= opset.constant(-level_low, ov.Type.f32) + scale = _non_convertable_divide(scale, opset.constant(-level_low, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) zero_point = None @@ -367,11 +367,12 @@ def _build_compress_model( # [a1, r, a2] -> [a1, 1, a2] min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) min_values = opset.convert(min_values, ov.Type.f32) - zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale) + scaled_min_values = _non_convertable_divide(min_values, scale) + zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(scaled_min_values) zero_point = opset.clamp(zero_point, level_low, level_high) weight = convert_if_needed(weight, ov.Type.f32) - compressed_weight = weight / scale + compressed_weight = _non_convertable_divide(weight, scale) if is_int_asym: compressed_weight += zero_point @@ -488,3 +489,12 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> compiled_model = ov.compile_model(model, device_name="CPU") return partial(_infer_ov_model, ov_model_params, compiled_model) + + +def _non_convertable_divide(a: Node, b: Node) -> Node: + """ + Creates a "non-convertable" divide operation. It won't be converted to a*(1/b). 
+ """ + divide_node = a / b + divide_node.get_rt_info()["nonconvertable_divide_0"] = True + return divide_node diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 339154ffa52..263d457a3e3 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -158,7 +158,7 @@ def calculate_signed_scale(weight: Tensor, reduction_axes: ReductionAxes, num_bi w_max = fns.max(weight, axis=reduction_axes, keepdims=True) scale = fns.where(w_abs_min >= w_max, w_abs_min, -w_max) - fns.inplace_inverted_divide(scale, level_high) + scale /= level_high eps = fns.finfo(scale).eps scale = fns.where(fns.abs(scale) < eps, eps, scale) @@ -179,7 +179,7 @@ def calculate_normalized_weight(weight: Tensor, scale: Tensor) -> Tensor: if scale.dtype != TensorDataType.float32: scale = scale.astype(TensorDataType.float32) - return fns.inverted_divide(weight, scale) + return weight / scale def do_nf4_quantization(weight: Tensor, scale: Tensor, is_normalized_weight: bool = False) -> Tensor: @@ -312,7 +312,7 @@ def calculate_quantized_weight( level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - compressed_weights = fns.inverted_divide(weight, scale) + compressed_weights = weight / scale if zero_point is not None: compressed_weights += zero_point.astype(weight.dtype) compressed_weights = fns.round(compressed_weights) diff --git a/nncf/quantization/fake_quantize.py b/nncf/quantization/fake_quantize.py index 9b258e40d56..cd72bd5ce4f 100644 --- a/nncf/quantization/fake_quantize.py +++ b/nncf/quantization/fake_quantize.py @@ -359,11 +359,11 @@ def calculate_scale_zero_point( :return: Scale and Zero point values. 
""" levels = level_high - level_low if narrow_range else level_high - level_low + 1 - scale = fns.inverted_divide((input_high - input_low), (levels - 1)).astype(TensorDataType.float32) + scale = ((input_high - input_low) / (levels - 1)).astype(TensorDataType.float32) eps = fns.finfo(scale).eps # NOTE: adding machine epsilon to avoid division by zero scale = fns.where(fns.abs(scale) < eps, eps, scale) expected_level_low = level_low + 1 if narrow_range else level_low - zero_point = expected_level_low - fns.round(fns.inverted_divide(input_low, scale)) + zero_point = expected_level_low - fns.round(input_low / scale) zero_point = fns.clip(zero_point.astype(TensorDataType.int32), level_low, level_high) return scale, zero_point diff --git a/nncf/tensor/functions/__init__.py b/nncf/tensor/functions/__init__.py index bacd09ee2bf..9affab79c90 100644 --- a/nncf/tensor/functions/__init__.py +++ b/nncf/tensor/functions/__init__.py @@ -30,8 +30,6 @@ from nncf.tensor.functions.numeric import finfo as finfo from nncf.tensor.functions.numeric import flatten as flatten from nncf.tensor.functions.numeric import from_numpy as from_numpy -from nncf.tensor.functions.numeric import inplace_inverted_divide as inplace_inverted_divide -from nncf.tensor.functions.numeric import inverted_divide as inverted_divide from nncf.tensor.functions.numeric import isclose as isclose from nncf.tensor.functions.numeric import isempty as isempty from nncf.tensor.functions.numeric import item as item diff --git a/nncf/tensor/functions/numeric.py b/nncf/tensor/functions/numeric.py index 9ce0876f191..4d73549f9b4 100644 --- a/nncf/tensor/functions/numeric.py +++ b/nncf/tensor/functions/numeric.py @@ -917,38 +917,3 @@ def to_backend(a: Tensor, b: TensorBackend) -> Tensor: :return: Tensor in the target backend. """ return Tensor(to_backend(a.data, b)) - - -@functools.singledispatch -@tensor_guard -def inverted_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> Tensor: - """ - Divide two tensors or a tensor and a float. - This function divides `a` by `b`. If `invert` is True, it performs the division as `a * (1.0 / b)`. - Otherwise, it performs the division as `a / b`. - :param a: The first input tensor or float. - :param b: The second input tensor or float. - :param invert: If True, the division is performed as `a * (1.0 / b)`. If False, it is performed as `a / b`. - Defaults to True. - :return: A new tensor resulting from the division. - """ - return Tensor(a * (1.0 / b) if invert else a / b) - - -@functools.singledispatch -@tensor_guard -def inplace_inverted_divide(a: Union[Tensor, float], b: Union[Tensor, float], invert: Optional[bool] = True) -> None: - """ - In-place division of two tensors or a tensor and a float. - This function divides `a` by `b` in place. If `invert` is True, it performs the division as `a *= (1.0 / b)`. - Otherwise, it performs the division as `a /= b`. - :param a: The first input tensor or float. - :param b: The second input tensor or float. - :param invert: If True, the division is performed as `a *= (1.0 / b)`. If False, the division it is as `a /= b`. - Defaults to True. - :return: None. The operation is performed in place. 
- """ - if invert: - a *= 1.0 / b - else: - a /= b From cd884ebba8df966228ca06372985812ebb1dd462 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 14:54:51 +0100 Subject: [PATCH 46/73] Remove OV 2024.5 RC installation --- .github/workflows/precommit.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/precommit.yml b/.github/workflows/precommit.yml index b084db6ae23..6dd6bff293d 100644 --- a/.github/workflows/precommit.yml +++ b/.github/workflows/precommit.yml @@ -64,8 +64,6 @@ jobs: cache: pip - name: Install NNCF and test requirements run: make install-openvino-test - - name: Install OpenVINO 2024.5 - run: pip install -U --pre openvino==2024.5.0rc2 openvino-tokenizers==2024.5.0rc2 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Print installed modules run: pip list - name: Run OV precommit test scope From 608cfe9f105d1a9cc99c40dcd71737b323fbe254 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:08:29 +0100 Subject: [PATCH 47/73] Add a test for non-convertable division --- nncf/openvino/graph/node_utils.py | 9 +++++++++ .../weight_compression/openvino_modeling.py | 18 +++++------------- tests/openvino/native/test_node_utils.py | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index 67bf9143cd4..a34f3c9d785 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -645,3 +645,12 @@ def convert_if_needed(node: ov.Node, target_dtype: ov.Type) -> ov.Node: if node.get_element_type() == target_dtype: return node return opset.convert(node, target_dtype) + + +def non_convertable_divide(a: ov.Node, b: ov.Node) -> ov.Node: + """ + Creates a "non-convertable" divide operation. It won't be converted to a*(1/b). 
+ """ + divide_node = a / b + divide_node.get_rt_info()["nonconvertable_divide_0"] = True + return divide_node diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index d4ed33e6f73..54bb083a711 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -23,6 +23,7 @@ from nncf.common.utils.decorators import ResultsCacheContainer from nncf.common.utils.decorators import cache_results from nncf.openvino.graph.node_utils import convert_if_needed +from nncf.openvino.graph.node_utils import non_convertable_divide from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.tensor import Tensor from nncf.tensor import TensorDataType @@ -343,7 +344,7 @@ def _build_compress_model( min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) levels = level_high - level_low + 1 - scale = _non_convertable_divide(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) + scale = non_convertable_divide(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) else: w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True)) @@ -351,7 +352,7 @@ def _build_compress_model( w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.negative(w_max)) - scale = _non_convertable_divide(scale, opset.constant(-level_low, ov.Type.f32)) + scale = non_convertable_divide(scale, opset.constant(-level_low, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) zero_point = None @@ -367,12 +368,12 @@ def _build_compress_model( # [a1, r, a2] -> [a1, 1, a2] min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) min_values = opset.convert(min_values, ov.Type.f32) - scaled_min_values = _non_convertable_divide(min_values, scale) + scaled_min_values = non_convertable_divide(min_values, scale) zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(scaled_min_values) zero_point = opset.clamp(zero_point, level_low, level_high) weight = convert_if_needed(weight, ov.Type.f32) - compressed_weight = _non_convertable_divide(weight, scale) + compressed_weight = non_convertable_divide(weight, scale) if is_int_asym: compressed_weight += zero_point @@ -489,12 +490,3 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> compiled_model = ov.compile_model(model, device_name="CPU") return partial(_infer_ov_model, ov_model_params, compiled_model) - - -def _non_convertable_divide(a: Node, b: Node) -> Node: - """ - Creates a "non-convertable" divide operation. It won't be converted to a*(1/b). 
- """ - divide_node = a / b - divide_node.get_rt_info()["nonconvertable_divide_0"] = True - return divide_node diff --git a/tests/openvino/native/test_node_utils.py b/tests/openvino/native/test_node_utils.py index 241b9e6f156..dc09cda77e5 100644 --- a/tests/openvino/native/test_node_utils.py +++ b/tests/openvino/native/test_node_utils.py @@ -22,6 +22,7 @@ from nncf.openvino.graph.node_utils import get_weight_channel_axes from nncf.openvino.graph.node_utils import get_weighted_layer_attributes from nncf.openvino.graph.node_utils import is_node_with_bias +from nncf.openvino.graph.node_utils import non_convertable_divide from tests.openvino.native.models import ConvModel from tests.openvino.native.models import ConvNotBiasModel from tests.openvino.native.models import MatMul2DModel @@ -147,3 +148,21 @@ def test_get_weight_channel_axes_for_matmul(weights_port_id, transpose, shape, d assert len(actual_channel_axes) == len(expected_channel_axes) assert all(a == b for a, b in zip(actual_channel_axes, expected_channel_axes)) + + +@pytest.mark.parametrize( + "a,b,convertable,ref_result", + [ + (0.058599039912223816, 15, True, 0.003906603), + (0.058599039912223816, 15, False, 0.003906602505594492), + ], +) +def test_non_convertable_division(a, b, convertable, ref_result): + a, b, ref_result = tuple(map(lambda x: np.array([x], np.float32), [a, b, ref_result])) + a_param = opset.parameter((-1,), ov.Type.f32) + b_param = opset.parameter((-1,), ov.Type.f32) + division = (a_param / b_param) if convertable else non_convertable_divide(a_param, b_param) + model = ov.Model([division], [a_param, b_param]) + compiled_model = ov.compile_model(model, device_name="CPU") + actual_result = compiled_model([a, b])[0] + np.testing.assert_allclose(actual_result, ref_result, atol=0, rtol=0) From 9569e1e41cc8eefb274297cde3c7196d9de6a798 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:11:47 +0100 Subject: [PATCH 48/73] Make the test more strict --- .../native/quantization/test_ov_modeling_compression.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 3e09714cae0..9a319aba742 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -202,9 +202,9 @@ def test_quantization_alignment( assert scale.backend == TensorBackend.numpy if precompute_s_zp: # In case of precomputed scale or zero point, the returned scale and z.p. should equal the given ones - np.testing.assert_allclose(precomputed_scale.data, scale.data) + np.testing.assert_allclose(precomputed_scale.data, scale.data, atol=0, rtol=0) if config.is_int_asym: - np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data) + np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data, atol=0, rtol=0) if ( quantization_task == QuantizationTask.Q @@ -274,7 +274,7 @@ def test_quantization_alignment( # Check that the computed tensors are equal between implementations np.testing.assert_allclose( - numpy_result.data, ov_result.data, atol=atol, err_msg=f"Results do not align for {key}." + ov_result.data, numpy_result.data, atol=atol, rtol=0, err_msg=f"Results do not align for {key}." 
) if max_misalignment_frequency is not None: From f962bd1ef26ed154cc2ca40898f0eb69abb0eff6 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:23:06 +0100 Subject: [PATCH 49/73] Remove unnecessary lines --- .../algorithms/weight_compression/openvino_modeling.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 54bb083a711..c131161a945 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -364,10 +364,6 @@ def _build_compress_model( zero_point = convert_if_needed(zero_point, ov.Type.f32) elif is_int_asym: # Compute zero point - if min_values is None: - # [a1, r, a2] -> [a1, 1, a2] - min_values = opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True) - min_values = opset.convert(min_values, ov.Type.f32) scaled_min_values = non_convertable_divide(min_values, scale) zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(scaled_min_values) zero_point = opset.clamp(zero_point, level_low, level_high) From 5dcd83df999f462608c15f24b6adcaf1dab5867a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:30:35 +0100 Subject: [PATCH 50/73] Update get_integer_quantization_error implementation --- .../algorithms/weight_compression/weight_lowering.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 263d457a3e3..9e80e1e95de 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -340,8 +340,7 @@ def get_integer_quantization_error( if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) - compressed_weights, scale, zero_point = do_int_quantization(weight, config, reduction_axes) - decompressed_weight = do_int_dequantization(compressed_weights, scale, zero_point) + decompressed_weight = calculate_quantized_dequantized_weight(weight, config, reduction_axes) decompressed_weight = decompressed_weight.reshape(orig_shape) diff = (decompressed_weight - weight) ** 2 From 6e22ef5e7af9ab2720a71ff4755a09bcd3090efb Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:44:02 +0100 Subject: [PATCH 51/73] Remove unnecessary convert --- .../algorithms/weight_compression/openvino_backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 49e842f72d5..4a944553abe 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -243,7 +243,6 @@ def _create_compression_subgraph( compressed_const = self._create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name ) - compressed_const = convert_if_needed(compressed_const, compression_dtype) converted_const = opset.convert(compressed_const, ov.Type.f16) if compressed_weight.zero_point is not None: From b45e7889a0f6e16a7a6c836e1571e884710f30d3 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:46:53 +0100 Subject: [PATCH 52/73] Move create_ov_const_from_tensor to 
node_utils --- nncf/openvino/graph/node_utils.py | 19 ++++++++++++++ .../weight_compression/openvino_backend.py | 26 +++---------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/nncf/openvino/graph/node_utils.py b/nncf/openvino/graph/node_utils.py index a34f3c9d785..80432c7fc7e 100644 --- a/nncf/openvino/graph/node_utils.py +++ b/nncf/openvino/graph/node_utils.py @@ -14,6 +14,7 @@ import numpy as np import openvino.runtime as ov import openvino.runtime.opset13 as opset +from openvino._pyopenvino.op import Constant import nncf from nncf.common.graph.graph import NNCFGraph @@ -41,6 +42,8 @@ from nncf.openvino.graph.metatypes.openvino_metatypes import OVMatMulMetatype from nncf.openvino.graph.metatypes.openvino_metatypes import OVOpMetatype from nncf.openvino.graph.metatypes.openvino_metatypes import get_node_metatype +from nncf.tensor import Tensor +from nncf.tensor import TensorBackend InplaceInsertionFnType = Callable[[ov.Node, int, str], ov.Node] @@ -654,3 +657,19 @@ def non_convertable_divide(a: ov.Node, b: ov.Node) -> ov.Node: divide_node = a / b divide_node.get_rt_info()["nonconvertable_divide_0"] = True return divide_node + + +def create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = None) -> Constant: + """ + Create an OpenVINO Constant node from the given tensor. + :param x: Data tensor. Supports NumPy and OV tensor backends. If x backend is OV, the constant node is created + directly from underlying OV tensor. + :param dtype: Data type of the constant. + :param name: Optional name of the constant. + :return: OpenVINO Constant node. + """ + if x.backend == TensorBackend.ov: + assert x.data.get_element_type() == dtype + return opset.constant(x.data, name=name) + const = opset.constant(x.data, dtype=dtype, name=name) + return const diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 4a944553abe..0f26b1a800b 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -12,7 +12,6 @@ import openvino as ov from openvino.runtime import opset13 as opset -from openvino.runtime.op import Constant import nncf from nncf.common.graph import NNCFGraph @@ -31,6 +30,7 @@ from nncf.openvino.graph.metatypes.groups import ATOMIC_ACTIVATIONS_OPERATIONS from nncf.openvino.graph.model_transformer import OVModelTransformer from nncf.openvino.graph.node_utils import convert_if_needed +from nncf.openvino.graph.node_utils import create_ov_const_from_tensor from nncf.openvino.graph.node_utils import get_const_value from nncf.openvino.graph.node_utils import get_weight_channel_axes from nncf.openvino.graph.transformations.command_creation import OVCommandCreator @@ -240,13 +240,13 @@ def _create_compression_subgraph( original_shape = weight.shape compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points) - compressed_const = self._create_ov_const_from_tensor( + compressed_const = create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name ) converted_const = opset.convert(compressed_const, ov.Type.f16) if compressed_weight.zero_point is not None: - zero_point_const = self._create_ov_const_from_tensor( + zero_point_const = create_ov_const_from_tensor( compressed_weight.zero_point, compression_dtype, name=f"{const_node_name}/zero_point" ) zero_point_const = 
opset.convert(zero_point_const, ov.Type.f16) @@ -254,9 +254,7 @@ def _create_compression_subgraph( converted_const, zero_point_const, name=f"{const_node_name}/zero_point/subtract" ) - scale_const = self._create_ov_const_from_tensor( - compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale" - ) + scale_const = create_ov_const_from_tensor(compressed_weight.scale, scale_dtype, name=f"{const_node_name}/scale") scale_const = convert_if_needed(scale_const, ov.Type.f16) mul = opset.multiply( @@ -347,22 +345,6 @@ def dump_parameters( ) -> None: dump_parameters(model, parameters, algo_name, path) - @staticmethod - def _create_ov_const_from_tensor(x: Tensor, dtype: ov.Type, name: Optional[str] = None) -> Constant: - """ - Create an OpenVINO Constant node from the given tensor. - :param x: Data tensor. Supports NumPy and OV tensor backends. If x backend is OV, the constant node is created - directly from underlying OV tensor. - :param dtype: Data type of the constant. - :param name: Optional name of the constant. - :return: OpenVINO Constant node. - """ - if x.backend == TensorBackend.ov: - assert x.data.get_element_type() == dtype - return opset.constant(x.data, name=name) - const = opset.constant(x.data, dtype=dtype, name=name) - return const - class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): @staticmethod From b2cebd0d7a52c70a5b5ffd490c899c0e1d3087fa Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 15:57:03 +0100 Subject: [PATCH 53/73] Separate checking logic into standalone methods --- .../test_ov_modeling_compression.py | 103 +++++++++++------- 1 file changed, 66 insertions(+), 37 deletions(-) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 9a319aba742..3d7f9d3b4c1 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -196,42 +196,11 @@ def test_quantization_alignment( else: mock.assert_called_once() - if quantization_task != QuantizationTask.Q_DQ: - # Scale should always be float32 and numpy backend - assert scale.dtype == TensorDataType.float32 - assert scale.backend == TensorBackend.numpy - if precompute_s_zp: - # In case of precomputed scale or zero point, the returned scale and z.p. 
should equal the given ones - np.testing.assert_allclose(precomputed_scale.data, scale.data, atol=0, rtol=0) - if config.is_int_asym: - np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data, atol=0, rtol=0) - - if ( - quantization_task == QuantizationTask.Q - and cb == ComputationBackend.OV - and weight_tensor_backend == TensorBackend.ov - and config.num_bits == 4 - ): - # For 4 bit compression in case of ov implementation and ov backend the compressed weight and the computed - # zero point must be in ov backend and have (u)int4 dtype in order to be able to insert them into OV model - # without re-packing - assert compressed_weight.backend == TensorBackend.ov - assert compressed_weight.dtype == (TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4) - if config.is_int_asym and not precompute_s_zp: - assert zero_point.backend == TensorBackend.ov - assert zero_point.dtype == TensorDataType.uint4 - else: - if quantization_task != QuantizationTask.Q_DQ: - # Otherwise compressed weight and zero point must be returned in numpy backend, compressed weight must - # be of (u)int8 data type, zero point -- in int32 - assert compressed_weight.backend == TensorBackend.numpy - assert compressed_weight.dtype == (TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8) - if config.is_int_asym and not precompute_s_zp: - assert zero_point.backend == TensorBackend.numpy - assert zero_point.dtype == TensorDataType.int32 - if quantization_task != QuantizationTask.Q: - assert decompressed_weight.backend == TensorBackend.numpy - assert decompressed_weight.dtype == TensorDataType.float32 + if quantization_task != QuantizationTask.Q_DQ and precompute_s_zp: + # In case of precomputed scale or zero point, the returned scale and z.p. 
should equal the given ones + np.testing.assert_allclose(precomputed_scale.data, scale.data, atol=0, rtol=0) + if config.is_int_asym: + np.testing.assert_allclose(precomputed_zero_point.data, zero_point.data, atol=0, rtol=0) # Save results for comparison between implementations if quantization_task != QuantizationTask.Q: @@ -242,6 +211,66 @@ def test_quantization_alignment( if config.is_int_asym: results[cb]["zero_point"] = zero_point.to_backend(TensorBackend.numpy) + _check_backends_and_dtypes( + quantization_task, + cb, + weight_tensor_backend, + config, + precompute_s_zp, + compressed_weight, + scale, + zero_point, + decompressed_weight, + ) + + _check_values(static_shapes, config, precompute_s_zp, dtype, results, precomputed_scale, weight_shape) + + +def _check_backends_and_dtypes( + quantization_task, + cb, + weight_tensor_backend, + config, + precompute_s_zp, + compressed_weight, + scale, + zero_point, + decompressed_weight, +): + if quantization_task != QuantizationTask.Q_DQ: + # Scale should always be float32 and numpy backend + assert scale.dtype == TensorDataType.float32 + assert scale.backend == TensorBackend.numpy + + if ( + quantization_task == QuantizationTask.Q + and cb == ComputationBackend.OV + and weight_tensor_backend == TensorBackend.ov + and config.num_bits == 4 + ): + # For 4 bit compression in case of ov implementation and ov backend the compressed weight and the computed + # zero point must be in ov backend and have (u)int4 dtype in order to be able to insert them into OV model + # without re-packing + assert compressed_weight.backend == TensorBackend.ov + assert compressed_weight.dtype == (TensorDataType.uint4 if config.is_int_asym else TensorDataType.int4) + if config.is_int_asym and not precompute_s_zp: + assert zero_point.backend == TensorBackend.ov + assert zero_point.dtype == TensorDataType.uint4 + else: + if quantization_task != QuantizationTask.Q_DQ: + # Otherwise compressed weight and zero point must be returned in numpy backend, compressed weight must + # be of (u)int8 data type, zero point -- in int32 + assert compressed_weight.backend == TensorBackend.numpy + assert compressed_weight.dtype == (TensorDataType.uint8 if config.is_int_asym else TensorDataType.int8) + if config.is_int_asym and not precompute_s_zp: + assert zero_point.backend == TensorBackend.numpy + assert zero_point.dtype == TensorDataType.int32 + if quantization_task != QuantizationTask.Q: + assert decompressed_weight.backend == TensorBackend.numpy + assert decompressed_weight.dtype == TensorDataType.float32 + + +def _check_values(static_shapes, config, precompute_s_zp, dtype, results, precomputed_scale, weight_shape): keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) for key in keys: numpy_result = results[ComputationBackend.NumPy][key] @@ -250,7 +279,7 @@ def test_quantization_alignment( atol = 0 scale = None # For static-shaped OV models doing asymmetric compression there maybe misalignments between OV and NumPy - # For more details see 156511 + # For more details see ticket 156511 if static_shapes and config.is_int_asym: if key == "compressed_weight": atol = MAX_MISALIGNMENT_MAGNITUDE From 3a7114121e20c0c129d674082cc4f47ea5ed972b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 17:31:30 +0100 Subject: [PATCH 54/73] Add debug conditions --- .../algorithms/weight_compression/openvino_backend.py | 5 +++-- .../algorithms/weight_compression/weight_lowering.py | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git 
a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 0f26b1a800b..018ff64985c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -285,7 +285,8 @@ def transform_model( const_node = self.name_to_node_mapping[const_node_name] const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() - weight = get_const_value(const_node, cast_bf16_to_fp32=False) + import os + weight = get_const_value(const_node, cast_bf16_to_fp32=False or bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))) # Creation of ov.Tensor is required for two reasons: # 1. To be able to process BF16 weight properly # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed @@ -335,7 +336,7 @@ def transform_model( self.name_to_node_mapping = None # clear openvino model cache - OV_MODEL_CACHE.clear() + # OV_MODEL_CACHE.clear() return model diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 9e80e1e95de..99f3053fbe1 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -449,7 +449,8 @@ def do_int_quantization( "for asymmetric quantization." ) - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch + import os + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -499,6 +500,11 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) + ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) + ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) + ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) + ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) + model = get_compress_weight_model( ov_model_params, config, From eeadf1d1b856b3e3039aeff2a5f62f8b952e8717 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 10:40:48 +0100 Subject: [PATCH 55/73] Move ov model cache clearing to ov backend destructor --- .../algorithms/weight_compression/openvino_backend.py | 8 ++++---- .../algorithms/weight_compression/openvino_modeling.py | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 018ff64985c..1e302dcfa6a 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -49,7 +49,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm -from nncf.quantization.algorithms.weight_compression.openvino_modeling import 
OV_MODEL_CACHE +from nncf.quantization.algorithms.weight_compression.openvino_modeling import clear_ov_model_cache from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor from nncf.tensor.definitions import TensorBackend @@ -335,9 +335,6 @@ def transform_model( # reset name_to_node_mapping self.name_to_node_mapping = None - # clear openvino model cache - # OV_MODEL_CACHE.clear() - return model @staticmethod @@ -346,6 +343,9 @@ def dump_parameters( ) -> None: dump_parameters(model, parameters, algo_name, path) + def __del__(self): + clear_ov_model_cache() + class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): @staticmethod diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index c131161a945..fa08fd5f7b9 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -116,6 +116,10 @@ def __hash__(self): ModelAsNodes = Tuple[List[Parameter], List[Node], OVModelParameters] +def clear_ov_model_cache(): + OV_MODEL_CACHE.clear() + + def _infer_ov_model( ov_model_params: OVModelParameters, compiled_model: ov.CompiledModel, inputs: TensorList ) -> TensorList: From 40aef547f5384f26dfbb534893bda41a7c10ea2b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 10:52:06 +0100 Subject: [PATCH 56/73] Update default ov model parameters --- .../weight_compression/openvino_backend.py | 10 +++++++++- .../weight_compression/openvino_modeling.py | 2 +- .../weight_compression/weight_lowering.py | 4 +++- nncf/tensor/functions/ov.py | 17 ++++------------- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 1e302dcfa6a..5272fcf166f 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -49,6 +49,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters from nncf.quantization.algorithms.weight_compression.openvino_modeling import clear_ov_model_cache from nncf.quantization.algorithms.weight_compression.weight_lowering import compress_weight from nncf.tensor import Tensor @@ -238,7 +239,14 @@ def _create_compression_subgraph( raise nncf.ParameterNotSupportedError(f"{compression_config.mode.value} is not supported.") original_shape = weight.shape - compressed_weight = compress_weight(weight, reduction_axes, compression_config, layer_scales, layer_zero_points) + compressed_weight = compress_weight( + weight, + reduction_axes, + compression_config, + layer_scales, + layer_zero_points, + OVModelParameters(recompile=True, release_memory=False), + ) compressed_const = create_ov_const_from_tensor( compressed_weight.tensor, compression_dtype, name=const_node_name diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index fa08fd5f7b9..e90d1716f49 
100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -46,7 +46,7 @@ def __init__( self, input_dtypes: Optional[Dict[str, TensorDataType]] = None, output_dtypes: Optional[Dict[str, TensorDataType]] = None, - dynamic_shapes: bool = False, + dynamic_shapes: bool = True, recompile: bool = False, release_memory: bool = True, share_inputs: bool = True, diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 99f3053fbe1..999888462c3 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -355,6 +355,7 @@ def compress_weight( config: WeightCompressionConfig, precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, + ov_model_params: Optional = None, ): """ Compress weight using compression configuration. @@ -364,6 +365,7 @@ def compress_weight( :param config: Compression configuration. :param precomputed_scale: Precomputed scale. :param precomputed_zero_point: Precomputed zero point. + :param ov_model_params: OpenVINO model parameters for acceleration. :return: The compressed weight and decompression parameters as instance of CompressedWeight """ if not config.is_integer: @@ -375,7 +377,7 @@ def compress_weight( ) return CompressedWeight(compressed_weight, scale) compressed_weight, scale, zero_point = do_int_quantization( - weight, config, reduction_axes, precomputed_scale, precomputed_zero_point + weight, config, reduction_axes, precomputed_scale, precomputed_zero_point, ov_model_params ) return CompressedWeight(compressed_weight, scale, zero_point) diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index a316d76ac43..9a2d43d79d3 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -116,17 +116,8 @@ def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: a_dtype = DTYPE_MAP_REV[a.get_element_type()] - model = get_astype_model( - OVModelParameters( - input_dtypes={"input": a_dtype}, - output_dtypes={"output": dtype}, - dynamic_shapes=False, - recompile=True, - release_memory=True, - share_inputs=True, - share_outputs=True, - return_ov_tensors=True, - ), - tuple(a.shape), - ) + ov_model_params = OVModelParameters(recompile=True, release_memory=False, return_ov_tensors=True) + ov_model_params.input_dtypes = {"input": a_dtype} + ov_model_params.output_dtypes = {"output": dtype} + model = get_astype_model(ov_model_params, tuple(a.shape)) return model([Tensor(a)])[0].data From ab3d35f0dc5a5a174c46f7ef63ac7948c7ca4c62 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 11:04:55 +0100 Subject: [PATCH 57/73] Revert debug logic --- .../algorithms/weight_compression/openvino_backend.py | 3 +-- .../algorithms/weight_compression/weight_lowering.py | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5272fcf166f..0eaa6b72532 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -293,8 +293,7 @@ def transform_model( const_node = self.name_to_node_mapping[const_node_name] const_node_output = const_node.output(0) const_dtype = 
const_node_output.get_element_type() - import os - weight = get_const_value(const_node, cast_bf16_to_fp32=False or bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))) + weight = get_const_value(const_node, cast_bf16_to_fp32=False) # Creation of ov.Tensor is required for two reasons: # 1. To be able to process BF16 weight properly # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 999888462c3..9de76e5ce71 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -451,8 +451,7 @@ def do_int_quantization( "for asymmetric quantization." ) - import os - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -502,11 +501,6 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) - ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) - ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) - ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) - ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) - model = get_compress_weight_model( ov_model_params, config, From d48c748bcb2f53d481d20751dc4693c581c916f8 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 15:59:51 +0100 Subject: [PATCH 58/73] Update reference --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 6c48904c91a..683dc62f401 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -23,7 +23,7 @@ tinyllama_int8_data_free_backend_TORCH: num_int4: 0 num_int8: 312 tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV: - metric_value: 0.86503 + metric_value: 0.88669 num_int4: 94 num_int8: 124 metrics_xfail_reason: "Issue-148819" From 9a56fae2692fc22bb6e25f74a1c5f00dfc078e86 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 11 Dec 2024 17:31:30 +0100 Subject: [PATCH 59/73] Add debug conditions --- .../algorithms/weight_compression/openvino_backend.py | 3 ++- .../algorithms/weight_compression/weight_lowering.py | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 0eaa6b72532..5272fcf166f 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -293,7 +293,8 @@ def transform_model( const_node = self.name_to_node_mapping[const_node_name] const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() - weight = get_const_value(const_node, cast_bf16_to_fp32=False) + import os + weight = 
get_const_value(const_node, cast_bf16_to_fp32=False or bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))) # Creation of ov.Tensor is required for two reasons: # 1. To be able to process BF16 weight properly # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 9de76e5ce71..999888462c3 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -451,7 +451,8 @@ def do_int_quantization( "for asymmetric quantization." ) - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch + import os + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -501,6 +502,11 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) + ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) + ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) + ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) + ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) + model = get_compress_weight_model( ov_model_params, config, From e10d806de2f20200f8bcddbf94220aadbfa1aba3 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 20:15:16 +0100 Subject: [PATCH 60/73] Disable dynamic shapes by default --- .../algorithms/weight_compression/openvino_backend.py | 2 +- .../algorithms/weight_compression/openvino_modeling.py | 2 +- nncf/tensor/functions/ov.py | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 5272fcf166f..2cbb0904706 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -245,7 +245,7 @@ def _create_compression_subgraph( compression_config, layer_scales, layer_zero_points, - OVModelParameters(recompile=True, release_memory=False), + OVModelParameters(dynamic_shapes=False, recompile=True, release_memory=False), ) compressed_const = create_ov_const_from_tensor( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index e90d1716f49..fa08fd5f7b9 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -46,7 +46,7 @@ def __init__( self, input_dtypes: Optional[Dict[str, TensorDataType]] = None, output_dtypes: Optional[Dict[str, TensorDataType]] = None, - dynamic_shapes: bool = True, + dynamic_shapes: bool = False, recompile: bool = False, release_memory: bool = True, share_inputs: bool = True, diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index 9a2d43d79d3..b7eda808447 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -116,7 +116,12 @@ def _astype_ov(a: 
ov.Tensor, dtype: TensorDataType) -> ov.Tensor: a_dtype = DTYPE_MAP_REV[a.get_element_type()] - ov_model_params = OVModelParameters(recompile=True, release_memory=False, return_ov_tensors=True) + ov_model_params = OVModelParameters( + dynamic_shapes=True, + recompile=True, + release_memory=False, + return_ov_tensors=True + ) ov_model_params.input_dtypes = {"input": a_dtype} ov_model_params.output_dtypes = {"output": dtype} model = get_astype_model(ov_model_params, tuple(a.shape)) From b372dc70543a377298f5956503713b5271980360 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 20:40:10 +0100 Subject: [PATCH 61/73] Revert "Add debug conditions" This reverts commit 9a56fae2692fc22bb6e25f74a1c5f00dfc078e86. --- .../algorithms/weight_compression/openvino_backend.py | 3 +-- .../algorithms/weight_compression/weight_lowering.py | 8 +------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 2cbb0904706..1bb5ea1adcd 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -293,8 +293,7 @@ def transform_model( const_node = self.name_to_node_mapping[const_node_name] const_node_output = const_node.output(0) const_dtype = const_node_output.get_element_type() - import os - weight = get_const_value(const_node, cast_bf16_to_fp32=False or bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))) + weight = get_const_value(const_node, cast_bf16_to_fp32=False) # Creation of ov.Tensor is required for two reasons: # 1. To be able to process BF16 weight properly # 2. To indicate that it is allowed for the compressed constant to be returned as int4/uint4 if needed diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 999888462c3..9de76e5ce71 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -451,8 +451,7 @@ def do_int_quantization( "for asymmetric quantization." 
) - import os - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not bool(int(os.environ.get("NUMPY_COMPRESSION", "0"))) + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -502,11 +501,6 @@ def do_int_quantization( {"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype} ) - ov_model_params.dynamic_shapes = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0"))) - ov_model_params.recompile = bool(int(os.environ.get("RECOMPILE", "0"))) - ov_model_params.release_memory = bool(int(os.environ.get("RELEASE_MEMORY", "0"))) - ov_model_params.share_outputs = bool(int(os.environ.get("SHARE_OUTPUTS", "0"))) - model = get_compress_weight_model( ov_model_params, config, From 63858d3958929bff306c90d62197389f019f0dba Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 12 Dec 2024 21:14:04 +0100 Subject: [PATCH 62/73] Linters --- nncf/tensor/functions/ov.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index b7eda808447..f277da53d28 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -117,10 +117,7 @@ def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: a_dtype = DTYPE_MAP_REV[a.get_element_type()] ov_model_params = OVModelParameters( - dynamic_shapes=True, - recompile=True, - release_memory=False, - return_ov_tensors=True + dynamic_shapes=True, recompile=True, release_memory=False, return_ov_tensors=True ) ov_model_params.input_dtypes = {"input": a_dtype} ov_model_params.output_dtypes = {"output": dtype} From 87b5c1069d292bd9c0e170184e3b56655c42f2b2 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 13 Dec 2024 10:47:11 +0100 Subject: [PATCH 63/73] Fix lora correction --- .../algorithms/weight_compression/weight_lowering.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 9de76e5ce71..227ac16342c 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -415,7 +415,9 @@ def do_int_dequantization( original shapes. If equals to -1: weights are not reshaped, assumed not a group quantization. Default to -1. :return: dequantized/decompressed weights. 
""" - decompressed_weight = compressed_weights - zero_point if zero_point is not None else compressed_weights + decompressed_weight = ( + compressed_weights.astype(TensorDataType.int32) - zero_point if zero_point is not None else compressed_weights + ) decompressed_weight = decompressed_weight.astype(scale.dtype) * scale if reduction_axis > -1: From 7134e6d43bfd3ea9b927cbf9ced3d2ee2692a86d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 13 Dec 2024 13:46:05 +0100 Subject: [PATCH 64/73] Remove not used argument --- nncf/quantization/algorithms/weight_compression/gptq.py | 1 - .../algorithms/weight_compression/scale_estimation.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index d2178b19e91..4a5686a0ef8 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -267,7 +267,6 @@ def _quantize_weights( activations = [inp[..., (i1 + i) : (i1 + i + group_size)] for inp in inputs] wc_statistics = ScaleEstimation.activations_to_wc_statistics(activations) scale, zero_point = ScaleEstimation.calculate_quantization_params( - self._backend_entity, wc_statistics, weight_tensor[:, (i1 + i) : (i1 + i + group_size)], reduction_axes, diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index f7eff80c321..af51182a586 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -23,7 +23,6 @@ from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats -from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale @@ -146,7 +145,6 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( - self._backend_entity, stats, weight, wp.reduction_axes, @@ -161,7 +159,6 @@ def apply( @staticmethod def calculate_quantization_params( - backend_entity: WeightCompressionAlgoBackend, statistics: WCTensorStatistic, weight: Tensor, reduction_axes: Tuple[int, ...], @@ -181,7 +178,6 @@ def calculate_quantization_params( 1. Initial scale rectification based on activation statistics. 2. A grid search to further refine the scale parameters. - :param backend_entity: The backend-specific implementation of the weight compression algorithm. :param statistics: The input activations of the layer reduced over batch and sequence length dimensions, together with original activation tensor shapes. :param weight: The weight tensor that is being quantized. 
From 5a1866f5506aef5aa7eae251a9b44e2dbbc26032 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 13 Dec 2024 14:53:13 +0100 Subject: [PATCH 65/73] Remove static shapes testing because it is not needed with non-convertable division --- .../test_ov_modeling_compression.py | 77 +++---------------- 1 file changed, 10 insertions(+), 67 deletions(-) diff --git a/tests/openvino/native/quantization/test_ov_modeling_compression.py b/tests/openvino/native/quantization/test_ov_modeling_compression.py index 3d7f9d3b4c1..6919c23b3a0 100644 --- a/tests/openvino/native/quantization/test_ov_modeling_compression.py +++ b/tests/openvino/native/quantization/test_ov_modeling_compression.py @@ -31,7 +31,6 @@ from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization from nncf.tensor import Tensor from nncf.tensor import TensorDataType -from nncf.tensor import functions as fns from nncf.tensor.definitions import TensorBackend from nncf.tensor.functions.numpy_numeric import DTYPE_MAP as DTYPE_MAP_NP from nncf.tensor.functions.numpy_numeric import DTYPE_MAP_REV as DTYPE_MAP_REV_NP @@ -58,16 +57,6 @@ class QuantizationTask(Enum): WeightCompressionConfig(CompressWeightsMode.INT4_SYM, group_size=2), ] -MAX_MISALIGNMENT_FREQUENCY = { - TensorDataType.float32: 1e-2, # tends to < 5e-6 - TensorDataType.float16: 1e-2, # tends to < 5e-5 - TensorDataType.bfloat16: 1e-2, # tends to < 5e-4 -} - -MAX_MISALIGNMENT_MAGNITUDE = 1 - -EPS = np.finfo(np.float32).eps - REDUCTION_AXES = (1,) RANDOM_TENSOR_CACHE_CONTAINER = ResultsCacheContainer() @@ -107,7 +96,7 @@ def openvino_available(available: bool): nncf.import_utils._openvino_available = original_value -@pytest.mark.parametrize("weight_shape", [(10000, 4)], ids=[""]) +@pytest.mark.parametrize("weight_shape", [(100000, 4)], ids=[""]) @pytest.mark.parametrize("config", COMPRESSION_CONFIGS, ids=[str(c) for c in COMPRESSION_CONFIGS]) @pytest.mark.parametrize( ("quantization_task", "tensor_backend"), @@ -124,10 +113,7 @@ def openvino_available(available: bool): ) @pytest.mark.parametrize("dtype", [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]) @pytest.mark.parametrize("precompute_s_zp", [False, True], ids=["no-precompute", "precompute"]) -@pytest.mark.parametrize("static_shapes", [False, True], ids=["dynamic-shapes", "static-shapes"]) -def test_quantization_alignment( - weight_shape, config, quantization_task, tensor_backend, dtype, precompute_s_zp, static_shapes -): +def test_quantization_alignment(weight_shape, config, quantization_task, tensor_backend, dtype, precompute_s_zp): d1, d2 = weight_shape group_size = config.group_size zero_point_shape = scale_shape = (d1, 1) if group_size == -1 else (d1, d2 // group_size, 1) @@ -174,7 +160,7 @@ def test_quantization_alignment( kwargs = {} if cb == ComputationBackend.OV: - ov_model_params = OVModelParameters(dynamic_shapes=not static_shapes) + ov_model_params = OVModelParameters() kwargs["ov_model_params"] = ov_model_params if quantization_task == QuantizationTask.Q_DQ_RQ: kwargs["return_compressed_weight"] = True @@ -223,7 +209,7 @@ def test_quantization_alignment( decompressed_weight, ) - _check_values(static_shapes, config, precompute_s_zp, dtype, results, precomputed_scale, weight_shape) + _check_values(results) def _check_backends_and_dtypes( @@ -270,59 +256,16 @@ def _check_backends_and_dtypes( assert decompressed_weight.dtype == TensorDataType.float32 -def _check_values(static_shapes, config, precompute_s_zp, dtype, results, 
precomputed_scale, weight_shape): +def _check_values(results): + # Check that the computed tensors are equal between implementations keys = set(results[ComputationBackend.OV]).union(set(results[ComputationBackend.NumPy])) for key in keys: numpy_result = results[ComputationBackend.NumPy][key] ov_result = results[ComputationBackend.OV][key] - atol = 0 - scale = None - # For static-shaped OV models doing asymmetric compression there maybe misalignments between OV and NumPy - # For more details see ticket 156511 - if static_shapes and config.is_int_asym: - if key == "compressed_weight": - atol = MAX_MISALIGNMENT_MAGNITUDE - elif key == "decompressed_weight": - if "scale" in results[ComputationBackend.NumPy]: - scale = results[ComputationBackend.NumPy]["scale"] - else: - if precompute_s_zp: - scale = precomputed_scale - else: - weight = get_random_float_tensor(weight_shape, dtype, TensorBackend.numpy) - with openvino_available(False): - _, _, scale, _ = calculate_quantized_dequantized_weight( - weight, config, REDUCTION_AXES, return_compressed_weight=True - ) - # For decompressed weight the misalignment magnitude depends on the scale - atol = MAX_MISALIGNMENT_MAGNITUDE * fns.abs(scale).max().item() + EPS - max_misalignment_frequency = MAX_MISALIGNMENT_FREQUENCY[dtype] - else: - max_misalignment_frequency = None - - # Check that the computed tensors are equal between implementations + # Note: For static-shaped OV models doing asymmetric compression with convertable divisions there maybe + # misalignments equal to 1 quant between OV and NumPy. For more details see ticket 156511. + np.testing.assert_allclose( - ov_result.data, numpy_result.data, atol=atol, rtol=0, err_msg=f"Results do not align for {key}." + ov_result.data, numpy_result.data, atol=0, rtol=0, err_msg=f"Results do not align for {key}." 
) - - if max_misalignment_frequency is not None: - if key == "compressed_weight": - diff = fns.abs(numpy_result.astype(TensorDataType.int32) - ov_result.astype(TensorDataType.int32)) - else: - diff = fns.abs(numpy_result - ov_result) - - if diff.max() > 0: - # Check that the proportion of misaligned values is small - n_not_equal = fns.sum(diff > 0) - assert n_not_equal / numpy_result.size < max_misalignment_frequency - - # Check that the magnitude of misalignment is as small as expected - if key == "decompressed_weight": - # Reshape scale to match the shape of decompressed weight - scale = np.repeat(scale.data, diff.shape[-1], axis=-1) - np.testing.assert_array_less( - diff.data, - MAX_MISALIGNMENT_MAGNITUDE * np.abs(scale) + EPS, - err_msg=f"Too large misalignment for {key}.", - ) From 6a2c9fc928bb4862cd2c16d56a9742f6b5e5042e Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 13 Dec 2024 14:53:25 +0100 Subject: [PATCH 66/73] Set dynamic shapes by default --- .../algorithms/weight_compression/openvino_backend.py | 2 +- .../algorithms/weight_compression/openvino_modeling.py | 2 +- nncf/tensor/functions/ov.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 1bb5ea1adcd..0eaa6b72532 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -245,7 +245,7 @@ def _create_compression_subgraph( compression_config, layer_scales, layer_zero_points, - OVModelParameters(dynamic_shapes=False, recompile=True, release_memory=False), + OVModelParameters(recompile=True, release_memory=False), ) compressed_const = create_ov_const_from_tensor( diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index fa08fd5f7b9..e90d1716f49 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -46,7 +46,7 @@ def __init__( self, input_dtypes: Optional[Dict[str, TensorDataType]] = None, output_dtypes: Optional[Dict[str, TensorDataType]] = None, - dynamic_shapes: bool = False, + dynamic_shapes: bool = True, recompile: bool = False, release_memory: bool = True, share_inputs: bool = True, diff --git a/nncf/tensor/functions/ov.py b/nncf/tensor/functions/ov.py index f277da53d28..9a2d43d79d3 100644 --- a/nncf/tensor/functions/ov.py +++ b/nncf/tensor/functions/ov.py @@ -116,9 +116,7 @@ def _astype_ov(a: ov.Tensor, dtype: TensorDataType) -> ov.Tensor: a_dtype = DTYPE_MAP_REV[a.get_element_type()] - ov_model_params = OVModelParameters( - dynamic_shapes=True, recompile=True, release_memory=False, return_ov_tensors=True - ) + ov_model_params = OVModelParameters(recompile=True, release_memory=False, return_ov_tensors=True) ov_model_params.input_dtypes = {"input": a_dtype} ov_model_params.output_dtypes = {"output": dtype} model = get_astype_model(ov_model_params, tuple(a.shape)) From 92fbba57394b7dd5aebdce84698a5d05e5e2d355 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 12:04:23 +0100 Subject: [PATCH 67/73] Guarantee call order --- .../utils/test_cache_results_decorator.py | 221 +++++++++--------- 1 file changed, 110 insertions(+), 111 deletions(-) diff --git a/tests/common/utils/test_cache_results_decorator.py 
b/tests/common/utils/test_cache_results_decorator.py index 599e41a421d..1a6e3e107c8 100644 --- a/tests/common/utils/test_cache_results_decorator.py +++ b/tests/common/utils/test_cache_results_decorator.py @@ -8,7 +8,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import pytest from nncf.common.utils.decorators import ResultsCacheContainer from nncf.common.utils.decorators import cache_results @@ -21,113 +20,113 @@ def cached_addition(a, b): return a + b -@pytest.mark.parametrize( - "inputs,disable_caching,output,clear_cache,cache_size,ref_cache,ref_access_count", - [ - ( - (1, 2), - False, - 3, - False, - 1, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, - ), - ( - (1, 2), - False, - 3, - False, - 1, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, - ), - ( - (2, 3), - True, - 5, - False, - 1, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, - ), - ( - (3, 4), - False, - 7, - False, - 2, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, - }, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 1, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, - }, - ), - ( - (1, 2), - False, - 3, - False, - 2, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, - }, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, - }, - ), - ( - (3, 4), - False, - 7, - False, - 2, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, - }, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, - }, - ), - ( - (3, 4), - True, - 7, - False, - 2, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, - }, - { - ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, - ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, - }, - ), - ((3, 4), True, 7, True, 0, {}, {}), - ( - (1, 2), - False, - 3, - False, - 1, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, - {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, - ), - ], -) -def test_caching_results(inputs, disable_caching, output, clear_cache, cache_size, ref_cache, ref_access_count): - if clear_cache: - TEST_CACHE_CONTAINER.clear() - kwargs = {"disable_caching": True} if disable_caching else {} - assert cached_addition(*inputs, **kwargs) == output - assert len(TEST_CACHE_CONTAINER._cache) == cache_size - assert TEST_CACHE_CONTAINER._cache == ref_cache - assert TEST_CACHE_CONTAINER._access_count == ref_access_count +CALL_SEQUENCE = [ + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, + ), + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, + ), + ( + (2, 3), + True, + 5, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + 
{("cached_addition", frozenset({("a", 1), ("b", 2)})): 1}, + ), + ( + (3, 4), + False, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 1, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, + }, + ), + ( + (1, 2), + False, + 3, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 0, + }, + ), + ( + (3, 4), + False, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, + }, + ), + ( + (3, 4), + True, + 7, + False, + 2, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 3, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 7, + }, + { + ("cached_addition", frozenset({("a", 1), ("b", 2)})): 2, + ("cached_addition", frozenset({("a", 3), ("b", 4)})): 1, + }, + ), + ((3, 4), True, 7, True, 0, {}, {}), + ( + (1, 2), + False, + 3, + False, + 1, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 3}, + {("cached_addition", frozenset({("a", 1), ("b", 2)})): 0}, + ), +] + + +def test_caching_results(): + for inputs, disable_caching, output, clear_cache, cache_size, ref_cache, ref_access_count in CALL_SEQUENCE: + if clear_cache: + TEST_CACHE_CONTAINER.clear() + kwargs = {"disable_caching": True} if disable_caching else {} + assert cached_addition(*inputs, **kwargs) == output + assert len(TEST_CACHE_CONTAINER._cache) == cache_size + assert TEST_CACHE_CONTAINER._cache == ref_cache + assert TEST_CACHE_CONTAINER._access_count == ref_access_count From b27c720e4e98d3ed67cd311c4a96901b47f3a68c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 17:02:07 +0100 Subject: [PATCH 68/73] Add convertable_division parameter --- .../weight_compression/openvino_backend.py | 58 +++ .../weight_compression/openvino_modeling.py | 18 +- .../weight_compression/scale_estimation.py | 98 +++- .../scale_estimation_old.py | 424 ++++++++++++++++++ .../weight_compression/weight_lowering.py | 6 +- 5 files changed, 592 insertions(+), 12 deletions(-) create mode 100644 nncf/quantization/algorithms/weight_compression/scale_estimation_old.py diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 0eaa6b72532..bfe00223755 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -352,6 +352,64 @@ def dump_parameters( def __del__(self): clear_ov_model_cache() + @staticmethod + def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): + from openvino.properties.hint import inference_precision + import openvino as ov + + parameters, clamp = OVWeightCompressionAlgoBackend.get_compress_pipeline( + config, w_shape, s_shape, z_p_shape, True + ) + + if len(parameters) == 3: + _, s, zp = parameters + result = (clamp - zp) * s + else: + s = parameters[1] + result = clamp * s + + model = ov.Model([result], parameters) + + compiled_model = ov.compile_model(model, device_name="CPU", 
config={inference_precision: ov.Type.f32}) + + return lambda parameters: compiled_model(parameters)[0] + + @staticmethod + def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, + return_nodes=False): + from openvino.properties.hint import inference_precision + import openvino as ov + + mode = config.mode + assert mode in [ + CompressWeightsMode.INT4_SYM, + CompressWeightsMode.INT4_ASYM, + ], f"Only int4 supported, but given={mode}" + num_bits = config.num_bits + + asym_quant = mode in [CompressWeightsMode.INT4_ASYM] + level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) + level_high = 2 ** num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 + + w = opset.parameter(w_shape, name="w") + s = opset.parameter(s_shape, name="s") + parameters = [w, s] + compressed_w = w / s + if z_p_shape is not None: + zp = opset.parameter(z_p_shape, name="zp") + parameters.append(zp) + compressed_w += zp + + result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights") + + if return_nodes: + return parameters, result + + model = ov.Model([result], parameters) + + compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: ov.Type.f32}) + + return lambda parameters: compiled_model(parameters)[0] class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index e90d1716f49..5092eb61978 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -10,6 +10,7 @@ # limitations under the License. import copy +import os from dataclasses import dataclass from functools import partial from typing import Callable, Dict, List, Optional, Tuple, Union @@ -52,6 +53,7 @@ def __init__( share_inputs: bool = True, share_outputs: bool = True, return_ov_tensors: bool = False, + convertable_division: bool = False, ): """ :param input_dtypes: Optional dictionary mapping input names to their data types. @@ -64,6 +66,8 @@ def __init__( :param share_inputs: Whether to share input tensors. Avoids cloning inputs for inference. :param share_outputs: Whether to share output tensors. Avoids cloning outputs after the inference. :param return_ov_tensors: Whether to return results as OpenVINO tensors or NumPy arrays. + :param convertable_division: Whether to use convertable division for division operations. If True, division a/b + will be transformed at runtime to a*(1/b). 
""" self.input_dtypes = input_dtypes or {} self.output_dtypes = output_dtypes or {} @@ -73,6 +77,7 @@ def __init__( self.share_inputs = share_inputs self.share_outputs = share_outputs self.return_ov_tensors = return_ov_tensors + self.convertable_division = convertable_division def __copy__(self): return OVModelParameters( @@ -84,6 +89,7 @@ def __copy__(self): share_inputs=self.share_inputs, share_outputs=self.share_outputs, return_ov_tensors=self.return_ov_tensors, + convertable_division=self.convertable_division, ) def __deepcopy__(self, memo): @@ -96,6 +102,7 @@ def __deepcopy__(self, memo): share_inputs=self.share_inputs, share_outputs=self.share_outputs, return_ov_tensors=self.return_ov_tensors, + convertable_division=self.convertable_division, ) def __hash__(self): @@ -109,6 +116,7 @@ def __hash__(self): self.share_inputs, self.share_outputs, self.return_ov_tensors, + self.convertable_division, ) ) @@ -334,6 +342,8 @@ def _build_compress_model( level_low = 0 if is_int_asym else -(2 ** (num_bits - 1)) level_high = 2**num_bits - 1 if is_int_asym else 2 ** (num_bits - 1) - 1 + divide_op = opset.divide if ov_model_params.convertable_division else non_convertable_divide + min_values = None if scale_shape is not None: # Scale is given as an input @@ -348,7 +358,7 @@ def _build_compress_model( min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32) levels = level_high - level_low + 1 - scale = non_convertable_divide(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) + scale = divide_op(max_values - min_values, opset.constant(levels - 1, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) else: w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True)) @@ -356,7 +366,7 @@ def _build_compress_model( w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32) scale = opset.select(w_abs_min >= w_max, w_abs_min, opset.negative(w_max)) - scale = non_convertable_divide(scale, opset.constant(-level_low, ov.Type.f32)) + scale = divide_op(scale, opset.constant(-level_low, ov.Type.f32)) scale = opset.select(opset.abs(scale) < eps, eps, scale) zero_point = None @@ -368,12 +378,12 @@ def _build_compress_model( zero_point = convert_if_needed(zero_point, ov.Type.f32) elif is_int_asym: # Compute zero point - scaled_min_values = non_convertable_divide(min_values, scale) + scaled_min_values = divide_op(min_values, scale) zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(scaled_min_values) zero_point = opset.clamp(zero_point, level_low, level_high) weight = convert_if_needed(weight, ov.Type.f32) - compressed_weight = non_convertable_divide(weight, scale) + compressed_weight = divide_op(weight, scale) if is_int_asym: compressed_weight += zero_point diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index af51182a586..948c04be951 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -8,10 +8,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import os from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple, TypeVar +import numpy as np + import nncf from nncf import Dataset from nncf.common.graph.graph import NNCFGraph @@ -25,7 +27,9 @@ from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale +from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale, \ + do_int_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization @@ -38,6 +42,61 @@ TModel = TypeVar("TModel") +def compare_tensors( + weight, + data_ref: Dict["str", Tensor], + data_actual: Dict["str", Tensor], + node_name: str, + rel_diff_threshold=5e-1, + verbosity=2 +): + hists = {} + stats = {} + for name in data_ref: + ref = data_ref[name] + actual = data_actual[name] + try: + np.testing.assert_allclose(actual.data, ref.data, atol=0, rtol=0) + except Exception as e: + not_equal = np.where(ref.data != actual.data) + diff = fns.abs(ref - actual).data[not_equal] + rel_diff = diff / fns.maximum(fns.abs(ref).data[not_equal], 1e-9) + stats[name] = (np.median(rel_diff), rel_diff.max(), len(not_equal[0]) / ref.size) + + is_fp32 = ref.dtype == TensorDataType.float32 + bins = np.logspace(-10, 2,) if is_fp32 else np.arange(17) + hists[name] = np.histogram(diff, bins=bins, density=False) + + if verbosity > 0: + print() + print(node_name, name) + print(str(e).replace("Not equal to tolerance rtol=1e-07, atol=0", "").strip()) + if verbosity > 1: + # format_str = "{:.2e}" + format_str = "{:.10f}" + zip_arg = ( + rel_diff.tolist(), + [format_str.format(it) if is_fp32 else f"{int(it):02d}" for it in ref.data[not_equal].tolist()], + [format_str.format(it) if is_fp32 else f"{int(it):02d}" for it in actual.data[not_equal].tolist()], + ) + if weight is not None: + # zip_arg += ([f"{it:.30f}" for it in weight.data[not_equal].tolist()],) + zip_arg += ([it for it in weight.data[not_equal].tolist()],) + data = list(zip(*zip_arg)) + data = list(filter(lambda it: it[0] > (rel_diff_threshold if is_fp32 else 1), data)) + if len(data) > 0: + data = sorted(data, key=lambda it: it[0], reverse=True) + data = list(zip(*data)) + print("Rel. diff:", [f"{it:.2e}" if is_fp32 else int(it) for it in data[0]][:100]) + print("Reference:", data[1][:100]) + print("Actual:", data[2][:100]) + if weight is not None: + print("Weight:", data[3][:100]) + if verbosity > 0: + print('-' * 50) + return hists, stats + + class ScaleEstimation: """ Scale estimation algorithm implementation. 
@@ -79,6 +138,9 @@ def __init__( self._set_backend_entity(model) + from nncf.quantization.algorithms.weight_compression.scale_estimation_old import ScaleEstimationOld + self.se_old = ScaleEstimationOld(model, name_to_node_mapping, all_weight_params, nodes_to_compress, statistics, subset_size, initial_steps, scale_steps, weight_penalty) + @property def available_backends(self) -> List[BackendType]: return [BackendType.OPENVINO] @@ -155,6 +217,24 @@ def apply( self._weight_penalty, ) + scale_, zero_point = self.se_old.calculate_quantization_params( + self._backend_entity, + stats, + weight, + wp.reduction_axes, + config, + self._subset_size, + self._initial_steps, + self._scale_steps, + self._weight_penalty, + ) + compare_tensors( + None, + {"scale": scale_, "zero_point": zero_point}, + {"scale": scales[weight_name], "zero_point": zero_points[weight_name]}, + node_name + ) + return scales, zero_points @staticmethod @@ -255,6 +335,9 @@ def calculate_quantization_params( zero_scale = 0.001 zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + # This is required for alignment with a previous OpenVINO models implementation + ov_model_params = OVModelParameters(dynamic_shapes=False, convertable_division=True) + # iterative rectification of initial scale for i in range(initial_steps): near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) @@ -265,8 +348,10 @@ def calculate_quantization_params( out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: out = calculate_quantized_dequantized_weight( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, + ov_model_params=ov_model_params ) + q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) @@ -295,7 +380,8 @@ def calculate_quantization_params( out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: out, _, _ = do_int_quantization( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, + ov_model_params=ov_model_params ) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) @@ -310,7 +396,7 @@ def calculate_quantization_params( out = do_nf4_quantization(original_weight, scaled_scale) else: out, _, _ = do_int_quantization( - original_weight, config, precomputed_scale=scaled_scale, precomputed_zero_point=zp + original_weight, config, precomputed_scale=scaled_scale, precomputed_zero_point=zp, ov_model_params=ov_model_params, ) compressed_weights = fns.zeros_like(original_weight) + out @@ -324,7 +410,7 @@ def calculate_quantization_params( out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: out = calculate_quantized_dequantized_weight( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp + original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, ov_model_params=ov_model_params ) q_weights_ = fns.zeros_like(original_weight) + out diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py b/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py new file mode 100644 index 00000000000..88455f5b651 --- /dev/null +++ 
b/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py @@ -0,0 +1,424 @@ +# Copyright (c) 2024 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import deepcopy +from typing import Any, Dict, List, Optional, Tuple, TypeVar + +import nncf +from nncf import Dataset +from nncf.common.graph.graph import NNCFGraph +from nncf.common.graph.graph import NNCFNode +from nncf.common.logging.track_progress import track +from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer +from nncf.common.utils.backend import BackendType +from nncf.common.utils.backend import get_backend +from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic +from nncf.parameters import CompressWeightsMode +from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats +from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig +from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization +from nncf.tensor import Tensor +from nncf.tensor import TensorDataType +from nncf.tensor import functions as fns + +TModel = TypeVar("TModel") + + +class ScaleEstimationOld: + """ + Scale estimation algorithm implementation. + """ + + compress_decompress_cache = {} + + def __init__( + self, + model: TModel, + name_to_node_mapping: Dict[str, Any], + all_weight_params: List[WeightCompressionParameters], + nodes_to_compress: List[NNCFNode], + statistics: Dict[str, WCTensorStatistic], + subset_size: int = 32, + initial_steps: int = 5, + scale_steps: int = 10, + weight_penalty: float = -1.0, + ): + """ + :param model: Model for applying algorithm. + :param name_to_node_mapping: Name to node mapping for updating node weights. + :param all_weight_params: List of all weight parameters. + :param nodes_to_compress: List of nodes for processing. + :param statistics: Input activation statistics for each node. + :param subset_size: The number of samples for scale estimation. + :param initial_steps: The number of the steps for absmax scale rectification. + :param scale_steps: The number of the steps for grid search scale rectification + from 1.0 to 1.0 - 0.05 * scale_step. + :param weight_penalty: coefficient for penalty between fp and compressed weights. 
If -1 then doesn't apply. + """ + super().__init__() + self.name_to_node_mapping = name_to_node_mapping + self._all_weight_params = all_weight_params + self._nodes_to_compress = nodes_to_compress + self._statistics = statistics + self._subset_size = subset_size + self._initial_steps = initial_steps + self._scale_steps = scale_steps + self._weight_penalty = weight_penalty + + self._set_backend_entity(model) + + @property + def available_backends(self) -> List[BackendType]: + return [BackendType.OPENVINO] + + def _set_backend_entity(self, model: TModel) -> None: + """ + Creates a helper class with a backed-specific logic of the algorithm. + + :param model: Backend-specific input model. + :param all_weight_params: List of all weight parameters. + :param nodes_to_compress: List of nodes for processing. + :param activations: The input activations of the layers considered for compression. + """ + + model_backend = get_backend(model) + if model_backend == BackendType.OPENVINO: + from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend + + self._backend_entity = OVWeightCompressionAlgoBackend(model, self.name_to_node_mapping) + else: + raise nncf.UnsupportedBackendError( + "Cannot return backend-specific AWQ entity because {} is not supported!".format(model_backend.value) + ) + + def apply( + self, + model: TModel, + graph: NNCFGraph, + statistic_points: Optional[StatisticPointsContainer] = None, + dataset: Optional[Dataset] = None, + ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: + """ + Estimates better scale for the int4 nodes in the model. + Minimizes per-group difference between floating point MatMul and + MatMul with compressed weights. + The algorithm computes weighted scale for the group of weights in MatMul, which + shared the same scale. + + :param model: Model for applying algorithm. + :param graph: Model graph. + :param statistic_points: Statistic points with collected statistics values. + :param dataset: A representative dataset for the calibration process. + :return: Two dictionaries for estimated scales and zero points for each weight name. + """ + + scales, zero_points = dict(), dict() + + for wp in track(self._all_weight_params, description="Applying Scale Estimation"): + weight_name = wp.weight_name + node_name = wp.node_with_weight.node_name + config = wp.compression_config + + if config.num_bits != 4 or node_name not in self._statistics: + scales[weight_name] = None + continue + + stats = self._statistics[node_name] + + weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) + if len(weight_data) != 1: # not supported by the algorithm + continue + _, weight_port_id = weight_data[0] + + weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) + + scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( + self._backend_entity, + stats, + weight, + wp.reduction_axes, + config, + self._subset_size, + self._initial_steps, + self._scale_steps, + self._weight_penalty, + ) + + return scales, zero_points + + @staticmethod + def calculate_quantization_params( + backend_entity: WeightCompressionAlgoBackend, + statistics: WCTensorStatistic, + weight: Tensor, + reduction_axes: Tuple[int, ...], + config: WeightCompressionConfig, + subset_size: int = 32, + initial_steps: int = 5, + scale_steps: int = 10, + weight_penalty: float = -1.0, + ) -> Tensor: + """ + Calculates the quantization parameters for a given set of weights and activations. 
+ This function estimates the optimal quantization scale for weight compression by + minimizing the difference between floating-point operations and operations with + quantized weights. + + The function uses an iterative process: + 1. Initial scale rectification based on activation statistics. + 2. A grid search to further refine the scale parameters. + + :param backend_entity: The backend-specific implementation of the weight compression algorithm. + :param statistics: The input activations of the layer reduced over batch and sequence length dimensions, + together with original activation tensor shapes. + :param weight: The weight tensor that is being quantized. + :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. + :param config: Configuration parameters for the weight compression, including quantization settings. + :param subset_size: The number of samples to use for scale estimation. Defaults to 32. + :param initial_steps: The number of steps for initial scale rectification using activation statistics. + Defaults to 5. + :param scale_steps: The number of steps for refining the scale using a grid search. Defaults to 10. + :param weight_penalty: Penalty coefficient applied to the difference between floating-point + and quantized weights. A value of -1 disables the penalty. Defaults to -1.0. + :return: A tensor containing the calculated quantization scales and zero points if applicable. + """ + reduction_axis = reduction_axes[0] + + s, X = process_stats(statistics, subset_size) + + weight = weight.astype(TensorDataType.float32) + eps = fns.finfo(weight).eps + + if reduction_axis == 0: + weight = fns.transpose(weight) + reduction_axis = 1 + + group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] + cur_config = deepcopy(config) + cur_config.group_size = group_size + + original_weight = fns.zeros_like(weight) + weight + if config.mode == CompressWeightsMode.NF4: + norm_weight, scale = calculate_normalized_weight_and_fp4_scale( + original_weight, reduction_axis, cur_config.group_size + ) + compressed_weights = do_nf4_quantization(norm_weight, scale, is_normalized_weight=True) + q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) + zp = None + else: + compressed_weights, scale, zp = do_int_quantization(original_weight, cur_config, reduction_axis, is_numpy=True) + if zp is not None: + zp = zp.astype(scale.dtype) + q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) + + s = fns.unsqueeze(s, 0) + s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) + + original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) + + # all weight in group has importance based on corresponding input activations + importance = fns.ones_like(original_weight) + importance = importance * s + + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + importance = fns.where(zero_mask, 0.0, importance) + + # normalize importances for every group of weights to make sum of them equal to 1.0 + denum = fns.sum(importance, axis=2, keepdims=True) + importance = importance / (denum + eps) + + X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) + q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) + best_diffs = None + result_scale = None + + fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) + q_outs = 
fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) + + # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE + min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) + + zp_shape = zp.shape if zp is not None else None + key = (config.mode, config.num_bits) + q_weights.shape + scale.shape + if zp is not None: + key += zp_shape + if config.mode != CompressWeightsMode.NF4: + if key in ScaleEstimationOld.compress_decompress_cache: + compress_decompress_model = ScaleEstimationOld.compress_decompress_cache[key]["compress_decompress_model"] + compress_model = ScaleEstimationOld.compress_decompress_cache[key]["compress_model"] + else: + compress_decompress_model = backend_entity.get_compress_decompress_pipeline( + config, q_weights.shape, scale.shape, zp_shape + ) + compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) + ScaleEstimationOld.compress_decompress_cache[key] = { + "compress_decompress_model": compress_decompress_model, + "compress_model": compress_model, + } + scale_sign = scale / fns.abs(scale) + zero_scale = 0.001 + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + + input_tensors = [original_weight.data, None] + if zp is not None: + input_tensors.append(zp.data) + # iterative rectification of initial scale + for i in range(initial_steps): + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + input_tensors[1] = near_to_ideal_scale.data + + if config.mode == CompressWeightsMode.NF4: + g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) + out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) + else: + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + + if best_diffs is None: + best_diffs = min_max_scale_diffs + + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale + input_tensors[1] = near_to_ideal_scale.data + + if i < initial_steps - 1: + if config.mode == CompressWeightsMode.NF4: + out = do_nf4_quantization(original_weight, near_to_ideal_scale) + else: + out = compress_model(input_tensors) + compressed_weights = fns.zeros_like(original_weight) + out + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + + # iterative rectification of scale based on grid search + for scale_step in range(scale_steps): + factor = 1.0 - 0.05 * scale_step + scaled_scale = factor * scale + + input_tensors[1] = scaled_scale.data + if config.mode == CompressWeightsMode.NF4: + out = 
do_nf4_quantization(original_weight, scaled_scale) + else: + out = compress_model(input_tensors) + compressed_weights = fns.zeros_like(original_weight) + out + + target, zero_mask = get_target_zero_mask(compressed_weights, zp) + zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + near_to_ideal_scale = near_to_ideal_scale * scale_sign + + input_tensors[1] = near_to_ideal_scale.data + if config.mode == CompressWeightsMode.NF4: + g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) + out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) + else: + out = compress_decompress_model(input_tensors) + q_weights_ = fns.zeros_like(original_weight) + out + + q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) + ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) + ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) + if weight_penalty > 0.0: + ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) + + mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) + + best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs + + mask = fns.unsqueeze(mask, axis=2) + + if result_scale is None: + near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale + else: + near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale + result_scale = near_to_ideal_scale + + if config.group_size == -1: + result_scale = fns.squeeze(result_scale, axis=1) + if zp is not None and config.group_size == -1: + zp = fns.squeeze(zp, axis=1) + + return result_scale, zp + + @staticmethod + def activations_to_wc_statistics(activations: List[Tensor]) -> WCTensorStatistic: + """ + Mimic the activation reducing logic from WeightCompression.get_statistic_points. + + :param activations: List of raw activations. + :return: Instance of WCTensorStatistic class containing reduced activations and shapes. + """ + mean_values = [] + shapes = [] + for act in activations: + shapes.append(act.shape) + reduction_shape = tuple(range(act.ndim - 1)) + mean_values.append(fns.mean(act, axis=reduction_shape)) + wc_statistics = WCTensorStatistic(mean_values, shapes) + return wc_statistics + + +def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: + """ + Computes the target values and a mask indicating zero values in the target. + + :param compressed_weights: The compressed weights tensor. + :param zp: The zero point tensor. + :return: The compressed weights optionally adjusted by the zero point and + a boolean mask indicating positions in the target that are close to zero. + """ + target = compressed_weights + if zp is not None: + target = target.astype(dtype=zp.dtype) - zp + zero_mask = fns.isclose(target, 0) + return target, zero_mask + + +def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: + """ + Estimates scales for the given weight, target, zero mask, and importance. + + :param weight: The weights tensor. + :param target: The target values tensor. + :param zero_mask: A boolean mask indicating positions in the target that are close to zero. + :param importance: The importance values tensor. 
+ :return: The estimated scales + """ + ideal_scale = fns.abs(weight) / (fns.abs(target) + zero_mask) + weighted_scale = ideal_scale * importance + near_to_ideal_scale = fns.sum(weighted_scale, axis=2, keepdims=True) + return near_to_ideal_scale \ No newline at end of file diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 227ac16342c..0447d8db90b 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -433,6 +433,7 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, ov_model_params: Optional = None, + is_numpy: bool = False, ) -> Tuple[Tensor, Tensor, Tensor]: """ Performs integer quantization on the given weight tensor. @@ -453,7 +454,7 @@ def do_int_quantization( "for asymmetric quantization." ) - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not is_numpy if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -545,6 +546,7 @@ def calculate_quantized_dequantized_weight( precomputed_zero_point: Optional[Tensor] = None, return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, + is_numpy: bool = False, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: """ First quantizes the given weight tensor and then dequantizes it back to obtain float32 values. @@ -560,7 +562,7 @@ def calculate_quantized_dequantized_weight( :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight, scale, (and zero point). 
""" - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not is_numpy if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") From 6ab1c0847a8524ae3a348d52fe19333611e6602f Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 17:07:33 +0100 Subject: [PATCH 69/73] Cleanup --- .../weight_compression/openvino_backend.py | 58 --- .../weight_compression/openvino_modeling.py | 10 +- .../weight_compression/scale_estimation.py | 109 +---- .../scale_estimation_old.py | 424 ------------------ .../weight_compression/weight_lowering.py | 8 +- 5 files changed, 30 insertions(+), 579 deletions(-) delete mode 100644 nncf/quantization/algorithms/weight_compression/scale_estimation_old.py diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index bfe00223755..0eaa6b72532 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -352,64 +352,6 @@ def dump_parameters( def __del__(self): clear_ov_model_cache() - @staticmethod - def get_compress_decompress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None): - from openvino.properties.hint import inference_precision - import openvino as ov - - parameters, clamp = OVWeightCompressionAlgoBackend.get_compress_pipeline( - config, w_shape, s_shape, z_p_shape, True - ) - - if len(parameters) == 3: - _, s, zp = parameters - result = (clamp - zp) * s - else: - s = parameters[1] - result = clamp * s - - model = ov.Model([result], parameters) - - compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: ov.Type.f32}) - - return lambda parameters: compiled_model(parameters)[0] - - @staticmethod - def get_compress_pipeline(config: WeightCompressionConfig, w_shape, s_shape, z_p_shape=None, - return_nodes=False): - from openvino.properties.hint import inference_precision - import openvino as ov - - mode = config.mode - assert mode in [ - CompressWeightsMode.INT4_SYM, - CompressWeightsMode.INT4_ASYM, - ], f"Only int4 supported, but given={mode}" - num_bits = config.num_bits - - asym_quant = mode in [CompressWeightsMode.INT4_ASYM] - level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) - level_high = 2 ** num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 - - w = opset.parameter(w_shape, name="w") - s = opset.parameter(s_shape, name="s") - parameters = [w, s] - compressed_w = w / s - if z_p_shape is not None: - zp = opset.parameter(z_p_shape, name="zp") - parameters.append(zp) - compressed_w += zp - - result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights") - - if return_nodes: - return parameters, result - - model = ov.Model([result], parameters) - - compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision: ov.Type.f32}) - - return lambda parameters: compiled_model(parameters)[0] class OVAWQAlgoAlgoBackend(AWQAlgoBackend, OVWeightCompressionAlgoBackend): diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index 5092eb61978..a9c569ea663 100644 --- 
a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -10,7 +10,6 @@ # limitations under the License. import copy -import os from dataclasses import dataclass from functools import partial from typing import Callable, Dict, List, Optional, Tuple, Union @@ -32,6 +31,7 @@ TensorList = List[Tensor] ModelCallable = Callable[[TensorList], TensorList] +ReductionAxes = Union[int, Tuple[int, ...]] OV_MODEL_CACHE = ResultsCacheContainer() @@ -173,7 +173,7 @@ def get_compress_weight_model( weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, + reduction_axes: Optional[ReductionAxes] = None, return_nodes: Optional[bool] = False, ) -> Union[ModelCallable, ModelAsNodes]: """ @@ -222,7 +222,7 @@ def get_compress_decompress_weight_model( weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, + reduction_axes: Optional[ReductionAxes] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: """ @@ -270,7 +270,7 @@ def _build_compress_model( weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, + reduction_axes: Optional[ReductionAxes] = None, return_nodes: bool = False, ) -> Union[ModelCallable, ModelAsNodes]: is_int_asym = config.is_int_asym @@ -415,7 +415,7 @@ def _build_compress_decompress_model( weight_shape: Tuple, scale_shape: Optional[Tuple] = None, zero_point_shape: Optional[Tuple] = None, - reduction_axes: Optional[Tuple] = None, + reduction_axes: Optional[ReductionAxes] = None, return_compressed_weight: Optional[bool] = False, ) -> ModelCallable: default_output_dtypes = {"decompressed_weight": TensorDataType.float32} diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 948c04be951..3330b1f7279 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -8,12 +8,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os + from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple, TypeVar -import numpy as np - import nncf from nncf import Dataset from nncf.common.graph.graph import NNCFGraph @@ -28,8 +26,7 @@ from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale, \ - do_int_dequantization +from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization @@ -42,61 +39,6 @@ TModel = TypeVar("TModel") -def compare_tensors( - weight, - data_ref: Dict["str", Tensor], - data_actual: Dict["str", Tensor], - node_name: str, - rel_diff_threshold=5e-1, - verbosity=2 -): - hists = {} - stats = {} - for name in data_ref: - ref = data_ref[name] - actual = data_actual[name] - try: - np.testing.assert_allclose(actual.data, ref.data, atol=0, rtol=0) - except Exception as e: - not_equal = np.where(ref.data != actual.data) - diff = fns.abs(ref - actual).data[not_equal] - rel_diff = diff / fns.maximum(fns.abs(ref).data[not_equal], 1e-9) - stats[name] = (np.median(rel_diff), rel_diff.max(), len(not_equal[0]) / ref.size) - - is_fp32 = ref.dtype == TensorDataType.float32 - bins = np.logspace(-10, 2,) if is_fp32 else np.arange(17) - hists[name] = np.histogram(diff, bins=bins, density=False) - - if verbosity > 0: - print() - print(node_name, name) - print(str(e).replace("Not equal to tolerance rtol=1e-07, atol=0", "").strip()) - if verbosity > 1: - # format_str = "{:.2e}" - format_str = "{:.10f}" - zip_arg = ( - rel_diff.tolist(), - [format_str.format(it) if is_fp32 else f"{int(it):02d}" for it in ref.data[not_equal].tolist()], - [format_str.format(it) if is_fp32 else f"{int(it):02d}" for it in actual.data[not_equal].tolist()], - ) - if weight is not None: - # zip_arg += ([f"{it:.30f}" for it in weight.data[not_equal].tolist()],) - zip_arg += ([it for it in weight.data[not_equal].tolist()],) - data = list(zip(*zip_arg)) - data = list(filter(lambda it: it[0] > (rel_diff_threshold if is_fp32 else 1), data)) - if len(data) > 0: - data = sorted(data, key=lambda it: it[0], reverse=True) - data = list(zip(*data)) - print("Rel. diff:", [f"{it:.2e}" if is_fp32 else int(it) for it in data[0]][:100]) - print("Reference:", data[1][:100]) - print("Actual:", data[2][:100]) - if weight is not None: - print("Weight:", data[3][:100]) - if verbosity > 0: - print('-' * 50) - return hists, stats - - class ScaleEstimation: """ Scale estimation algorithm implementation. 
@@ -138,9 +80,6 @@ def __init__( self._set_backend_entity(model) - from nncf.quantization.algorithms.weight_compression.scale_estimation_old import ScaleEstimationOld - self.se_old = ScaleEstimationOld(model, name_to_node_mapping, all_weight_params, nodes_to_compress, statistics, subset_size, initial_steps, scale_steps, weight_penalty) - @property def available_backends(self) -> List[BackendType]: return [BackendType.OPENVINO] @@ -217,24 +156,6 @@ def apply( self._weight_penalty, ) - scale_, zero_point = self.se_old.calculate_quantization_params( - self._backend_entity, - stats, - weight, - wp.reduction_axes, - config, - self._subset_size, - self._initial_steps, - self._scale_steps, - self._weight_penalty, - ) - compare_tensors( - None, - {"scale": scale_, "zero_point": zero_point}, - {"scale": scales[weight_name], "zero_point": zero_points[weight_name]}, - node_name - ) - return scales, zero_points @staticmethod @@ -348,8 +269,11 @@ def calculate_quantization_params( out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: out = calculate_quantized_dequantized_weight( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, - ov_model_params=ov_model_params + original_weight, + config, + precomputed_scale=near_to_ideal_scale, + precomputed_zero_point=zp, + ov_model_params=ov_model_params, ) q_weights_ = fns.zeros_like(original_weight) + out @@ -380,8 +304,11 @@ def calculate_quantization_params( out = do_nf4_quantization(original_weight, near_to_ideal_scale) else: out, _, _ = do_int_quantization( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, - ov_model_params=ov_model_params + original_weight, + config, + precomputed_scale=near_to_ideal_scale, + precomputed_zero_point=zp, + ov_model_params=ov_model_params, ) compressed_weights = fns.zeros_like(original_weight) + out target, zero_mask = get_target_zero_mask(compressed_weights, zp) @@ -396,7 +323,11 @@ def calculate_quantization_params( out = do_nf4_quantization(original_weight, scaled_scale) else: out, _, _ = do_int_quantization( - original_weight, config, precomputed_scale=scaled_scale, precomputed_zero_point=zp, ov_model_params=ov_model_params, + original_weight, + config, + precomputed_scale=scaled_scale, + precomputed_zero_point=zp, + ov_model_params=ov_model_params, ) compressed_weights = fns.zeros_like(original_weight) + out @@ -410,7 +341,11 @@ def calculate_quantization_params( out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) else: out = calculate_quantized_dequantized_weight( - original_weight, config, precomputed_scale=near_to_ideal_scale, precomputed_zero_point=zp, ov_model_params=ov_model_params + original_weight, + config, + precomputed_scale=near_to_ideal_scale, + precomputed_zero_point=zp, + ov_model_params=ov_model_params, ) q_weights_ = fns.zeros_like(original_weight) + out diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py b/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py deleted file mode 100644 index 88455f5b651..00000000000 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation_old.py +++ /dev/null @@ -1,424 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple, TypeVar - -import nncf -from nncf import Dataset -from nncf.common.graph.graph import NNCFGraph -from nncf.common.graph.graph import NNCFNode -from nncf.common.logging.track_progress import track -from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer -from nncf.common.utils.backend import BackendType -from nncf.common.utils.backend import get_backend -from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic -from nncf.parameters import CompressWeightsMode -from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats -from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig -from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_dequantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_dequantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import do_nf4_quantization -from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization -from nncf.tensor import Tensor -from nncf.tensor import TensorDataType -from nncf.tensor import functions as fns - -TModel = TypeVar("TModel") - - -class ScaleEstimationOld: - """ - Scale estimation algorithm implementation. - """ - - compress_decompress_cache = {} - - def __init__( - self, - model: TModel, - name_to_node_mapping: Dict[str, Any], - all_weight_params: List[WeightCompressionParameters], - nodes_to_compress: List[NNCFNode], - statistics: Dict[str, WCTensorStatistic], - subset_size: int = 32, - initial_steps: int = 5, - scale_steps: int = 10, - weight_penalty: float = -1.0, - ): - """ - :param model: Model for applying algorithm. - :param name_to_node_mapping: Name to node mapping for updating node weights. - :param all_weight_params: List of all weight parameters. - :param nodes_to_compress: List of nodes for processing. - :param statistics: Input activation statistics for each node. - :param subset_size: The number of samples for scale estimation. - :param initial_steps: The number of the steps for absmax scale rectification. - :param scale_steps: The number of the steps for grid search scale rectification - from 1.0 to 1.0 - 0.05 * scale_step. - :param weight_penalty: coefficient for penalty between fp and compressed weights. If -1 then doesn't apply. 
- """ - super().__init__() - self.name_to_node_mapping = name_to_node_mapping - self._all_weight_params = all_weight_params - self._nodes_to_compress = nodes_to_compress - self._statistics = statistics - self._subset_size = subset_size - self._initial_steps = initial_steps - self._scale_steps = scale_steps - self._weight_penalty = weight_penalty - - self._set_backend_entity(model) - - @property - def available_backends(self) -> List[BackendType]: - return [BackendType.OPENVINO] - - def _set_backend_entity(self, model: TModel) -> None: - """ - Creates a helper class with a backed-specific logic of the algorithm. - - :param model: Backend-specific input model. - :param all_weight_params: List of all weight parameters. - :param nodes_to_compress: List of nodes for processing. - :param activations: The input activations of the layers considered for compression. - """ - - model_backend = get_backend(model) - if model_backend == BackendType.OPENVINO: - from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend - - self._backend_entity = OVWeightCompressionAlgoBackend(model, self.name_to_node_mapping) - else: - raise nncf.UnsupportedBackendError( - "Cannot return backend-specific AWQ entity because {} is not supported!".format(model_backend.value) - ) - - def apply( - self, - model: TModel, - graph: NNCFGraph, - statistic_points: Optional[StatisticPointsContainer] = None, - dataset: Optional[Dataset] = None, - ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]: - """ - Estimates better scale for the int4 nodes in the model. - Minimizes per-group difference between floating point MatMul and - MatMul with compressed weights. - The algorithm computes weighted scale for the group of weights in MatMul, which - shared the same scale. - - :param model: Model for applying algorithm. - :param graph: Model graph. - :param statistic_points: Statistic points with collected statistics values. - :param dataset: A representative dataset for the calibration process. - :return: Two dictionaries for estimated scales and zero points for each weight name. - """ - - scales, zero_points = dict(), dict() - - for wp in track(self._all_weight_params, description="Applying Scale Estimation"): - weight_name = wp.weight_name - node_name = wp.node_with_weight.node_name - config = wp.compression_config - - if config.num_bits != 4 or node_name not in self._statistics: - scales[weight_name] = None - continue - - stats = self._statistics[node_name] - - weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) - if len(weight_data) != 1: # not supported by the algorithm - continue - _, weight_port_id = weight_data[0] - - weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - - scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( - self._backend_entity, - stats, - weight, - wp.reduction_axes, - config, - self._subset_size, - self._initial_steps, - self._scale_steps, - self._weight_penalty, - ) - - return scales, zero_points - - @staticmethod - def calculate_quantization_params( - backend_entity: WeightCompressionAlgoBackend, - statistics: WCTensorStatistic, - weight: Tensor, - reduction_axes: Tuple[int, ...], - config: WeightCompressionConfig, - subset_size: int = 32, - initial_steps: int = 5, - scale_steps: int = 10, - weight_penalty: float = -1.0, - ) -> Tensor: - """ - Calculates the quantization parameters for a given set of weights and activations. 
- This function estimates the optimal quantization scale for weight compression by - minimizing the difference between floating-point operations and operations with - quantized weights. - - The function uses an iterative process: - 1. Initial scale rectification based on activation statistics. - 2. A grid search to further refine the scale parameters. - - :param backend_entity: The backend-specific implementation of the weight compression algorithm. - :param statistics: The input activations of the layer reduced over batch and sequence length dimensions, - together with original activation tensor shapes. - :param weight: The weight tensor that is being quantized. - :param reduction_axes: Tuple specifying the axes along which the reduction is performed for quantization. - :param config: Configuration parameters for the weight compression, including quantization settings. - :param subset_size: The number of samples to use for scale estimation. Defaults to 32. - :param initial_steps: The number of steps for initial scale rectification using activation statistics. - Defaults to 5. - :param scale_steps: The number of steps for refining the scale using a grid search. Defaults to 10. - :param weight_penalty: Penalty coefficient applied to the difference between floating-point - and quantized weights. A value of -1 disables the penalty. Defaults to -1.0. - :return: A tensor containing the calculated quantization scales and zero points if applicable. - """ - reduction_axis = reduction_axes[0] - - s, X = process_stats(statistics, subset_size) - - weight = weight.astype(TensorDataType.float32) - eps = fns.finfo(weight).eps - - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 - - group_size = config.group_size if config.group_size != -1 else weight.shape[reduction_axis] - cur_config = deepcopy(config) - cur_config.group_size = group_size - - original_weight = fns.zeros_like(weight) + weight - if config.mode == CompressWeightsMode.NF4: - norm_weight, scale = calculate_normalized_weight_and_fp4_scale( - original_weight, reduction_axis, cur_config.group_size - ) - compressed_weights = do_nf4_quantization(norm_weight, scale, is_normalized_weight=True) - q_weights = do_nf4_dequantization(compressed_weights, scale, reduction_axis) - zp = None - else: - compressed_weights, scale, zp = do_int_quantization(original_weight, cur_config, reduction_axis, is_numpy=True) - if zp is not None: - zp = zp.astype(scale.dtype) - q_weights = do_int_dequantization(compressed_weights, scale, zp, reduction_axis) - - s = fns.unsqueeze(s, 0) - s, _ = reshape_weight_for_grouped_quantization(s, reduction_axis, group_size) - - original_weight, _ = reshape_weight_for_grouped_quantization(original_weight, reduction_axis, group_size) - - # all weight in group has importance based on corresponding input activations - importance = fns.ones_like(original_weight) - importance = importance * s - - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - importance = fns.where(zero_mask, 0.0, importance) - - # normalize importances for every group of weights to make sum of them equal to 1.0 - denum = fns.sum(importance, axis=2, keepdims=True) - importance = importance / (denum + eps) - - X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size) - q_weights, _ = reshape_weight_for_grouped_quantization(q_weights, reduction_axis, group_size) - best_diffs = None - result_scale = None - - fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X) - q_outs = 
fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X) - - # metric for minimization with shape [C_OUT, N_GROUPS], N_GROUPS = C_IN / GROUP_SIZE - min_max_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - min_max_scale_diffs = fns.transpose(min_max_scale_diffs, (1, 0)) - if weight_penalty > 0.0: - min_max_scale_diffs += weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - - zp_shape = zp.shape if zp is not None else None - key = (config.mode, config.num_bits) + q_weights.shape + scale.shape - if zp is not None: - key += zp_shape - if config.mode != CompressWeightsMode.NF4: - if key in ScaleEstimationOld.compress_decompress_cache: - compress_decompress_model = ScaleEstimationOld.compress_decompress_cache[key]["compress_decompress_model"] - compress_model = ScaleEstimationOld.compress_decompress_cache[key]["compress_model"] - else: - compress_decompress_model = backend_entity.get_compress_decompress_pipeline( - config, q_weights.shape, scale.shape, zp_shape - ) - compress_model = backend_entity.get_compress_pipeline(config, q_weights.shape, scale.shape, zp_shape) - ScaleEstimationOld.compress_decompress_cache[key] = { - "compress_decompress_model": compress_decompress_model, - "compress_model": compress_model, - } - scale_sign = scale / fns.abs(scale) - zero_scale = 0.001 - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - - input_tensors = [original_weight.data, None] - if zp is not None: - input_tensors.append(zp.data) - # iterative rectification of initial scale - for i in range(initial_steps): - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - input_tensors[1] = near_to_ideal_scale.data - - if config.mode == CompressWeightsMode.NF4: - g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) - out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) - else: - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) - - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if weight_penalty > 0.0: - ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) - - if best_diffs is None: - best_diffs = min_max_scale_diffs - - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) - - best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs - - mask = fns.unsqueeze(mask, axis=2) - - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale - input_tensors[1] = near_to_ideal_scale.data - - if i < initial_steps - 1: - if config.mode == CompressWeightsMode.NF4: - out = do_nf4_quantization(original_weight, near_to_ideal_scale) - else: - out = compress_model(input_tensors) - compressed_weights = fns.zeros_like(original_weight) + out - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - - # iterative rectification of scale based on grid search - for scale_step in range(scale_steps): - factor = 1.0 - 0.05 * scale_step - scaled_scale = factor * scale - - input_tensors[1] = scaled_scale.data - if config.mode == CompressWeightsMode.NF4: - out = 
do_nf4_quantization(original_weight, scaled_scale) - else: - out = compress_model(input_tensors) - compressed_weights = fns.zeros_like(original_weight) + out - - target, zero_mask = get_target_zero_mask(compressed_weights, zp) - zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - near_to_ideal_scale = near_to_ideal_scale * scale_sign - - input_tensors[1] = near_to_ideal_scale.data - if config.mode == CompressWeightsMode.NF4: - g_compressed_weighs = do_nf4_quantization(original_weight, near_to_ideal_scale) - out = do_nf4_dequantization(g_compressed_weighs, near_to_ideal_scale) - else: - out = compress_decompress_model(input_tensors) - q_weights_ = fns.zeros_like(original_weight) + out - - q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) - ideal_scale_diffs = fns.mean((fp_outs - q_outs) ** 2, axis=-1) - ideal_scale_diffs = fns.transpose(ideal_scale_diffs, (1, 0)) - if weight_penalty > 0.0: - ideal_scale_diffs += weight_penalty * fns.mean((q_weights_ - original_weight) ** 2, axis=-1) - - mask = (ideal_scale_diffs > best_diffs).astype(best_diffs.dtype) - - best_diffs = mask * best_diffs + (1.0 - mask) * ideal_scale_diffs - - mask = fns.unsqueeze(mask, axis=2) - - if result_scale is None: - near_to_ideal_scale = mask * scale + (1.0 - mask) * near_to_ideal_scale - else: - near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale - result_scale = near_to_ideal_scale - - if config.group_size == -1: - result_scale = fns.squeeze(result_scale, axis=1) - if zp is not None and config.group_size == -1: - zp = fns.squeeze(zp, axis=1) - - return result_scale, zp - - @staticmethod - def activations_to_wc_statistics(activations: List[Tensor]) -> WCTensorStatistic: - """ - Mimic the activation reducing logic from WeightCompression.get_statistic_points. - - :param activations: List of raw activations. - :return: Instance of WCTensorStatistic class containing reduced activations and shapes. - """ - mean_values = [] - shapes = [] - for act in activations: - shapes.append(act.shape) - reduction_shape = tuple(range(act.ndim - 1)) - mean_values.append(fns.mean(act, axis=reduction_shape)) - wc_statistics = WCTensorStatistic(mean_values, shapes) - return wc_statistics - - -def get_target_zero_mask(compressed_weights: Tensor, zp: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: - """ - Computes the target values and a mask indicating zero values in the target. - - :param compressed_weights: The compressed weights tensor. - :param zp: The zero point tensor. - :return: The compressed weights optionally adjusted by the zero point and - a boolean mask indicating positions in the target that are close to zero. - """ - target = compressed_weights - if zp is not None: - target = target.astype(dtype=zp.dtype) - zp - zero_mask = fns.isclose(target, 0) - return target, zero_mask - - -def estimate_scales(weight: Tensor, target: Tensor, zero_mask: Tensor, importance: Tensor) -> Tensor: - """ - Estimates scales for the given weight, target, zero mask, and importance. - - :param weight: The weights tensor. - :param target: The target values tensor. - :param zero_mask: A boolean mask indicating positions in the target that are close to zero. - :param importance: The importance values tensor. 
- :return: The estimated scales - """ - ideal_scale = fns.abs(weight) / (fns.abs(target) + zero_mask) - weighted_scale = ideal_scale * importance - near_to_ideal_scale = fns.sum(weighted_scale, axis=2, keepdims=True) - return near_to_ideal_scale \ No newline at end of file diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 0447d8db90b..b6a6c218809 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -26,7 +26,7 @@ from nncf.tensor.definitions import TensorBackend from nncf.tensor.definitions import TensorDataType -ReductionAxes = Tuple[int, ...] +ReductionAxes = Union[int, Tuple[int, ...]] NF4_QUANTILES = np.array( [ @@ -433,7 +433,6 @@ def do_int_quantization( precomputed_scale: Tensor = None, precomputed_zero_point: Tensor = None, ov_model_params: Optional = None, - is_numpy: bool = False, ) -> Tuple[Tensor, Tensor, Tensor]: """ Performs integer quantization on the given weight tensor. @@ -454,7 +453,7 @@ def do_int_quantization( "for asymmetric quantization." ) - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not is_numpy + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Running time may be improved after installing OpenVINO") @@ -546,7 +545,6 @@ def calculate_quantized_dequantized_weight( precomputed_zero_point: Optional[Tensor] = None, return_compressed_weight: Optional[bool] = False, ov_model_params: Optional = None, - is_numpy: bool = False, ) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor, Tensor]]: """ First quantizes the given weight tensor and then dequantizes it back to obtain float32 values. @@ -562,7 +560,7 @@ def calculate_quantized_dequantized_weight( :return: Dequantized weight tensor or a tuple containing the decompressed weight, compressed weight, scale, (and zero point). 
""" - accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch and not is_numpy + accelerate_through_ov = is_openvino_available() and weight.backend != TensorBackend.torch if not is_openvino_available() and weight.backend != TensorBackend.torch: log_once(logging.INFO, "Compression time may be improved after installing OpenVINO") From a0fe91a52c8cf38865cd65cc84bf80331f9bac9c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 17:26:55 +0100 Subject: [PATCH 70/73] Add convertable division test --- .../openvino/native/test_openvino_modeling.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/openvino/native/test_openvino_modeling.py b/tests/openvino/native/test_openvino_modeling.py index b4bb991d592..e41b39b2359 100644 --- a/tests/openvino/native/test_openvino_modeling.py +++ b/tests/openvino/native/test_openvino_modeling.py @@ -295,3 +295,25 @@ def test_share_inputs_outputs(mocker, share_inputs, share_outputs, return_ov_ten compiled_model.assert_called_once_with( [input_tensor.data], share_inputs=share_inputs, share_outputs=share_outputs ) + + +@pytest.mark.parametrize( + "weight,convertable_division,ref_compressed_weight", + [ + ([[0.70361328125, 0.92919921875, 0.37109375, -0.98974609375]], True, [[225, 255, 181, 0]]), + ([[0.70361328125, 0.92919921875, 0.37109375, -0.98974609375]], False, [[226, 255, 181, 0]]), + ], +) +def test_convertable_divison(weight, convertable_division, ref_compressed_weight): + ov_model_params = OVModelParameters( + input_dtypes={"weight": TensorDataType.float32}, + dynamic_shapes=not convertable_division, + convertable_division=convertable_division, + ) + config = WeightCompressionConfig(CompressWeightsMode.INT8_ASYM) + + weight = np.array(weight, np.float32) + ref_compressed_weight = np.array(ref_compressed_weight, np.uint8) + model_run_fn = get_compress_weight_model(ov_model_params, config, weight.shape, reduction_axes=(1,)) + compressed_weight = model_run_fn([Tensor(weight)])[0] + np.testing.assert_allclose(compressed_weight.data, ref_compressed_weight, atol=0, rtol=0) From 97bd61d5c46a2a278918251b15bf04f8dd6cec63 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 17:28:25 +0100 Subject: [PATCH 71/73] Add explicit inference precision --- .../weight_compression/openvino_modeling.py | 55 ++++++++++++------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py index a9c569ea663..0abad80eb98 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_modeling.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_modeling.py @@ -17,6 +17,7 @@ import numpy as np import openvino as ov from openvino._pyopenvino.op import Parameter +from openvino._pyopenvino.properties.hint import inference_precision from openvino.runtime import Node from openvino.runtime import opset13 as opset @@ -167,6 +168,32 @@ def _infer_ov_model( return outputs +def _prepare_compression_model_inputs( + ov_model_params, + weight_shape: Tuple, + scale_shape: Optional[Tuple], + zero_point_shape: Optional[Tuple], + reduction_axes: Optional[ReductionAxes], +) -> Tuple[Tuple, Optional[Tuple], Optional[Tuple]]: + """ + Do some input checks and convert static shapes to dynamic shapes if needed. 
+ """ + if scale_shape is None and zero_point_shape is not None: + raise Exception("Zero point shape can only be provided if scale shape is provided.") + if scale_shape is None and reduction_axes is None: + raise ValueError("Reduction axes must be provided if scale shape is not provided.") + + # Set dynamic shapes if needed + if ov_model_params.dynamic_shapes: + weight_shape = (-1,) * len(weight_shape) + if scale_shape is not None: + scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) + if zero_point_shape is not None: + zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + + return weight_shape, scale_shape, zero_point_shape + + def get_compress_weight_model( ov_model_params: OVModelParameters, config: WeightCompressionConfig, @@ -193,16 +220,10 @@ def get_compress_weight_model( :return: A model callable that compresses weights using the given configuration. Or a model as nodes, if `return_nodes` is True. """ - if scale_shape is None and zero_point_shape is not None: - raise Exception("Zero point shape can only be provided if scale shape is provided.") - # Set dynamic shapes if needed - if ov_model_params.dynamic_shapes: - weight_shape = (-1,) * len(weight_shape) - if scale_shape is not None: - scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) - if zero_point_shape is not None: - zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + weight_shape, scale_shape, zero_point_shape = _prepare_compression_model_inputs( + ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes + ) return _build_compress_model( config, @@ -243,13 +264,9 @@ def get_compress_decompress_weight_model( (and zero point) if `return_compressed_weight` is True. """ - # Set dynamic shapes if needed - if ov_model_params.dynamic_shapes: - weight_shape = (-1,) * len(weight_shape) - if scale_shape is not None: - scale_shape = (-1,) * (len(scale_shape) - 1) + (1,) - if zero_point_shape is not None: - zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,) + weight_shape, scale_shape, zero_point_shape = _prepare_compression_model_inputs( + ov_model_params, weight_shape, scale_shape, zero_point_shape, reduction_axes + ) return _build_compress_decompress_model( config, @@ -403,7 +420,7 @@ def _build_compress_model( return ov_parameters, ov_results, ov_model_params model = ov.Model(ov_results, ov_parameters) - compiled_model = ov.compile_model(model, device_name="CPU") + compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32}) return partial(_infer_ov_model, ov_model_params, compiled_model) @@ -459,7 +476,7 @@ def _build_compress_decompress_model( ov_results = [decompressed_weight] + ov_results if return_compressed_weight else [decompressed_weight] model = ov.Model(ov_results, ov_parameters) - compiled_model = ov.compile_model(model, device_name="CPU") + compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32}) return partial(_infer_ov_model, ov_model_params, compiled_model) @@ -497,6 +514,6 @@ def _build_astype_model(ov_model_params: OVModelParameters, arg_shape: Tuple) -> arg = opset.parameter(arg_shape, dtype=DTYPE_MAP_OV[input_dtypes["input"]], name="input") res = opset.convert(arg, DTYPE_MAP_OV[output_dtypes["output"]]) model = ov.Model([res], [arg]) - compiled_model = ov.compile_model(model, device_name="CPU") + compiled_model = ov.compile_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32}) return partial(_infer_ov_model, ov_model_params, compiled_model) From 
58963abffee560316e5bb1ef5c89eb6eba1df23a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 18:10:07 +0100 Subject: [PATCH 72/73] Fix import --- .../algorithms/weight_compression/scale_estimation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 3330b1f7279..38116efa1a1 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -21,11 +21,11 @@ from nncf.common.utils.backend import BackendType from nncf.common.utils.backend import get_backend from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic +from nncf.import_utils import is_openvino_available from nncf.parameters import CompressWeightsMode from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters -from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_normalized_weight_and_fp4_scale from nncf.quantization.algorithms.weight_compression.weight_lowering import calculate_quantized_dequantized_weight from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization @@ -256,8 +256,13 @@ def calculate_quantization_params( zero_scale = 0.001 zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) - # This is required for alignment with a previous OpenVINO models implementation - ov_model_params = OVModelParameters(dynamic_shapes=False, convertable_division=True) + if is_openvino_available(): + # This is required for alignment with a previous OpenVINO models implementation + from nncf.quantization.algorithms.weight_compression.openvino_modeling import OVModelParameters + + ov_model_params = OVModelParameters(dynamic_shapes=False, convertable_division=True) + else: + ov_model_params = None # iterative rectification of initial scale for i in range(initial_steps): From ec21996f4c7130347e09dc0f4ba14d4050b3ff38 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 16 Dec 2024 21:21:40 +0100 Subject: [PATCH 73/73] Update tests/post_training/data/wc_reference_data.yaml --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 683dc62f401..6c48904c91a 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -23,7 +23,7 @@ tinyllama_int8_data_free_backend_TORCH: num_int4: 0 num_int8: 312 tinyllama_data_aware_gptq_scale_estimation_stateful_backend_OV: - metric_value: 0.88669 + metric_value: 0.86503 num_int4: 94 num_int8: 124 metrics_xfail_reason: "Issue-148819"
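Editor's note: patch 71 pins the helper compression models to f32 execution via the INFERENCE_PRECISION_HINT so results do not drift on devices that default to bf16 inference. Below is a standalone sketch of the same compile-time setting under stated assumptions: the toy divide-and-round graph, the tensor values, and the names compiled / out are illustrative only, while the import path and the config={inference_precision(): ov.Type.f32} argument mirror the patch.

import numpy as np
import openvino as ov
from openvino._pyopenvino.properties.hint import inference_precision  # same import path as the patch
from openvino.runtime import opset13 as opset

# Toy graph: divide by a scale and round, the kind of arithmetic the compression models perform.
weight = opset.parameter([2, 4], dtype=ov.Type.f32, name="weight")
scaled = opset.divide(weight, opset.constant(np.float32(0.1)))
rounded = opset.round(scaled, "half_to_even")
model = ov.Model([rounded], [weight])

# Pinning inference precision to f32 keeps rounding stable on bf16-first devices.
compiled = ov.compile_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32})
out = compiled([np.ones((2, 4), dtype=np.float32)])[0]

The same consideration motivates the new convertable_division test and the updated tinyllama reference metric: small numeric differences in division and rounding are expected once the compression path runs through compiled OpenVINO models instead of NumPy.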